def execute(self):
    """Execute a single plan step in this subprocess, yielding Dagster events.

    Reconstructs the pipeline, builds an execution plan restricted to
    ``self.step_key``, reports a delegation engine event, then streams the
    events produced by executing the plan against the instance.
    """
    pipeline = self.recon_pipeline
    with DagsterInstance.from_ref(self.instance_ref) as instance:
        # Watch for termination requests signaled by the parent process.
        start_termination_thread(self.term_event)
        execution_plan = create_execution_plan(
            pipeline=pipeline,
            run_config=self.run_config,
            mode=self.pipeline_run.mode,
            step_keys_to_execute=[self.step_key],
            known_state=self.known_state,
        )
        # Announce which pid is handling this step before any step events.
        yield instance.report_engine_event(
            "Executing step {} in subprocess".format(self.step_key),
            self.pipeline_run,
            EngineEventData(
                [
                    MetadataEntry.text(str(os.getpid()), "pid"),
                    MetadataEntry.text(self.step_key, "step_key"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            MultiprocessExecutor,
            self.step_key,
        )
        yield from execute_plan_iterator(
            execution_plan,
            pipeline,
            self.pipeline_run,
            run_config=self.run_config,
            retry_mode=self.retry_mode.for_inner_plan(),
            instance=instance,
        )
def _timing_to_metadata(timings: List[Dict[str, Any]]) -> List[MetadataEntry]:
    """Convert dbt timing blocks into start/end/duration metadata entries.

    Only the "execute" and "compile" timing blocks are recognized; any other
    entries are skipped.

    Args:
        timings: dbt result ``timing`` list; each item must carry ``name``,
            ``started_at`` and ``completed_at`` ISO-8601 timestamps.

    Returns:
        Flat list of MetadataEntry objects (3 per recognized timing block).
    """
    metadata = []
    for timing in timings:
        if timing["name"] == "execute":
            desc = "Execution"
        elif timing["name"] == "compile":
            desc = "Compilation"
        else:
            continue

        started_at = dateutil.parser.isoparse(timing["started_at"])
        completed_at = dateutil.parser.isoparse(timing["completed_at"])
        duration = completed_at - started_at
        metadata.extend(
            [
                MetadataEntry.text(
                    text=started_at.isoformat(timespec="seconds"), label=f"{desc} Started At"
                ),
                # BUG FIX: the "Completed At" entry previously reported
                # started_at again; it must report completed_at.
                MetadataEntry.text(
                    text=completed_at.isoformat(timespec="seconds"), label=f"{desc} Completed At"
                ),
                MetadataEntry.float(value=duration.total_seconds(), label=f"{desc} Duration"),
            ]
        )
    return metadata
def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
    """Write *obj* to Snowflake, or record metadata for a dbt-created table.

    Supports pandas and Spark DataFrames. ``obj is None`` signals the table
    was produced externally (dbt); in that case a sample and row count are
    queried from the table directly. Always ends by yielding the SELECT
    statement covering the written partition window.

    Raises:
        Exception: if *obj* is neither a supported DataFrame type nor None.
    """
    schema, table = DB_SCHEMA, context.asset_key.path[-1]

    time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
    with connect_snowflake(config=self._config, schema=schema) as con:
        # Clear any rows for this partition window so the write is idempotent.
        con.execute(self._get_cleanup_statement(table, schema, time_window))

    if isinstance(obj, SparkDataFrame):
        yield from self._handle_spark_output(obj, schema, table)
    elif isinstance(obj, PandasDataFrame):
        yield from self._handle_pandas_output(obj, schema, table)
    elif obj is None:  # dbt
        config = dict(SHARED_SNOWFLAKE_CONF)
        config["schema"] = DB_SCHEMA
        with connect_snowflake(config=config) as con:
            df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
            num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()

        yield MetadataEntry.md(df.to_markdown(), "Data sample")
        # BUG FIX: fetchone() returns a row tuple; pass the scalar count
        # (first column) to MetadataEntry.int, not the tuple itself.
        yield MetadataEntry.int(num_rows[0], "Rows")
    else:
        raise Exception(
            "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
        )

    yield MetadataEntry.text(
        self._get_select_statement(
            table,
            schema,
            None,
            time_window,
        ),
        "Query",
    )
def handle_output(self, context, obj):
    """Persist *obj* to a table named after this output and emit metadata."""
    destination = context.name
    write_dataframe_to_table(name=destination, dataframe=obj)

    # Metadata below is attached to the resulting Handled Output event.
    yield MetadataEntry.int(len(obj), label="number of rows")
    yield MetadataEntry.text(destination, label="table name")
def join_q2_data(
    context,
    april_data,
    may_data,
    june_data,
    master_cord_data,
):
    """Join Q2 monthly flight data with airport master-coordinate data.

    Emits two ExpectationResults (required sequence-ID columns present;
    consistent columns across months), subsamples the unioned Q2 data, and
    joins it against origin- and destination-prefixed copies of the master
    coordinate data via Spark SQL. Yields the joined data with lowercased
    column names.
    """
    dfs = {"april": april_data, "may": may_data, "june": june_data}
    missing_things = []
    # Both sequence-ID columns must be present in every month's dataframe.
    for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({"month": month, "missing_column": required_column})
    yield ExpectationResult(
        success=not bool(missing_things),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"missing_columns": missing_things})
        ],
    )
    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label="flight_data_same_shape",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )
    q2_data = april_data.union(may_data).union(june_data)
    # Subsample according to the configured percentage (0-100).
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")
    # Register dest-/origin-prefixed copies of the coordinate data so the same
    # table can be joined twice without column-name collisions.
    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_")
    dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data")
    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data")
    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data ON
        origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )
    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
def handle_output(self, context, obj):
    """Save the dataframe as a CSV under the step/output directory and
    report row count plus the mean of ``some_column`` as metadata."""
    target = os.path.join("my_base_dir", context.step_key, context.name)
    obj.to_csv(target)

    yield MetadataEntry.int(obj.shape[0], label="number of rows")
    yield MetadataEntry.float(obj["some_column"].mean(), "some_column mean")
def df_type_check(_, value):
    """Type check asserting *value* is a pandas DataFrame, with shape metadata."""
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    entries = [
        MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
        # Columns are stringified since they may be things like datetimes.
        MetadataEntry.json({"columns": [str(col) for col in value.columns]}, "metadata"),
    ]
    return TypeCheck(success=True, metadata_entries=entries)
def df_type_check(_, value):
    """Type check asserting *value* is a pandas DataFrame, with shape metadata."""
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    entries = [
        MetadataEntry("row_count", value=str(len(value))),
        # Columns are stringified since they may be things like datetimes.
        MetadataEntry("metadata", value={"columns": [str(col) for col in value.columns]}),
    ]
    return TypeCheck(success=True, metadata_entries=entries)
def _node_result_to_metadata(node_result: Dict[str, Any]) -> List[MetadataEntry]:
    """Build metadata entries describing a single dbt node result."""
    labeled_values = [
        ("Materialization Strategy", node_result["config"]["materialized"]),
        ("Database", node_result["database"]),
        ("Schema", node_result["schema"]),
        ("Alias", node_result["alias"]),
        ("Description", node_result["description"]),
    ]
    return [MetadataEntry.text(text=value, label=label) for label, value in labeled_values]
def _launch_k8s_job_with_args(self, job_name, args, run):
    """Construct and submit the Kubernetes Job that runs the run worker.

    Builds the job config from the run's container context and code origin,
    tags the run with the image used, and reports engine events before and
    after the Kubernetes API call.
    """
    container_context = self.get_container_context_for_run(run)

    pod_name = job_name

    pipeline_origin = run.pipeline_code_origin
    user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))
    repository_origin = pipeline_origin.repository_origin

    job_config = container_context.get_k8s_job_config(
        job_image=repository_origin.container_image, run_launcher=self
    )

    # Record which image the run worker uses so it is visible on the run.
    self._instance.add_run_tags(
        run.run_id,
        {DOCKER_IMAGE_TAG: job_config.job_image},
    )

    job = construct_dagster_k8s_job(
        job_config=job_config,
        args=args,
        job_name=job_name,
        pod_name=pod_name,
        component="run_worker",
        user_defined_k8s_config=user_defined_k8s_config,
        labels={
            "dagster/job": pipeline_origin.pipeline_name,
            "dagster/run-id": run.run_id,
        },
    )

    self._instance.report_engine_event(
        "Creating Kubernetes run worker job",
        run,
        EngineEventData(
            [
                MetadataEntry("Kubernetes Job name", value=job_name),
                MetadataEntry("Kubernetes Namespace", value=container_context.namespace),
                MetadataEntry("Run ID", value=run.run_id),
            ]
        ),
        cls=self.__class__,
    )

    self._batch_api.create_namespaced_job(body=job, namespace=container_context.namespace)
    self._instance.report_engine_event(
        "Kubernetes run worker job created",
        run,
        cls=self.__class__,
    )
def result_to_materialization(
    result: Dict[str, Any],
    asset_key_prefix: Optional[List[str]] = None,
    docs_url: Optional[str] = None,
) -> Optional[AssetMaterialization]:
    """
    This is a hacky solution that attempts to consolidate parsing many of the
    potential formats that dbt can provide its results in. This is known to
    work for CLI Outputs for dbt versions 0.18+, as well as RPC responses for
    a similar time period, but as the RPC response schema is not documented
    nor enforced, this can become out of date easily.

    Returns None for unsuccessful results and for non-model nodes.
    """
    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    # status comes from set of fields rather than "status"
    if "fail" in result:
        success = not result.get("fail") and not result.get("skip") and not result.get("error")
    else:
        success = result["status"] == "success"

    if not success:
        return None

    # all versions represent timing the same way
    metadata = [
        MetadataEntry.float(value=result["execution_time"], label="Execution Time (seconds)")
    ] + _timing_to_metadata(result["timing"])

    # working with a response that contains the node block (RPC and CLI 0.18.x)
    if "node" in result:
        unique_id = result["node"]["unique_id"]
        metadata += _node_result_to_metadata(result["node"])
    else:
        unique_id = result["unique_id"]

    id_prefix = unique_id.split(".")

    # only generate materializations for models
    if id_prefix[0] != "model":
        return None

    if docs_url:
        # docs link goes first so it is the most prominent entry
        metadata = [
            MetadataEntry.url(url=f"{docs_url}#!/model/{unique_id}", label="docs_url")
        ] + metadata

    return AssetMaterialization(
        description=f"dbt node: {unique_id}",
        metadata_entries=metadata,
        asset_key=asset_key_prefix + id_prefix,
    )
def load_data_to_database_from_spark(context, data_frame):
    """Load *data_frame* into the configured table and report the persistence."""
    table_name = context.solid_config["table_name"]
    context.resources.db_info.load_table(data_frame, table_name)

    db_info = context.resources.db_info
    yield AssetMaterialization(
        asset_key="table:{table_name}".format(table_name=table_name),
        description=(
            "Persisted table {table_name} in database configured in the db_info resource."
        ).format(table_name=table_name),
        metadata_entries=[
            MetadataEntry.text(label="Host", text=db_info.host),
            MetadataEntry.text(label="Db", text=db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name="table_name")
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    """Write the dataframe as parquet at the context-derived path and emit
    row-count and path metadata."""
    path = self._get_path(context)
    if isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    elif isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        obj.to_parquet(path=path, index=False)
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
def raise_for_rpc_error(context: SolidExecutionContext, resp: Response) -> None:
    """Inspect a dbt RPC response and raise or retry on known error codes.

    Transient conditions (currently compiling / runtime / server errors)
    trigger a RetryRequested; other recognized codes raise Failure with
    code-specific metadata entries. Does nothing if the response carries no
    ``error`` field.
    """
    error = resp.json().get("error")
    if error is not None:
        if error["code"] in [
            DBTErrors.project_currently_compiling_error.value,
            DBTErrors.runtime_error.value,
            DBTErrors.server_error.value,
        ]:
            # Transient: log and let Dagster retry the step.
            context.log.warning(error["message"])
            raise RetryRequested(max_retries=5, seconds_to_wait=30)
        elif error["code"] == DBTErrors.project_compile_failure_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                    MetadataEntry.text(
                        text=error["data"]["cause"]["message"], label="RPC Error Cause"
                    ),
                ],
            )
        elif error["code"] == DBTErrors.rpc_process_killed_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                    MetadataEntry.text(text=str(error["data"]["signum"]), label="RPC Signum"),
                    MetadataEntry.text(text=error["data"]["message"], label="RPC Error Message"),
                ],
            )
        elif error["code"] == DBTErrors.rpc_timeout_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                    MetadataEntry.text(text=str(error["data"]["timeout"]), label="RPC Timeout"),
                    MetadataEntry.text(text=error["data"]["message"], label="RPC Error Message"),
                ],
            )
        else:
            # Unknown code: still fail, but only the code is guaranteed present.
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                ],
            )
def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
    """Write *obj* to the Snowflake table named in the output metadata.

    Cleans up any rows for the current partition bounds first (making the
    write idempotent), then dispatches on the dataframe type. Ends by yielding
    the SELECT statement covering what was written.
    """
    schema, table = context.metadata["table"].split(".")

    if context.metadata.get("partitioned") is True:
        partition_bounds = context.resources.partition_bounds
    else:
        partition_bounds = None

    with connect_snowflake(config=self._config, schema=schema) as con:
        con.execute(self._get_cleanup_statement(table, schema, partition_bounds))

    if isinstance(obj, SparkDataFrame):
        yield from self._handle_spark_output(obj, schema, table)
    elif isinstance(obj, PandasDataFrame):
        yield from self._handle_pandas_output(obj, schema, table)
    else:
        raise Exception(
            "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
        )

    select_statement = self._get_select_statement(
        table, schema, context.metadata.get("columns"), partition_bounds
    )
    yield MetadataEntry.text(select_statement, "Query")
def test_raise_on_error_true_type_check_returns_unsuccessful_type_check():
    """A failing TypeCheck raises DagsterTypeCheckDidNotPass by default and
    carries the check's metadata; with raise_on_error=False the run instead
    produces a STEP_FAILURE event."""
    FalsyType = DagsterType(
        name="FalsyType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=False, metadata_entries=[MetadataEntry.text("foo", "bar", "baz")]
        ),
    )

    @solid(output_defs=[OutputDefinition(FalsyType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    with pytest.raises(DagsterTypeCheckDidNotPass) as e:
        execute_pipeline(foo_pipeline)
    # The type-check metadata is surfaced on the raised exception.
    assert e.value.metadata_entries[0].label == "bar"
    assert e.value.metadata_entries[0].entry_data.text == "foo"
    assert e.value.metadata_entries[0].description == "baz"
    assert isinstance(e.value.dagster_type, DagsterType)

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert not pipeline_result.success
    # Exact event sequence: start, output (the value is produced), then the
    # failure from the type check.
    assert [event.event_type_value for event in pipeline_result.step_event_list] == [
        DagsterEventType.STEP_START.value,
        DagsterEventType.STEP_OUTPUT.value,
        DagsterEventType.STEP_FAILURE.value,
    ]
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_FAILURE.value:
            assert event.event_specific_data.error.cls_name == "DagsterTypeCheckDidNotPass"
def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):
    """Capture a dbt CLI failure, attaching parsed and raw output as metadata."""
    entries = [
        MetadataEntry.json(
            {"logs": logs},
            label="Parsed CLI Output (JSON)",
        ),
        MetadataEntry.text(
            DagsterDbtCliRuntimeError.stitch_messages(logs),
            label="Parsed CLI Output (JSON) Message Attributes",
        ),
        MetadataEntry.text(
            raw_output,
            label="Raw CLI Output",
        ),
    ]
    super().__init__(description, entries)
def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
    """
    Here, we run queries against our output Snowflake database tables to add
    additional context (row count and a sample) to our asset materializations.
    """
    table_name = result["unique_id"].split(".")[-1]
    with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
        n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
        sample_rows = pandas.read_sql_query(
            f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
        )
    extra_entries = [
        MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
        MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
    ]
    return super()._get_metadata(result) + extra_entries
def handle_output(self, context, obj: pd.DataFrame):
    """This saves the dataframe as a CSV, writes a sidecar ``.version`` file,
    and emits row-count, path, sample, version, and schema metadata."""
    fpath = self._get_fs_path(context.asset_key)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    obj.to_csv(fpath)
    # Normalize an unset version to the literal "None", matching what is
    # written to the sidecar version file.
    resolved_version = context.version if context.version else "None"
    with open(fpath + ".version", "w") as f:
        f.write(resolved_version)

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.path(fpath, "Path")
    yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
    # BUG FIX: previously passed context.version directly, which may be None;
    # MetadataEntry.text expects a string, so use the same fallback as the
    # version file above.
    yield MetadataEntry.text(resolved_version, "Resolved version")
    yield MetadataEntry.table_schema(
        self.get_schema(context.dagster_type),
        "Schema",
    )
def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
    """Append a pandas DataFrame to a Snowflake table, yielding shape metadata."""
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    with connect_snowflake(config=self._config, schema=schema) as con:
        # Snowflake identifiers are conventionally uppercase.
        uppercased = obj.rename(str.upper, copy=False, axis="columns")
        uppercased.to_sql(
            table,
            con=con,
            if_exists="append",
            index=False,
            method=pd_writer,
        )
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    """Download an S3 object into the file cache unless it is already cached.

    The cache key defaults to the final path segment of the S3 key, and can be
    overridden via the ``file_key`` solid config. Yields an ExpectationResult
    confirming the cached object exists, then the cache's FileHandle.
    """
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        # Stream S3 -> temp file -> cache rather than holding bytes in memory.
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )
            context.log.info("File downloaded to {}".format(tmp_file))
            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info(
                    "File handle written at : {}".format(target_file_handle.path_desc)
                )
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)
        ],
    )
    yield Output(target_file_handle)
def test_explicit_failure():
    """A user-raised Failure in a step should surface its description and
    metadata entries on the step's failure_data."""
    with tempfile.TemporaryDirectory() as tmpdir:
        run_config = {
            "resources": {
                "step_launcher": {"config": {"scratch_dir": tmpdir}},
                "io_manager": {"config": {"base_dir": tmpdir}},
            }
        }
        with instance_for_test() as instance:
            run = execute_pipeline(
                pipeline=reconstructable(_define_failure_job),
                run_config=run_config,
                instance=instance,
                raise_on_error=False,
            )
        fd = run.result_for_solid("retry_op").failure_data
        assert fd.user_failure_data.description == "some failure description"
        assert fd.user_failure_data.metadata_entries == [
            MetadataEntry.float(label="foo", value=1.23)
        ]
def _ge_validation_fn(context, dataset):
    """Validate *dataset* with Great Expectations and report the results.

    NOTE(review): relies on names captured from an enclosing factory scope
    (datasource_name, data_connector_name, data_asset_name,
    runtime_method_type, batch_identifiers, suite_name, extra_kwargs) — they
    are not defined in this block; confirm against the enclosing function.
    """
    data_context = context.resources.ge_data_context

    validator_kwargs = {
        "datasource_name": datasource_name,
        "data_connector_name": data_connector_name,
        "data_asset_name": datasource_name or data_asset_name,
        "runtime_parameters": {runtime_method_type: dataset},
        "batch_identifiers": batch_identifiers,
        "expectation_suite_name": suite_name,
        **extra_kwargs,
    }
    validator = data_context.get_validator(**validator_kwargs)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = validator.validate(run_id=run_id)

    # Render the validation results as markdown for the metadata entry.
    validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_document_content_list = validation_results_page_renderer.render(
        validation_results=results
    )
    md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))

    meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))
    yield ExpectationResult(
        success=bool(results["success"]),
        metadata_entries=[meta_stats],
    )
    yield Output(results.to_json_dict())
def pandera_schema_to_dagster_type(
    schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],
) -> DagsterType:
    """
    Convert a Pandera dataframe schema to a `DagsterType`.

    The generated Dagster type will be given an automatically generated `name`. The schema's `title`
    property, `name` property, or class name (in that order) will be used. If neither `title` or
    `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.

    Additional metadata is also extracted from the Pandera schema and attached to the returned
    `DagsterType` in an `MetadataEntry` object. The extracted metadata includes:

    - Descriptions on the schema and constituent columns and checks.
    - Data types for each column.
    - String representations of all column-wise checks.
    - String representations of all row-wise (i.e. "wide") checks.

    The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type
    check function. Validation is done in `lazy` mode, i.e. pandera will attempt to validate all
    values in the dataframe, rather than stopping on the first error.

    If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:

    - `num_failures` total number of validation errors.
    - `failure_sample` a table containing up to the first 10 validation errors.

    Args:
        schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]): the Pandera schema to convert.

    Returns:
        DagsterType: Dagster Type constructed from the Pandera schema.
    """
    if not (
        isinstance(schema, pa.DataFrameSchema)
        or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))
    ):
        raise TypeError(
            "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"
        )

    # Name resolution order (title, name, class name) is handled by the helper.
    name = _extract_name_from_pandera_schema(schema)
    # SchemaModel classes are normalized into a DataFrameSchema instance.
    norm_schema = (
        schema.to_schema()  # type: ignore[attr-defined]
        if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)
        else schema
    )
    tschema = _pandera_schema_to_table_schema(norm_schema)
    type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=norm_schema.description,
        metadata_entries=[
            MetadataEntry("schema", value=tschema),
        ],
    )
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    """Persist the dataframe as parquet, creating local directories if needed,
    and emit row-count and path metadata."""
    path = self._get_path(context)
    # Only create directories for local (non-URL) base paths.
    if "://" not in self._base_path:
        os.makedirs(os.path.dirname(path), exist_ok=True)

    if isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    elif isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        context.log.info(f"Row count: {row_count}")
        obj.to_parquet(path=path, index=False)
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
def handle_output(self, context, obj):
    """Store *obj* in the in-memory values dict keyed by the output identifier,
    interleaving the two metadata-reporting styles (add_output_metadata calls
    and yielded MetadataEntry objects).
    """
    keys = tuple(context.get_output_identifier())
    self.values[keys] = obj

    context.add_output_metadata({"foo": "bar"})
    yield MetadataEntry("baz", value="baz")
    context.add_output_metadata({"bar": "bar"})

    # NOTE(review): `materialization` is not defined in this block; presumably
    # a module-level fixture in the surrounding test file — confirm.
    yield materialization
def __init__(self, invalid_line_nos: List[int]):
    """Error raised when the dbt CLI emits output lines that cannot be parsed."""
    check.list_param(invalid_line_nos, "invalid_line_nos", int)
    joined_line_nos = ", ".join(str(no) for no in invalid_line_nos)
    description = f"dbt CLI emitted unexpected output on lines {joined_line_nos}"
    entries = [
        MetadataEntry.json({"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers")
    ]
    super().__init__(description, entries)
    self.invalid_line_nos = invalid_line_nos
def should_fail(_):
    """Unconditionally raise a Failure carrying one sample metadata entry."""
    entry = MetadataEntry.text(label="label", text="text", description="description")
    raise Failure(description="Foolure", metadata_entries=[entry])
def df_type_check(_, value):
    """Type check asserting *value* is a dask DataFrame, with column metadata."""
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    # Columns are stringified since they may be things like datetimes.
    columns = [str(col) for col in value.columns]
    return TypeCheck(
        success=True,
        metadata_entries=[MetadataEntry.json({"columns": columns}, "metadata")],
    )
def handle_output(self, context, obj):
    """Pickle *obj* and upload it to S3, keyed by the asset's final path segment."""
    key = context.asset_key.path[-1]
    bucket = context.resource_config["bucket"]

    context.log.debug("about to pickle object")
    payload = pickle.dumps(obj)
    yield MetadataEntry.int(len(payload), "Bytes")

    client = s3_client()
    context.log.debug("created S3 client")
    client.put_object(Bucket=bucket, Key=key, Body=payload)