def _expect_all_resources_in_result(
    resources: ResourcesDataFrame, result: UtilizationDataFrame
) -> ExpectationResult:
    """Check that every input resource id appears in the utilization result.

    Succeeds iff no ids are missing; the missing ids are attached as a metadata
    entry when the check fails.
    """
    ids_in = set(resources.resource_id)
    ids_out = set(result.resource_id)
    not_found = ids_in - ids_out
    n_missing = len(not_found)
    metadata = [
        EventMetadataEntry.json(
            {'input': len(ids_in), 'output': len(ids_out), 'missing': n_missing},
            label='Summary Counts',
        )
    ]
    if n_missing > 0:
        metadata.append(
            EventMetadataEntry.json({'ids': list(not_found)}, label='Missing Resources')
        )
    return ExpectationResult(
        success=(n_missing == 0),
        label='Found All Resources',
        description='Check if all the input resource ids were found in the Azure Monitor Logs workspace.',
        metadata_entries=metadata,
    )
def check_users_and_groups_one_fails_one_succeeds(_context):
    """Yield one passing expectation (users) and one failing expectation (groups)."""
    user_columns = {
        'name': {'nulls': 0, 'empty': 0, 'values': 123, 'average_length': 3.394893},
        'time_created': {'nulls': 1, 'empty': 2, 'values': 120, 'average': 1231283},
    }
    yield ExpectationResult(
        success=True,
        label='user_expectations',
        description='Battery of expectations for user',
        metadata_entries=[
            EventMetadataEntry.json(label='table_summary', data={'columns': user_columns})
        ],
    )
    group_columns = {
        'name': {'nulls': 1, 'empty': 0, 'values': 122, 'average_length': 3.394893},
        'time_created': {'nulls': 1, 'empty': 2, 'values': 120, 'average': 1231283},
    }
    yield ExpectationResult(
        success=False,
        label='groups_expectations',
        description='Battery of expectations for groups',
        metadata_entries=[
            EventMetadataEntry.json(label='table_summary', data={'columns': group_columns})
        ],
    )
def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:
    """Union Q2 monthly flight data, subsample it, and join to master cord data.

    Yields two ExpectationResults (required columns present; identical column
    sets across months) and finally an Output of the joined, lower-cased frame.
    """
    monthly = {'april': april_data, 'may': may_data, 'june': june_data}
    missing = []
    for column in ['DestAirportSeqID', 'OriginAirportSeqID']:
        for month, frame in monthly.items():
            if column not in frame.columns:
                missing.append({'month': month, 'missing_column': column})
    yield ExpectationResult(
        success=not bool(missing),
        label='airport_ids_present',
        description='Sequence IDs present in incoming monthly flight data.',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'missing_columns': missing})
        ],
    )
    same_shape = set(april_data.columns) == set(may_data.columns) == set(june_data.columns)
    yield ExpectationResult(
        success=same_shape,
        label='flight_data_same_shape',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'columns': april_data.columns})
        ],
    )
    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config['subsample_pct'] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView('q2_data')
    # Register DEST_- and ORIGIN_-prefixed copies of the master cord data so the
    # SQL below can join both ends of each flight against airport metadata.
    dest_cords = do_prefix_column_names(master_cord_data, 'DEST_')
    dest_cords.createOrReplaceTempView('dest_cord_data')
    origin_cords = do_prefix_column_names(master_cord_data, 'ORIGIN_')
    origin_cords.createOrReplaceTempView('origin_cord_data')
    full_data = context.resources.spark.sql(
        '''
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data
            ON q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        '''
    )
    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:
    """Union Q2 monthly flight data, subsample it, and join to master cord data.

    Yields two ExpectationResults (required columns present; identical column
    sets across months) and finally an Output of the joined, lower-cased frame.
    """
    monthly = {"april": april_data, "may": may_data, "june": june_data}
    missing = []
    for column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, frame in monthly.items():
            if column not in frame.columns:
                missing.append({"month": month, "missing_column": column})
    yield ExpectationResult(
        success=not bool(missing),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            EventMetadataEntry.json(label="metadata", data={"missing_columns": missing})
        ],
    )
    same_shape = set(april_data.columns) == set(may_data.columns) == set(june_data.columns)
    yield ExpectationResult(
        success=same_shape,
        label="flight_data_same_shape",
        metadata_entries=[
            EventMetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )
    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")
    # Register DEST_- and ORIGIN_-prefixed copies of the master cord data so the
    # SQL below can join both ends of each flight against airport metadata.
    dest_cords = do_prefix_column_names(master_cord_data, "DEST_")
    dest_cords.createOrReplaceTempView("dest_cord_data")
    origin_cords = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_cords.createOrReplaceTempView("origin_cord_data")
    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data
            ON q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )
    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
def check_users_and_groups_one_fails_one_succeeds(_context):
    """Yield one passing expectation (users) and one failing expectation (groups)."""
    user_columns = {
        "name": {"nulls": 0, "empty": 0, "values": 123, "average_length": 3.394893},
        "time_created": {"nulls": 1, "empty": 2, "values": 120, "average": 1231283},
    }
    yield ExpectationResult(
        success=True,
        label="user_expectations",
        description="Battery of expectations for user",
        metadata_entries=[
            EventMetadataEntry.json(label="table_summary", data={"columns": user_columns})
        ],
    )
    group_columns = {
        "name": {"nulls": 1, "empty": 0, "values": 122, "average_length": 3.394893},
        "time_created": {"nulls": 1, "empty": 2, "values": 120, "average": 1231283},
    }
    yield ExpectationResult(
        success=False,
        label="groups_expectations",
        description="Battery of expectations for groups",
        metadata_entries=[
            EventMetadataEntry.json(label="table_summary", data={"columns": group_columns})
        ],
    )
def event_metadata_entries(metadata_entry_datas):
    """Rehydrate EventMetadataEntry objects from their serialized dict form.

    Yields one entry per input dict, dispatching on ``__typename``; raises via
    ``check.not_implemented`` for unrecognized typenames.
    """
    if not metadata_entry_datas:
        return
    for entry in metadata_entry_datas:
        typename = entry['__typename']
        label = entry['label']
        description = entry.get('description')
        if typename == 'EventPathMetadataEntry':
            yield EventMetadataEntry.path(
                label=label, description=description, path=entry['path']
            )
        elif typename == 'EventJsonMetadataEntry':
            # NOTE(review): a missing 'jsonString' key yields json.loads(''),
            # which raises — presumably the key is always present upstream.
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=json.loads(entry.get('jsonString', '')),
            )
        elif typename == 'EventTextMetadataEntry':
            yield EventMetadataEntry.text(
                label=label, description=description, text=entry['text']
            )
        elif typename == 'EventUrlMetadataEntry':
            yield EventMetadataEntry.url(
                label=label, description=description, url=entry['url']
            )
        else:
            check.not_implemented('TODO for type {}'.format(typename))
def dbt_cli_compile(context) -> Dict:
    """This solid executes ``dbt compile`` via the dbt CLI."""
    # Flags forwarded verbatim from solid config to the dbt CLI.
    passthrough = (
        "parse-only",
        "threads",
        "no-version-check",
        "models",
        "exclude",
        "selector",
        "state",
        "full-refresh",
    )
    cli_output = execute_cli(
        context.solid_config["dbt_executable"],
        command=("compile",),
        flags_dict=passthrough_flags_only(context.solid_config, passthrough),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )
    if context.solid_config["yield_materializations"]:
        yield AssetMaterialization(
            asset_key="dbt_compile_cli_output",
            description="Output from the CLI execution of `dbt compile`.",
            metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],
        )
    yield Output(cli_output)
def dbt_cli_test(context) -> DbtCliOutput:
    """This solid executes ``dbt test`` via the dbt CLI."""
    raw_output = execute_cli(
        context.solid_config["dbt_executable"],
        command=("test",),
        flags_dict=passthrough_flags_only(
            context.solid_config,
            ("data", "schema", "fail-fast", "threads", "models", "exclude"),
        ),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )
    # Merge the run_results.json artifact over the raw CLI output; CLI keys win.
    run_results = parse_run_results(context.solid_config["project-dir"])
    cli_output = {**run_results, **raw_output}
    if context.solid_config["yield_materializations"]:
        yield AssetMaterialization(
            asset_key="dbt_test_cli_output",
            description="Output from the CLI execution of `dbt test`.",
            metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],
        )
    yield Output(DbtCliOutput.from_dict(cli_output))
def dbt_cli_run(context) -> DbtCliOutput:
    """This solid executes ``dbt run`` via the dbt CLI."""
    from ..utils import generate_materializations

    raw_output = execute_cli(
        context.solid_config["dbt_executable"],
        command=("run",),
        flags_dict=passthrough_flags_only(
            context.solid_config,
            ("threads", "models", "exclude", "full-refresh", "fail-fast"),
        ),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )
    # Merge the run_results.json artifact over the raw CLI output; CLI keys win.
    run_results = parse_run_results(context.solid_config["project-dir"])
    cli_output_dict = {**run_results, **raw_output}
    cli_output = DbtCliOutput.from_dict(cli_output_dict)
    if context.solid_config["yield_materializations"]:
        # One materialization per dbt model, plus one for the CLI output itself.
        yield from generate_materializations(cli_output)
        yield AssetMaterialization(
            asset_key="dbt_run_cli_output",
            description="Output from the CLI execution of `dbt run`.",
            metadata_entries=[EventMetadataEntry.json(cli_output_dict, label="CLI Output")],
        )
    yield Output(cli_output)
def ge_validation_solid(context, pandas_df):
    """Run the configured Great Expectations suite against *pandas_df*.

    Yields an ExpectationResult summarizing the validation run, then an Output
    of the (JSON-serializable) validation result dict.
    """
    data_context = context.resources.ge_data_context
    suite = data_context.get_expectation_suite(suite_name)
    batch = data_context.get_batch(
        {"dataset": pandas_df, "datasource": datasource_name}, suite
    )
    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    constraint_metadata = EventMetadataEntry.json(
        {'overall': res['statistics'], 'individual': res['results']},
        'constraint-metadata',
    )
    yield ExpectationResult(success=res["success"], metadata_entries=[constraint_metadata])
    yield Output(res)
def _generate_materializations(dro: DbtRpcOutput) -> Iterator[AssetMaterialization]:
    """Yields ``AssetMaterializations`` for metadata in the dbt RPC ``DbtRpcOutput``.

    Only ``model``/``snapshot`` nodes that did not fail, skip, or error produce
    a materialization. Node attributes and per-step ("execute"/"compile")
    timings are attached as metadata entries.
    """

    def _timing_entries(step_timing, phase):
        # The "execute" and "compile" step timings report the identical
        # started/completed/duration triple; only the label prefix differs.
        # (Previously this block was duplicated verbatim for each phase.)
        return [
            EventMetadataEntry.text(
                text=step_timing.started_at.isoformat(timespec="seconds"),
                label="{} Started At".format(phase),
            ),
            EventMetadataEntry.text(
                text=step_timing.completed_at.isoformat(timespec="seconds"),
                label="{} Completed At".format(phase),
            ),
            EventMetadataEntry.text(
                text=str(step_timing.duration), label="{} Duration".format(phase)
            ),
        ]

    for node_result in dro.result.results:
        if node_result.node["resource_type"] not in ["model", "snapshot"]:
            continue
        success = not node_result.fail and not node_result.skip and not node_result.error
        if not success:
            continue
        entries = [
            EventMetadataEntry.json(data=node_result.node, label="Node"),
            EventMetadataEntry.text(text=str(node_result.status), label="Status"),
            EventMetadataEntry.text(
                text=str(node_result.execution_time), label="Execution Time"
            ),
            EventMetadataEntry.text(
                text=node_result.node["config"]["materialized"],
                label="Materialization Strategy",
            ),
            EventMetadataEntry.text(text=node_result.node["database"], label="Database"),
            EventMetadataEntry.text(text=node_result.node["schema"], label="Schema"),
            EventMetadataEntry.text(text=node_result.node["alias"], label="Alias"),
            EventMetadataEntry.text(
                text=node_result.node["description"], label="Description"
            ),
        ]
        for step_timing in node_result.step_timings:
            if step_timing.name == "execute":
                entries.extend(_timing_entries(step_timing, "Execution"))
            if step_timing.name == "compile":
                entries.extend(_timing_entries(step_timing, "Compilation"))
        yield AssetMaterialization(
            description="A materialized node within the dbt graph.",
            metadata_entries=entries,
            asset_key=node_result.node["unique_id"],
        )
def expect_column_to_be_integers(
    data_frame: LessSimpleDataFrame, column_name: str
) -> ExpectationResult:
    """Succeed iff every value of *column_name* in *data_frame* is an int.

    Each offending (index, value) pair is reported as a json metadata entry.
    """
    offenders = []
    for row_num in range(len(data_frame)):
        row = data_frame[row_num]
        if not isinstance(row[column_name], int):
            offenders.append((row_num, str(row[column_name])))
    bad_value_entries = [
        EventMetadataEntry.json(
            {'index': row_num, 'bad_value': bad},
            'bad_value',
            'Bad value in column {column_name}'.format(column_name=column_name),
        )
        for (row_num, bad) in offenders
    ]
    return ExpectationResult(
        success=(not offenders),
        label='col_{column_name}_is_int'.format(column_name=column_name),
        description=(
            'Check whether type of column {column_name} in '
            'LessSimpleDataFrame is int'
        ).format(column_name=column_name),
        metadata_entries=bad_value_entries,
    )
def _base_compute(context):
    """Toy solid body: sleep, maybe fail, then emit a configurable materialization.

    Sleeps for ``sleep`` seconds and raises with probability ``error_rate``.
    When a ``materialization_key_list`` or ``materialization_key`` is configured,
    yields an AssetMaterialization whose metadata entries are driven by the
    ``materialization_*`` config fields. Always yields ``Output(1)`` on success.
    """
    time.sleep(context.solid_config["sleep"])
    if random() < context.solid_config["error_rate"]:
        raise Exception("blah")
    asset_key = None
    # A key list takes precedence over a single key.
    if context.solid_config.get("materialization_key_list") is not None:
        asset_key = AssetKey(context.solid_config.get("materialization_key_list"))
    elif context.solid_config.get("materialization_key") is not None:
        asset_key = AssetKey(context.solid_config.get("materialization_key"))
    if asset_key:
        metadata_entries = []
        if context.solid_config.get("materialization_text") is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get("materialization_text"),
                    context.solid.name,
                )
            )
        if context.solid_config.get("materialization_url") is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get("materialization_url"),
                    context.solid.name,
                )
            )
        if context.solid_config.get("materialization_path") is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    # BUG FIX: previously read "materialization_url" here, so the
                    # configured path value was silently ignored.
                    context.solid_config.get("materialization_path"),
                    context.solid.name,
                )
            )
        if context.solid_config.get("materialization_json") is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get("materialization_json"),
                    context.solid.name,
                )
            )
        if context.solid_config.get("materialization_value") is not None:
            # A configured float value intentionally REPLACES (not extends) any
            # previously collected entries.
            metadata_entries = [
                EventMetadataEntry.float(
                    context.solid_config.get("materialization_value"),
                    context.solid.name,
                )
            ]
        if len(metadata_entries) == 0:
            metadata_entries = None
        yield AssetMaterialization(
            asset_key=asset_key,
            metadata_entries=metadata_entries,
        )
    yield Output(1)
def create_expectation_result(label, ge_evr):
    """Wrap a Great Expectations validation-result dict in an ExpectationResult."""
    check.dict_param(ge_evr, 'ge_evr', key_type=str)
    check.param_invariant('success' in ge_evr, 'ge_evr')
    evr_entry = EventMetadataEntry.json(ge_evr, label='evr')
    return ExpectationResult(
        success=ge_evr['success'], label=label, metadata_entries=[evr_entry]
    )
def emit_failed_expectation(_context):
    """Yield a single always-failing expectation with a json reason entry."""
    reason_entry = EventMetadataEntry.json(
        label='data', data={'reason': 'Relentless pessimism.'}
    )
    yield ExpectationResult(
        success=False,
        label='always_false',
        description='Failure',
        metadata_entries=[reason_entry],
    )
def emit_failed_expectation(_context):
    """Yield a single always-failing expectation with a json reason entry."""
    reason_entry = EventMetadataEntry.json(
        label="data", data={"reason": "Relentless pessimism."}
    )
    yield ExpectationResult(
        success=False,
        label="always_false",
        description="Failure",
        metadata_entries=[reason_entry],
    )
def emit_successful_expectation(_context):
    """Yield a single always-passing expectation with a json reason entry."""
    reason_entry = EventMetadataEntry.json(label='data', data={'reason': 'Just because.'})
    yield ExpectationResult(
        success=True,
        label='always_true',
        description='Successful',
        metadata_entries=[reason_entry],
    )
def emit_successful_expectation(_context):
    """Yield a single always-passing expectation with a json reason entry."""
    reason_entry = EventMetadataEntry.json(label="data", data={"reason": "Just because."})
    yield ExpectationResult(
        success=True,
        label="always_true",
        description="Successful",
        metadata_entries=[reason_entry],
    )
def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):
    """Attach both parsed and raw dbt CLI output as metadata on the error."""
    parsed_entry = EventMetadataEntry.json({"logs": logs}, label="Parsed CLI Output (JSON)")
    messages_entry = EventMetadataEntry.text(
        DagsterDbtCliRuntimeError.stitch_messages(logs),
        label="Parsed CLI Output (JSON) Message Attributes",
    )
    raw_entry = EventMetadataEntry.text(raw_output, label="Raw CLI Output")
    super().__init__(description, [parsed_entry, messages_entry, raw_entry])
def materialize_one(_):
    """Materialize *asset_key* with one text, one json, and one float entry."""
    entries = [
        EventMetadataEntry.text("hello", "text"),
        EventMetadataEntry.json({"hello": "world"}, "json"),
        EventMetadataEntry.float(1.0, "one"),
    ]
    yield AssetMaterialization(asset_key=asset_key, metadata_entries=entries)
    yield Output(1)
def __init__(self, invalid_line_nos: List[int]):
    """Record which dbt CLI output lines could not be parsed."""
    check.list_param(invalid_line_nos, "invalid_line_nos", int)
    line_nos_str = ", ".join(str(no) for no in invalid_line_nos)
    super().__init__(
        f"dbt CLI emitted unexpected output on lines {line_nos_str}",
        [
            EventMetadataEntry.json(
                {"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers"
            )
        ],
    )
    self.invalid_line_nos = invalid_line_nos
def materialize_one(_):
    """Materialize *asset_key* with one text, one json, and one float entry."""
    entries = [
        EventMetadataEntry.text('hello', 'text'),
        EventMetadataEntry.json({'hello': 'world'}, 'json'),
        EventMetadataEntry.float(1.0, 'one'),
    ]
    yield AssetMaterialization(asset_key=asset_key, metadata_entries=entries)
    yield Output(1)
def df_type_check(_, value):
    """TypeCheck for dask DataFrames, attaching the column names as metadata."""
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    # string cast columns since they may be things like datetime
    columns_entry = EventMetadataEntry.json(
        {"columns": [str(col) for col in value.columns]}, "metadata"
    )
    return TypeCheck(success=True, metadata_entries=[columns_entry])
def convert_to_metadata(self):
    """Serialize this constraint violation as a 'constraint-metadata' json entry."""
    payload = {
        'constraint_name': self.constraint_name,
        'constraint_description': self.constraint_description,
        'expected': self.expectation,
        'offending': self.offending,
        'actual': self.actual,
    }
    return EventMetadataEntry.json(payload, 'constraint-metadata')
def df_type_check(_, value):
    """TypeCheck for pandas DataFrames, attaching row count and columns as metadata."""
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    row_count_entry = EventMetadataEntry.text(
        str(len(value)), 'row_count', 'Number of rows in DataFrame'
    )
    # string cast columns since they may be things like datetime
    columns_entry = EventMetadataEntry.json(
        {'columns': [str(col) for col in value.columns]}, 'metadata'
    )
    return TypeCheck(success=True, metadata_entries=[row_count_entry, columns_entry])
def convert_to_metadata(self):
    """Serialize this constraint violation as a 'constraint-metadata' json entry."""
    payload = {
        "constraint_name": self.constraint_name,
        "constraint_description": self.constraint_description,
        "expected": self.expectation,
        "offending": self.offending,
        "actual": self.actual,
    }
    return EventMetadataEntry.json(payload, "constraint-metadata")
def materialize(_):
    """Yield a materialization exercising every metadata entry type, then Output(None)."""
    entries = [
        EventMetadataEntry.text('text is cool', 'text'),
        EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
        EventMetadataEntry.fspath('/tmp/awesome', 'path'),
        EventMetadataEntry.json({'is_dope': True}, 'json'),
    ]
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=entries,
    )
    yield Output(None)
def _base_compute(context):
    """Toy solid body: sleep, maybe fail, then emit a configurable Materialization.

    Sleeps for ``sleep`` seconds and raises with probability ``error_rate``.
    When ``materialization_key`` is configured, yields a Materialization whose
    metadata entries are driven by the ``materialization_*`` config fields.
    Always yields ``Output(1)`` on success.
    """
    time.sleep(context.solid_config['sleep'])
    if random() < context.solid_config['error_rate']:
        raise Exception('blah')
    if context.solid_config.get('materialization_key') is not None:
        metadata_entries = []
        if context.solid_config.get('materialization_text') is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get('materialization_text'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_url') is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get('materialization_url'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_path') is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    # BUG FIX: previously read 'materialization_url' here, so the
                    # configured path value was silently ignored.
                    context.solid_config.get('materialization_path'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_json') is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get('materialization_json'),
                    context.solid.name,
                )
            )
        if context.solid_config.get('materialization_value') is not None:
            # A configured float value intentionally REPLACES (not extends) any
            # previously collected entries.
            metadata_entries = [
                EventMetadataEntry.float(
                    context.solid_config.get('materialization_value'),
                    context.solid.name,
                )
            ]
        if len(metadata_entries) == 0:
            metadata_entries = None
        yield Materialization(
            label=context.solid.name,
            asset_key=context.solid_config.get('materialization_key'),
            metadata_entries=metadata_entries,
        )
    yield Output(1)
def materialization_and_expectation(_context):
    """Yield a materialization covering every metadata type, two expectations, and Output(True)."""
    entries = [
        EventMetadataEntry.text("text is cool", "text"),
        EventMetadataEntry.url("https://bigty.pe/neato", "url"),
        EventMetadataEntry.fspath("/tmp/awesome", "path"),
        EventMetadataEntry.json({"is_dope": True}, "json"),
    ]
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=entries,
    )
    yield ExpectationResult(success=True, label="row_count", description="passed")
    yield ExpectationResult(True)
    yield Output(True)
def materialization_and_expectation(_context):
    """Yield a materialization covering every metadata type, two expectations, and Output(True)."""
    entries = [
        EventMetadataEntry.text('text is cool', 'text'),
        EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
        EventMetadataEntry.fspath('/tmp/awesome', 'path'),
        EventMetadataEntry.json({'is_dope': True}, 'json'),
    ]
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=entries,
    )
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)