# Imports assumed by this snippet:
from typing import Any, Mapping, Optional

from dagster import MetadataValue, TableColumn, TableSchema


# Converts a dbt-style mapping of column names to column metadata into a Dagster
# metadata dict carrying a TableSchema, or None if there are no columns.
def _columns_to_metadata(columns: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:
    return (
        {
            "schema": MetadataValue.table_schema(
                TableSchema(
                    columns=[
                        TableColumn(
                            name=name,
                            type=metadata.get("data_type") or "?",
                            description=metadata.get("description"),
                        )
                        for name, metadata in columns.items()
                    ]
                )
            )
        }
        if len(columns) > 0
        else None
    )
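# Usage sketch (hypothetical inputs, not from the original source), showing both
# branches: a populated mapping yields a {"schema": ...} metadata dict, while an
# empty mapping yields None. Columns with no recorded data_type fall back to "?".
example = _columns_to_metadata(
    {
        "id": {"data_type": "integer", "description": "Primary key."},
        "name": {"description": "No data_type recorded, so type becomes '?'."},
    }
)
assert example is not None and "schema" in example
assert _columns_to_metadata({}) is None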
# Emits an AssetMaterialization exercising every metadata value type, using the
# generic MetadataEntry(label, value=...) constructor. (LONG_INT is a constant
# defined elsewhere in the original test module.)
def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry("text", value="text is cool"),
            MetadataEntry("url", value=MetadataValue.url("https://bigty.pe/neato")),
            MetadataEntry("path", value=MetadataValue.path("/tmp/awesome")),
            MetadataEntry("json", value={"is_dope": True}),
            MetadataEntry("python class", value=MetadataValue.python_artifact(MetadataEntry)),
            MetadataEntry(
                "python function", value=MetadataValue.python_artifact(file_relative_path)
            ),
            MetadataEntry("float", value=1.2),
            MetadataEntry("int", value=1),
            MetadataEntry("float NaN", value=float("nan")),
            MetadataEntry("long int", value=LONG_INT),
            MetadataEntry("pipeline run", value=MetadataValue.pipeline_run("fake_run_id")),
            MetadataEntry("my asset", value=AssetKey("my_asset")),
            MetadataEntry(
                "table",
                value=MetadataValue.table(
                    records=[
                        TableRecord(foo=1, bar=2),
                        TableRecord(foo=3, bar=4),
                    ],
                ),
            ),
            MetadataEntry(
                "table_schema",
                value=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)
# Legacy variant of the same materialization, using EventMetadataEntry's static
# constructors.
def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
            EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"),
            EventMetadataEntry.python_artifact(file_relative_path, "python function"),
            EventMetadataEntry.float(1.2, "float"),
            EventMetadataEntry.int(1, "int"),
            EventMetadataEntry.float(float("nan"), "float NaN"),
            EventMetadataEntry.int(LONG_INT, "long int"),
            EventMetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            EventMetadataEntry.asset(AssetKey("my_asset"), "my asset"),
            EventMetadataEntry.table(
                label="table",
                records=[
                    TableRecord(foo=1, bar=2),
                    TableRecord(foo=3, bar=4),
                ],
            ),
            EventMetadataEntry.table_schema(
                label="table_schema",
                schema=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)
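# Note on the two `materialize` variants above: they build the same
# materialization. The first uses the generic MetadataEntry(label, value=...)
# constructor with MetadataValue wrappers; the second uses the legacy
# EventMetadataEntry static constructors (.text, .url, .fspath, .json, ...),
# which take the label as their last positional argument.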
# Integration-style test for build_airbyte_assets: mocks the Airbyte REST API
# with the `responses` library and checks that a sync run yields the expected
# materializations and metadata. (In the original module this is presumably
# decorated with @responses.activate and parametrized over schema_prefix.)
def test_assets(schema_prefix):
    ab_resource = airbyte_resource(
        build_init_resource_context(
            config={
                "host": "some_host",
                "port": "8000",
            }
        )
    )
    destination_tables = ["foo", "bar"]
    if schema_prefix:
        destination_tables = [schema_prefix + t for t in destination_tables]
    ab_assets = build_airbyte_assets(
        "12345",
        destination_tables=destination_tables,
        asset_key_prefix=["some", "prefix"],
    )
    assert ab_assets[0].asset_keys == {
        AssetKey(["some", "prefix", t]) for t in destination_tables
    }
    assert len(ab_assets[0].op.output_defs) == 2

    # Stub out the three Airbyte API calls the sync makes.
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json=get_sample_connection_json(prefix=schema_prefix),
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {"id": 1}},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json=get_sample_job_json(schema_prefix=schema_prefix),
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte": airbyte_resource.configured(
                {
                    "host": "some_host",
                    "port": "8000",
                }
            )
        },
    )
    res = ab_job.execute_in_process()
    materializations = [
        event.event_specific_data.materialization
        for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    # The sample job JSON evidently reports a third stream ("baz") beyond the two
    # declared destination tables, so three materializations come back.
    assert len(materializations) == 3
    assert {m.asset_key for m in materializations} == {
        AssetKey(["some", "prefix", schema_prefix + "foo"]),
        AssetKey(["some", "prefix", schema_prefix + "bar"]),
        AssetKey(["some", "prefix", schema_prefix + "baz"]),
    }
    assert MetadataEntry("bytesEmitted", value=1234) in materializations[0].metadata_entries
    assert MetadataEntry("recordsCommitted", value=4321) in materializations[0].metadata_entries
    assert (
        MetadataEntry(
            "schema",
            value=TableSchema(
                columns=[
                    TableColumn(name="a", type="str"),
                    TableColumn(name="b", type="int"),
                ]
            ),
        )
        in materializations[0].metadata_entries
    )
# Maps a pandera DataFrameSchema onto Dagster's TableSchema: schema-wide checks
# become table-level constraints, and each pandera column becomes a TableColumn.
def _pandera_schema_to_table_schema(schema: pa.DataFrameSchema) -> TableSchema:
    df_constraints = _pandera_schema_wide_checks_to_table_constraints(schema.checks)
    columns = [_pandera_column_to_table_column(col) for col in schema.columns.values()]
    return TableSchema(columns=columns, constraints=df_constraints)
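# Usage sketch (hypothetical schema; assumes the sibling helpers
# _pandera_column_to_table_column and _pandera_schema_wide_checks_to_table_constraints
# defined elsewhere in this module are in scope):
import pandera as pa

example_schema = pa.DataFrameSchema(
    {
        "price": pa.Column(float, checks=pa.Check.ge(0)),
        "ticker": pa.Column(str),
    }
)
table_schema = _pandera_schema_to_table_schema(example_schema)
assert [col.name for col in table_schema.columns] == ["price", "ticker"]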
    return type_check_fn


PANDERA_FAILURE_CASES_SCHEMA = TableSchema(
    columns=[
        TableColumn(
            name="schema_context",
            type="string",
            description="`Column` for column-wise checks, or `DataFrameSchema`",
        ),
        TableColumn(
            name="column",
            type="string",
            description="Column of value that failed the check, or `None` for wide checks.",
        ),
        TableColumn(
            name="check",
            type="string",
            description="Description of the failed Pandera check.",
        ),
        TableColumn(name="check_number", description="Index of the failed check."),
        TableColumn(
            name="failure_case",
            type="number | string",
            description="Value that failed a check.",
        ),
        TableColumn(
            name="index",
            type="number | string",
            description="Index (row) of value that failed a check.",
        ),
    ]
)


def _pandera_errors_to_type_check(