def _id_range_for_time(start: int, end: int, hn_client):
    check.invariant(end >= start, "End time comes before start time")

    def _get_item_timestamp(item_id):
        item = hn_client.fetch_item_by_id(item_id)
        return item["time"]

    max_item_id = hn_client.fetch_max_item_id()

    # declared by resource to allow testability against snapshot
    min_item_id = hn_client.min_item_id()

    start_id = binary_search_nearest_left(_get_item_timestamp, min_item_id, max_item_id, start)
    end_id = binary_search_nearest_right(_get_item_timestamp, min_item_id, max_item_id, end)

    start_timestamp = str(datetime.fromtimestamp(_get_item_timestamp(start_id), tz=timezone.utc))
    end_timestamp = str(datetime.fromtimestamp(_get_item_timestamp(end_id), tz=timezone.utc))

    metadata_entries = [
        MetadataEntry.int(value=max_item_id, label="max_item_id"),
        MetadataEntry.int(value=start_id, label="start_id"),
        MetadataEntry.int(value=end_id, label="end_id"),
        MetadataEntry.int(value=end_id - start_id, label="items"),
        MetadataEntry.text(text=start_timestamp, label="start_timestamp"),
        MetadataEntry.text(text=end_timestamp, label="end_timestamp"),
    ]

    id_range = (start_id, end_id)
    return id_range, metadata_entries
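# A sketch (not copied from the original source) of how _id_range_for_time might be consumed
# from an op: the "hn_client" resource key and the op's input arguments are assumptions, and
# the metadata entries are attached to the yielded Output so they appear on the Handled Output
# event.
from dagster import Output, op


@op(required_resource_keys={"hn_client"})
def id_range_for_time(context, start: int, end: int):
    id_range, metadata_entries = _id_range_for_time(start, end, context.resources.hn_client)
    yield Output(id_range, metadata_entries=metadata_entries)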
def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
    schema, table = DB_SCHEMA, context.asset_key.path[-1]

    time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
    with connect_snowflake(config=self._config, schema=schema) as con:
        con.execute(self._get_cleanup_statement(table, schema, time_window))

    if isinstance(obj, SparkDataFrame):
        yield from self._handle_spark_output(obj, schema, table)
    elif isinstance(obj, PandasDataFrame):
        yield from self._handle_pandas_output(obj, schema, table)
    elif obj is None:  # dbt
        config = dict(SHARED_SNOWFLAKE_CONF)
        config["schema"] = DB_SCHEMA
        with connect_snowflake(config=config) as con:
            df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
            # fetchone() returns a row tuple, so take the first element to get the count
            num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()[0]

        yield MetadataEntry.md(df.to_markdown(), "Data sample")
        yield MetadataEntry.int(num_rows, "Rows")
    else:
        raise Exception(
            "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
        )

    yield MetadataEntry.text(
        self._get_select_statement(table, schema, None, time_window),
        "Query",
    )
def handle_output(self, context, obj):
    table_name = context.name
    write_dataframe_to_table(name=table_name, dataframe=obj)

    # attach these to the Handled Output event
    yield MetadataEntry.int(len(obj), label="number of rows")
    yield MetadataEntry.text(table_name, label="table name")
def handle_output(self, context, obj):
    file_path = os.path.join("my_base_dir", context.step_key, context.name)
    obj.to_csv(file_path)

    yield MetadataEntry.int(obj.shape[0], label="number of rows")
    yield MetadataEntry.float(obj["some_column"].mean(), "some_column mean")
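# A small sketch of how a metadata-yielding handle_output can be exercised directly in a test,
# assuming it lives on a hypothetical PandasCsvIOManager class; build_output_context fakes the
# OutputContext, and draining the generator with list() exposes the yielded entries for
# assertions.
import pandas as pd
from dagster import build_output_context


def test_handle_output_yields_metadata():
    manager = PandasCsvIOManager()  # hypothetical class hosting the handle_output above
    context = build_output_context(step_key="some_step", name="result")
    df = pd.DataFrame({"some_column": [1.0, 2.0, 3.0]})

    # note: the handler above does not create directories, so my_base_dir/some_step must exist
    entries = list(manager.handle_output(context, df))

    assert [entry.label for entry in entries] == ["number of rows", "some_column mean"]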
def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
            MetadataEntry.table(
                label="table",
                records=[
                    TableRecord(foo=1, bar=2),
                    TableRecord(foo=3, bar=4),
                ],
            ),
            MetadataEntry.table_schema(
                label="table_schema",
                schema=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)
def handle_output(self, context, obj):
    key = context.asset_key.path[-1]
    bucket = context.resource_config["bucket"]

    context.log.debug("about to pickle object")
    pickled_obj = pickle.dumps(obj)
    yield MetadataEntry.int(len(pickled_obj), "Bytes")
    client = s3_client()
    context.log.debug("created S3 client")
    client.put_object(Bucket=bucket, Key=key, Body=pickled_obj)
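# A possible load_input counterpart for the S3 handler above (a sketch, not the library's
# actual implementation): it re-derives the same bucket and key from the upstream output's
# context and unpickles the stored bytes.
def load_input(self, context):
    key = context.upstream_output.asset_key.path[-1]
    bucket = context.resource_config["bucket"]
    client = s3_client()
    body = client.get_object(Bucket=bucket, Key=key)["Body"].read()
    return pickle.loads(body)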
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
        ],
    )
    yield Output(None)
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    path = self._get_path(context)
    if isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        obj.to_parquet(path=path, index=False)
    elif isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
def handle_output(self, context, obj: pd.DataFrame):
    """This saves the dataframe as a CSV."""
    fpath = self._get_fs_path(context.asset_key)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    obj.to_csv(fpath)
    with open(fpath + ".version", "w") as f:
        f.write(context.version if context.version else "None")

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.path(fpath, "Path")
    yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
    yield MetadataEntry.text(context.version, "Resolved version")
    yield MetadataEntry.table_schema(
        self.get_schema(context.dagster_type),
        "Schema",
    )
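# A sketch of the read side and memoization hook that could accompany the versioned CSV handler
# above; load_input and has_output here are assumptions about the surrounding
# MemoizableIOManager-style class, not copied from the original source.
def load_input(self, context) -> pd.DataFrame:
    fpath = self._get_fs_path(context.upstream_output.asset_key)
    return pd.read_csv(fpath, index_col=0)


def has_output(self, context) -> bool:
    # compare the stored ".version" marker against the currently resolved version
    fpath = self._get_fs_path(context.asset_key)
    version_fpath = fpath + ".version"
    if not os.path.exists(version_fpath):
        return False
    with open(version_fpath, "r") as f:
        return f.read() == (context.version if context.version else "None")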
def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
    """
    Here, we run queries against our output Snowflake database tables to add additional
    context to our asset materializations.
    """
    table_name = result["unique_id"].split(".")[-1]
    with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
        n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
        sample_rows = pandas.read_sql_query(
            f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
        )
    return super()._get_metadata(result) + [
        MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
        MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
    ]
def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    with connect_snowflake(config=self._config, schema=schema) as con:
        with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
        with_uppercase_cols.to_sql(
            table,
            con=con,
            if_exists="append",
            index=False,
            method=pd_writer,
        )
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    path = self._get_path(context)
    if "://" not in self._base_path:
        os.makedirs(os.path.dirname(path), exist_ok=True)

    if isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        context.log.info(f"Row count: {row_count}")
        obj.to_parquet(path=path, index=False)
    elif isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
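# A sketch of a matching load_input for the parquet handler above, assuming downstream ops
# consume pandas; a pyspark consumer would instead need a SparkSession and spark.read.parquet.
def load_input(self, context) -> pandas.DataFrame:
    path = self._get_path(context.upstream_output)
    return pandas.read_parquet(path)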
def test_assets():
    ab_resource = airbyte_resource(
        build_init_resource_context(
            config={
                "host": "some_host",
                "port": "8000",
            }
        )
    )
    ab_assets = build_airbyte_assets("12345", ["foo", "bar"], asset_key_prefix=["some", "prefix"])

    assert len(ab_assets[0].op.output_defs) == 2

    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json={
            "name": "xyz",
            "syncCatalog": {
                "streams": [
                    {
                        "stream": {
                            "name": "foo",
                            "jsonSchema": {
                                "properties": {"a": {"type": "str"}, "b": {"type": "int"}}
                            },
                        },
                        "config": {"selected": True},
                    },
                    {
                        "stream": {
                            "name": "bar",
                            "jsonSchema": {"properties": {"c": {"type": "str"}}},
                        },
                        "config": {"selected": True},
                    },
                    {
                        "stream": {
                            "name": "baz",
                            "jsonSchema": {"properties": {"d": {"type": "str"}}},
                        },
                        "config": {"selected": True},
                    },
                ]
            },
        },
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {"id": 1}},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json={
            "job": {"id": 1, "status": AirbyteState.SUCCEEDED},
            "attempts": [
                {
                    "attempt": {
                        "streamStats": [
                            {
                                "streamName": "foo",
                                "stats": {"bytesEmitted": 1234, "recordsCommitted": 4321},
                            },
                            {
                                "streamName": "bar",
                                "stats": {"bytesEmitted": 1234, "recordsCommitted": 4321},
                            },
                            {
                                "streamName": "baz",
                                "stats": {"bytesEmitted": 1111, "recordsCommitted": 1111},
                            },
                        ]
                    }
                }
            ],
        },
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte": airbyte_resource.configured(
                {
                    "host": "some_host",
                    "port": "8000",
                }
            )
        },
    )

    res = ab_job.execute_in_process()

    materializations = [
        event
        for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 3
    assert (
        MetadataEntry.text("a,b", "columns")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
    assert (
        MetadataEntry.int(1234, "bytesEmitted")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
    assert (
        MetadataEntry.int(4321, "recordsCommitted")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
def test_assets():
    ab_resource = airbyte_resource(
        build_init_resource_context(
            config={
                "host": "some_host",
                "port": "8000",
            }
        )
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json={
            "name": "xyz",
            "syncCatalog": {
                "streams": [
                    {
                        "stream": {
                            "name": "foo",
                            "jsonSchema": {
                                "properties": {"a": {"type": "str"}, "b": {"type": "int"}}
                            },
                        },
                        "config": {"selected": True},
                    },
                    {
                        "stream": {
                            "name": "bar",
                            "jsonSchema": {"properties": {"c": {"type": "str"}}},
                        },
                        "config": {"selected": True},
                    },
                    {
                        "stream": {
                            "name": "baz",
                            "jsonSchema": {"properties": {"d": {"type": "str"}}},
                        },
                        "config": {"selected": False},
                    },
                ]
            },
        },
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {"id": 1}},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json={
            "job": {"id": 1, "status": AirbyteState.SUCCEEDED},
            "attempts": [
                {
                    "attempt": {
                        "streamStats": [
                            {
                                "streamName": "foo",
                                "stats": {"bytesEmitted": 1234, "recordsCommitted": 4321},
                            },
                            {
                                "streamName": "bar",
                                "stats": {"bytesEmitted": 1234, "recordsCommitted": 4321},
                            },
                        ]
                    }
                }
            ],
        },
        status=200,
    )

    airbyte_output = ab_resource.sync_and_poll("some_connection", 0, None)

    materializations = list(generate_materializations(airbyte_output, []))
    assert len(materializations) == 2
    assert MetadataEntry.text("a,b", "columns") in materializations[0].metadata_entries
    assert MetadataEntry.int(1234, "bytesEmitted") in materializations[0].metadata_entries
    assert MetadataEntry.int(4321, "recordsCommitted") in materializations[0].metadata_entries