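# ---------------------------------------------------------------------------
# Tests for the Feast-Core client based FeastExtractor (Feast <= 0.9).
# The gRPC client is replaced with a MagicMock in _init_extractor, so no live
# Feast deployment is required. The import block below is an assumption based
# on the Amundsen databuilder layout; it is not part of the original fragment.
# ---------------------------------------------------------------------------
import json
import re
import unittest
from unittest.mock import MagicMock, call

from feast import Entity, FeatureTable
from pyhocon import ConfigFactory

from databuilder import Scoped
from databuilder.extractor.feast_extractor import FeastExtractor
from databuilder.models.table_metadata import ColumnMetadata, DescriptionMetadata, TableMetadata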
class TestFeastExtractor(unittest.TestCase):
    def test_no_feature_tables_registered(self) -> None:
        self._init_extractor()
        self.extractor._client.list_projects.return_value = ["default"]

        self.assertIsNone(self.extractor.extract())

    def test_every_project_is_scanned(self) -> None:
        self._init_extractor()
        self.extractor._client.list_projects.return_value = ["default", "dev", "prod"]
        list_feature_tables_mock = self.extractor._client.list_feature_tables
        list_feature_tables_mock.return_value = []

        self.assertIsNone(self.extractor.extract())
        list_feature_tables_mock.assert_has_calls([
            call(project="default"),
            call(project="dev"),
            call(project="prod"),
        ])

    def test_feature_table_extraction(self) -> None:
        self._init_extractor(programmatic_description_enabled=False)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table()

        table = self.extractor.extract()
        self.extractor._client.get_entity.assert_called_with("driver_id", project="default")

        expected = TableMetadata(
            database="feast",
            cluster="unittest-feast-instance",
            schema="default",
            name="driver_trips",
            description=None,
            columns=[
                ColumnMetadata("driver_id", "Internal identifier of the driver", "INT64", 0),
                ColumnMetadata("trips_today", None, "INT32", 1),
            ],
        )
        self.assertEqual(expected.__repr__(), table.__repr__())
        self.assertIsNone(self.extractor.extract())

    def test_feature_table_extraction_with_description_batch(self) -> None:
        self._init_extractor(programmatic_description_enabled=True)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table(labels={"label1": "value1"})

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """* Created at **2020-01-01 00:00:00**
                   |* Labels:
                   |    * label1: **value1**
                   |"""),
            "feature_table_details",
        )
        self.assertEqual(expected.__repr__(), description.description.__repr__())

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """```
                   |fileOptions:
                   |  fileFormat:
                   |    parquetFormat: {}
                   |  fileUrl: file:///some/location
                   |type: BATCH_FILE
                   |```"""),
            "batch_source",
        )
        self.assertEqual(expected.__repr__(), batch_source.description.__repr__())
        self.assertIsNone(self.extractor.extract())

    def test_feature_table_extraction_with_description_stream(self) -> None:
        self._init_extractor(programmatic_description_enabled=True)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table(add_stream_source=True)

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """* Created at **2020-01-01 00:00:00**
                   |"""),
            "feature_table_details",
        )
        self.assertEqual(expected.__repr__(), description.description.__repr__())

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """```
                   |fileOptions:
                   |  fileFormat:
                   |    parquetFormat: {}
                   |  fileUrl: file:///some/location
                   |type: BATCH_FILE
                   |```"""),
            "batch_source",
        )
        self.assertEqual(expected.__repr__(), batch_source.description.__repr__())

        stream_source = self.extractor.extract()
        assert isinstance(stream_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """```
                   |createdTimestampColumn: datetime
                   |eventTimestampColumn: datetime
                   |kafkaOptions:
                   |  bootstrapServers: broker1
                   |  messageFormat:
                   |    avroFormat:
                   |      schemaJson: '{"type": "record", "name": "DriverTrips", "fields": [{"name": "driver_id",
                   |        "type": "long"}, {"name": "trips_today", "type": "int"}, {"name": "datetime",
                   |        "type": {"type": "long", "logicalType": "timestamp-micros"}}]}'
                   |  topic: driver_trips
                   |type: STREAM_KAFKA
                   |```"""),
            "stream_source",
        )
        self.assertEqual(expected.__repr__(), stream_source.description.__repr__())
        self.assertIsNone(self.extractor.extract())

    def _init_extractor(self, programmatic_description_enabled: bool = True) -> None:
        conf = {
            f'extractor.feast.{FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY}': 'feast-core.example.com:6565',
            f'extractor.feast.{FeastExtractor.FEAST_SERVICE_CONFIG_KEY}': 'unittest-feast-instance',
            f'extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_TABLES}': programmatic_description_enabled,
        }
        self.extractor = FeastExtractor()
        self.extractor.init(
            Scoped.get_scoped_conf(conf=ConfigFactory.from_dict(conf),
                                   scope=self.extractor.get_scope()))
        self.extractor._client = MagicMock(return_value=None)

    @staticmethod
    def _strip_margin(text: str) -> str:
        return re.sub("\n[ \t]*\\|", "\n", text)

    def _mock_feature_table(self, labels: dict = {}, add_stream_source: bool = False) -> None:
        table_spec = {
            "name": "driver_trips",
            "entities": ["driver_id"],
            "features": [{"name": "trips_today", "valueType": "INT32"}],
            "labels": labels,
            "batchSource": {
                "type": "BATCH_FILE",
                "fileOptions": {
                    "fileFormat": {"parquetFormat": {}},
                    "fileUrl": "file:///some/location",
                },
            },
        }

        if add_stream_source:
            avro_schema_json = json.dumps({
                "type": "record",
                "name": "DriverTrips",
                "fields": [
                    {"name": "driver_id", "type": "long"},
                    {"name": "trips_today", "type": "int"},
                    {
                        "name": "datetime",
                        "type": {"type": "long", "logicalType": "timestamp-micros"},
                    },
                ],
            })
            table_spec["streamSource"] = {
                "type": "STREAM_KAFKA",
                "eventTimestampColumn": "datetime",
                "createdTimestampColumn": "datetime",
                "kafkaOptions": {
                    "bootstrapServers": "broker1",
                    "topic": "driver_trips",
                    "messageFormat": {
                        "avroFormat": {
                            "schemaJson": avro_schema_json,
                        }
                    },
                },
            }

        self.extractor._client.list_feature_tables.return_value = [
            FeatureTable.from_dict({
                "spec": table_spec,
                "meta": {"createdTimestamp": "2020-01-01T00:00:00Z"},
            })
        ]
        self.extractor._client.get_entity.return_value = Entity.from_dict({
            "spec": {
                "name": "driver_id",
                "valueType": "INT64",
                "description": "Internal identifier of the driver",
            }
        })
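# ---------------------------------------------------------------------------
# Tests for the repository-based FeastExtractor (Feast >= 0.10 feature views).
# setUp() runs `feast apply` against a small feature repository that is
# assumed to live at resources/extractor/feast/fs (two levels above this test
# file): a feature_store.yaml with project "fs" and the "local" provider,
# a data/driver_stats.parquet file, and a "driver_hourly_stats" feature view
# (entity driver_id, tag is_pii=true, features conv_rate/acc_rate/
# avg_daily_trips, plus a Kafka stream source on "broker1").
# The import block below is likewise an assumption based on the Amundsen
# databuilder layout; it is not part of the original fragment.
# ---------------------------------------------------------------------------
import os
import pathlib
import re
import unittest
from datetime import datetime

from pyhocon import ConfigFactory

from databuilder import Scoped
from databuilder.extractor.feast_extractor import FeastExtractor
from databuilder.models.table_metadata import ColumnMetadata, DescriptionMetadata, TableMetadata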
class TestFeastExtractor(unittest.TestCase):
    expected_created_time = datetime.strptime("2020-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

    def setUp(self) -> None:
        repo_path = pathlib.Path(
            __file__).parent.parent.resolve() / "resources/extractor/feast/fs"
        os.system(f"cd {repo_path} && feast apply")

    def test_feature_view_extraction(self) -> None:
        self._init_extractor(programmatic_description_enabled=False)

        table = self.extractor.extract()
        expected = TableMetadata(
            database="feast",
            cluster="local",
            schema="fs",
            name="driver_hourly_stats",
            description=None,
            columns=[
                ColumnMetadata("driver_id", "Internal identifier of the driver", "INT64", 0),
                ColumnMetadata("conv_rate", None, "FLOAT", 1),
                ColumnMetadata("acc_rate", None, "FLOAT", 2),
                ColumnMetadata("avg_daily_trips", None, "INT64", 3),
            ],
        )
        self.assertEqual(expected.__repr__(), table.__repr__())

    def test_feature_table_extraction_with_description_batch(self) -> None:
        self._init_extractor(programmatic_description_enabled=True)
        root_tests_path = pathlib.Path(__file__).parent.parent.resolve()

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                f"""* Created at **{self.expected_created_time}**
                    |* Tags:
                    |    * is_pii: **true**
                    |"""),
            "feature_view_details",
        )
        self.assertEqual(expected.__repr__(), description.description.__repr__())

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                f"""```
                    |type: BATCH_FILE
                    |event_timestamp_column: "event_timestamp"
                    |created_timestamp_column: "created"
                    |file_options {"{"}
                    |  file_url: "{root_tests_path}/resources/extractor/feast/fs/data/driver_stats.parquet"
                    |{"}"}
                    |```"""),
            "batch_source",
        )
        self.assertEqual(expected.__repr__(), batch_source.description.__repr__())

    def test_feature_table_extraction_with_description_stream(self) -> None:
        self._init_extractor(programmatic_description_enabled=True)
        root_tests_path = pathlib.Path(__file__).parent.parent.resolve()

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                f"""* Created at **{self.expected_created_time}**
                    |* Tags:
                    |    * is_pii: **true**
                    |"""),
            "feature_view_details",
        )
        self.assertEqual(expected.__repr__(), description.description.__repr__())

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                f"""```
                    |type: BATCH_FILE
                    |event_timestamp_column: "event_timestamp"
                    |created_timestamp_column: "created"
                    |file_options {"{"}
                    |  file_url: "{root_tests_path}/resources/extractor/feast/fs/data/driver_stats.parquet"
                    |{"}"}
                    |```"""),
            "batch_source",
        )
        self.assertEqual(expected.__repr__(), batch_source.description.__repr__())

        stream_source = self.extractor.extract()
        assert isinstance(stream_source, TableMetadata)
        schema_json = re.sub(
            "\n[ \t]*\\|",
            "",
            """\\\'{\\"type\\": \\"record\\",
               |\\"name\\": \\"driver_hourly_stats\\",
               |\\"fields\\": [
               | {\\"name\\": \\"conv_rate\\", \\"type\\": \\"float\\"},
               | {\\"name\\": \\"acc_rate\\", \\"type\\": \\"float\\"},
               | {\\"name\\": \\"avg_daily_trips\\", \\"type\\": \\"int\\"},
               | {\\"name\\": \\"datetime\\", \\"type\\":
               | {\\"type\\": \\"long\\", \\"logicalType\\": \\"timestamp-micros\\"}}]}\\\'"""
        )
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """```
                   |type: STREAM_KAFKA
                   |event_timestamp_column: "datetime"
                   |created_timestamp_column: "datetime"
                   |kafka_options {{
                   |  bootstrap_servers: "broker1"
                   |  topic: "driver_hourly_stats"
                   |  message_format {{
                   |    avro_format {{
                   |      schema_json: "{schema_json}"
                   |    }}
                   |  }}
                   |}}
                   |```""").format(schema_json=schema_json),
            "stream_source",
        )
        print(stream_source.description.__repr__())
        print(expected.__repr__())
        self.assertEqual(expected.__repr__(), stream_source.description.__repr__())

    def _init_extractor(self, programmatic_description_enabled: bool = True) -> None:
        repository_path = pathlib.Path(
            __file__).parent.parent.resolve() / "resources/extractor/feast/fs"
        conf = {
            f"extractor.feast.{FeastExtractor.FEAST_REPOSITORY_PATH}": repository_path,
            f"extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_VIEWS}": programmatic_description_enabled,
        }
        self.extractor = FeastExtractor()
        self.extractor.init(
            Scoped.get_scoped_conf(conf=ConfigFactory.from_dict(conf),
                                   scope=self.extractor.get_scope()))

    @staticmethod
    def _strip_margin(text: str) -> str:
        return re.sub("\n[ \t]*\\|", "\n", text)

    def tearDown(self) -> None:
        root_path = pathlib.Path(
            __file__).parent.parent.resolve() / "resources/extractor/feast/fs/data"
        os.remove(root_path / "online_store.db")
        os.remove(root_path / "registry.db")
    if cypher_query:
        job_config.put(
            f"extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}",
            cypher_query,
        )
    if elasticsearch_mapping:
        job_config.put(
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}",
            elasticsearch_mapping,
        )
    return job_config


if __name__ == "__main__":
    feast_job = DefaultJob(
        conf=create_feast_job_config(),
        task=DefaultTask(extractor=FeastExtractor(), loader=FsNeo4jCSVLoader()),
        publisher=neo4j_csv_publisher.Neo4jCsvPublisher(),
    )
    feast_job.launch()

    es_publish_job = DefaultJob(
        conf=create_es_publish_job_config(),
        task=DefaultTask(loader=FSElasticsearchJSONLoader(),
                         extractor=Neo4jSearchDataExtractor()),
        publisher=ElasticsearchPublisher(),
    )
    es_publish_job.launch()