Beispiel #1
0
 def _init_extractor(self,
                     programmatic_description_enabled: bool = True) -> None:
     """Create ``self.extractor`` configured with the bundled Feast repository.

     :param programmatic_description_enabled: value stored under the
         ``DESCRIBE_FEATURE_VIEWS`` configuration key.
     """
     # The repository lives two directory levels above this file, under
     # ``resources``.
     tests_root = pathlib.Path(__file__).parent.parent.resolve()
     settings = {
         f"extractor.feast.{FeastExtractor.FEAST_REPOSITORY_PATH}":
         tests_root / "resources/extractor/feast/fs",
         f"extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_VIEWS}":
         programmatic_description_enabled,
     }
     self.extractor = FeastExtractor()
     # Narrow the full configuration down to the extractor's own scope
     # before handing it to ``init``.
     scoped_conf = Scoped.get_scoped_conf(
         conf=ConfigFactory.from_dict(settings),
         scope=self.extractor.get_scope(),
     )
     self.extractor.init(scoped_conf)
 def _init_extractor(self,
                     programmatic_description_enabled: bool = True) -> None:
     """Create ``self.extractor`` with a fixed endpoint and a mocked client.

     :param programmatic_description_enabled: value stored under the
         ``DESCRIBE_FEATURE_TABLES`` configuration key.
     """
     settings = {
         f'extractor.feast.{FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY}':
         'feast-core.example.com:6565',
         f'extractor.feast.{FeastExtractor.FEAST_SERVICE_CONFIG_KEY}':
         'unittest-feast-instance',
         f'extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_TABLES}':
         programmatic_description_enabled,
     }
     self.extractor = FeastExtractor()
     scoped_conf = Scoped.get_scoped_conf(
         conf=ConfigFactory.from_dict(settings),
         scope=self.extractor.get_scope(),
     )
     self.extractor.init(scoped_conf)
     # Replace the client after ``init`` so callers can stub its methods.
     self.extractor._client = MagicMock(return_value=None)
class TestFeastExtractor(unittest.TestCase):
    """Tests for ``FeastExtractor`` driven by a mocked Feast client.

    Every test builds an extractor through ``_init_extractor`` (which swaps
    the client for a ``MagicMock``), stubs the client calls it needs and then
    checks the records returned by successive ``extract()`` calls.
    """

    def test_no_feature_tables_registered(self) -> None:
        """``extract()`` returns ``None`` when only a project list is stubbed."""
        self._init_extractor()
        self.extractor._client.list_projects.return_value = ["default"]

        self.assertIsNone(self.extractor.extract())

    def test_every_project_is_scanned(self) -> None:
        """``list_feature_tables`` is called for each listed project."""
        self._init_extractor()
        self.extractor._client.list_projects.return_value = [
            "default", "dev", "prod"
        ]
        list_feature_tables_mock = self.extractor._client.list_feature_tables
        list_feature_tables_mock.return_value = []

        self.assertIsNone(self.extractor.extract())
        list_feature_tables_mock.assert_has_calls([
            call(project="default"),
            call(project="dev"),
            call(project="prod"),
        ])

    def test_feature_table_extraction(self) -> None:
        """With descriptions disabled a single table record is extracted."""
        self._init_extractor(programmatic_description_enabled=False)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table()

        table = self.extractor.extract()
        self.extractor._client.get_entity.assert_called_with("driver_id",
                                                             project="default")
        expected = TableMetadata(
            database="feast",
            cluster="unittest-feast-instance",
            schema="default",
            name="driver_trips",
            description=None,
            columns=[
                ColumnMetadata("driver_id",
                               "Internal identifier of the driver", "INT64",
                               0),
                ColumnMetadata("trips_today", None, "INT32", 1),
            ],
        )

        # Records are compared through their textual representation.
        self.assertEqual(repr(expected), repr(table))
        self.assertIsNone(self.extractor.extract())

    def test_feature_table_extraction_with_description_batch(self) -> None:
        """Descriptions enabled: table, details and batch source records."""
        self._init_extractor(programmatic_description_enabled=True)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table(labels={"label1": "value1"})

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """* Created at **2020-01-01 00:00:00**
                  |* Labels:
                  |    * label1: **value1**
                  |"""),
            "feature_table_details",
        )
        self.assertEqual(repr(expected), repr(description.description))

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin("""```
                |fileOptions:
                |  fileFormat:
                |    parquetFormat: {}
                |  fileUrl: file:///some/location
                |type: BATCH_FILE
                |```"""),
            "batch_source",
        )
        self.assertEqual(repr(expected), repr(batch_source.description))

        self.assertIsNone(self.extractor.extract())

    def test_feature_table_extraction_with_description_stream(self) -> None:
        """Descriptions enabled with a stream source: four records in total."""
        self._init_extractor(programmatic_description_enabled=True)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table(add_stream_source=True)

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                """* Created at **2020-01-01 00:00:00**
                  |"""),
            "feature_table_details",
        )
        self.assertEqual(repr(expected), repr(description.description))

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin("""```
                |fileOptions:
                |  fileFormat:
                |    parquetFormat: {}
                |  fileUrl: file:///some/location
                |type: BATCH_FILE
                |```"""),
            "batch_source",
        )
        self.assertEqual(repr(expected), repr(batch_source.description))

        stream_source = self.extractor.extract()
        assert isinstance(stream_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin("""```
                 |createdTimestampColumn: datetime
                 |eventTimestampColumn: datetime
                 |kafkaOptions:
                 |  bootstrapServers: broker1
                 |  messageFormat:
                 |    avroFormat:
                 |      schemaJson: '{"type": "record", "name": "DriverTrips", "fields": [{"name": "driver_id",
                 |        "type": "long"}, {"name": "trips_today", "type": "int"}, {"name": "datetime",
                 |        "type": {"type": "long", "logicalType": "timestamp-micros"}}]}'
                 |  topic: driver_trips
                 |type: STREAM_KAFKA
                 |```"""),
            "stream_source",
        )
        self.assertEqual(repr(expected), repr(stream_source.description))

        self.assertIsNone(self.extractor.extract())

    def _init_extractor(self,
                        programmatic_description_enabled: bool = True) -> None:
        """Create ``self.extractor`` and replace its client with a mock.

        :param programmatic_description_enabled: value stored under the
            ``DESCRIBE_FEATURE_TABLES`` configuration key.
        """
        conf = {
            f'extractor.feast.{FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY}':
            'feast-core.example.com:6565',
            f'extractor.feast.{FeastExtractor.FEAST_SERVICE_CONFIG_KEY}':
            'unittest-feast-instance',
            f'extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_TABLES}':
            programmatic_description_enabled,
        }
        self.extractor = FeastExtractor()
        self.extractor.init(
            Scoped.get_scoped_conf(conf=ConfigFactory.from_dict(conf),
                                   scope=self.extractor.get_scope()))
        # Swapped in after ``init`` so each test can stub the client calls.
        self.extractor._client = MagicMock(return_value=None)

    @staticmethod
    def _strip_margin(text: str) -> str:
        """Drop the whitespace and ``|`` that follow each newline in *text*.

        Lets the expected multi-line strings be indented with the test code.
        """
        return re.sub("\n[ \t]*\\|", "\n", text)

    def _mock_feature_table(self,
                            labels: dict = {},
                            add_stream_source: bool = False) -> None:
        """Stub the client with one ``driver_trips`` table and its entity.

        :param labels: labels placed in the feature table spec.
        :param add_stream_source: when true, a Kafka ``streamSource`` section
            is added to the spec.
        """
        table_spec = {
            "name": "driver_trips",
            "entities": ["driver_id"],
            "features": [{
                "name": "trips_today",
                "valueType": "INT32"
            }],
            # Copy so the shared ``{}`` default (or the caller's dict) is
            # never aliased into the spec passed on to ``from_dict``.
            "labels": dict(labels),
            "batchSource": {
                "type": "BATCH_FILE",
                "fileOptions": {
                    "fileFormat": {
                        "parquetFormat": {}
                    },
                    "fileUrl": "file:///some/location",
                },
            },
        }

        if add_stream_source:
            avro_schema_json = json.dumps({
                "type":
                "record",
                "name":
                "DriverTrips",
                "fields": [
                    {
                        "name": "driver_id",
                        "type": "long"
                    },
                    {
                        "name": "trips_today",
                        "type": "int"
                    },
                    {
                        "name": "datetime",
                        "type": {
                            "type": "long",
                            "logicalType": "timestamp-micros"
                        },
                    },
                ],
            })

            table_spec["streamSource"] = {
                "type": "STREAM_KAFKA",
                "eventTimestampColumn": "datetime",
                "createdTimestampColumn": "datetime",
                "kafkaOptions": {
                    "bootstrapServers": "broker1",
                    "topic": "driver_trips",
                    "messageFormat": {
                        "avroFormat": {
                            "schemaJson": avro_schema_json,
                        }
                    },
                },
            }

        self.extractor._client.list_feature_tables.return_value = [
            FeatureTable.from_dict({
                "spec": table_spec,
                "meta": {
                    "createdTimestamp": "2020-01-01T00:00:00Z"
                },
            })
        ]
        self.extractor._client.get_entity.return_value = Entity.from_dict({
            "spec": {
                "name": "driver_id",
                "valueType": "INT64",
                "description": "Internal identifier of the driver",
            }
        })
Beispiel #4
0
class TestFeastExtractor(unittest.TestCase):
    """Tests for ``FeastExtractor`` run against an on-disk Feast repository.

    ``setUp`` runs ``feast apply`` inside ``resources/extractor/feast/fs``
    and ``tearDown`` deletes the ``online_store.db`` and ``registry.db``
    files from that repository's ``data`` directory.
    """

    # Naive datetime; interpolated into the expected "Created at" lines.
    expected_created_time = datetime.strptime("2020-01-01 00:00:00",
                                              "%Y-%m-%d %H:%M:%S")

    def setUp(self) -> None:
        """Run ``feast apply`` in the fixture repository.

        :raises subprocess.CalledProcessError: if ``feast apply`` exits with
            a non-zero status.
        """
        # Imported locally so the method does not depend on a module-level
        # import being present.
        import subprocess

        repo_path = pathlib.Path(
            __file__).parent.parent.resolve() / "resources/extractor/feast/fs"
        # An argument list plus ``cwd`` avoids building a shell string (which
        # breaks on paths containing spaces or shell metacharacters), and
        # ``check=True`` surfaces a failed ``feast apply`` here instead of as
        # a confusing failure later in the test.
        subprocess.run(["feast", "apply"], cwd=str(repo_path), check=True)

    def test_feature_view_extraction(self) -> None:
        """With descriptions disabled the first record is the table itself."""
        self._init_extractor(programmatic_description_enabled=False)

        table = self.extractor.extract()

        expected = TableMetadata(
            database="feast",
            cluster="local",
            schema="fs",
            name="driver_hourly_stats",
            description=None,
            columns=[
                ColumnMetadata("driver_id",
                               "Internal identifier of the driver", "INT64",
                               0),
                ColumnMetadata("conv_rate", None, "FLOAT", 1),
                ColumnMetadata("acc_rate", None, "FLOAT", 2),
                ColumnMetadata("avg_daily_trips", None, "INT64", 3),
            ],
        )

        # Records are compared through their textual representation.
        self.assertEqual(repr(expected), repr(table))

    def test_feature_table_extraction_with_description_batch(self) -> None:
        """Descriptions enabled: table, details and batch source records."""
        self._init_extractor(programmatic_description_enabled=True)

        root_tests_path = pathlib.Path(__file__).parent.parent.resolve()
        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                f"""* Created at **{self.expected_created_time}**
                  |* Tags:
                  |    * is_pii: **true**
                  |"""),
            "feature_view_details",
        )
        self.assertEqual(repr(expected), repr(description.description))

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(f"""```
                |type: BATCH_FILE
                |event_timestamp_column: "event_timestamp"
                |created_timestamp_column: "created"
                |file_options {"{"}
                |  file_url: "{root_tests_path}/resources/extractor/feast/fs/data/driver_stats.parquet"
                |{"}"}
                |```"""),
            "batch_source",
        )
        self.assertEqual(repr(expected), repr(batch_source.description))

    def test_feature_table_extraction_with_description_stream(self) -> None:
        """Descriptions enabled: the fourth record is the stream source."""
        self._init_extractor(programmatic_description_enabled=True)
        root_tests_path = pathlib.Path(__file__).parent.parent.resolve()

        feature_table_definition = self.extractor.extract()
        assert isinstance(feature_table_definition, TableMetadata)

        description = self.extractor.extract()
        assert isinstance(description, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(
                f"""* Created at **{self.expected_created_time}**
                  |* Tags:
                  |    * is_pii: **true**
                  |"""),
            "feature_view_details",
        )
        self.assertEqual(repr(expected), repr(description.description))

        batch_source = self.extractor.extract()
        assert isinstance(batch_source, TableMetadata)
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin(f"""```
                |type: BATCH_FILE
                |event_timestamp_column: "event_timestamp"
                |created_timestamp_column: "created"
                |file_options {"{"}
                |  file_url: "{root_tests_path}/resources/extractor/feast/fs/data/driver_stats.parquet"
                |{"}"}
                |```"""),
            "batch_source",
        )
        self.assertEqual(repr(expected), repr(batch_source.description))

        stream_source = self.extractor.extract()
        assert isinstance(stream_source, TableMetadata)
        # Unlike ``_strip_margin`` this drops the newline too, so the
        # escaped schema collapses onto a single line.
        schema_json = re.sub(
            "\n[ \t]*\\|", "", """\\\'{\\"type\\": \\"record\\",
                 |\\"name\\": \\"driver_hourly_stats\\",
                 |\\"fields\\": [
                 | {\\"name\\": \\"conv_rate\\", \\"type\\": \\"float\\"},
                 | {\\"name\\": \\"acc_rate\\", \\"type\\": \\"float\\"},
                 | {\\"name\\": \\"avg_daily_trips\\", \\"type\\": \\"int\\"},
                 | {\\"name\\": \\"datetime\\", \\"type\\":
                 | {\\"type\\": \\"long\\", \\"logicalType\\": \\"timestamp-micros\\"}}]}\\\'"""
        )
        expected = DescriptionMetadata(
            TestFeastExtractor._strip_margin("""```
                 |type: STREAM_KAFKA
                 |event_timestamp_column: "datetime"
                 |created_timestamp_column: "datetime"
                 |kafka_options {{
                 |  bootstrap_servers: "broker1"
                 |  topic: "driver_hourly_stats"
                 |  message_format {{
                 |    avro_format {{
                 |      schema_json: "{schema_json}"
                 |    }}
                 |  }}
                 |}}
                 |```""").format(schema_json=schema_json),
            "stream_source",
        )
        self.assertEqual(repr(expected), repr(stream_source.description))

    def _init_extractor(self,
                        programmatic_description_enabled: bool = True) -> None:
        """Create ``self.extractor`` configured with the fixture repository.

        :param programmatic_description_enabled: value stored under the
            ``DESCRIBE_FEATURE_VIEWS`` configuration key.
        """
        repository_path = pathlib.Path(
            __file__).parent.parent.resolve() / "resources/extractor/feast/fs"
        conf = {
            f"extractor.feast.{FeastExtractor.FEAST_REPOSITORY_PATH}":
            repository_path,
            f"extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_VIEWS}":
            programmatic_description_enabled,
        }
        self.extractor = FeastExtractor()
        self.extractor.init(
            Scoped.get_scoped_conf(conf=ConfigFactory.from_dict(conf),
                                   scope=self.extractor.get_scope()))

    @staticmethod
    def _strip_margin(text: str) -> str:
        """Drop the whitespace and ``|`` that follow each newline in *text*.

        Lets the expected multi-line strings be indented with the test code.
        """
        return re.sub("\n[ \t]*\\|", "\n", text)

    def tearDown(self) -> None:
        """Delete the database files from the fixture ``data`` directory."""
        root_path = pathlib.Path(__file__).parent.parent.resolve(
        ) / "resources/extractor/feast/fs/data"
        os.remove(root_path / "online_store.db")
        os.remove(root_path / "registry.db")
Beispiel #5
0
    if cypher_query:
        job_config.put(
            f"extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}",
            cypher_query,
        )
    if elasticsearch_mapping:
        job_config.put(
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}",
            elasticsearch_mapping,
        )

    return job_config


if __name__ == "__main__":
    # Job 1: FeastExtractor -> FsNeo4jCSVLoader -> Neo4jCsvPublisher.
    feast_conf = create_feast_job_config()
    feast_task = DefaultTask(extractor=FeastExtractor(),
                             loader=FsNeo4jCSVLoader())
    feast_publisher = neo4j_csv_publisher.Neo4jCsvPublisher()
    feast_job = DefaultJob(conf=feast_conf,
                           task=feast_task,
                           publisher=feast_publisher)
    feast_job.launch()

    # Job 2: Neo4jSearchDataExtractor -> FSElasticsearchJSONLoader ->
    # ElasticsearchPublisher. Launched only after the first job returns.
    es_conf = create_es_publish_job_config()
    es_task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                          extractor=Neo4jSearchDataExtractor())
    es_publisher = ElasticsearchPublisher()
    es_publish_job = DefaultJob(conf=es_conf,
                                task=es_task,
                                publisher=es_publisher)
    es_publish_job.launch()