# Imports assumed from the surrounding feast-spark test module; exact paths
# are a best guess, not confirmed by this listing:
from unittest.mock import Mock

from feast import Client as FeastClient
from feast_spark import Client
from feast_spark.job_service import ensure_stream_ingestion_jobs


def test_restarting_failed_jobs(feature_table):
    """If configured, the job service restarts failed stream ingestion jobs."""

    feast_client = FeastClient(
        job_service_pause_between_jobs=0,
        job_service_retry_failed_jobs=True,
        options={"whitelisted_projects": "default,ride"},
    )
    feast_client.list_projects = Mock(return_value=["default"])
    feast_client.list_feature_tables = Mock()

    spark_client = Client(feast_client)
    spark_client.list_jobs = Mock()
    spark_client.start_stream_to_online_ingestion = Mock()

    spark_client.feature_store.list_feature_tables.return_value = [
        feature_table
    ]
    spark_client.list_jobs.return_value = []

    ensure_stream_ingestion_jobs(spark_client, all_projects=True)

    spark_client.list_jobs.assert_called_once_with(include_terminated=False)
    spark_client.start_stream_to_online_ingestion.assert_called_once_with(
        feature_table, [], project="default")
Example No. 2
# typing is needed for the annotations below; get_stream_to_online_ingestion_params
# is assumed to come from the surrounding feast_spark module:
from typing import Dict, List, Tuple

import feast


def _get_expected_job_hash_to_table_refs(
        client: feast.Client,
        projects: List[str]) -> Dict[str, Tuple[str, str]]:
    """
    Checks all feature tables for the requires project(s) and determines all required stream
    ingestion jobs from them. Outputs a map of the expected job_hash to a tuple of (project, table_name).

    Args:
        all_projects (bool): If true, runs the check for all project.
            Otherwise only checks the current project.

    Returns:
        Dict[str, Tuple[str, str]]: Map of job_hash -> (project, table_name) for expected stream ingestion jobs
    """
    job_hash_to_table_refs = {}

    for project in projects:
        feature_tables = client.list_feature_tables(project)
        for feature_table in feature_tables:
            if feature_table.stream_source is not None:
                params = get_stream_to_online_ingestion_params(
                    client, project, feature_table, [])
                job_hash = params.get_job_hash()
                job_hash_to_table_refs[job_hash] = (project,
                                                    feature_table.name)

    return job_hash_to_table_refs
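
A minimal sketch of how such a map could drive reconciliation, in the spirit of the
test above: compare the expected hashes against the hashes of currently running jobs
and start whatever is missing. `_sketch_reconcile` and `running_job_hashes` are
hypothetical names, not part of feast_spark:

def _sketch_reconcile(client, spark_client, projects, running_job_hashes):
    expected = _get_expected_job_hash_to_table_refs(client, projects)
    for job_hash, (project, table_name) in expected.items():
        if job_hash not in running_job_hashes:
            # re-fetch the table and start the missing stream ingestion job
            table = client.get_feature_table(table_name, project=project)
            spark_client.start_stream_to_online_ingestion(table, [], project=project)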
Example No. 3

# Assumed to be a pytest fixture (it yields a client for tests):
import pytest


@pytest.fixture
def feast_client():
    c = FeastClient(
        job_service_pause_between_jobs=0,
        options={"whitelisted_projects": "default,ride"},
    )
    c.list_projects = Mock(return_value=["default", "ride", "invalid_project"])
    c.list_feature_tables = Mock()

    yield c
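
The "invalid_project" entry above exercises project whitelisting: given the
"whitelisted_projects" option, the job service presumably intersects the projects
returned by list_projects() with the whitelist before scheduling jobs. A minimal
illustrative sketch of that filtering (not the actual feast_spark implementation):

whitelist = set("default,ride".split(","))
projects = [p for p in ["default", "ride", "invalid_project"] if p in whitelist]
assert projects == ["default", "ride"]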
Example No. 4
# Imports assumed for this standalone export script (feast 0.x API; paths are
# a best guess):
import json

from feast import Client
from feast.data_source import BigQuerySource, FileSource, KafkaSource, KinesisSource


def cli(core_url, output_path):

    client = Client(core_url=core_url)

    tables = client.list_feature_tables()

    # sort tables by name for consistent outputs
    tables = sorted(tables, key=lambda x: x.name)

    parsed_tables = []

    for table in tables:

        # sort entities by name for consistent outputs
        entities = sorted(table.entities)

        batch_source = None
        stream_source = None

        # platform and name for constructing URN later on
        batch_source_platform = "unknown"
        stream_source_platform = "unknown"
        batch_source_name = "unknown"
        stream_source_name = "unknown"

        if isinstance(table.batch_source, BigQuerySource):
            batch_source = "BigQuerySource"
            batch_source_platform = "bigquery"
            batch_source_name = table.batch_source.bigquery_options.table_ref

        if isinstance(table.batch_source, FileSource):
            batch_source = "FileSource"
            batch_source_platform = "file"

            # replace slashes because the react frontend can't parse them correctly
            batch_source_name = table.batch_source.file_options.file_url.replace(
                "/", "."
            )

            # replace redundant file prefix
            if batch_source_name.startswith("file:.."):
                batch_source_name = batch_source_name[7:]

        if isinstance(table.stream_source, KafkaSource):
            stream_source = "KafkaSource"
            stream_source_platform = "kafka"
            stream_source_name = table.stream_source.kafka_options.topic

        if isinstance(table.stream_source, KinesisSource):
            stream_source = "KinesisSource"
            stream_source_platform = "kinesis"
            stream_source_name = f"{table.stream_source.kinesis_options.region}-{table.stream_source.kinesis_options.stream_name}"

        # currently unused in MCE outputs, but useful for debugging
        stream_source_config = table.to_dict()["spec"].get("streamSource")
        batch_source_config = table.to_dict()["spec"]["batchSource"]

        raw_entities = [
            client.get_entity(entity_name) for entity_name in table.entities
        ]
        raw_entities = sorted(raw_entities, key=lambda x: x.name)

        source_info = {
            "batch_source": batch_source,
            "stream_source": stream_source,
            "batch_source_config": batch_source_config,
            "stream_source_config": stream_source_config,
            "batch_source_platform": batch_source_platform,
            "stream_source_platform": stream_source_platform,
            "batch_source_name": batch_source_name,
            "stream_source_name": stream_source_name,
        }

        # sort entities by name for consistent outputs
        entities = sorted(
            [
                {
                    "name": x.name,
                    "type": x.value_type.name,
                    "description": x.description,
                    **source_info,
                }
                for x in raw_entities
            ],
            key=lambda x: x["name"],
        )

        # sort features by name for consistent outputs
        features = sorted(
            [
                {"name": x.name, "type": x.dtype.name, **source_info}
                for x in table.features
            ],
            key=lambda x: x["name"],
        )

        parsed_tables.append(
            {
                "name": table.name,
                "entities": entities,
                "features": features,
            }
        )

    if output_path is not None:

        with open(output_path, "w") as f:
            json.dump(parsed_tables, f)

    else:

        print(parsed_tables)
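
A hedged invocation sketch: cli() reads like the body of a CLI command whose
decorators were stripped from this listing, but it can also be called directly.
The endpoint and output path below are placeholders:

cli(core_url="localhost:6565", output_path="feast_tables.json")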
Example No. 5

@pytest.fixture
def feast_client():
    c = FeastClient(job_service_pause_between_jobs=0)
    c.list_projects = Mock(return_value=["default"])
    c.list_feature_tables = Mock()

    yield c
Example No. 6
# Imports assumed from the surrounding Amundsen databuilder module (paths are a
# best guess against databuilder's and feast 0.x's layout):
from datetime import datetime
from typing import Iterator, Union

import yaml
from pyhocon import ConfigFactory, ConfigTree

from feast import Client
from feast.feature_table import FeatureTable

from databuilder.extractor.base_extractor import Extractor
from databuilder.models.table_metadata import ColumnMetadata, TableMetadata


class FeastExtractor(Extractor):
    """
    Extracts feature tables from the Feast Core service. Since Feast is
    a metadata store (and not the database itself), it maps the
    following attributes:

     * a database is the name of a Feast project
     * a table name is the name of a feature table
     * columns are the features stored in that feature table
    """

    FEAST_SERVICE_CONFIG_KEY = "instance_name"
    FEAST_ENDPOINT_CONFIG_KEY = "endpoint"
    DESCRIBE_FEATURE_TABLES = "describe_feature_tables"
    DEFAULT_CONFIG = ConfigFactory.from_dict({
        FEAST_SERVICE_CONFIG_KEY: "main",
        DESCRIBE_FEATURE_TABLES: True
    })

    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_service = conf.get_string(
            FeastExtractor.FEAST_SERVICE_CONFIG_KEY)
        self._describe_feature_tables = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_TABLES)
        self._client = Client(
            core_url=conf.get_string(FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY))
        self._extract_iter: Union[None, Iterator] = None

    def get_scope(self) -> str:
        return "extractor.feast"

    def extract(self) -> Union[TableMetadata, None]:
        """
        For every feature table from Feast, a multiple objets are extracted:

        1. TableMetadata with feature table description
        2. Programmatic Description of the feature table, containing
           metadata - date of creation and labels
        3. Programmatic Description with Batch Source specification
        4. (if applicable) Programmatic Description with Stream Source
           specification
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        for project in self._client.list_projects():
            for feature_table in self._client.list_feature_tables(
                    project=project):
                yield from self._extract_feature_table(project, feature_table)

    def _extract_feature_table(
            self, project: str,
            feature_table: FeatureTable) -> Iterator[TableMetadata]:
        columns = []
        for index, entity_name in enumerate(feature_table.entities):
            entity = self._client.get_entity(entity_name, project=project)
            columns.append(
                ColumnMetadata(entity.name, entity.description,
                               entity.value_type, index))

        for index, feature in enumerate(feature_table.features):
            columns.append(
                ColumnMetadata(
                    feature.name,
                    None,
                    feature.dtype.name,
                    len(feature_table.entities) + index,
                ))

        yield TableMetadata(
            "feast",
            self._feast_service,
            project,
            feature_table.name,
            None,
            columns,
        )

        if self._describe_feature_tables:
            created_at = datetime.utcfromtimestamp(
                feature_table.created_timestamp.seconds)
            description = f"* Created at **{created_at}**\n"

            if feature_table.labels:
                description += "* Labels:\n"
                for key, value in feature_table.labels.items():
                    description += f"    * {key}: **{value}**\n"

            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                description,
                description_source="feature_table_details",
            )

            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                f'```\n{yaml.dump(feature_table.to_dict()["spec"]["batchSource"])}```',
                description_source="batch_source",
            )

            if feature_table.stream_source:
                yield TableMetadata(
                    "feast",
                    self._feast_service,
                    project,
                    feature_table.name,
                    f'```\n{yaml.dump(feature_table.to_dict()["spec"]["streamSource"])}```',
                    description_source="stream_source",
                )
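
A minimal usage sketch, assuming the usual databuilder pattern of initializing an
extractor with a pyhocon config and draining it record by record; the endpoint is
a placeholder:

extractor = FeastExtractor()
extractor.init(ConfigFactory.from_dict({
    "instance_name": "main",
    "endpoint": "feast-core.example.com:6565",  # placeholder Feast Core address
}))

record = extractor.extract()
while record:
    print(record)
    record = extractor.extract()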