def test_restarting_failed_jobs(feature_table):
    """ If configured - restart failed jobs """

    feast_client = FeastClient(
        job_service_pause_between_jobs=0,
        job_service_retry_failed_jobs=True,
        options={"whitelisted_projects": "default,ride"},
    )
    feast_client.list_projects = Mock(return_value=["default"])
    feast_client.list_feature_tables = Mock()

    spark_client = Client(feast_client)
    spark_client.list_jobs = Mock()
    spark_client.start_stream_to_online_ingestion = Mock()

    spark_client.feature_store.list_feature_tables.return_value = [
        feature_table
    ]
    spark_client.list_jobs.return_value = []

    ensure_stream_ingestion_jobs(spark_client, all_projects=True)

    spark_client.list_jobs.assert_called_once_with(include_terminated=False)
    spark_client.start_stream_to_online_ingestion.assert_called_once_with(
        feature_table, [], project="default")
@pytest.fixture
def feast_client():
    c = FeastClient(
        job_service_pause_between_jobs=0,
        options={"whitelisted_projects": "default,ride"},
    )
    c.list_projects = Mock(return_value=["default", "ride", "invalid_project"])
    c.list_feature_tables = Mock()

    yield c
Example #3
def ensure_stream_ingestion_jobs(client: feast.Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    And it will perform two kinds of operations:
    - Cancel all running jobs that should not be running
    - Start all expected jobs that are not yet running

    Args:
        all_projects (bool): If true, runs the check for all projects.
                             Otherwise only checks the client's current project.
    """

    projects = client.list_projects() if all_projects else [client.project]

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_table_refs.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    for job in client.list_jobs(include_terminated=False):
        if isinstance(job, StreamIngestionJob):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logging.debug(
        f"existing_job_hashes = {sorted(list(existing_job_hashes))} expected_job_hashes = {sorted(list(expected_job_hashes))}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start is guaranteed to be in the expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.get_feature_table(name=table_name,
                                                 project=project)
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)
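# A minimal sketch of driving the reconciliation above as a periodic
# control loop. The loop shape and the pause parameter are assumptions
# for illustration (echoing the job_service_pause_between_jobs option in
# the fixtures), not the actual Feast job service implementation.
import time

def run_reconciliation_loop(client: feast.Client, pause_seconds: float = 60.0) -> None:
    while True:
        ensure_stream_ingestion_jobs(client, all_projects=True)
        time.sleep(pause_seconds)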
@pytest.fixture
def feast_client():
    c = FeastClient(job_service_pause_between_jobs=0)
    c.list_projects = Mock(return_value=["default"])
    c.list_feature_tables = Mock()

    yield c
class FeastExtractor(Extractor):
    """
    Extracts feature tables from the Feast Core service. Since Feast is
    a metadata store (and not the database itself), it maps the
    following attributes:

     * a database is the name of a Feast project
     * a table name is the name of a feature table
     * columns are the features stored in a feature table
    """

    FEAST_SERVICE_CONFIG_KEY = "instance_name"
    FEAST_ENDPOINT_CONFIG_KEY = "endpoint"
    DESCRIBE_FEATURE_TABLES = "describe_feature_tables"
    DEFAULT_CONFIG = ConfigFactory.from_dict({
        FEAST_SERVICE_CONFIG_KEY: "main",
        DESCRIBE_FEATURE_TABLES: True
    })

    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_service = conf.get_string(
            FeastExtractor.FEAST_SERVICE_CONFIG_KEY)
        self._describe_feature_tables = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_TABLES)
        self._client = Client(
            core_url=conf.get_string(FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY))
        self._extract_iter: Union[None, Iterator] = None

    def get_scope(self) -> str:
        return "extractor.feast"

    def extract(self) -> Union[TableMetadata, None]:
        """
        For every feature table in Feast, multiple objects are extracted:

        1. TableMetadata with feature table description
        2. Programmatic Description of the feature table, containing
           metadata: creation date and labels
        3. Programmatic Description with Batch Source specification
        4. (if applicable) Programmatic Description with Stream Source
           specification
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        for project in self._client.list_projects():
            for feature_table in self._client.list_feature_tables(
                    project=project):
                yield from self._extract_feature_table(project, feature_table)

    def _extract_feature_table(
            self, project: str,
            feature_table: FeatureTable) -> Iterator[TableMetadata]:
        columns = []
        for index, entity_name in enumerate(feature_table.entities):
            entity = self._client.get_entity(entity_name, project=project)
            columns.append(
                ColumnMetadata(entity.name, entity.description,
                               entity.value_type, index))

        for index, feature in enumerate(feature_table.features):
            columns.append(
                ColumnMetadata(
                    feature.name,
                    None,
                    feature.dtype.name,
                    len(feature_table.entities) + index,
                ))

        yield TableMetadata(
            "feast",
            self._feast_service,
            project,
            feature_table.name,
            None,
            columns,
        )

        if self._describe_feature_tables:
            created_at = datetime.utcfromtimestamp(
                feature_table.created_timestamp.seconds)
            description = f"* Created at **{created_at}**\n"

            if feature_table.labels:
                description += "* Labels:\n"
                for key, value in feature_table.labels.items():
                    description += f"    * {key}: **{value}**\n"

            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                description,
                description_source="feature_table_details",
            )

            yield TableMetadata(
                "feast",
                self._feast_service,
                project,
                feature_table.name,
                f'```\n{yaml.dump(feature_table.to_dict()["spec"]["batchSource"])}```',
                description_source="batch_source",
            )

            if feature_table.stream_source:
                yield TableMetadata(
                    "feast",
                    self._feast_service,
                    project,
                    feature_table.name,
                    f'```\n{yaml.dump(feature_table.to_dict()["spec"]["streamSource"])}```',
                    description_source="stream_source",
                )
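# A minimal sketch of running the extractor standalone, assuming pyhocon's
# ConfigFactory as used above. The endpoint value is hypothetical; only
# "endpoint" must be supplied, since instance_name and
# describe_feature_tables fall back to DEFAULT_CONFIG.
if __name__ == "__main__":
    extractor = FeastExtractor()
    extractor.init(
        ConfigFactory.from_dict(
            {FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY: "feast-core.example.com:6565"}
        )
    )

    record = extractor.extract()
    while record:
        print(record)  # TableMetadata or a programmatic description record
        record = extractor.extract()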