Example #1
def ensure_stream_ingestion_jobs(client: feast_spark.Client,
                                 all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    It then performs two kinds of operations:
    - Cancel running jobs that should not be running
    - Start missing jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all projects.
                             Otherwise only checks the client's current project.
    """

    projects = (client.feature_store.list_projects()
                if all_projects else [client.feature_store.project])

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_table_refs.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    for job in client.list_jobs(include_terminated=False):
        if isinstance(job, StreamIngestionJob):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    # Reconcile by set difference on job hashes
    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logging.debug(
        f"existing_job_hashes = {sorted(existing_job_hashes)} expected_job_hashes = {sorted(expected_job_hashes)}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job we want to start must be present in the expected job-hash-to-table-refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.feature_store.get_feature_table(name=table_name,
                                                               project=project)
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)
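
In the Feast job service this function is driven by a periodic control loop (the comments in Example #3 refer to it as the "Control Loop"). A minimal sketch of such a driver, assuming feast_spark.Client wraps a core feast.Client and using an illustrative 60-second interval:

import time

import feast
import feast_spark


def run_control_loop(interval_sec: int = 60):
    # feast_spark.Client wraps a core feast client (assumption about the
    # constructor; adjust to however your deployment builds the client).
    client = feast_spark.Client(feast.Client())
    while True:
        ensure_stream_ingestion_jobs(client, all_projects=True)
        time.sleep(interval_sec)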
Example #2
def start_job(
    feast_spark_client: SparkClient, feature_table: FeatureTable, pytestconfig
):
    # A scheduled streaming job is expected to exist already; nothing to start.
    if pytestconfig.getoption("scheduled_streaming_job"):
        return

    job = feast_spark_client.start_stream_to_online_ingestion(feature_table)
    # Poll until the job reports IN_PROGRESS (up to 180 seconds).
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180
    )
    return job
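
Both test helpers poll with feast's wait_retry_backoff, which repeatedly calls a zero-argument function returning a (result, is_done) tuple until is_done is true or the timeout (in seconds) elapses. A minimal sketch of that contract, assuming exponential backoff between attempts (the real helper lives in feast.wait and may differ in detail):

import time
from typing import Any, Callable, Tuple


def wait_retry_backoff_sketch(
    retry_fn: Callable[[], Tuple[Any, bool]], timeout_secs: int
) -> Any:
    # Poll retry_fn until it reports completion or the deadline passes.
    deadline = time.time() + timeout_secs
    delay = 1.0
    while True:
        result, is_done = retry_fn()
        if is_done:
            return result
        if time.time() >= deadline:
            raise TimeoutError(f"condition not met within {timeout_secs}s")
        time.sleep(delay)
        delay = min(delay * 2, 10.0)  # back off, capped at 10s per attempt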
Example #3
def ensure_stream_ingestion_jobs(client: Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    It then performs two kinds of operations:
    - Cancel running jobs that should not be running
    - Start missing jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all projects.
                             Otherwise only checks the client's current project.
    """

    projects = (client.feature_store.list_projects()
                if all_projects else [client.feature_store.project])
    # Optionally restrict the loop to a comma-separated whitelist of projects
    if client.config.exists(opt.WHITELISTED_PROJECTS):
        whitelisted_projects = client.config.get(opt.WHITELISTED_PROJECTS)
        if whitelisted_projects:
            whitelisted_projects = whitelisted_projects.split(",")
            projects = [
                project for project in projects
                if project in whitelisted_projects
            ]

    expected_job_hash_to_tables = _get_expected_job_hash_to_tables(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_tables.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    # When retrying failed jobs, don't include terminated jobs here: the
    # control loop then sees no job for that hash and spawns a new one.
    for job in client.list_jobs(include_terminated=not client.config.
                                getboolean(opt.JOB_SERVICE_RETRY_FAILED_JOBS)):
        if (isinstance(job, StreamIngestionJob)
                and job.get_status() != SparkJobStatus.COMPLETED):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    # Reconcile by set difference on job hashes
    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logger.debug(f"existing_job_hashes = {sorted(existing_job_hashes)} "
                 f"expected_job_hashes = {sorted(expected_job_hashes)}")

    for job_hash in job_hashes_to_start:
        # Any job we want to start must be present in the expected job-hash-to-tables map
        project, feature_table = expected_job_hash_to_tables[job_hash]
        logger.warning(
            f"Starting a stream ingestion job for project={project}, "
            f"table_name={feature_table.name} with job_hash={job_hash}")
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)

        # Pause between submissions so the scheduler isn't hit with a burst of jobs
        time.sleep(client.config.getint(opt.JOB_SERVICE_PAUSE_BETWEEN_JOBS))

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        if job.get_status() != SparkJobStatus.IN_PROGRESS:
            logger.warning(
                f"Can't cancel job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
            )
            continue

        logger.warning(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logger.error(f"Job canceling failed with exception {exc}")

def test_streaming_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    local_staging_path: str,
    kafka_server,
    pytestconfig,
):
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_spark_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180
        )
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300
    )

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )
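
The avro_schema() fixture used above is not shown; a hypothetical version matching the fields the test produces (s2id, unique_drivers, event_timestamp) could look like this:

import json


def avro_schema() -> str:
    # Hypothetical schema; the real fixture in the Feast test suite may differ.
    return json.dumps({
        "type": "record",
        "name": "TestMessage",
        "fields": [
            {"name": "event_timestamp",
             "type": {"type": "long", "logicalType": "timestamp-micros"}},
            {"name": "s2id", "type": ["null", "long"]},
            {"name": "unique_drivers", "type": ["null", "long"]},
        ],
    })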