Example #1
def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
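
The ingest_and_verify helper is referenced here but not shown. A minimal sketch of what it might do, assuming it mirrors the inline flow of Example #4 (start the offline-to-online ingestion job, wait for completion, then read the features back from the online store and compare them with the source frame); the body below is an assumption, not the actual helper:

def ingest_and_verify(feast_client, feast_spark_client, feature_table, original):
    # Sketch only: launch the Spark job that copies the batch data into the online store
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    # Poll until the job reports COMPLETED
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)

    # Retrieve the ingested rows online and compare them with the input frame
    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()
    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}),
    )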
Example #2
def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            50.0,
            True,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
            True,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=1),
            200.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=4),
            datetime(year=2020, month=9, day=1),
            300.0,
            False,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "transactions",
                                                  schema, df_data)
    file_source = FileSource("event_timestamp", "created_timestamp",
                             ParquetFormat(), file_uri)
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable("transactions", ["customer_id"],
                                 features,
                                 batch_source=file_source)
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
Example #3
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
Example #4
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    status = wait_retry_backoff(
        lambda:
        (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
Example #5
def test_schedule_batch_ingestion_jobs(pytestconfig, feast_client: Client,
                                       feast_spark_client: SparkClient):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *")
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(f"{feast_client.project}-{feature_table.name}".
                               encode()).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"
    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *")
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
Example #6
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
Example #7
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
Example #8
def create_schema(kafka_broker, topic_name, feature_table_name):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name=feature_table_name,
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    return entity, feature_table
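
create_schema only declares the entity and feature table. A short usage sketch, assuming a configured feast_client plus the registration and streaming calls used in the other examples (kafka_broker, topic_name, and the table name are placeholders):

entity, feature_table = create_schema(kafka_broker, topic_name, "drivers_stream")
# Register both objects, then start the streaming ingestion job for the table
feast_client.apply(entity)
feast_client.apply(feature_table)
job = feast_client.start_stream_to_online_ingestion(feature_table)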
Example #9
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name=
        "just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids
Example #10
def bookings_feature_table(spark, client):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource("event_timestamp", "created_timestamp",
                             ParquetFormat(), file_uri)
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
Example #11
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset,
                                        feast_client: Client,
                                        feast_spark_client: SparkClient):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id",
                    description="S2id",
                    value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=
            f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Example #12
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(output_dir,
                             azure_account_name=account_name,
                             azure_account_key=account_key)

    expected_joined_df = pd.DataFrame({
        "event_timestamp":
        customers_df.event_timestamp.tolist(),
        "user_id":
        customers_df.user_id.tolist(),
        "transactions__daily_transactions":
        transactions_df.daily_transactions.tolist() +
        [None] * transactions_df.shape[0],
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )

    job = tfrecord_feast_client.get_historical_features(
        feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
Example #13
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            "event_timestamp",
            "event_timestamp",
            kafka_broker,
            AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 60)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 60)

    try:
        original = generate_data()[[
            "s2id", "unique_drivers", "event_timestamp"
        ]]
        for record in original.to_dict("records"):
            record["event_timestamp"] = (
                record["event_timestamp"].to_pydatetime().replace(
                    tzinfo=pytz.utc))

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=[{
                    "s2id": s2_id
                } for s2_id in original["s2id"].tolist()],
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Example #14
def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING)
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_test:num",
            "set": "validation_test:set"
        }),
    )
Example #15
def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=None))
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame({
        "event_timestamp": [event_date for _ in customers],
        "created_timestamp": [creation_date for _ in customers],
        "user_id":
        customers,
        "daily_transactions":
        daily_transactions,
        "total_transactions":
        total_transactions,
    })

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame({
        "event_timestamp": [retrieval_date for _ in customers] +
        [retrieval_outside_max_age_date for _ in customers],
        "user_id":
        customers + customers,
    })

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame({
        "event_timestamp": [retrieval_date for _ in customers] +
        [retrieval_outside_max_age_date for _ in customers],
        "user_id":
        customers + customers,
        "transactions__daily_transactions":
        daily_transactions + [None] * len(customers),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )
Example #16
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server, pytestconfig):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS),
            120)
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300)

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{
                "s2id": s2_id
            } for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Example #17
from google.protobuf.duration_pb2 import Duration

from feast import BigQuerySource, Entity, Feature, FeatureTable, ValueType

driver_locations_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.drivers",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)


driver = Entity(
    name="driver",  # The name is derived from this argument, not object name.
    value_type=ValueType.INT64,
    description="driver id",
)


driver_locations = FeatureTable(
    name="driver_locations",
    entities=["driver"],
    max_age=Duration(seconds=86400 * 1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    batch_source=driver_locations_source,
)
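
These declarations only define the objects; registering them requires a client, as in the earlier examples. A minimal sketch, assuming a locally running Feast core (the address is a placeholder):

from feast import Client

client = Client(core_url="localhost:6565")  # placeholder address
client.apply(driver)
client.apply(driver_locations)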