Exemple #1
0
def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Exemple #2
0
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    status = wait_retry_backoff(
        lambda:
        (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
Exemple #4
0
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name=
        "just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(output_dir,
                             azure_account_name=account_name,
                             azure_account_key=account_key)

    expected_joined_df = pd.DataFrame({
        "event_timestamp":
        customers_df.event_timestamp.tolist(),
        "user_id":
        customers_df.user_id.tolist(),
        "transactions__daily_transactions":
        transactions_df.daily_transactions.tolist() +
        [None] * transactions_df.shape[0],
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )

    job = tfrecord_feast_client.get_historical_features(
        feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
Exemple #7
0
        Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
        Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
    ],
)

client = Client(core_url=feast_core_url, serving_url=feast_online_serving_url)

# Register feature set
client.apply(all_types_fs_expected)

df.info()
df.describe()
df.head()

# Ingest tdata
client.ingest(all_types_fs_expected, df)


# Wait for data to be available
def try_get_features():
    online_request_entity = [{"user_id": 1001}]
    online_request_features = ["float_feature"]

    response = client.get_online_features(entity_rows=online_request_entity,
                                          feature_refs=online_request_features)
    response_dict = response.to_dict()
    if response_dict['float_feature'] == df.iloc[0]['float_feature']:
        return response_dict, True
    return response_dict, False

def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=None))
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame({
        "event_timestamp": [event_date for _ in customers],
        "created_timestamp": [creation_date for _ in customers],
        "user_id":
        customers,
        "daily_transactions":
        daily_transactions,
        "total_transactions":
        total_transactions,
    })

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame({
        "event_timestamp": [retrieval_date for _ in customers] +
        [retrieval_outside_max_age_date for _ in customers],
        "user_id":
        customers + customers,
    })

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame({
        "event_timestamp": [retrieval_date for _ in customers] +
        [retrieval_outside_max_age_date for _ in customers],
        "user_id":
        customers + customers,
        "transactions__daily_transactions":
        daily_transactions + [None] * len(customers),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )