Example 1
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    # generate_data() is expected to yield twice as many entity rows as
    # transaction rows, with the second half querying timestamps outside
    # max_age; those rows therefore join to None in the expected output.
    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
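The generate_data() and read_parquet() helpers come from the surrounding test module, not from the Feast API. A hypothetical sketch of generate_data(), inferred only from how this example consumes its output (twice as many entity rows as transaction rows, half of them outside max_age, as Example 3 below constructs inline), might look like the following; it is not the actual implementation:

import numpy as np
import pandas as pd
from datetime import datetime, timedelta


def generate_data():
    # Hypothetical helper: entity rows at retrieval_date fall inside the
    # two-day max_age, rows one day later fall outside it (and join to None).
    retrieval_date = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    outside_max_age_date = retrieval_date + timedelta(days=1)
    event_date = retrieval_date - timedelta(days=2)
    creation_date = retrieval_date - timedelta(days=1)
    customers = [1001, 1002, 1003, 1004, 1005]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date] * len(customers),
            "created_timestamp": [creation_date] * len(customers),
            "user_id": customers,
            "daily_transactions": np.random.rand(len(customers)) * 10,
            "total_transactions": np.random.rand(len(customers)) * 100,
        }
    )
    customers_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date] * len(customers)
            + [outside_max_age_date] * len(customers),
            "user_id": customers + customers,
        }
    )
    return transactions_df, customers_df

With this shape, the expected_joined_df built in the test lines up row for row with the job output once both are sorted by user_id and event_timestamp.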
Example 2
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(output_dir,
                             azure_account_name=account_name,
                             azure_account_key=account_key)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = tfrecord_feast_client.get_historical_features(
        feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
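read_parquet() and _get_azure_creds() are likewise local test helpers. A minimal sketch of read_parquet(), assuming the job writes a directory of Parquet part-files and that Azure credentials can be forwarded as fsspec storage options (both assumptions, not the real helper):

from typing import Optional
from urllib.parse import urlparse

import pandas as pd


def read_parquet(
    uri: str,
    azure_account_name: Optional[str] = None,
    azure_account_key: Optional[str] = None,
) -> pd.DataFrame:
    # Hypothetical helper: let pandas/fsspec resolve local, GCS, S3 or Azure
    # URIs; Azure credentials (if any) are forwarded as storage options.
    storage_options = None
    if azure_account_name and azure_account_key:
        storage_options = {
            "account_name": azure_account_name,
            "account_key": azure_account_key,
        }
    parsed = urlparse(uri)
    path = parsed.path if parsed.scheme in ("", "file") else uri
    return pd.read_parquet(path, storage_options=storage_options)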
Example 3
def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    # Events are two days before retrieval_date: still within the two-day
    # max_age at retrieval_date, but beyond it one day later, so those rows
    # are expected to join to None.
    retrieval_date = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date for _ in customers],
            "created_timestamp": [creation_date for _ in customers],
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
        }
    )

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
            "transactions__daily_transactions": daily_transactions + [None] * len(customers),
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
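The feast_client, tfrecord_feast_client, batch_source and local_staging_path parameters are pytest fixtures provided by the test suite's conftest.py. A stripped-down, hypothetical version of the file-based fixtures is sketched below; the endpoints and wiring are placeholders, and the real configuration depends on the deployed Feast 0.9-era core, serving and Spark setup (tfrecord_feast_client would be a Client configured to emit TFRecord output instead of Parquet):

import os
import tempfile

import pytest
from feast import Client, FileSource
from feast.data_format import ParquetFormat


@pytest.fixture
def local_staging_path() -> str:
    # Scratch directory used for ingested data and job output.
    return tempfile.mkdtemp()


@pytest.fixture
def feast_client() -> Client:
    # Placeholder endpoints; point these at a running Feast core/serving pair.
    return Client(core_url="localhost:6565", serving_url="localhost:6566")


@pytest.fixture
def batch_source(local_staging_path: str) -> FileSource:
    # File-backed batch source with the same columns the tests ingest.
    return FileSource(
        "event_timestamp",
        "created_timestamp",
        ParquetFormat(),
        os.path.join(local_staging_path, "transactions"),
    )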