예제 #1
0
def feature_stats_dataset_basic(client, feature_stats_feature_set):
    """Ingest a small fixed dataset and return it with its expected statistics.

    Builds a 20-row dataframe (one string, one int and one float feature, all
    rows sharing the same UTC timestamp), computes the expected TFDV
    statistics for the three feature columns, and ingests the dataframe
    through ``client``.

    Returns:
        A dict with the keys:
          * ``"df"``    - the ingested dataframe,
          * ``"id"``    - the value returned by ``client.ingest``,
          * ``"date"``  - midnight (UTC) of the day the rows were stamped with,
          * ``"stats"`` - the expected statistics proto.
    """
    row_count = 20
    feature_columns = ["strings", "ints", "floats"]

    ingestion_time = datetime.utcnow().replace(tzinfo=pytz.utc)
    columns = {
        "datetime": [ingestion_time] * row_count,
        "entity_id": list(range(row_count)),
        "strings": ["a", "b"] * (row_count // 2),
        "ints": list(range(row_count)),
        "floats": [10.5 - offset for offset in range(row_count)],
    }
    df = pd.DataFrame(columns)

    expected_stats = tfdv.generate_statistics_from_dataframe(
        df[feature_columns])
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population standard deviation; overwrite it with the
    # value from pandas' Series.std(), which defaults to the sample standard
    # deviation (ddof=1).
    for feature_stats in expected_stats.datasets[0].features:
        if not feature_stats.HasField("num_stats"):
            continue
        column_name = feature_stats.path.step[0]
        feature_stats.num_stats.std_dev = df[column_name].std()

    ingestion_id = client.ingest(feature_stats_feature_set, df)
    # Fixed pause after ingestion - presumably to let the rows become
    # available in the store before tests query them.
    time.sleep(10)

    ingestion_day = datetime(
        ingestion_time.year,
        ingestion_time.month,
        ingestion_time.day,
        tzinfo=pytz.utc,
    )
    return {
        "df": df,
        "id": ingestion_id,
        "date": ingestion_day,
        "stats": expected_stats,
    }
예제 #2
0
def test_feature_stats_force_refresh(client, feature_stats_dataset_basic,
                                     feature_stats_feature_set):
    """Check that ``force_refresh=True`` yields statistics that include new rows.

    Ingests one additional row on top of the dataset provided by the
    ``feature_stats_dataset_basic`` fixture, requests statistics for the
    fixture's date with ``force_refresh=True``, and compares them against
    TFDV statistics computed locally over the original rows plus the new one.
    """
    # Single source of truth for the features that are both requested from
    # the client and used to build the expected statistics.
    feature_names = ["strings", "ints", "floats"]

    df = feature_stats_dataset_basic["df"]

    # One extra row carrying the same timestamp as the fixture's rows, so it
    # falls inside the queried date range.
    df2 = pd.DataFrame({
        "datetime": [df.iloc[0].datetime],
        "entity_id": [10],
        "strings": ["c"],
        "ints": [2],
        "floats": [1.3],
    })
    client.ingest(feature_stats_feature_set, df2)
    # Fixed pause after ingestion - presumably to let the row become
    # available in the store before statistics are requested.
    time.sleep(10)

    actual_stats = client.get_statistics(
        "feature_stats",
        features=feature_names,
        store="historical",
        start_date=feature_stats_dataset_basic["date"],
        end_date=feature_stats_dataset_basic["date"] + timedelta(days=1),
        force_refresh=True,
    )

    combined_df = pd.concat([df, df2])
    # Only the requested feature columns belong in the expected statistics;
    # "datetime" and "entity_id" are not part of the request above.
    expected_stats = tfdv.generate_statistics_from_dataframe(
        combined_df[feature_names])

    clear_unsupported_fields(expected_stats)

    # TFDV reports the population standard deviation; overwrite it with the
    # value from pandas' Series.std(), which defaults to the sample standard
    # deviation (ddof=1).
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            feature.num_stats.std_dev = combined_df[name].std()

    assert_stats_equal(expected_stats, actual_stats)
예제 #3
0
def test_batch_dataset_statistics(client):
    """Check statistics computed alongside a historical feature retrieval.

    Ingests 21 rows into each of ``feature_set_1`` and ``feature_set_2``,
    waits until both ingestions are fully reported, retrieves the features
    with ``compute_statistics=True`` and compares the statistics returned by
    the retrieval job with TFDV statistics computed on the retrieved
    dataframe.

    Raises:
        TimeoutError: if the ingested row counts do not reach the expected
            values within ``max_wait_sec`` seconds of polling.
    """
    fs1 = client.get_feature_set(name="feature_set_1")
    fs2 = client.get_feature_set(name="feature_set_2")
    id_offset = 20

    n_rows = 21
    # Upper bound on the ingestion polling loop so a failed ingestion makes
    # the test fail instead of hanging indefinitely.
    max_wait_sec = 600
    poll_interval_sec = 30

    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    timestamps = [time_offset] * n_rows
    entity_ids = [id_offset + i for i in range(n_rows)]

    features_1_df = pd.DataFrame({
        "datetime": timestamps,
        "entity_id": entity_ids,
        "feature_value6": ["a"] * n_rows,
    })
    ingestion_id1 = client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": timestamps,
        "other_entity_id": entity_ids,
        "other_feature_value7": [i % 10 for i in range(n_rows)],
    })
    ingestion_id2 = client.ingest(fs2, features_2_df)

    entity_df = pd.DataFrame({
        "datetime": timestamps,
        "entity_id": entity_ids,
        "other_entity_id": entity_ids,
    })

    time.sleep(15)  # wait for rows to get written to bq
    deadline = time.monotonic() + max_wait_sec
    while True:
        rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1)
        rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2)
        if rows_ingested1 == len(features_1_df) and rows_ingested2 == len(
                features_2_df):
            print(
                f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing."
            )
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Ingestion did not complete within {max_wait_sec}s: "
                f"{rows_ingested1}/{len(features_1_df)} and "
                f"{rows_ingested2}/{len(features_2_df)} rows ingested.")
        time.sleep(poll_interval_sec)

    feature_retrieval_job = client.get_historical_features(
        entity_rows=entity_df,
        feature_refs=["feature_value6", "feature_set_2:other_feature_value7"],
        project=PROJECT_NAME,
        compute_statistics=True,
    )
    output = feature_retrieval_job.to_dataframe(timeout_sec=180)

    # The job output has been fetched at this point, so always remove the
    # remote files afterwards - even when one of the checks below fails.
    try:
        print(output.head(10))
        stats = feature_retrieval_job.statistics(timeout_sec=180)
        clear_unsupported_fields(stats)

        expected_stats = tfdv.generate_statistics_from_dataframe(
            output[["feature_value6", "feature_set_2__other_feature_value7"]])
        clear_unsupported_fields(expected_stats)

        # TFDV reports the population standard deviation; overwrite it with
        # the value from pandas' Series.std(), which defaults to the sample
        # standard deviation (ddof=1).
        for feature in expected_stats.datasets[0].features:
            if feature.HasField("num_stats"):
                name = feature.path.step[0]
                feature.num_stats.std_dev = output[name].std()

        assert_stats_equal(expected_stats, stats)
    finally:
        clean_up_remote_files(feature_retrieval_job.get_avro_files())