def test_feature_stats_agg_over_datasets(client, feature_stats_dataset_agg):
    """Stats aggregated over several ingestion ids match the fixture's
    precomputed statistics."""
    dataset = feature_stats_dataset_agg
    retrieved = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store=STORE_NAME,
        ingestion_ids=dataset["ids"],
    )
    assert_stats_equal(dataset["stats"], retrieved)
def test_feature_stats_agg_over_dates(client, feature_stats_dataset_agg):
    """Stats aggregated over a date range match the fixture's precomputed
    statistics."""
    dataset = feature_stats_dataset_agg
    retrieved = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store=STORE_NAME,
        start_date=dataset["start_date"],
        end_date=dataset["end_date"],
    )
    assert_stats_equal(dataset["stats"], retrieved)
def test_feature_stats_by_date(client, feature_stats_dataset_basic):
    """Stats retrieved for a single-day window match the fixture's
    precomputed statistics."""
    dataset = feature_stats_dataset_basic
    day_start = dataset["date"]
    retrieved = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store=STORE_NAME,
        start_date=day_start,
        end_date=day_start + timedelta(days=1),
    )
    assert_stats_equal(dataset["stats"], retrieved)
def test_feature_stats_retrieval_by_single_dataset(
        client, feature_stats_dataset_basic):
    """Stats retrieved for one ingestion id match the fixture's precomputed
    statistics."""
    dataset = feature_stats_dataset_basic
    retrieved = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store=STORE_NAME,
        ingestion_ids=[dataset["id"]],
    )
    assert_stats_equal(dataset["stats"], retrieved)
def test_feature_stats_force_refresh(client, feature_stats_dataset_basic,
                                     feature_stats_feature_set):
    """force_refresh=True recomputes statistics, so rows ingested after the
    first computation must be reflected in the result."""
    base_df = feature_stats_dataset_basic["df"]
    extra_df = pd.DataFrame({
        "datetime": [base_df.iloc[0].datetime],
        "entity_id": [10],
        "strings": ["c"],
        "ints": [2],
        "floats": [1.3],
    })
    client.ingest(feature_stats_feature_set, extra_df)
    time.sleep(10)  # give the newly ingested row time to land in the store

    actual_stats = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store="historical",
        start_date=feature_stats_dataset_basic["date"],
        end_date=feature_stats_dataset_basic["date"] + timedelta(days=1),
        force_refresh=True,
    )

    combined_df = pd.concat([base_df, extra_df])
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_fields(expected_stats)
    # TFDV reports population std dev; overwrite with pandas' sample std dev
    # to match what the service computes.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            column = feature.path.step[0]
            feature.num_stats.std_dev = combined_df[column].std()
    assert_stats_equal(expected_stats, actual_stats)
def test_batch_dataset_statistics(client):
    """End-to-end check of statistics computed during batch retrieval.

    Ingests fresh rows into two feature sets, waits for them to become
    visible in the batch (BigQuery) store, runs a historical retrieval with
    ``compute_statistics=True``, and asserts the returned statistics match
    TFDV statistics computed directly over the retrieved dataframe.

    Fix: the original polling loop was ``while True`` with no exit on
    failure, so a lost ingestion would hang the test suite indefinitely.
    The wait is now bounded and raises ``TimeoutError`` with a diagnostic
    message instead.
    """
    fs1 = client.get_feature_set(name="feature_set_1")
    fs2 = client.get_feature_set(name="feature_set_2")
    id_offset = 20
    n_rows = 21
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)

    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "feature_value6": ["a"] * n_rows,
    })
    ingestion_id1 = client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "other_entity_id": [id_offset + i for i in range(n_rows)],
        "other_feature_value7": [i % 10 for i in range(n_rows)],
    })
    ingestion_id2 = client.ingest(fs2, features_2_df)

    entity_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "other_entity_id": [id_offset + i for i in range(n_rows)],
    })

    time.sleep(15)  # wait for rows to get written to bq
    # Poll until both ingestions are fully visible, but never wait forever:
    # a stuck ingestion should fail the test, not hang the suite.
    deadline = time.time() + 600  # 10-minute cap on waiting for BQ rows
    while True:
        rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1)
        rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2)
        if rows_ingested1 == len(features_1_df) and rows_ingested2 == len(
                features_2_df):
            print(
                f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing."
            )
            break
        if time.time() > deadline:
            raise TimeoutError(
                f"Ingested rows not visible in batch store within 600s: "
                f"{rows_ingested1}/{len(features_1_df)} and "
                f"{rows_ingested2}/{len(features_2_df)} rows ingested.")
        time.sleep(30)

    feature_retrieval_job = client.get_historical_features(
        entity_rows=entity_df,
        feature_refs=["feature_value6", "feature_set_2:other_feature_value7"],
        project=PROJECT_NAME,
        compute_statistics=True,
    )
    output = feature_retrieval_job.to_dataframe(timeout_sec=180)
    print(output.head(10))
    stats = feature_retrieval_job.statistics(timeout_sec=180)
    clear_unsupported_fields(stats)

    expected_stats = tfdv.generate_statistics_from_dataframe(
        output[["feature_value6", "feature_set_2__other_feature_value7"]])
    clear_unsupported_fields(expected_stats)
    # TFDV computes population std dev; overwrite with pandas' sample std
    # dev to match what the service computes.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            feature.num_stats.std_dev = output[name].std()

    assert_stats_equal(expected_stats, stats)
    clean_up_remote_files(feature_retrieval_job.get_avro_files())