import time
from datetime import datetime, timedelta

import pandas as pd
import pytest
import pytz
import tensorflow_data_validation as tfdv

# Helpers such as clear_unsupported_fields, assert_stats_equal,
# get_rows_ingested, clean_up_remote_files, and the PROJECT_NAME constant are
# assumed to be defined elsewhere in this test module.


@pytest.fixture
def feature_stats_dataset_basic(client, feature_stats_feature_set):
    n_rows = 20
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": list(range(n_rows)),
        "strings": ["a", "b"] * (n_rows // 2),
        "ints": list(range(n_rows)),
        "floats": [10.5 - i for i in range(n_rows)],
    })

    expected_stats = tfdv.generate_statistics_from_dataframe(
        df[["strings", "ints", "floats"]])
    clear_unsupported_fields(expected_stats)

    # TFDV computes the population std dev (ddof=0), whereas the statistics
    # this suite compares against use the sample std dev (ddof=1), as pandas
    # does, so overwrite the expected values before comparing.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = df[name].std()
            feature.num_stats.std_dev = std

    ingestion_id = client.ingest(feature_stats_feature_set, df)
    time.sleep(10)  # give the ingested rows time to land in the batch store
    return {
        "df": df,
        "id": ingestion_id,
        "date": datetime(time_offset.year, time_offset.month,
                         time_offset.day).replace(tzinfo=pytz.utc),
        "stats": expected_stats,
    }
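# Why the std_dev patching above is needed, as a minimal self-contained
# illustration (a sketch added for exposition, not part of the original
# suite): TFDV reports the population standard deviation (ddof=0), while
# pandas' Series.std() defaults to the sample standard deviation (ddof=1).
def _illustrate_population_vs_sample_std():
    import numpy as np

    values = [10.5 - i for i in range(20)]  # same series as the "floats" column
    population_std = np.std(values)       # ddof=0, what TFDV computes
    sample_std = pd.Series(values).std()  # ddof=1, what df[name].std() returns
    # The two differ by a factor of sqrt(n / (n - 1)), hence the patching loop.
    assert abs(sample_std - population_std * (20 / 19) ** 0.5) < 1e-9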
def test_feature_stats_force_refresh(client, feature_stats_dataset_basic,
                                     feature_stats_feature_set):
    df = feature_stats_dataset_basic["df"]

    # Ingest one extra row after the fixture's statistics were computed, so
    # that any previously computed statistics are stale.
    df2 = pd.DataFrame({
        "datetime": [df.iloc[0].datetime],
        "entity_id": [10],
        "strings": ["c"],
        "ints": [2],
        "floats": [1.3],
    })
    client.ingest(feature_stats_feature_set, df2)
    time.sleep(10)

    # force_refresh=True recomputes the statistics over the full dataset
    # instead of serving cached results.
    actual_stats = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store="historical",
        start_date=feature_stats_dataset_basic["date"],
        end_date=feature_stats_dataset_basic["date"] + timedelta(days=1),
        force_refresh=True,
    )

    # Restrict the expected statistics to the requested features; otherwise
    # TFDV would also generate statistics for the datetime and entity columns.
    combined_df = pd.concat([df, df2])[["strings", "ints", "floats"]]
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_fields(expected_stats)

    # TFDV computes the population std dev; replace it with the sample std dev.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = combined_df[name].std()
            feature.num_stats.std_dev = std

    assert_stats_equal(expected_stats, actual_stats)
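# A hedged companion sketch (not part of the original test): with
# force_refresh=False the same call is assumed to serve previously computed,
# possibly stale statistics, which is exactly what the test above guards
# against by forcing a recomputation.
def _sketch_cached_statistics(client, feature_stats_dataset_basic):
    return client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store="historical",
        start_date=feature_stats_dataset_basic["date"],
        end_date=feature_stats_dataset_basic["date"] + timedelta(days=1),
        force_refresh=False,  # assumption: cached statistics are returned
    )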
def test_batch_dataset_statistics(client):
    fs1 = client.get_feature_set(name="feature_set_1")
    fs2 = client.get_feature_set(name="feature_set_2")

    id_offset = 20
    n_rows = 21
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)

    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "feature_value6": ["a" for i in range(n_rows)],
    })
    ingestion_id1 = client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "other_entity_id": [id_offset + i for i in range(n_rows)],
        "other_feature_value7": [i % 10 for i in range(n_rows)],
    })
    ingestion_id2 = client.ingest(fs2, features_2_df)

    entity_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "other_entity_id": [id_offset + i for i in range(n_rows)],
    })

    time.sleep(15)  # wait for the rows to be written to BigQuery

    # Poll until both ingestion batches are fully visible in the batch store
    # (a hypothetical sketch of get_rows_ingested follows this test).
    while True:
        rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1)
        rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2)
        if (rows_ingested1 == len(features_1_df)
                and rows_ingested2 == len(features_2_df)):
            print(f"Number of rows successfully ingested: "
                  f"{rows_ingested1}, {rows_ingested2}. Continuing.")
            break
        time.sleep(30)

    feature_retrieval_job = client.get_historical_features(
        entity_rows=entity_df,
        feature_refs=["feature_value6", "feature_set_2:other_feature_value7"],
        project=PROJECT_NAME,
        compute_statistics=True,
    )
    output = feature_retrieval_job.to_dataframe(timeout_sec=180)
    print(output.head(10))

    stats = feature_retrieval_job.statistics(timeout_sec=180)
    clear_unsupported_fields(stats)

    expected_stats = tfdv.generate_statistics_from_dataframe(
        output[["feature_value6", "feature_set_2__other_feature_value7"]])
    clear_unsupported_fields(expected_stats)

    # TFDV computes the population std dev; replace it with the sample std dev.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = output[name].std()
            feature.num_stats.std_dev = std

    assert_stats_equal(expected_stats, stats)
    clean_up_remote_files(feature_retrieval_job.get_avro_files())
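# A hypothetical sketch of the get_rows_ingested helper polled above, assuming
# the historical store is BigQuery and that ingested rows carry an
# ingestion_id column; the table naming scheme is an assumption, not a
# documented layout, and the parameters are explicit for self-containment.
def _sketch_get_rows_ingested(feature_set, ingestion_id, bq_project,
                              bq_dataset):
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=bq_project)
    table = f"{bq_project}.{bq_dataset}.{feature_set.name}"  # assumed naming
    rows = bq_client.query(
        f"SELECT COUNT(*) AS count FROM `{table}` "
        f"WHERE ingestion_id = '{ingestion_id}'").result()
    return next(iter(rows)).count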