def tfrecord_statistics_generator(file_path):
    """Generate statistics and an inferred schema for a TFRecord dataset."""
    tfrecord_stats = tfdv.generate_statistics_from_tfrecord(data_location=file_path)
    tfrecord_schema = tfdv.infer_schema(tfrecord_stats)
    tfdv.display_schema(tfrecord_schema)
    return tfrecord_stats, tfrecord_schema
def csv_statistics_generator(file_path):
    """Generate statistics and an inferred schema for a CSV dataset."""
    csv_stats = tfdv.generate_statistics_from_csv(data_location=file_path, delimiter=',')
    csv_schema = tfdv.infer_schema(csv_stats)
    tfdv.display_schema(csv_schema)
    return csv_stats, csv_schema
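# A minimal usage sketch for the two generators above; the file paths are
# hypothetical placeholders, not taken from the original snippets.
train_stats, train_schema = tfrecord_statistics_generator('data/train.tfrecord')
eval_stats, eval_schema = csv_statistics_generator('data/eval.csv')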
def view_schema(uri: Text):
    """View schema.

    Args:
      uri: URI to schema.
    """
    schema = get_schema_proto(uri)  # TODO: [LOW] Replace with our own function
    tfdv.display_schema(schema=schema)
def display_stats_for_examples(self, examples_id):
    """Displays stats for `examples_id`.

    Args:
      examples_id: An `int` indicating the id of a
        `TFXArtifactTypes.EXAMPLES` artifact.
    """
    stats_artifact = self.get_dest_artifact_of_type(
        examples_id, TFXArtifactTypes.EXAMPLE_STATS)
    if stats_artifact:
        # Load the statistics once and reuse them for both the visualization
        # and the inferred-schema display.
        stats = tfdv.load_statistics(
            os.path.join(stats_artifact.uri, 'stats_tfrecord'))
        tfdv.visualize_statistics(stats)
        print("display schema")
        tfdv.display_schema(tfdv.infer_schema(statistics=stats))
# COMMAND ----------

from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

stats_train = tfdv.generate_statistics_from_dataframe(
    dataframe=train_df.toPandas())
stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas())

schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare serving data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)
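# COMMAND ----------

# If the serving data legitimately lacks a training-only column (typically
# the label), a common follow-up is to declare schema environments instead of
# loosening the schema globally. This is a hedged sketch, not part of the
# original notebook; 'label' is a hypothetical column name.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')
tfdv.get_feature(schema, 'label').not_in_environment.append('SERVING')

serving_anomalies = tfdv.validate_statistics(
    statistics=stats_serve, schema=schema, environment='SERVING')
tfdv.display_anomalies(serving_anomalies)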
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                    HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import logging

    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = ['pickup_latitude', 'pickup_longitude',
                           'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract',
                          'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow NaN values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude',
                         'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as
    # unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema will scramble the features).
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # Show inferred schema
    tfdv.display_schema(schema=schema)

    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")

    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
def display(self, artifact: types.Artifact):
    schema_path = os.path.join(artifact.uri, 'schema.pbtxt')
    schema = tfdv.load_schema_text(schema_path)
    tfdv.display_schema(schema)
import pandas as pd
import tensorflow_data_validation as tfdv

# Simple dataset analysis
dataset = pd.read_csv("data/pollution-small.csv")
print(dataset.shape)

training_data = dataset[:1600]
print(training_data.describe())

test_set = dataset[1600:]
print(test_set.describe())

# Generate training data statistics
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the schema
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display all detected anomalies, e.g.:
# - Integer larger than 10
# - STRING type when expected INT type
# - FLOAT type when expected INT type
# - Integer smaller than 0
tfdv.display_anomalies(anomalies)

# New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)
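# A natural continuation (an assumption, not in the original snippet):
# validate statistics for the modified copy so the dropped "soot" column
# surfaces as a missing-column anomaly.
test_stats_copy = tfdv.generate_statistics_from_dataframe(dataframe=test_set_copy)
anomalies_copy = tfdv.validate_statistics(statistics=test_stats_copy, schema=schema)
tfdv.display_anomalies(anomalies_copy)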
from pathlib import Path

import tensorflow_data_validation as tfdv

data_folder = Path("../dataset")

# The paths below are relative to data_folder.
users_file_glob = "AllUsers.csv"
ads_file_glob = "AllAds.csv"
users_ads_ratings = "users-ads-without-gcp-ratings.csv"

users_stats = tfdv.generate_statistics_from_csv(
    (data_folder / f"*{users_file_glob}").as_posix())
tfdv.visualize_statistics(users_stats)

user_schema = tfdv.infer_schema(statistics=users_stats)
tfdv.display_schema(schema=user_schema)

ads_stats = tfdv.generate_statistics_from_csv(
    (data_folder / f"*{ads_file_glob}").as_posix())
tfdv.visualize_statistics(ads_stats)

ads_schema = tfdv.infer_schema(statistics=ads_stats)
tfdv.display_schema(schema=ads_schema)

users_ratings_stats = tfdv.generate_statistics_from_csv(
    (data_folder / f"*{users_ads_ratings}").as_posix())
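# A natural continuation (an assumption, not in the original snippet):
# inspect and schema-check the ratings file the same way as users and ads.
tfdv.visualize_statistics(users_ratings_stats)
users_ratings_schema = tfdv.infer_schema(statistics=users_ratings_stats)
tfdv.display_schema(schema=users_ratings_schema)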
def _tfdv_schema_info(schema):
    with TFDVSchemaDisplay() as display:
        tfdv.display_schema(schema)
    assert len(display.display_obj) == 2, display.display_obj
    return display.display_obj
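# Hypothetical sketch of the `TFDVSchemaDisplay` helper assumed above: it
# records the two objects (the features and domains tables) that
# tfdv.display_schema hands to IPython's display(). The patch target is an
# assumption about TFDV's internals and may differ between versions.
from unittest import mock


class TFDVSchemaDisplay:
    def __enter__(self):
        self.display_obj = []
        # Replace display() with a callable that appends each displayed
        # object to self.display_obj for later inspection.
        self._patcher = mock.patch(
            'tensorflow_data_validation.utils.display_util.display',
            new=self.display_obj.append)
        self._patcher.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._patcher.stop()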
import warnings

import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

stats = tfdv.generate_statistics_from_dataframe(dataframe=weatherDF.toPandas())
tfdv.visualize_statistics(stats)
displayHTML(get_statistics_html(stats))
# tot_precip_mm has about 90% zeros!, avg_wnd_mps has about 16%
# No missing data

# COMMAND ----------

# MAGIC %md infer schema

# COMMAND ----------

weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------

# MAGIC %md check for anomalies

# COMMAND ----------

weather_anomalies = tfdv.validate_statistics(statistics=stats,
                                             schema=weather_data_schema)
tfdv.display_anomalies(weather_anomalies)

# COMMAND ----------

# MAGIC %md average monthly temperature
import io
from contextlib import redirect_stdout


def _render_inferred_schema_summary_markdown(inferred_schema):
    # Capture the textual output of display_schema and embed it in markdown.
    display_schema_out = io.StringIO()
    with redirect_stdout(display_schema_out):
        tfdv.display_schema(inferred_schema)
    return f'''# Inferred schema summary

{display_schema_out.getvalue()}'''
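# A hedged usage sketch for the renderer above; the CSV path is a
# hypothetical placeholder.
import tensorflow_data_validation as tfdv

stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
summary_md = _render_inferred_schema_summary_markdown(tfdv.infer_schema(stats))
print(summary_md)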