def csv_statistics_validator(stats, schema):
    """Validate statistics from a CSV dataset against a schema."""
    stats_anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)
    tfdv.display_anomalies(stats_anomalies)
    return stats_anomalies
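# A minimal usage sketch for csv_statistics_validator above, assuming
# tensorflow_data_validation is available; 'data/train.csv' and
# 'data/eval.csv' are hypothetical paths, not from the original snippet.
import tensorflow_data_validation as tfdv

train_stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
schema = tfdv.infer_schema(statistics=train_stats)
eval_stats = tfdv.generate_statistics_from_csv(data_location='data/eval.csv')
eval_anomalies = csv_statistics_validator(eval_stats, schema)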
def detect_anomalies(stats_uri: Text, schema_uri: Text, split_name: Text):
    schema = get_schema_proto(schema_uri)
    stats = get_statistics_dataset_dict(stats_uri)
    if split_name not in stats:
        raise ValueError(f'{split_name} split not present!')
    anomalies = tfdv.validate_statistics(stats[split_name], schema)
    tfdv.display_anomalies(anomalies)
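# get_schema_proto and get_statistics_dataset_dict are not defined in this
# snippet. A plausible sketch, assuming the schema is stored as pbtxt and the
# statistics as a DatasetFeatureStatisticsList with one dataset per split
# (the storage layout is an assumption):
from typing import Dict, Text

import tensorflow_data_validation as tfdv
from tensorflow_metadata.proto.v0 import schema_pb2, statistics_pb2


def get_schema_proto(schema_uri: Text) -> schema_pb2.Schema:
    return tfdv.load_schema_text(schema_uri)


def get_statistics_dataset_dict(
    stats_uri: Text) -> Dict[Text, statistics_pb2.DatasetFeatureStatisticsList]:
    stats = tfdv.load_statistics(stats_uri)
    # Re-wrap each split's statistics in a single-dataset list, since
    # tfdv.validate_statistics expects a DatasetFeatureStatisticsList.
    split_stats = {}
    for dataset in stats.datasets:
        single = statistics_pb2.DatasetFeatureStatisticsList()
        single.datasets.add().CopyFrom(dataset)
        split_stats[dataset.name] = single
    return split_stats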
def display(self, artifact: types.Artifact):
    from IPython.core.display import display  # pylint: disable=g-import-not-at-top
    from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top

    for split in artifact_utils.decode_split_names(artifact.split_names):
        display(HTML('<div><b>%r split:</b></div><br/>' % split))
        anomalies_path = os.path.join(artifact.uri, split, 'anomalies.pbtxt')
        anomalies = tfdv.load_anomalies_text(anomalies_path)
        tfdv.display_anomalies(anomalies)
def tfdv_skew_validator(feature_name, train_stats, serve_stats, schema, threshold):
    """Validate training/serving skew for the CSV dataset."""
    # NOTE: this doesn't display skew anomalies as the book shows
    tfdv.get_feature(schema, feature_name).skew_comparator.infinity_norm.threshold = threshold
    skew_anomalies = tfdv.validate_statistics(statistics=train_stats,
                                              schema=schema,
                                              serving_statistics=serve_stats)
    tfdv.display_anomalies(skew_anomalies)
    return skew_anomalies
def tfdv_drift_validator(feature_name, train_stats, previous_stats, schema, threshold):
    """Validate drift for the CSV dataset."""
    # NOTE: this doesn't display drift anomalies as the book shows
    tfdv.get_feature(schema, feature_name).drift_comparator.infinity_norm.threshold = threshold
    drift_anomalies = tfdv.validate_statistics(statistics=train_stats,
                                               schema=schema,
                                               previous_statistics=previous_stats)
    tfdv.display_anomalies(drift_anomalies)
    return drift_anomalies
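# A hedged usage sketch for the two validators above; the CSV paths and the
# 'payment_type' feature name are assumptions. The infinity-norm comparators
# apply to categorical features, and anomalies surface only when the
# L-infinity distance between the two distributions exceeds the threshold.
train_stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
serve_stats = tfdv.generate_statistics_from_csv(data_location='data/serve.csv')
previous_stats = tfdv.generate_statistics_from_csv(data_location='data/previous_day.csv')
schema = tfdv.infer_schema(statistics=train_stats)

tfdv_skew_validator('payment_type', train_stats, serve_stats, schema, threshold=0.01)
tfdv_drift_validator('payment_type', train_stats, previous_stats, schema, threshold=0.01)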
def display(self, artifact: types.Artifact):
    from IPython.core.display import display  # pylint: disable=g-import-not-at-top
    from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top

    for split in artifact_utils.decode_split_names(artifact.split_names):
        display(HTML('<div><b>%r split:</b></div><br/>' % split))
        anomalies_path = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri([artifact], split))
        if artifact_utils.is_artifact_version_older_than(
                artifact, artifact_utils._ARTIFACT_VERSION_FOR_ANOMALIES_UPDATE):  # pylint: disable=protected-access
            # Older artifact versions store anomalies as pbtxt.
            anomalies = tfdv.load_anomalies_text(anomalies_path)
        else:
            # Newer artifact versions store the serialized Anomalies proto.
            anomalies = anomalies_pb2.Anomalies()
            anomalies_bytes = io_utils.read_bytes_file(anomalies_path)
            anomalies.ParseFromString(anomalies_bytes)
        tfdv.display_anomalies(anomalies)
schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare serving data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)

# COMMAND ----------

# Add skew and drift comparators (a threshold of 0 flags any divergence)
temp_f = tfdv.get_feature(schema, 'avg_temp_f')
temp_f.skew_comparator.jensen_shannon_divergence.threshold = 0
temp_f.drift_comparator.jensen_shannon_divergence.threshold = 0

precip_mm = tfdv.get_feature(schema, 'tot_precip_mm')
precip_mm.skew_comparator.jensen_shannon_divergence.threshold = 0
precip_mm.drift_comparator.jensen_shannon_divergence.threshold = 0

_anomalies = tfdv.validate_statistics(stats_train, schema,
                                      serving_statistics=stats_serve)
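# COMMAND ----------

# A minimal sketch (not in the original notebook) showing how the skew result
# above can be inspected programmatically as well as rendered.
tfdv.display_anomalies(_anomalies)
for feature_name, info in _anomalies.anomaly_info.items():
    print(f'{feature_name}: {info.short_description}')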
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                    HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    import logging

    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"
    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv
    from apache_beam.io import textio
    from apache_beam.io import tfrecordio
    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour', 'trip_start_day',
                                'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10
    BUCKET_FEATURE_KEYS = ['pickup_latitude', 'pickup_longitude',
                           'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform.
    VOCAB_SIZE = 1000
    # Count of out-of-vocab buckets into which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10
    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract',
                          'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow NaN values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude',
                         'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # Key columns: list of the names of columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema will scramble the features).
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies.
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # Show the inferred schema.
    tfdv.display_schema(schema=schema)

    # Resolve anomalies: relax the minimum fraction of values that must come
    # from the domain of feature company.
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema.
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names,
                            os.path.join(_kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")

    if "schema" in locals():
        _kale_resource_save(schema,
                            os.path.join(_kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
# %%
# Compute stats for evaluation data.
test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)

# Compare evaluation data with training data.
tfdv.visualize_statistics(lhs_statistics=test_stats,
                          rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET',
                          rhs_name='TRAIN_DATASET')

# %%
'''
## Evaluate for anomalies against the schema
'''

# %%
anomalies = tfdv.validate_statistics(statistics=train_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# %%
# Check that the new data complies with the schema of the train split.

# %%
data_stats = tfdv.generate_statistics_from_dataframe(data)
serving_anomalies = tfdv.validate_statistics(data_stats, schema)
tfdv.display_anomalies(serving_anomalies)
def display(self, artifact: types.Artifact):
    anomalies_path = os.path.join(artifact.uri, 'anomalies.pbtxt')
    anomalies = tfdv.load_anomalies_text(anomalies_path)
    tfdv.display_anomalies(anomalies)
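# A small round-trip sketch for the pbtxt format read by display() above,
# assuming a TFDV version that exports write_anomalies_text and a
# hypothetical 'outputs/anomalies.pbtxt' path.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.write_anomalies_text(anomalies, 'outputs/anomalies.pbtxt')
reloaded = tfdv.load_anomalies_text('outputs/anomalies.pbtxt')
tfdv.display_anomalies(reloaded)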
# Generate training data statistics and infer a schema from them.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=dataset)
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the schema.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display all detected anomalies:
# - Integer larger than 10
# - STRING type when expected INT type
# - FLOAT type when expected INT type
# - Integer smaller than 0
tfdv.display_anomalies(anomalies)

# New data WITH anomalies.
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)

# Statistics based on data with anomalies.
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)

# Prepare the schema for the SERVING environment.
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")
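# The environment setup above typically continues by excluding the label from
# serving data and validating with environment='SERVING'. A sketch, assuming
# a hypothetical label column named 'label':
tfdv.get_feature(schema, "label").not_in_environment.append("SERVING")
serving_env_anomalies = tfdv.validate_statistics(
    statistics=test_stats, schema=schema, environment="SERVING")
tfdv.display_anomalies(serving_env_anomalies)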
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

### Compute statistics for the test set
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

## Step 5: Compare the test statistics with the schema

### Check for anomalies in the new data
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

### Display the detected anomalies
# - Integers larger than 10
# - Expected STRING type but the column was INT
# - Expected FLOAT type but the column was INT
# - Integers smaller than 0
tfdv.display_anomalies(anomalies)

### New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)
test_set_copy.describe()

### Statistics based on the data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)
"""In evalation of the data TFDV shows in the plot both the train and eval stats overlaid, it is possible to check the distribuition visually, it can speedy the change in the schema when necessary. Looking in the plots we can consider both the train and eval has the same distribution, the difenrrence in the plots is just in the quantity of each value in the features. ### Anomalies After generate stats for both train and eval dataset, it necessary to check if the schema define to each one is the same (range of values, type of feature and so on), TFDV has a function to look for anomalies, e.g, features define different for each schema. """ # Check eval data for errors by validating the Eval data stats using the # previously inferred schema. anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema) # Display anomalies tfdv.display_anomalies(anomalies) """We have just one feature with anomalies, look deeply we can check the values are correct but the problem there is a period and a space in the values, so the TFDV considered it as a new values. ### Fixing the erros The corrections will depend of our knowlegde of the data, looking at each feature and decide what action should be take, for our case it is not necessary to ajusted the schema, but to test how to correct the errors we will add the values to the domain and check again the anomalies """ # Add new value to the domain of feature wage. wage = tfdv.get_domain(schema, 'wage') wage.value.append(' <=50K.') wage.value.append(' >50K.') # Validate eval stats after updating the schema updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
# MAGIC %md Infer the schema

# COMMAND ----------

weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------

# MAGIC %md Check for anomalies

# COMMAND ----------

weather_anomalies = tfdv.validate_statistics(statistics=stats,
                                             schema=weather_data_schema)
tfdv.display_anomalies(weather_anomalies)

# COMMAND ----------

# MAGIC %md Average monthly temperature

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT x.month, AVG(x.avg_temp_f) AS avg_temp_f FROM (
# MAGIC   SELECT MONTH(time) AS month, YEAR(time) AS year, SUM(avg_temp_f) AS avg_temp_f
# MAGIC   FROM dscc202_group05_db.weather_delta_bronze
# MAGIC   GROUP BY MONTH(time), YEAR(time)
# MAGIC ) x
# MAGIC GROUP BY x.month
# MAGIC ORDER BY x.month