def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    """Compare a control and a treatment CSV with TFDV and report anomalies.

    Both CSV files are read from ``gs://<gcp_bucket>/<path>``, statistics are
    generated for each, and a schema is inferred from the control set.  For
    every feature in the feature list an L-infinity threshold is installed on
    the skew or drift comparator (depending on ``mode``) before the control
    statistics are validated against the treatment statistics.

    Args:
        mode: ``"skew"`` or ``"drift"``.
        gcp_bucket: Bucket name, without the ``gs://`` prefix.
        control_set_path: Path of the control CSV inside the bucket.
        treatment_set_path: Path of the treatment CSV inside the bucket.
        feature_list_str: Python literal (e.g. ``"['a', 'b']"``) listing the
            features to check.
        Linf_value: Threshold assigned to ``infinity_norm.threshold``.

    Returns:
        The anomalies proto when ``anomaly_info`` is non-empty, otherwise
        ``None``.

    Raises:
        ValueError, SyntaxError: ``feature_list_str`` is not a plain literal.
        SystemExit: Exit code 1 when, while iterating the features, the mode
            is unsupported or (skew mode) a feature has no domain.
    """
    # Function-scope import: the file-level import block is not visible here.
    import ast

    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:{}'.format(mode))
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('control_set_path:{}'.format(control_set_path))
    logging.info('treatment_set_path:{}'.format(treatment_set_path))
    logging.info('Linf_value:{}'.format(Linf_value))

    # literal_eval accepts only Python literals, so a caller-supplied string
    # can no longer execute arbitrary code the way eval() allowed.
    feature_list = ast.literal_eval(feature_list_str)

    control_set_df = pd.read_csv(
        "gs://" + gcp_bucket + "/" + control_set_path, sep=',')
    treat_set_df = pd.read_csv(
        "gs://" + gcp_bucket + "/" + treatment_set_path, sep=',')

    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    # Only the control schema is used; the treatment schema the original
    # inferred was never read, so it is no longer computed.
    control_schema = tfdv.infer_schema(control_stats)

    for feature in feature_list:
        if mode == "skew":
            # A feature with a domain is treated as categorical here.
            if tfdv.get_feature(control_schema, feature).domain:
                tfdv.get_feature(
                    control_schema, feature
                ).skew_comparator.infinity_norm.threshold = Linf_value
            else:
                logging.critical(
                    "feature: {} is not categorical".format(feature))
                sys.exit(1)
        elif mode == "drift":
            tfdv.get_feature(
                control_schema, feature
            ).drift_comparator.infinity_norm.threshold = Linf_value
        else:
            logging.critical("mode: {} not supported".format(mode))
            sys.exit(1)

    # Per the TFDV API, skew comparators are evaluated against
    # `serving_statistics` and drift comparators against
    # `previous_statistics`; passing the treatment stats as serving stats in
    # drift mode meant the drift threshold was never checked.
    if mode == "drift":
        comparison = {"previous_statistics": treat_stats}
    else:
        comparison = {"serving_statistics": treat_stats}
    anomalies = tfdv.validate_statistics(statistics=control_stats,
                                         schema=control_schema,
                                         **comparison)

    if anomalies.anomaly_info:
        # The mode fills the "Data-{}" slot; the anomalies follow the colon.
        logging.info("Data-{} detected: {}".format(mode, anomalies))
        return anomalies

    logging.info("No data-{} detected".format(mode))
    return None
def tfdv_skew_validator(feature_name, train_stats, serve_stats, schema, threshold):
    """Check one feature for training/serving skew and display the result.

    Sets the L-infinity skew threshold of ``feature_name`` on ``schema``
    (mutating it), validates ``train_stats`` against ``serve_stats`` and
    returns the resulting anomalies after displaying them.
    """
    # NOTE: this doesn't display skew anomalies as the book shows.
    feature = tfdv.get_feature(schema, feature_name)
    feature.skew_comparator.infinity_norm.threshold = threshold

    report = tfdv.validate_statistics(
        statistics=train_stats,
        schema=schema,
        serving_statistics=serve_stats,
    )
    tfdv.display_anomalies(report)
    return report
def tfdv_drift_validator(feature_name, train_stats, previous_stats, schema, threshold):
    """Check one feature for drift against earlier statistics and display it.

    Sets the L-infinity drift threshold of ``feature_name`` on ``schema``
    (mutating it), validates ``train_stats`` against ``previous_stats`` and
    returns the resulting anomalies after displaying them.
    """
    # NOTE: this doesn't display drift anomalies as the book shows.
    feature = tfdv.get_feature(schema, feature_name)
    feature.drift_comparator.infinity_norm.threshold = threshold

    report = tfdv.validate_statistics(
        statistics=train_stats,
        schema=schema,
        previous_statistics=previous_stats,
    )
    tfdv.display_anomalies(report)
    return report
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):
    """Decide whether the 'duration' feature drifted between two stats files.

    Returns a one-element tuple holding ``'true'`` or ``'false'``:
    ``('true',)`` immediately when ``stats_older_path`` is ``'none'``,
    otherwise ``'false'`` only when the first reported drift measurement is
    strictly below its threshold.
    """
    # Imports stay inside the function and run before the early return,
    # exactly as in the original.
    import logging
    import time
    import tensorflow_data_validation as tfdv
    import tensorflow_data_validation.statistics.stats_impl

    logging.getLogger().setLevel(logging.INFO)
    logging.info('stats_older_path: %s', stats_older_path)
    logging.info('stats_new_path: %s', stats_new_path)

    # No older statistics to compare against.
    if stats_older_path == 'none':
        return ('true', )

    older_stats = tfdv.load_statistics(stats_older_path)
    newer_stats = tfdv.load_statistics(stats_new_path)

    # The schema comes from the older statistics; only 'duration' gets a
    # drift comparator.
    older_schema = tfdv.infer_schema(statistics=older_stats)
    duration = tfdv.get_feature(older_schema, 'duration')
    duration.drift_comparator.jensen_shannon_divergence.threshold = 0.01

    drift_anomalies = tfdv.validate_statistics(
        statistics=newer_stats,
        schema=older_schema,
        previous_statistics=older_stats,
    )
    logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

    from google.protobuf.json_format import MessageToDict
    as_dict = MessageToDict(drift_anomalies)
    measurement = as_dict['driftSkewInfo'][0]['driftMeasurements'][0]
    val = measurement['value']
    thresh = measurement['threshold']
    logging.info('value %s and threshold %s', val, thresh)

    decision = 'false' if val < thresh else 'true'
    logging.info('train decision: %s', decision)
    return (decision, )
# Compare evaluation data with training data displayHTML( get_statistics_html(lhs_statistics=stats_serve, rhs_statistics=stats_train, lhs_name='SERVE_DATASET', rhs_name='TRAIN_DATASET')) # COMMAND ---------- anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema) tfdv.display_anomalies(anomalies) # COMMAND ---------- # Add skew and drift comparators temp_f = tfdv.get_feature(schema, 'avg_temp_f') temp_f.skew_comparator.jensen_shannon_divergence.threshold = 0 temp_f.drift_comparator.jensen_shannon_divergence.threshold = 0 precip_mm = tfdv.get_feature(schema, 'tot_precip_mm') precip_mm.skew_comparator.jensen_shannon_divergence.threshold = 0 precip_mm.drift_comparator.jensen_shannon_divergence.threshold = 0 _anomalies = tfdv.validate_statistics(stats_train, schema, serving_statistics=stats_serve) tfdv.display_anomalies(_anomalies) # COMMAND ----------
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    """Validate the taxi-cab CSV data with TFDV and save the results.

    Infers a schema from the training CSV, validates the evaluation CSV
    against it, logs each anomaly, relaxes two schema constraints,
    re-validates, and finally saves ``column_names`` and ``schema`` under
    ``/marshal`` through ``_kale_resource_save``.

    None of the five parameters is read in the body shown here.
    NOTE(review): the ``_kale_*`` names suggest this is generated
    pipeline-step code whose signature is shared across steps - confirm.
    """
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    # Make sure the directory used for saving step outputs exists.
    _kale_data_directory = "/marshal"
    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # `os` and `shutil` are imported a second time below; several of the
    # following imports are not referenced in the visible body.
    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Of the constants from here down to FARE_KEY, only DATA_DIR, TRAIN_DATA
    # and EVALUATION_DATA (above) are read later in this function.

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour', 'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow NaN values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema will scramble the feature order).
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    # The schema is inferred from the training data only.
    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    # Validate the evaluation data against the training schema.
    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies (through the root logger, at ERROR level)
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # Show the inferred schema
    tfdv.display_schema(schema=schema)

    # Resolve anomalies: set the minimum domain mass for 'company' to 0.9.
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    # Save `column_names` and `schema` into the marshal directory. Both names
    # are assigned unconditionally above, so the `else` branches are not
    # reachable on a normal run.
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
# Parse the command line and set up timestamped INFO logging.
args = parser.parse_args()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%d-%m-%y %H:%M:%S',
)
logging.info(f"Args: {args}")

# stats1 is the reference; stats2 holds the statistics of the new data.
stats1 = tfdv.load_stats_text(input_path=args.stats_file_1)
stats2 = tfdv.load_stats_text(input_path=args.stats_file_2)
schema1 = tfdv.infer_schema(statistics=stats1)

# Custom rule, tweak as required - this is just an example.
i1_feature = tfdv.get_feature(schema1, 'I1')
i1_feature.drift_comparator.jensen_shannon_divergence.threshold = 0.06

# Drift of the new data (stats2) relative to the reference (stats1).
drift_anomalies = tfdv.validate_statistics(
    statistics=stats2,
    schema=schema1,
    previous_statistics=stats1,
)

# Convert the protobuf message to a dict and read the first measurement.
drift = MessageToDict(drift_anomalies)
first_measurement = drift['driftSkewInfo'][0]['driftMeasurements'][0]
value = first_measurement['value']
threshold = first_measurement['threshold']

logging.info(
    f"JS divergence value: {value}, and JS divergence threshold: {threshold}"
)
# Displaying all detected anomalies
# Integer larger than 10
# STRING type when expected INT type
# FLOAT type when expected INT type
# Integer smaller than 0
# display_anomalies renders its output itself and returns None, so it is
# called directly; wrapping it in print() only emitted a stray "None".
tfdv.display_anomalies(anomalies)

# New data WITH anomalies: a copy of the test set without the "soot" column
# (drop without inplace returns a new frame and leaves test_set untouched).
test_set_copy = test_set.drop("soot", axis=1)

# Statistics based on data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)

# Prepare the schema for the serving environment: declare both environments
# and mark "soot" as absent from SERVING.
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")
tfdv.get_feature(schema, "soot").not_in_environment.append("SERVING")

# Validate the reduced data set under the SERVING environment.
serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                                 schema,
                                                 environment="SERVING")
tfdv.display_anomalies(serving_env_anomalies)

# Freezing the schema
tfdv.write_schema_text(schema=schema, output_path='pollution_schema.pbtext')
# Validate the test statistics against the schema. The original ran this
# identical call twice in a row; once is enough.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display all anomalies detected in the new data
tfdv.display_anomalies(anomalies)

# New data with anomalies: a copy of the test set without the 'soot' column
# (drop without inplace returns a new frame and leaves test_set untouched).
test_set_copy = test_set.drop('soot', axis=1)

# Stats based on data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats, schema=schema)
tfdv.display_anomalies(anomalies_new)

# Stage 5: prepare the schema for serving
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

# Removing a target column from the serving schema
tfdv.get_feature(schema, 'soot').not_in_environment.append('SERVING')

# Checking for anomalies between the serving environment and the new test set
serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats, schema, environment='SERVING')
tfdv.display_anomalies(serving_env_anomalies)
def _baseline_is_acceptable(train_batch, test_batch):
    """Accept when the test batch's percent-null value lies in the baseline range."""
    percent_null_range = determine_baseline_range(train_batch)
    test_batch_percent_null, __ = percent_null_dataframes(
        test_batch, test_batch)
    observed = test_batch_percent_null.values[0][0]
    lower = percent_null_range['min'].values[0]
    upper = percent_null_range['max'].values[0]
    return bool(lower <= observed <= upper)


def _original_is_acceptable(train_batch, test_batch):
    """Accept when more than 80% of rows fall inside both metric ranges."""
    completeness_range, distinct_range = determine_acceptable_metric_range(
        train_batch)
    test_batch_completeness, __ = completeness_dataframes(
        test_batch, test_batch)
    test_batch_distinct, __ = distinct_counts_dataframes(
        test_batch, test_batch)

    completeness_within_range = 0
    distinct_within_range = 0
    for i in range(len(test_batch_distinct.values)):
        # Row i of each range table, sliced and indexed as in the original.
        distinct_bounds = distinct_range[i:i + 1]
        completeness_bounds = completeness_range[i:i + 1]
        if (distinct_bounds['min'][0]
                <= test_batch_distinct.values[i][0]
                <= distinct_bounds['max'][0]):
            distinct_within_range += 1
        if (completeness_bounds['min'][0]
                <= test_batch_completeness.values[i][0]
                <= completeness_bounds['max'][0]):
            completeness_within_range += 1

    # Both ratios use the completeness row count, as the original did.
    # NOTE(review): presumably both frames have the same number of rows.
    total = test_batch_completeness.shape[0]
    return bool(completeness_within_range / total > .8
                and distinct_within_range / total > .8)


def _tfdv_is_acceptable(train_batch, test_batch):
    """Accept when relaxing schema constraints lowers the anomaly count."""
    test_file = test_batch[0]
    # What has to be done with TFDV depends on which data set this is.
    flights = 'FLIGHTS' in test_file
    fb = 'FBPosts' in test_file
    dirty = 'dirty' in test_file

    # Update the schema with all the files in the train batch.
    updated_schema = acceptable_schema(train_batch)

    # Declare both environments so that columns present only in the clean
    # data can be excluded from TESTING.
    updated_schema.default_environment.append('TRAINING')
    updated_schema.default_environment.append('TESTING')

    # Dirty flights files are validated in the TESTING environment, where
    # 'for_key' is marked as absent. Every other file (clean files and dirty
    # facebook files) uses the default environment.
    env_kwargs = {}
    if dirty and flights:
        tfdv.get_feature(updated_schema,
                         'for_key').not_in_environment.append('TESTING')
        env_kwargs = {'environment': 'TESTING'}

    # Anomaly count before relaxing constraints.
    num_anomalies = get_num_anomalies(test_file, updated_schema, **env_kwargs)

    # Relax constraints for each type of data.
    if flights:
        tfdv.get_feature(
            updated_schema,
            'ArrivalGate').distribution_constraints.min_domain_mass = 0.8
        tfdv.get_feature(
            updated_schema,
            'DepartureGate').distribution_constraints.min_domain_mass = 0.8
    if fb:
        tfdv.get_feature(
            updated_schema,
            'outlet').distribution_constraints.min_domain_mass = 0.65
        tfdv.get_feature(
            updated_schema,
            'domain').distribution_constraints.min_domain_mass = 0.6

    # Anomaly count after relaxing constraints.
    adjusted_num_anomalies = get_num_anomalies(test_file, updated_schema,
                                               **env_kwargs)

    # "Acceptable" here means the relaxed constraints removed at least one
    # anomaly.
    return bool(adjusted_num_anomalies < num_anomalies)


def _exploratory_is_acceptable(test_batch):
    """Accept when no row exceeds both the distinct and completeness limits."""
    test_file = test_batch[0]
    # Limits are based on mean values of distinct counts and completeness
    # for each data set. The original assigned 'fb' after 'flight', so a
    # name matching both is treated as facebook data.
    if 'FBPosts' in test_file:
        distinct_limit, completeness_limit = 30, 0.25
    elif 'FLIGHTS' in test_file:
        distinct_limit, completeness_limit = 100, 0.2
    else:
        # The original hit an UnboundLocalError on `data` here.
        raise ValueError(
            "exploratory method needs a 'FLIGHTS' or 'FBPosts' file, "
            "got: {!r}".format(test_file))

    test_batch_completeness, __ = completeness_dataframes(
        test_batch, test_batch)
    test_batch_distinct, __ = distinct_counts_dataframes(
        test_batch, test_batch)

    completeness_values = test_batch_completeness.values
    for i, distinct_row in enumerate(test_batch_distinct.values):
        # A row is rejected only when both values exceed their limits; one
        # rejected row makes the whole batch unacceptable.
        if (distinct_row[0] > distinct_limit
                and completeness_values[i][0] > completeness_limit):
            return False
    return True


def is_acceptable(train_batch, test_batch, method):
    """Decide whether ``test_batch`` is acceptable relative to ``train_batch``.

    Args:
        train_batch: Training files, forwarded to the range/schema helpers.
        test_batch: Test files; ``test_batch[0]`` is inspected for the
            substrings ``'FLIGHTS'``, ``'FBPosts'`` and ``'dirty'`` by the
            ``'tfdv'`` and ``'exploratory'`` methods.
        method: One of ``'baseline'``, ``'original'``, ``'tfdv'`` or
            ``'exploratory'``.

    Returns:
        ``True`` or ``False`` for the four supported methods, ``None`` for
        any other ``method`` (unchanged from the original behaviour).

    Raises:
        ValueError: ``method`` is ``'exploratory'`` and ``test_batch[0]``
            names neither a flights nor a facebook file.
    """
    if method == 'baseline':
        return _baseline_is_acceptable(train_batch, test_batch)
    if method == 'original':
        return _original_is_acceptable(train_batch, test_batch)
    if method == 'tfdv':
        return _tfdv_is_acceptable(train_batch, test_batch)
    if method == 'exploratory':
        return _exploratory_is_acceptable(test_batch)
    # Unknown methods fall through and yield None, as before.
    return None
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display anomalies
tfdv.display_anomalies(anomalies)

"""Three features have inconsistent values: two of them have the wrong type
(net_manager, type_conn_perc), and purchase_area is expected to have values
outside the training schema because they correspond to a different region
of the Netherlands.

### Fixing the errors

The corrections depend on our knowledge of the data: we look at the
description of each feature and decide what action to take.
**net_manager** is the code of the regional network manager,
**type_conn_perc** is the percentage of presence of the principal type of
connection in the zipcode range, and **purchase_area** is the code of the
area where the energy is purchased.

For type_conn_perc let's convert it to BYTES, and append the missing values
to purchase_area.
"""

# Convert type_conn_perc to BYTES
type_conn_perc = tfdv.get_feature(schema, 'type_conn_perc')
type_conn_perc.type = 1  # Type 1 represents BYTES

# Add the missing values to the domain of feature purchase_area.
purchase_area = tfdv.get_domain(schema, 'purchase_area')
purchase_area.value.extend([
    'Stedin',
    'Stedin Delfland',
    'Stedin Elektriciteit Zuid-Kennemerland',
    'Stedin Midden-Holland',
    'Stedin Schiedam',
    'Stedin Utrecht',
    'Stedin Weert',
])

# Validate the test stats after updating the schema
updated_anomalies = tfdv.validate_statistics(test_stats, schema)
tfdv.display_anomalies(updated_anomalies)