def Do(self, input_dict, output_dict, exec_properties):
    """Validate the 'eval' split statistics against the schema.

    Args:
      input_dict: Mapping from input key to a list of artifacts. Two keys are
        read: 'schema' (resolved to a single URI) and 'stats' (resolved to
        the URI of the 'eval' split).
      output_dict: Mapping from output key to a list of artifacts. The
        'output' entry is resolved to a single URI; the anomalies pbtxt file
        (named DEFAULT_FILE_NAME) is written underneath it.
      exec_properties: Execution properties; only forwarded to the startup
        log.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    tf.logging.info('Validating schema against the computed statistics.')

    # Load the schema file found under the single schema artifact URI.
    schema_reader = io_utils.SchemaReader()
    schema_dir = types.get_single_uri(input_dict['schema'])
    schema = schema_reader.read(io_utils.get_only_uri_in_dir(schema_dir))

    # Load the statistics of the 'eval' split only.
    eval_stats_dir = types.get_split_uri(input_dict['stats'], 'eval')
    stats = tfdv.load_statistics(io_utils.get_only_uri_in_dir(eval_stats_dir))

    output_uri = types.get_single_uri(output_dict['output'])
    anomalies = tfdv.validate_statistics(stats, schema)

    anomalies_file = os.path.join(output_uri, DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(anomalies_file, anomalies)
    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
def csv_statistics_validator(stats, schema):
    """Validate dataset statistics against a schema and display the result.

    Args:
      stats: Statistics passed to `tfdv.validate_statistics`.
      schema: Schema the statistics are validated against.

    Returns:
      The anomalies object produced by `tfdv.validate_statistics`.
    """
    detected = tfdv.validate_statistics(schema=schema, statistics=stats)
    tfdv.display_anomalies(detected)
    return detected
def tfdv_statistics_anomalies(
        source_statistics: DatasetFeatureStatisticsList,
        input_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """Validate input statistics against a schema inferred from source statistics.

    Args:
      source_statistics: Statistics from which the reference schema is
        inferred.
      input_statistics: Statistics validated against that schema.

    Returns:
      A pair `(found, anomalies)`. `anomalies` is the value stored under the
      'anomalyInfo' key of the validation result converted by
      `tfdv_object_to_dict` (an empty dict when the key is absent), and
      `found` is True when it contains at least one entry.
    """
    reference_schema = tfdv.infer_schema(source_statistics)
    validation_result = tfdv.validate_statistics(
        statistics=input_statistics, schema=reference_schema)
    report = tfdv_object_to_dict(validation_result)

    per_feature = report.get('anomalyInfo', {})
    found = len(per_feature) > 0
    return found, per_feature
def _Validate(self, inputs: Dict[Text, Any],
              outputs: Dict[Text, Any]) -> None:
    """Validate statistics against a schema and write the anomalies as pbtxt.

    Args:
      inputs: Labeled input values. This implementation reads:
        - labels.SCHEMA: the schema to validate against.
        - labels.STATS: the feature statistics to validate.
        No other label is read here.
      outputs: Labeled output values. This implementation reads:
        - labels.SCHEMA_DIFF_PATH: directory under which the anomalies file
          (named DEFAULT_FILE_NAME) is written.
    """
    reference_schema = value_utils.GetSoleValue(inputs, labels.SCHEMA)
    feature_stats = value_utils.GetSoleValue(inputs, labels.STATS)
    diff_dir = value_utils.GetSoleValue(outputs, labels.SCHEMA_DIFF_PATH)

    found = tfdv.validate_statistics(feature_stats, reference_schema)

    target_file = os.path.join(diff_dir, DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(target_file, found)
def detect_anomalies(stats_uri: Text, schema_uri: Text, split_name: Text):
    """Validate one split's statistics against a schema and display anomalies.

    Args:
      stats_uri: Location passed to `get_statistics_dataset_dict`; the result
        is indexed by split name.
      schema_uri: Location passed to `get_schema_proto`.
      split_name: Key of the split whose statistics are validated.

    Raises:
      Exception: If `split_name` is not among the loaded statistics.
    """
    schema_proto = get_schema_proto(schema_uri)
    stats_by_split = get_statistics_dataset_dict(stats_uri)

    split_available = split_name in stats_by_split
    if not split_available:
        raise Exception(f'{split_name} split not present!')

    split_stats = stats_by_split[split_name]
    tfdv.display_anomalies(tfdv.validate_statistics(split_stats, schema_proto))
def validate_stats_against_schema(self):
    # type: () -> bool
    """Validate the stored statistics against `self.schema`.

    The statistics are loaded from `self.stats_path`; the validation result
    is kept on `self.anomalies`. When anomalies exist they are logged and
    `self.upload_anomalies()` is called.

    Returns:
      True when no anomalies were found, False otherwise.
    """
    loaded_stats = tfdv.load_statistics(self.stats_path)
    self.anomalies = tfdv.validate_statistics(loaded_stats, self.schema)

    anomaly_items = self.anomalies.anomaly_info.items()
    if len(anomaly_items) == 0:
        logger.info("No anomalies found")
        return True

    logger.error("Anomalies found in training dataset...")
    logger.error(str(anomaly_items))
    self.upload_anomalies()
    return False
def tfdv_skew_validator(feature_name, train_stats, serve_stats, schema,
                        threshold):
    """Check one feature for training/serving skew and display the anomalies.

    Sets the L-infinity threshold of the feature's skew comparator on
    `schema` (the schema is modified in place), then validates the training
    statistics with the serving statistics supplied for comparison.

    Args:
      feature_name: Name of the feature whose skew comparator is configured.
      train_stats: Statistics passed as `statistics`.
      serve_stats: Statistics passed as `serving_statistics`.
      schema: Schema to configure and validate against.
      threshold: Value assigned to `skew_comparator.infinity_norm.threshold`.

    Returns:
      The anomalies object produced by `tfdv.validate_statistics`.
    """
    # Original author's note: this doesn't display skew anomalies as the
    # book shows.
    feature = tfdv.get_feature(schema, feature_name)
    feature.skew_comparator.infinity_norm.threshold = threshold

    result = tfdv.validate_statistics(
        statistics=train_stats,
        schema=schema,
        serving_statistics=serve_stats,
    )
    tfdv.display_anomalies(result)
    return result
def data_validation(data_path):
    """Infer a schema from CSV statistics, validate against it and print anomalies.

    Args:
      data_path: Directory containing 'train.csv'.
    """
    # NOTE(review): both the "train" and the "test" statistics are computed
    # from 'train.csv', so the validation compares the training data with a
    # schema inferred from itself. This looks like a copy-paste slip
    # ('test.csv' was presumably intended) -- confirm before changing.
    train_csv = os.path.join(data_path, 'train.csv')
    train_stats = tfdv.generate_statistics_from_csv(train_csv, delimiter=',')

    test_csv = os.path.join(data_path, 'train.csv')
    test_stats = tfdv.generate_statistics_from_csv(test_csv, delimiter=',')

    inferred_schema = tfdv.infer_schema(train_stats)
    found = tfdv.validate_statistics(statistics=test_stats,
                                     schema=inferred_schema)

    # Print the anomalies proto in text format.
    print(text_format.MessageToString(found))
def tfdv_drift_validator(feature_name, train_stats, previous_stats, schema,
                         threshold):
    """Check one feature for drift and display the anomalies.

    Sets the L-infinity threshold of the feature's drift comparator on
    `schema` (the schema is modified in place), then validates the current
    statistics with the previous statistics supplied for comparison.

    Args:
      feature_name: Name of the feature whose drift comparator is configured.
      train_stats: Statistics passed as `statistics`.
      previous_stats: Statistics passed as `previous_statistics`.
      schema: Schema to configure and validate against.
      threshold: Value assigned to `drift_comparator.infinity_norm.threshold`.

    Returns:
      The anomalies object produced by `tfdv.validate_statistics`.
    """
    # Original author's note: this doesn't display drift anomalies as the
    # book shows.
    feature = tfdv.get_feature(schema, feature_name)
    feature.drift_comparator.infinity_norm.threshold = threshold

    result = tfdv.validate_statistics(
        statistics=train_stats,
        schema=schema,
        previous_statistics=previous_stats,
    )
    tfdv.display_anomalies(result)
    return result
def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    """Detect skew or drift between a control and a treatment CSV stored in GCS.

    Statistics are generated for both data sets, a schema is inferred from
    the control set, the L-infinity threshold of the selected comparator is
    set for every listed feature, and the control statistics are validated
    with the treatment statistics as the comparison data.

    Args:
      mode: "skew" or "drift". Any other value terminates the process while
        the features are being configured.
      gcp_bucket: Bucket name, used to build the "gs://" paths.
      control_set_path: Path of the control CSV inside the bucket.
      treatment_set_path: Path of the treatment CSV inside the bucket.
      feature_list_str: String that evaluates to the list of feature names.
      Linf_value: Threshold assigned to the comparator's infinity norm.

    Returns:
      The anomalies object when anomalies were detected, otherwise None.

    Raises:
      SystemExit: With code 1 when `mode` is unsupported, or when a feature
        has no domain in "skew" mode.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:{}'.format(mode))
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('control_set_path:{}'.format(control_set_path))
    logging.info('treatment_set_path:{}'.format(treatment_set_path))
    logging.info('Linf_value:{}'.format(Linf_value))

    # SECURITY NOTE(review): eval() executes arbitrary code. If
    # `feature_list_str` can come from an untrusted source, switch to
    # ast.literal_eval after confirming that callers only pass list literals.
    feature_list = eval(feature_list_str)

    control_set_df = pd.read_csv(
        "gs://" + gcp_bucket + "/" + control_set_path, sep=',')
    treat_set_df = pd.read_csv(
        "gs://" + gcp_bucket + "/" + treatment_set_path, sep=',')

    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    # Only the control schema is needed; the comparison is driven by the
    # comparators configured on it.
    control_schema = tfdv.infer_schema(control_stats)

    for feature in feature_list:
        if mode == "skew":
            schema_feature = tfdv.get_feature(control_schema, feature)
            # A feature with a domain is treated as categorical; only those
            # are accepted for the skew comparison.
            if not schema_feature.domain:
                logging.critical(
                    "feature: {} is not categorical".format(feature))
                sys.exit(1)
            schema_feature.skew_comparator.infinity_norm.threshold = Linf_value
        elif mode == "drift":
            tfdv.get_feature(
                control_schema, feature
            ).drift_comparator.infinity_norm.threshold = Linf_value
        else:
            logging.critical("mode: {} not supported".format(mode))
            sys.exit(1)

    if mode == "drift":
        # The drift comparator is evaluated against `previous_statistics`;
        # passing the treatment set as `serving_statistics` (as before) meant
        # the configured drift threshold was never checked.
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             previous_statistics=treat_stats)
    else:
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             serving_statistics=treat_stats)

    if anomalies.anomaly_info:
        # Report the mode in the headline (the original formatted the whole
        # anomalies proto into it), then the anomalies themselves.
        logging.info("Data-{} detected:".format(mode))
        logging.info(anomalies)
        return anomalies

    logging.info("No data-{} detected".format(mode))
    return None
def get_num_anomalies(csv_file, schema, environment='TRAINING'):
    """Count the anomalies found when validating a CSV file against a schema.

    Args:
      csv_file: Location of the CSV data; its first line is used as the
        header.
      schema: Schema used both to guide statistics generation (feature types
        are taken from it) and for validation.
      environment: Schema environment passed to `tfdv.validate_statistics`.

    Returns:
      The number of entries in the resulting `anomaly_info`.
    """
    # The previous code built `cols = [...].sort()`. `list.sort()` returns
    # None, so `column_names` was always None and TFDV read the header from
    # the first CSV line. That effective behaviour is kept and made explicit:
    # `column_names` must follow the column order of the file, which sorted
    # schema names would not.
    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    data_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_file,
        column_names=None,
        stats_options=options,
    )

    # Validate the generated statistics against the supplied schema.
    anomalies = tfdv.validate_statistics(statistics=data_stats,
                                         schema=schema,
                                         environment=environment)
    return len(anomalies.anomaly_info)
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validate stored statistics against a schema and write the anomalies.

    Args:
      stats_path: Location of the statistics loaded with
        `tfdv.load_statistics`.
      schema_path: Location of the schema read with
        `my_metadata.read_schema`.
      anomalies_path: File that receives the anomalies in protobuf text
        format.
    """
    reference_schema = my_metadata.read_schema(schema_path)
    loaded_stats = tfdv.load_statistics(stats_path)
    found = tfdv.validate_statistics(loaded_stats, reference_schema)

    # Materialize the anomalies as a text-format proto.
    anomalies_text = text_format.MessageToString(found)
    file_io.write_string_to_file(anomalies_path, anomalies_text)
def test_e2e(self, stats_options, expected_stats_pbtxt,
             expected_inferred_schema_pbtxt, schema_for_validation_pbtxt,
             expected_anomalies_pbtxt, expected_updated_schema_pbtxt):
    """Run stats generation, schema inference, validation and schema update.

    Each stage's result is compared with the corresponding expected
    text-format proto passed in as a parameter.
    """
    tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
        self._input_file, ['tfdv', 'test'])
    stats_file = os.path.join(self._output_dir, 'stats')

    # Stage 1: generate statistics with Beam and write them to `stats_file`.
    with beam.Pipeline() as pipeline:
        records = pipeline | 'TFXIORead' >> tfxio.BeamSource()
        generated = records | 'GenerateStats' >> tfdv.GenerateStatistics(
            stats_options)
        _ = generated | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(
            stats_file)

    actual_stats = tfdv.load_statistics(stats_file)
    expected_stats = text_format.Parse(
        expected_stats_pbtxt, statistics_pb2.DatasetFeatureStatisticsList())
    assert_stats_equal = (
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_stats))
    assert_stats_equal([actual_stats])

    # Stage 2: infer a schema from the generated statistics.
    actual_inferred_schema = tfdv.infer_schema(actual_stats,
                                               infer_feature_shape=True)
    # Drop the legacy field, when the proto has it, before comparing.
    if hasattr(actual_inferred_schema, 'generate_legacy_feature_spec'):
        actual_inferred_schema.ClearField('generate_legacy_feature_spec')
    expected_inferred_schema = text_format.Parse(
        expected_inferred_schema_pbtxt, schema_pb2.Schema())
    self._assert_schema_equal(actual_inferred_schema, expected_inferred_schema)

    # Stage 3: validate the statistics against the provided schema.
    schema_for_validation = text_format.Parse(schema_for_validation_pbtxt,
                                              schema_pb2.Schema())
    actual_anomalies = tfdv.validate_statistics(actual_stats,
                                                schema_for_validation)
    # The 'baseline' field is cleared so it does not take part in the
    # comparison.
    actual_anomalies.ClearField('baseline')
    expected_anomalies = text_format.Parse(expected_anomalies_pbtxt,
                                           anomalies_pb2.Anomalies())
    self.assertEqual(actual_anomalies, expected_anomalies)

    # Stage 4: update the schema from the statistics.
    actual_updated_schema = tfdv.update_schema(schema_for_validation,
                                               actual_stats,
                                               infer_feature_shape=False)
    expected_updated_schema = text_format.Parse(expected_updated_schema_pbtxt,
                                                schema_pb2.Schema())
    self._assert_schema_equal(actual_updated_schema, expected_updated_schema)
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validate stored statistics against a text schema and write the anomalies.

    Progress messages and the detected anomalies are printed to stdout.

    Args:
      stats_path: Location of the statistics loaded with
        `tfdv.load_statistics`.
      schema_path: Location of the schema loaded with
        `tfdv.load_schema_text`.
      anomalies_path: File that receives the anomalies in protobuf text
        format.
    """
    print('Validating schema against the computed statistics.')
    reference_schema = tfdv.load_schema_text(schema_path)
    loaded_stats = tfdv.load_statistics(stats_path)
    found = tfdv.validate_statistics(loaded_stats, reference_schema)

    print('Detected following anomalies:')
    print(text_format.MessageToString(found))

    print('Writing anomalies to anomalies path.')
    serialized = text_format.MessageToString(found)
    file_io.write_string_to_file(anomalies_path, serialized)
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validate stored statistics against a schema and write the anomalies.

    Progress messages and the detected anomalies are printed to stdout.

    Args:
      stats_path: Location of the statistics loaded with
        `tfdv.load_statistics`.
      schema_path: Location of the schema read with `taxi.read_schema`.
      anomalies_path: File that receives the anomalies in protobuf text
        format.
    """
    print('Validating schema against the computed statistics.')
    reference_schema = taxi.read_schema(schema_path)
    loaded_stats = tfdv.load_statistics(stats_path)
    found = tfdv.validate_statistics(loaded_stats, reference_schema)

    print('Detected following anomalies:')
    print(text_format.MessageToString(found))

    print('Writing anomalies to anomalies path.')
    serialized = text_format.MessageToString(found)
    file_io.write_string_to_file(anomalies_path, serialized)
def evaluate(self, model: BaseEstimator, num_repetitions: int,
             *corruptions: DataCorruption):
    """Measure schema anomalies and prediction scores under data corruptions.

    Args:
      model: Model whose `predict_proba` is applied to clean and corrupted
        test data.
      num_repetitions: How many times each corruption is applied to a fresh
        copy of the test data.
      *corruptions: Corruptions to evaluate; each provides `transform`.

    Returns:
      A list with one `SchemaValidationResult` per corruption, holding the
      corruption, the anomalies of every repetition, the baseline score and
      the corrupted scores of every repetition.
    """
    schema = self.schema_from_train_data()

    clean_predictions = model.predict_proba(self._task.test_data)
    baseline_score = self._task.score_on_test_data(clean_predictions)

    results = []
    for corruption in corruptions:
        scores_per_run = []
        anomalies_per_run = []

        for _ in range(num_repetitions):
            # Corrupt a deep copy so the task's test data stays untouched.
            corrupted = corruption.transform(
                self._task.test_data.copy(deep=True))

            # Ask TFDV whether the corrupted data violates the schema.
            corrupted_stats = tfdv.generate_statistics_from_dataframe(
                corrupted)
            validation = tfdv.validate_statistics(statistics=corrupted_stats,
                                                  schema=schema)
            run_anomalies = validation.anomaly_info

            # Score the model on the corrupted data.
            run_score = self._task.score_on_test_data(
                model.predict_proba(corrupted))

            anomalies_per_run.append(run_anomalies)
            scores_per_run.append(run_score)

        results.append(
            SchemaValidationResult(corruption, anomalies_per_run,
                                   baseline_score, scores_per_run))

    return results
def validate_stats_against_schema(self,
                                  environment=None,
                                  previous_statistics=None,
                                  serving_statistics=None,
                                  ):
    # type: (str, DatasetFeatureStatisticsList, DatasetFeatureStatisticsList) -> bool
    """Validate the stored statistics against `self.schema`.

    The statistics are loaded from `self.stats_path`; the validation result
    is kept on `self.anomalies`. When anomalies exist they are logged and
    `self.upload_anomalies()` is called.

    Args:
      environment: Forwarded to `tfdv.validate_statistics`.
      previous_statistics: Forwarded to `tfdv.validate_statistics`.
      serving_statistics: Forwarded to `tfdv.validate_statistics`.

    Returns:
      True when no anomalies were found, False otherwise.
    """
    loaded_stats = tfdv.load_statistics(self.stats_path)
    self.anomalies = tfdv.validate_statistics(
        loaded_stats,
        self.schema,
        environment=environment,
        previous_statistics=previous_statistics,
        serving_statistics=serving_statistics,
    )

    anomaly_items = self.anomalies.anomaly_info.items()
    if len(anomaly_items) == 0:
        logger.info("No anomalies found")
        return True

    logger.error("Anomalies found in training dataset...")
    logger.error(str(anomaly_items))
    self.upload_anomalies()
    return False
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):
    """Decide whether the 'duration' feature drifted between two stats files.

    A schema is inferred from the older statistics, the Jensen-Shannon
    divergence threshold of the 'duration' feature's drift comparator is set
    to 0.01, and the newer statistics are validated with the older ones as
    `previous_statistics`.

    Args:
      stats_older_path: Location of the older statistics, or the literal
        string 'none', in which case ('true',) is returned immediately.
      stats_new_path: Location of the newer statistics.

    Returns:
      A one-element tuple holding 'false' when the first drift measurement is
      below its threshold, and 'true' otherwise.
    """
    # All imports are function-local and kept exactly as in the original.
    import logging
    import time
    import tensorflow_data_validation as tfdv
    import tensorflow_data_validation.statistics.stats_impl

    logging.getLogger().setLevel(logging.INFO)
    logging.info('stats_older_path: %s', stats_older_path)
    logging.info('stats_new_path: %s', stats_new_path)

    # Without older statistics there is nothing to compare against.
    if stats_older_path == 'none':
        return ('true', )

    older_stats = tfdv.load_statistics(stats_older_path)
    newer_stats = tfdv.load_statistics(stats_new_path)

    older_schema = tfdv.infer_schema(statistics=older_stats)
    duration_feature = tfdv.get_feature(older_schema, 'duration')
    duration_feature.drift_comparator.jensen_shannon_divergence.threshold = 0.01

    drift_anomalies = tfdv.validate_statistics(
        statistics=newer_stats,
        schema=older_schema,
        previous_statistics=older_stats)
    logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

    from google.protobuf.json_format import MessageToDict
    as_dict = MessageToDict(drift_anomalies)
    # Only the first measurement of the first drift entry is inspected.
    first_measurement = as_dict['driftSkewInfo'][0]['driftMeasurements'][0]
    val = first_measurement['value']
    thresh = first_measurement['threshold']
    logging.info('value %s and threshold %s', val, thresh)

    res = 'false' if val < thresh else 'true'
    logging.info('train decision: %s', res)
    return (res, )
def Do(self, input_dict, output_dict, exec_properties):
    """Validate the 'eval' split statistics against the schema.

    Args:
      input_dict: Mapping from input key to a list of artifacts. Two keys are
        read: 'schema' (resolved to a single URI) and 'stats' (resolved to
        the URI of the 'eval' split).
      output_dict: Mapping from output key to a list of artifacts. The
        'output' entry is resolved to a single URI; the anomalies pbtxt file
        (named DEFAULT_FILE_NAME) is written underneath it.
      exec_properties: Execution properties; only forwarded to the startup
        log.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    tf.logging.info('Validating schema against the computed statistics.')

    # Resolve and read the schema.
    reader = io_utils.SchemaReader()
    schema_file = io_utils.get_only_uri_in_dir(
        types.get_single_uri(input_dict['schema']))
    schema = reader.read(schema_file)

    # Resolve and load the statistics of the 'eval' split.
    stats_file = io_utils.get_only_uri_in_dir(
        types.get_split_uri(input_dict['stats'], 'eval'))
    stats = tfdv.load_statistics(stats_file)

    output_uri = types.get_single_uri(output_dict['output'])
    found = tfdv.validate_statistics(stats, schema)
    io_utils.write_pbtxt_file(
        os.path.join(output_uri, DEFAULT_FILE_NAME), found)

    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
# %% # Compute stats for evaluation data test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test) # Compare evaluation data with training data tfdv.visualize_statistics(lhs_statistics=test_stats, rhs_statistics=train_stats, lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET') # %% ''' ## evaluate for anomalies related with schema ''' # %% anomalies = tfdv.validate_statistics(statistics=train_stats, schema=schema) tfdv.display_anomalies(anomalies) # %% ## checking the data is compliant with the schema of the train part # %% data_stats = tfdv.generate_statistics_from_dataframe(data) serving_anomalies = tfdv.validate_statistics(data_stats, schema) tfdv.display_anomalies(serving_anomalies) # %% # %%
datefmt='%d-%m-%y %H:%M:%S') logging.info(f"Args: {args}") stats1 = tfdv.load_stats_text(input_path=args.stats_file_1) stats2 = tfdv.load_stats_text(input_path=args.stats_file_2) schema1 = tfdv.infer_schema(statistics=stats1) # Custom rules, tweak this as required. This is just an example tfdv.get_feature( schema1, 'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06 # Calculate drift between the reference stats stats1, and the statistics from new data in stats2 drift_anomalies = tfdv.validate_statistics(statistics=stats2, schema=schema1, previous_statistics=stats1) # Convert the .pb2 to dict drift = MessageToDict(drift_anomalies) value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value'] threshold = drift['driftSkewInfo'][0]['driftMeasurements'][0]['threshold'] logging.info( f"JS divergence value: {value}, and JS divergence threshold: {threshold}" ) drift_detected = True if value < threshold: drift_detected = False logging.info(f"Drift detected: {drift_detected}")
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate):
    """Writes a TFDV-generated schema and optionally validates a second CSV.

    Files written under `output_dir`: 'data_stats.tfrecord',
    'output_schema.pb2', 'schema.pb2', 'output_schema.json', 'schema.json',
    'schema.txt' (holding the path of 'schema.json') and, when a validation
    file is given, 'validation_data_stats.tfrecord' and
    'output_validation_result.txt' (holding 'valid' or 'invalid').

    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
        the first line is treated as the column names.
      key_columns: list of the names for columns that should be treated as
        unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate against the
        schema. When falsy, validation is skipped.

    Returns:
      None when validation is skipped or the data is valid, 0 when anomalies
      were found.
    """
    log = logging.getLogger()
    log.info('running in local mode')
    pipeline_options = None

    log.info('starting stats on tfdv')
    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    log.info('loading output_schema.pb2')
    # SerializeToString() returns bytes, so the file must be opened in binary
    # mode; text mode ('w+') raises TypeError on Python 3.
    with open('{}/output_schema.pb2'.format(output_dir), 'wb') as f:
        f.write(schema.SerializeToString())

    log.info('loading [output_dir] {} schema.pb2'.format(output_dir))
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        log.info('loading schema to {}'.format(f.name))
        f.write(schema.SerializeToString())

    schema_json = convert_schema_proto_to_json(
        schema, column_names, key_columns)

    log.info(' logging output_schema.json')
    with open('{}/output_schema.json'.format(output_dir), 'w+') as f:
        json.dump(schema_json, f)

    schema_json_file = os.path.join(output_dir, 'schema.json')
    with file_io.FileIO(schema_json_file, 'w+') as f:
        log.info('logging JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    # 'schema.txt' stores the location of the JSON schema, not its content.
    with open('{}/schema.txt'.format(output_dir), 'w+') as f:
        log.info('schema.txt to {}'.format(f.name))
        f.write(schema_json_file)

    log.info('Schema Write Done...')

    if not csv_data_file_to_validate:
        log.info('No csv file to validate')
        return

    log.info('Validation Stats...')
    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    log.info('logging output validation results ...')
    with open('{}/output_validation_result.txt'.format(output_dir), 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    # Anomalies were found. Details are not exported by this variant.
    return 0
# COMMAND ---------- # MAGIC %md infer schema # COMMAND ---------- weather_data_schema = tfdv.infer_schema(statistics=stats) tfdv.display_schema(schema=weather_data_schema) # COMMAND ---------- # MAGIC %md check for anomalies # COMMAND ---------- weather_anomalies = tfdv.validate_statistics(statistics=stats, schema=weather_data_schema) tfdv.display_anomalies(weather_anomalies) # COMMAND ---------- # MAGIC %md average monthly temperature # COMMAND ---------- # MAGIC %sql # MAGIC SELECT x.month,AVG(x.avg_temp_f) as avg_temp_f FROM ( # MAGIC SELECT MONTH(time) as month,YEAR(time) as year,SUM(avg_temp_f) as avg_temp_f # MAGIC FROM dscc202_group05_db.weather_delta_bronze # MAGIC GROUP BY MONTH(time), YEAR(time) # MAGIC ) x # MAGIC GROUP BY x.month
rhs_statistics=train_stats, lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET') """When evaluating the data, TFDV overlays the train and eval statistics in the same plots, so the distributions can be checked visually; this can speed up changes to the schema when they are necessary. Looking at the plots, we can consider that the train and eval sets have the same distribution; the only difference in the plots is the quantity of each value in the features. ### Anomalies After generating statistics for both the train and eval datasets, it is necessary to check whether both conform to the same schema (range of values, type of feature and so on). TFDV has a function to look for anomalies, e.g., features that do not match their definition in the schema. """ # Check eval data for errors by validating the eval data stats using the # previously inferred schema. anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema) # Display anomalies tfdv.display_anomalies(anomalies) """Only one feature has anomalies. Looking more closely, we can see that the values themselves are correct; the problem is that they contain a period and a space, so TFDV considered them to be new values. ### Fixing the errors The corrections depend on our knowledge of the data: we look at each feature and decide what action should be taken. In our case it is not necessary to adjust the schema, but to test how to correct the errors we will add the values to the domain and check the anomalies again. """ # Add new values to the domain of feature wage. wage = tfdv.get_domain(schema, 'wage') wage.value.append(' <=50K.') wage.value.append(' >50K.')
training_data = dataset[:1600] print(training_data.describe()) test_set = dataset[1600:] print(test_set.describe()) # Generate training data statistics train_stats = tfdv.generate_statistics_from_dataframe(dataframe=dataset) schema = tfdv.infer_schema(statistics=train_stats) print(tfdv.display_schema(schema)) test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set) # Compare test statistics with the Schema anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema) # Displaying all detected anomalies # Integer larger than 10 # STRING type when expected INT type # FLOAT type when expected INT type # Integer smaller than 0 print(tfdv.display_anomalies(anomalies)) # New data WITH anomalies test_set_copy = test_set.copy() test_set_copy.drop("soot", axis=1, inplace=True) # Statistics based on data with anomalies test_set_copy_stats = tfdv.generate_statistics_from_dataframe( dataframe=test_set_copy) anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float) -> None:
    """Infer a schema from the taxi training CSV and validate the eval CSV.

    Statistics are generated for the training and evaluation CSV files, a
    schema is inferred from the training statistics, detected anomalies are
    logged, the schema is relaxed for two features, and the evaluation
    statistics are validated again. Finally `column_names` and `schema` are
    saved under the "/marshal" directory with `_kale_resource_save`.

    None of the parameters (EPOCHS, STEPS, BATCH_SIZE, HIDDEN_LAYER_SIZE,
    LEARNING_RATE) is read in this function body.
    """
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    # Directory used to exchange objects; created if missing.
    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # NOTE(review): most of the constants below are not read anywhere in this
    # function; they appear to be shared notebook configuration.

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow NaN values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema will scramble the feature order).
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    # Schema is inferred from the training statistics only.
    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)

    # Resolve anomalies by relaxing the schema in place.
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    # The locals() lookups depend on the exact local names `column_names` and
    # `schema`; do not rename them.
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate, project, mode):
    """Writes a TFDV-generated schema and optionally validates a second CSV.

    Besides the files written under `output_dir`, three files are written at
    the filesystem root: '/output_schema.pb2', '/output_schema.json' and
    '/output_validation_result.txt' (holding 'valid' or 'invalid').

    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
        the first line is treated as the column names.
      key_columns: list of the names for columns that should be treated as
        unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate against the
        schema. When falsy, validation is skipped.
      project: the project to run dataflow in.
      mode: whether the job should be `local` or `cloud`.

    Raises:
      ValueError: If `mode` is neither 'local' nor 'cloud'.
    """
    if mode == 'local':
        pipeline_options = None
    elif mode == 'cloud':
        temp_dir = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfdv-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file': './validation/setup.py',
            'project': project,
            'temp_location': temp_dir,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        pipeline_options.view_as(StandardOptions).runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    # SerializeToString() returns bytes, so the file must be opened in binary
    # mode; text mode ('w+') raises TypeError on Python 3.
    with open('/output_schema.pb2', 'wb') as f:
        f.write(schema.SerializeToString())
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('Writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())

    schema_json = convert_schema_proto_to_json(schema, column_names,
                                               key_columns)
    with open('/output_schema.json', 'w+') as f:
        json.dump(schema_json, f)
    with file_io.FileIO(os.path.join(output_dir, 'schema.json'), 'w+') as f:
        logging.getLogger().info('Writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    if not csv_data_file_to_validate:
        return

    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    with open('/output_validation_result.txt', 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            # Nothing more to report when the data is valid.
            f.write('valid')
            return

    # Anomalies were found: persist them and log one line per feature.
    with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
        logging.getLogger().info('Writing anomalies to {}'.format(f.name))
        f.write(anomalies.SerializeToString())
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))
def run(self, task, model, schema, num_corruptions, performance_threshold):
    """Measure how well TFDV schema validation flags harmful corruptions.

    Draws `num_corruptions` random corruptions, applies each one to a deep
    copy of `task.test_data`, validates the corrupted data against `schema`
    with TFDV, and compares the validation verdict with the relative drop of
    the model's score on the corrupted data.

    The drawn corruptions are collected in a set, so fewer than
    `num_corruptions` may be evaluated if some draws compare equal (this
    depends on how the corruption classes implement hashing, which is not
    visible here).

    Args:
        task: object providing `test_data`, `numerical_columns`,
            `categorical_columns`, `text_columns` and
            `score_on_test_data(predictions)`.
        model: object providing `predict_proba(data)`.
        schema: schema handed to `tfdv.validate_statistics`.
        num_corruptions: number of random corruptions to draw.
        performance_threshold: relative score drop
            `(baseline - corrupted) / baseline` above which a corruption
            counts as having a negative impact.

    Returns:
        A pandas DataFrame with one row per evaluated corruption and the
        columns 'corruption', 'status' ('TP', 'FP', 'TN' or 'FN'),
        'anomalies', 'baseline_score' and 'corrupted_score'
        (`None` when scoring the corrupted data failed).

    Raises:
        AssertionError: if `self.has_anomaly` reports an anomaly for the
            clean test data.
    """
    # Make sure the schema works on the clean test data
    assert not self.has_anomaly(self.validate(schema, task.test_data))

    baseline_predictions = model.predict_proba(task.test_data)
    baseline_score = task.score_on_test_data(baseline_predictions)

    random_corruptions = set()
    for _ in range(0, num_corruptions):
        random_corruptions.add(self._draw_random_corruption(task))

    outcome = {
        'corruption': [],
        'status': [],
        'anomalies': [],
        'baseline_score': [],
        'corrupted_score': []
    }

    for corruption in random_corruptions:
        print(corruption)
        test_data_copy = task.test_data.copy(deep=True)
        corrupted_data = corruption.transform(test_data_copy)

        corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
            corrupted_data)
        tfdv_anomalies = tfdv.validate_statistics(
            statistics=corrupted_data_stats, schema=schema)
        schema_anomalies = tfdv_anomalies.anomaly_info

        try:
            corrupted_predictions = model.predict_proba(corrupted_data)
            corrupted_score = task.score_on_test_data(corrupted_predictions)
            performance_drop = (baseline_score -
                                corrupted_score) / baseline_score
            has_negative_impact = performance_drop > performance_threshold
        except Exception:
            # A model that cannot score the corrupted data is treated as
            # negatively impacted. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still stop the experiment.
            corrupted_score = None
            has_negative_impact = True

        has_anomalies = len(schema_anomalies) != 0
        if has_anomalies:
            status = 'TP' if has_negative_impact else 'FP'
        else:
            status = 'FN' if has_negative_impact else 'TN'

        outcome['corruption'].append(str(corruption))
        outcome['status'].append(status)
        outcome['anomalies'].append(str(schema_anomalies))
        outcome['baseline_score'].append(baseline_score)
        outcome['corrupted_score'].append(corrupted_score)

    return pd.DataFrame.from_dict(outcome)

def _draw_random_corruption(self, task):
    """Draw one random corruption for a randomly chosen column of `task`."""
    # Per column type: (columns, non-swap corruption kinds, NA marker used
    # by MissingValues).
    pools = {
        'numerical': (task.numerical_columns,
                      ['missing', 'noise', 'scaling'], np.nan),
        'categorical': (task.categorical_columns,
                        ['missing', 'encoding'], ''),
        'text': (task.text_columns, ['missing', 'encoding'], ''),
    }
    # Corruptions that only need the affected column and the fraction.
    single_column_corruptions = {
        'noise': GaussianNoise,
        'scaling': Scaling,
        'encoding': BrokenCharacters,
    }

    # A column type is picked proportionally to its number of columns.
    num_columns = len(task.numerical_columns + task.categorical_columns +
                      task.text_columns)
    type_names = ['numerical', 'categorical', 'text']
    type_probabilities = [
        float(len(pools[type_name][0])) / num_columns
        for type_name in type_names
    ]
    affected_column_type = str(
        np.random.choice(type_names, 1, p=type_probabilities)[0])
    fraction = float(np.random.randint(100)) / 100

    columns, corruption_types, na_value = pools[affected_column_type]

    # With at least two columns of the type, swap two of them 10% of the time.
    if len(columns) >= 2 and np.random.uniform() < 0.1:
        # NOTE(review): sampling is with replacement, so the same column can
        # be drawn twice and "swapped" with itself — confirm this is intended.
        affected_columns = np.random.choice(columns, 2)
        return SwappedValues(affected_columns[0], affected_columns[1],
                             fraction)

    corruption_type = str(np.random.choice(corruption_types))
    if corruption_type == 'missing':
        missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
        affected_column = np.random.choice(columns)
        return MissingValues(affected_column,
                             fraction,
                             na_value=na_value,
                             missingness=missingness)

    affected_column = np.random.choice(columns)
    return single_column_corruptions[corruption_type](affected_column,
                                                      fraction)
def validate(self, schema, data):
    """Validate a dataframe against a schema with TFDV.

    Generates TFDV statistics for `data` and returns the result of
    validating those statistics against `schema`.
    """
    return tfdv.validate_statistics(
        statistics=tfdv.generate_statistics_from_dataframe(data),
        schema=schema)
rhs_statistics=train_stats, lhs_name='Test_Dataset', rhs_name='Train_Dataset') """When evaluating the data, TFDV overlays the train and test statistics in the same plot, so the distributions can be checked visually, which can speed up changes to the schema when necessary. Looking at the results, we can easily infer that the data from the Liander region can't be used to predict the values for the Stedin region: the median, mean, std and missing values all differ between the datasets. For now we don't make any changes and continue with the TFDV analysis. ### Anomalies After generating stats for both the train and test datasets, it is necessary to check whether the schema defined for each one is the same (range of values, type of feature and so on). TFDV has a function to look for anomalies, e.g. features defined differently in each schema. """ # Check eval data for errors by validating the test data stats using the # previously inferred schema. anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema) # Display anomalies tfdv.display_anomalies(anomalies) """Three features have inconsistent values. Two of them have the wrong type (net_manager, type_conn_perc); purchase_area is a feature expected to have values outside of the train schema, because the test data corresponds to a different region in the Netherlands. ### Fixing the errors The corrections depend on our knowledge of the data: we look at each feature and decide what action should be taken, and the description of each feature helps to decide how to correct the errors. **net_manager** is the code of the regional network manager, **type_conn_perc** is the percentage of presence of the principal type of connection in the zipcode range, and **purchase_area** is the code of the area where the energy is purchased. For type_conn_perc let's convert it to BYTES, and append the missing values to purchase_area. """ # Convert the type_conn_perc to Bytes type_conn_perc = tfdv.get_feature(schema, 'type_conn_perc')
schema = tfdv.infer_schema(statistics=stats_train) tfdv.display_schema(schema=schema) # COMMAND ---------- # Compare evaluation data with training data displayHTML( get_statistics_html(lhs_statistics=stats_serve, rhs_statistics=stats_train, lhs_name='SERVE_DATASET', rhs_name='TRAIN_DATASET')) # COMMAND ---------- anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema) tfdv.display_anomalies(anomalies) # COMMAND ---------- # Add skew and drift comparators temp_f = tfdv.get_feature(schema, 'avg_temp_f') temp_f.skew_comparator.jensen_shannon_divergence.threshold = 0 temp_f.drift_comparator.jensen_shannon_divergence.threshold = 0 precip_mm = tfdv.get_feature(schema, 'tot_precip_mm') precip_mm.skew_comparator.jensen_shannon_divergence.threshold = 0 precip_mm.drift_comparator.jensen_shannon_divergence.threshold = 0 _anomalies = tfdv.validate_statistics(stats_train, schema,
# imports from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext from tfx.components import CsvExampleGen from tfx.proto import example_gen_pb2 import tensorflow as tf import pandas as pd import csv import re # Set the Interaction Context context = InteractiveContext(pipeline_root= _tfx_pipeline) # Ingesting the data from tfx.components import CsvExampleGen from tfx.components import ImportExampleGen from tfx.utils.dsl_utils import external_input from tfx.proto import example_gen_pb2 import tensorflow_data_validation as tfdv train_stats = tfdv.generate_statistics_from_tfrecord(data_location=complaints_tf_train_file) val_stats = tfdv.generate_statistics_from_tfrecord(data_location=complaints_tf_val_file) print(train_stats) schema = tfdv.infer_schema(train_stats) val_anomalies = tfdv.validate_statistics(statistics=val_stats, schema=schema) print(val_anomalies)