def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    """Compare a treatment CSV against a control CSV for skew or drift.

    Both CSV files are read from ``gs://<gcp_bucket>/<path>``, TFDV
    statistics are generated for each, and a schema is inferred from the
    control statistics. For every requested feature an L-infinity threshold
    is set on the comparator matching ``mode`` before validating.

    Args:
        mode: Either ``"skew"`` or ``"drift"``. Any other value terminates
            the process with exit status 1.
        gcp_bucket: Bucket name (without the ``gs://`` prefix).
        control_set_path: Object path of the control (baseline) CSV.
        treatment_set_path: Object path of the treatment CSV.
        feature_list_str: String holding a Python list literal with the
            feature names to check.
        Linf_value: Threshold assigned to the L-infinity norm comparator.

    Returns:
        The TFDV anomalies proto when at least one anomaly was found,
        otherwise ``None``.

    Raises:
        SystemExit: With status 1 when ``mode`` is unsupported, or when
            ``mode`` is ``"skew"`` and a feature has no domain (i.e. it is
            not categorical).
    """
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:{}'.format(mode))
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('control_set_path:{}'.format(control_set_path))
    logging.info('treatment_set_path:{}'.format(treatment_set_path))
    logging.info('Linf_value:{}'.format(Linf_value))

    # Reject an unsupported mode before any data is fetched; previously this
    # was only checked inside the feature loop, so an empty feature list let
    # an invalid mode slip through.
    if mode not in ("skew", "drift"):
        logging.critical("mode: {} not supported".format(mode))
        sys.exit(1)

    # SECURITY NOTE(review): eval() executes arbitrary code contained in
    # feature_list_str. Only pass trusted input here; consider migrating
    # callers to a plain list literal parsed with ast.literal_eval.
    feature_list = eval(feature_list_str)

    control_set_df = pd.read_csv(
        "gs://" + gcp_bucket + "/" + control_set_path, sep=',')
    treat_set_df = pd.read_csv(
        "gs://" + gcp_bucket + "/" + treatment_set_path, sep=',')

    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    control_schema = tfdv.infer_schema(control_stats)

    for feature in feature_list:
        feature_proto = tfdv.get_feature(control_schema, feature)
        if mode == "skew":
            # A populated domain marks the feature as categorical, which is
            # what the skew comparator is applied to here.
            if not feature_proto.domain:
                logging.critical(
                    "feature: {} is not categorical".format(feature))
                sys.exit(1)
            feature_proto.skew_comparator.infinity_norm.threshold = Linf_value
        else:
            feature_proto.drift_comparator.infinity_norm.threshold = Linf_value

    # The skew comparator is evaluated against serving_statistics, the drift
    # comparator against previous_statistics. Passing the treatment set as
    # serving_statistics in drift mode meant drift was never evaluated.
    if mode == "skew":
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             serving_statistics=treat_stats)
    else:
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             previous_statistics=treat_stats)

    if anomalies.anomaly_info:
        logging.info("Data-{} detected: {}".format(mode, anomalies))
        return anomalies

    logging.info("No data-{} detected".format(mode))
    return None
def run_tf_test(self):
    """Run the TFDV registration workflow against ``self.ap``.

    Generates statistics and a schema for ``cal_housing.csv``, registers
    them (JSON-encoded) as a dataset, registers a featureset built from the
    CSV column dtypes, and finally looks the dataset up again by name.
    Progress is reported with ``print``; nothing is returned.
    """
    print("Running tf workflow test!")

    data_dir = "./"
    train_csv = os.path.join(data_dir, 'cal_housing.csv')
    housing_stats = tfdv.generate_statistics_from_csv(train_csv, delimiter=',')
    housing_schema = tfdv.infer_schema(housing_stats)

    stats_json = json_format.MessageToJson(housing_stats)
    schema_json = json_format.MessageToJson(housing_schema)
    frame = pd.read_csv('cal_housing.csv')

    dataset_info = {
        "name": "cal_housing_dataset",
        "description": "data about housing in California",
        "encoded_stats": stats_json,
        "encoded_schema": schema_json,
        "source": "UCI ML Repository",
    }
    registered_dataset = self.ap.register_dataset(dataset_info)

    # Featureset payload: column name -> dtype name, plus a "name" entry.
    featureset = {
        column: str(dtype) for column, dtype in frame.dtypes.to_dict().items()
    }
    featureset["name"] = "wine_no_transformations"
    self.ap.register_featureset(featureset, registered_dataset["_key"])

    # Round-trip: the lookups below fail loudly if either key is missing.
    looked_up = self.ap.lookup_dataset("cal_housing_dataset")
    _stats_back = looked_up["encoded_stats"]
    _schema_back = looked_up["encoded_schema"]

    print("Completed tf workflow test!")
def Load_TFDV(df):
    """Derive one integer type code per column of ``df`` using TFDV.

    Each column is analysed on its own: statistics are generated with
    semantic-domain stats enabled and a schema is inferred from them.

    Codes assigned (later rules override earlier ones):
        0 - default,
        1 - ``get_categorical_features`` reports at least one feature,
        3 - a schema feature's text form mentions ``natural_language_domain``,
        2 - a schema feature's text form mentions ``time_domain``.

    The code of every column is printed as it is computed.

    Args:
        df: DataFrame whose columns are classified.

    Returns:
        List of codes, in column order.
    """
    codes = [0] * len(df.columns)

    for position, column in enumerate(df.columns):
        options = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        column_stats = tfdv.generate_statistics_from_dataframe(
            df[[column]], stats_options=options)
        column_schema = tfdv.infer_schema(statistics=column_stats)

        # Any categorical feature at all marks the column as categorical.
        if any(True for _ in get_categorical_features(column_schema)):
            codes[position] = 1

        for feature in column_schema.feature:
            rendered = str(feature)
            if 'natural_language_domain' in rendered:
                codes[position] = 3
            # Checked last, so a time domain wins over natural language.
            if 'time_domain' in rendered:
                codes[position] = 2

        print(codes[position])

    return codes
def Do(self, input_dict, output_dict, exec_properties):
    """Infer a schema from the 'train' split statistics and write it out.

    Args:
      input_dict: Maps input keys to artifact lists. Only ``'stats'`` is
        read; the URI of its 'train' split is resolved and the single file
        in that directory is loaded as statistics.
      output_dict: Maps output keys to artifact lists. The schema is written
        under the single URI of ``'output'``, as ``_DEFAULT_FILE_NAME``.
      exec_properties: Execution properties. Not read by this method.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_split_dir = types.get_split_uri(input_dict['stats'], 'train')
    stats_file = io_utils.get_only_uri_in_dir(train_split_dir)

    schema_dir = types.get_single_uri(output_dict['output'])
    schema_file = os.path.join(schema_dir, _DEFAULT_FILE_NAME)

    tf.logging.info('Infering schema from statistics.')
    train_stats = tfdv.load_statistics(stats_file)
    # Feature shapes are deliberately not inferred here.
    inferred = tfdv.infer_schema(train_stats, False)

    io_utils.write_pbtxt_file(schema_file, inferred)
    tf.logging.info('Schema written to {}.'.format(schema_file))
def pandas_schema_anomalies(
        df: pd.DataFrame,
        tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Detect schema anomalies between a dataframe and TFDV statistics.

    A TFDV schema is inferred from ``tfdv_statistics`` and a schema is
    extracted from ``df``; the two are then handed to
    ``tfdv_pandas_schemas_anomalies`` and its result is returned as-is.

    Args:
        df {pandas.DataFrame}: dataframe
        tfdv_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: TFDV statistics

    Returns:
        Tuple[bool, Dict]: True if anomalies are detected, otherwise False,
            dictionary with structure:
            {
                <column_name>: {
                    'description': <description>,
                    'severity': 'ERROR',
                    'shortDescription': <short description>,
                    'reason': [{'type': <error_type>,
                                'shortDescription': <short description>,
                                'description': <description>}],
                    'path': {'step': [<column_name>]}
                }
            }
    """
    return tfdv_pandas_schemas_anomalies(
        tfdv.infer_schema(tfdv_statistics),
        get_pandas_df_schema(df),
    )
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'. Only consulted when 'stats' is
          absent.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature.

    Returns:
      None

    Raises:
      KeyError: If neither 'stats' nor 'statistics' is present in
        ``input_dict``, or 'infer_feature_shape' is missing from
        ``exec_properties``.
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    # Honour the documented 'statistics' synonym; previously only 'stats'
    # was read even though the docstring advertised both keys.
    if 'stats' in input_dict:
        stats_artifacts = input_dict['stats']
    else:
        stats_artifacts = input_dict['statistics']

    train_stats_uri = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri(stats_artifacts, 'train'))
    output_uri = os.path.join(
        artifact_utils.get_single_uri(output_dict['output']),
        _DEFAULT_FILE_NAME)

    infer_feature_shape = exec_properties['infer_feature_shape']
    absl.logging.info('Infering schema from statistics.')
    schema = tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                               infer_feature_shape)
    io_utils.write_pbtxt_file(output_uri, schema)
    absl.logging.info('Schema written to %s.', output_uri)
def tfdv_statistics_anomalies(
        source_statistics: DatasetFeatureStatisticsList,
        input_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Validate input TFDV statistics against a schema inferred from source
    statistics and return the anomalies found.

    Args:
        source_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: source statistics
        input_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: input statistics

    Returns:
        Tuple[bool, Dict]: True if anomalies are detected, otherwise False,
            dictionary with structure:
            {
                <column_name>: {
                    'description': <description>,
                    'severity': 'ERROR',
                    'shortDescription': <short description>,
                    'reason': [{'type': <error_type>,
                                'shortDescription': <short description>,
                                'description': <description>}],
                    'path': {'step': [<column_name>]}
                }
            }
    """
    source_schema = tfdv.infer_schema(source_statistics)
    validation_result = tfdv.validate_statistics(
        statistics=input_statistics, schema=source_schema)

    # The 'anomalyInfo' key is absent when nothing was flagged.
    anomaly_info = tfdv_object_to_dict(validation_result).get('anomalyInfo', {})
    has_anomalies = len(anomaly_info) > 0
    return has_anomalies, anomaly_info
def _provide_schema(self, input_dict, exec_properties) -> schema_pb2.Schema:
    """Return a schema, either read from an artifact or inferred from stats.

    Exactly one of a schema artifact (``'schema'``) or a statistics artifact
    (``'stats'`` / ``'statistics'``) must be supplied in ``input_dict``.

    Args:
      input_dict: Input artifacts keyed by name.
      exec_properties: Execution properties; ``'infer_feature_shape'`` is
        read when the schema is inferred from statistics.

    Returns:
      The schema proto that was read or inferred.

    Raises:
      ValueError: If both or neither of schema and stats are given, or if
        the provided schema file cannot be found.
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    stats = input_dict.get('stats') or input_dict.get('statistics')
    schema = input_dict.get('schema')

    if bool(stats) == bool(schema):
        raise ValueError(
            'Exactly only one of schema or stats must be provided')

    if not schema:
        # No schema given: infer one from the 'train' split statistics.
        train_split_dir = artifact_utils.get_split_uri(stats, 'train')
        train_stats_uri = io_utils.get_only_uri_in_dir(train_split_dir)
        infer_feature_shape = exec_properties['infer_feature_shape']
        train_stats = tfdv.load_statistics(train_stats_uri)
        return tfdv.infer_schema(train_stats, infer_feature_shape)

    schema_uri = artifact_utils.get_single_uri(schema)
    absl.logging.info('Schema is provided. Reading from %s.' % schema_uri)
    reader = io_utils.SchemaReader()
    schema_file = os.path.join(schema_uri, _DEFAULT_FILE_NAME)
    try:
        return reader.read(schema_file)
    except tf.errors.NotFoundError:
        raise ValueError(
            'Schema is provided, but failed to read from %s.' % schema_uri)
def test_e2e(self, stats_options, expected_stats_pbtxt,
             expected_schema_pbtxt):
    """Generate stats from sequence examples, then infer and check a schema.

    Runs a Beam pipeline that reads ``self._input_file`` through TFXIO,
    computes statistics with ``stats_options`` and writes them to a TFRecord
    under ``self._output_dir``. The written statistics and the schema
    inferred from them are compared with the expected text protos.
    """
    tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
        self._input_file, ['tfdv', 'test'])
    stats_file = os.path.join(self._output_dir, 'stats')

    with beam.Pipeline() as p:
        records = p | 'TFXIORead' >> tfxio.BeamSource()
        stats = records | 'GenerateStats' >> tfdv.GenerateStatistics(
            stats_options)
        _ = stats | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file)

    actual_stats = tfdv.load_statistics(stats_file)
    expected_stats = text_format.Parse(
        expected_stats_pbtxt, statistics_pb2.DatasetFeatureStatisticsList())
    assert_stats_equal = (
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_stats))
    assert_stats_equal([actual_stats])

    actual_schema = tfdv.infer_schema(actual_stats, infer_feature_shape=True)
    # The legacy flag is cleared (when the field exists) before comparing.
    if hasattr(actual_schema, 'generate_legacy_feature_spec'):
        actual_schema.ClearField('generate_legacy_feature_spec')

    expected_schema = text_format.Parse(expected_schema_pbtxt,
                                        schema_pb2.Schema())
    self._assert_schema_equal(actual_schema, expected_schema)
def tfrecord_statis_generator(file_path):
    """
    Compute TFDV statistics for a TFRecord dataset and infer its schema.

    The inferred schema is displayed via ``tfdv.display_schema`` before
    returning.

    Args:
        file_path: Location of the TFRecord data.

    Returns:
        Tuple ``(statistics, schema)``.
    """
    stats = tfdv.generate_statistics_from_tfrecord(data_location=file_path)
    schema = tfdv.infer_schema(stats)
    tfdv.display_schema(schema)
    return stats, schema
def infer_schema_from_csv(csv_file, column_names):
    """Infer a TFDV schema from the statistics of a CSV file.

    Args:
        csv_file: Location of the CSV data.
        column_names: Column names passed through to
            ``tfdv.generate_statistics_from_csv``.

    Returns:
        The schema inferred from the generated statistics.
    """
    csv_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_file,
        column_names=column_names,
    )
    return tfdv.infer_schema(statistics=csv_stats)
def csv_statistics_generator(file_path):
    """
    Compute TFDV statistics for a comma-delimited CSV and infer its schema.

    The inferred schema is displayed via ``tfdv.display_schema`` before
    returning.

    Args:
        file_path: Location of the CSV data.

    Returns:
        Tuple ``(statistics, schema)``.
    """
    stats = tfdv.generate_statistics_from_csv(data_location=file_path,
                                              delimiter=',')
    schema = tfdv.infer_schema(stats)
    tfdv.display_schema(schema)
    return stats, schema
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    Builds a single schema from the precomputed statistics of every split
    that is not excluded: the first processed split seeds the schema via
    ``tfdv.infer_schema`` and each later split refines it via
    ``tfdv.update_schema``.

    Args:
      input_dict: Input dict from input key to a list of artifacts. The
        single statistics artifact under ``STATISTICS_KEY`` is read.
      output_dict: Output dict from key to a list of artifacts. The schema
        is written under the single URI of ``SCHEMA_KEY``.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature.
        - exclude_splits: JSON-encoded list of split names that are skipped
          when generating the schema.

    Returns:
      None

    Raises:
      ValueError: If the decoded ``exclude_splits`` is not a list.
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # Load and deserialize exclude splits from execution properties.
    raw_exclude_splits = exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')
    exclude_splits = json_utils.loads(raw_exclude_splits) or []
    if not isinstance(exclude_splits, list):
        raise ValueError('exclude_splits in execution properties needs to be a '
                         'list. Got %s instead.' % type(exclude_splits))

    # Only one schema is generated for all splits.
    schema = None
    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    split_names = artifact_utils.decode_split_names(stats_artifact.split_names)
    for split in split_names:
        if split in exclude_splits:
            continue

        logging.info('Processing schema from statistics for split %s.', split)
        split_dir = os.path.join(stats_artifact.uri, split)
        stats_uri = io_utils.get_only_uri_in_dir(split_dir)
        split_stats = tfdv.load_statistics(stats_uri)
        if schema:
            schema = tfdv.update_schema(schema, split_stats,
                                        infer_feature_shape)
        else:
            schema = tfdv.infer_schema(split_stats, infer_feature_shape)

    schema_dir = artifact_utils.get_single_uri(output_dict[SCHEMA_KEY])
    output_uri = os.path.join(schema_dir, _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
def _infer_schema(output_dir, stats):
    """Infer a schema from ``stats`` and persist it in serialized form.

    The serialized schema is written to ``<output_dir>/inferred_schema.pb2``
    and that path is itself recorded in
    ``/tmp/inferred_schema_output_path.txt``.

    Args:
        output_dir: Directory receiving the serialized schema.
        stats: Statistics the schema is inferred from.

    Returns:
        The inferred schema.
    """
    schema = tfdv.infer_schema(stats)
    schema_path = os.path.join(output_dir, 'inferred_schema.pb2')

    file_io.write_string_to_file(schema_path, schema.SerializeToString())
    # Record where the schema went in a fixed, well-known location.
    file_io.write_string_to_file('/tmp/inferred_schema_output_path.txt',
                                 schema_path)
    return schema
def write_stats_and_schema(self, pipeline_args):
    # type: (List[str]) -> None
    """Write statistics, infer a schema if none is set, then upload it.

    When ``self.schema`` is falsy a new schema is inferred from the freshly
    written statistics (without feature shapes) and stored on the instance;
    a warning is logged in that case.
    """
    stats = self.write_stats(pipeline_args)

    schema_missing = not self.schema
    if schema_missing:
        logger.warning(
            "Inferring a new schema for this dataset. If you want to use an existing schema, "
            "provide a value for schema_path in the constructor."
        )
        self.schema = tfdv.infer_schema(stats, infer_feature_shape=False)

    self.upload_schema()
def data_validation(data_path, train_filename='train.csv',
                    test_filename='test.csv'):
    """Validate test data against a schema inferred from training data.

    Statistics are generated for both CSV files, a schema is inferred from
    the training statistics, and the anomalies found in the test statistics
    are printed in protobuf text format.

    Previously the "test" statistics were also computed from ``train.csv``,
    so the training data was validated against its own schema. Pass
    ``test_filename='train.csv'`` to reproduce that behaviour.

    Args:
        data_path: Directory containing the CSV files.
        train_filename: Name of the training CSV inside ``data_path``.
        test_filename: Name of the test CSV inside ``data_path``.
    """
    train_stats = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, train_filename), delimiter=',')
    test_stats = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, test_filename), delimiter=',')

    schema = tfdv.infer_schema(train_stats)
    anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
    print(text_format.MessageToString(anomalies))
def parse_schema_from_stats(cls, stats_path):
    # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
    """
    Load statistics, infer a schema from them and derive a feature spec.

    :param stats_path: tf.metadata DatasetFeatureStatisticsList path
    :return: tuple of (TensorFlow feature spec, inferred tf.metadata Schema)
    """
    # Imported lazily so the dependency is only needed when this is called.
    import tensorflow_data_validation as tfdv

    inferred_schema = tfdv.infer_schema(tfdv.load_statistics(stats_path))
    feature_spec = schema_to_feature_spec(inferred_schema)
    return feature_spec, inferred_schema
def infer_schema(stats_path, schema_path):
    """Infer a schema from stored statistics and write it as text.

    The schema is inferred without feature shapes, printed in protobuf text
    format, and written to ``schema_path`` in that same format.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location where the inferred schema is materialized.
    """
    print('Infering schema from statistics.')
    loaded_stats = tfdv.load_statistics(stats_path)
    inferred = tfdv.infer_schema(loaded_stats, infer_feature_shape=False)

    schema_text = text_format.MessageToString(inferred)
    print(schema_text)

    print('Writing schema to output path.')
    file_io.write_string_to_file(schema_path, schema_text)
def infer_schema(stats_path, schema_path):
    """Infer a schema from stored statistics and write it as text.

    The schema is inferred without feature shapes, printed in protobuf text
    format, and written to ``schema_path`` in that same format.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location where the inferred schema is materialized.
    """
    print('Infering schema from statistics.')
    loaded_stats = tfdv.load_statistics(stats_path)
    inferred = tfdv.infer_schema(loaded_stats, infer_feature_shape=False)

    schema_text = text_format.MessageToString(inferred)
    print(schema_text)

    print('Writing schema to output path.')
    file_io.write_string_to_file(schema_path, schema_text)
def display_stats_for_examples(self, examples_id):
    """Displays stats and the inferred schema for `examples_id`.

    Looks up the statistics artifact derived from the examples artifact.
    When one exists, its statistics are visualized and the schema inferred
    from them is displayed; otherwise nothing happens.

    Args:
      examples_id: A `int` indicating the id of a `TFXArtifactTypes.EXAMPLES`
        artifact.
    """
    stats_artifact = self.get_dest_artifact_of_type(
        examples_id, TFXArtifactTypes.EXAMPLE_STATS)
    if not stats_artifact:
        return

    # Load once and reuse: the same file used to be read and parsed twice.
    stats = tfdv.load_statistics(
        os.path.join(stats_artifact.uri, 'stats_tfrecord'))
    tfdv.visualize_statistics(stats)

    print("display shema")
    tfdv.display_schema(tfdv.infer_schema(statistics=stats))
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):
    """Decide whether the 'duration' feature drifted between two stats files.

    A schema is inferred from the older statistics, a Jensen-Shannon
    divergence threshold of 0.01 is set on the ``duration`` feature's drift
    comparator, and the newer statistics are validated against it with the
    older statistics as the previous span.

    Args:
        stats_older_path: Path of the older statistics, or the literal string
            ``'none'`` when there are none.
        stats_new_path: Path of the newer statistics.

    Returns:
        A one-element tuple holding ``'true'`` or ``'false'``. ``'true'`` is
        returned when ``stats_older_path`` is ``'none'`` or when the measured
        drift value is not below the threshold.
    """
    import logging

    logging.getLogger().setLevel(logging.INFO)
    logging.info('stats_older_path: %s', stats_older_path)
    logging.info('stats_new_path: %s', stats_new_path)

    # Nothing to compare against: answer before paying for the TFDV import.
    if stats_older_path == 'none':
        return ('true', )

    import tensorflow_data_validation as tfdv
    # NOTE(review): not referenced below; presumably imported for its side
    # effects - confirm before removing.
    import tensorflow_data_validation.statistics.stats_impl  # noqa: F401
    from google.protobuf.json_format import MessageToDict

    stats1 = tfdv.load_statistics(stats_older_path)
    stats2 = tfdv.load_statistics(stats_new_path)

    schema1 = tfdv.infer_schema(statistics=stats1)
    tfdv.get_feature(
        schema1, 'duration'
    ).drift_comparator.jensen_shannon_divergence.threshold = 0.01
    drift_anomalies = tfdv.validate_statistics(
        statistics=stats2, schema=schema1, previous_statistics=stats1)
    logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

    # Only the first measurement of the first drift entry is inspected.
    measurement = MessageToDict(
        drift_anomalies)['driftSkewInfo'][0]['driftMeasurements'][0]
    val = measurement['value']
    thresh = measurement['threshold']
    logging.info('value %s and threshold %s', val, thresh)

    res = 'false' if val < thresh else 'true'
    logging.info('train decision: %s', res)
    return (res, )
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    """Pipeline step: validate the taxi eval data against the train schema.

    Infers a TFDV schema from the training CSV, logs the anomalies found in
    the evaluation CSV, relaxes the schema (``company`` minimum domain mass,
    extra ``payment_type`` value), re-validates, and finally saves
    ``column_names`` and ``schema`` to the Kale marshal directory.

    The hyper-parameter arguments are not used by this step.
    """
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import logging

    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the
    # dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by
    # tf.transform.
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are
    # hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = [
        'pickup_census_tract', 'dropoff_census_tract', 'payment_type',
        'company', 'pickup_community_area', 'dropoff_community_area']

    # Allow nan values in these features.
    OPTIONAL_FEATURES = [
        'dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract',
        'dropoff_census_tract', 'company', 'trip_seconds',
        'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as
    # unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema does not preserve the feature order).
    with open(TRAIN_DATA) as train_file:
        column_names = train_file.readline().strip().split(',')

    train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(train_stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    first_pass = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies.
    root_logger_error = logging.getLogger
    for feature_name, anomaly_info in first_pass.anomaly_info.items():
        root_logger_error().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # Show inferred schema.
    tfdv.display_schema(schema=schema)

    # Resolve anomalies: tolerate up to 10% of `company` values outside the
    # domain.
    company_feature = tfdv.get_feature(schema, 'company')
    company_feature.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    tfdv.get_domain(schema, 'payment_type').value.append('Prcard')

    # Validate eval stats after updating the schema.
    second_pass = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(second_pass)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(
            column_names, os.path.join(_kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")

    if "schema" in locals():
        _kale_resource_save(
            schema, os.path.join(_kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
# %% # %% ''' ## visualize statistics of train data ''' # %% tfdv.visualize_statistics(train_stats) # %% # %% schema = tfdv.infer_schema(statistics=train_stats) # %% # %% tfdv.display_schema(schema=schema) # %% ''' ## check that training data complies with the schema inferred from train data ! ''' # %% # Compute stats for evaluation data test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)
type=str, required=False, default='./val_stats/stats.txt', help='Path to the validation stats .txt file ') args = parser.parse_args() logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') logging.info(f"Args: {args}") stats1 = tfdv.load_stats_text(input_path=args.stats_file_1) stats2 = tfdv.load_stats_text(input_path=args.stats_file_2) schema1 = tfdv.infer_schema(statistics=stats1) # Custom rules, tweak this as required. This is just an example tfdv.get_feature( schema1, 'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06 # Calculate drift between the reference stats stats1, and the statistics from new data in stats2 drift_anomalies = tfdv.validate_statistics(statistics=stats2, schema=schema1, previous_statistics=stats1) # Convert the .pb2 to dict drift = MessageToDict(drift_anomalies) value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value']
def _gen_tfdv_schema(stats, args):
    """Infer a TFDV schema from ``stats``, save it, and return it.

    Args:
        stats: Statistics the schema is inferred from.
        args: Passed through unchanged to ``_save_tfdv_schema``.

    Returns:
        The inferred schema.
    """
    inferred = tfdv.infer_schema(stats)
    _save_tfdv_schema(inferred, args)
    return inferred
data_folder = Path("../dataset") # below paths should be realtive to data_folder users_file_glob = "AllUsers.csv" ads_file_glob = "AllAds.csv" users_ads_ratings = "users-ads-without-gcp-ratings.csv" users_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_file_glob}").as_posix()) tfdv.visualize_statistics(users_stats) user_schema = tfdv.infer_schema(statistics=users_stats) tfdv.display_schema(schema=user_schema) ads_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{ads_file_glob}").as_posix()) tfdv.visualize_statistics(ads_stats) ads_schema = tfdv.infer_schema(statistics=ads_stats) tfdv.display_schema(schema=ads_schema) users_ratings_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_ads_ratings}").as_posix())
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate, project, mode):
    """Writes a TFDV-generated schema and optionally validates a second CSV.

    The schema inferred from ``csv_data_file`` is written both in serialized
    proto form and as JSON, to fixed files under ``/`` and to ``output_dir``.
    When ``csv_data_file_to_validate`` is given, it is validated against the
    schema: ``/output_validation_result.txt`` receives ``valid`` or
    ``invalid`` and, when anomalies exist, they are serialized to
    ``<output_dir>/anomalies.pb2`` and logged as errors.

    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
        the first line is treated as the column names.
      key_columns: list of the names for columns that should be treated as
        unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate against the
        schema.
      project: the project to run dataflow in.
      mode: whether the job should be `local` or `cloud`.

    Raises:
      ValueError: If ``mode`` is neither `local` nor `cloud`.
    """
    if mode == 'local':
        pipeline_options = None
    elif mode == 'cloud':
        temp_dir = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfdv-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file': './validation/setup.py',
            'project': project,
            'temp_location': temp_dir,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        pipeline_options.view_as(StandardOptions).runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    serialized_schema = schema.SerializeToString()
    # SerializeToString() yields bytes, so the local file must be opened in
    # binary mode; text mode raises TypeError on Python 3.
    with open('/output_schema.pb2', 'wb') as f:
        f.write(serialized_schema)
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('Writing schema to {}'.format(f.name))
        f.write(serialized_schema)

    schema_json = convert_schema_proto_to_json(
        schema, column_names, key_columns)
    with open('/output_schema.json', 'w+') as f:
        json.dump(schema_json, f)
    with file_io.FileIO(os.path.join(output_dir, 'schema.json'), 'w+') as f:
        logging.getLogger().info('Writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    if not csv_data_file_to_validate:
        return

    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    is_valid = len(anomalies.anomaly_info) == 0
    with open('/output_validation_result.txt', 'w+') as f:
        f.write('valid' if is_valid else 'invalid')
    if is_valid:
        # Nothing further to report when no anomalies were found.
        return

    with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
        logging.getLogger().info('Writing anomalies to {}'.format(f.name))
        f.write(anomalies.SerializeToString())
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))
def schema_from_train_data(self):
    """Infer a TFDV schema from the task's training dataframe.

    Returns:
        The schema inferred from statistics generated over
        ``self._task.train_data``.
    """
    stats = tfdv.generate_statistics_from_dataframe(self._task.train_data)
    return tfdv.infer_schema(statistics=stats)
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    Builds a single schema from the precomputed statistics of every split
    that is not excluded: the first processed split seeds the schema via
    ``tfdv.infer_schema`` and each later split refines it via
    ``tfdv.update_schema``.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'statistics': A list of 'ExampleStatistics' type; the single
          instance in it is read.
      output_dict: Output dict from key to a list of artifacts, including:
        - schema: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature. Treated as True when absent.
        - exclude_splits: JSON-encoded list of split names that are skipped
          when generating the schema.

    Returns:
      None

    Raises:
      ValueError: If the decoded ``exclude_splits`` is not a list.
    """
    shape_setting = exec_properties.get(
        standard_component_specs.INFER_FEATURE_SHAPE_KEY, True)
    infer_feature_shape = bool(shape_setting)

    # Load and deserialize exclude splits from execution properties.
    raw_exclude_splits = exec_properties.get(
        standard_component_specs.EXCLUDE_SPLITS_KEY, 'null')
    exclude_splits = json_utils.loads(raw_exclude_splits) or []
    if not isinstance(exclude_splits, list):
        raise ValueError(
            'exclude_splits in execution properties needs to be a '
            'list. Got %s instead.' % type(exclude_splits))

    # Only one schema is generated for all splits.
    schema = None
    stats_artifact = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.STATISTICS_KEY])
    split_names = artifact_utils.decode_split_names(stats_artifact.split_names)
    for split in split_names:
        if split in exclude_splits:
            continue

        logging.info('Processing schema from statistics for split %s.', split)
        split_dir = artifact_utils.get_split_uri([stats_artifact], split)
        stats_uri = io_utils.get_only_uri_in_dir(split_dir)

        # Artifacts older than the stats-update version use the TFRecord
        # loader; newer ones use the binary loader.
        uses_legacy_format = artifact_utils.is_artifact_version_older_than(
            stats_artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE)  # pylint: disable=protected-access
        if uses_legacy_format:
            stats = tfdv.load_statistics(stats_uri)
        else:
            stats = tfdv.load_stats_binary(stats_uri)

        if schema:
            schema = tfdv.update_schema(schema, stats, infer_feature_shape)
        else:
            schema = tfdv.infer_schema(stats, infer_feature_shape)

    schema_dir = artifact_utils.get_single_uri(
        output_dict[standard_component_specs.SCHEMA_KEY])
    output_uri = os.path.join(schema_dir, DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
# MAGIC - check for data drift and skew between training and serving # COMMAND ---------- from sklearn.model_selection import train_test_split import tensorflow_data_validation as tfdv from tensorflow_data_validation.utils.display_util import get_statistics_html import warnings warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning) stats_train = tfdv.generate_statistics_from_dataframe( dataframe=train_df.toPandas()) stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas()) schema = tfdv.infer_schema(statistics=stats_train) tfdv.display_schema(schema=schema) # COMMAND ---------- # Compare evaluation data with training data displayHTML( get_statistics_html(lhs_statistics=stats_serve, rhs_statistics=stats_train, lhs_name='SERVE_DATASET', rhs_name='TRAIN_DATASET')) # COMMAND ---------- anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema) tfdv.display_anomalies(anomalies)
def main(argv=None):
    """Generate a schema from all data and run the transform pipeline.

    Steps:
      1. Parse arguments (``sys.argv`` when ``argv`` is None).
      2. Build Beam pipeline options for the selected runner.
      3. Remove any existing transform directory, infer a fresh schema from
         ``args.all_data`` and write it there in text format.
      4. Run the pipeline: prediction data is always transformed; training
         and evaluation data are transformed too unless ``args.mode`` is
         ``'predict'``.
    """
    args = parse_arguments(sys.argv if argv is None else argv)
    schema_file = os.path.join(args.transform_dir, _SCHEMA_FILE)

    if args.runner != 'DataflowRunner':
        pipeline_options = beam.pipeline.PipelineOptions(None)
    else:
        # NOTE(review): this schema is replaced by the inferred one below.
        schema = utils.read_schema(schema_file)
        setup_file = os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'setup.py'))
        dataflow_options = {
            'job_name': args.job_name,
            'project': args.project,
            'service_account_email': '',
            'setup_file': setup_file,
            'temp_location': os.path.join(args.output_dir, 'tmp'),
        }
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **dataflow_options)

    if os.path.exists(args.transform_dir):
        logger.info('Removing existing directory %s', args.transform_dir)
        shutil.rmtree(args.transform_dir)

    stats = tfdv.generate_statistics_from_csv(data_location=args.all_data)
    schema = tfdv.infer_schema(statistics=stats, infer_feature_shape=False)

    if not file_io.file_exists(args.transform_dir):
        file_io.recursive_create_dir(args.transform_dir)
    with file_io.FileIO(schema_file, 'w') as f:
        f.write(text_format.MessageToString(schema))
    logger.info('Generated %s', schema_file)
    logger.info('Running pipeline on %s environment', args.runner)

    with beam.Pipeline(args.runner, options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=os.path.join(args.output_dir, 'tmp')):
            # `mode=predict` should be used during inference time
            predict_only = args.mode == 'predict'
            if predict_only:
                logger.info('Transforming only prediction data.')
            else:
                logger.info(
                    'Transforming both training, evaluation and predict data.')

            # Prediction data is transformed in both modes.
            transform_predict(
                pipeline=pipeline,
                predict_data=args.predict_data,
                data_source=args.data_source,
                output_dir=args.output_dir,
                schema=schema)

            if not predict_only:
                transform_train_and_eval(
                    pipeline=pipeline,
                    train_data=args.train_data,
                    eval_data=args.eval_data,
                    data_source=args.data_source,
                    transform_dir=args.transform_dir,
                    output_dir=args.output_dir,
                    schema=schema)