from typing import NamedTuple


def generate_tfdv_stats(
        input_data: str,
        output_path: str,
        job_name: str,
        use_dataflow: str,
        project_id: str,
        region: str,
        gcs_temp_location: str,
        gcs_staging_location: str,
        whl_location: str = '',
        requirements_file: str = 'requirements.txt'
) -> NamedTuple('Outputs', [('stats_path', str)]):

    import logging
    import time

    import tensorflow_data_validation as tfdv
    import tensorflow_data_validation.statistics.stats_impl
    from apache_beam.options.pipeline_options import (
        PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions)

    # Build the TFDV wheel for the Dataflow workers with:
    #   pip download tensorflow_data_validation --no-deps --platform manylinux2010_x86_64 --only-binary=:all:
    # CHANGE whl_location if your download resulted in a different filename.

    logging.getLogger().setLevel(logging.INFO)
    logging.info("output path: %s", output_path)
    logging.info("Building pipeline options")

    # Create and set the PipelineOptions.
    options = PipelineOptions()

    if use_dataflow == 'true':
        logging.info("using Dataflow")
        if not whl_location:
            logging.warning('tfdv whl file required with dataflow runner.')
            exit(1)
        # For Cloud execution, set the Cloud Platform project, job_name,
        # staging location, temp_location and specify DataflowRunner.
        google_cloud_options = options.view_as(GoogleCloudOptions)
        google_cloud_options.project = project_id
        google_cloud_options.job_name = '{}-{}'.format(job_name, str(int(time.time())))
        google_cloud_options.staging_location = gcs_staging_location
        google_cloud_options.temp_location = gcs_temp_location
        google_cloud_options.region = region
        options.view_as(StandardOptions).runner = 'DataflowRunner'

        setup_options = options.view_as(SetupOptions)
        # whl_location should point to the downloaded tfdv wheel file.
        setup_options.extra_packages = [whl_location]
        setup_options.requirements_file = requirements_file

    tfdv.generate_statistics_from_csv(
        data_location=input_data,
        output_path=output_path,
        pipeline_options=options)

    return (output_path,)
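# A minimal local invocation sketch for the component above. The paths and
# project values are placeholders, not taken from the original source; with
# use_dataflow='false' the statistics run on the local DirectRunner and no
# wheel file is needed.
if __name__ == '__main__':
    stats_path, = generate_tfdv_stats(
        input_data='gs://my-bucket/data/train.csv',       # hypothetical path
        output_path='gs://my-bucket/tfdv/train_stats',    # hypothetical path
        job_name='tfdv-stats',
        use_dataflow='false',
        project_id='my-project',
        region='us-central1',
        gcs_temp_location='gs://my-bucket/tmp',
        gcs_staging_location='gs://my-bucket/staging')
    print('Statistics written to:', stats_path)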
def data_validation(data_path):
    train = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'train.csv'), delimiter=',')
    test = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'test.csv'), delimiter=',')

    schema = tfdv.infer_schema(train)
    # print(schema)
    # tfdv.display_schema(schema)

    anomalies = tfdv.validate_statistics(statistics=test, schema=schema)
    # print(anomalies)
    # tfdv.display_anomalies(anomalies)
    print(text_format.MessageToString(anomalies))
def run_tf_test(self):
    print("Running tf workflow test!")
    DATA_DIR = "./"
    TRAIN_DATA = os.path.join(DATA_DIR, 'cal_housing.csv')

    train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATA, delimiter=',')
    schema = tfdv.infer_schema(train_stats)
    enc_stats = json_format.MessageToJson(train_stats)
    enc_schema = json_format.MessageToJson(schema)

    data = pd.read_csv('cal_housing.csv')
    ds_info = {"name": "cal_housing_dataset",
               "description": "data about housing in California",
               "encoded_stats": enc_stats,
               "encoded_schema": enc_schema,
               "source": "UCI ML Repository"}
    ds_reg = self.ap.register_dataset(ds_info)

    featureset = data.dtypes.to_dict()
    featureset = {k: str(featureset[k]) for k in featureset}
    featureset["name"] = "wine_no_transformations"
    fs_reg = self.ap.register_featureset(featureset, ds_reg["_key"])

    dataset = self.ap.lookup_dataset("cal_housing_dataset")
    retrieved_stats = dataset["encoded_stats"]
    retrieved_schema = dataset["encoded_schema"]
    # print("Retrieved stats: " + str(retrieved_stats))
    print("Completed tf workflow test!")
    return
def infer_schema_from_csv(csv_file, column_names):
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=column_names)
    # tfdv.visualize_statistics(data_stats)
    schema = tfdv.infer_schema(statistics=data_stats)
    return schema
def csv_statistics_generator(file_path):
    """Generate statistics and an inferred schema for a CSV dataset."""
    csv_stats = tfdv.generate_statistics_from_csv(data_location=file_path,
                                                  delimiter=',')
    csv_schema = tfdv.infer_schema(csv_stats)
    tfdv.display_schema(csv_schema)
    return csv_stats, csv_schema
def compute_stats(input_handle, stats_path, pipeline_args=None):
    """Computes statistics on the input data.

    Args:
      input_handle: Path to csv file with input data.
      stats_path: Directory in which stats are materialized.
      pipeline_args: Additional flags forwarded to the Beam PipelineOptions.
    """
    train_stats = tfdv.generate_statistics_from_csv(
        input_handle,
        delimiter=',',
        output_path=stats_path,
        pipeline_options=PipelineOptions(flags=pipeline_args))
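# A hedged usage sketch for compute_stats above: the file names are illustrative,
# and the Beam flags simply select the local DirectRunner.
compute_stats(
    input_handle='data/train.csv',             # hypothetical CSV path
    stats_path='output/train_stats.tfrecord',  # hypothetical output path
    pipeline_args=['--runner=DirectRunner'])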
def update_schema(csv_file, schema):
    # Get column names from the passed-in schema; the names are assumed to be in
    # the same order as the CSV columns. (The original used list.sort(), which
    # returns None.)
    cols = [f.name for f in schema.feature]
    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    new_batch_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                        column_names=cols,
                                                        stats_options=options)
    # Update the schema with statistics computed from the new batch of data.
    updated_schema = tfdv.update_schema(schema, new_batch_stats)
    # tfdv.display_schema(schema=updated_schema)
    return updated_schema
def get_num_anomalies(csv_file, schema, environment='TRAINING'):
    # Get column names from the passed-in schema; the names are assumed to be in
    # the same order as the CSV columns. (The original used list.sort(), which
    # returns None.)
    cols = [f.name for f in schema.feature]
    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=cols,
                                                   stats_options=options)
    # Check the data for errors by validating its stats against the previously
    # inferred schema, restricted to the given environment.
    anomalies = tfdv.validate_statistics(statistics=data_stats, schema=schema,
                                         environment=environment)
    # tfdv.display_anomalies(anomalies)
    return len(anomalies.anomaly_info)
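# A small sketch (with hypothetical file and column names) showing how the
# helpers above can be chained: infer a schema from the training CSV, evolve it
# with a new batch, then count anomalies in an evaluation CSV.
columns = ['age', 'income', 'label']   # hypothetical column names
schema = infer_schema_from_csv('train.csv', columns)
schema = update_schema('new_batch.csv', schema)
num_anomalies = get_num_anomalies('eval.csv', schema, environment='TRAINING')
print('Number of anomalous features:', num_anomalies)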
input_event.type = metadata_store_pb2.Event.DECLARED_INPUT

# Submit input event to the Metadata Store
store.put_events([input_event])

print('Input event:\n', input_event)

# ## Run the TFDV component
#
# You will now run the TFDV component to generate the schema of the dataset.
# This should look familiar since you've done this already in Week 1.

# In[10]:

# Infer a schema by passing statistics to `infer_schema()`
train_data = './data/train/data.csv'
train_stats = tfdv.generate_statistics_from_csv(data_location=train_data)
schema = tfdv.infer_schema(statistics=train_stats)

schema_file = './schema.pbtxt'
tfdv.write_schema_text(schema, schema_file)

print("Dataset's schema has been generated at:", schema_file)

# ## Generate output artifact unit
#
# Now that the TFDV component has finished running and the schema has been
# generated, you can create the artifact for the generated schema.

# In[11]:

# Declare output artifact of type Schema_artifact
schema_artifact = metadata_store_pb2.Artifact()
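# A follow-up sketch, not part of the original notebook: the schema written above
# with `write_schema_text` can be loaded back later with `load_schema_text`.
reloaded_schema = tfdv.load_schema_text(schema_file)
tfdv.display_schema(reloaded_schema)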
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate, project, mode):
    """Writes a TFDV-generated schema.

    Args:
      output_dir: output folder.
      column_names: list of names for the columns in the CSV file. If omitted,
        the first line is treated as the column names.
      key_columns: list of the names for columns that should be treated as
        unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate against the
        schema.
      project: the project to run Dataflow in.
      mode: whether the job should be `local` or `cloud`.
    """
    if mode == 'local':
        pipeline_options = None
    elif mode == 'cloud':
        temp_dir = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfdv-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file': './validation/setup.py',
            'project': project,
            'temp_location': temp_dir,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    else:
        raise ValueError('Invalid mode %s.' % mode)

    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    # The schema proto serializes to bytes, so open local files in binary mode.
    with open('/output_schema.pb2', 'wb+') as f:
        f.write(schema.SerializeToString())
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('Writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())

    schema_json = convert_schema_proto_to_json(schema, column_names, key_columns)
    with open('/output_schema.json', 'w+') as f:
        json.dump(schema_json, f)
    with file_io.FileIO(os.path.join(output_dir, 'schema.json'), 'w+') as f:
        logging.getLogger().info('Writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    if not csv_data_file_to_validate:
        return

    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    with open('/output_validation_result.txt', 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
        logging.getLogger().info('Writing anomalies to {}'.format(f.name))
        f.write(anomalies.SerializeToString())
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))
""" !pip install -q tensorflow_data_validation import tensorflow_data_validation as tfdv print('TFDV version: {}'.format(tfdv.version.__version__)) """### Compute and visualize statistics TFDV can compute descriptive statistics providing a quick overview of the data in terms of the features, shapes and distribution of the values. First use tfdv.generate_statistics_from_csv to compute statistics for train data, TFDV can compute descriptive statistics, generate_statistics_from_csv works specifying a file pattern. The pattern can contain glob characters (*, ?, [...] and sets). """ # Generate stats for Ste train_stats = tfdv.generate_statistics_from_csv( data_location=os.path.join(DATA_DIR, 'data_employees.csv')) # Visualize tfdv.visualize_statistics(train_stats) """it is possible to check the distribution of each variable, categorical and numeric features are show separated, <br> The first thing it is possible to note the TERMINATED samples are very few compared with the ACTIVE, the class is very unbalanced <br> No missing data was found ### Evaluation Data After check the training data, it will be compared with the test data, if the data is different between the test and train data it will have problems when using the model for prediction. """
def generate_stat_html(table_name, file_name_pattern, query_filter="",
                       table_name_default=None, html_file_name='stats.html',
                       sample_size=300000, stratified=None):
    '''
    Builds a tensorflow-data-validation report comparing a current table with a
    default table.

    Args:
        table_name: name of the current table to report on
        file_name_pattern: file-name pattern used when exporting from BQ
        query_filter: filters to apply in the query
        table_name_default: full name of the default BQ table used for
            comparison. If None, the report is built only for table_name
        html_file_name: name of the HTML file in which to save the report
        sample_size: maximum sample size to analyze
        stratified: list of columns used for stratification. Columns that do not
            exist in both tables (current and default) are ignored. If None, no
            stratification is performed
    Outputs:
        HTML of the report
    '''
    import pyarrow
    import tensorflow_data_validation as tfdv

    feat = __get_features(table_name)
    if table_name_default is not None:
        feat_default = __get_features(table_name_default)
        feat = list(set(feat) & set(feat_default))
    feat_query = "{0}".format(','.join(feat))

    query_pattern = """
        WITH TOTAL AS (SELECT COUNT(*) AS TOTAL_COUNT FROM `{0}`)
        SELECT {2}
        FROM (SELECT * FROM `{0}` {3})
        CROSS JOIN TOTAL
        WHERE RAND() <= {1} / TOTAL_COUNT
    """

    extract_csv_from_bq(
        query_pattern.format(table_name, sample_size, feat_query, query_filter),
        file_name_pattern + "_current.csv",
        table_name.split('.')[0],
        table_name.split('.')[1],
        'tmp_{0}'.format(datetime.now().strftime("%Y%m%d%H%M%f")))
    # current_file_name = download_file_from_gcp(file_name_pattern + "_current.csv")
    current_file_name = file_name_pattern + "_current.csv"
    stats = tfdv.generate_statistics_from_csv(current_file_name)

    if table_name_default is not None:
        extract_csv_from_bq(
            query_pattern.format(table_name_default, sample_size, feat_query, query_filter),
            file_name_pattern + "_default.csv",
            table_name_default.split('.')[0],
            table_name_default.split('.')[1],
            'tmp_{0}'.format(datetime.now().strftime("%Y%m%d%H%M%f")))
        # default_file_name = download_file_from_gcp(file_name_pattern + "_default.csv")
        default_file_name = file_name_pattern + "_default.csv"
        stats_default = tfdv.generate_statistics_from_csv(default_file_name)
        html = tfdv.utils.display_util.get_statistics_html(
            lhs_statistics=stats, lhs_name="Current",
            rhs_statistics=stats_default, rhs_name="Default")
    else:
        html = tfdv.utils.display_util.get_statistics_html(lhs_statistics=stats)

    with open('./stat.html', 'w') as f:
        f.write(html)
    upload_file_to_gcp(local_filename='./stat.html', gcs_path=html_file_name)
    return html
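# A hedged example call for generate_stat_html; the table names and paths are
# placeholders, not values from the original project.
report_html = generate_stat_html(
    table_name='my_project.my_dataset.features_current',        # hypothetical
    file_name_pattern='gs://my-bucket/tmp/features',            # hypothetical
    table_name_default='my_project.my_dataset.features_baseline',
    html_file_name='reports/features_stats.html',
    sample_size=100000)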
def create_stats(transform_dir, filepath):
    DENSE_FLOAT_FEATURE_KEYS = []
    VOCAB_FEATURE_KEYS = []
    for i in range(len(_CSV_COLUMNS_NAMES)):
        if _CSV_COLUMN_types[i] is tf.string:
            VOCAB_FEATURE_KEYS.append(_CSV_COLUMNS_NAMES[i])
        if _CSV_COLUMN_types[i] is tf.float32:
            DENSE_FLOAT_FEATURE_KEYS.append(_CSV_COLUMNS_NAMES[i])

    train_stats = tfdv.generate_statistics_from_csv(data_location=filepath,
                                                    delimiter='|')
    jsonObj = json.loads(MessageToJson(train_stats))

    # Collect mean/std/min/max for every numeric feature reported by TFDV.
    stats_dict = {}
    for i in range(len(jsonObj['datasets'][0]['features'])):
        mean, std, minv, maxv = 0.1, 0.1, 0.1, 0.2
        try:
            name = jsonObj['datasets'][0]['features'][i]['path']['step'][0]
            mean = jsonObj['datasets'][0]['features'][i]['numStats']['mean']
            std = jsonObj['datasets'][0]['features'][i]['numStats']['stdDev']
            minv = jsonObj['datasets'][0]['features'][i]['numStats']['min']
            maxv = jsonObj['datasets'][0]['features'][i]['numStats']['max']
            stats_dict[name] = [float(mean), float(std), float(minv), float(maxv)]
        except BaseException:
            try:
                name = jsonObj['datasets'][0]['features'][i]['name']
                mean = jsonObj['datasets'][0]['features'][i]['numStats']['mean']
                std = jsonObj['datasets'][0]['features'][i]['numStats']['stdDev']
                minv = jsonObj['datasets'][0]['features'][i]['numStats']['min']
                maxv = jsonObj['datasets'][0]['features'][i]['numStats']['max']
                stats_dict[name] = [float(mean), float(std), float(minv), float(maxv)]
            except BaseException:
                pass

    # Average the per-window stats of each signal, then save everything to CSV.
    for c in ['rsrp0', 'rsrp1', 'rsrp2', 'rsrp', 'ta']:
        count = 0
        try:
            statsc = stats_dict[c]
            count += 1
        except BaseException:
            statsc = [0, 0, 0, 0]
        mean, std, minv, maxv = statsc
        for i in range(1, window_size, 1):
            try:
                minv += stats_dict[c + '_' + str(i)][2]
                maxv += stats_dict[c + '_' + str(i)][3]
                mean += stats_dict[c + '_' + str(i)][0]
                std += stats_dict[c + '_' + str(i)][1]
                count += 1
            except Exception as e:
                print(e)
        if count == 0:
            stats_dict[c] = [0, 0, 0, 0]
        else:
            stats_dict[c] = [mean / count, std / count, minv / count, maxv / count]

    w = csv.writer(open(os.path.join(
        transform_dir, "output_stats_{}".format(os.path.basename(filepath))), "w"))
    for key, val in stats_dict.items():
        w.writerow([key, val])
    return stats_dict
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow_data_validation as tfdv

# The following variables are provided through dependency injection. These
# variables come from the specified input path and arguments provided by the
# API post request.
#
# source

train_stats = tfdv.generate_statistics_from_csv(
    data_location=source)  # noqa: F821

tfdv.visualize_statistics(train_stats)
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime
from tensorflow_data_validation.utils import slicing_util

data_location = '/home/jarekk/workspace/test.csv'
output_location = '/home/jarekk/workspace/stats.pb'

slice_fn = slicing_util.get_feature_value_slicer(features={'time_window': None})
stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

stats = tfdv.generate_statistics_from_csv(data_location,
                                          stats_options=stats_options,
                                          output_path=output_location)
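# Once written, the sliced statistics can be reloaded and rendered later; this
# follow-up is a sketch and was not part of the original script.
reloaded_stats = tfdv.load_statistics(output_location)
tfdv.visualize_statistics(reloaded_stats)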
def _gen_tfdv_stats(args):
    stats = tfdv.generate_statistics_from_csv(data_location=args.data_path)
    _save_tfdv_stats_html(stats, args)
    return stats
import os
import sys
import urllib.request
import zipfile

import tensorflow as tf
import tensorflow_data_validation as tfdv

tf.logging.set_verbosity(tf.logging.ERROR)
print('TFDV version: {}'.format(tfdv.version.__version__))

# Confirm that we're using Python 3
assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'

# Set up some globals for our file paths
BASE_DIR = "/home/neo/tmp_data"
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'chicago_taxi_output')
TRAIN_DATA = os.path.join(DATA_DIR, 'train', 'data.csv')
EVAL_DATA = os.path.join(DATA_DIR, 'eval', 'data.csv')
SERVING_DATA = os.path.join(DATA_DIR, 'serving', 'data.csv')

# Download the zip file from GCP and unzip it
zip_path, headers = urllib.request.urlretrieve(
    'https://storage.googleapis.com/tfx-colab-datasets/chicago_data.zip')
zipfile.ZipFile(zip_path).extractall(BASE_DIR)
zipfile.ZipFile(zip_path).close()

print("Here's what we downloaded:")
os.system('ls -R /home/neo/tmp_data/data')

train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
tfdv.visualize_statistics(train_stats)

schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate):
    """Writes a TFDV-generated schema.

    Args:
      output_dir: output folder.
      column_names: list of names for the columns in the CSV file. If omitted,
        the first line is treated as the column names.
      key_columns: list of the names for columns that should be treated as
        unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate against the
        schema.
    """
    logging.getLogger().info('running in local mode')
    pipeline_options = None

    logging.getLogger().info('starting stats on tfdv')
    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    logging.getLogger().info('writing output_schema.pb2')
    # The schema proto serializes to bytes, so open local files in binary mode.
    with open('{}/output_schema.pb2'.format(output_dir), 'wb+') as f:
        f.write(schema.SerializeToString())
    logging.getLogger().info('writing schema.pb2 to [output_dir] {}'.format(output_dir))
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())

    schema_json = convert_schema_proto_to_json(schema, column_names, key_columns)
    logging.getLogger().info('writing output_schema.json')
    with open('{}/output_schema.json'.format(output_dir), 'w+') as f:
        json.dump(schema_json, f)
    schema_json_file = os.path.join(output_dir, 'schema.json')
    with file_io.FileIO(schema_json_file, 'w+') as f:
        logging.getLogger().info('writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)
    with open('{}/schema.txt'.format(output_dir), 'w+') as f:
        logging.getLogger().info('writing schema.txt to {}'.format(f.name))
        f.write(schema_json_file)
    logging.getLogger().info('Schema write done.')

    if not csv_data_file_to_validate:
        logging.getLogger().info('No csv file to validate')
        return

    logging.getLogger().info('Validation stats...')
    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    logging.getLogger().info('writing output validation results ...')
    with open('{}/output_validation_result.txt'.format(output_dir), 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    # logging.getLogger().info('logging anomalies result ...')
    # with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
    #     logging.getLogger().info('logging anomalies to {}'.format(f.name))
    #     f.write(anomalies.SerializeToString())
    #
    # for feature_name, anomaly_info in anomalies.anomaly_info.items():
    #     logging.getLogger().error(
    #         'Anomaly in feature "{}": {}'.format(
    #             feature_name, anomaly_info.description))

    return 0
"""### Compute and visualize statistics TFDV can compute descriptive statistics providing a quick overview of the data in terms of the features, shapes and distribution of the values. First use tfdv.generate_statistics_from_csv to compute statistics for train data (Liander region), TFDV can compute descriptive statistics, generate_statistics_from_csv works specifying a file pattern. The pattern can contain glob characters (*, ?, [...] and sets). """ # Define column names names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'wage'] # Generate stats for train data train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA, column_names=names) # Visualize tfdv.visualize_statistics(train_stats) """it is possible to check the distribution of each variable (max, min, median, mean, std and zeros), how much missing data to each feature. The numeric and categorical variables are shown separated. With this information it is possible have an idea about the data and infer some preliminary assumptios, most of the people work 30-40 hours per week, they are men married and receive less than 50k per year. ### Infer a schema Infering a schema we save a lot of initial work, the scheme defines the data that are relevant to the model, for example, the type of each feature (numerical or categorical), for categorical features the schema also defines the domain - the list of acceptable values, TFDV provides a initial version of the schema based on the descriptive statistics. Define the schema is important because the model pipeline relies on it also it is provide a documentation of the data. """
schema = tfdv.infer_schema(statistics=train_stats)

# %%
tfdv.display_schema(schema=schema)

# %%
'''
## Check that the test data complies with the schema inferred from the train data
'''

# %%
# Compute stats for evaluation data
test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)

# Compare evaluation data with training data
tfdv.visualize_statistics(lhs_statistics=test_stats, rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')

# %%
'''
## Evaluate for anomalies related to the schema
'''

# %%
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# %%
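# If display_anomalies reports unexpected values or drift, the usual next step is
# to relax the schema and re-validate. A sketch with a hypothetical feature name:
feature = tfdv.get_feature(schema, 'native_country')    # hypothetical feature
feature.distribution_constraints.min_domain_mass = 0.9  # tolerate some new values
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
tfdv.display_anomalies(anomalies)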
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)
    schema_file = os.path.join(args.transform_dir, _SCHEMA_FILE)

    if args.runner == 'DataflowRunner':
        schema = utils.read_schema(schema_file)
        dataflow_options = {
            'job_name': args.job_name,
            'project': args.project,
            'service_account_email': '',
            'setup_file': os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'setup.py')),
            'temp_location': os.path.join(args.output_dir, 'tmp')
        }
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **dataflow_options)
    else:
        pipeline_options = beam.pipeline.PipelineOptions(None)
        if os.path.exists(args.transform_dir):
            logger.info('Removing existing directory %s', args.transform_dir)
            shutil.rmtree(args.transform_dir)
        stats = tfdv.generate_statistics_from_csv(data_location=args.all_data)
        schema = tfdv.infer_schema(statistics=stats, infer_feature_shape=False)
        if not file_io.file_exists(args.transform_dir):
            file_io.recursive_create_dir(args.transform_dir)
        with file_io.FileIO(schema_file, 'w') as f:
            f.write(text_format.MessageToString(schema))
        logger.info('Generated %s', schema_file)

    logger.info('Running pipeline on %s environment', args.runner)
    with beam.Pipeline(args.runner, options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=os.path.join(args.output_dir, 'tmp')):
            # `mode=predict` should be used during inference time
            if args.mode == 'predict':
                logger.info('Transforming only prediction data.')
                transform_predict(
                    pipeline=pipeline,
                    predict_data=args.predict_data,
                    data_source=args.data_source,
                    output_dir=args.output_dir,
                    schema=schema)
            else:
                logger.info('Transforming training, evaluation and prediction data.')
                transform_predict(
                    pipeline=pipeline,
                    predict_data=args.predict_data,
                    data_source=args.data_source,
                    output_dir=args.output_dir,
                    schema=schema)
                transform_train_and_eval(
                    pipeline=pipeline,
                    train_data=args.train_data,
                    eval_data=args.eval_data,
                    data_source=args.data_source,
                    transform_dir=args.transform_dir,
                    output_dir=args.output_dir,
                    schema=schema)
PATH_TO_WHL_FILE = './tfdv.whl'

COLUMNS = "FL_DATE, MKT_UNIQUE_CARRIER, ORIGIN_AIRPORT_ID, DEST_AIRPORT_ID, DEP_TIME, DEP_DELAY, ARR_DELAY, DISTANCE, dep_lat, dep_lng, arr_lat, arr_lng".split(', ')

# Create and set your PipelineOptions.
options = PipelineOptions()

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = GCS_STAGING_LOCATION
google_cloud_options.temp_location = GCS_TMP_LOCATION
options.view_as(StandardOptions).runner = 'DataflowRunner'

setup_options = options.view_as(SetupOptions)
# PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
# setup_options.extra_packages = [PATH_TO_WHL_FILE]
setup_options.setup_file = os.path.join(os.getcwd(), 'tfdv_setup.py')

stats_options = tfdv.StatsOptions(sample_rate=0.01)

tfdv.generate_statistics_from_csv(
    GCS_DATA_LOCATION,
    column_names=COLUMNS,
    output_path=GCS_STATS_OUTPUT_PATH,
    stats_options=stats_options,
    pipeline_options=options,
)
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow_data_validation as tfdv

# The following variables are provided through dependency injection. These
# variables come from the specified input path and arguments provided by the
# API post request.
#
# input_path

train_stats = tfdv.generate_statistics_from_csv(data_location=input_path)

tfdv.visualize_statistics(train_stats)
from pathlib import Path
import tensorflow_data_validation as tfdv

print(f'TFDV version: {tfdv.version.__version__}')

data_folder = Path("../dataset")
# The paths below are relative to data_folder.
users_file_glob = "AllUsers.csv"
ads_file_glob = "AllAds.csv"
users_ads_ratings = "users-ads-without-gcp-ratings.csv"

users_stats = tfdv.generate_statistics_from_csv((data_folder / f"*{users_file_glob}").as_posix())
tfdv.visualize_statistics(users_stats)

user_schema = tfdv.infer_schema(statistics=users_stats)
tfdv.display_schema(schema=user_schema)

ads_stats = tfdv.generate_statistics_from_csv((data_folder / f"*{ads_file_glob}").as_posix())
tfdv.visualize_statistics(ads_stats)
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                    HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv
    from apache_beam.io import textio
    from apache_beam.io import tfrecordio
    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour', 'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10
    BUCKET_FEATURE_KEYS = ['pickup_latitude', 'pickup_longitude',
                           'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000
    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10
    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract',
                          'payment_type', 'company', 'pickup_community_area',
                          'dropoff_community_area']

    # Allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude',
                         'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the csv to get an ordered list of column names
    # (the schema will scramble the features).
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # Show the inferred schema
    tfdv.display_schema(schema=schema)

    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add a new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names,
                            os.path.join(_kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema,
                            os.path.join(_kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
""" !pip install -q tensorflow_data_validation import tensorflow_data_validation as tfdv print('TFDV version: {}'.format(tfdv.version.__version__)) """### Compute and visualize statistics TFDV can compute descriptive statistics providing a quick overview of the data in terms of the features, shapes and distribution of the values. First use tfdv.generate_statistics_from_csv to compute statistics for train data (Liander region), TFDV can compute descriptive statistics, generate_statistics_from_csv works specifying a file pattern. The pattern can contain glob characters (*, ?, [...] and sets). """ # Generate stats for Ste train_stats = tfdv.generate_statistics_from_csv(data_location=DATA_DIR+'/l*') # Visualize tfdv.visualize_statistics(train_stats) """it is possible to check the distribution of each variable (max, min, median, mean, std and zeros), how much missing data to each feature. The numeric and categorical variables are shown separated. The charts show the purchase_are has 64,81% missing data, cliking in the opção "Show Raw Data" it is possible to see the details results to each feature. ### Infer a schema Infering a schema we save a lot of initial work, the scheme defines the data that are relevant to the model, for example, the type of each feature (numerical or categorical), for categorical features the schema also defines the domain - the list of acceptable values, TFDV provides a initial version of the schema based on the descriptive statistics. Define the schema is important because the mode pipeline relies on it also it is provide a documentation of the data. """