Example #1
def generate_tfdv_stats(
    input_data: str,
    output_path: str,
    job_name: str,
    use_dataflow: str,
    project_id: str,
    region: str,
    gcs_temp_location: str,
    gcs_staging_location: str,
    whl_location: str = '',
    requirements_file: str = 'requirements.txt'
) -> NamedTuple('Outputs', [('stats_path', str)]):

    import logging
    import time

    import tensorflow_data_validation as tfdv
    import tensorflow_data_validation.statistics.stats_impl
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions

    # pip download tensorflow_data_validation --no-deps --platform manylinux2010_x86_64 --only-binary=:all:
    # CHANGE this if your download resulted in a different filename.

    logging.getLogger().setLevel(logging.INFO)
    logging.info("output path: %s", output_path)
    logging.info("Building pipeline options")
    # Create and set your PipelineOptions.
    options = PipelineOptions()

    if use_dataflow == 'true':
        logging.info("using Dataflow")
        if not whl_location:
            raise ValueError(
                'A TFDV wheel file (whl_location) is required with the Dataflow runner.')
        # For Cloud execution, set the Cloud Platform project, job_name,
        # staging location, temp_location and specify DataflowRunner.
        google_cloud_options = options.view_as(GoogleCloudOptions)
        google_cloud_options.project = project_id
        google_cloud_options.job_name = '{}-{}'.format(job_name,
                                                       str(int(time.time())))
        google_cloud_options.staging_location = gcs_staging_location
        google_cloud_options.temp_location = gcs_temp_location
        google_cloud_options.region = region
        options.view_as(StandardOptions).runner = 'DataflowRunner'

        setup_options = options.view_as(SetupOptions)
        # PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
        setup_options.extra_packages = [whl_location]
        setup_options.requirements_file = requirements_file

    tfdv.generate_statistics_from_csv(data_location=input_data,
                                      output_path=output_path,
                                      pipeline_options=options)

    return (output_path, )
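
# The function above is written as a Kubeflow Pipelines lightweight component.
# A minimal sketch (not from the original source) of turning it into a pipeline
# step, assuming the KFP v1 SDK; the base image and package list are assumptions.
from kfp.components import create_component_from_func

generate_tfdv_stats_op = create_component_from_func(
    generate_tfdv_stats,
    base_image='python:3.8',  # hypothetical base image
    packages_to_install=['tensorflow-data-validation'],
)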
Example #2
def data_validation(data_path):
    train = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'train.csv'), delimiter=',')
    test = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'test.csv'), delimiter=',')
    schema = tfdv.infer_schema(train)
    # print(schema)
    # tfdv.display_schema(schema)
    anomalies = tfdv.validate_statistics(statistics=test, schema=schema)
    # print(anomalies)
    # tfdv.display_anomalies(anomalies)
    print(text_format.MessageToString(anomalies))
Example #3
 def run_tf_test(self):
     print("Running tf workflow test!")
     DATA_DIR = "./"
     TRAIN_DATA = os.path.join(DATA_DIR, 'cal_housing.csv')
     train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATA,
                                                     delimiter=',')
     schema = tfdv.infer_schema(train_stats)
     enc_stats = json_format.MessageToJson(train_stats)
     enc_schema = json_format.MessageToJson(schema)
     data = pd.read_csv('cal_housing.csv')
     ds_info = {"name" : "cal_housing_dataset",\
                "description": "data about housing in California",\
        "encoded_stats": enc_stats,\
        "encoded_schema": enc_schema,\
        "source": "UCI ML Repository" }
     ds_reg = self.ap.register_dataset(ds_info)
     featureset = data.dtypes.to_dict()
     featureset = {k: str(featureset[k]) for k in featureset}
     featureset["name"] = "wine_no_transformations"
     fs_reg = self.ap.register_featureset(featureset, ds_reg["_key"])
     dataset = self.ap.lookup_dataset("cal_housing_dataset")
     retrieved_stats = dataset["encoded_stats"]
     retrieved_schema = dataset["encoded_schema"]
     #print("Retrieved stats: " + str(retrieved_stats))
     print("Completed tf workflow test!")
     return
Example #4
def infer_schema_from_csv(csv_file, column_names):
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=column_names)

    #tfdv.visualize_statistics(data_stats)
    schema = tfdv.infer_schema(statistics=data_stats)

    return schema
Example #5
def csv_statistics_generator(file_path):
    """
    Generate statistics for the csv dataset
    """
    csv_stats = tfdv.generate_statistics_from_csv(data_location=file_path,
                                                  delimiter=',')
    csv_schema = tfdv.infer_schema(csv_stats)
    tfdv.display_schema(csv_schema)
    return csv_stats, csv_schema
Example #6
def compute_stats(input_handle, stats_path, pipeline_args=None):
    """Computes statistics on the input data.

    Args:
      input_handle: Path to csv file with input data.
      stats_path: Directory in which stats are materialized.
      pipeline_args: Optional command-line arguments forwarded to Beam's
          PipelineOptions (e.g. to run the job on Dataflow).
    """

    train_stats = tfdv.generate_statistics_from_csv(
        input_handle,
        delimiter=',',
        output_path=stats_path,
        pipeline_options=PipelineOptions(flags=pipeline_args))
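
# A minimal usage sketch (not from the original source), assuming the snippet's
# own imports (tfdv, PipelineOptions) are in scope; the paths are hypothetical.
# With pipeline_args=None the statistics job runs on Beam's default DirectRunner.
compute_stats(input_handle='data/train.csv',
              stats_path='output/train_stats.tfrecord',
              pipeline_args=None)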
Example #7
def update_schema(csv_file, schema):
    # Get column names from the passed-in schema
    cols = sorted(f.name for f in schema.feature)

    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    new_batch_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                        column_names=cols, stats_options=options)

    # Update the schema so that it also covers the new batch of data
    updated_schema = tfdv.update_schema(schema, new_batch_stats)
    # tfdv.display_schema(schema=updated_schema)

    return updated_schema
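
# A minimal usage sketch (not from the original source): load a previously
# written schema, update it with a new batch of data, and write it back.
# The file paths are hypothetical.
schema = tfdv.load_schema_text('schema.pbtxt')
schema = update_schema('data/new_batch.csv', schema)
tfdv.write_schema_text(schema, 'schema.pbtxt')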
Example #8
def get_num_anomalies(csv_file, schema, environment='TRAINING'):
    # Get column names from the passed-in schema
    cols = sorted(f.name for f in schema.feature)

    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=cols, stats_options=options)

    # Check eval data for errors by validating the eval data stats using the previously inferred schema
    anomalies = tfdv.validate_statistics(statistics=data_stats,
                                         schema=schema,
                                         environment=environment)

    #tfdv.display_anomalies(anomalies)

    return len(anomalies.anomaly_info)
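
# A minimal usage sketch (not from the original source); the paths are
# hypothetical. Validating against the SERVING environment only differs from
# TRAINING if the schema actually declares environments (e.g. a label feature
# excluded from SERVING).
schema = tfdv.load_schema_text('schema.pbtxt')
print(get_num_anomalies('data/train.csv', schema, environment='TRAINING'))
print(get_num_anomalies('data/serving.csv', schema, environment='SERVING'))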
Example #9
input_event.type = metadata_store_pb2.Event.DECLARED_INPUT

# Submit input event to the Metadata Store
store.put_events([input_event])

print('Input event:\n', input_event)

# ## Run the TFDV component
#
# You will now run the TFDV component to generate the schema of the dataset. This should look familiar since you've done this already in Week 1.

# In[10]:

# Infer a schema by passing statistics to `infer_schema()`
train_data = './data/train/data.csv'
train_stats = tfdv.generate_statistics_from_csv(data_location=train_data)
schema = tfdv.infer_schema(statistics=train_stats)

schema_file = './schema.pbtxt'
tfdv.write_schema_text(schema, schema_file)

print("Dataset's Schema has been generated at:", schema_file)

# ## Generate output artifact unit
#
# Now that the TFDV component has finished running and schema has been generated, you can create the artifact for the generated schema.

# In[11]:

# Declare output artifact of type Schema_artifact
schema_artifact = metadata_store_pb2.Artifact()
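
# A sketch of how the artifact might be filled in and registered (not from the
# original source). It assumes a schema ArtifactType was registered earlier
# (here called `schema_artifact_type`) and reuses `schema_file` from above.
schema_artifact.uri = schema_file
schema_artifact.type_id = schema_artifact_type.id

# Submit the output artifact to the Metadata Store
[schema_artifact_id] = store.put_artifacts([schema_artifact])
print('Schema artifact:\n', schema_artifact)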
Example #10
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate, project, mode):
    """Writes a TFDV-generated schema.

    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
          the first line is treated as the column names.
      key_columns: list of the names for columns that should be
          treated as unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate
          against the schema.
      project: the project to run dataflow in.
      mode: whether the job should be `local` or `cloud`.
    """
    if mode == 'local':
        pipeline_options = None
    elif mode == 'cloud':
        temp_dir = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfdv-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file':
            './validation/setup.py',
            'project':
            project,
            'temp_location':
            temp_dir,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)
    with open('/output_schema.pb2', 'wb') as f:
        f.write(schema.SerializeToString())
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('Writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())
    schema_json = convert_schema_proto_to_json(schema, column_names,
                                               key_columns)
    with open('/output_schema.json', 'w+') as f:
        json.dump(schema_json, f)
    with file_io.FileIO(os.path.join(output_dir, 'schema.json'), 'w+') as f:
        logging.getLogger().info('Writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    if not csv_data_file_to_validate:
        return

    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)
    with open('/output_validation_result.txt', 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
        logging.getLogger().info('Writing anomalies to {}'.format(f.name))
        f.write(anomalies.SerializeToString())
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))
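
# A minimal usage sketch (not from the original source); the directory, file
# names and key columns are hypothetical. In 'local' mode no Dataflow project
# is needed, so `project` can be left empty.
run_validator(output_dir='output/tfdv',
              column_names=None,  # None: take column names from the CSV header
              key_columns=['id'],
              csv_data_file='data/train.csv',
              csv_data_file_to_validate='data/eval.csv',
              project='',
              mode='local')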
Example #11
"""

!pip install -q tensorflow_data_validation
import tensorflow_data_validation as tfdv

print('TFDV version: {}'.format(tfdv.version.__version__))

"""### Compute and visualize statistics

TFDV can compute descriptive statistics that provide a quick overview of the data in terms of the features and the shapes and distributions of their values.

First, use tfdv.generate_statistics_from_csv to compute statistics for the train data. generate_statistics_from_csv takes a file pattern, which can contain glob characters (*, ?, [...] and sets).
"""

# Generate stats for the train data
train_stats = tfdv.generate_statistics_from_csv(
    data_location=os.path.join(DATA_DIR, 'data_employees.csv'))

# Visualize
tfdv.visualize_statistics(train_stats)

"""it is possible to check the distribution of each variable, categorical and numeric features are show separated,
<br>
The first thing it is possible to note the TERMINATED samples are very few compared with the ACTIVE, the class is very unbalanced
<br>
No missing data was found

### Evaluation Data

After check the training data, it will be compared with the test data,  if the data is different between the test and train data it will have problems when using the model for prediction.
"""
Example #12
def generate_stat_html(table_name,
                       file_name_pattern,
                       query_filter="",
                       table_name_default=None,
                       html_file_name='stats.html',
                       sample_size=300000,
                       stratified=None):
    '''
    Builds a tensorflow-data-validation report comparing a current table with a default table.

    Args:
        table_name: name of the current table to report on
        file_name_pattern: file-name pattern for the CSV exported from BQ
        query_filter: filters to apply in the query
        table_name_default: full name of the default BQ table used for comparison. If None, the report is built only for table_name
        html_file_name: name of the HTML file the report is saved as
        sample_size: maximum sample size to analyze
        stratified: list of columns to stratify by. Columns that do not exist in both tables (current and default) are ignored.
                    If None, no stratification is performed
    Outputs:
        HTML report
    '''

    import pyarrow
    import tensorflow_data_validation as tfdv

    feat = __get_features(table_name)

    if table_name_default is not None:
        feat_default = __get_features(table_name_default)
        feat = list(set(feat) & set(feat_default))

    feat_query = "{0}".format(','.join(feat))

    query_pattern = """        
        WITH TOTAL AS (
        SELECT COUNT(*) AS TOTAL_COUNT 
        FROM `{0}`) 

        SELECT {2}
        FROM 
        (SELECT
        *
        FROM `{0}`
        {3}
        )
        CROSS JOIN TOTAL

        WHERE RAND() <= {1} / TOTAL_COUNT   
    """

    extract_csv_from_bq(
        query_pattern.format(table_name, sample_size, feat_query,
                             query_filter), file_name_pattern + "_current.csv",
        table_name.split('.')[0],
        table_name.split('.')[1],
        'tmp_{0}'.format(datetime.now().strftime("%Y%m%d%H%M%f")))

    #     current_file_name = download_file_from_gcp(file_name_pattern + "_current.csv")
    current_file_name = file_name_pattern + "_current.csv"
    stats = tfdv.generate_statistics_from_csv(current_file_name)

    if table_name_default is not None:
        extract_csv_from_bq(
            query_pattern.format(table_name_default, sample_size, feat_query,
                                 query_filter),
            file_name_pattern + "_default.csv",
            table_name_default.split('.')[0],
            table_name_default.split('.')[1],
            'tmp_{0}'.format(datetime.now().strftime("%Y%m%d%H%M%f")))

        #         default_file_name = download_file_from_gcp(file_name_pattern + "_default.csv")
        default_file_name = file_name_pattern + "_default.csv"
        stats_default = tfdv.generate_statistics_from_csv(default_file_name)
        html = tfdv.utils.display_util.get_statistics_html(
            lhs_statistics=stats,
            lhs_name="Current",
            rhs_statistics=stats_default,
            rhs_name="Default")
    else:
        html = tfdv.utils.display_util.get_statistics_html(
            lhs_statistics=stats)

    with open('./stat.html', 'w') as f:
        f.write(html)

    upload_file_to_gcp(local_filename='./stat.html', gcs_path=html_file_name)

    return html
def create_stats(transform_dir, filepath):
    DENSE_FLOAT_FEATURE_KEYS = []
    VOCAB_FEATURE_KEYS = []
    for i in range(len(_CSV_COLUMNS_NAMES)):
        if _CSV_COLUMN_types[i] is tf.string:
            VOCAB_FEATURE_KEYS.append(_CSV_COLUMNS_NAMES[i])
        if _CSV_COLUMN_types[i] is tf.float32:
            DENSE_FLOAT_FEATURE_KEYS.append(_CSV_COLUMNS_NAMES[i])
    train_stats = tfdv.generate_statistics_from_csv(data_location=filepath,
                                                    delimiter='|')
    jsonObj = json.loads(MessageToJson(train_stats))
    stats_dict = {}
    for i in range(len(jsonObj['datasets'][0]['features'])):
        mean, std, minv, maxv = 0.1, 0.1, 0.1, 0.2
        try:
            name = jsonObj['datasets'][0]['features'][i]['path']['step'][0]
            mean = (jsonObj['datasets'][0]['features'][i]['numStats']['mean'])
            std = (jsonObj['datasets'][0]['features'][i]['numStats']['stdDev'])
            minv = (jsonObj['datasets'][0]['features'][i]['numStats']['min'])
            maxv = (jsonObj['datasets'][0]['features'][i]['numStats']['max'])
            stats_dict[name] = [
                float(mean), float(std),
                float(minv), float(maxv)
            ]
        except BaseException as e:
            try:
                name = jsonObj['datasets'][0]['features'][i]['name']
                mean = (
                    jsonObj['datasets'][0]['features'][i]['numStats']['mean'])
                std = (jsonObj['datasets'][0]['features'][i]['numStats']
                       ['stdDev'])
                minv = (
                    jsonObj['datasets'][0]['features'][i]['numStats']['min'])
                maxv = (
                    jsonObj['datasets'][0]['features'][i]['numStats']['max'])
                stats_dict[name] = [
                    float(mean),
                    float(std),
                    float(minv),
                    float(maxv)
                ]
            except BaseException as e1:
                pass
    # save
    for c in ['rsrp0', 'rsrp1', 'rsrp2', 'rsrp', 'ta']:
        count = 0
        try:
            statsc = stats_dict[c]
            count += 1
        except BaseException:
            statsc = [0, 0, 0, 0]
        mean, std, minv, maxv = statsc
        for i in range(1, window_size, 1):
            try:
                minv = (stats_dict[c + '_' + str(i)][2]) + minv
                maxv = (stats_dict[c + '_' + str(i)][3]) + maxv
                mean = (stats_dict[c + '_' + str(i)][0]) + mean
                std = (stats_dict[c + '_' + str(i)][1]) + std
                count += 1
            except Exception as e:
                print(e)
                pass
        if count == 0:
            stats_dict[c] = [0, 0, 0, 0]
        else:
            stats_dict[c] = [
                mean / count, std / count, minv / count, maxv / count
            ]

    w = csv.writer(
        open(
            os.path.join(transform_dir,
                         "output_stats_{}".format(os.path.basename(filepath))),
            "w"))
    for key, val in stats_dict.items():
        w.writerow([key, val])

    return stats_dict
Example #14
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow_data_validation as tfdv

# The following variables are provided through dependency injection. These
# variables come from the specified input path and arguments provided by the
# API post request.
#
# source

train_stats = tfdv.generate_statistics_from_csv(
    data_location=source)  # noqa: F821

tfdv.visualize_statistics(train_stats)
Example #15
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime

from tensorflow_data_validation.utils import slicing_util

data_location = '/home/jarekk/workspace/test.csv'
output_location = '/home/jarekk/workspace/stats.pb'

slice_fn = slicing_util.get_feature_value_slicer(
    features={'time_window': None})

stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

stats = tfdv.generate_statistics_from_csv(data_location,
                                          stats_options=stats_options,
                                          output_path=output_location)
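
# A follow-up sketch (not from the original source): the statistics written to
# output_location can be loaded back; with slice_functions set, the proto
# contains one dataset per value of 'time_window'.
loaded_stats = tfdv.load_statistics(output_location)
for dataset in loaded_stats.datasets:
    print('slice:', dataset.name, '- examples:', dataset.num_examples)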
Example #16
def _gen_tfdv_stats(args):
    stats = tfdv.generate_statistics_from_csv(data_location=args.data_path)
    _save_tfdv_stats_html(stats, args)
    return stats
import tensorflow as tf
import tensorflow_data_validation as tfdv

tf.logging.set_verbosity(tf.logging.ERROR)
print('TFDV version: {}'.format(tfdv.version.__version__))

# Confirm that we're using Python 3
assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'

# Set up some globals for our file paths
BASE_DIR = "/home/neo/tmp_data"
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'chicago_taxi_output')
TRAIN_DATA = os.path.join(DATA_DIR, 'train', 'data.csv')
EVAL_DATA = os.path.join(DATA_DIR, 'eval', 'data.csv')
SERVING_DATA = os.path.join(DATA_DIR, 'serving', 'data.csv')

# Download the zip file from GCP and unzip it
zip_path, headers = urllib.request.urlretrieve(
    'https://storage.googleapis.com/tfx-colab-datasets/chicago_data.zip')
with zipfile.ZipFile(zip_path) as z:
    z.extractall(BASE_DIR)

print("Here's what we downloaded:")
os.system('ls -R /home/neo/tmp_data/data')

train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
tfdv.visualize_statistics(train_stats)
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)
Example #18
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate):
    """Writes a TFDV-generated schema.
    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
          the first line is treated as the column names.
      key_columns: list of the names for columns that should be
          treated as unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate
          against the schema.
    """
    logging.getLogger().info('running in local mode')
    pipeline_options = None

    logging.getLogger().info('starting stats on tfdv')
    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    logging.getLogger().info('writing output_schema.pb2')
    with open('{}/output_schema.pb2'.format(output_dir), 'wb') as f:
        f.write(schema.SerializeToString())

    logging.getLogger().info('writing schema.pb2 to [output_dir] {}'.format(output_dir))
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())

    schema_json = convert_schema_proto_to_json(
        schema, column_names, key_columns)

    logging.getLogger().info('writing output_schema.json')
    with open('{}/output_schema.json'.format(output_dir), 'w+') as f:
        json.dump(schema_json, f)
    schema_json_file = os.path.join(output_dir, 'schema.json')

    with file_io.FileIO(schema_json_file, 'w+') as f:
        logging.getLogger().info('writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    with open('{}/schema.txt'.format(output_dir), 'w+') as f:
        logging.getLogger().info('schema.txt to {}'.format(f.name))
        f.write(schema_json_file)

    logging.getLogger().info('Schema Write Done...')

    if not csv_data_file_to_validate:
        logging.getLogger().info('No csv file to validate')
        return

    logging.getLogger().info('Validation Stats...')
    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    logging.getLogger().info('logging output validation results ...')
    with open('{}/output_validation_result.txt'.format(output_dir), 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    # logging.getLogger().info('logging anomalies result ...')
    # with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
    #     logging.getLogger().info('logging anomalies to {}'.format(f.name))
    #     f.write(anomalies.SerializeToString())
    #
    # for feature_name, anomaly_info in anomalies.anomaly_info.items():
    #     logging.getLogger().error(
    #         'Anomaly in feature "{}": {}'.format(
    #             feature_name, anomaly_info.description))
    return 0
"""### Compute and visualize statistics

TFDV can compute descriptive statistics that provide a quick overview of the data in terms of the features and the shapes and distributions of their values.

First, use tfdv.generate_statistics_from_csv to compute statistics for the train data (Liander region). generate_statistics_from_csv takes a file pattern, which can contain glob characters (*, ?, [...] and sets).
"""

# Define column names
names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
         'marital_status', 'occupation', 'relationship', 'race', 'sex', 
         'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 
         'wage']

# Generate stats for train data
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA,
                                                column_names=names)

# Visualize
tfdv.visualize_statistics(train_stats)

"""it is possible to check the distribution of each variable (max, min, median, mean, std and zeros), how much missing data to each feature. The numeric and categorical variables are shown separated.

With this information it is possible have an idea about the data and infer some preliminary assumptios, most of the people work 30-40 hours per week, they are men married and receive less than 50k per year.

### Infer a schema

Infering a schema we save a lot of initial work, the scheme defines the data that are relevant to the model, for example, the type of each feature (numerical or categorical), for categorical features the schema also defines the domain - the list of acceptable values, TFDV provides a initial version of the schema based on the descriptive statistics.

Define the schema is important because the model pipeline relies on it also it is provide a documentation of the data.
"""
Example #20
schema = tfdv.infer_schema(statistics=train_stats)

# %%


# %%
tfdv.display_schema(schema=schema)

# %%
'''
## check that training data complies with the schema inferred from train data !
'''

# %%
# Compute stats for evaluation data
test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)

# Compare evaluation data with training data
tfdv.visualize_statistics(lhs_statistics=test_stats, rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')

# %%
'''
## evaluate for anomalies related with schema
'''

# %%
anomalies = tfdv.validate_statistics(statistics=train_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# %%
Example #21
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)
  schema_file = os.path.join(args.transform_dir, _SCHEMA_FILE)
  if args.runner == 'DataflowRunner':
    schema = utils.read_schema(schema_file)
    dataflow_options = {
        'job_name':
            args.job_name,
        'project':
            args.project,
        'service_account_email':
            '',
        'setup_file':
            os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'setup.py')),
        'temp_location':
            os.path.join(args.output_dir, 'tmp')
    }
    pipeline_options = beam.pipeline.PipelineOptions(
        flags=[], **dataflow_options)
  else:
    pipeline_options = beam.pipeline.PipelineOptions(None)

    if os.path.exists(args.transform_dir):
      logger.info('Removing existing directory %s', args.transform_dir)
      shutil.rmtree(args.transform_dir)

    stats = tfdv.generate_statistics_from_csv(data_location=args.all_data)
    schema = tfdv.infer_schema(statistics=stats, infer_feature_shape=False)
    if not file_io.file_exists(args.transform_dir):
      file_io.recursive_create_dir(args.transform_dir)
    with file_io.FileIO(schema_file, 'w') as f:
      f.write(text_format.MessageToString(schema))
    logger.info('Generated %s', schema_file)
    logger.info('Running pipeline on %s environment', args.runner)

  with beam.Pipeline(args.runner, options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=os.path.join(args.output_dir, 'tmp')):
      # `mode=predict` should be used during inference time
      if args.mode == 'predict':
        logger.info('Transforming only prediction data.')
        transform_predict(
            pipeline=pipeline,
            predict_data=args.predict_data,
            data_source=args.data_source,
            output_dir=args.output_dir,
            schema=schema)
      else:
        logger.info('Transforming both training, evaluation and predict data.')
        transform_predict(
            pipeline=pipeline,
            predict_data=args.predict_data,
            data_source=args.data_source,
            output_dir=args.output_dir,
            schema=schema)
        transform_train_and_eval(
            pipeline=pipeline,
            train_data=args.train_data,
            eval_data=args.eval_data,
            data_source=args.data_source,
            transform_dir=args.transform_dir,
            output_dir=args.output_dir,
            schema=schema)
Example #22
PATH_TO_WHL_FILE = './tfdv.whl'

COLUMNS = "FL_DATE, MKT_UNIQUE_CARRIER, ORIGIN_AIRPORT_ID, DEST_AIRPORT_ID, DEP_TIME, DEP_DELAY, ARR_DELAY, DISTANCE, dep_lat, dep_lng, arr_lat, arr_lng".split(
    ', ')
# Create and set your PipelineOptions.
options = PipelineOptions()

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = GCS_STAGING_LOCATION
google_cloud_options.temp_location = GCS_TMP_LOCATION
options.view_as(StandardOptions).runner = 'DataflowRunner'

setup_options = options.view_as(SetupOptions)
# PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
# setup_options.extra_packages = [PATH_TO_WHL_FILE]
setup_options.setup_file = os.path.join(os.getcwd(), 'tfdv_setup.py')
stat_options = tfdv.StatsOptions(sample_rate=0.01)

tfdv.generate_statistics_from_csv(
    GCS_DATA_LOCATION,
    column_names=COLUMNS,
    output_path=GCS_STATS_OUTPUT_PATH,
    stats_options=stat_options,
    pipeline_options=options,
)
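
# A follow-up sketch (not from the original source): once the Dataflow job has
# written the statistics to GCS_STATS_OUTPUT_PATH, they can be loaded back and
# a schema inferred from them locally.
stats = tfdv.load_statistics(GCS_STATS_OUTPUT_PATH)
schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=schema)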
Example #23
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow_data_validation as tfdv

# The following variables are provided through dependency injection. These
# variables come from the specified input path and arguments provided by the
# API post request.
#
# input_path

train_stats = tfdv.generate_statistics_from_csv(data_location=input_path)

tfdv.visualize_statistics(train_stats)
Example #24

from pathlib import Path
import tensorflow_data_validation as tfdv

print(f'TFDV version: {tfdv.version.__version__}')


data_folder = Path("../dataset")
# below paths should be relative to data_folder
users_file_glob = "AllUsers.csv" 
ads_file_glob = "AllAds.csv"
users_ads_ratings = "users-ads-without-gcp-ratings.csv"


users_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_file_glob}").as_posix())


tfdv.visualize_statistics(users_stats)


user_schema = tfdv.infer_schema(statistics=users_stats)
tfdv.display_schema(schema=user_schema)


ads_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{ads_file_glob}").as_posix())


tfdv.visualize_statistics(ads_stats)

def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # read the first line of the csv to have an ordered list of column names
    # (the Schema will scramble the features)
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
"""

!pip install -q tensorflow_data_validation
import tensorflow_data_validation as tfdv

print('TFDV version: {}'.format(tfdv.version.__version__))

"""### Compute and visualize statistics

TFDV can compute descriptive statistics that provide a quick overview of the data in terms of the features and the shapes and distributions of their values.

First, use tfdv.generate_statistics_from_csv to compute statistics for the train data (Liander region). generate_statistics_from_csv takes a file pattern, which can contain glob characters (*, ?, [...] and sets).
"""

# Generate stats for the train data (Liander region)
train_stats = tfdv.generate_statistics_from_csv(data_location=DATA_DIR+'/l*')

# Visualize
tfdv.visualize_statistics(train_stats)

"""it is possible to check the distribution of each variable (max, min, median, mean, std and zeros), how much missing data to each feature. The numeric and categorical variables are shown separated.

The charts show the purchase_are has 64,81% missing data, cliking in the opção "Show Raw Data" it is possible to see the details results to each feature.

### Infer a schema

Infering a schema we save a lot of initial work, the scheme defines the data that are relevant to the model, for example, the type of each feature (numerical or categorical), for categorical features the schema also defines the domain - the list of acceptable values, TFDV provides a initial version of the schema based on the descriptive statistics.

Define the schema is important because the mode pipeline relies on it also it is provide a documentation of the data.
"""