Example No. 1
# Check eval data for errors by validating the Eval data stats using the 
# previously inferred schema.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Display anomalies
tfdv.display_anomalies(anomalies)

"""We have just one feature with anomalies, look deeply we can check the values are correct but the problem there is a period and a space in the values, so the TFDV considered it as a new values.

### Fixing the errors

The corrections will depend on our knowledge of the data: we look at each feature and decide what action to take. In our case it is not strictly necessary to adjust the schema, but to show how the errors can be corrected we will add the values to the domain and check the anomalies again.
"""

# Add the new values to the domain of the feature wage.
wage = tfdv.get_domain(schema, 'wage')
wage.value.append(' <=50K.')
wage.value.append(' >50K.')

# Validate eval stats after updating the schema 
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

"""Running and check again the anomalies we found not any error

### Schema Environments

With TFDV it is possible to define different schema requirements for different slices of your data; for example, the label is present in the training dataset but is not expected for prediction.

**Environments** can be used to express such requirements: features in the schema can be associated with a set of environments using `default_environment`, `in_environment` and `not_in_environment`.
"""
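
"""As a minimal sketch (the feature name `label` and the `serving_stats` variable are illustrative assumptions, not defined above), the label could be excluded from the serving environment like this:"""

# By default every feature in the schema belongs to both environments.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

# The label is only expected during training, not at prediction time.
tfdv.get_feature(schema, 'label').not_in_environment.append('SERVING')

# Validating against the SERVING environment no longer reports the missing label.
serving_anomalies = tfdv.validate_statistics(
    serving_stats, schema, environment='SERVING')
tfdv.display_anomalies(serving_anomalies)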
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # read the first line of the csv to get an ordered list of column names
    # (the schema will scramble the features)
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
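    # Require only 90% of the values of `company` to fall inside its domain,
    # so that rare or new company names are not flagged as anomalies.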
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
"""Three features has inconsistent values, two of them has wrong type (net_manager, type_conn_perc), the purchase_are it is a feature expected to be outside of the train schema, it is due the fact correspond to a different region in Netherlands.

### Fixing the errors

The corrections will depend on our knowledge of the data: we look at each feature and decide what action to take. The feature descriptions help us decide how to correct the errors: **net_manager** is the code of the regional network manager, **type_conn_perc** is the percentage of presence of the principal type of connection in the zipcode range, and **purchase_area** is the code of the area where the energy is purchased.

For type_conn_perc, let's convert it to BYTES, and for purchase_area, let's append the missing values to its domain.
"""

# Convert type_conn_perc to BYTES
type_conn_perc = tfdv.get_feature(schema, 'type_conn_perc')
type_conn_perc.type = 1  # type 1 represents BYTES in the schema's FeatureType enum
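
# The same change expressed with the enum constant (a sketch; assumes the
# tensorflow_metadata package that ships alongside TFDV):
from tensorflow_metadata.proto.v0 import schema_pb2
type_conn_perc.type = schema_pb2.BYTES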

# Add the new values to the domain of the feature purchase_area.
purchase_area = tfdv.get_domain(schema, 'purchase_area')
purchase_area.value.append('Stedin')
purchase_area.value.append('Stedin Delfland')
purchase_area.value.append('Stedin Elektriciteit Zuid-Kennemerland')
purchase_area.value.append('Stedin Midden-Holland')
purchase_area.value.append('Stedin Schiedam')
purchase_area.value.append('Stedin Utrecht')
purchase_area.value.append('Stedin Weert')

# Validate the test stats after updating the schema
updated_anomalies = tfdv.validate_statistics(test_stats, schema)
tfdv.display_anomalies(updated_anomalies)

"""It is left just one error, but we don't need to convert the net_manager to Integer, we'll use this feature as Bytes, since net_manager correspond to a code

### Schema Environments