Example #1
def csv_statistics_validator(stats, schema):
    """
    Validate statistics from a csv dataset
    """
    stats_anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)
    tfdv.display_anomalies(stats_anomalies)
    return stats_anomalies
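A hedged usage sketch (the CSV paths are placeholders, and tfdv is assumed to be imported as in the other examples):

# Generate statistics for the training CSV and infer a schema from them.
train_stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
schema = tfdv.infer_schema(statistics=train_stats)

# Validate statistics from another CSV against the inferred schema.
eval_stats = tfdv.generate_statistics_from_csv(data_location='data/eval.csv')
anomalies = csv_statistics_validator(eval_stats, schema)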
Example #2
def detect_anomalies(stats_uri: Text, schema_uri: Text, split_name: Text):
    schema = get_schema_proto(schema_uri)
    stats = get_statistics_dataset_dict(stats_uri)
    if split_name not in stats:
        raise Exception(f'{split_name} split not present!')
    anomalies = tfdv.validate_statistics(stats[split_name], schema)
    tfdv.display_anomalies(anomalies)
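The helpers get_schema_proto and get_statistics_dataset_dict are not shown here; a plausible sketch, assuming the schema is stored as a text proto and each split's statistics live under a per-split subdirectory (this layout is hypothetical, not from the original project):

import os
from typing import Text

import tensorflow as tf
import tensorflow_data_validation as tfdv


def get_schema_proto(schema_uri: Text):
    # Assumes the schema was written with tfdv.write_schema_text.
    return tfdv.load_schema_text(schema_uri)


def get_statistics_dataset_dict(stats_uri: Text):
    # Hypothetical layout: <stats_uri>/<split>/stats.tfrecord, one file per split.
    return {
        split.rstrip('/'): tfdv.load_statistics(
            os.path.join(stats_uri, split, 'stats.tfrecord'))
        for split in tf.io.gfile.listdir(stats_uri)
    }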
Example #3
 def display(self, artifact: types.Artifact):
     from IPython.core.display import display  # pylint: disable=g-import-not-at-top
     from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
     for split in artifact_utils.decode_split_names(artifact.split_names):
         display(HTML('<div><b>%r split:</b></div><br/>' % split))
         anomalies_path = os.path.join(artifact.uri, split,
                                       'anomalies.pbtxt')
         anomalies = tfdv.load_anomalies_text(anomalies_path)
         tfdv.display_anomalies(anomalies)
Example #4
def tfdv_skew_validator(feature_name, train_stats, serve_stats, schema, threshold):
    """
    Validate skew for the csv dataset
    """
    # This doesn't display skew anomalies as the book shows
    tfdv.get_feature(schema, feature_name).skew_comparator.infinity_norm.threshold = threshold
    skew_anomalies = tfdv.validate_statistics(statistics=train_stats,
                                              schema=schema,
                                              serving_statistics=serve_stats)
    tfdv.display_anomalies(skew_anomalies)
    return skew_anomalies
Example #5
def tfdv_drift_validator(feature_name, train_stats, previous_stats, schema, threshold):
    """
    Validate drift for the csv dataset
    """
    # This doesn't display drift anomalies as the book shows
    tfdv.get_feature(schema, feature_name).drift_comparator.infinity_norm.threshold = threshold
    drift_anomalies = tfdv.validate_statistics(statistics=train_stats,
                                               schema=schema,
                                               previous_statistics=previous_stats)
    tfdv.display_anomalies(drift_anomalies)
    return drift_anomalies
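Both validators above expect precomputed statistics; a minimal usage sketch (CSV paths, feature name, and threshold are placeholders):

train_stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
serve_stats = tfdv.generate_statistics_from_csv(data_location='data/serving.csv')
schema = tfdv.infer_schema(statistics=train_stats)

# Flag skew when the L-infinity distance between the training and serving
# distributions of 'payment_type' exceeds 0.01.
skew_anomalies = tfdv_skew_validator('payment_type', train_stats, serve_stats,
                                     schema, 0.01)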
Example #6
 def display(self, artifact: types.Artifact):
   from IPython.core.display import display  # pylint: disable=g-import-not-at-top
   from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
   for split in artifact_utils.decode_split_names(artifact.split_names):
     display(HTML('<div><b>%r split:</b></div><br/>' % split))
     anomalies_path = io_utils.get_only_uri_in_dir(
         artifact_utils.get_split_uri([artifact], split))
     if artifact_utils.is_artifact_version_older_than(
         artifact, artifact_utils._ARTIFACT_VERSION_FOR_ANOMALIES_UPDATE):  # pylint: disable=protected-access
       anomalies = tfdv.load_anomalies_text(anomalies_path)
     else:
       anomalies = anomalies_pb2.Anomalies()
       anomalies_bytes = io_utils.read_bytes_file(anomalies_path)
       anomalies.ParseFromString(anomalies_bytes)
     tfdv.display_anomalies(anomalies)
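Example #7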
schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare serving data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)

# COMMAND ----------

# Add skew and drift comparators
temp_f = tfdv.get_feature(schema, 'avg_temp_f')
temp_f.skew_comparator.jensen_shannon_divergence.threshold = 0
temp_f.drift_comparator.jensen_shannon_divergence.threshold = 0

precip_mm = tfdv.get_feature(schema, 'tot_precip_mm')
precip_mm.skew_comparator.jensen_shannon_divergence.threshold = 0
precip_mm.drift_comparator.jensen_shannon_divergence.threshold = 0

_anomalies = tfdv.validate_statistics(stats_train,
                                      schema,
                                      serving_statistics=stats_serve)
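The skew/drift anomalies computed above are not rendered in this snippet; presumably a follow-up cell displays them, e.g.:

# COMMAND ----------

# Render any skew/drift anomalies found between the statistics
tfdv.display_anomalies(_anomalies)

Example #8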
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow NaN values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key_columns: list of the column names that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # read the first line of the csv to get an ordered list of column names
    # (the schema will scramble the feature order)
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
Example #9
# %%
# Compute stats for evaluation data
test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)

# Compare evaluation data with training data
tfdv.visualize_statistics(lhs_statistics=test_stats, rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')

# %%
'''
## Evaluate for anomalies related to the schema
'''

# %%
anomalies = tfdv.validate_statistics(statistics=train_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# %%
## Check that the data complies with the schema of the train split

# %%
data_stats = tfdv.generate_statistics_from_dataframe(data)
serving_anomalies = tfdv.validate_statistics(data_stats, schema)
tfdv.display_anomalies(serving_anomalies)

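Example #10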
 def display(self, artifact: types.Artifact):
   anomalies_path = os.path.join(artifact.uri, 'anomalies.pbtxt')
   anomalies = tfdv.load_anomalies_text(anomalies_path)
   tfdv.display_anomalies(anomalies)
Example #11
# Generate training data statistics
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=dataset)
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the Schema
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
# Displaying all detected anomalies
# Integer larger than 10
# STRING type when expected INT type
# FLOAT type when expected INT type
# Integer smaller than 0
tfdv.display_anomalies(anomalies)

# New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)

# Statistics based on data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)

# Prepare the schema for the Serving environment
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")
Example #12
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

### Computing statistics for the test set
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

## Step 5: Comparing the test statistics with the schema
### Checking for anomalies in the new data
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

### Displaying the detected anomalies
# - Integers larger than 10
# - Expected type STRING but the column was of type INT
# - Expected type FLOAT but the column was of type INT
# - Integers smaller than 0
tfdv.display_anomalies(anomalies)

### New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)
test_set_copy.describe()

### Statistics based on the data with anomalies

test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)

anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)

tfdv.display_anomalies(anomalies_new)
"""In evalation of the data TFDV shows in the plot both the train and eval stats overlaid, it is possible to check the distribuition visually, it can speedy the change in the schema when necessary.

Looking in the plots we can consider both the train and eval has the same distribution, the difenrrence in the plots is just in the quantity of each value in the features.

### Anomalies

After generate stats for both train and eval dataset, it necessary to check if the schema define to each one is the same (range of values, type of feature and so on), TFDV has a function to look for anomalies, e.g, features define different for each schema.
"""

# Check eval data for errors by validating the Eval data stats using the 
# previously inferred schema.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Display anomalies
tfdv.display_anomalies(anomalies)

"""We have just one feature with anomalies, look deeply we can check the values are correct but the problem there is a period and a space in the values, so the TFDV considered it as a new values.

### Fixing the erros

The corrections will depend of our knowlegde of the data, looking at each feature and decide what action should be take, for our case it is not necessary to ajusted the schema, but to test how to correct the errors we will add the values to the domain and check again the anomalies
"""

# Add the new values to the domain of feature wage.
wage = tfdv.get_domain(schema, 'wage')
wage.value.append(' <=50K.')
wage.value.append(' >50K.')

# Validate eval stats after updating the schema 
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
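The updated anomalies are computed but not displayed in this snippet; presumably a final call renders them:

# After extending the 'wage' domain, this should report no remaining anomalies.
tfdv.display_anomalies(updated_anomalies)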
# MAGIC %md infer schema

# COMMAND ----------

weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------

# MAGIC %md check for anomalies

# COMMAND ----------

weather_anomalies = tfdv.validate_statistics(statistics=stats,
                                             schema=weather_data_schema)
tfdv.display_anomalies(weather_anomalies)

# COMMAND ----------

# MAGIC %md average monthly temperature

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT x.month,AVG(x.avg_temp_f) as avg_temp_f FROM (
# MAGIC SELECT MONTH(time) as month,YEAR(time) as year,SUM(avg_temp_f) as avg_temp_f
# MAGIC FROM dscc202_group05_db.weather_delta_bronze
# MAGIC GROUP BY MONTH(time), YEAR(time)
# MAGIC   ) x
# MAGIC GROUP BY x.month
# MAGIC ORDER BY x.month