Code example #1
import tensorflow_data_validation as tfdv


def tfrecord_statistics_generator(file_path):
    """
    Generate statistics and an inferred schema for a TFRecord dataset.
    """
    tfrecord_stats = tfdv.generate_statistics_from_tfrecord(data_location=file_path)
    tfrecord_schema = tfdv.infer_schema(tfrecord_stats)
    tfdv.display_schema(tfrecord_schema)
    return tfrecord_stats, tfrecord_schema
Code example #2
def csv_statistics_generator(file_path):
    """
    Generate statistics and an inferred schema for a CSV dataset.
    """
    csv_stats = tfdv.generate_statistics_from_csv(data_location=file_path,
                                                  delimiter=',')
    csv_schema = tfdv.infer_schema(csv_stats)
    tfdv.display_schema(csv_schema)
    return csv_stats, csv_schema
Code example #3
from typing import Text


def view_schema(uri: Text):
    """
    View the schema stored at `uri`.

    Args:
        uri: URI to the schema.
    """
    # `get_schema_proto` is a helper from the surrounding codebase.
    schema = get_schema_proto(uri)

    # TODO: [LOW] Replace with our own function
    tfdv.display_schema(schema=schema)
Code example #4
    def display_stats_for_examples(self, examples_id):
        """Displays stats for `examples_id`.

        Args:
            examples_id: An `int` indicating the id of a
                `TFXArtifactTypes.EXAMPLES` artifact.
        """
        stats_artifact = self.get_dest_artifact_of_type(
            examples_id, TFXArtifactTypes.EXAMPLE_STATS)
        if stats_artifact:
            stats = tfdv.load_statistics(
                os.path.join(stats_artifact.uri, 'stats_tfrecord'))
            tfdv.visualize_statistics(stats)
            print("display schema")
            tfdv.display_schema(tfdv.infer_schema(statistics=stats))
Code example #5
# COMMAND ----------

from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

# `train_df` (training data) and `fdf` (serving data) are Spark DataFrames
# assumed to be defined in earlier notebook cells.
stats_train = tfdv.generate_statistics_from_dataframe(
    dataframe=train_df.toPandas())
stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas())

schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare serving data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)
Code example #6
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                    HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # Allow NaN values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema will scramble the feature order).
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
Code example #7
  def display(self, artifact: types.Artifact):
    schema_path = os.path.join(artifact.uri, 'schema.pbtxt')
    schema = tfdv.load_schema_text(schema_path)
    tfdv.display_schema(schema)
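Example #7 reads a schema that was serialized as text to `schema.pbtxt`. For reference, a schema inferred as in the earlier snippets can be written in that format with `tfdv.write_schema_text`; this is only a sketch, and the output path is hypothetical:

import os
import tensorflow_data_validation as tfdv

# `schema` is assumed to be the proto returned by tfdv.infer_schema;
# the directory below is illustrative only.
schema_path = os.path.join("artifacts", "schema.pbtxt")
tfdv.write_schema_text(schema, schema_path)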
Code example #8
import pandas as pd
import tensorflow_data_validation as tfdv

# Simple dataset analysis
dataset = pd.read_csv("data/pollution-small.csv")
print(dataset.shape)

training_data = dataset[:1600]
print(training_data.describe())

test_set = dataset[1600:]
print(test_set.describe())

# Generate training data statistics
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the schema and display all detected
# anomalies, e.g.:
# - integer larger than 10
# - STRING type when INT type was expected
# - FLOAT type when INT type was expected
# - integer smaller than 0
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)
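The snippet ends after dropping the `soot` column. A plausible continuation, sketched here rather than taken from the original, would validate the modified copy against the schema, which should then flag the missing column:

test_stats_copy = tfdv.generate_statistics_from_dataframe(dataframe=test_set_copy)
anomalies_copy = tfdv.validate_statistics(statistics=test_stats_copy, schema=schema)
# Expect a "column dropped" style anomaly for `soot`.
tfdv.display_anomalies(anomalies_copy)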
Code example #9
from pathlib import Path

import tensorflow_data_validation as tfdv

data_folder = Path("../dataset")
# Paths below should be relative to data_folder
users_file_glob = "AllUsers.csv"
ads_file_glob = "AllAds.csv"
users_ads_ratings = "users-ads-without-gcp-ratings.csv"


users_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_file_glob}").as_posix())


tfdv.visualize_statistics(users_stats)


user_schema = tfdv.infer_schema(statistics=users_stats)
tfdv.display_schema(schema=user_schema)


ads_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{ads_file_glob}").as_posix())


tfdv.visualize_statistics(ads_stats)


ads_schema = tfdv.infer_schema(statistics=ads_stats)
tfdv.display_schema(schema=ads_schema)


users_ratings_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_ads_ratings}").as_posix())
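The original snippet stops after computing users_ratings_stats. Following the pattern used above for the users and ads files, a natural continuation (a sketch, not from the source) would be:

tfdv.visualize_statistics(users_ratings_stats)
users_ads_ratings_schema = tfdv.infer_schema(statistics=users_ratings_stats)
tfdv.display_schema(schema=users_ads_ratings_schema)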

Code example #10
def _tfdv_schema_info(schema):
    # `TFDVSchemaDisplay` is a context manager from the surrounding test code
    # that captures the objects `tfdv.display_schema` renders.
    with TFDVSchemaDisplay() as display:
        tfdv.display_schema(schema)
        # Two display objects are expected: the features table and the
        # domain-values table.
        assert len(display.display_obj) == 2, display.display_obj
        return display.display_obj
Code example #11
import warnings

import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

# `weatherDF` is a Spark DataFrame assumed to be defined in an earlier cell.
stats = tfdv.generate_statistics_from_dataframe(dataframe=weatherDF.toPandas())
tfdv.visualize_statistics(stats)
# tot_precip_mm is roughly 90% zeros and avg_wnd_mps roughly 16%; no missing data.
displayHTML(get_statistics_html(stats))

# COMMAND ----------

# MAGIC %md infer schema

# COMMAND ----------

weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------

# MAGIC %md check for anomalies

# COMMAND ----------

weather_anomalies = tfdv.validate_statistics(statistics=stats,
                                             schema=weather_data_schema)
tfdv.display_anomalies(weather_anomalies)

# COMMAND ----------

# MAGIC %md average monthly temperature
Code example #12
import io
from contextlib import redirect_stdout


def _render_inferred_schema_summary_markdown(inferred_schema):
    # Capture what tfdv.display_schema writes to stdout.
    display_schema_out = io.StringIO()
    with redirect_stdout(display_schema_out):
        tfdv.display_schema(inferred_schema)
    return f'''# Inferred schema summary