コード例 #1
0
def infer_schema(stats_path, schema_path):
    """Infer a schema from statistics and materialize it as a text proto.

    Args:
      stats_path: Location of the statistics used to infer the schema.
      schema_path: Location where the inferred schema is materialized.
    """
    # Fixed typo in the log message: "Infering" -> "Inferring".
    print('Inferring schema from statistics.')
    # infer_feature_shape=False: do not pin feature shapes, since example
    # shapes may vary across the dataset.
    schema = tfdv.infer_schema(tfdv.load_statistics(stats_path),
                               infer_feature_shape=False)
    # Echo the inferred schema in human-readable text-proto form.
    print(text_format.MessageToString(schema))

    print('Writing schema to output path.')
    tfdv.write_schema_text(schema, schema_path)
コード例 #2
0
ファイル: utils.py プロジェクト: Sruinard/centernet_tfx
 def write_schema(destination):
     """Write the generated schema as a text proto to *destination*.

     NOTE(review): this method takes no ``self``/``cls`` parameter and
     carries no ``@staticmethod`` decorator — presumably intended as a
     static method of the enclosing class; confirm against the full file.
     """
     # Build the schema via the project helper, then serialize it to text.
     schema = SchemaGenerator._get_schema()
     tfdv.write_schema_text(schema, destination)
コード例 #3
0
# Echo the incoming event for inspection.
print('Input event:\n', input_event)

# ## Run the TFDV component
#
# You will now run the TFDV component to generate the schema of dataset. This should look familiar since you've done this already in Week 1.

# In[10]:

# Infer a schema by passing statistics to `infer_schema()`
train_data = './data/train/data.csv'
# Compute dataset statistics directly from the CSV file.
train_stats = tfdv.generate_statistics_from_csv(data_location=train_data)
schema = tfdv.infer_schema(statistics=train_stats)

# Materialize the inferred schema as a text-format proto on disk.
schema_file = './schema.pbtxt'
tfdv.write_schema_text(schema, schema_file)

print("Dataset's Schema has been generated at:", schema_file)

# ## Generate output artifact unit
#
# Now that the TFDV component has finished running and schema has been generated, you can create the artifact for the generated schema.

# In[11]:

# Declare output artifact of type Schema_artifact
# Register the schema file in ML Metadata: build an Artifact proto that
# points at the schema file and carries a version and a display name.
schema_artifact = metadata_store_pb2.Artifact()
schema_artifact.uri = schema_file
# NOTE(review): `schema_artifact_type_id` is presumably the ArtifactType id
# registered with the metadata store earlier in the notebook — not visible
# in this excerpt; verify.
schema_artifact.type_id = schema_artifact_type_id
schema_artifact.properties['version'].int_value = 1
schema_artifact.properties['name'].string_value = 'Chicago Taxi Schema'
コード例 #4
0
# Expected anomalies in this run:
# Integer larger than 10
# STRING type when expected INT type
# FLOAT type when expected INT type
# Integer smaller than 0
print(tfdv.display_anomalies(anomalies))

# New data WITH anomalies: drop a column so validation flags it as missing.
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)

# Statistics based on data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
# Validate the degraded statistics against the frozen schema.
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
print(tfdv.display_anomalies(anomalies_new))

# Prepare the schema for the Serving environment: declare both
# environments, then exclude the target column from SERVING.
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")

tfdv.get_feature(schema, "soot").not_in_environment.append("SERVING")

# Re-validate in the SERVING environment — "soot" is no longer required.
serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                                 schema,
                                                 environment="SERVING")
print(tfdv.display_anomalies(serving_env_anomalies))

# Freezing the schema
# NOTE(review): extension is '.pbtext' here but '.pbtxt' elsewhere in this
# collection — likely a typo, though the file is still written; confirm.
tfdv.write_schema_text(schema=schema, output_path='pollution_schema.pbtext')
コード例 #5
0
### Statistics based on the data containing anomalies

test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)

# Validate the anomalous statistics against the previously inferred schema.
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)

tfdv.display_anomalies(anomalies_new)

## Step 6: Preparing the schema for production (Serving)

schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")

### Removing the target column from the serving schema

tfdv.get_feature(schema, "soot").not_in_environment.append("SERVING")

### Checking for anomalies between the serving environment and the new test set

serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                                 schema,
                                                 environment="SERVING")

tfdv.display_anomalies(serving_env_anomalies)

## Step 7: Saving the schema

tfdv.write_schema_text(schema=schema, output_path="pollution_schema.pbtxt")