def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('Writing schema to output path.')
  tfdv.write_schema_text(schema, schema_path)
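# Example invocation (a minimal sketch; the paths below are hypothetical and assume the
# statistics file was already produced and serialized by an earlier pipeline step):
infer_schema('output/train_stats.tfrecord', 'output/schema.pbtxt')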
def write_schema(destination):
  schema = SchemaGenerator._get_schema()
  tfdv.write_schema_text(schema, destination)
print('Input event:\n', input_event)


# ## Run the TFDV component
#
# You will now run the TFDV component to generate the schema of the dataset. This should look familiar since you've done this already in Week 1.

# In[10]:


# Infer a schema by passing statistics to `infer_schema()`
train_data = './data/train/data.csv'
train_stats = tfdv.generate_statistics_from_csv(data_location=train_data)
schema = tfdv.infer_schema(statistics=train_stats)

schema_file = './schema.pbtxt'
tfdv.write_schema_text(schema, schema_file)

print("Dataset's schema has been generated at:", schema_file)


# ## Generate output artifact unit
#
# Now that the TFDV component has finished running and the schema has been generated, you can create the artifact for the generated schema.

# In[11]:


# Declare output artifact of type Schema_artifact
schema_artifact = metadata_store_pb2.Artifact()
schema_artifact.uri = schema_file
schema_artifact.type_id = schema_artifact_type_id
schema_artifact.properties['version'].int_value = 1
schema_artifact.properties['name'].string_value = 'Chicago Taxi Schema'
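# A possible next step (a minimal sketch, assuming `store` is the `MetadataStore` instance
# created earlier in this notebook and `schema_artifact_type_id` came from
# `store.put_artifact_type(...)`): register the declared artifact in the metadata store so
# it receives an ID and can later be linked to executions.
schema_artifact_id = store.put_artifacts([schema_artifact])[0]
print('Schema artifact registered with id:', schema_artifact_id)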
# Anomalies introduced into the test data:
# - Integer larger than 10
# - STRING type where INT type was expected
# - FLOAT type where INT type was expected
# - Integer smaller than 0
tfdv.display_anomalies(anomalies)

# New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)

# Statistics based on data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)

# Prepare the schema for the serving environment
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")
tfdv.get_feature(schema, "soot").not_in_environment.append("SERVING")

serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                                 schema,
                                                 environment="SERVING")
tfdv.display_anomalies(serving_env_anomalies)

# Freezing the schema
tfdv.write_schema_text(schema=schema, output_path='pollution_schema.pbtxt')
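# A minimal round-trip sketch (assumes 'pollution_schema.pbtxt' was written by the step
# above): the frozen schema can be reloaded later and reused for validation without
# re-inferring it from training statistics.
reloaded_schema = tfdv.load_schema_text('pollution_schema.pbtxt')
reloaded_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                              reloaded_schema,
                                              environment="SERVING")
tfdv.display_anomalies(reloaded_anomalies)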
### Statistics based on the data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)

## Step 6: Preparing the schema for production (Serving)
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")

### Removing the target column from the serving schema
tfdv.get_feature(schema, "soot").not_in_environment.append("SERVING")

### Checking for anomalies between the serving environment and the new test set
serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                                 schema,
                                                 environment="SERVING")
tfdv.display_anomalies(serving_env_anomalies)

## Step 7: Saving the schema
tfdv.write_schema_text(schema=schema, output_path="pollution_schema.pbtxt")