def load_anomalies_text(input_path: Text) -> anomalies_pb2.Anomalies:
  """Reads an Anomalies proto from a text-format file.

  Args:
    input_path: Path of the file holding the text-format Anomalies proto.

  Returns:
    The Anomalies protocol buffer parsed from the file contents.
  """
  # text_format.Parse hands back the message it was given, so the populated
  # proto can be returned straight from the call.
  return text_format.Parse(
      io_util.read_file_to_string(input_path), anomalies_pb2.Anomalies())
def load_schema_text(input_path: Text) -> schema_pb2.Schema:
  """Reads a Schema proto from a text-format file.

  Args:
    input_path: Path of the file holding the text-format schema.

  Returns:
    The Schema protocol buffer parsed from the file contents.
  """
  file_contents = io_util.read_file_to_string(input_path)
  parsed_schema = schema_pb2.Schema()
  text_format.Parse(file_contents, parsed_schema)
  return parsed_schema
def load_stats_binary(
    input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Reads a binary-serialized DatasetFeatureStatisticsList from a file.

  Args:
    input_path: Path of the file holding the serialized
      DatasetFeatureStatisticsList proto.

  Returns:
    The DatasetFeatureStatisticsList proto parsed from the file contents.
  """
  # The file is read in binary mode because it holds serialized proto bytes.
  serialized = io_util.read_file_to_string(input_path, binary_mode=True)
  result = statistics_pb2.DatasetFeatureStatisticsList()
  result.ParseFromString(serialized)
  return result
def load_stats_text(
    input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Reads a DatasetFeatureStatisticsList proto from a text-format file.

  Args:
    input_path: Path of the file holding the text-format
      DatasetFeatureStatisticsList proto.

  Returns:
    The DatasetFeatureStatisticsList proto parsed from the file contents.
  """
  file_contents = io_util.read_file_to_string(input_path)
  # text_format.Parse returns the same message it populates.
  return text_format.Parse(
      file_contents, statistics_pb2.DatasetFeatureStatisticsList())
def load_anomalies_binary(input_path: Text) -> anomalies_pb2.Anomalies:
  """Reads a binary-serialized Anomalies proto from a file.

  Args:
    input_path: Path of the file holding the serialized Anomalies proto.

  Returns:
    The Anomalies protocol buffer parsed from the file contents.
  """
  # The file is read in binary mode because it holds serialized proto bytes.
  serialized = io_util.read_file_to_string(input_path, binary_mode=True)
  result = anomalies_pb2.Anomalies()
  result.ParseFromString(serialized)
  return result
def test_write_stats_to_binary_file(self):
  """Round-trips one stats proto through WriteStatisticsToBinaryFile."""
  expected = text_format.Parse(
      """ datasets { name: 'x' num_examples: 100 } """,
      statistics_pb2.DatasetFeatureStatisticsList())
  binary_path = os.path.join(self._get_temp_dir(), 'stats')

  # The output file is only read back once the pipeline context has exited.
  with beam.Pipeline() as pipeline:
    stats_pcoll = pipeline | beam.Create([expected])
    _ = stats_pcoll | stats_api.WriteStatisticsToBinaryFile(binary_path)

  raw_bytes = io_util.read_file_to_string(binary_path, binary_mode=True)
  actual = statistics_pb2.DatasetFeatureStatisticsList()
  actual.ParseFromString(raw_bytes)

  self.assertLen(actual.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, actual.datasets[0], expected.datasets[0])
def test_write_stats_to_tfrecord_and_binary(self):
  """Writes two stats protos and checks both outputs hold the combined stats."""

  def parse_stats(text_proto):
    # Builds a DatasetFeatureStatisticsList from its text-format spelling.
    return text_format.Parse(
        text_proto, statistics_pb2.DatasetFeatureStatisticsList())

  # Two inputs for the same dataset 'x', each carrying a different feature.
  first_input = parse_stats(
      """ datasets { name: 'x' num_examples: 100 features: { path: { step: "f1" } } } """)
  second_input = parse_stats(
      """ datasets { name: 'x' num_examples: 100 features: { path: { step: "f2" } } } """)
  # The single dataset both outputs are compared against.
  expected = parse_stats(
      """ datasets { name: 'x' num_examples: 100 features: { path: { step: "f1" } } features: { path: { step: "f2" } } } """)

  binary_path = os.path.join(self._get_temp_dir(), 'stats.pb')
  shard_prefix = os.path.join(self._get_temp_dir(), 'stats_shards')

  # Both outputs are only read back once the pipeline context has exited.
  with beam.Pipeline() as pipeline:
    inputs = pipeline | beam.Create([first_input, second_input])
    _ = inputs | stats_api.WriteStatisticsToRecordsAndBinaryFile(
        binary_path, shard_prefix)

  # Check the single binary file.
  raw_bytes = io_util.read_file_to_string(binary_path, binary_mode=True)
  binary_stats = statistics_pb2.DatasetFeatureStatisticsList()
  binary_stats.ParseFromString(raw_bytes)
  self.assertLen(binary_stats.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, binary_stats.datasets[0], expected.datasets[0])

  # Check the sharded records, matched by globbing on the shard prefix.
  sharded_stats = stats_util.load_sharded_statistics(
      shard_prefix + '*').proto()
  self.assertLen(sharded_stats.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, sharded_stats.datasets[0], expected.datasets[0])