# Imports assumed for these snippets (from tensorflow_data_validation and
# tensorflow_metadata; not shown on the original page):
from typing import Text

from google.protobuf import text_format
from tensorflow_data_validation.utils import io_util
from tensorflow_metadata.proto.v0 import anomalies_pb2
from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0 import statistics_pb2


def load_anomalies_text(input_path: Text) -> anomalies_pb2.Anomalies:
    """Loads the Anomalies proto stored in text format in the input path.

    Args:
      input_path: File path from which to load the Anomalies proto.

    Returns:
      An Anomalies protocol buffer.
    """
    anomalies = anomalies_pb2.Anomalies()
    anomalies_text = io_util.read_file_to_string(input_path)
    text_format.Parse(anomalies_text, anomalies)
    return anomalies
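A minimal round-trip sketch for this loader, reusing the modules above; the temporary path and the anomaly_info contents are illustrative, not from the original example:

import os
import tempfile

# Build a small Anomalies proto (hypothetical contents).
anomalies_in = text_format.Parse(
    """
    anomaly_info {
      key: "income"
      value {
        description: "Unexpected string values"
        severity: ERROR
      }
    }
    """, anomalies_pb2.Anomalies())

anomalies_path = os.path.join(tempfile.mkdtemp(), 'anomalies.pbtxt')
with open(anomalies_path, 'w') as f:
  # load_anomalies_text expects the text_format serialization of the proto.
  f.write(text_format.MessageToString(anomalies_in))

assert load_anomalies_text(anomalies_path) == anomalies_in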
Example #2
def load_schema_text(input_path: Text) -> schema_pb2.Schema:
    """Loads the schema stored in text format in the input path.

    Args:
      input_path: File path to load the schema from.

    Returns:
      A Schema protocol buffer.
    """
    schema = schema_pb2.Schema()
    schema_text = io_util.read_file_to_string(input_path)
    text_format.Parse(schema_text, schema)
    return schema
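A short sketch of inspecting a loaded schema; the file path is hypothetical, and the feature name/type fields come from the tensorflow_metadata Schema proto:

# Sketch: list feature names and types from a text-format schema.
schema = load_schema_text('/tmp/schema.pbtxt')
for feature in schema.feature:
  print(feature.name, schema_pb2.FeatureType.Name(feature.type))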
def load_stats_binary(
        input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Loads a serialized DatasetFeatureStatisticsList proto from a file.

    Args:
      input_path: File path from which to load the DatasetFeatureStatisticsList
        proto.

    Returns:
      A DatasetFeatureStatisticsList proto.
    """
    stats_proto = statistics_pb2.DatasetFeatureStatisticsList()
    stats_proto.ParseFromString(
        io_util.read_file_to_string(input_path, binary_mode=True))
    return stats_proto
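The binary loader expects the proto's wire-format serialization; a sketch of writing such a file and reading it back, with a hypothetical path and dataset name:

# Sketch: serialize stats to the binary wire format and load them back.
stats_out = statistics_pb2.DatasetFeatureStatisticsList()
stats_out.datasets.add(name='train', num_examples=100)

stats_path = '/tmp/stats.pb'  # hypothetical path
with open(stats_path, 'wb') as f:
  f.write(stats_out.SerializeToString())

assert load_stats_binary(stats_path).datasets[0].num_examples == 100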
def load_stats_text(
        input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Loads the specified DatasetFeatureStatisticsList proto stored in text format.

    Args:
      input_path: File path from which to load the DatasetFeatureStatisticsList
        proto.

    Returns:
      A DatasetFeatureStatisticsList proto.
    """
    stats_proto = statistics_pb2.DatasetFeatureStatisticsList()
    stats_text = io_util.read_file_to_string(input_path)
    text_format.Parse(stats_text, stats_proto)
    return stats_proto
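Once loaded, the proto can be inspected directly; a sketch with a hypothetical path:

# Sketch: summarize a text-format statistics file.
stats = load_stats_text('/tmp/stats.pbtxt')
for dataset in stats.datasets:
  print(dataset.name, dataset.num_examples, len(dataset.features))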
def load_anomalies_binary(input_path: Text) -> anomalies_pb2.Anomalies:
    """Loads the Anomalies proto stored in binary format in the input path.

    Args:
      input_path: File path from which to load the Anomalies proto.

    Returns:
      An Anomalies protocol buffer.
    """
    anomalies_proto = anomalies_pb2.Anomalies()

    anomalies_proto.ParseFromString(
        io_util.read_file_to_string(input_path, binary_mode=True))

    return anomalies_proto
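Text and binary anomalies files are not interchangeable; a hypothetical convenience wrapper (not part of TFDV) that picks the loader by a file-extension convention:

# Hypothetical helper, not part of tensorflow_data_validation.
def load_anomalies(input_path: Text) -> anomalies_pb2.Anomalies:
  if input_path.endswith(('.pbtxt', '.txt')):
    return load_anomalies_text(input_path)
  return load_anomalies_binary(input_path)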
Example #6
 def test_write_stats_to_binary_file(self):
     stats = text_format.Parse(
         """
     datasets {
       name: 'x'
       num_examples: 100
     }
     """, statistics_pb2.DatasetFeatureStatisticsList())
     output_path = os.path.join(self._get_temp_dir(), 'stats')
     with beam.Pipeline() as p:
         _ = (p | beam.Create([stats])
              | stats_api.WriteStatisticsToBinaryFile(output_path))
     stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
     serialized_stats = io_util.read_file_to_string(output_path,
                                                    binary_mode=True)
     stats_from_file.ParseFromString(serialized_stats)
     self.assertLen(stats_from_file.datasets, 1)
     test_util.assert_dataset_feature_stats_proto_equal(
         self, stats_from_file.datasets[0], stats.datasets[0])
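Outside the test, the file written by WriteStatisticsToBinaryFile can be read back with the binary loader shown earlier; the output path here is hypothetical:

# Sketch: read the pipeline's output with the loader defined above.
stats_from_file = load_stats_binary('/tmp/stats')
print(stats_from_file.datasets[0].num_examples)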
Example #7
    def test_write_stats_to_tfrecord_and_binary(self):
        stats1 = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f1"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
        stats2 = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f2"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        stats_combined = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f1"
             }
          }
          features: {
             path: {
                step: "f2"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        output_path_binary = os.path.join(self._get_temp_dir(), 'stats.pb')
        output_path_prefix = os.path.join(self._get_temp_dir(), 'stats_shards')
        with beam.Pipeline() as p:
            _ = (p | beam.Create([stats1, stats2])
                 | stats_api.WriteStatisticsToRecordsAndBinaryFile(
                     output_path_binary, output_path_prefix))

        stats_from_pb = statistics_pb2.DatasetFeatureStatisticsList()
        serialized_stats = io_util.read_file_to_string(output_path_binary,
                                                       binary_mode=True)
        stats_from_pb.ParseFromString(serialized_stats)
        self.assertLen(stats_from_pb.datasets, 1)
        test_util.assert_dataset_feature_stats_proto_equal(
            self, stats_from_pb.datasets[0], stats_combined.datasets[0])

        stats_from_shards = stats_util.load_sharded_statistics(
            output_path_prefix + '*').proto()
        self.assertLen(stats_from_shards.datasets, 1)
        test_util.assert_dataset_feature_stats_proto_equal(
            self, stats_from_shards.datasets[0], stats_combined.datasets[0])
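A sketch of the same reads outside a test, using only the helpers that appear above; both output paths are hypothetical:

# Sketch: the binary file and the sharded records describe the same
# merged statistics.
merged = load_stats_binary('/tmp/stats.pb')
sharded = stats_util.load_sharded_statistics('/tmp/stats_shards*').proto()
assert merged.datasets[0].num_examples == sharded.datasets[0].num_examples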