def process(self, examples: List[bytes]) -> Iterator[pa.RecordBatch]:
  # Decode a batch of serialized tf.Examples into an Arrow RecordBatch.
  decoded = self._decoder.DecodeBatch(examples)
  if self._raw_record_column_name is None:
    yield decoded
  else:
    # Carry the raw serialized bytes along as an extra column.
    yield record_based_tfxio.AppendRawRecordColumn(
        decoded, self._raw_record_column_name, examples)
Example #2

  def testAppendRawRecordColumn(
      self, input_record_batch,
      raw_records,
      expected_raw_record_column,
      record_index_column_name=None):
    # Example parametrized input: input_record_batch = pa.record_batch([pa.array([[1], [2]])], ["feature1"])
    column_name = "raw_record"
    output_record_batch = record_based_tfxio.AppendRawRecordColumn(
        record_batch=input_record_batch, column_name=column_name,
        raw_records=raw_records,
        produce_large_types=True,
        record_index_column_name=record_index_column_name)
    self.assertEqual(
        output_record_batch.num_columns,
        input_record_batch.num_columns + 1)
    for i in range(input_record_batch.num_columns):
      self.assertTrue(
          input_record_batch.column(i).equals(output_record_batch.column(i)))

    self.assertEqual(
        output_record_batch.schema.names[output_record_batch.num_columns - 1],
        column_name)
    self.assertTrue(
        output_record_batch.column(output_record_batch.num_columns - 1)
        .equals(expected_raw_record_column))
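
For orientation, here is a minimal, self-contained sketch of the call the examples on this page revolve around. The input RecordBatch, the raw byte strings, and the column name are illustrative only, and the exact Arrow type of the appended column (list versus large_list of binary) depends on the tfx_bsl version and on flags such as produce_large_types, so treat the output shown in the comments as an assumption rather than a guarantee.

import pyarrow as pa
from tfx_bsl.tfxio import record_based_tfxio

# Illustrative inputs (not taken from the examples above): one integer list
# feature and the two serialized records it was decoded from.
record_batch = pa.record_batch([pa.array([[1], [2]])], ["feature1"])
raw_records = [b"serialized_example_1", b"serialized_example_2"]

appended = record_based_tfxio.AppendRawRecordColumn(
    record_batch=record_batch,
    column_name="raw_record",
    raw_records=raw_records)

# The original columns are preserved and the raw bytes are appended as the
# last column, one entry per input record.
print(appended.schema.names)  # expected: ['feature1', 'raw_record']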
Example #3
def process(self, records: List[bytes]) -> Iterator[pa.RecordBatch]:
  # Decode the records with the decoder, then convert the resulting tensors
  # into an Arrow RecordBatch.
  decoded = self._tensors_to_record_batch_converter.convert(
      self._decoder.decode_record(records))
  if self._raw_record_column_name is None:
    yield decoded
  else:
    # Append the raw serialized bytes as an extra column.
    yield record_based_tfxio.AppendRawRecordColumn(
        decoded, self._raw_record_column_name, records,
        self._produce_large_raw_record_column)
Example #4
def process(self, records: List[bytes]) -> Iterator[pa.RecordBatch]:
  # Run the decode function on the raw records, then convert the resulting
  # tensors into an Arrow RecordBatch.
  decoded = self._tensors_to_record_batch_converter.convert(
      self._decode_fn(tf.convert_to_tensor(records, dtype=tf.string)))
  if self._raw_record_column_name is None:
    yield decoded
  else:
    # Append the raw serialized bytes as an extra column.
    yield record_based_tfxio.AppendRawRecordColumn(
        decoded, self._raw_record_column_name, records,
        self._record_index_column_name)
Example #5

    def RecordBatches(
        self, options: dataset_options.RecordBatchesOptions
    ) -> Iterator[pa.RecordBatch]:
        # Build a tf.data dataset over the TFRecord files, then decode each
        # batch of serialized tf.Examples into an Arrow RecordBatch.
        dataset = dataset_util.make_tf_record_dataset(
            self._file_pattern, options.batch_size, options.drop_final_batch,
            options.num_epochs, options.shuffle, options.shuffle_buffer_size,
            options.shuffle_seed)

        decoder = example_coder.ExamplesToRecordBatchDecoder(
            self._schema.SerializeToString())
        for examples in dataset.as_numpy_iterator():
            decoded = decoder.DecodeBatch(examples)
            if self._raw_record_column_name is None:
                yield decoded
            else:
                yield record_based_tfxio.AppendRawRecordColumn(
                    decoded, self._raw_record_column_name, examples.tolist())
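
Below is a hedged, self-contained variant of the same read-decode-append loop, using an in-memory tf.data dataset instead of TFRecord files so it can run standalone. The feature name, schema, and column name are made up for illustration, and decoder behaviour may differ slightly across tfx_bsl versions.

import tensorflow as tf
from tensorflow_metadata.proto.v0 import schema_pb2
from tfx_bsl.coders import example_coder
from tfx_bsl.tfxio import record_based_tfxio


def _make_example(value):
  # Build one serialized tf.Example with a single int64 feature.
  return tf.train.Example(features=tf.train.Features(feature={
      "feature1": tf.train.Feature(
          int64_list=tf.train.Int64List(value=[value]))
  })).SerializeToString()


schema = schema_pb2.Schema()
feature = schema.feature.add()
feature.name = "feature1"
feature.type = schema_pb2.INT

serialized = [_make_example(1), _make_example(2)]
dataset = tf.data.Dataset.from_tensor_slices(serialized).batch(2)
decoder = example_coder.ExamplesToRecordBatchDecoder(schema.SerializeToString())

for examples in dataset.as_numpy_iterator():
  examples = examples.tolist()  # numpy array of bytes -> List[bytes]
  decoded = decoder.DecodeBatch(examples)
  with_raw = record_based_tfxio.AppendRawRecordColumn(
      decoded, "raw_record", examples)
  print(with_raw.schema.names)  # expected: ['feature1', 'raw_record']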
Example #6
    def _readDatasetIntoBatchedExtracts(self):
        """Read the raw dataset and massage examples into batched Extracts."""
        serialized_examples = list(
            self._dataset.read_raw_dataset(deserialize=False,
                                           limit=self._max_num_examples()))

        # TODO(b/153996019): Once the TFXIO interface that returns an iterator of
        # RecordBatch is available, clean this up.
        coder = example_coder.ExamplesToRecordBatchDecoder(
            serialized_schema=benchmark_utils.read_schema(
                self._dataset.tf_metadata_schema_path()).SerializeToString())
        batches = []
        for i in range(0, len(serialized_examples), _BATCH_SIZE):
            example_batch = serialized_examples[i:i + _BATCH_SIZE]
            record_batch = record_based_tfxio.AppendRawRecordColumn(
                coder.DecodeBatch(example_batch), constants.ARROW_INPUT_COLUMN,
                example_batch)
            batches.append({constants.ARROW_RECORD_BATCH_KEY: record_batch})
        return batches
Example #7
  def testAppendRawRecordColumn(
      self, input_record_batch,
      raw_records,
      expected_raw_record_column,
      record_index_column_name=None):
    column_name = "raw_record"
    output_record_batch = record_based_tfxio.AppendRawRecordColumn(
        record_batch=input_record_batch, column_name=column_name,
        raw_records=raw_records,
        record_index_column_name=record_index_column_name)
    self.assertEqual(
        output_record_batch.num_columns,
        input_record_batch.num_columns + 1)
    for i in range(input_record_batch.num_columns):
      self.assertTrue(
          input_record_batch.column(i).equals(output_record_batch.column(i)))

    self.assertEqual(
        output_record_batch.schema.names[output_record_batch.num_columns - 1],
        column_name)
    self.assertTrue(
        output_record_batch.column(output_record_batch.num_columns - 1)
        .equals(expected_raw_record_column))
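
Finally, a sketch of what the expected_raw_record_column parameter fed into these tests might look like for two raw records. The single-element-list-per-row shape and the large binary types are assumptions based on the produce_large_types=True call earlier on this page, not a statement of the library's guaranteed output type.

import pyarrow as pa

raw_records = [b"record one", b"record two"]
# Assumed shape: one single-element list of raw bytes per input record;
# large_list/large_binary mirror produce_large_types=True, while plain
# list/binary would be the non-large variant.
expected_raw_record_column = pa.array(
    [[raw] for raw in raw_records],
    type=pa.large_list(pa.large_binary()))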