def process(self, examples: List[bytes]) -> Iterator[pa.RecordBatch]:
    """Decodes a batch of serialized examples into an Arrow RecordBatch.

    Args:
      examples: A batch of serialized (bytes) examples to decode.

    Yields:
      A single `pa.RecordBatch` containing the decoded examples. If
      `self._raw_record_column_name` is set, the RecordBatch additionally
      carries a column of that name holding the raw serialized records.
    """
    decoded = self._decoder.DecodeBatch(examples)
    if self._raw_record_column_name is None:
        yield decoded
    else:
        # Attach the original serialized records as an extra column so
        # downstream consumers can access the raw bytes alongside the
        # decoded features.
        yield record_based_tfxio.AppendRawRecordColumn(
            decoded, self._raw_record_column_name, examples)
def testAppendRawRecordColumn(
    self, input_record_batch, raw_records, expected_raw_record_column,
    record_index_column_name=None):
    """Checks AppendRawRecordColumn with large types enabled.

    Verifies that the output RecordBatch has exactly one extra column, that
    all original columns are unchanged, and that the appended column has the
    expected name and contents.
    """
    column_name = "raw_record"
    output_record_batch = record_based_tfxio.AppendRawRecordColumn(
        record_batch=input_record_batch,
        column_name=column_name,
        raw_records=raw_records,
        produce_large_types=True,
        record_index_column_name=record_index_column_name)
    self.assertEqual(
        output_record_batch.num_columns, input_record_batch.num_columns + 1)
    # All pre-existing columns must pass through untouched.
    for i in range(input_record_batch.num_columns):
        self.assertTrue(
            input_record_batch.column(i).equals(output_record_batch.column(i)))
    # The raw record column is appended last, under the requested name.
    self.assertEqual(
        output_record_batch.schema.names[output_record_batch.num_columns - 1],
        column_name)
    self.assertTrue(
        output_record_batch.column(output_record_batch.num_columns - 1)
        .equals(expected_raw_record_column))
def process(self, records: List[bytes]) -> Iterator[pa.RecordBatch]:
    """Decodes serialized records and yields them as one Arrow RecordBatch.

    When `self._raw_record_column_name` is set, the yielded RecordBatch also
    carries the raw serialized records in a column of that name.
    """
    decoded_tensors = self._decoder.decode_record(records)
    record_batch = self._tensors_to_record_batch_converter.convert(
        decoded_tensors)
    if self._raw_record_column_name is None:
        yield record_batch
        return
    yield record_based_tfxio.AppendRawRecordColumn(
        record_batch, self._raw_record_column_name, records,
        self._produce_large_raw_record_column)
def process(self, records: List[bytes]) -> Iterator[pa.RecordBatch]:
    """Runs the decode function over the records and yields a RecordBatch.

    When `self._raw_record_column_name` is set, the yielded RecordBatch also
    carries the raw serialized records in a column of that name.
    """
    records_tensor = tf.convert_to_tensor(records, dtype=tf.string)
    record_batch = self._tensors_to_record_batch_converter.convert(
        self._decode_fn(records_tensor))
    if self._raw_record_column_name is None:
        yield record_batch
        return
    yield record_based_tfxio.AppendRawRecordColumn(
        record_batch, self._raw_record_column_name, records,
        self._record_index_column_name)
def RecordBatches(
    self, options: dataset_options.RecordBatchesOptions
) -> Iterator[pa.RecordBatch]:
    """Yields decoded RecordBatches read from the TFRecord files.

    Args:
      options: Batching/shuffling options used to build the tf.data pipeline.

    Yields:
      One `pa.RecordBatch` per batch read from `self._file_pattern`. If
      `self._raw_record_column_name` is set, each RecordBatch additionally
      carries the raw serialized records in a column of that name.
    """
    dataset = dataset_util.make_tf_record_dataset(
        self._file_pattern, options.batch_size, options.drop_final_batch,
        options.num_epochs, options.shuffle, options.shuffle_buffer_size,
        options.shuffle_seed)
    decoder = example_coder.ExamplesToRecordBatchDecoder(
        self._schema.SerializeToString())
    for serialized_batch in dataset.as_numpy_iterator():
        record_batch = decoder.DecodeBatch(serialized_batch)
        if self._raw_record_column_name is None:
            yield record_batch
        else:
            yield record_based_tfxio.AppendRawRecordColumn(
                record_batch, self._raw_record_column_name,
                serialized_batch.tolist())
def _readDatasetIntoBatchedExtracts(self):
    """Read the raw dataset and massage examples into batched Extracts."""
    serialized_examples = list(
        self._dataset.read_raw_dataset(
            deserialize=False, limit=self._max_num_examples()))
    # TODO(b/153996019): Once the TFXIO interface that returns an iterator of
    # RecordBatch is available, clean this up.
    coder = example_coder.ExamplesToRecordBatchDecoder(
        serialized_schema=benchmark_utils.read_schema(
            self._dataset.tf_metadata_schema_path()).SerializeToString())
    extracts = []
    for start in range(0, len(serialized_examples), _BATCH_SIZE):
        example_batch = serialized_examples[start:start + _BATCH_SIZE]
        # Decode the slice and attach the raw serialized records as the
        # designated input column.
        record_batch = record_based_tfxio.AppendRawRecordColumn(
            coder.DecodeBatch(example_batch), constants.ARROW_INPUT_COLUMN,
            example_batch)
        extracts.append({constants.ARROW_RECORD_BATCH_KEY: record_batch})
    return extracts
def testAppendRawRecordColumn(
    self, input_record_batch, raw_records, expected_raw_record_column,
    record_index_column_name=None):
    """Checks AppendRawRecordColumn's output shape, names and contents.

    Verifies that exactly one column is appended, that every original column
    passes through unchanged, and that the appended column has the expected
    name and contents.
    """
    column_name = "raw_record"
    output_record_batch = record_based_tfxio.AppendRawRecordColumn(
        record_batch=input_record_batch,
        column_name=column_name,
        raw_records=raw_records,
        record_index_column_name=record_index_column_name)
    num_input_columns = input_record_batch.num_columns
    self.assertEqual(output_record_batch.num_columns, num_input_columns + 1)
    # Original columns must be carried over untouched.
    for idx in range(num_input_columns):
        self.assertTrue(
            input_record_batch.column(idx).equals(
                output_record_batch.column(idx)))
    # The appended column sits last, under the requested name.
    last_idx = output_record_batch.num_columns - 1
    self.assertEqual(output_record_batch.schema.names[last_idx], column_name)
    self.assertTrue(
        output_record_batch.column(last_idx).equals(
            expected_raw_record_column))