def testRecordBatchesWithRawRecords(self):
    """Validates each record batch produced when a raw-records column is set.

    The column name "raw_records" is handed both to `_MakeTFXIO` and to
    `_ValidateRecordBatch`, so the validator is told which raw example
    column name was requested.
    """
    raw_column = "raw_records"
    source = self._MakeTFXIO(_SCHEMA, raw_column)
    # Single unshuffled epoch; batch size equals len(_EXAMPLES).
    batch_options = dataset_options.RecordBatchesOptions(
        batch_size=len(_EXAMPLES),
        shuffle=False,
        num_epochs=1,
    )
    for batch in source.RecordBatches(batch_options):
        self._ValidateRecordBatch(source, batch, raw_column)
def testRecordBatchesWithProject(self):
    """Checks that a projected TFXIO yields batches with only one column.

    The TFXIO is projected onto "string_feature"; every resulting record
    batch is validated and its schema must list exactly that one name.
    """
    kept_feature = "string_feature"
    projected = self._MakeTFXIO(_SCHEMA).Project([kept_feature])
    # Single unshuffled epoch; batch size equals len(_EXAMPLES).
    batch_options = dataset_options.RecordBatchesOptions(
        batch_size=len(_EXAMPLES),
        shuffle=False,
        num_epochs=1,
    )
    for batch in projected.RecordBatches(batch_options):
        self._ValidateRecordBatch(projected, batch)
        column_names = batch.schema.names
        self.assertIn(kept_feature, column_names)
        self.assertLen(column_names, 1)
def _input_fn(
    file_pattern: Text,
    data_accessor: DataAccessor,
    schema: schema_pb2.Schema,
    batch_size: int = 20,
) -> Tuple[np.ndarray, np.ndarray]:
    """Builds the feature matrix and label vector used for tuning/training.

    Reads one epoch of record batches through `data_accessor`, then stacks
    the `_FEATURE_KEYS` columns side by side and collects the `_LABEL_KEY`
    column, concatenating the per-batch results.

    Args:
      file_pattern: input tfrecord file pattern.
      data_accessor: DataAccessor for converting input to RecordBatch.
      schema: schema of the input data.
      batch_size: An int representing the number of records to combine in a
        single batch.

    Returns:
      A (features, indices) tuple where features is a matrix of features, and
      indices is a single vector of label indices.
    """
    make_batches = data_accessor.record_batch_factory
    batch_options = dataset_options.RecordBatchesOptions(
        batch_size=batch_size, num_epochs=1)
    batches = make_batches(file_pattern, batch_options, schema)

    feature_chunks = []
    label_chunks = []
    for batch in batches:
        # Map each column name to its flattened values.
        # NOTE(review): presumably every row holds exactly one value per
        # column, so flattening keeps row alignment -- confirm against data.
        columns_by_name = {
            field.name: column.flatten()
            for column, field in zip(batch, batch.schema)
        }
        label_chunks.append(columns_by_name[_LABEL_KEY])
        selected = [columns_by_name[key] for key in _FEATURE_KEYS]
        # Stack along the last axis so each feature becomes one column.
        feature_chunks.append(np.stack(selected, axis=-1))

    return np.concatenate(feature_chunks), np.concatenate(label_chunks)
def testRecordBatches(self):
    """Validates each record batch produced by a TFXIO built from _SCHEMA."""
    source = self._MakeTFXIO(_SCHEMA)
    # Single unshuffled epoch; batch size equals len(_EXAMPLES).
    batch_options = dataset_options.RecordBatchesOptions(
        batch_size=len(_EXAMPLES),
        shuffle=False,
        num_epochs=1,
    )
    for batch in source.RecordBatches(batch_options):
        self._ValidateRecordBatch(source, batch)