def testFeedSerializeDeserializeMany(self):
    """Serialize two fed sparse tensors and deserialize them as one batch."""
    with self.test_session(use_gpu=False) as sess:
        first_ph = self._SparseTensorPlaceholder()
        second_ph = self._SparseTensorPlaceholder()
        first_val = self._SparseTensorValue_5x6(np.arange(6))
        second_val = self._SparseTensorValue_3x4(np.arange(6))

        stacked = tf.pack([
            tf.serialize_sparse(first_ph),
            tf.serialize_sparse(second_ph),
        ])
        batched = tf.deserialize_many_sparse(stacked, dtype=tf.int32)

        feed = {first_ph: first_val, second_ph: second_val}
        indices, values, shape = sess.run(batched, feed)

        # The first six entries are expected to come from the first input,
        # everything after that from the second one.
        head = slice(None, 6)
        tail = slice(6, None)

        # minibatch 0
        self.assertAllEqual(indices[head, 0], [0] * 6)
        self.assertAllEqual(indices[head, 1:], first_val[0])
        # minibatch 1
        self.assertAllEqual(indices[tail, 0], [1] * 6)
        self.assertAllEqual(indices[tail, 1:], second_val[0])

        self.assertAllEqual(values[head], first_val[1])
        self.assertAllEqual(values[tail], second_val[1])
        self.assertAllEqual(shape, [2, 5, 6])
def threaded_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            batch_device=None,
                            preprocess_device=None,
                            num_epochs=None):
    """Build a multi-reader input pipeline that yields padded batches.

    One reader/preprocessing subgraph is created per thread; all of them
    consume the same data queue and are merged with `tf.train.batch_join`.

    Args:
      base_dir: Forwarded to `_get_data_queue`.
      file_patterns: Forwarded to `_get_data_queue`.
      num_threads: Number of reader subgraphs to create.
      batch_size: Number of examples per batch.
      batch_device: Device passed to `tf.device` for the batching ops.
      preprocess_device: Device passed to `tf.device` for reading and
        preprocessing.
      num_epochs: Forwarded to `_get_data_queue`. When it is not None the
        final batch is allowed to be smaller than `batch_size`.

    Returns:
      Tuple `(image, width, label, length, text, filename)` of batched
      tensors; `label` is a SparseTensor cast to int32.
    """
    queue_capacity = num_threads * batch_size * 2

    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = num_epochs is not None

    data_queue = _get_data_queue(base_dir, file_patterns,
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    # each thread has a subgraph with its own reader (sharing filename queue)
    # list of [image, width, label, length, text, filename] elements
    data_tuples = []

    with tf.device(preprocess_device):
        for _ in range(num_threads):
            image, width, label, length, text, filename = _read_word_record(
                data_queue)
            image = _preprocess_image(image)  # move after batch?
            data_tuples.append([image, width, label, length, text, filename])

    with tf.device(batch_device):  # Create batch queue
        image, width, label, length, text, filename = tf.train.batch_join(
            data_tuples,
            batch_size=batch_size,
            capacity=queue_capacity,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        # Labels travel through the batcher in serialized form; rebuild the
        # SparseTensor after batching.
        label = tf.deserialize_many_sparse(label, tf.int64)
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename
def make_batch(self, batch_size, single_char=False):
    """Build one padded batch of parsed TFRecord examples.

    Args:
      batch_size: Number of examples per batch; also used as the number of
        parallel calls for the parsing map.
      single_char: Distortion is requested from the parser only when this is
        truthy and the subset is 'train'.

    Returns:
      `(features, labels)`: a dict keyed by `self.feat_keys` with the target
      entry removed, and the removed target entry (None if absent).
    """
    is_training = self.subset == 'train'

    parse_example = functools.partial(
        self._parser, distort=(single_char and is_training))
    dataset = tf.data.TFRecordDataset(self._get_filenames()).repeat()
    dataset = dataset.map(parse_example, num_parallel_calls=batch_size)

    if is_training:
        shuffle_floor = 0.4 * self.num_examples
        dataset = dataset.shuffle(
            buffer_size=int(shuffle_floor + 3 * batch_size))

    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=tuple(self._padding[name] for name in self.feat_keys))

    next_batch = dataset.make_one_shot_iterator().get_next()
    features = dict(zip(self.feat_keys, next_batch))

    # Entries whose key mentions 'sparse' are turned back into SparseTensors.
    features.update({
        name: tf.deserialize_many_sparse(tensor, dtype=tf.int32)
        for name, tensor in features.items()
        if 'sparse' in name
    })

    return features, features.pop(self._target_id, None)
def threaded_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            batch_device=None,
                            preprocess_device=None,
                            num_epochs=None):
    """Build a multi-reader input pipeline that yields padded batches.

    One reader/preprocessing subgraph is created per thread; all of them
    consume the same data queue and are merged with `tf.train.batch_join`.
    (Original author's note: this function is meant for testing.)

    Args:
      base_dir: Forwarded to `_get_data_queue`.
      file_patterns: Forwarded to `_get_data_queue`.
      num_threads: Number of reader subgraphs to create.
      batch_size: Number of examples per batch.
      batch_device: Device passed to `tf.device` for the batching ops.
      preprocess_device: Device passed to `tf.device` for reading and
        preprocessing.
      num_epochs: Forwarded to `_get_data_queue`. When it is not None the
        final batch is allowed to be smaller than `batch_size`.

    Returns:
      Tuple `(image, width, label, length, text, filename)` of batched
      tensors; `label` is a SparseTensor cast to int32.
    """
    queue_capacity = num_threads * batch_size * 2

    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = num_epochs is not None

    data_queue = _get_data_queue(base_dir, file_patterns,
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    # each thread has a subgraph with its own reader (sharing filename queue)
    # list of [image, width, label, length, text, filename] elements
    data_tuples = []

    with tf.device(preprocess_device):
        for _ in range(num_threads):
            image, width, label, length, text, filename = _read_word_record(
                data_queue)
            image = _preprocess_image(image)  # move after batch?
            data_tuples.append([image, width, label, length, text, filename])

    with tf.device(batch_device):  # Create batch queue
        image, width, label, length, text, filename = tf.train.batch_join(
            data_tuples,
            batch_size=batch_size,
            capacity=queue_capacity,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        # Labels travel through the batcher in serialized form; rebuild the
        # SparseTensor after batching.
        label = tf.deserialize_many_sparse(label, tf.int64)
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename
def inputs(tfrecords, batch_size, num_epochs, is_sparse_label=True):
    """Read shuffled batches of images and variable-length labels.

    Args:
      tfrecords: Path of the '.tfrecords' file to read.
      batch_size: Number of examples per batch.
      num_epochs: Passed to the filename producer.
      is_sparse_label: If true the labels are returned as a SparseTensor,
        otherwise they are converted to a dense tensor.

    Returns:
      `(images_batch, labels_batch)`.
    """
    min_after_dequeue = 1000

    with tf.name_scope('input'):
        # Step 1: the '.tfrecords' file goes into the filename queue, which is
        # shared even when several threads read from it.
        file_queue = tf.train.string_input_producer(
            [tfrecords], num_epochs=num_epochs)
        image, label = read_and_decode(file_queue)

        # Shuffle and batch on two threads so that this stage does not become
        # the bottleneck (a RandomShuffleQueue is used internally).
        # `min_after_dequeue` guarantees a minimum amount of shuffling.
        images_batch, serialized_labels = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=2,
            capacity=min_after_dequeue + 3 * batch_size,
            min_after_dequeue=min_after_dequeue)

        # Labels have variable length, so they were batched in serialized form.
        sparse_labels = tf.deserialize_many_sparse(
            serialized_labels, dtype=tf.int64)

        if is_sparse_label:
            return images_batch, sparse_labels
        return images_batch, tf.sparse_tensor_to_dense(sparse_labels)
def get_inputs(self):
    """Dequeue one element and return (inputs, sequence_lengths, labels).

    The dequeued labels are deserialized into an int32 SparseTensor; all ops
    are placed on "/cpu:0".
    """
    with tf.device("/cpu:0"):
        inputs, sequence_lengths, serialized_labels = self.queue.dequeue()
        sparse_labels = tf.deserialize_many_sparse(
            serialized_labels, dtype=tf.int32)
        return inputs, sequence_lengths, sparse_labels
def bucketed_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            boundaries=[32, 64, 96, 128, 160, 192, 224, 256],
                            input_device=None,
                            width_threshold=None,
                            length_threshold=None,
                            num_epochs=None):
    """Get batched input tensors padded to the batch max width

    Despite its name, this version batches with `tf.train.batch` rather than
    bucketing by width; `boundaries` is accepted for interface compatibility
    but is not used.

    Returns:
      image : float32 image tensor [batch_size 32 ? 1] padded to batch max width
      width : int32 image widths (for calculating post-CNN sequence length)
      label : Sparse tensor with label sequences for the batch
      length : Length of label sequence (text length)
      text : Human readable string for the image
      filename : Source file path
      number_of_images : second value returned by `_get_data_queue`
    """
    queue_capacity = num_threads * batch_size * 2

    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = num_epochs is not None

    # Per the original note: returns a tensor holding a list of length
    # `capacity` made up of tfrecords files.
    data_queue, number_of_images = _get_data_queue(
        base_dir, file_patterns,
        capacity=queue_capacity,
        num_epochs=num_epochs)

    with tf.device(input_device):  # Create batcher
        # reads images from the tfrecord
        image, width, label, length, text, filename = _read_word_record(
            data_queue)
        image = _preprocess_image(image)  # image normalization

        # True or False: whether to keep the image in the sample, depending
        # on the image width / label length thresholds.
        # NOTE(review): `keep_input` is not passed to `tf.train.batch` below,
        # so the thresholds currently have no effect on the batches - confirm
        # whether this is intended.
        keep_input = _get_input_filter(
            width, width_threshold,
            length, length_threshold)

        data_tuple = [image, width, label, length, text, filename]

        # bucket_by_sequence_length splits images into batches, clustering
        # them by width. If the batch size is too large for the number of
        # images of a given width in the dataset, images repeat within a
        # batch. This stems from the padding needed to keep image widths equal
        # inside one batch. If the dataset has no images of identical width,
        # the batch is assembled from images whose widths differ by some
        # tolerance, and those are then padded.
        # https://blog.altoros.com/the-magic-behind-google-translate-sequence-to-sequence-models-and-tensorflow.html
        # Below, bucket_by_sequence_length has been replaced by
        # tf.train.batch, which does not have these problems.
        data_tuple = tf.train.batch(
            tensors=data_tuple,
            batch_size=batch_size,
            num_threads=num_threads,
            capacity=queue_capacity,
            dynamic_pad=True,
            allow_smaller_final_batch=final_batch)

        [image, width, label, length, text, filename] = data_tuple
        label = tf.deserialize_many_sparse(label, tf.int64)  # post-batching...
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename, number_of_images
def postbatch_fn(image, width, label, length, text):
    """Post-batching step: bundle the batched tensors for the Dataset iterator.

    Returns:
      `(features, label)` where `features` maps "image", "width", "length"
      and "text" to the given tensors and `label` is an int32 SparseTensor.
    """
    # With batching done, the serialized labels can be made sparse again,
    # which is what ctc_loss needs.
    sparse_label = tf.deserialize_many_sparse(label, tf.int64)
    sparse_label = tf.cast(sparse_label, tf.int32)

    # Features in the layout consumed by the estimator
    features = dict(image=image, width=width, length=length, text=text)
    return features, sparse_label
def ImageInput(input_pattern, num_threads, shape, using_ctc, reader=None):
    """Builds batched image and label tensors from files matching a pattern.

    One `_ReadExamples` subgraph is created per preprocessing thread, all fed
    by a single shared filename queue; their outputs are merged with
    `tf.train.batch_join` using dynamic padding.

    TODO(rays) Expand for 2-d labels, 0-d labels, and logistic targets.

    Args:
      input_pattern: Glob pattern of the dataset file(s) to read.
      num_threads: Number of preprocessing threads (reader subgraphs).
      shape: Object providing `batch_size` (an ImageShape according to the
        original documentation); also forwarded to `_ReadExamples`.
      using_ctc: Forwarded to `_ReadExamples`.
      reader: Forwarded to `_ReadExamples`.

    Returns:
      images: Batched image tensor, named 'Images'.
      heights: Image heights flattened to 1-D, named 'Heights'.
      widths: Image widths flattened to 1-D, named 'Widths'.
      labels: Dense int64 labels reshaped to [batch_size, -1], named 'Labels'.
      sparse_labels: SparseTensor holding the same labels cast to int32.
      truths: Truth texts flattened to 1-D, named 'Truths'.

    Raises:
      AssertionError: if no file matches `input_pattern`.
    """
    matched_files = tf.gfile.Glob(input_pattern)
    assert matched_files, 'no files found for dataset ' + input_pattern

    batch_size = shape.batch_size
    filename_queue = tf.train.string_input_producer(
        matched_files, capacity=batch_size * num_threads * 2)

    # Every preprocessing thread gets its own reader subgraph; they all pull
    # from the same filename_queue.
    per_thread_examples = []
    for _ in range(num_threads):
        image, height, width, serialized, text = _ReadExamples(
            filename_queue, shape, using_ctc, reader)
        per_thread_examples.append([image, height, width, serialized, text])

    # Batching queue fed by all reader subgraphs.
    images, heights, widths, serialized_labels, truths = tf.train.batch_join(
        per_thread_examples,
        batch_size=batch_size,
        capacity=16 * batch_size,
        dynamic_pad=True)

    # The batcher cannot carry sparse tensors, so the labels are rebuilt here.
    int64_sparse = tf.deserialize_many_sparse(serialized_labels, tf.int64)
    sparse_labels = tf.cast(int64_sparse, tf.int32)
    labels = tf.reshape(
        tf.sparse_tensor_to_dense(int64_sparse),
        [batch_size, -1],
        name='Labels')

    # Everything else is flattened down to the batch dimension.
    heights = tf.reshape(heights, [-1], name='Heights')
    widths = tf.reshape(widths, [-1], name='Widths')
    truths = tf.reshape(truths, [-1], name='Truths')

    # Name the images and export them as a summary.
    images = tf.identity(images, name='Images')
    tf.image_summary('Images', images)
    return images, heights, widths, labels, sparse_labels, truths
def ImageInput(input_pattern, num_threads, shape, using_ctc, reader=None):
    """Builds batched image and label tensors from files matching a pattern.

    One `_ReadExamples` subgraph is created per preprocessing thread, all fed
    by a single shared filename queue; their outputs are merged with
    `tf.train.batch_join` using dynamic padding.

    TODO(rays) Expand for 2-d labels, 0-d labels, and logistic targets.

    Args:
      input_pattern: Glob pattern of the dataset file(s) to read.
      num_threads: Number of preprocessing threads (reader subgraphs).
      shape: Object providing `batch_size` (an ImageShape according to the
        original documentation); also forwarded to `_ReadExamples`.
      using_ctc: Forwarded to `_ReadExamples`.
      reader: Forwarded to `_ReadExamples`.

    Returns:
      images: Batched image tensor, named 'Images'.
      heights: Image heights flattened to 1-D, named 'Heights'.
      widths: Image widths flattened to 1-D, named 'Widths'.
      labels: Dense int64 labels reshaped to [batch_size, -1], named 'Labels'.
      sparse_labels: SparseTensor holding the same labels cast to int32.
      truths: Truth texts flattened to 1-D, named 'Truths'.

    Raises:
      AssertionError: if no file matches `input_pattern`.
    """
    matched_files = tf.gfile.Glob(input_pattern)
    assert matched_files, 'no files found for dataset ' + input_pattern

    batch_size = shape.batch_size
    filename_queue = tf.train.string_input_producer(
        matched_files, capacity=batch_size * num_threads * 2)

    # Every preprocessing thread gets its own reader subgraph; they all pull
    # from the same filename_queue.
    per_thread_examples = []
    for _ in range(num_threads):
        image, height, width, serialized, text = _ReadExamples(
            filename_queue, shape, using_ctc, reader)
        per_thread_examples.append([image, height, width, serialized, text])

    # Batching queue fed by all reader subgraphs.
    images, heights, widths, serialized_labels, truths = tf.train.batch_join(
        per_thread_examples,
        batch_size=batch_size,
        capacity=16 * batch_size,
        dynamic_pad=True)

    # The batcher cannot carry sparse tensors, so the labels are rebuilt here.
    int64_sparse = tf.deserialize_many_sparse(serialized_labels, tf.int64)
    sparse_labels = tf.cast(int64_sparse, tf.int32)
    labels = tf.reshape(
        tf.sparse_tensor_to_dense(int64_sparse),
        [batch_size, -1],
        name='Labels')

    # Everything else is flattened down to the batch dimension.
    heights = tf.reshape(heights, [-1], name='Heights')
    widths = tf.reshape(widths, [-1], name='Widths')
    truths = tf.reshape(truths, [-1], name='Truths')

    # Name the images and export them as a summary.
    images = tf.identity(images, name='Images')
    tf.summary.image('Images', images)
    return images, heights, widths, labels, sparse_labels, truths
def testDeserializeFailsInvalidProto(self):
    """Deserializing a batch whose second row is not a valid proto must fail."""
    with self.test_session(use_gpu=False) as sess:
        valid_ph = self._SparseTensorPlaceholder()
        valid_val = self._SparseTensorValue_5x6(np.arange(6))

        good_row = tf.serialize_sparse(valid_ph)
        bad_row = ["a", "b", "c"]  # plain strings, not serialized tensors
        batched = tf.deserialize_many_sparse(
            tf.pack([good_row, bad_row]), dtype=tf.int32)

        expected_error = r"Could not parse serialized_sparse\[1, 0\]"
        with self.assertRaisesOpError(expected_error):
            sess.run(batched, {valid_ph: valid_val})
def benchmarkVeryLarge2DFloatSparseTensor(self):
    """Benchmark tensors-map round trips against serialize/deserialize.

    Both paths are first checked to yield the same values, indices and shape,
    then each one is benchmarked for at least 2000 iterations.
    """
    np.random.seed(127)
    num_elements = 10000
    batch_size = 64

    batch_ids = np.random.randint(
        batch_size, size=num_elements, dtype=np.int64)
    element_ids = np.arange(num_elements, dtype=np.int64)
    indices_np = np.asarray(
        sorted(zip(batch_ids, element_ids)), dtype=np.int64)
    values_list = ["feature_value_for_embedding_lookup"] * num_elements
    shape_np = np.asarray([batch_size, num_elements], dtype=np.int64)

    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            indices_var = tf.Variable(indices_np)
            values_var = tf.Variable(values_list)
            shape_var = tf.Variable(shape_np)
            st = tf.SparseTensor(indices_var, values_var, shape_var)

        # Path 1: through the sparse tensors map.
        handles = add_many_sparse_to_tensors_map(st)
        via_map = take_many_sparse_from_tensors_map(
            sparse_map_op=handles.op, sparse_handles=handles)
        via_map_op = via_map.values.op

        # Path 2: through string serialization.
        via_serialization = tf.deserialize_many_sparse(
            tf.serialize_many_sparse(st), dtype=values_var.dtype)
        via_serialization_op = via_serialization.values.op

        tf.global_variables_initializer().run()

        map_result = sess.run(via_map)
        serialization_result = sess.run(via_serialization)
        for field in ("values", "indices", "shape"):
            np.testing.assert_equal(
                getattr(map_result, field),
                getattr(serialization_result, field))

        runs = (
            (via_map_op, "benchmark_very_large_2d_float_st_tensor_maps"),
            (via_serialization_op,
             "benchmark_very_large_2d_float_st_serialization"),
        )
        for op, benchmark_name in runs:
            self.run_op_benchmark(
                sess, op, min_iters=2000, name=benchmark_name)
def testDeserializeFailsInvalidProto(self):
    """Deserializing a batch whose second row is not a valid proto must fail."""
    with self.test_session(use_gpu=False) as sess:
        valid_ph = self._SparseTensorPlaceholder()
        valid_val = self._SparseTensorValue_5x6(np.arange(6))

        good_row = tf.serialize_sparse(valid_ph)
        bad_row = ["a", "b", "c"]  # plain strings, not serialized tensors
        batched = tf.deserialize_many_sparse(
            tf.pack([good_row, bad_row]), dtype=tf.int32)

        expected_error = r"Could not parse serialized_sparse\[1, 0\]"
        with self.assertRaisesOpError(expected_error):
            sess.run(batched, {valid_ph: valid_val})
def bucketed_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            boundaries=[32, 64, 96, 128, 160, 192, 224, 256],
                            input_device=None,
                            width_threshold=None,
                            length_threshold=None,
                            num_epochs=None):
    """Get input tensors bucketed by image width

    Args:
      base_dir: Forwarded to `_get_data_queue`.
      file_patterns: Forwarded to `_get_data_queue`.
      num_threads: Used only to size the queue capacity.
      batch_size: Number of examples per batch.
      boundaries: Bucket boundaries on image width.
      input_device: Device passed to `tf.device` for all input ops.
      width_threshold: Forwarded to `_get_input_filter`.
      length_threshold: Forwarded to `_get_input_filter`.
      num_epochs: Forwarded to `_get_data_queue`. When it is not None the
        final batch may be smaller than `batch_size`.

    Returns:
      image : float32 image tensor [batch_size 32 ? 1] padded to batch max width
      width : int32 image widths (for calculating post-CNN sequence length)
      label : Sparse tensor with label sequences for the batch
      length : Length of label sequence (text length)
      text : Human readable string for the image
      filename : Source file path
    """
    queue_capacity = num_threads * batch_size * 2

    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = num_epochs is not None

    data_queue = _get_data_queue(base_dir, file_patterns,
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    with tf.device(input_device):  # Create bucketing batcher
        image, width, label, length, text, filename = _read_word_record(
            data_queue)
        image = _preprocess_image(image)  # move after batch?

        keep_input = _get_input_filter(width, width_threshold,
                                       length, length_threshold)
        data_tuple = [image, label, length, text, filename]
        # The image width serves as the sequence length used for bucketing.
        width, data_tuple = tf.contrib.training.bucket_by_sequence_length(
            input_length=width,
            tensors=data_tuple,
            bucket_boundaries=boundaries,
            batch_size=batch_size,
            capacity=queue_capacity,
            keep_input=keep_input,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        [image, label, length, text, filename] = data_tuple
        label = tf.deserialize_many_sparse(label, tf.int64)  # post-batching...
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename
def bucketed_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            boundaries=[32, 64, 96, 128, 160, 192, 224, 256],
                            input_device=None,
                            width_threshold=None,
                            length_threshold=None,
                            num_epochs=None):
    """Get input tensors bucketed by image width

    Args:
      base_dir: Forwarded to `_get_data_queue`.
      file_patterns: Forwarded to `_get_data_queue`.
      num_threads: Used only to size the queue capacity.
      batch_size: Number of examples per batch.
      boundaries: Bucket boundaries on image width.
      input_device: Device passed to `tf.device` for all input ops.
      width_threshold: Forwarded to `_get_input_filter`.
      length_threshold: Forwarded to `_get_input_filter`.
      num_epochs: Forwarded to `_get_data_queue`. When it is not None the
        final batch may be smaller than `batch_size`.

    Returns:
      image : float32 image tensor [batch_size 32 ? 1] padded to batch max width
      width : int32 image widths (for calculating post-CNN sequence length)
      label : Sparse tensor with label sequences for the batch
      length : Length of label sequence (text length)
      text : Human readable string for the image
      filename : Source file path
    """
    queue_capacity = num_threads * batch_size * 2

    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = num_epochs is not None

    data_queue = _get_data_queue(base_dir, file_patterns,
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    with tf.device(input_device):  # Create bucketing batcher
        image, width, label, length, text, filename = _read_word_record(
            data_queue)
        image = _preprocess_image(image)  # move after batch?

        keep_input = _get_input_filter(width, width_threshold,
                                       length, length_threshold)
        data_tuple = [image, label, length, text, filename]
        # The image width serves as the sequence length used for bucketing.
        width, data_tuple = tf.contrib.training.bucket_by_sequence_length(
            input_length=width,
            tensors=data_tuple,
            bucket_boundaries=boundaries,
            batch_size=batch_size,
            capacity=queue_capacity,
            keep_input=keep_input,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        [image, label, length, text, filename] = data_tuple
        label = tf.deserialize_many_sparse(label, tf.int64)  # post-batching...
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename
def benchmarkVeryLarge2DFloatSparseTensor(self):
    """Benchmark tensors-map round trips against serialize/deserialize.

    Both paths are first checked to yield the same values, indices and shape,
    then each one is benchmarked for at least 2000 iterations.
    """
    np.random.seed(127)
    num_elements = 10000
    batch_size = 64

    batch_ids = np.random.randint(
        batch_size, size=num_elements, dtype=np.int64)
    element_ids = np.arange(num_elements, dtype=np.int64)
    indices_np = np.asarray(
        sorted(zip(batch_ids, element_ids)), dtype=np.int64)
    values_list = ["feature_value_for_embedding_lookup"] * num_elements
    shape_np = np.asarray([batch_size, num_elements], dtype=np.int64)

    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            indices_var = tf.Variable(indices_np)
            values_var = tf.Variable(values_list)
            shape_var = tf.Variable(shape_np)
            st = tf.SparseTensor(indices_var, values_var, shape_var)

        # Path 1: through the sparse tensors map.
        handles = add_many_sparse_to_tensors_map(st)
        via_map = take_many_sparse_from_tensors_map(
            sparse_map_op=handles.op, sparse_handles=handles)
        via_map_op = via_map.values.op

        # Path 2: through string serialization.
        via_serialization = tf.deserialize_many_sparse(
            tf.serialize_many_sparse(st), dtype=values_var.dtype)
        via_serialization_op = via_serialization.values.op

        tf.initialize_all_variables().run()

        map_result = sess.run(via_map)
        serialization_result = sess.run(via_serialization)
        for field in ("values", "indices", "shape"):
            np.testing.assert_equal(
                getattr(map_result, field),
                getattr(serialization_result, field))

        runs = (
            (via_map_op, "benchmark_very_large_2d_float_st_tensor_maps"),
            (via_serialization_op,
             "benchmark_very_large_2d_float_st_serialization"),
        )
        for op, benchmark_name in runs:
            self.run_op_benchmark(
                sess, op, min_iters=2000, name=benchmark_name)
def testDeserializeFailsInconsistentRank(self):
    """Batching serialized tensors of different rank must raise an op error."""
    with self.test_session(use_gpu=False) as sess:
        first_ph = self._SparseTensorPlaceholder()
        second_ph = self._SparseTensorPlaceholder()
        feed = {
            first_ph: self._SparseTensorValue_5x6(np.arange(6)),
            second_ph: self._SparseTensorValue_1x1x1(),
        }

        stacked = tf.pack([
            tf.serialize_sparse(first_ph),
            tf.serialize_sparse(second_ph),
        ])
        batched = tf.deserialize_many_sparse(stacked, dtype=tf.int32)

        expected_error = (
            r"Inconsistent rank across SparseTensors: rank prior to "
            r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4")
        with self.assertRaisesOpError(expected_error):
            sess.run(batched, feed)
def testDeserializeFailsWrongType(self):
    """Requesting int64 for tensors serialized as int32 must raise an op error."""
    with self.test_session(use_gpu=False) as sess:
        first_ph = self._SparseTensorPlaceholder()
        second_ph = self._SparseTensorPlaceholder()
        feed = {
            first_ph: self._SparseTensorValue_5x6(np.arange(6)),
            second_ph: self._SparseTensorValue_3x4(np.arange(6)),
        }

        stacked = tf.pack([
            tf.serialize_sparse(first_ph),
            tf.serialize_sparse(second_ph),
        ])
        # Deliberately ask for a dtype other than the one reported in the
        # expected error message below.
        batched = tf.deserialize_many_sparse(stacked, dtype=tf.int64)

        expected_error = (
            r"Requested SparseTensor of type int64 but "
            r"SparseTensor\[0\].values.dtype\(\) == int32")
        with self.assertRaisesOpError(expected_error):
            sess.run(batched, feed)
def testSerializeManyDeserializeManyRoundTrip(self):
    """A string SparseTensor survives serialize_many -> deserialize_many."""
    with self.test_session(use_gpu=False) as sess:
        # N == 4 because the fed shape is [4, 5]
        expected_indices = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
        expected_values = np.array([b"a", b"b", b"c"])
        expected_shape = np.array([4, 5], dtype=np.int64)

        sparse_ph = self._SparseTensorPlaceholder(dtype=tf.string)
        serialized = tf.serialize_many_sparse(sparse_ph)
        deserialized = tf.deserialize_many_sparse(serialized, dtype=tf.string)

        feed = {
            sparse_ph.indices: expected_indices,
            sparse_ph.values: expected_values,
            sparse_ph.shape: expected_shape,
        }
        serialized_out, roundtrip = sess.run(
            [serialized, deserialized], feed_dict=feed)

        self.assertEqual(serialized_out.shape, (4, 3))
        self.assertAllEqual(roundtrip.indices, expected_indices)
        self.assertAllEqual(roundtrip.values, expected_values)
        self.assertAllEqual(roundtrip.shape, expected_shape)
def testSerializeDeserializeMany(self):
    """Serialize two constant sparse values and deserialize them as one batch."""
    with self.test_session(use_gpu=False) as sess:
        first_val = self._SparseTensorValue_5x6(np.arange(6))
        second_val = self._SparseTensorValue_3x4(np.arange(6))

        stacked = tf.pack([
            tf.serialize_sparse(first_val),
            tf.serialize_sparse(second_val),
        ])
        batched = tf.deserialize_many_sparse(stacked, dtype=tf.int32)

        indices, values, shape = sess.run(batched)

        # The first six entries are expected to come from the first input,
        # everything after that from the second one.
        head = slice(None, 6)
        tail = slice(6, None)

        # minibatch 0
        self.assertAllEqual(indices[head, 0], [0] * 6)
        self.assertAllEqual(indices[head, 1:], first_val[0])
        # minibatch 1
        self.assertAllEqual(indices[tail, 0], [1] * 6)
        self.assertAllEqual(indices[tail, 1:], second_val[0])

        self.assertAllEqual(values[head], first_val[1])
        self.assertAllEqual(values[tail], second_val[1])
        self.assertAllEqual(shape, [2, 5, 6])
def threaded_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            batch_device=None,
                            preprocess_device=None,
                            num_epochs=None):
    """Build a multi-reader input pipeline that yields padded batches.

    One reader/preprocessing subgraph is created per thread; all of them
    consume the same data queue and are merged with `tf.train.batch_join`.

    Args:
      base_dir: Forwarded to `_get_data_queue`.
      file_patterns: Forwarded to `_get_data_queue`.
      num_threads: Number of reader subgraphs to create.
      batch_size: Number of examples per batch.
      batch_device: Device passed to `tf.device` for the batching ops.
      preprocess_device: Device passed to `tf.device` for reading and
        preprocessing.
      num_epochs: Forwarded to `_get_data_queue`. When it is not None the
        final batch is allowed to be smaller than `batch_size`.

    Returns:
      Tuple `(image, width, label, length, text, filename,
      number_of_images)`; the first six are batched tensors (`label` is a
      SparseTensor cast to int32) and the last is the second value returned
      by `_get_data_queue`.
    """
    queue_capacity = num_threads * batch_size * 2

    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = num_epochs is not None

    data_queue, number_of_images = _get_data_queue(base_dir, file_patterns,
                                                   capacity=queue_capacity,
                                                   num_epochs=num_epochs)

    # each thread has a subgraph with its own reader (sharing filename queue)
    # This could be replaced with tf.train.batch; see
    # https://stackoverflow.com/questions/35689547/how-to-process-single-training-file-in-parallel
    # list of [image, width, label, length, text, filename] elements
    data_tuples = []

    with tf.device(preprocess_device):
        for _ in range(num_threads):
            # reads images from the tfrecord
            image, width, label, length, text, filename = _read_word_record(
                data_queue)
            image = _preprocess_image(image)  # image normalization
            data_tuples.append([image, width, label, length, text, filename])

    with tf.device(batch_device):  # Create batch queue
        image, width, label, length, text, filename = tf.train.batch_join(
            data_tuples,
            batch_size=batch_size,
            capacity=queue_capacity,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        # Labels travel through the batcher in serialized form; rebuild the
        # SparseTensor after batching.
        label = tf.deserialize_many_sparse(label, tf.int64)
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename, number_of_images
def get_batch(self):
    """Build the input pipeline over all configured datasets.

    For every (name, path, portion) triple a `tf.data` pipeline is built:
    list files -> interleave -> optional shard -> optional shuffle/repeat ->
    parse -> optional preprocess -> filter -> batch.

    If `self.concat_batch` is set, each dataset gets its own iterator and a
    share of `self.batch_size` (the last dataset takes what is left, at
    least 1), and the per-dataset batches are concatenated along axis 0.
    Otherwise the datasets are chained one after another and read through a
    single iterator with the full batch size.

    Every iterator initializer is added to the `self.iterator_name`
    collection.

    Returns:
      A `Batch(image, label, length, text, filename, dataset_name)` whose
      `label` is a SparseTensor cast to int32.
    """

    def _parser_for(name):
        # Bind the dataset name per dataset instead of closing over the loop
        # variable, which would otherwise be looked up late.
        return lambda *e: self.parse_fn(*e, name)

    # Get datasets
    datasets = []
    batch_sizes = []
    dataset_specs = zip(self.dataset_names,
                        self.dataset_paths,
                        self.dataset_portions)
    for i, (ds_name, ds_path, ds_portion) in enumerate(dataset_specs):
        # Extract
        if self.concat_batch:
            if i < len(self.dataset_names) - 1:
                _batch_size = max(int(self.batch_size * ds_portion), 1)
            else:
                # The last dataset gets whatever remains of the batch.
                _batch_size = max(self.batch_size - sum(batch_sizes), 1)
            batch_sizes.append(_batch_size)
        else:
            _batch_size = self.batch_size

        _data_files = glob.glob(ds_path, recursive=True)
        _dataset = tf.data.Dataset.list_files(
            _data_files,
            shuffle=self.shuffle_and_repeat,
            seed=self.seed)
        _dataset = _dataset.interleave(self.dataset_class,
                                       cycle_length=self.num_cpus,
                                       num_parallel_calls=self.num_cpus)
        if self.worker_index is not None:
            _dataset = _dataset.shard(self.num_gpus, self.worker_index)
        if self.shuffle_and_repeat:
            _dataset = _dataset.apply(
                tf.contrib.data.shuffle_and_repeat(
                    buffer_size=_batch_size * self.buffer_size,
                    seed=self.seed))

        # Transform
        _dataset = _dataset.map(_parser_for(ds_name),
                                num_parallel_calls=self.num_cpus)
        if self.preprocess_image:
            _dataset = _dataset.map(self.preprocess_fn,
                                    num_parallel_calls=self.num_cpus)
        _dataset = _dataset.filter(self.filter_fn)
        _dataset = _dataset.batch(_batch_size)
        datasets.append(_dataset)

    # Load
    if self.concat_batch:
        batches = []
        for _dataset in datasets:
            _dataset = _dataset.apply(
                tf.contrib.data.prefetch_to_device(self.input_device, 2))
            _iterator = _dataset.make_initializable_iterator()
            tf.add_to_collection(self.iterator_name, _iterator.initializer)
            batches.append(_iterator.get_next())
        if len(batches) > 1:
            # Concatenate matching components across datasets.
            batch = [tf.concat(elements, axis=0)
                     for elements in zip(*batches)]
        else:
            batch = batches[0]
        print('DATASET BATCHES : {} = {}'.format(
            ' + '.join([str(size) for size in batch_sizes]),
            sum(batch_sizes)))
    else:
        concatted = datasets[0]
        for _dataset in datasets[1:]:
            concatted = concatted.concatenate(_dataset)
        concatted = concatted.apply(
            tf.data.experimental.prefetch_to_device(self.input_device, 2))
        iterator = concatted.make_initializable_iterator()
        tf.add_to_collection(self.iterator_name, iterator.initializer)
        batch = iterator.get_next()

    image, label, length, text, filename, dataset_name = batch
    # Labels arrive serialized; rebuild the SparseTensor and cast to int32.
    label = tf.deserialize_many_sparse(label, tf.int64)
    label = tf.cast(label, tf.int32)
    batch = Batch(image, label, length, text, filename, dataset_name)
    return batch