def __init__(self, dataset_source, dataset_target, shuffle=True,
             num_epochs=None, common_queue_capacity=4096,
             common_queue_min=1024, seed=None):
    if seed is None:
        seed = np.random.randint(10e8)

    _, data_source = parallel_read(
        dataset_source.data_sources,
        reader_class=dataset_source.reader,
        num_epochs=num_epochs,
        num_readers=1,
        shuffle=False,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed)

    data_target = ""
    if dataset_target is not None:
        _, data_target = parallel_read(
            dataset_target.data_sources,
            reader_class=dataset_target.reader,
            num_epochs=num_epochs,
            num_readers=1,
            shuffle=False,
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            seed=seed)

    # Optionally shuffle the data
    if shuffle:
        shuffle_queue = tf.RandomShuffleQueue(
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            dtypes=[tf.string, tf.string],
            seed=seed)
        enqueue_ops = [shuffle_queue.enqueue([data_source, data_target])]
        tf.train.add_queue_runner(
            tf.train.QueueRunner(shuffle_queue, enqueue_ops))
        data_source, data_target = shuffle_queue.dequeue()

    # Decode source items
    items = dataset_source.decoder.list_items()
    tensors = dataset_source.decoder.decode(data_source, items)

    if dataset_target is not None:
        # Decode target items
        items2 = dataset_target.decoder.list_items()
        tensors2 = dataset_target.decoder.decode(data_target, items2)

        # Merge items and results
        items = items + items2
        tensors = tensors + tensors2

    super(ParallelDatasetProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset_source.num_samples)
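# Usage sketch for the provider above (not from the original source): the
# names `source_dataset`, `target_dataset`, 'source_tokens' and
# 'target_tokens' are assumptions for illustration; `get()` comes from the
# slim DataProvider base class.
def _parallel_provider_example(source_dataset, target_dataset):
    provider = ParallelDatasetProvider(source_dataset, target_dataset)
    # Both records were enqueued together, so each source/target pair stays
    # aligned even after passing through the joint shuffle queue.
    source, target = provider.get(['source_tokens', 'target_tokens'])
    return source, target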
def __init__(self, dataset, num_readers=1, shuffle=True, num_epochs=None,
             common_queue_capacity=256, common_queue_min=128):
    """Creates a DatasetDataProvider.

    Args:
      dataset: An instance of the Dataset class.
      num_readers: The number of parallel readers to use.
      shuffle: Whether to shuffle the data sources and common queue when
        reading.
      num_epochs: The number of times each data source is read. If left as
        None, the data will be cycled through indefinitely.
      common_queue_capacity: The capacity of the common queue.
      common_queue_min: The minimum number of elements in the common queue
        after a dequeue.
    """
    _, data = parallel_reader.parallel_read(
        dataset.data_sources,
        reader_class=dataset.reader,
        num_epochs=num_epochs,
        num_readers=num_readers,
        shuffle=shuffle,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min)

    items = dataset.decoder.list_items()
    tensors = dataset.decoder.decode(data, items)

    super(DatasetDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset.num_samples)
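# Minimal end-to-end sketch (assumes `dataset` is a slim Dataset whose
# decoder exposes 'image' and 'label' items; the item names are
# illustrative, not from the original source):
def _provider_batch_example(dataset):
    provider = DatasetDataProvider(dataset, num_readers=4, shuffle=True)
    image, label = provider.get(['image', 'label'])
    # tf.train.batch assembles decoded examples into mini-batches using the
    # same queue-runner machinery that parallel_read relies on.
    return tf.train.batch([image, label], batch_size=32)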
def __init__(self, dataset, num_readers=1, reader_kwargs=None, shuffle=True,
             num_epochs=None, common_queue_capacity=256,
             common_queue_min=128, record_key='__record_key__', seed=None,
             scope=None):
    key, data = parallel_read(
        dataset.data_sources,
        reader_class=dataset.reader,
        num_epochs=num_epochs,
        num_readers=num_readers,
        reader_kwargs=reader_kwargs,
        shuffle=shuffle,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed,
        scope=scope)

    items = dataset.decoder.list_items()
    tensors = dataset.decoder.decode(data, items)

    if record_key in items:
        raise ValueError('The item name used for `record_key` cannot also '
                         'be used for a dataset item: %s' % record_key)

    # items.append(record_key)
    # tensors.append(key)

    super(DatasetDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset.num_samples)
def __init__(self, dataset, num_readers=1, reader_kwargs=None, shuffle=True,
             num_epochs=None, common_queue_capacity=256,
             common_queue_min=128, record_key='record_key', seed=None,
             scope=None):
    """Creates a DatasetDataProvider.

    Note: if `num_epochs` is not `None`, a local counter `epochs` will be
    created by the relevant function. Use `local_variables_initializer()` to
    initialize local variables.

    Args:
      dataset: An instance of the Dataset class.
      num_readers: The number of parallel readers to use.
      reader_kwargs: An optional dict of kwargs for the reader.
      shuffle: Whether to shuffle the data sources and common queue when
        reading.
      num_epochs: The number of times each data source is read. If left as
        None, the data will be cycled through indefinitely.
      common_queue_capacity: The capacity of the common queue.
      common_queue_min: The minimum number of elements in the common queue
        after a dequeue.
      record_key: The item name to use for the dataset record keys in the
        provided tensors.
      seed: The seed to use if shuffling.
      scope: Optional name scope for the ops.

    Raises:
      ValueError: If `record_key` matches one of the items in the dataset.
    """
    key, data = parallel_reader.parallel_read(
        dataset.data_sources,
        reader_class=dataset.reader,
        num_epochs=num_epochs,
        num_readers=num_readers,
        reader_kwargs=reader_kwargs,
        shuffle=shuffle,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed,
        scope=scope)

    items = dataset.decoder.list_items()
    tensors = dataset.decoder.decode(data, items)

    if record_key in items:
        raise ValueError('The item name used for `record_key` cannot also '
                         'be used for a dataset item: %s' % record_key)
    items.append(record_key)
    tensors.append(key)

    super(DatasetDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset.num_samples)
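# Sketch of fetching the appended record key next to a decoded item (the
# 'image' item name is an assumption; 'record_key' is the default above):
def _record_key_example(dataset):
    provider = DatasetDataProvider(dataset, shuffle=False)
    image, key = provider.get(['image', 'record_key'])
    return image, key  # `key` identifies the source record of each example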
def __init__(self, dataset, num_readers=1, shuffle=True, num_epochs=None,
             common_queue_capacity=256, common_queue_min=128,
             bgr_flips=None):
    """Creates a DatasetDataProvider.

    Args:
      dataset: An instance of the Dataset class.
      num_readers: The number of parallel readers to use.
      shuffle: Whether to shuffle the data sources and common queue when
        reading.
      num_epochs: The number of times each data source is read. If left as
        None, the data will be cycled through indefinitely.
      common_queue_capacity: The capacity of the common queue.
      common_queue_min: The minimum number of elements in the common queue
        after a dequeue.
      bgr_flips: Optional comma-separated string of 'True'/'False' flags,
        one per input stream, marking which streams to channel-reverse.
    """
    self.num_channels_stream = []
    if bgr_flips is not None:
        bgr_flips = bgr_flips.split(',')

    img_str, label = parallel_reader.parallel_read(
        dataset.data_sources,
        reader_class=dataset.reader,
        num_epochs=num_epochs,
        num_readers=num_readers,
        shuffle=shuffle,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min)

    items = dataset.decoder.list_items()
    imgs = dataset.decoder.decode(img_str, items)

    num_streams = len(imgs[0])
    final_imgs = []
    for sid in range(num_streams):
        self.num_channels_stream.append(
            imgs[0][sid].get_shape().as_list()[-1])
        img_stream = [imgs[bid][sid] for bid in range(len(imgs))]
        img = tf.pack(img_stream)
        # Guard against bgr_flips=None before indexing into it.
        if bgr_flips is not None and bgr_flips[sid] == 'True':
            logging.info('BGR flipping stream %d' % sid)
            img = tf.reverse(img, [False, False, False, True])
        final_imgs.append(img)
    img = tf.concat(3, final_imgs)

    tensors = [img, label]
    super(DatasetDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset.num_samples)
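# Illustration of the bgr_flips argument above, left as a comment-level
# sketch since the surrounding Dataset plumbing is project-specific:
#
#   provider = DatasetDataProvider(dataset, bgr_flips='True,False')
#
# Stream 0 would have its channel axis reversed (BGR <-> RGB) before all
# streams are concatenated along the channel dimension; stream 1 would be
# left untouched.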
def __init__(self, dataset, num_readers=1, reader_kwargs=None, shuffle=True,
             num_epochs=None, common_queue_capacity=256,
             common_queue_min=128, record_key='record_key', seed=None,
             scope=None):
    """Creates a DatasetDataProvider.

    Args:
      dataset: An instance of the Dataset class.
      num_readers: The number of parallel readers to use.
      reader_kwargs: An optional dict of kwargs for the reader.
      shuffle: Whether to shuffle the data sources and common queue when
        reading.
      num_epochs: The number of times each data source is read. If left as
        None, the data will be cycled through indefinitely.
      common_queue_capacity: The capacity of the common queue.
      common_queue_min: The minimum number of elements in the common queue
        after a dequeue.
      record_key: Accepted for API compatibility; this provider does not add
        the record keys to the provided tensors.
      seed: The seed to use if shuffling.
      scope: Optional name scope for the ops.
    """
    key, data = parallel_reader.parallel_read(
        dataset.data_sources,
        reader_class=dataset.reader,
        num_epochs=num_epochs,
        num_readers=num_readers,
        reader_kwargs=reader_kwargs,
        shuffle=shuffle,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed,
        scope=scope)

    items = dataset.decoder.list_items()
    tensors = dataset.decoder.decode(data, items)

    super(FeaturedDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset.num_samples)
import numpy as np
import tensorflow as tf

from tensorflow.contrib.slim.python.slim.data import parallel_reader
from tensorflow.python.ops import parsing_ops


def main():
    reader = tf.TFRecordReader
    data_sources = ["traineh.tfrecord"]
    _, data = parallel_reader.parallel_read(
        data_sources,
        reader_class=reader,
        num_epochs=1,
        num_readers=1,
        shuffle=False,
        capacity=256,
        min_after_dequeue=1)

    context_features, sequence_features = (
        parsing_ops.parse_single_sequence_example(
            data,
            context_features={
                'video_id': tf.VarLenFeature(tf.string),
                'labels': tf.VarLenFeature(tf.int64),
            },
            sequence_features={
                'inc3': tf.FixedLenSequenceFeature(1, tf.string),
            },
            example_name=""))

    with tf.Session() as sess:
        # local_variables_initializer covers the epoch counter created by
        # parallel_read when num_epochs is set.
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            while not coord.should_stop():
                # Fetch context and sequence features in a single run so both
                # come from the same dequeued record.
                meta, seq = sess.run([context_features, sequence_features])
                vid = meta['video_id'].values[0]
                labels = meta['labels'].values

                inc3_fea = seq['inc3']
                frame_feas = []
                for r in inc3_fea:
                    v = np.fromstring(r[0], dtype=np.uint8)
                    frame_feas.append(v[None, :])
                frame_feas = np.vstack(frame_feas)

                print(vid, labels)
                print(frame_feas.shape)
                # Do something here
        except tf.errors.OutOfRangeError:
            print('Finished extracting.')
        finally:
            coord.request_stop()
            coord.join(threads)
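# The 'inc3' frame features decoded above arrive as 8-bit quantized bytes.
# A dequantization helper along these lines recovers floats; the [-2, 2]
# range is an assumption matching the _dequantize(feat, 2, -2) calls used
# elsewhere on this page and the YouTube-8M convention.
def _dequantize(feat, max_quantized_value=2, min_quantized_value=-2):
    quantized_range = max_quantized_value - min_quantized_value
    scalar = quantized_range / 255.0
    bias = (quantized_range / 512.0) + min_quantized_value
    return feat * scalar + bias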
def testTFRecordReader(self):
    with self.test_session():
        self._tfrecord_paths = test_utils.create_tfrecord_files(
            self.get_temp_dir(), num_files=3)

    key, value = parallel_reader.parallel_read(
        self._tfrecord_paths,
        reader_class=io_ops.TFRecordReader,
        num_readers=3)

    sv = supervisor.Supervisor(logdir=self.get_temp_dir())
    with sv.prepare_or_wait_for_session() as sess:
        sv.start_queue_runners(sess)

        flowers = 0
        num_reads = 100
        for _ in range(num_reads):
            current_key, _ = sess.run([key, value])
            if 'flowers' in str(current_key):
                flowers += 1
        self.assertGreater(flowers, 0)
        self.assertEquals(flowers, num_reads)
def testTFRecordReader(self):
    with self.test_session():
        self._tfrecord_paths = test_utils.create_tfrecord_files(
            self.get_temp_dir(), num_files=3)

    key, value = parallel_reader.parallel_read(
        self._tfrecord_paths,
        reader_class=io_ops.TFRecordReader,
        num_readers=3)

    sv = supervisor.Supervisor(logdir=self.get_temp_dir())
    with sv.managed_session() as sess:
        flowers = 0
        num_reads = 100
        for _ in range(num_reads):
            current_key, _ = sess.run([key, value])
            if 'flowers' in str(current_key):
                flowers += 1
        self.assertGreater(flowers, 0)
        self.assertEquals(flowers, num_reads)
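# Note (added): managed_session() starts the Supervisor's standard services,
# including the queue runners, which is why this variant needs no explicit
# sv.start_queue_runners(sess) call like the previous test.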
def _get_tensor_and_example(
    self,
    mode: tf.estimator.ModeKeys,
    shuffle: bool = False,
    num_epochs: Optional[int] = None
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Read and decode the serialized tf.Example into tensors.

    Args:
      mode: One of tf.estimator.ModeKeys {TRAIN, EVAL, INFER}.
      shuffle: Whether to shuffle the input.
      num_epochs: Number of times a tf.Example will be visited in generating
        the input. If set to None, each Example will be cycled indefinitely.

    Returns:
      A tuple of:
        A dictionary mapping tf.Example feature names to tensors.
        serialized_example: A string tensor holding the serialized example.
    """
    dataset = self._data[mode]
    if mode == tf.estimator.ModeKeys.INFER:
        serialized_example = tf.placeholder(
            dtype=tf.string, shape=[], name='input_serialized_examples')
    else:
        _, serialized_example = parallel_reader.parallel_read(
            dataset.data_sources,
            reader_class=dataset.reader,
            num_epochs=num_epochs,
            num_readers=self._num_readers,
            shuffle=shuffle,
            capacity=self._queue_capacity,
            min_after_dequeue=self._queue_min)
    items = dataset.decoder.list_items()
    tensors = dataset.decoder.decode(serialized_example, items)
    return dict(zip(items, tensors)), serialized_example
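# Hypothetical glue (not from the original source) showing how the returned
# tensors could back an Estimator input_fn; `provider` is an instance of the
# class above, and the 'targets' item name is assumed:
def _make_input_fn(provider, mode):
    def input_fn():
        features, _ = provider._get_tensor_and_example(
            mode, shuffle=(mode == tf.estimator.ModeKeys.TRAIN))
        labels = features.pop('targets')  # item name assumed
        return features, labels
    return input_fn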
def __init__(self, dataset1, dataset2, schemas=None, shuffle=True,
             num_epochs=None, common_queue_capacity=4096,
             common_queue_min=1024, seed=None):
    if seed is None:
        seed = np.random.randint(10e8)

    _, data_source = parallel_reader.parallel_read(
        dataset1.data_sources,
        reader_class=dataset1.reader,
        num_epochs=num_epochs,
        num_readers=1,
        shuffle=False,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed)

    data_target = ""
    if dataset2 is not None:
        _, data_target = parallel_reader.parallel_read(
            dataset2.data_sources,
            reader_class=dataset2.reader,
            num_epochs=num_epochs,
            num_readers=1,
            shuffle=False,
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            seed=seed)

    data_schemas = ""
    # Only touch `schemas` after the None check.
    if schemas is not None:
        _, data_schemas = parallel_reader.parallel_read(
            schemas.data_sources,
            reader_class=schemas.reader,
            num_epochs=num_epochs,
            num_readers=1,
            shuffle=False,
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            seed=seed)

    # Optionally shuffle the data
    if shuffle:
        shuffle_queue = tf.RandomShuffleQueue(
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            dtypes=[tf.string, tf.string, tf.string],
            seed=seed)
        enqueue_ops = [
            shuffle_queue.enqueue([data_source, data_target, data_schemas])
        ]
        tf.train.add_queue_runner(
            tf.train.QueueRunner(shuffle_queue, enqueue_ops))
        data_source, data_target, data_schemas = shuffle_queue.dequeue()

    # Decode source items
    items = dataset1.decoder.list_items()
    tensors = dataset1.decoder.decode(data_source, items)

    if dataset2 is not None:
        # Decode target items
        items2 = dataset2.decoder.list_items()
        tensors2 = dataset2.decoder.decode(data_target, items2)
        # Merge items and results
        items = items + items2
        tensors = tensors + tensors2

    if schemas is not None:
        # Decode schema items and merge
        items_schema = schemas.decoder.list_items()
        tensors_schema = schemas.decoder.decode(data_schemas, items_schema)
        items = items + items_schema
        tensors = tensors + tensors_schema

    super(TripleDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset1.num_samples)
def _convert_Youtube8M_tfrecord_to_numpy(tfrecord_filename):
    """Parses each data component of a YouTube-8M tfrecord file according to
    the example prototxt.

    Input:
      <string> tfrecord_filename
    Output:
      <lists> total_rgb_feat, total_audio_feat, total_label
    """
    reader = tf.TFRecordReader
    _, data = parallel_reader.parallel_read(
        data_sources=tfrecord_filename,
        reader_class=reader,
        num_epochs=1,
        num_readers=1,
        shuffle=False,
        capacity=256,
        min_after_dequeue=1)

    # Build up the file queue and example queue for the tfrecord file.
    context_feat, seq_feat = parsing_ops.parse_single_sequence_example(
        data,
        context_features={
            'video_id': tf.VarLenFeature(tf.string),
            'labels': tf.VarLenFeature(tf.int64)
        },
        sequence_features={
            'rgb': tf.FixedLenSequenceFeature([], tf.string),
            'audio': tf.FixedLenSequenceFeature([], tf.string)
        },
        example_name=" ")

    # Standard framework for example parsing.
    with tf.Session() as sess:
        # Initialize variables in the tensorflow session.
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Start up a coordinator to manage the QueueRunner threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            total_rgb_feat = []
            total_audio_feat = []
            total_label = []
            while not coord.should_stop():
                # Fetch context and sequence features from the same record.
                video_context, video_features = sess.run(
                    (context_feat, seq_feat))

                # Extract 'video_id' and 'labels' from the context features.
                video_id = video_context['video_id'].values[0]
                labels = video_context['labels'].values

                # One-hot vector for the labels. (Note: building ops inside
                # the loop grows the graph; acceptable for a one-off
                # conversion script.)
                labels = sess.run(
                    tf.sparse_to_dense(labels, (4716,), 1,
                                       validate_indices=False))

                # Extract 'rgb' and 'audio' features from the video features.
                hex_rgb_feat = video_features['rgb']
                hex_audio_feat = video_features['audio']

                rgb_feat = []
                audio_feat = []
                # Convert the raw bytes to numpy.uint8 arrays.
                for ii in range(len(hex_rgb_feat)):
                    single_rgb_feat = np.fromstring(hex_rgb_feat[ii],
                                                    dtype=np.uint8)
                    single_audio_feat = np.fromstring(hex_audio_feat[ii],
                                                      dtype=np.uint8)
                    rgb_feat.append(single_rgb_feat)
                    audio_feat.append(single_audio_feat)

                # Reshape, e.g. [[1,2], [3,4]] -> [1,2; 3,4].
                rgb_feat = np.vstack(rgb_feat)
                audio_feat = np.vstack(audio_feat)

                # Dequantize the rgb and audio features.
                rgb_feat = _dequantize(rgb_feat, 2, -2)
                audio_feat = _dequantize(audio_feat, 2, -2)

                # Pad or crop to a fixed nframe=300.
                rgb_feat = _frame_padding(input_feat=rgb_feat,
                                          padding_value=0,
                                          target_nframe=300)
                audio_feat = _frame_padding(input_feat=audio_feat,
                                            padding_value=0,
                                            target_nframe=300)

                total_rgb_feat.append(rgb_feat)
                total_audio_feat.append(audio_feat)
                total_label.append(labels)
        except tf.errors.OutOfRangeError:
            print('!All video features have been exported...')
        finally:
            coord.request_stop()
            coord.join(threads=threads)

        return total_rgb_feat, total_audio_feat, total_label
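# Example driver for the converter above (file name and output path are
# placeholders, not from the original source):
#
#   rgb, audio, labels = _convert_Youtube8M_tfrecord_to_numpy(
#       ['train-0000.tfrecord'])
#   np.savez('train-0000.npz', rgb=np.array(rgb), audio=np.array(audio),
#            labels=np.array(labels))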
def __init__(self, dataset1, dataset2, dataset3, shuffle=True,
             num_epochs=None, common_queue_capacity=4096,
             common_queue_min=1024, seed=None):
    if seed is None:
        seed = np.random.randint(10e8)

    _, data_source_query = parallel_reader.parallel_read(
        dataset1.data_sources,
        reader_class=dataset1.reader,
        num_epochs=num_epochs,
        num_readers=1,
        shuffle=False,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed)

    _, data_source_candidate = parallel_reader.parallel_read(
        dataset2.data_sources,
        reader_class=dataset2.reader,
        num_epochs=num_epochs,
        num_readers=1,
        shuffle=False,
        capacity=common_queue_capacity,
        min_after_dequeue=common_queue_min,
        seed=seed)

    data_target = ""
    if dataset3 is not None:
        _, data_target = parallel_reader.parallel_read(
            dataset3.data_sources,
            reader_class=dataset3.reader,
            num_epochs=num_epochs,
            num_readers=1,
            shuffle=False,
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            seed=seed)

    # Optionally shuffle the data
    if shuffle:
        shuffle_queue = tf.RandomShuffleQueue(
            capacity=common_queue_capacity,
            min_after_dequeue=common_queue_min,
            dtypes=[tf.string, tf.string, tf.string],
            seed=seed)
        enqueue_ops = [
            shuffle_queue.enqueue(
                [data_source_query, data_source_candidate, data_target])
        ]
        tf.train.add_queue_runner(
            tf.train.QueueRunner(shuffle_queue, enqueue_ops))
        (data_source_query, data_source_candidate,
         data_target) = shuffle_queue.dequeue()

    # Decode source query items
    items = dataset1.decoder.list_items()
    tensors = dataset1.decoder.decode(data_source_query, items)

    # Decode source candidate items and merge
    items2 = dataset2.decoder.list_items()
    tensors2 = dataset2.decoder.decode(data_source_candidate, items2)
    items = items + items2
    tensors = tensors + tensors2

    if dataset3 is not None:
        # Decode target items and merge
        items3 = dataset3.decoder.list_items()
        tensors3 = dataset3.decoder.decode(data_target, items3)
        items = items + items3
        tensors = tensors + tensors3

    super(TripleDataProvider, self).__init__(
        items_to_tensors=dict(zip(items, tensors)),
        num_samples=dataset1.num_samples)
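# Usage sketch (dataset objects and item names are assumptions): the provider
# exposes query, candidate, and target items side by side, shuffled jointly
# so each triple stays aligned.
def _triple_provider_example(query_ds, candidate_ds, target_ds):
    provider = TripleDataProvider(query_ds, candidate_ds, target_ds)
    query, candidate, target = provider.get(
        ['query_tokens', 'candidate_tokens', 'target_tokens'])
    return query, candidate, target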