def _benchmarkMapAndFilter(self, chain_length, optimize_dataset):
  with ops.Graph().as_default():
    dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
    for _ in range(chain_length):
      dataset = dataset.map(lambda x: x + 5).filter(
          lambda x: math_ops.greater_equal(x - 5, 0))
    if optimize_dataset:
      dataset = dataset.apply(
          optimization.optimize(["map_and_filter_fusion"]))

    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    with session.Session() as sess:
      # Run a few iterations first to warm up the session.
      for _ in range(10):
        sess.run(next_element.op)
      # Time 100 runs of 100 iterations each and report the median
      # per-iteration wall time.
      deltas = []
      for _ in range(100):
        start = time.time()
        for _ in range(100):
          sess.run(next_element.op)
        end = time.time()
        deltas.append(end - start)
      median_wall_time = np.median(deltas) / 100
      opt_mark = "opt" if optimize_dataset else "no-opt"
      print("Map and filter dataset {} chain length: {} Median wall time: {}".
            format(opt_mark, chain_length, median_wall_time))
      self.report_benchmark(
          iters=1000,
          wall_time=median_wall_time,
          name="benchmark_map_and_filter_dataset_chain_latency_{}_{}".format(
              opt_mark, chain_length))
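# A minimal sketch of what the "map_and_filter_fusion" rewrite computes for
# one map+filter link in the chain above: the map function runs once and the
# filter predicate is evaluated on its output in the same stage, rather than
# in a separate pass over the data. `fused_link` is a hypothetical helper for
# illustration, not a tf.data API; it reuses the `math_ops` import from the
# benchmark above.
def fused_link(dataset):
  # Emit the mapped value together with the predicate result.
  dataset = dataset.map(
      lambda x: (x + 5, math_ops.greater_equal((x + 5) - 5, 0)))
  # Keep only elements whose predicate component is true, then drop it.
  dataset = dataset.filter(lambda y, keep: keep)
  return dataset.map(lambda y, keep: y)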
def build_dataset():
  dataset = dataset_ops.Dataset.range(100)
  dataset = dataset.map(lambda x: x)
  dataset = dataset.batch(5)
  # map_vectorization adds a new vectorized function to the function
  # library.
  dataset = dataset.apply(
      optimization.optimize(["map_vectorization"]))
  return dataset
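# For reference, the "map_vectorization" rewrite hoists the batch ahead of
# the map and substitutes a vectorized map function that runs once per batch
# rather than once per element. A hand-written sketch of the transformed
# pipeline, assuming the optimizer's output is equivalent for this
# elementwise function (the identity map is trivially its own vectorization):
def build_vectorized_dataset():
  dataset = dataset_ops.Dataset.range(100)
  dataset = dataset.batch(5)
  dataset = dataset.map(lambda x: x)
  return dataset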
def build_dataset(num_elements, batch_size):
  return dataset_ops.Dataset.range(num_elements).map(
      lambda x: x * x).batch(batch_size).apply(
          optimization.optimize(["map_and_batch_fusion"]))
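# The "map_and_batch_fusion" rewrite replaces the separate map and batch
# stages above with the single fused op that tf.data also exposes directly.
# A sketch of the hand-fused equivalent, assuming `import tensorflow as tf`
# and a TF version that provides tf.data.experimental.map_and_batch:
def build_fused_dataset(num_elements, batch_size):
  return dataset_ops.Dataset.range(num_elements).apply(
      tf.data.experimental.map_and_batch(lambda x: x * x, batch_size))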
def read_tf_records(batch_size,
                    tf_records,
                    num_repeats=1,
                    shuffle_records=True,
                    shuffle_examples=True,
                    shuffle_buffer_size=None,
                    interleave=True,
                    filter_amount=1.0,
                    dist_train=False,
                    seed=0):
  """Reads tf.Examples from TFRecord files into a batched dataset.

  Args:
    batch_size: batch size to return
    tf_records: a list of tf_record filenames
    num_repeats: how many times the data should be read (default: 1)
    shuffle_records: whether to shuffle the order of files read
    shuffle_examples: whether to shuffle the tf.Examples
    shuffle_buffer_size: how big of a buffer to fill before shuffling
    interleave: whether to interleave examples from multiple tf_records
    filter_amount: what fraction of records to keep
    dist_train: whether to shard the files for distributed training
    seed: random seed used for shuffling and filtering

  Returns:
    a tf dataset of batched tensors
  """
  if shuffle_examples and not shuffle_buffer_size:
    raise ValueError("Must set shuffle buffer size if shuffling examples")

  tf_records = list(tf_records)
  random.seed(seed)
  #if shuffle_records:
  #  random.shuffle(tf_records)

  record_list = tf.data.Dataset.from_tensor_slices(tf_records)

  if dist_train:
    record_list = record_list.shard(hvd.size(), hvd.rank())

  # compression_type here must agree with write_tf_examples
  map_func = functools.partial(
      tf.data.TFRecordDataset,
      buffer_size=8 * 1024 * 1024,
      compression_type='ZLIB')

  if interleave:
    # cycle_length = how many tfrecord files are read in parallel
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.apply(
        tf.data.experimental.parallel_interleave(
            map_func, cycle_length=1000, sloppy=True))
  else:
    dataset = record_list.flat_map(map_func)

  if filter_amount < 1.0:
    dataset = dataset.filter(
        lambda _: tf.random.uniform([], seed=seed) < filter_amount)
    dataset = dataset.apply(
        optimization.optimize(["filter_with_random_uniform_fusion"]))

  #if dist_train:
  #  dataset = dataset.shard(hvd.size(), hvd.rank())
  dataset = dataset.repeat(num_repeats)

  if shuffle_examples:
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

  dataset = dataset.batch(batch_size, drop_remainder=True)
  return dataset
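# Example usage of read_tf_records as a TF 1.x input pipeline (a sketch: the
# glob pattern and sizes are hypothetical, and it assumes the records were
# written with ZLIB compression, matching write_tf_examples):
def example_input_fn():
  dataset = read_tf_records(
      batch_size=256,
      tf_records=tf.gfile.Glob('data/*.tfrecord.zz'),
      shuffle_buffer_size=10000,
      filter_amount=0.5)
  return dataset.make_one_shot_iterator().get_next()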