def get_exec_time_loss(loss_fn, logits_shape, num_runs=1):
    """Measure the traced execution time of a loss function.

    The loss is wrapped in a ``tf.function`` and evaluated ``num_runs + 1``
    times, each time in a fresh ``tf1.Session`` with full tracing enabled.
    Every run's Chrome trace is reduced to one number by
    ``convert_string_to_time``.

    Args:
        loss_fn: Callable invoked as ``loss_fn(logits, labels)``.
        logits_shape: Shape used for both the random logits and the random
            labels.
        num_runs: Number of runs performed in addition to the first one.

    Returns:
        A ``(mean, std)`` tuple of the measured times. The first run is
        excluded from the statistics when including it would increase the
        standard deviation. When only one measurement exists it is reported
        as-is instead of producing NaN from an empty sample.
    """
    run_opts = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)

    @tf.function
    def run_loss(logits, labels):
        return loss_fn(logits, labels)

    spec = tf.TensorSpec(logits_shape)
    conc = run_loss.get_concrete_function(spec, spec)

    times = []
    for _ in range(num_runs + 1):
        with tf1.Session() as sess:
            run_meta = tf1.RunMetadata()
            sess.run(tf1.global_variables_initializer())
            logits = tf.random.normal(logits_shape)
            labels = tf.random.normal(logits_shape)
            out = conc(logits, labels)
            sess.run(out, options=run_opts, run_metadata=run_meta)
            trace = timeline.Timeline(run_meta.step_stats)
            lctf = trace.generate_chrome_trace_format()
        # Named `elapsed` (not `time`) so the stdlib module name is not shadowed.
        elapsed = convert_string_to_time(lctf)
        times.append(elapsed)

    steady = times[1:]
    # With fewer than two measurements there is nothing to filter; np.std of
    # an empty list would be NaN and poison both returned values.
    if len(times) < 2 or np.std(times) <= np.std(steady):
        return np.average(times), np.std(times)
    # Filter first run
    return np.average(steady), np.std(steady)
def main():
    """Run one fp16 Conv2D on GPU 0 and print its cost-model compute cost.

    The graph is executed once with full tracing and ``build_cost_model``
    enabled; the ``compute_cost`` of the cost-graph node named ``Conv2D`` is
    printed multiplied by 1000 and labelled as nanoseconds.
    """
    tf.disable_eager_execution()

    with tf.device('/gpu:0'):
        images = tf.random.uniform(shape=[32, 56, 56, 64], dtype=tf.half)
        kernel = tf.random.uniform(shape=[3, 3, 64, 64], dtype=tf.half)
        conv = tf.nn.conv2d(
            input=images,
            filters=kernel,
            strides=[2, 2],
            padding='SAME',
            data_format='NHWC',
            name='Conv2D',
        )

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    session_config = tf.ConfigProto(
        graph_options=tf.GraphOptions(build_cost_model=1))

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run([conv], options=run_options, run_metadata=run_metadata)

        for node in run_metadata.cost_graph.node:
            if node.name != 'Conv2D':
                continue
            print(node.name, ':', node.compute_cost * 1000, 'ns.')
def _step(self, handles, merged=None, writer=None, summary=False, log_trace=False):
    """Run a single optimization step and optionally log summaries/traces.

    Args:
        handles: Values fed to ``self.model.handles`` (paired in order).
        merged: Merged summary op; required when ``summary`` is true.
        writer: Summary writer; required when ``summary`` or ``log_trace``
            is true.
        summary: When true, also fetch ``merged`` and write it to ``writer``
            at step ``self.curr_step + 1``.
        log_trace: When true, run with full tracing and dump a Chrome trace
            to ``<writer logdir>/tracing/step_<n>.json``.

    Returns:
        ``(loss, Y_true, Y_pred)`` as fetched from the session.
    """
    model = self.model

    # Optimization step
    feed_dict = {
        model.is_train: True,
        model.monte_carlo: self.monte_carlo,
        model.augmentation: self.augment_train,
        model.total_steps: self.total_steps,
        self.learning_rate_multiplier: self.curr_multiplier,
    }
    feed_dict.update(zip(model.handles, handles))
    feed_dict.update(model.custom_feed_dict)

    if log_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
    else:
        run_options = None
        run_metadata = None

    if summary:
        # Write summaries on TensorBoard
        assert merged is not None, 'No merged summary exists.'
        assert writer is not None, 'No summary writer exists.'

    fetches = [
        self.optimization_operation, model.loss, model.Y_all, model.pred
    ]
    if summary:
        fetches.append(merged)

    fetched = model.session.run(fetches,
                                feed_dict=feed_dict,
                                options=run_options,
                                run_metadata=run_metadata)
    loss, Y_true, Y_pred = fetched[1:4]

    if summary:
        writer.add_summary(fetched[4], self.curr_step + 1)
        writer.flush()

    if log_trace:
        assert writer is not None, 'TensorFlow FileWriter must be provided for logging.'
        tracing_dir = os.path.join(writer.get_logdir(), 'tracing')
        if not os.path.exists(tracing_dir):
            os.makedirs(tracing_dir)
        step_timeline = timeline.Timeline(run_metadata.step_stats)
        chrome_trace = step_timeline.generate_chrome_trace_format(
            show_memory=False)
        trace_path = os.path.join(
            tracing_dir, 'step_{}.json'.format(self.curr_step + 1))
        with open(trace_path, 'w') as f:
            f.write(chrome_trace)

    return loss, Y_true, Y_pred
def benchmark(self, image_arrays, trace_filename=None):
    """Time the prediction signature and optionally dump a Chrome trace.

    One untimed run is followed by ten timed runs; the mean time per run and
    its reciprocal (printed as FPS) are reported.

    Args:
        image_arrays: Value fed to the ``image_arrays`` signature entry.
        trace_filename: If truthy, one extra fully traced run is executed and
            its timeline is written to this path.
    """
    if not self.sess:
        self.build()

    num_timed_runs = 10
    prediction = self.signitures['prediction']
    feed = {self.signitures['image_arrays']: image_arrays}

    # init session
    self.sess.run(prediction, feed_dict=feed)

    start = perf_counter()
    for _ in range(num_timed_runs):
        self.sess.run(prediction, feed_dict=feed)
    end = perf_counter()

    inference_time = (end - start) / 10
    print('Inference time: ', inference_time)
    print('FPS: ', 1 / inference_time)

    if not trace_filename:
        return

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    self.sess.run(prediction,
                  feed_dict=feed,
                  options=run_options,
                  run_metadata=run_metadata)

    from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
    with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        trace_file.write(trace.generate_chrome_trace_format(show_memory=True))
def main():
    """Build the Transformer graph for one batch and train it.

    Command-line arguments come from ``trainer.Trainer().args``. Training runs
    with soft placement disabled and with tensor allocations reported on OOM.
    """
    model_trainer = trainer.Trainer()
    args = model_trainer.args
    params = Params(args.batch_size, args.seq_len, args.model_size)

    # Initialize dataset
    dataset = TextDataLoader(
        args.batch_size,
        args.src_vocab,
        args.tgt_vocab,
        args.src_text,
        args.tgt_text,
        params.max_seq_len,
        args.src_vocab_size,
        args.tgt_vocab_size,
        args.sentences_size,
    )
    enc_inputs, dec_inputs, _unused_a, _unused_b = dataset.next_batch()

    # Model
    graph, mesh_to_impl, mtf_loss = Transformer(
        enc_inputs,
        dec_inputs,
        params,
        dataset.src_vocab_size,
        dataset.tgt_vocab_size,
        args.strategy,
        model_trainer.num_nodes,
        model_trainer.num_gpus,
    )

    # Train
    model_trainer.train_model(
        graph,
        mesh_to_impl,
        mtf_loss,
        dataset,
        config=tf.ConfigProto(allow_soft_placement=False),
        run_options=tf.RunOptions(report_tensor_allocations_upon_oom=True),
    )
def tf1_benchmark():
    """Run TF1 inference and benchmark.

    Builds the EffNetV2 model selected by ``FLAGS``, feeds it an all-ones
    batch, performs 5 warmup runs and 10 timed runs, and prints the mean
    per-batch time and the resulting FPS. When ``FLAGS.trace_file`` is set,
    one additional fully traced run is written there as a Chrome trace.
    """
    # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
    from tensorflow.python.client import timeline

    warmup_runs = 5
    timed_runs = 10

    with tf1.Session() as sess:
        model = effnetv2_model.EffNetV2Model(FLAGS.model_name, FLAGS.hparam_str)
        batch_size = FLAGS.batch_size
        run_options = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)
        run_metadata = tf1.RunMetadata()

        isize = FLAGS.image_size or model.cfg.eval.isize
        if FLAGS.mixed_precision:
            data_dtype = tf.float16
        else:
            data_dtype = tf.float32
        inputs = tf.ones((batch_size, isize, isize, 3), data_dtype)
        output = model(inputs, training=False)
        sess.run(tf1.global_variables_initializer())

        print('starting warmup.')
        for _ in range(warmup_runs):
            sess.run(output)

        print('starting benchmark.')
        start = time.perf_counter()
        for _ in range(timed_runs):
            sess.run(output)
        end = time.perf_counter()

        inference_time = (end - start) / 10
        print('Per batch inference time: ', inference_time)
        print('FPS: ', batch_size / inference_time)

        if FLAGS.trace_file:
            sess.run(output, options=run_options, run_metadata=run_metadata)
            with tf.io.gfile.GFile(FLAGS.trace_file, 'w') as f:
                trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                f.write(trace.generate_chrome_trace_format(show_memory=True))
def get_exec_time_profile(lyr, batch_size, get_grads=False):
    """Profile one forward pass of a layer with the TF1 profiler.

    The layer is wrapped in a ``tf.function``, traced with float32 specs built
    from ``lyr.input`` (first axis replaced by ``batch_size``), and run once in
    a ``tf1.Session`` with full tracing. The run metadata is fed to
    ``tf1.profiler.Profiler`` as step 0.

    Args:
        lyr: Object exposing ``input`` (a tensor or a list of tensors) and
            callable on tensors of those shapes.
        batch_size: Batch dimension substituted into every input shape.
        get_grads: Gradient profiling is not implemented. The previous code
            path referenced undefined names and always failed with NameError
            after the forward profile had already been computed.

    Returns:
        ``(micro_s, prof)``: ``prof`` is the result of ``profile_graph`` and
        ``micro_s`` is its ``total_exec_micros``.

    Raises:
        NotImplementedError: If ``get_grads`` is true.
    """
    if get_grads:
        # Fail fast and explicitly rather than with a NameError at the end.
        raise NotImplementedError(
            "get_exec_time_profile does not support get_grads=True")

    print(lyr.__class__.__name__)
    run_opts = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)

    layer_inputs = lyr.input if isinstance(lyr.input, list) else [lyr.input]
    input_shapes = [(batch_size,) + tuple(inp.shape[1:])
                    for inp in layer_inputs]

    inputs = [tf.random.normal(shp) for shp in input_shapes]
    func = tf.function(lyr)
    conc = func.get_concrete_function(
        *[tf.TensorSpec(shape=shp, dtype=tf.float32) for shp in input_shapes])

    run_meta = tf1.RunMetadata()
    with tf1.Session() as sess:
        sess.run(tf1.global_variables_initializer())
        out = conc(*inputs)
        sess.run(out, options=run_opts, run_metadata=run_meta)

        profile = tf1.profiler.Profiler(sess.graph)
        profile.add_step(0, run_meta)
        # min_cpu_micros=0 keeps every op in the report, however cheap.
        profiler_options = (tf1.profiler.ProfileOptionBuilder(
            tf1.profiler.ProfileOptionBuilder.time_and_memory(
                min_cpu_micros=0
            )).with_step(0).with_empty_output().build())
        prof = profile.profile_graph(options=profiler_options)

    micro_s = prof.total_exec_micros
    return micro_s, prof
def get_exec_time_timeline(mod, batch_size, get_grads=False):
    """Time one traced run of a model (or of its gradients).

    Args:
        mod: Object exposing ``input``/``output`` (tensor or list of tensors)
            and callable on tensors of the input shapes.
        batch_size: Batch dimension substituted into every input/output shape.
        get_grads: When true, the gradients of the output with respect to the
            random inputs (seeded with random ``grad_ys``) are run instead of
            the forward output.

    Returns:
        Whatever ``convert_string_to_time`` derives from the Chrome trace.
    """
    def with_batch(tensor):
        return (batch_size,) + tuple(tensor.shape[1:])

    run_opts = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)

    if type(mod.input) is list:
        input_shapes = [with_batch(t) for t in mod.input]
        output_shapes = [with_batch(t) for t in mod.output]
    else:
        input_shapes = [with_batch(mod.input)]
        output_shapes = [with_batch(mod.output)]

    inputs = [tf.random.normal(shp) for shp in input_shapes]
    outputs = [tf.random.normal(shp) for shp in output_shapes]

    func = tf.function(mod)
    specs = [tf.TensorSpec(shape=shp, dtype=tf.float32)
             for shp in input_shapes]
    # A single input is traced as a bare spec, several as one list argument.
    if len(inputs) == 1:
        conc = func.get_concrete_function(specs[0])
    else:
        conc = func.get_concrete_function(specs)

    with tf1.Session() as sess:
        sess.run(tf1.global_variables_initializer())
        out = conc(*inputs)
        if get_grads:
            fetches = tf.gradients(out, inputs, grad_ys=outputs)
        else:
            fetches = out
        run_meta = tf1.RunMetadata()
        sess.run(fetches, options=run_opts, run_metadata=run_meta)
        ctf = timeline.Timeline(run_meta.step_stats).generate_chrome_trace_format()

    return convert_string_to_time(ctf)
def evaluate_full_batch(sess, model, minibatch_iter, many_runs_timeline, mode):
    """Evaluate the model on the full batch and score val/test nodes only.

    The predictions are computed for everything in the feed dict, but the F1
    scores are calculated only on ``node_val`` (``mode == 'val'``) or
    ``node_test`` (any other mode).

    Args:
        sess: Session used to run ``model.preds`` and ``model.loss``.
        model: Model exposing ``preds``, ``loss`` and ``sigmoid_loss``.
        minibatch_iter: Provides ``feed_dict(mode)``, ``node_val``,
            ``node_test`` and ``class_arr``.
        many_runs_timeline: List that receives the Chrome trace of this run
            when ``args_global.timeline`` is set.
        mode: ``'val'`` selects validation nodes; otherwise test nodes.

    Returns:
        ``(loss, f1_scores[0], f1_scores[1], elapsed_seconds)``.
    """
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    t1 = time.time()
    num_cls = minibatch_iter.class_arr.shape[-1]
    feed_dict, labels = minibatch_iter.feed_dict(mode)

    tracing = args_global.timeline
    run_kwargs = {'feed_dict': feed_dict}
    if tracing:
        run_kwargs['options'] = options
        run_kwargs['run_metadata'] = run_metadata

    preds, loss = sess.run([model.preds, model.loss], **run_kwargs)

    if tracing:
        fetched_timeline = timeline.Timeline(run_metadata.step_stats)
        many_runs_timeline.append(
            fetched_timeline.generate_chrome_trace_format())

    if mode == 'val':
        node_val_test = minibatch_iter.node_val
    else:
        node_val_test = minibatch_iter.node_test
    t2 = time.time()

    f1_scores = calc_f1(labels[node_val_test], preds[node_val_test],
                        model.sigmoid_loss)
    return loss, f1_scores[0], f1_scores[1], (t2 - t1)
def train(self, num_inputs, writer=None, step_offset=0):
    """Train the network on the data provided by the input tensor.

    Args:
        num_inputs: Total number of inputs in the data set; used to derive
            the number of batches per epoch.
        writer: Summary writer to add summaries to. It is created by the
            caller so stacked layers do not produce duplicate outputs. If
            ``None`` (or otherwise falsy), no summaries are written.
        step_offset: Offset added to the batch counter to form the global
            step, so summaries of earlier runs are not overwritten.

    Returns:
        The global step of the last batch that was run, or ``step_offset``
        when no batch was run.
    """
    # Divide by num_gpus to avoid training on the same data several times.
    if self._gpus > 0:
        batches_per_epoch = num_inputs // self._batch_size // self._gpus
    else:
        batches_per_epoch = num_inputs // self._batch_size
    total_batches = batches_per_epoch * self._max_epochs

    # Roughly 10 percent of the total, used as the traced-summary interval.
    # Clamp to 1: with fewer than ten batches the truncation yields 0 and the
    # modulo below would raise ZeroDivisionError.
    summary_mod = max(1, int(0.1 * total_batches))

    global_step = step_offset
    logging.info("Training self-organizing Map")
    for epoch in range(self._max_epochs):
        logging.info("Epoch: {}/{}".format(epoch, self._max_epochs))
        feed_dict = {self._epoch: epoch}
        for batch in range(batches_per_epoch):
            current_batch = batch + (batches_per_epoch * epoch)
            global_step = current_batch + step_offset
            percent_complete = current_batch / total_batches
            logging.debug("\tBatch {}/{} - {:.2%} complete".format(
                batch, batches_per_epoch, percent_complete))

            # Only do summaries when a SummaryWriter has been provided
            if not writer:
                self._sess.run(self._training_op, feed_dict=feed_dict)
                continue

            if current_batch > 0 and current_batch % summary_mod == 0:
                # Periodic step: full trace, run metadata and a checkpoint.
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _, _ = self._sess.run(
                    [self._merged, self._training_op, self._activity_op],
                    feed_dict=feed_dict,
                    options=run_options,
                    run_metadata=run_metadata)
                writer.add_run_metadata(run_metadata,
                                        "step_{}".format(global_step))
                writer.add_summary(summary, global_step)
                self._save_checkpoint(global_step)
            else:
                summary, _ = self._sess.run(
                    [self._merged, self._training_op],
                    feed_dict=feed_dict)
                writer.add_summary(summary, global_step)

    self._trained = True
    return global_step
def get_exec_time_timeline(model, batch_size, get_grads=False, num_runs=1, return_timeline=False):
    """Measure the traced execution time of a model over several runs.

    Shapes come from ``get_shapes(model, batch_size)`` and the callable from
    ``get_concrete_function(model, input_shapes)``. Each of the
    ``num_runs + 1`` runs uses a fresh ``tf1.Session`` with full tracing and
    is reduced to one number by ``convert_string_to_time``.

    Args:
        model: Model to time; its class name is printed.
        batch_size: Batch size forwarded to ``get_shapes``.
        get_grads: When true, run the gradients of the output with respect to
            the random inputs (with random ``grad_ys``) instead of the
            forward output.
        num_runs: Number of runs performed in addition to the first one.
        return_timeline: When true, return the Chrome trace string of the
            first run immediately instead of timing statistics.

    Returns:
        The Chrome trace string when ``return_timeline`` is true; otherwise a
        ``(mean, std)`` tuple. The first run is excluded from the statistics
        when including it would increase the standard deviation. A single
        measurement is reported as-is instead of yielding NaN.
    """
    print("get_exec_time_timeline", model.__class__.__name__)
    run_opts = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)
    input_shapes, output_shapes = get_shapes(model, batch_size)
    concrete_function = get_concrete_function(model, input_shapes)

    times = []
    for _ in range(num_runs + 1):
        with tf1.Session() as sess:
            run_meta = tf1.RunMetadata()
            sess.run(tf1.global_variables_initializer())
            inputs = [tf.random.normal(shp) for shp in input_shapes]
            outputs = [tf.random.normal(shp) for shp in output_shapes]
            out = concrete_function(*inputs)
            if get_grads:
                fetches = tf.gradients(out, inputs, grad_ys=outputs)
            else:
                fetches = out
            sess.run(fetches, options=run_opts, run_metadata=run_meta)
            trace = timeline.Timeline(run_meta.step_stats)
            ctf = trace.generate_chrome_trace_format()

        if return_timeline:
            return ctf

        # Named `elapsed` (not `time`) so the stdlib module name is not shadowed.
        elapsed = convert_string_to_time(ctf)
        times.append(elapsed)

    steady = times[1:]
    # With fewer than two measurements there is nothing to filter; np.std of
    # an empty list would be NaN and poison both returned values.
    if len(times) < 2 or np.std(times) <= np.std(steady):
        return np.average(times), np.std(times)
    # Filter first run
    return np.average(steady), np.std(steady)
def fit(self, feed, session=None):
    """Training step for observed source language example.

    Args:
        feed: Feed dict passed to ``session.run``.
        session: Session to use; defaults to ``tf.get_default_session()``.

    Returns:
        ``(cost, summary, last_state)`` fetched from the unregularized loss,
        the training summary and the last state.
    """
    active_session = tf.get_default_session() if session is None else session

    fetches = [
        self.training_update,
        self.unregularized_loss,
        self.training_summary,
        self.last_state,
    ]
    # Ask TensorFlow to list live tensor allocations if the step runs OOM.
    oom_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    _update, cost, summary, last_state = active_session.run(
        fetches, feed_dict=feed, options=oom_options)
    return cost, summary, last_state
def get_report(loop_op: tf.Operation, infeed_queue_initializer: tf.Operation, outfeed_op: tf.Operation, report_dest: str, available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue.
        outfeed_op: Outfeed operator.
        report_dest: Location to store the report (used in text mode).
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution.

    Note:
        ``report_mode`` is not a parameter; it is read from the enclosing
        scope. In ``'text'`` mode the report is written to ``report_dest``,
        otherwise the event trace is handed to ``save_tf_report``.
    """
    text_mode = report_mode == 'text'

    # Set compile and device options
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=text_mode,
        profile_execution=True)
    memory_options = {
        "availableMemoryProportion": str(available_memory_proportion)
    }
    opts = ipu_utils.set_matmul_options(opts, matmul_options=memory_options)
    opts = ipu_utils.set_convolution_options(
        opts, convolution_options=dict(memory_options))
    ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    # Context manager guarantees the session is closed even if a run fails;
    # previously the session was never closed.
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        session.run(loop_op, options=run_options)
        session.run(outfeed_op, options=run_options)
        out = session.run(report)

    if text_mode:
        # extract the report
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s", report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
def main():
    """Build the RNN language model for the chosen strategy and train it.

    ``args.strategy`` selects the model module: 0 -> ``rnnlm_data``,
    1 -> ``rnnlm_opt``, 2 -> ``rnnlm_gnmt``, 3 -> ``rnnlm_flexflow``.

    Raises:
        ValueError: If ``args.strategy`` is not one of 0-3.
    """
    t = trainer.Trainer()
    args = t.args

    # Initialize dataset
    dataset = TextDataLoader(args.batch_size, args.src_vocab, None,
                             args.src_text, None, args.seq_len,
                             args.src_vocab_size, args.tgt_vocab_size,
                             args.sentences_size)
    inputs, labels, _, _ = dataset.next_batch()

    # Convert inputs and labels to int32, due to a bug in mtf.one_hot that leads
    # to TypeError due to type mismatch
    inputs = tf.cast(inputs, tf.int32)
    labels = tf.cast(labels, tf.int32)

    vocab_size = utils.RoundUp(dataset.src_vocab_size, t.num_gpus)
    print("Vocab size: %d" % vocab_size)
    params = Params(args.batch_size, vocab_size, args.seq_len, t.num_nodes,
                    t.num_gpus)

    # Model
    if args.strategy == 0:
        import rnnlm_data as rnn
    elif args.strategy == 1:
        import rnnlm_opt as rnn
    elif args.strategy == 2:
        import rnnlm_gnmt as rnn
    elif args.strategy == 3:
        import rnnlm_flexflow as rnn
    else:
        # An explicit raise survives `python -O`, unlike `assert False`, which
        # would be stripped and surface later as a NameError on `rnn`.
        raise ValueError("Unknown strategy: {!r}".format(args.strategy))
    graph, mesh_to_impl, mtf_loss = rnn.model(params, inputs, labels)

    # Soft placement is always enabled, regardless of the chosen model.
    soft_placement = True

    # Train
    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    config = tf.ConfigProto(allow_soft_placement=soft_placement,
                            log_device_placement=True)
    t.train_model(graph, mesh_to_impl, mtf_loss, dataset, config=config,
                  run_options=run_options)
def run_model(train_op, init, warmup_runs=10, profile_runs=10):
    """Run a training op with warmup, then with full tracing.

    Args:
        train_op: Op executed on every run.
        init: Initializer run once before anything else.
        warmup_runs: Number of untraced runs executed first.
        profile_runs: Number of fully traced runs; all of them write into the
            same ``RunMetadata`` object.

    Returns:
        ``(graph_def, run_metadata)`` of the session that executed the runs.
    """
    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    run_metadata = tf.RunMetadata()
    with tf.Session(config=session_config) as sess:
        sess.run(init)
        trace_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

        for _ in range(warmup_runs):
            sess.run(train_op)

        for _ in range(profile_runs):
            sess.run(train_op, options=trace_options,
                     run_metadata=run_metadata)

        graph_def = sess.graph_def
    return graph_def, run_metadata
def __init__(self, timing_topn=20, timing_min_ms=100, memory_topn=20, memory_min_bytes=1024 * 1024, every_secs=None, every_steps=None, stats_client=None):
    """Store reporting thresholds and set up the trace trigger.

    Args:
        timing_topn: Stored as ``_timing_topn``.
        timing_min_ms: Stored as ``_timing_min_ms``.
        memory_topn: Stored as ``_memory_topn``.
        memory_min_bytes: Stored as ``_memory_min_bytes``.
        every_secs: Forwarded to ``tf.train.SecondOrStepTimer``.
        every_steps: Forwarded to ``tf.train.SecondOrStepTimer``.
        stats_client: Client to use; any falsy value falls back to
            ``_gctx.stats_client``.
    """
    if not stats_client:
        stats_client = _gctx.stats_client
    self._stats_client = stats_client

    # Reporting thresholds.
    self._timing_topn = timing_topn
    self._timing_min_ms = timing_min_ms
    self._memory_topn = memory_topn
    self._memory_min_bytes = memory_min_bytes

    self._timer = tf.train.SecondOrStepTimer(
        every_secs=every_secs, every_steps=every_steps)
    self._run_options = tf.RunOptions(
        trace_level=tf.RunOptions.FULL_TRACE)
def trace_solver_solution(save_path: PathLike, train_ds, solver):
    """Trace one run of the solver-built model and save the Chrome trace.

    The first element of ``train_ds`` is converted to NumPy, fed through the
    function returned by ``_build_model_via_solver`` and executed once with
    full tracing.

    Args:
        save_path: File that receives the Chrome trace text.
        train_ds: Dataset providing the batch and ``element_spec``.
        solver: Solver forwarded to ``_build_model_via_solver``.

    Note:
        ``dataset`` and ``model_name`` are not parameters; they are read from
        the enclosing scope.
    """
    import tensorflow.compat.v1 as tf1
    from tensorflow.python.client import timeline

    first_batch = iter(train_ds).next()
    data_list = [element.numpy() for element in first_batch]

    with tf1.Session() as sess:
        sqrtn_fn, *_ = _build_model_via_solver(
            dataset, model_name, train_ds.element_spec, solver)
        model_inputs = [tf1.convert_to_tensor(arr) for arr in data_list]
        out = sqrtn_fn(*model_inputs)

        run_meta = tf1.RunMetadata()
        trace_options = tf1.RunOptions(
            trace_level=tf1.RunOptions.FULL_TRACE)
        sess.run(tf1.global_variables_initializer())
        sess.run(out, options=trace_options, run_metadata=run_meta)

        lctf = timeline.Timeline(
            run_meta.step_stats).generate_chrome_trace_format()

    Path(save_path).write_text(lctf)
def testMultipleInt32ValuesOverMultipleRunsAreRecorded(self):
    """Health pills of an int32 variable are recorded across three runs.

    A variable starting at 10 is incremented by 2 three times while being
    watched with ``DebugNumericSummary``; exactly one debugger events file
    must exist and contain the expected pills for ``x_inc`` and ``x``.
    """
    num_increments = 3
    x_init_val = np.array([10], dtype=np.int32)
    x_inc_val = np.array([2], dtype=np.int32)

    with tf.Session() as sess:
        x_init = tf.constant(x_init_val, shape=[1], name="x_init")
        x = tf.Variable(x_init, name="x")
        x_inc = tf.constant(x_inc_val, name="x_inc")
        inc_x = tf.assign_add(x, x_inc, name="inc_x")

        sess.run(x.initializer)

        run_options = tf.RunOptions(output_partition_graphs=True)
        tf_debug.watch_graph(
            run_options,
            sess.graph,
            debug_ops=["DebugNumericSummary"],
            debug_urls=[self._debug_url],
        )

        # Increase three times.
        for _ in range(num_increments):
            sess.run(inc_x, options=run_options)

    # Debugger data is stored within a special directory within logdir.
    events_pattern = os.path.join(
        self._logdir,
        constants.DEBUGGER_DATA_DIRECTORY_NAME,
        "events.debugger*",
    )
    event_files = glob.glob(events_pattern)
    self.assertEqual(1, len(event_files))

    # x is observed before each increment: 10, 12, 14.
    expected_pills = {
        "x_inc:0:DebugNumericSummary": [x_inc_val] * 3,
        "x:0:DebugNumericSummary": [
            x_init_val + k * x_inc_val for k in range(num_increments)
        ],
    }
    self._check_health_pills_in_events_file(event_files[0], expected_pills)
def compile(self, optimizer, clipnorm, loss='mse'):
    """Compile ``self.model`` with the named optimizer.

    Args:
        optimizer: One of ``'sgd'``, ``'momentum_sgd'``, ``'tf_momentum_sgd'``,
            ``'adam'``, ``'tf_adam'``, ``'rmsprop'`` or ``'nadam'``. Any other
            value prints a message and exits the process with status 1.
        clipnorm: Gradient-norm clip passed to the Keras optimizers (the
            ``tf.train`` optimizers do not receive it).
        loss: Loss passed to ``model.compile``.

    When ``self.conf`` is set and ``conf['training']['timeline_prof']`` is
    truthy, the model is compiled with full-trace run options and the
    options/metadata are kept on ``self`` for the timeline profiler.
    """
    # TODO(KGF): check the following import taken from runner.py
    # Was not in this file, originally.
    from tensorflow.keras.optimizers import (SGD, Adam, RMSprop, Nadam)

    if optimizer == 'sgd':
        optimizer_class = SGD(lr=self.DUMMY_LR, clipnorm=clipnorm)
    elif optimizer == 'momentum_sgd':
        optimizer_class = SGD(lr=self.DUMMY_LR, clipnorm=clipnorm,
                              decay=1e-6, momentum=0.9)
    elif optimizer == 'tf_momentum_sgd':
        # TODO(KGF): removed TFOptimizer wrapper from here and below
        # may not work anymore? See
        # https://github.com/tensorflow/tensorflow/issues/22780
        optimizer_class = tf.train.MomentumOptimizer(
            learning_rate=self.DUMMY_LR, momentum=0.9)
    elif optimizer == 'adam':
        optimizer_class = Adam(lr=self.DUMMY_LR, clipnorm=clipnorm)
    elif optimizer == 'tf_adam':
        optimizer_class = tf.train.AdamOptimizer(
            learning_rate=self.DUMMY_LR)
    elif optimizer == 'rmsprop':
        optimizer_class = RMSprop(lr=self.DUMMY_LR, clipnorm=clipnorm)
    elif optimizer == 'nadam':
        optimizer_class = Nadam(lr=self.DUMMY_LR, clipnorm=clipnorm)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    # Timeline profiler. The configuration lives on the instance: the bare
    # name `conf` used here before is not defined in this method, so an
    # instance with a configuration failed with NameError.
    if (self.conf is not None
            and self.conf['training']['timeline_prof']):
        self.run_options = tf.RunOptions(
            trace_level=tf.RunOptions.FULL_TRACE)
        self.run_metadata = tf.RunMetadata()
        self.model.compile(optimizer=optimizer_class, loss=loss,
                           options=self.run_options,
                           run_metadata=self.run_metadata)
    else:
        self.model.compile(optimizer=optimizer_class, loss=loss)

    self.ensure_equal_weights()
def _test_drive(self, save_dir):
    """Run one traced pass over the training set, saving a timeline per step.

    The learning-rate multiplier is fed as 0.0. Each step's Chrome trace is
    written to ``<save_dir>/logs/timeline_<NNN>.json``; iteration stops when
    the iterator raises ``tf.errors.OutOfRangeError``.

    Args:
        save_dir: Directory containing the ``logs`` folder for the traces.
    """
    session = self.model.session
    # Initialize training iterator
    self.train_set.initialize(session)
    # Get a string handle from training iterator
    handles = self.train_set.get_string_handles(session)

    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()

    feed_dict = {
        self.model.is_train: True,
        self.model.monte_carlo: False,
        self.model.augmentation: True,
        self.learning_rate_multiplier: 0.0,
    }
    feed_dict.update(zip(self.model.handles, handles))

    print('Running test epoch...')
    start_time = time.time()
    step = 0
    while True:
        try:
            self.model.session.run(
                [
                    self.optimization_operation, self.model.loss,
                    self.model.Y_all, self.model.pred
                ],
                feed_dict=feed_dict,
                options=options,
                run_metadata=run_metadata)
        except tf.errors.OutOfRangeError:
            break

        step_timeline = timeline.Timeline(run_metadata.step_stats)
        chrome_trace = step_timeline.generate_chrome_trace_format(
            show_memory=False)
        trace_path = os.path.join(save_dir, 'logs',
                                  'timeline_{:03}.json'.format(step))
        with open(trace_path, 'w') as f:
            f.write(chrome_trace)
        step += 1

    print('Test epoch: {:.2f} sec'.format(time.time() - start_time))
def benchmark(self, image_arrays, trace_filename=None):
    """Benchmark inference latency/throughput.

    One untimed run is followed by ten timed runs; the mean per-batch time
    and ``self.batch_size`` divided by it (printed as FPS) are reported.

    Args:
        image_arrays: a list of images in numpy array format.
        trace_filename: If not None (and not empty), the filename for saving
            the Chrome trace of one extra fully traced run.
    """
    if not self.sess:
        self.build()

    num_timed_runs = 10
    prediction = self.signitures["prediction"]
    feed = {self.signitures["image_arrays"]: image_arrays}

    # init session
    self.sess.run(prediction, feed_dict=feed)

    start = time.perf_counter()
    for _ in range(num_timed_runs):
        self.sess.run(prediction, feed_dict=feed)
    end = time.perf_counter()

    inference_time = (end - start) / 10
    print("Per batch inference time: ", inference_time)
    print("FPS: ", self.batch_size / inference_time)

    if not trace_filename:
        return

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    self.sess.run(
        prediction,
        feed_dict=feed,
        options=run_options,
        run_metadata=run_metadata,
    )
    with tf.io.gfile.GFile(trace_filename, "w") as trace_file:
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        trace_file.write(trace.generate_chrome_trace_format(show_memory=True))
def get_exec_time_loss(loss_fn, logits_shape):
    """Time a single traced run of a loss function.

    Random logits and labels of ``logits_shape`` are captured by a
    zero-argument ``tf.function`` that is executed once in a ``tf1.Session``
    with full tracing.

    Args:
        loss_fn: Callable invoked as ``loss_fn(logits, labels)``.
        logits_shape: Shape of both random tensors.

    Returns:
        Whatever ``convert_string_to_time`` derives from the Chrome trace.
    """
    trace_options = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)
    logits = tf.random.normal(logits_shape)
    labels = tf.random.normal(logits_shape)

    @tf.function
    def run_loss():
        return loss_fn(logits, labels)

    concrete_loss = run_loss.get_concrete_function()

    with tf1.Session() as sess:
        metadata = tf1.RunMetadata()
        sess.run(tf1.global_variables_initializer())
        loss_value = concrete_loss()
        sess.run(loss_value, options=trace_options, run_metadata=metadata)
        lctf = timeline.Timeline(
            metadata.step_stats).generate_chrome_trace_format()

    return convert_string_to_time(lctf)
def _poll_server_till_success(self, max_tries, poll_interval_seconds):
    """Poll the debug server until a watched run succeeds.

    Each attempt opens a session, initializes a small variable while watching
    the graph with ``DebugNumericSummary`` at ``self._debug_url``, and sleeps
    after a ``tf.errors.FailedPreconditionError``.

    Args:
        max_tries: Maximum number of attempts.
        poll_interval_seconds: Seconds to sleep after a failed attempt.

    Returns:
        True as soon as one attempt succeeds, False if all attempts fail.
    """
    def attempt_watched_run():
        with tf.Session() as sess:
            a_init_val = np.array([42.0])
            a_init = tf.constant(a_init_val, shape=[1], name="a_init")
            a = tf.Variable(a_init, name="a")

            run_options = tf.RunOptions(output_partition_graphs=True)
            tf_debug.watch_graph(
                run_options,
                sess.graph,
                debug_ops=["DebugNumericSummary"],
                debug_urls=[self._debug_url],
            )
            sess.run(a.initializer, options=run_options)

    for _ in range(max_tries):
        try:
            attempt_watched_run()
        except tf.errors.FailedPreconditionError:
            time.sleep(poll_interval_seconds)
        else:
            return True
    return False
def log_model_analysis(self):
    """Run one fully traced optimizer step and clear ``first_training``.

    The run metadata is collected, but the tfprof model analysis that would
    print it is currently disabled, so nothing is reported.
    """
    run_metadata = tf.RunMetadata()
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    feed_dict = {
        self.x: self.batch_input,
        self.x2: self.batch_input_bicubic,
        self.y: self.batch_true,
        self.lr_input: self.lr,
        self.dropout: self.dropout_rate,
    }
    self.sess.run(
        [self.optimizer, self.loss],
        feed_dict=feed_dict,
        options=run_options,
        run_metadata=run_metadata)

    # tf.contrib.tfprof.model_analyzer.print_model_analysis(
    #     tf.get_default_graph(),
    #     run_meta=run_metadata,
    #     tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
    self.first_training = False
def benchmark(self, image_arrays, trace_filename=None):
    """Benchmark inference latency/throughput.

    One untimed run is followed by ten timed runs; the mean per-batch time
    and ``self.batch_size`` divided by it (printed as FPS) are reported.

    Args:
        image_arrays: a list of images in numpy array format.
        trace_filename: If not None (and not empty), the filename for saving
            the Chrome trace of one extra fully traced run.
    """
    if not self.sess:
        self.build()

    def predict(**run_kwargs):
        return self.sess.run(
            self.signitures['prediction'],
            feed_dict={self.signitures['image_arrays']: image_arrays},
            **run_kwargs)

    # init session
    predict()

    start = time.perf_counter()
    for _ in range(10):
        predict()
    end = time.perf_counter()

    inference_time = (end - start) / 10
    print('Per batch inference time: ', inference_time)
    print('FPS: ', self.batch_size / inference_time)

    if not trace_filename:
        return

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    predict(options=run_options, run_metadata=run_metadata)

    from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
    with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        trace_file.write(trace.generate_chrome_trace_format(show_memory=True))
def run_model(args):
    """Build the selected Keras model and fit it on random images.

    Args:
        args: Parsed options providing ``image_size``, ``channels_last``,
            ``batch_size``, ``model``, ``tensors_on_oom``, ``steps`` and
            ``epochs``.

    When ``dist_mod`` is truthy, the steps per epoch are divided by
    ``dist_mod.size()`` and only rank 0 trains verbosely.
    """
    num_classes = 15
    image_dim = args.image_size
    batch_size = args.batch_size

    if args.channels_last:
        data_format = 'channels_last'
        input_shape = (image_dim, image_dim, 3)
    else:
        data_format = 'channels_first'
        input_shape = (3, image_dim, image_dim)
    K.set_image_data_format(data_format)

    model_class = model_choices.get(args.model)
    model = model_class(weights=None, include_top=True,
                        input_shape=input_shape, classes=num_classes)

    compile_kwargs = {
        'optimizer': 'rmsprop',
        'loss': 'categorical_crossentropy',
    }
    if args.tensors_on_oom:
        # Report live tensor allocations if a step runs out of memory.
        compile_kwargs['options'] = tf.RunOptions(
            report_tensor_allocations_upon_oom=True)
        compile_kwargs['run_metadata'] = tf.RunMetadata()
    model.compile(**compile_kwargs)

    random_generator = random_image_generator(batch_size, num_classes,
                                              input_shape)

    steps_per_epoch = args.steps
    if dist_mod:
        steps_per_epoch = steps_per_epoch // dist_mod.size()

    if dist_mod and dist_mod.rank() != 0:
        verbose = 0
    else:
        verbose = 1

    model.fit_generator(random_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=args.epochs,
                        callbacks=get_callbacks(args),
                        verbose=verbose)
def run(self, pianorolls, masks=None, sample_steps=0, current_step=0, total_gibbs_steps=0, temperature=0.99, timeout_ms=0):
    """Given input pianorolls, runs Gibbs sampling to fill in the rest.

    Builds the graph and restores the checkpoint first if no session exists.
    All sampling parameters are fed unchanged to the graph placeholders; how
    zero values are interpreted (for example ``total_gibbs_steps=0`` or
    ``sample_steps=0`` selecting defaults) is decided by the sampling graph,
    not by this method.

    To obtain intermediate results, set ``sample_steps`` to the number of
    steps after which results should be returned; to continue from them,
    feed the intermediate pianorolls back in with ``current_step`` set to the
    number of steps already taken.

    Args:
        pianorolls: a 4D numpy array of shape (batch, time, pitch, instrument).
        masks: a 4D numpy array of the same shape as pianorolls. When None,
            an all-zero array shaped like ``pianorolls`` is fed.
        sample_steps: an integer, the number of steps to sample in this call.
        current_step: an integer, how many steps have already been sampled.
        total_gibbs_steps: an integer, the total number of steps a complete
            sampling procedure would take.
        temperature: a float, the temperature for sampling from softmax.
        timeout_ms: Timeout for session.Run. Set to zero for no timeout.

    Returns:
        A dictionary with "pianorolls", the sampled results, and
        "time_taken", the sampling time in minutes.
    """
    if self.sess is None:
        # Build graph and restore checkpoint.
        self.instantiate_sess_and_restore_checkpoint()

    if masks is None:
        masks = np.zeros_like(pianorolls)

    start_time = time.time()

    run_options = (
        tf.RunOptions(timeout_in_ms=timeout_ms) if timeout_ms else None)

    fed_values = (
        ("pianorolls", pianorolls),
        ("outer_masks", masks),
        ("sample_steps", sample_steps),
        ("total_gibbs_steps", total_gibbs_steps),
        ("current_step", current_step),
        ("temperature", temperature),
    )
    feed_dict = {self.placeholders[key]: value for key, value in fed_values}

    new_piece = self.sess.run(
        self.samples, feed_dict=feed_dict, options=run_options)

    label = "independent blocked gibbs"
    time_taken = (time.time() - start_time) / 60.0
    tf.logging.info("exit %s (%.3fmin)" % (label, time_taken))
    return dict(pianorolls=new_piece, time_taken=time_taken)
def benchmark_model(self,
                    warmup_runs,
                    bm_runs,
                    num_threads,
                    trace_filename=None):
    """Benchmark the model's inference latency and print summary statistics.

    Builds the model in a fresh graph, feeds it one random input of shape
    `self.inputs_shape`, performs `warmup_runs` warm-up runs (timed and
    printed, but excluded from the statistics) and then `bm_runs` timed runs.
    When more than four timed samples are available, the two fastest and the
    two slowest are discarded before mean/std/min/max are printed.

    Args:
      warmup_runs: number of warm-up session runs before the timed runs.
      bm_runs: number of timed session runs.
      num_threads: when > 0, used as the intra-op parallelism thread count
        (inter-op is pinned to 1); otherwise a default session config is used.
      trace_filename: optional path. When set, one extra fully traced run is
        executed at iteration `bm_runs // 2` and its Chrome trace (including
        memory) is written to this file. The traced run itself is not timed.

    Returns:
      None. Results are printed to stdout.
    """
    graphdef = None
    if self.tensorrt:
        print('Using tensorrt ', self.tensorrt)
        self.build_and_save_model()
        graphdef = self.freeze_model()

    if num_threads > 0:
        print('num_threads for benchmarking: {}'.format(num_threads))
        sess_config = tf.ConfigProto(
            intra_op_parallelism_threads=num_threads,
            inter_op_parallelism_threads=1)
    else:
        sess_config = tf.ConfigProto()

    # rewriter_config_pb2.RewriterConfig.OFF
    sess_config.graph_options.rewrite_options.dependency_optimization = 2
    if self.use_xla:
        sess_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_2)

    with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
        inputs = tf.placeholder(
            tf.float32, name='input', shape=self.inputs_shape)
        output = self.build_model(inputs, is_training=False)

        img = np.random.uniform(size=self.inputs_shape)

        sess.run(tf.global_variables_initializer())
        if self.tensorrt:
            fetches = [inputs.name] + [i.name for i in output]
            goutput = self.convert_tr(graphdef, fetches)
            inputs, output = goutput[0], goutput[1:]

        if not self.use_xla:
            # Don't use tf.group because XLA removes the whole graph for
            # tf.group.
            output = tf.group(*output)

        for i in range(warmup_runs):
            start_time = time.time()
            sess.run(output, feed_dict={inputs: img})
            print('Warm up: {} {:.4f}s'.format(i, time.time() - start_time))

        print('Start benchmark runs total={}'.format(bm_runs))
        timev = []
        for i in range(bm_runs):
            if trace_filename and i == (bm_runs // 2):
                run_options = tf.RunOptions()
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_metadata = tf.RunMetadata()
                sess.run(output,
                         feed_dict={inputs: img},
                         options=run_options,
                         run_metadata=run_metadata)
                logging.info('Dumping trace to %s', trace_filename)
                trace_dir = os.path.dirname(trace_filename)
                # A bare filename has an empty dirname; there is nothing to
                # create in that case and makedirs('') must not be called.
                if trace_dir and not tf.io.gfile.exists(trace_dir):
                    tf.io.gfile.makedirs(trace_dir)
                with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
                    from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    trace_file.write(
                        trace.generate_chrome_trace_format(show_memory=True))

            start_time = time.time()
            sess.run(output, feed_dict={inputs: img})
            timev.append(time.time() - start_time)

        timev.sort()
        # Discard the two fastest and two slowest samples as outliers, but
        # only when something is left afterwards: with four or fewer runs the
        # unconditional trim produced an empty list and np.min/np.max raised.
        if len(timev) > 4:
            timev = timev[2:-2]
        if not timev:
            print('No benchmark runs were executed; nothing to report.')
            return

        print(
            '{} {}runs {}threads: mean {:.4f} std {:.4f} min {:.4f} max {:.4f}'
            .format(self.model_name, len(timev), num_threads, np.mean(timev),
                    np.std(timev), np.min(timev), np.max(timev)))
def train(self,
          log_dir=None,
          max_epoch=10000,
          learning_rate=0.001,
          batch_size=None,
          interval_sec=300,
          restore_step=None,
          run_metadata=False):
    """Train the model on `self.corpus`.

    Args:
        log_dir (str): Directory where logs and the model are saved. When
            None, a timestamped directory under `tf_logs` next to this file
            is used.
        max_epoch (int): Exclusive upper bound of the epoch loop.
        learning_rate (float): Learning rate fed to the training op.
        batch_size (int): Mini-batch size. `None` is treated as 1.
        interval_sec (float): Logging time interval in seconds, forwarded to
            `open_session`. Defaults to 300.
        restore_step (int): When given, the session is opened for this step
            and training resumes from it; the initial loss summaries are
            then not written.
        run_metadata (bool): If true, every training step is run with full
            tracing and its run metadata is written to the log.
    """
    if log_dir is None:
        log_dir = os.path.join(os.path.dirname(__file__), 'tf_logs',
                               datetime.utcnow().strftime('%Y%m%d%H%M%S'))
    if batch_size is None:
        batch_size = 1

    n_batches = len(self.corpus) // (batch_size * self.time_size)
    # Offset between the corpus positions where consecutive batch rows start.
    jump = (len(self.corpus) - 1) // batch_size

    if run_metadata:
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        metadata = tf.RunMetadata()
    else:
        options = None
        metadata = None

    with self.open_writer(log_dir) as writer:
        with self.open_session(interval_sec=interval_sec,
                               per_step=n_batches,
                               restore_step=restore_step) as sess:
            incomes = np.empty([batch_size, self.time_size], dtype=int)
            labels = np.empty([batch_size, self.time_size], dtype=int)
            for b in range(batch_size):
                # Labels are the inputs shifted one position to the right.
                incomes[b, ] = self.corpus[b * jump:b * jump + self.time_size]
                labels[b, ] = self.corpus[b * jump + 1:
                                          b * jump + self.time_size + 1]

            step = restore_step or 0
            next_hs = np.zeros([batch_size, self.hidden_size])

            if restore_step is None:
                # Fresh run: log the summaries before any training step.
                for summary in sess.run(
                        self.los_summaries,
                        feed_dict={
                            self.incomes: incomes[:batch_size],
                            self.labels: labels[:batch_size],
                            self.prev_hs: next_hs
                        },
                ):
                    writer.add_summary(summary, step)

            # Feed dict of the most recent training step. It stays None when
            # no step runs at all (n_batches == 0 or an empty epoch range).
            fd = None
            for epoch_i in range(step // self.data_size, max_epoch):
                for batch_i in range(n_batches):
                    inc, lab = self.fetch_batch(epoch_i, batch_i, batch_size,
                                                jump, incomes, labels)
                    fd = {
                        self.incomes: inc,
                        self.labels: lab,
                        self.prev_hs: next_hs,
                        self.learning_rate: learning_rate,
                    }
                    # The hidden state returned by this step is fed into the
                    # next one.
                    _, next_hs = sess.run([self.training_op, self.next_hs],
                                          feed_dict=fd,
                                          options=options,
                                          run_metadata=metadata)
                    step += 1
                    if run_metadata:
                        writer.add_run_metadata(metadata, f'step: {step}')
                    self.record(sess, writer, step, feed_dict=fd)

            # Force a final write using the last step's feed. Skipped when no
            # training step ran, since `fd` was previously unbound here and
            # the call raised UnboundLocalError.
            if fd is not None:
                self.record(sess, writer, step, feed_dict=fd,
                            force_write=True)
def run_sobel(logdir, verbose=False):
    """Run a Sobel edge detection demonstration.

    See the summary description for more details.

    The default graph is reset and the graph-level random seed is set to 0.
    Eight steps (0 through 7) are run; the step number is fed as the kernel
    radius. Every step is executed with full tracing, and both the image
    summary and the run metadata (tagged "step_NNNN") are written to the
    "sobel" subdirectory of `logdir`.

    Arguments:
      logdir: Directory into which to write event logs.
      verbose: Boolean; whether to log any output.
    """
    if verbose:
        logger.info("--- Starting run: sobel")

    tf.reset_default_graph()
    tf.set_random_seed(0)

    image = get_image(verbose=verbose)
    kernel_radius = tf.placeholder(shape=(), dtype=tf.int32)

    with tf.name_scope("horizontal_kernel"):
        kernel_side_length = kernel_radius * 2 + 1
        # Drop off influence for pixels further away from the center.
        weighting_kernel = 1.0 - tf.abs(
            tf.linspace(-1.0, 1.0, num=kernel_side_length))
        differentiation_kernel = tf.linspace(-1.0, 1.0, num=kernel_side_length)
        # Outer product: column of weights times row of differences.
        horizontal_kernel = tf.matmul(
            tf.expand_dims(weighting_kernel, 1),
            tf.expand_dims(differentiation_kernel, 0),
        )

    with tf.name_scope("vertical_kernel"):
        vertical_kernel = tf.transpose(a=horizontal_kernel)

    float_image = tf.cast(image, tf.float32)
    dx = convolve(float_image, horizontal_kernel, name="convolve_dx")
    dy = convolve(float_image, vertical_kernel, name="convolve_dy")
    gradient_magnitude = tf.norm(tensor=[dx, dy],
                                 axis=0,
                                 name="gradient_magnitude")
    with tf.name_scope("normalized_gradient"):
        normalized_gradient = gradient_magnitude / tf.reduce_max(
            input_tensor=gradient_magnitude)
    with tf.name_scope("output_image"):
        output_image = tf.cast(255 * normalized_gradient, tf.uint8)

    summ = image_summary.op(
        "sobel",
        tf.stack([output_image]),
        display_name="Sobel edge detection",
        description=(
            "Demonstration of [Sobel edge detection]. The step "
            "parameter adjusts the radius of the kernel. "
            "The kernel can be of arbitrary size, and considers "
            "nearby pixels with \u2113\u2082-linear falloff.\n\n"
            # (that says ``$\ell_2$-linear falloff'')
            "Edge detection is done on a per-channel basis, so "
            "you can observe which edges are “mostly red "
            "edges,” for instance.\n\n"
            "For practical edge detection, a small kernel "
            # NOTE(review): "more than more than" below looks like a typo in
            # the displayed text; left unchanged to keep the output as is.
            "(usually not more than more than *r*=2) is best.\n\n"
            "[Sobel edge detection]: %s\n\n"
            "%s" % ("https://en.wikipedia.org/wiki/Sobel_operator",
                    IMAGE_CREDIT)),
    )

    # The trace options are identical for every step, so build them once.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    with tf.Session() as sess:
        sess.run(image.initializer)
        writer = tf.summary.FileWriter(os.path.join(logdir, "sobel"))
        # Close the writer even if a step fails, so it is not leaked.
        try:
            writer.add_graph(sess.graph)
            for step in xrange(8):
                if verbose:
                    logger.info("--- sobel: step: %s", step)
                # A fresh RunMetadata per step receives that step's trace.
                run_metadata = config_pb2.RunMetadata()
                s = sess.run(
                    summ,
                    feed_dict={kernel_radius: step},
                    options=run_options,
                    run_metadata=run_metadata,
                )
                writer.add_summary(s, global_step=step)
                writer.add_run_metadata(run_metadata, "step_%04d" % step)
        finally:
            writer.close()