def comp_fn():
    def body(img):
        with scopes.ipu_scope('/device:IPU:0'):
            if mode == 'sharded':
                # Import the frozen graph under the autoshard scope so it
                # can be split across IPUs.
                with autoshard.ipu_autoshard():
                    probs = tf.import_graph_def(
                        network.optimized_graph,
                        input_map={network.graph_input: img},
                        name="optimized",
                        return_elements=[network.graph_output])[0]
                autoshard.automatic_sharding(num_shards=num_ipus,
                                             input_ts=img,
                                             loss_ts=probs,
                                             frozen_inference=True)
                outfeed_op = outfeed_queue.enqueue(probs)
                # Propagate the XLA sharding attribute onto the enqueue op
                # so it is placed on the same shard as the output tensor.
                outfeed_op._set_attr(
                    sharding._XLA_SHARDING,
                    attr_value_pb2.AttrValue(
                        s=probs.op.get_attr('_XlaSharding')))
            else:
                probs = tf.import_graph_def(
                    network.optimized_graph,
                    input_map={network.graph_input: img},
                    name="optimized",
                    return_elements=[network.graph_output])[0]
                outfeed_op = outfeed_queue.enqueue(probs)
            # Note that the enqueue happens on the IPU.
            return outfeed_op

    return loops.repeat(batches_per_step, body, [], infeed_queue)

def comp_fn():
    def body(total_loss, total_aux_loss, total_accuracy,
             uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen):
        prob, loss, aux_loss, accuracy, grad_op = graph_builder(
            opts, uid_embedding, mid_embedding, cat_embedding,
            placeholders['learning_rate'],
            uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen,
            use_negsampling=False)
        # Make the weight update a prerequisite of returning the accumulators.
        with tf.control_dependencies([grad_op]):
            return (total_loss + loss,
                    total_aux_loss + aux_loss,
                    total_accuracy + accuracy)

    return loops.repeat(opts['batches_per_step'], body,
                        [tf.constant(0, np.float32)] * 3, infeed_train)

def compile_fn():
    def body(x, y):
        # z1, z2 = model1(x, y, time_steps_ph)
        # outfeed = outfeed_queue.enqueue({'z1': z1, 'z2': z2})
        z3 = model2(time_steps_ph)
        outfeed = outfeed_queue.enqueue({'z3': z3})
        return outfeed

    return loops.repeat(1, body, [], infeed_queue)

def comp_fn():
    def body(total_accuracy, image, label):
        accuracy = validation_graph_builder(model, image, label, opts)
        return total_accuracy + (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"])

    accuracy = loops.repeat(int(opts["validation_batches_per_step"]), body,
                            [tf.constant(0, tf.float32)], valid_iterator)
    if opts['replicas'] > 1:
        # Average the metric over all replicas and shards.
        accuracy = (cross_replica_ops.cross_replica_sum(accuracy) /
                    (opts['replicas'] * opts['shards']))
    return accuracy

def comp_fn():
    def body(data_dict):
        accuracy = validation_graph_builder(model, data_dict, opts)
        accuracy_enqueue = acc_queue.enqueue(accuracy)
        return accuracy_enqueue

    accuracy = loops.repeat(int(opts['validation_batches_per_step']), body,
                            [], inference_infeed_iterator)
    return accuracy

def comp_fn():
    def body(img):
        with scopes.ipu_scope('/device:IPU:0'):
            probs = tf.import_graph_def(
                network.optimized_graph,
                input_map={network.graph_input: img},
                name="optimized",
                return_elements=[network.graph_output])[0]
            outfeed_op = outfeed_queue.enqueue(probs)
            # Note that the enqueue happens on the IPU.
            return outfeed_op

    return loops.repeat(batches_per_step, body, [], infeed_queue)

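# The comp_fn snippets above are only the device-side half of the pattern.
# Below is a minimal host-side driver sketch, assuming the same comp_fn,
# infeed_queue and outfeed_queue names and a TF1-era Graphcore SDK; it is an
# illustration, not the original harness.
import tensorflow as tf
from tensorflow.python.ipu import ipu_compiler, utils
from tensorflow.python.ipu.scopes import ipu_scope

with ipu_scope('/device:IPU:0'):
    run_loop = ipu_compiler.compile(comp_fn, inputs=[])

# Host-side op that fetches everything the device has enqueued so far.
dequeue_op = outfeed_queue.dequeue()

cfg = utils.create_ipu_config()
cfg = utils.auto_select_ipus(cfg, 1)
utils.configure_ipu_system(cfg)

with tf.Session() as sess:
    sess.run(infeed_queue.initializer)
    sess.run(run_loop)            # runs batches_per_step iterations on the IPU
    probs = sess.run(dequeue_op)  # collects all enqueued results in one go
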
def comp_fn():
    def body(sum_rmse_metric, *args, **kwargs):
        data_tensors = args
        observed_ratings, ground_truth = tf.split(
            data_tensors[0], num_or_size_splits=2, axis=1)
        rmse_metric = graph_builder(opts,
                                    observed_ratings=observed_ratings,
                                    ground_truth=ground_truth,
                                    type='VALID')
        return sum_rmse_metric + rmse_metric

    return loops.repeat(opts.validation_batches_per_step, body,
                        [tf.constant(0, tf.float32)], infeed)

def comp_fn():
    def body(total_loss_, sum_rmse_metric, *args):
        data_tensors = args
        observed_ratings = data_tensors[0]
        loss, rmse_metric, apply_grads_ = graph_builder(
            opts, observed_ratings=observed_ratings,
            learning_rate=placeholders["learning_rate"])
        with tf.control_dependencies([apply_grads_]):
            return total_loss_ + loss, sum_rmse_metric + rmse_metric

    return loops.repeat(opts.batches_per_step, body,
                        [tf.constant(0, tf.float32),
                         tf.constant(0, tf.float32)], infeed)

def comp_fn():
    def body(total_loss, total_rmse, batch):
        loss, rmse, grad_op = graph_builder(
            opts,
            observed=batch[:, :-1],
            ground_truth=tf.expand_dims(batch[:, -1], axis=1),
            learning_rate=placeholders['learning_rate'] if training else None,
            mode=trainFlag)
        if not training:
            # Validation: there is no gradient op to wait for.
            return total_loss + loss, total_rmse + rmse
        with tf.control_dependencies([grad_op]):
            return total_loss + loss, total_rmse + rmse

    return loops.repeat(batches_per_step, body,
                        [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2,
                        infeed)

def comp_fn():
    def body(uids, mids, cats, mid_his, cat_his, mid_mask, target, sl):
        prob, accuracy = graph_builder(
            opts, uid_embedding, mid_embedding, cat_embedding,
            placeholders['learning_rate'],
            uids, mids, cats, mid_his, cat_his, mid_mask, target, sl,
            use_negsampling=False)
        # Ensure prob is computed before the results are enqueued.
        with tf.control_dependencies([prob]):
            return outfeed_queue.enqueue((prob, target, accuracy))

    return loops.repeat(opts['batches_per_step'], body, [], infeed)

def comp_fn():
    def body(total_accuracy, data_dict):
        accuracy = validation_graph_builder(model, data_dict, opts)
        if opts['latency']:
            # Also enqueue the input timestamp so the host can measure
            # end-to-end latency; the returned op runs as a side effect.
            timestamp_enqueue = timestamp_queue.enqueue(
                data_dict['timestamp'])
            return (total_accuracy + (tf.cast(accuracy, tf.float32) /
                                      opts["validation_batches_per_step"]),
                    timestamp_enqueue)
        else:
            return total_accuracy + (tf.cast(accuracy, tf.float32) /
                                     opts["validation_batches_per_step"])

    accuracy = loops.repeat(int(opts["validation_batches_per_step"]), body,
                            [tf.constant(0, tf.float32)], valid_iterator)
    if (opts['total_replicas'] * opts['shards'] > 1
            and not opts.get('inference', False)):
        accuracy = (cross_replica_ops.cross_replica_sum(accuracy) /
                    (opts['total_replicas'] * opts['shards']))
    return accuracy

def comp_fn_validate():
    def body(uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen):
        prob, loss_total, _, accuracy, _ = graph_builder(
            opts, uid_embedding, mid_embedding, cat_embedding,
            placeholders['learning_rate'],
            uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen,
            use_negsampling=False)
        outfeed_op = outfeed_queue.enqueue((prob, target, accuracy))
        return outfeed_op

    return loops.repeat(opts['batches_per_step'], body, [], infeed_val)

def my_net(v):
    r = loops.repeat(2, body, inputs=[v])
    return r

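# For context, a self-contained sketch of the loops.repeat pattern that all
# the snippets in this section share, in the style of the Graphcore TF1
# docs. The body, the doubling step and the constant input are illustrative.
import tensorflow as tf
from tensorflow.python.ipu import ipu_compiler, loops, utils
from tensorflow.python.ipu.scopes import ipu_scope

def example_body(v):
    # One loop iteration: whatever body returns becomes the next loop state.
    return v * 2.0

def example_net(v):
    return loops.repeat(2, example_body, inputs=[v])

with ipu_scope('/device:IPU:0'):
    result = ipu_compiler.compile(example_net, inputs=[tf.constant(1.0)])

cfg = utils.create_ipu_config()
cfg = utils.auto_select_ipus(cfg, 1)
utils.configure_ipu_system(cfg)

with tf.Session() as sess:
    print(sess.run(result))  # [4.0]
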
def loop_builder(iterations, builder_func, infeed):
    return loops.repeat(iterations, builder_func, [], infeed)

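# A hypothetical call site for the loop_builder helper above; the
# training_step, iterations_per_step and infeed_queue names are assumed.
from tensorflow.python.ipu import ipu_compiler
from tensorflow.python.ipu.scopes import ipu_scope

def compiled_training():
    return loop_builder(iterations_per_step, training_step, infeed_queue)

with ipu_scope('/device:IPU:0'):
    train_op = ipu_compiler.compile(compiled_training, inputs=[])
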
def my_net():
    r = loops.repeat(100, body, [], infeed_queue)
    return r

def my_net():
    r = loops.repeat(REPEATS, body, [], infeed_queue)
    return r

def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training."""
    # Set up in-feeds for the data.
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        items = next(data_generator)
        output_types = tuple(tf.dtypes.as_dtype(i.dtype) for i in items)
        output_shapes = tuple(tf.TensorShape(i.shape) for i in items)
        total_bytes = 0
        for i in items:
            total_bytes += i.nbytes
        print(f'Input data size = {total_bytes / 1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile the loss op.
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))

    # Optionally set up the report op.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up the session on the IPU.
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables.
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time it.
    total_time = 0.0
    total_samples = 0
    # Initially the infeed may buffer extra input data, and the first IPU run
    # includes the XLA compile, so skip these iterations when computing
    # items/sec.
    skip_iterations = 5
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # Extract the text report.
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step *
                              replication_factor)
            print("Average %.1f items/sec" % (total_samples / total_time))

def my_net():
    r = loops.repeat(self.repeat_count, model_func, [], infeed_queue)
    return r

def model():
    return loops.repeat(2, body, inputs=[1.0])

def my_net():
    loss = tf.constant(0.0, shape=[])
    r = loops.repeat(num_iterations, training, [loss], infeed_queue)
    return r

def compiled_fn():
    return loops.repeat(iterations_per_step,
                        partial(training_step_loop, outfeed=outfeed),
                        [], train_iterator)

def infer(self):
    with tf.device("cpu"):
        dataset, infeed_queue, data_init, vocab = self._build_dataset()
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed")
        if self.host_embeddings:
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding")
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding")

    def build_common(src_embedding, tgt_embedding, source):
        input_, encoder_outputs, encoder_state = self._build_encoder(
            src_embedding, source)
        samples, logits = self._build_decoder(encoder_outputs, encoder_state,
                                              tgt_embedding, None,
                                              train=False)
        outfeed = outfeed_queue.enqueue({"samples": samples})
        return outfeed

    def build_infer(source):
        src_embedding = Nmt._build_embedding(
            self.src_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="source_embedding")
        tgt_embedding = Nmt._build_embedding(
            self.tgt_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="tgt_embedding")
        return build_common(src_embedding, tgt_embedding, source)

    def build_infer_host_embeddings(source):
        nonlocal src_embedding, tgt_embedding
        return build_common(src_embedding, tgt_embedding, source)

    with ipu_scope("/device:IPU:0"):
        build = (build_infer_host_embeddings
                 if self.host_embeddings else build_infer)
        batch = ipu_compiler.compile(
            lambda: loops.repeat(1, build, infeed_queue=infeed_queue,
                                 inputs=[]))

    # Create a restoring object.
    saver = tf.train.Saver()

    ipu_options = util.get_config(report_n=0)
    utils.configure_ipu_system(ipu_options)
    session = tf.Session()
    checkpoint = CHECKPOINT_FILE + ("host_ckpt" if self.opts.host_embeddings
                                    else "ckpt")
    saver.restore(session, checkpoint)
    session.run(data_init)
    if self.host_embeddings:
        batch = [batch,
                 src_embedding(1, 1, False),
                 tgt_embedding(1, 1, False)]
    result_queue = outfeed_queue.dequeue()

    # Run once to force graph compilation.
    session.run(batch)
    result = session.run(result_queue)
    predictions = result["samples"]
    print_data(self.generator.query, vocab[0], predictions, vocab[1])

    while True:
        session.run(batch)
        result = session.run(result_queue)
        predictions = result["samples"]
        print_data(self.generator.query, vocab[0], predictions, vocab[1])
        if not self.opts.interact:
            break

def getImages(numPoints):
    r = loops.repeat(numPoints, body, [])
    return r

def my_net(i, w):
    loss = array_ops.constant(0.0, shape=[])
    r = loops.repeat(num_iterations, training, [loss, i, w])
    return r

def my_net():
    count = 0
    count = loops.repeat(10, body, [count], infeed_queue)
    return count

def training_loop():
    return loops.repeat(opts.num_iters, model, infeed_queue=infeed)

def training_loop_FULL(numPoints):
    out = loops.repeat(numPoints, train_model, infeed_queue=infeed_GAN)
    return out

def train(self):
    with tf.device("cpu"):
        dataset, infeed_queue, data_init, vocab = self._build_dataset()
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed")
        if self.host_embeddings:
            src_embedding = Nmt._build_embedding(
                self.src_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="source_embedding")
            tgt_embedding = Nmt._build_embedding(
                self.tgt_vocab_size,
                self.opts.embedding_size,
                self.opts.host_embeddings,
                name="tgt_embedding")

    def build_common(src_embedding, tgt_embedding, source, target, label,
                     mask):
        nonlocal outfeed_queue
        input_, encoder_outputs, encoder_state = self._build_encoder(
            src_embedding, source)
        samples, logits = self._build_decoder(encoder_outputs, encoder_state,
                                              tgt_embedding, target,
                                              train=True)
        loss = self._build_optimiser(logits, label, mask)
        outfeed = outfeed_queue.enqueue({"loss": loss, "logits": logits})
        return outfeed

    def build_train(source, target, label, mask):
        src_embedding = Nmt._build_embedding(
            self.src_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="source_embedding")
        tgt_embedding = Nmt._build_embedding(
            self.tgt_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="tgt_embedding")
        return build_common(src_embedding, tgt_embedding, source, target,
                            label, mask)

    def build_train_host_embeddings(source, target, label, mask):
        nonlocal src_embedding, tgt_embedding
        return build_common(src_embedding, tgt_embedding, source, target,
                            label, mask)

    with ipu_scope("/device:IPU:0"):
        build = (build_train_host_embeddings
                 if self.host_embeddings else build_train)
        batch = ipu_compiler.compile(
            lambda: loops.repeat(self.opts.batches_per_step, build,
                                 infeed_queue=infeed_queue, inputs=[]))

    # Create a restoring object.
    saver = tf.train.Saver()
    if self.opts.save_graph:
        # Dump the graph to a logdir.
        writer = tf.summary.FileWriter(
            os.path.join("./logs", "NMT",
                         time.strftime("%Y%m%d_%H%M%S_%Z")))
        writer.add_graph(tf.get_default_graph())

    ipu_options = util.get_config(report_n=0)
    utils.configure_ipu_system(ipu_options)
    session = tf.Session()
    checkpoint = CHECKPOINT_FILE + ("host_ckpt" if self.opts.host_embeddings
                                    else "ckpt")
    if self.opts.ckpt:
        saver.restore(session, checkpoint)
    else:
        utils.move_variable_initialization_to_cpu()
        session.run(tf.global_variables_initializer())
    session.run(data_init)
    print("Init done.")

    if self.host_embeddings:
        batch = [batch,
                 src_embedding(self.opts.batches_per_step, 1),
                 tgt_embedding(self.opts.batches_per_step, 1)]
    result_queue = outfeed_queue.dequeue()
    session.run(batch)  # Warmup
    best_loss = float("Inf")
    for e in range(self.opts.iterations):
        start = time.time()
        session.run(batch)
        result = session.run(result_queue)
        losses = result["loss"]
        avg_loss = np.mean(losses)
        duration = (time.time() - start) / self.opts.batches_per_step
        print("Step: {:>5}. Average Loss {:.3}. Items/sec {:.4}. "
              "Tokens/sec {}".format(
                  e + 1,
                  avg_loss,
                  self.opts.batch_size / duration,
                  self.opts.batch_size * (self.src_length + self.tgt_length)
                  / duration))
        if avg_loss < best_loss:
            best_loss = avg_loss
            saver.save(session, checkpoint)

def run(benchmark, opts):
    """Run the benchmark.

    benchmark - an instance of Benchmark
    opts      - namespace from argparse generated by parse_opts
    """
    with ipu_scope('/device:IPU:0'):
        # Build the graph.
        with tf.device('cpu'):
            dataset = tf.data.Dataset \
                .range((opts.steps + 2) * opts.batches_per_step) \
                .map(lambda i: benchmark.inputs(opts, i)) \
                .prefetch(opts.batches_per_step)

        if opts.batches_per_step > 1:
            with tf.device('cpu'):
                infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                    dataset, feed_name="benchmark_dataset_infeed")
                data_init = infeed_queue.initializer

            with tf.Graph().as_default():
                # Build once in a throwaway graph just to get the output
                # shape and dtype.
                dummy_opts = copy.deepcopy(opts)
                dummy_opts.shards = 1
                d = benchmark.inputs(dummy_opts, tf.constant(0))
                out = benchmark.graph_builder(dummy_opts, d)
            input = tf.constant(0, out.dtype, shape=out.shape)

            def body(inout, *args, **kwargs):
                # Infeed tensors arrive as keyword arguments when the
                # dataset yields a dict.
                with tf.control_dependencies([inout]):
                    with tf.variable_scope("MainGraph"):
                        out = benchmark.graph_builder(opts, kwargs)
                return out

            out = ipu_compiler.compile(
                lambda: loops.repeat(opts.batches_per_step, body, [input],
                                     infeed_queue), [])
        else:
            with tf.device('cpu'):
                data_tensors = dataset.make_one_shot_iterator().get_next()
                data_init = tf.no_op()

            out = ipu_compiler.compile(
                lambda: benchmark.graph_builder(opts, data_tensors), [])
            opts.batches_per_step = 1

        # Report op.
        report = gen_ipu_ops.ipu_event_trace()

        # Dump the graph to a logdir.
        if opts.save_graph:
            writer = tf.summary.FileWriter(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'logs', time.strftime('%Y%m%d_%H%M%S_%Z')))
            writer.add_graph(tf.get_default_graph())

        utils.configure_ipu_system(get_config(opts))

        with tf.Session() as sess:
            # Setup.
            sess.run([benchmark.initializer(), data_init])
            sess.run(report)

            # Warmup.
            print("Compiling and Warmup...")
            start = time.time()
            sess.run(out)
            duration = time.time() - start
            print("Duration: {:.3f} seconds\n".format(duration))

            # Cycle report.
            if opts.cycle_report:
                rep = sess.run(report)
                # Only run once if producing a cycle report.
                return extract_runtimes_from_report(rep, opts, display=True)

            print("Executing...")
            average_batches_per_sec = 0
            for i in range(opts.steps):
                # Run one timed step.
                start = time.time()
                sess.run(out)
                duration = time.time() - start

                average_batches_per_sec += ((opts.batches_per_step /
                                             duration) / opts.steps)
                report_string = "{:<7.3} sec/itr.".format(duration)
                report_string += " " + benchmark.iteration_report(opts,
                                                                  duration)
                print(report_string)

            return average_batches_per_sec

def training_loop_test():
    out = loops.repeat(1, test_model)
    return out