def make_and_run_on_device_benchmark(opts, train=True):
    """Time an IPU loop that only dequeues batches from the dataset infeed.

    The compiled program repeats a no-op body ``opts.repeat_count`` times per
    run. One untimed run triggers compilation; the remaining runs are timed
    and the mean duration is converted into a tokens/s and GB/s figure that
    is written to the log.

    Args:
        opts: Options object; ``batch_size``, ``repeat_count``, ``epochs`` and
            ``source_sequence_length`` are read here.
        train: Forwarded to ``make_dataset`` as ``training`` and used to label
            the log message.
    """
    split_name = "training" if train else "test"
    logging.info(f"Creating the {split_name} benchmark for running with a device")

    graph = tf.Graph()
    with graph.as_default():
        ds, num_ds, *_ = make_dataset(opts, use_synthetic_data=False, training=train)
        # Convert the element count into a number of whole batches.
        num_ds = num_ds // opts.batch_size
        infeed = ipu_infeed_queue.IPUInfeedQueue(ds)

        def empty_loop():
            def empty_body(data_infeed):
                # The dequeued data is deliberately ignored.
                return tf.no_op()

            return ipu.loops.repeat(opts.repeat_count, empty_body, [], infeed)

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            benchmark_op = ipu.ipu_compiler.compile(empty_loop, inputs=[])

    # NOTE(review): infeed.initializer is never run in this function -
    # confirm that this is intended for the on-device measurement.
    with tf.Session(graph=graph) as sess:
        # run a first un-monitored epoch to force compile
        sess.run(benchmark_op)

        durations = []
        for _ in range(opts.epochs):
            for _ in tqdm.tqdm(range(num_ds // opts.repeat_count)):
                started = time.perf_counter()
                sess.run(benchmark_op)
                finished = time.perf_counter()
                durations.append(finished - started)

        mean_duration = np.mean(durations)
        tokens_per_run = opts.source_sequence_length * opts.batch_size * opts.repeat_count
        token_throughput = tokens_per_run / mean_duration
        # 4 bytes per token, reported in units of 2**30 bytes.
        bytes_throughput = token_throughput * 4 / (2**30)
        logging.info(f"On device throughput: {token_throughput:0.2f} tokens/s = {bytes_throughput:0.2f} GB/s")
def training_graph(opts, training_data):
    """Build the training graph, its session and the summary writer.

    The IPU program runs ``opts.batches_per_step`` training batches per
    session call, accumulating the loss and RMSE metric, which are averaged
    over the step afterwards.

    Args:
        opts: Options object (``batches_per_step`` and ``logs_path`` are read
            here; it is also forwarded to the dataset, graph builder and
            ``util.get_config``).
        training_data: Object whose ``get_dataset(opts, is_training=True)``
            returns ``(dataset, iterator, placeholders)``.

    Returns:
        A ``GraphOps`` holding the graph, session, init op, the fetch list
        ``[loss, train_summary, rmse]``, placeholders, infeed, saver and
        summary writer.
    """
    graph = tf.Graph()
    with graph.as_default():
        dataset, _, placeholders = training_data.get_dataset(opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args, **kwargs):
                    # The first dequeued tensor holds the observed ratings.
                    observed_ratings = args[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        learning_rate=placeholders["learning_rate"],
                        type='TRAIN')
                    # Make the accumulated outputs depend on the weight update.
                    with tf.control_dependencies([apply_grads_]):
                        return total_loss_ + loss, sum_rmse_metric + rmse_metric

                initial_sums = [tf.constant(0, tf.float32),
                                tf.constant(0, tf.float32)]
                return loops.repeat(opts.batches_per_step, body, initial_sums, infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        # Turn the per-step sums into per-batch averages.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar("RMSE/train", rmse)
        train_summary = tf.summary.merge_all()

        train_saver = tf.train.Saver()
        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(opts.logs_path + '/train',
                                         graph=graph,
                                         flush_secs=30)

    ipu_options = util.get_config(opts)
    ipu_options.configure_ipu_system()

    train_sess = tf.Session(graph=graph)
    fetches = [loss, train_summary, rmse]
    return GraphOps(graph, train_sess, train_init, fetches, placeholders,
                    infeed, train_saver, train_writer)
def _build_dataset(self):
    """Create the data loader and the batched IPU infeed queue.

    Unless synthetic data is requested, the dictionary file is read first:
    each line is split on single spaces and ``self.char_dict`` maps the
    integer in the second field to the text in the first field. The loader
    output is wrapped in a ``tf.data`` generator dataset, batched with
    ``drop_remainder=True`` and stored as ``self.infeed_queue``.
    """
    cfg = self.config
    synthetic = cfg['use_synthetic_data']

    if not synthetic:
        with open(cfg['dict_path'], 'r') as dict_file:
            for line in dict_file:
                fields = line.strip().split(' ')
                self.char_dict[int(fields[1])] = fields[0]

    self.data_loader = Dataloader(
        cfg['data_path'],
        cfg['maxlen_in'],
        cfg['maxlen_tgt'],
        cfg['vocab_size'],
        cfg['fbank_size'],
        training=self.training,
        dtype=cfg['dtype'],
        use_synthetic_data=synthetic)
    self.data_loader.load_data()

    # Four tensors per element: a [maxlen_in, 83, 1] tensor, a scalar, a
    # [maxlen_tgt] vector and another scalar.
    # NOTE(review): 83 is hard-coded while 'fbank_size' is passed to the
    # loader - presumably they must agree; confirm.
    element_types = (self.dtype, tf.int32, tf.int32, tf.int32)
    element_shapes = (
        tf.TensorShape([cfg['maxlen_in'], 83, 1]),
        tf.TensorShape([]),
        tf.TensorShape([cfg['maxlen_tgt']]),
        tf.TensorShape([]),
    )

    batched = tf.data.Dataset.from_generator(
        self.data_loader, element_types, output_shapes=element_shapes
    ).batch(cfg['batch_size'], drop_remainder=True)

    self.infeed_queue = ipu_infeed_queue.IPUInfeedQueue(batched, prefetch_depth=15)
def _build_dataset(self):
    """Create the generator-backed dataset and its IPU infeed queue.

    The validation CSV is used when ``self.opts.infer`` is set, otherwise the
    training CSV. One element is drawn from the generator to discover the
    dtype and shape of every feature.

    Side effects:
        Sets ``self.start_id``, ``self.end_id`` and ``self.generator``.

    Returns:
        Tuple ``(dataset, infeed_queue, data_init, vocab)`` where ``data_init``
        is the infeed initializer and ``vocab`` is
        ``(self.input_vocab, self.output_vocab)``.
    """
    self.start_id = start_id(self.output_vocab)
    self.end_id = end_id(self.output_vocab)

    if self.opts.infer:
        data_file = "./data/validation.csv"
    else:
        data_file = "./data/training.csv"

    data = Data(data_file, self.input_vocab, self.output_vocab)
    data.load()
    transform(data)

    vocab = (self.input_vocab, self.output_vocab)
    self.generator = DataGenerator(data, vocab, self.opts, self.start_id,
                                   self.end_id)

    # Probe a single element so the dataset signature matches the generator.
    # NOTE(review): this consumes one element from self.generator - confirm
    # the generator restarts when tf.data calls it.
    sample = next(self.generator)
    output_types = {key: tf.dtypes.as_dtype(sample[key].dtype) for key in sample}
    output_shapes = {key: tf.TensorShape(sample[key].shape) for key in sample}

    dataset = tf.data.Dataset.from_generator(self.generator,
                                             output_types=output_types,
                                             output_shapes=output_shapes)
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                   "InfeedQueue",
                                                   replication_factor=1)
    data_init = infeed_queue.initializer
    return dataset, infeed_queue, data_init, vocab
def testPipelineIterationsNotMultiple(self):
    """A grouped 3-stage pipeline with depth 10 must be rejected at run time.

    The test expects a ``FailedPreconditionError`` stating that the pipeline
    depth has to be a multiple of 3.
    """
    def dataset_parser(value):
        return {"a": value, "b": (value + 10.) / 2.0}

    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)
    dataset = dataset.map(dataset_parser)

    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

    def stage1(c, **kwargs):
        # The infeed elements arrive as keyword arguments "a" and "b".
        with variable_scope.variable_scope("vs", use_resource=True):
            conv = layers.Conv2D(
                2,
                1,
                use_bias=True,
                kernel_initializer=init_ops.ones_initializer(),
                name='conv1')
            y = conv(kwargs["a"])
            return y + kwargs["b"], c

    def stage2(x, c):
        return math_ops.reduce_sum(x) + c

    def stage3(x):
        return x

    def my_net(c):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            10,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

    with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

    with tu.ipu_session() as sess:
        with ops.device("/device:IPU:0"):
            compiled_net = ipu_compiler.compile(my_net, inputs=[c])

        cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
        cfg = utils.auto_select_ipus(cfg, 4)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        sess.run(variables.global_variables_initializer())
        sess.run(infeed_queue.initializer)

        expected_message = 'The pipeline depth of the pipeline must be a multiple of 3'
        with self.assertRaisesRegex(errors.FailedPreconditionError,
                                    expected_message):
            sess.run(compiled_net, {c: 10.01})
def validation_graph(model, opts):
    """Build the validation graph, configure the IPU system and open a session.

    Validation runs data-parallel over ``opts['replicas'] * opts['shards']``
    replicas: that product is used as the infeed replication factor and as
    ``number_of_replicas`` in the IPU configuration (with ``shards=1``). Each
    replica averages the accuracy over ``opts['validation_batches_per_step']``
    batches; the per-replica values are then averaged across replicas and
    scaled to a percentage.

    Args:
        model: Model object forwarded to ``validation_graph_builder``.
        opts: Options dictionary.

    Returns:
        A ``train.GraphOps`` holding the graph, session, init op, the fetch
        list ``[accuracy]``, the infeed queue and the saver.
    """
    # Total number of model copies used for validation.
    num_replicas = opts['replicas'] * opts['shards']

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=num_replicas)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)

                # Combine the replicas whenever more than one is in use. The
                # guard uses the same replica count as the infeed and the
                # divisor, so replicas == 1 with shards > 1 is covered too.
                if num_replicas > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / num_replicas
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()
        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    # A global available-memory proportion is only passed on when exactly one
    # value was supplied.
    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=num_replicas,
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])
    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
    """Run ``fwd_fn`` in an IPU repeat loop with gradient accumulation.

    Every iteration enqueues the loss on an outfeed and minimises it through
    a ``GradientAccumulationOptimizerV2`` wrapping ``optimizer``.

    Args:
        test_wrapper: Object providing ``test_session(graph=...)``.
        fwd_fn: Callable returning the loss for one iteration.
        inputs_fn: Callable creating the loop input tensors.
        input_values: Values fed for the tensors returned by ``inputs_fn``.
        repeat_count: With ``num_batches_to_accumulate``, determines the
            default number of iterations.
        num_batches_to_accumulate: Batches accumulated per weight update.
        dataset_fn: Callable creating the infeed dataset.
        optimizer: Optimizer to wrap.
        num_iterations: Loop trip count; defaults to
            ``repeat_count * num_batches_to_accumulate``.

    Returns:
        The result of dequeuing the outfeed after the loop has run.
    """
    if num_iterations is None:
        num_iterations = repeat_count * num_batches_to_accumulate

    graph = ops.Graph()
    with graph.as_default(), test_wrapper.test_session(graph=graph) as session:
        dataset = dataset_fn()
        inputs = inputs_fn()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

        with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

            def model(*args):
                loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
                enqueue_op = outfeed_queue.enqueue(loss)
                accumulating_opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
                    optimizer, num_batches_to_accumulate)
                # Carry over only the loop inputs; the trailing arguments are
                # the elements dequeued from the infeed.
                num_loop_inputs = len(args) - infeed_queue.number_of_tuple_elements
                outs = list(args[:num_loop_inputs])
                outs.append(enqueue_op)
                outs.append(accumulating_opt.minimize(loss))
                return outs

            def my_net(*args):
                return loops.repeat(num_iterations,
                                    model,
                                    inputs=args,
                                    infeed_queue=infeed_queue)

        with ops.device("/device:IPU:0"):
            loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

        outfeed_op = outfeed_queue.dequeue()

        # Profiling is only enabled when running on the IPU model.
        profiling = utils.running_on_ipu_model()
        cfg = utils.create_ipu_config(profiling=profiling,
                                      profile_execution=profiling)
        cfg = utils.set_ipu_model_options(cfg,
                                          compile_ipu_code=True,
                                          tiles_per_ipu=128)
        cfg = utils.auto_select_ipus(cfg, 1)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        session.run(variables.global_variables_initializer())
        session.run(infeed_queue.initializer)
        feed = dict(zip(inputs, input_values))
        session.run(loop_ret, feed_dict=feed)
        return session.run(outfeed_op)
def validation_graph(opts, valid_data):
    """Build the validation graph, its session and the summary writer.

    The graph is placed on IPU device ordinal 0 when ``opts.multiprocessing``
    is set and on ordinal 1 otherwise. The IPU system is configured here only
    in the multiprocessing case.

    Args:
        opts: Options object. ``opts.apply_dropout`` is set to ``False`` as a
            side effect.
        valid_data: Object whose ``get_dataset(opts, is_training=False)``
            returns a 3-tuple starting with the dataset.

    Returns:
        A ``GraphOps`` holding the graph, session, init op, the fetch list
        ``[rmse, valid_summary]``, the infeed, saver and summary writer.
    """
    # Do not apply dropout during validation
    opts.apply_dropout = False

    graph = tf.Graph()
    if opts.multiprocessing:
        tf_device_ordinal = 0
    else:
        tf_device_ordinal = 1

    with graph.as_default():
        dataset, _, _ = valid_data.get_dataset(opts, is_training=False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, device_ordinal=tf_device_ordinal)

        with ipu_scope('/device:IPU:{}'.format(tf_device_ordinal)):
            def comp_fn():
                def body(sum_rmse_metric, *args, **kwargs):
                    # The first dequeued tensor is split in two along axis 1:
                    # observed ratings and ground truth.
                    observed_ratings, ground_truth = tf.split(
                        args[0], num_or_size_splits=2, axis=1)
                    rmse_metric = graph_builder(opts,
                                                observed_ratings=observed_ratings,
                                                ground_truth=ground_truth,
                                                type='VALID')
                    return sum_rmse_metric + rmse_metric

                return loops.repeat(opts.validation_batches_per_step, body,
                                    [tf.constant(0, tf.float32)], infeed)

            (sum_rmse_metric,) = ipu_compiler.compile(comp_fn, [])

        # Accuracy Ops
        rmse = sum_rmse_metric / opts.validation_batches_per_step
        valid_summary = tf.summary.scalar("RMSE/validation", rmse)

        valid_saver = tf.train.Saver()
        ipu_utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_writer = tf.summary.FileWriter(opts.logs_path + '/valid',
                                         graph=graph,
                                         flush_secs=30)

    ipu_options = util.get_config(opts)
    if opts.multiprocessing:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=graph)
    fetches = [rmse, valid_summary]
    return GraphOps(graph, valid_sess, valid_init, fetches, None, infeed,
                    valid_saver, valid_writer)
def testResetSeed(self):
    """Every dropout output must be unique across replicas, repeats and runs.

    Two dropouts are applied per loop iteration on each replica. The test
    collects the raw bytes of every ``SIZE``-element result row and checks
    both the number of rows and the number of distinct rows against
    ``2 * REPLICAS * REPEATS * EXECS``.
    """
    # The dataset for feeding the graphs
    ds = dataset_ops.Dataset.from_tensors(
        array_ops.constant(1.0, shape=[SIZE]))
    ds = ds.map(lambda x: [x, x])
    ds = ds.repeat()

    # The host side queues
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        ds, feed_name="infeed", replication_factor=REPLICAS)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="outfeed", replication_factor=REPLICAS)

    # The device side
    def body(x1, x2):
        d1 = rand_ops.dropout(x1)
        d2 = rand_ops.dropout(x2)
        outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
        return outfeed

    def my_net():
        r = loops.repeat(REPEATS, body, [], infeed_queue)
        return r

    with scopes.ipu_scope('/device:IPU:0'):
        res = ipu_compiler.compile(my_net, inputs=[])

    # The outfeed dequeue has to happen after the outfeed enqueue
    dequeue_outfeed = outfeed_queue.dequeue()

    # Configure the hardware
    config = utils.create_ipu_config(profiling=True)
    config = utils.auto_select_ipus(config, REPLICAS)
    config = utils.set_floating_point_behaviour_options(config)
    utils.configure_ipu_system(config)

    with session.Session() as sess:
        res_all = set()
        total = 0

        sess.run(infeed_queue.initializer)

        for _ in range(EXECS):
            sess.run(res)
            outfed_result = sess.run(dequeue_outfeed)
            rows = np.array(list(outfed_result.values())).reshape([-1, SIZE])
            for r in rows:
                total += 1
                # tobytes() replaces the deprecated ndarray.tostring() alias
                # and yields the same bytes.
                res_all.add(r.tobytes())

        # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
        expected = 2 * REPLICAS * REPEATS * EXECS
        self.assertEqual(total, expected)
        self.assertEqual(len(res_all), expected)
def generic_train_graph(opts, is_training):
    """Build the training graph, configure the IPU system and open a session.

    The IPU program runs ``opts['batches_per_step']`` batches per session
    call and returns the loss, auxiliary loss and accuracy averaged over
    those batches.

    Args:
        opts: Options dictionary.
        is_training: Forwarded to ``id_embedding``.

    Returns:
        Tuple ``(graph_ops, uid_embedding, mid_embedding, cat_embedding)``
        where ``graph_ops`` is a ``GraphOps`` with the session, init op, the
        three averaged metrics, placeholders, infeed, ``None`` as outfeed and
        the saver.
    """
    data_type = 'float32'
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(data_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=(opts['replicas']))

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    # Make the accumulated outputs depend on the update op.
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                # A single zero constant seeds all three accumulators.
                zero = tf.constant(0, np.float32)
                return loops.repeat(opts['batches_per_step'], body,
                                    [zero] * 3, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                total / opts['batches_per_step'] for total in outputs_train
            ]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        # NOTE(review): this replaces any existing TF_POPLAR_FLAGS value.
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=graph)
    graph_ops = GraphOps(sess, init, ops_train, placeholders, infeed_train,
                         outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def training_graph(model, opts, iterations_per_step=1):
    """Build the training graph, configure the IPU system and open a session.

    Args:
        model: Model object forwarded to
            ``training_step_with_infeeds_and_outfeeds``.
        opts: Options dictionary.
        iterations_per_step: Forwarded to the training-step builder.

    Returns:
        A ``GraphOps`` holding the graph, session, init op, the fetch list
        ``[train]``, placeholders, the infeed queue, the outfeed dequeue op
        and the saver.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():
        # NOTE: the learning-rate placeholder has always been float32 in
        # practice. The former selection compared
        # `opts["precision"].split('.')` (a list) with the string '16', which
        # can never be equal, so its float16 branch was unreachable. float32
        # is kept explicitly so existing training numerics are unchanged.
        learning_rate = tf.placeholder(tf.float32, shape=[])
        placeholders = {'learning_rate': learning_rate}

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    # A global available-memory proportion is only passed on when exactly one
    # value was supplied.
    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver)
def train():
    """Compile a single-iteration IPU loop around ``model2`` and run it.

    Each loop iteration dequeues one element from the infeed (which is
    ignored), evaluates ``model2`` with the fed number of time steps and
    enqueues the result under the key ``'z3'``. The dequeued outfeed contents
    are printed.
    """
    graph = tf.Graph()
    with graph.as_default():
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])

        with ipu_scope('/device:IPU:0'):
            def compile_fn():
                def body(x, y):
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                return loops.repeat(1, body, [], infeed_queue)

            outputs = ipu_compiler.compile(compile_fn, [])

        dequeue_outfeed = outfeed_queue.dequeue()

        # The initializer must be created after compilation:
        # tf.global_variables_initializer() only covers variables that
        # already exist, and any variables of the model are created while
        # compile_fn is traced.
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    for _ in range(steps):
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
        # NOTE(review): the unconditional break limits the run to a single
        # step although `steps` is 6 - presumably a debugging leftover;
        # confirm before removing it.
        break
def testSyntheticDataWithOutfeeds(self):
    """With synthetic data enabled, the dequeued outfeed must be empty.

    ``TF_POPLAR_FLAGS`` is patched for the duration of the test to add the
    IPU-model, synthetic-data and random-initializer flags to whatever value
    is already set.
    """
    poplar_flags = "".join([
        os.environ.get("TF_POPLAR_FLAGS", ""),
        " --use_ipu_model",
        " --use_synthetic_data",
        " --synthetic_data_initializer=random",
    ])

    with test.mock.patch.dict("os.environ",
                              {"TF_POPLAR_FLAGS": poplar_flags}):

        # The device side main. The queues are resolved when the network is
        # compiled, after they have been created below.
        def body(x1, x2):
            total = x1 + x2
            difference = x1 - x2
            return outfeed_queue.enqueue({'d1': total, 'd2': difference})

        def my_net():
            return loops.repeat(5, body, [], infeed_queue)

        with ops.device('cpu'):
            # The dataset for feeding the graphs
            ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[10]))
            ds = ds.map(lambda x: [x, x])
            ds = ds.repeat()

            # The host side queues
            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                ds, feed_name="infeed2")
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
                feed_name="outfeed2")

        with scopes.ipu_scope('/device:IPU:0'):
            run_loop = ipu_compiler.compile(my_net, inputs=[])

        # The outfeed dequeue has to happen after the outfeed enqueue
        dequeue_outfeed = outfeed_queue.dequeue()

        # Configure the hardware
        config = utils.create_ipu_config()
        config = utils.auto_select_ipus(config, 1)
        utils.configure_ipu_system(config)

        with tf.Session() as sess:
            sess.run(infeed_queue.initializer)
            sess.run(run_loop)
            result = sess.run(dequeue_outfeed)
            self.assertAllEqual(len(result['d1']), 0)
def test_optimizer(self):
    """SGD applied under a replicated IPUStrategy sums replica gradients.

    The infeed supplies the values ``[1.0, 2.0]`` as gradients to two
    replicas. After each step the variable is expected to have decreased by
    ``learning_rate * sum(data)``. Skipped on the IPU model target.
    """
    if ipu_utils.running_on_ipu_model():
        self.skipTest("Replicated top level graphs are not supported on the "
                      "IPU_MODEL target")

    strategy = ipu_strategy.IPUStrategy()
    report = tu.ReportJSON(self, eager_mode=True, replicated=True)
    report.reset()

    with strategy.scope():
        initial_variable = 2.0
        variable = variables.Variable(initial_variable)
        learning_rate = 0.5
        num_iterations = 3

        data = [1.0, 2.0]
        dataset = dataset_ops.Dataset.from_tensor_slices(data).repeat(
            num_iterations)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                 feed_name="feed",
                                                 replication_factor=2)

        optimizer = keras.optimizer_v2.gradient_descent.SGD(learning_rate)

        @def_function.function(experimental_compile=True)
        def apply_gradient():
            gradient = infeed._dequeue()  # pylint: disable=protected-access
            optimizer.apply_gradients([(gradient, variable)])

        # The optimizers in v2 will sum the gradients, and not average them.
        expected_gradient = np.sum(data)
        expected_variable = initial_variable

        # Accessing the property is what sets up the infeed here.
        infeed.initializer  # pylint: disable=pointless-statement

        for _ in range(num_iterations):
            strategy.experimental_run_v2(apply_gradient)
            expected_variable -= learning_rate * expected_gradient
            self.assertEqual(expected_variable, variable.numpy())
def validation_graph(model, opts):
    """Build the validation graph and open a session for it.

    Validation runs data-parallel over ``opts['replicas'] * opts['shards']``
    replicas (the infeed replication factor). Each replica averages the
    accuracy over ``opts['validation_batches_per_step']`` batches; the
    per-replica values are then averaged across replicas and scaled to a
    percentage.

    Args:
        model: Model object forwarded to ``validation_graph_builder``.
        opts: Options dictionary.

    Returns:
        A ``train.GraphOps`` holding the graph, session, init op, the fetch
        list ``[accuracy]``, the infeed queue and the saver.
    """
    # Total number of model copies used for validation.
    num_replicas = opts['replicas'] * opts['shards']

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=num_replicas)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)

                # Combine the replicas whenever more than one is in use. The
                # guard uses the same replica count as the infeed and the
                # divisor, so replicas == 1 with shards > 1 is covered too.
                if num_replicas > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / num_replicas
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()
        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver)
def run(benchmark, opts):
    '''
    Run the benchmark.

    benchmark - An instance of Benchmark
    opts - Namespace from argparse generated from parse_opts

    Returns the output of extract_runtimes_from_report when
    opts.cycle_report is set, otherwise the average number of batches
    per second over opts.steps timed runs.
    '''
    with ipu_scope('/device:IPU:0'):
        # Build graph
        with tf.device('cpu'):
            # Two extra steps' worth of elements on top of the timed steps.
            num_elements = (opts.steps + 2) * opts.batches_per_step
            dataset = tf.data.Dataset.range(num_elements)
            dataset = dataset.map(lambda i: benchmark.inputs(opts, i))
            dataset = dataset.prefetch(opts.batches_per_step)

        if opts.batches_per_step > 1:
            with tf.device('cpu'):
                infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                    dataset, feed_name="benchmark_dataset_infeed")
                data_init = infeed_queue.initializer

            # To get the shape and dtype, build the model once in a
            # throw-away graph with a single shard.
            with tf.Graph().as_default():
                dummy_opts = copy.deepcopy(opts)
                dummy_opts.shards = 1
                probe_inputs = benchmark.inputs(dummy_opts, tf.constant(0))
                probe_output = benchmark.graph_builder(dummy_opts, probe_inputs)

            loop_seed = tf.constant(0,
                                    probe_output.dtype,
                                    shape=probe_output.shape)

            def body(inout, *args, **kwargs):
                # Chain each iteration to the previous one's output.
                with tf.control_dependencies([inout]):
                    # Run graph
                    with tf.variable_scope("MainGraph"):
                        step_output = benchmark.graph_builder(opts, kwargs)
                return step_output

            out = ipu_compiler.compile(
                lambda: loops.repeat(opts.batches_per_step, body,
                                     [loop_seed], infeed_queue), [])
        else:
            with tf.device('cpu'):
                data_tensors = dataset.make_one_shot_iterator().get_next()
                data_init = tf.no_op()
            out = ipu_compiler.compile(
                lambda: benchmark.graph_builder(opts, data_tensors), [])
            opts.batches_per_step = 1

    # Report
    report = gen_ipu_ops.ipu_event_trace()

    # Dump the graph to a logdir
    if opts.save_graph:
        script_dir = os.path.dirname(os.path.realpath(__file__))
        log_dir = os.path.join(script_dir, 'logs',
                               time.strftime('%Y%m%d_%H%M%S_%Z'))
        writer = tf.summary.FileWriter(log_dir)
        writer.add_graph(tf.get_default_graph())

    utils.configure_ipu_system(get_config(opts))

    with tf.Session() as sess:
        # Setup
        sess.run([benchmark.initializer(), data_init])
        sess.run(report)

        # Warmup
        print("Compiling and Warmup...")
        warmup_start = time.time()
        sess.run(out)
        warmup_duration = time.time() - warmup_start
        print("Duration: {:.3f} seconds\n".format(warmup_duration))

        # Cycle Report
        if opts.cycle_report:
            rep = sess.run(report)
            # Only run once if producing cycle report
            return extract_runtimes_from_report(rep, opts, display=True)

        print("Executing...")
        average_batches_per_sec = 0
        # steps
        for _ in range(opts.steps):
            # Run
            step_start = time.time()
            sess.run(out)
            duration = time.time() - step_start

            average_batches_per_sec += (opts.batches_per_step / duration) / opts.steps
            report_string = ("{:<7.3} sec/itr.".format(duration) + " " +
                             benchmark.iteration_report(opts, duration))
            print(report_string)

        return average_batches_per_sec
def training_graph(opts, training_data, device_index=0, learning_rate=0.001):
    """Build a per-device training graph, its session and summary writer.

    ``device_index`` is used to give the infeed and the summary directory a
    unique name. The IPU program runs ``opts.batches_per_step`` batches per
    session call and the summed loss and RMSE are averaged afterwards.

    Args:
        opts: Options object.
        training_data: Object whose ``get_dataset(opts, is_training=True)``
            returns ``(dataset, iterator, placeholders)``.
        device_index: Suffix for the infeed name and the log directory.
        learning_rate: Value recorded by the "learning_rate" summary.

    Returns:
        A ``GraphOps`` holding the graph, session, init op, the fetch list
        ``[loss, train_summary, rmse]``, placeholders, infeed, saver and
        summary writer.
    """
    graph = tf.Graph()
    with graph.as_default():
        dataset, _, placeholders = training_data.get_dataset(opts,
                                                             is_training=True)
        feed_name = "training_dataset_infeed{0}".format(device_index)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset, feed_name, 0)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args):
                    # The first dequeued tensor holds the observed ratings.
                    observed_ratings = args[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        learning_rate=placeholders["learning_rate"])
                    # Make the accumulated outputs depend on the weight update.
                    with tf.control_dependencies([apply_grads_]):
                        return total_loss_ + loss, sum_rmse_metric + rmse_metric

                initial_sums = [tf.constant(0, tf.float32),
                                tf.constant(0, tf.float32)]
                return loops.repeat(opts.batches_per_step, body, initial_sums,
                                    infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        # Turn the per-step sums into per-batch averages.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        # NOTE(review): this logs the `learning_rate` argument, not
        # placeholders["learning_rate"], which is what the graph builder
        # uses - confirm that is intended.
        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("RMSE/train", rmse)
        train_summary = tf.summary.merge_all()

        train_saver = tf.train.Saver()
        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(
        opts.logs_path + '/train{0}'.format(device_index),
        graph=graph,
        flush_secs=30)

    ipu_options = ipu_utils.create_ipu_config(profiling=False)
    ipu_options = ipu_utils.set_floating_point_behaviour_options(
        ipu_options,
        inv=opts.fp_exceptions,
        div0=opts.fp_exceptions,
        oflo=opts.fp_exceptions,
        esr=opts.prng,
        nanoo=True)
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, 1)
    ipu_utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=graph)
    fetches = [loss, train_summary, rmse]
    return GraphOps(graph, train_sess, train_init, fetches, placeholders,
                    infeed, train_saver, train_writer)
def create_poplar_exec(model, opts, poplar_exec_path):
    """Create graph and save it to the file.

    The validation loop is compiled with the application compile op, with
    variables frozen, and written to ``poplar_exec_path``. Weights are
    restored from a checkpoint when exactly one checkpoint file name is
    found; otherwise they are randomly initialised and a warning is printed.
    """
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        if opts['generated_data']:
            # create dummy dataset with images only
            image_shape = (opts['micro_batch_size'], opts['image_size'],
                           opts['image_size'], 3)
            dummy_image = np.zeros(image_shape, dtype=np.uint8)
            inference_dataset = tf.data.Dataset.from_tensors(
                {"image": dummy_image})
        else:
            # create dataset with images and labels
            inference_dataset = dataset.data(opts, is_training=False)

        # Wrap every element so the loop body receives it as `data_dict`.
        inference_dataset = inference_dataset.map(lambda x: {'data_dict': x})

        inference_infeed_iterator = ipu_infeed_queue.IPUInfeedQueue(
            inference_dataset, prefetch_depth=opts['prefetch_depth'])
        acc_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    return acc_queue.enqueue(accuracy)

                return loops.repeat(int(opts['validation_batches_per_step']),
                                    body, [], inference_infeed_iterator)

            filenames, _ = get_ckpt_filenames(opts)

            compile_op = application_compile_op.experimental_application_compile_op(
                comp_fn, output_path=poplar_exec_path, freeze_variables=True)

            outfeed = acc_queue.dequeue()
            valid_saver = tf.train.Saver()
            ipu.utils.move_variable_initialization_to_cpu()

        with tf.Session(graph=valid_graph, config=tf.ConfigProto()) as sess:
            if len(filenames) == 1:
                print("Restoring from a snapshot: ", filenames[0])
                sess.run(inference_infeed_iterator.initializer)
                init = tf.global_variables_initializer()
                sess.run(init)
                valid_saver.restore(sess, filenames[0])
            else:
                print(
                    "Warning: no restore point found - randomly initialising weights instead"
                )
                init = tf.global_variables_initializer()
                sess.run(init)

            # Running the compile op yields the path of the executable.
            path = sess.run(compile_op)
            print(f"Poplar executable: {path}")

        valid_graph.finalize()
def run_model(opts):
    """Train and/or test the MNIST model on the IPU using infeeds/outfeeds.

    The dataset is loaded through Keras, flattened and fed via placeholders
    into one shared ``tf.data`` pipeline that backs both the training and the
    test infeed. The infeed is benchmarked first. Training then runs when
    ``opts.test_mode`` is "all" or "training", and testing runs when it is
    "all" or "tests".

    Args:
        opts: Options object; only ``test_mode`` is read here.

    Raises:
        ValueError: If the number of IPU steps per epoch does not divide the
            number of batches per epoch.
    """
    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = 16
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    epochs = 5
    ipu_steps_per_epoch = 15
    batches_per_epoch = num_train // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // ipu_steps_per_epoch
    if batches_per_epoch % ipu_steps_per_epoch != 0:
        raise ValueError(f"IPU steps per epoch {ipu_steps_per_epoch} must divide batches per epoch {batches_per_epoch}.")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32, shape=data_shape, name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.cache().repeat().batch(batch_size, drop_remainder=True)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(model, lr_placeholder, outfeed_train_queue, True)
    bound_train_loop = partial(
        loop_builder, batches_per_step, bound_train_model, infeed_train_queue)
    bound_test_model = partial(model, lr_placeholder, outfeed_test_queue, False)
    bound_test_loop = partial(loop_builder, test_batches,
                              bound_test_model, infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(
            tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_train_outfeed = outfeed_train_queue.dequeue()
    dequeue_test_outfeed = outfeed_test_queue.dequeue()

    # Create a benchmark program for the infeed to determine maximum achievable throughput:
    infeed_perf = dataset_benchmark.infeed_benchmark(
        infeed_train_queue, epochs, num_train, True)

    print(f"\nImage shape: {image_shape} Training examples: {num_train} Test examples: {num_test}")
    print(f"Epochs: {epochs} Batch-size: {batch_size} Steps-per-epoch: {ipu_steps_per_epoch} Batches-per-step: {batches_per_step}")

    # Run the model:
    with tf.Session() as sess:
        print("Benchmarking the infeed...")
        sess.run(infeed_perf, feed_dict={place_x: x_train_flat, place_y: y_train})

        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer, feed_dict={
                 place_x: x_train_flat, place_y: y_train})

        if opts.test_mode in ["all", "training"]:
            print("Training...")
            progress = tqdm(
                range(epochs), bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                sess.run(metrics_initializer)
                for _ in range(ipu_steps_per_epoch):
                    sess.run(train_loop, feed_dict={lr_placeholder: scheduler(e)})
                    result = sess.run(dequeue_train_outfeed)
                    # Only update the description when the outfeed actually
                    # returned values for both metrics.
                    if len(result['mean_loss']) != 0 and len(result['acc']) != 0:
                        progress.set_description(f"Loss {result['mean_loss'][0]:.5f} Accuracy {result['acc'][0]:.5f}")

            print("Saving...")
            saver.save(sess, "model")

        if opts.test_mode in ["all", "tests"]:
            print("Testing...")
            sess.run(metrics_initializer)
            # Re-initialise the shared dataset with the test split.
            sess.run(infeed_test_queue.initializer, feed_dict={
                     place_x: x_test_flat, place_y: y_test})
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = np.mean(result['mean_loss'])
            test_acc = np.mean(result['acc'])
            print(f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
from tensorflow.python.ipu.ops import normalization_ops BATCH_SIZE = 50 # load dataset train_images = np.random.normal(0, 1, (60000, 4)) print(np.shape(train_images)) train_images = train_images.reshape(train_images.shape[0], 1, 4).astype("float32") train_dataset = (tf.data.Dataset.from_tensor_slices(train_images).batch( BATCH_SIZE, drop_remainder=True).repeat(10)) infeed_GAN = ipu_infeed_queue.IPUInfeedQueue(train_dataset, feed_name="in_GAN") outfeed_FULL = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="out_FULL") outfeed_test = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="out_test") with tf.device("cpu"): numPoints = tf.placeholder(np.int32, shape=(), name="numPoints") from tensorflow.keras.layers import ( Input, Flatten, Dense, Reshape, Dropout, LeakyReLU,
if __name__ == "__main__":
    # Root logger: INFO level, timestamped records.
    logging.basicConfig(
        level=logging.getLevelName('INFO'),
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    # Read the command-line options.
    opts = parse_args()

    if not opts.on_device_only:
        logger.info("Creating training dataset, infeed queue and benchmark.")

        # Build the training dataset and wrap it in an IPU infeed queue.
        # The second value returned by make_dataset is divided by the batch
        # size before being handed to the benchmarks.
        train_set, num_train, *_ = make_dataset(
            opts, use_synthetic_data=False, training=True)
        num_train //= opts.batch_size
        infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_set)

        # Benchmark op for the infeed queue ...
        infeed_perf_train = dataset_benchmark.infeed_benchmark(
            infeed_queue=infeed_train_queue,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_train,
            print_stats=False,
        )
        # ... and for the bare tf.data pipeline.
        ds_perf_train = dataset_benchmark.dataset_benchmark(
            dataset=train_set,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_train,
            print_stats=False,
            apply_options=True,
        )

        logger.info("Creating test dataset, infeed queue and benchmark.")
        # Create test dataset
def generic_infer_graph(opts, is_training):
    """Build the validation graph, configure the IPU system and open a session.

    Args:
        opts: Mapping of options. Keys read here: 'use_synthetic_data',
            'replicas', 'batches_per_step' and 'use_ipu_model'.
        is_training: Forwarded to ``id_embedding`` only; the real dataset is
            always requested with ``is_training=False``.

    Returns:
        A tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.

    Note:
        ``seed`` is not a parameter; it is resolved from the enclosing
        (presumably module-level) scope.
    """
    lr_dtype = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(lr_dtype, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        # Pick the validation data source.
        dataset_val = (get_synthetic_dataset(opts)
                       if opts['use_synthetic_data']
                       else get_dataset_embed(opts, is_training=False))

        # Host-side feeds, both replicated opts['replicas'] times.
        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=(opts['replicas']))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen):
                    # Only the probabilities and accuracy are used; they are
                    # enqueued together with the targets for the host.
                    prob, _loss_total, _, accuracy, _ = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'],
                        uids, mids, cats, mid_his, cat_his, mid_mask,
                        target, seqlen,
                        use_negsampling=False)
                    return outfeed_queue.enqueue((prob, target, accuracy))

                return loops.repeat(
                    opts['batches_per_step'], body, [], infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    # Optionally target the IPU model rather than hardware.
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    # Build and apply the IPU system configuration.
    ipu_cfg = utils.create_ipu_config()
    ipu_cfg = utils.set_optimization_options(
        ipu_cfg, combine_embedding_lookups=True)
    ipu_cfg = utils.set_recomputation_options(ipu_cfg, allow_recompute=True)
    ipu_cfg = utils.auto_select_ipus(ipu_cfg, [opts['replicas']])
    utils.configure_ipu_system(ipu_cfg)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    sess = tf.compat.v1.Session(graph=infer_graph)
    graph_ops = GraphOps(sess, init, [outputs_val], placeholders,
                         infeed_val, outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
from tensorflow.python.ipu import ipu_compiler
from tensorflow.python.ipu import ipu_infeed_queue
from tensorflow.python.ipu import ipu_outfeed_queue
from tensorflow.python.ipu import loops
from tensorflow.python.ipu import scopes
from tensorflow.python.ipu import utils
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Input pipeline: one constant vector of 800 ones, duplicated into a pair
# by the map, repeated indefinitely.
ds = (
    tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[800]))
    .map(lambda x: [x, x])
    .repeat()
)

# Host-side queues feeding data to, and collecting results from, the device.
infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed")
outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed")


# The device side main
def body(x1, x2):
    """Enqueue the element-wise sum ('d1') and difference ('d2') of the inputs."""
    total = x1 + x2
    difference = x1 - x2
    return outfeed_queue.enqueue({'d1': total, 'd2': difference})


def my_net():
    """Run ``body`` ten times, pulling its inputs from ``infeed_queue``."""
    return loops.repeat(10, body, [], infeed_queue)
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training.

    Builds the infeed, compiles the training loop for the IPU, then runs it
    ``num_iter`` times while printing a running items/sec average.

    Args:
        replication_factor: Passed to the infeed queue and to
            ``auto_select_ipus``; also scales the sample count.
        batch_size: Passed to ``EnvGenerator``; also scales the sample count.
        batch_per_step: Repeat count of the on-device loop.
        profile: If truthy, an event-trace op is created, the report is
            written after the first iteration and the loop stops.
        num_iter: Number of ``sess.run`` iterations.
        time_steps: Passed to ``EnvGenerator``.

    Note:
        ``DTYPE``, ``build_train_op``, ``use_poplar_text_report`` and
        ``report_dest`` are not parameters; they are expected to be defined
        in the enclosing (presumably module-level) scope.
    """
    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        # One sample batch is drawn to discover dtypes, shapes and byte size.
        items = next(data_generator)
        output_types = tuple(tf.dtypes.as_dtype(i.dtype) for i in items)
        output_shapes = tuple(tf.TensorShape(i.shape) for i in items)
        total_bytes = sum(i.nbytes for i in items)
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))

    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # The session is released on every exit path (normal completion, the
    # profiling break, or an exception) instead of being left open.
    try:
        # Initialize variables
        utils.move_variable_initialization_to_cpu()
        sess.run([tf.global_variables_initializer(), data_init])

        # Run training and time
        total_time = 0.0
        total_samples = 0
        # Initially the infeed may buffer extra input data and the first run
        # for IPU includes XLA compile, so these iterations are skipped when
        # calculating items/sec.
        skip_iterations = 5
        for iters in range(num_iter):
            data_generator.reset_counter()
            t0 = time.perf_counter()
            sess.run(total_loss)
            t1 = time.perf_counter()

            if profile:
                raw_reports = sess.run(report)
                if use_poplar_text_report:
                    # extract the report
                    rep = utils.extract_all_strings_from_event_trace(raw_reports)
                    print("Writing profiling report to %s" % report_dest)
                    with open(report_dest, "w") as f:
                        f.write(rep)
                else:
                    os.makedirs('profile_rl', exist_ok=True)
                    save_tf_report(raw_reports, log_dir='profile_rl')
                    print("Writing profiling report to profile_rl")
                # Profiling only needs a single iteration.
                break

            if iters > skip_iterations:
                total_time += (t1 - t0)
                total_samples += (batch_size * batch_per_step * replication_factor)
                print("Average %.1f items/sec" % (total_samples / total_time))
    finally:
        sess.close()
def run_mnist(opts):
    """Train and/or test the fully connected (optionally sparse) MNIST model.

    Loads MNIST through Keras, builds IPU infeed/outfeed queues and compiled
    train/test loops, then runs the phases selected by ``opts.test_mode``
    ("all", "training" or "tests").

    Args:
        opts: Options object. Attributes read here: ``seed``, ``batch_size``,
            ``epochs``, ``steps_per_epoch``, ``droprate``, ``log``,
            ``densities``, ``checkpoint_path``, ``records_path``,
            ``test_mode`` and ``disable_pruning``.

    Raises:
        ValueError: If ``opts.steps_per_epoch`` does not divide the number of
            batches per epoch exactly.
    """
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(np.float32).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(np.float32).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    batches_per_epoch = num_train // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // opts.steps_per_epoch
    if not batches_per_epoch % opts.steps_per_epoch == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=tf.float32, shape=data_shape, name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(tf.float32, shape=[])
        for fc in fc_layers.values():
            fc.create_placeholders(tf.float32)

    # Create dataset and IPU feeds. Both infeed queues wrap the same
    # placeholder-backed dataset; which data flows is decided by the feed_dict
    # given when each queue's initializer is run.
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.shuffle(buffer_size=num_train, seed=opts.seed).cache()
    dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed_last_itr")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(model, fc_layers, opts.droprate,
                                lr_placeholder, batches_per_step, True,
                                outfeed_train_queue)
    bound_train_loop = partial(loop_builder, batches_per_step,
                               bound_train_model, infeed_train_queue)
    bound_test_model = partial(model, fc_layers, opts.droprate,
                               lr_placeholder, batches_per_step, False,
                               outfeed_test_queue)
    bound_test_loop = partial(loop_builder, test_batches, bound_test_model,
                              infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    # Merge the feeds needed for all layers:
    sparse_feed = {}
    for fc in fc_layers.values():
        sparse_feed.update(fc.feed_dict())

    # The log file (if requested) stays open for the whole run; the
    # try/finally guarantees it is closed and flushed on every exit path.
    log_file = None
    try:
        if opts.log:
            # Open log and write header fields:
            log_file = open(opts.log, 'w')
            d1, d2 = opts.densities
            log_file.write(f"Iteration Density_{d1}_{d2}\n")

        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
        # Not referenced again in this function, but kept: constructing the
        # writer is a side-effecting call on the checkpoint/log directory.
        summary_writer = tf.summary.FileWriter(logpath)

        if opts.records_path:
            # Save the first hidden layer's weight mask for later analysis:
            save_weights(opts, 'fc1', fc_layers['fc1'], 0)

        # Run the model:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(infeed_train_queue.initializer,
                     feed_dict={place_x: x_train_flat, place_y: y_train})

            # Must initialise the sparse layers separately:
            sess.run(update_representation, feed_dict=sparse_feed)

            if opts.test_mode in ["all", "training"]:
                logger.info(f"Training...")
                progress = tqdm(
                    range(opts.epochs),
                    bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
                for e in progress:
                    for i in range(opts.steps_per_epoch):
                        sess.run(metrics_initializer)
                        # Only need to feed an updated sparsity representation if we are running rig-L:
                        if not opts.disable_pruning:
                            sess.run(update_representation, feed_dict=sparse_feed)
                        sess.run(train_loop,
                                 feed_dict={lr_placeholder: scheduler(e, opts)})
                        last = sess.run(dequeue_train_outfeed)

                        steps = 1 + i + e * opts.steps_per_epoch
                        batches_processed = batches_per_step * steps
                        for name, fc in fc_layers.items():
                            if fc.is_sparse():
                                logger.info(
                                    f"Average weights for layer {name}: {np.mean(last[name+'_non_zeros'][0])}"
                                )
                                logger.info(
                                    f"Average momentum for layer {name} : {np.mean(last[name+'_momentum'][0])}"
                                )
                                if not opts.disable_pruning:
                                    logger.info(
                                        f"Starting prune and grow for layer {name}"
                                    )
                                    t0 = time.perf_counter()
                                    prune_and_grow(name, fc, last, random_gen,
                                                   steps, total_steps, opts)
                                    t1 = time.perf_counter()
                                    logger.info(
                                        f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                    )

                        if opts.log:
                            log_file.write(
                                f"{batches_processed} {last['acc'][0]}\n")
                        progress.set_description(
                            f"Loss {last['mean_loss'][0]:.5f} Accuracy {last['acc'][0]:.5f}"
                        )

                logger.info(f"Saving...")
                saver.save(sess, os.path.join(logpath, 'model.ckpt'))

            if opts.test_mode in ["all", "tests"]:
                test_feed = {}
                for fc in fc_layers.values():
                    test_feed.update(fc.feed_dict())

                logger.info(f"Testing...")
                sess.run(metrics_initializer)
                sess.run(infeed_test_queue.initializer,
                         feed_dict={place_x: x_test_flat, place_y: y_test})
                sess.run(test_loop, feed_dict=test_feed)
                result = sess.run(dequeue_test_outfeed)

                test_loss = result['mean_loss'][-1]
                test_acc = result['acc'][-1]
                logger.info(
                    f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")
    finally:
        if log_file is not None:
            log_file.close()
def run_mnist(opts):
    """Train and/or test the (optionally pipelined) sparse MNIST model.

    Loads MNIST through Keras, permutes the pixels, builds IPU feeds and
    compiled train/test loops, then runs the phases selected by
    ``opts.test_mode`` ("all", "training" or "tests").

    Args:
        opts: Options object. Attributes read here include ``pipelining``,
            ``gradient_accumulation_count``, ``seed``, ``batch_size``,
            ``data_type``, ``records_path``, ``batches_per_step_override``,
            ``steps_per_epoch``, ``epochs``, ``optimizer``, ``optimizer_arg``,
            ``disable_dense_grad_override``, ``log``, ``densities``,
            ``restore``, ``start_epoch``, ``checkpoint_path``,
            ``checkpoint_freq``, ``test_mode``, ``single_train_step_only``,
            ``disable_pruning`` and ``use_wandb``.

    Raises:
        ValueError: If pipelining is requested with fewer than 4 gradient
            accumulation steps, or if ``opts.steps_per_epoch`` does not divide
            the number of batches per epoch exactly.
    """
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset. Note that batch_size is the per-step
    # micro-batch: the requested batch divided by the accumulation count.
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)
    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)
    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]
    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size * opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)
    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch
    if not (batches_per_epoch % opts.steps_per_epoch) == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        # A fresh zip is produced on every call so the dataset can restart.
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))
        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()
        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)
    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    # The log file (if requested) stays open for the whole run; the
    # try/finally guarantees it is closed on every exit path, including the
    # early return taken for opts.single_train_step_only.
    log_file = None
    try:
        if opts.log:
            # Open log and write header fields:
            log_file = open(opts.log, 'w')
            d1, d2 = opts.densities
            log_file.write(f"Iteration Density_{d1}_{d2}\n")

        if opts.restore:
            logpath = os.path.join(opts.checkpoint_path, opts.restore)
        else:
            logpath = os.path.join(opts.checkpoint_path,
                                   datetime.now().strftime("%Y%m%d-%H%M%S"))
        # Not referenced again in this function, but kept: constructing the
        # writer is a side-effecting call on the checkpoint/log directory.
        summary_writer = tf.summary.FileWriter(logpath)

        if opts.records_path:
            # Save the first hidden layer's weight mask for later analysis:
            save_weights(opts, 'fc1', fc_layers['fc1'], 0)

        # Run the model:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(infeed_train_queue.initializer)

            if opts.restore:
                saver.restore(sess, logpath + '/model.ckpt')

            if opts.test_mode in ["all", "training"]:
                logger.info(f"Training...")
                start = opts.start_epoch if opts.restore else 0
                progress = tqdm(
                    range(start, opts.epochs),
                    bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
                for e in progress:
                    for i in range(opts.steps_per_epoch):
                        sess.run(metrics_initializer)

                        # Time one compiled step (batches_per_step batches).
                        t1 = time.perf_counter()
                        sess.run(train_loop,
                                 feed_dict={lr_placeholder: scheduler(e, opts)})
                        t2 = time.perf_counter()
                        sess_time = t2 - t1
                        batch_time = sess_time / batches_per_step
                        throughput = batch_size / batch_time
                        logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                    f"Time per batch: {batch_time:0.6f} "
                                    f"Throughput: {throughput}")

                        if opts.single_train_step_only:
                            return

                        train_outputs = sess.run(dequeue_train_outfeed)
                        if opts.pipelining:
                            train_outputs = train_outputs[-1]

                        # Get the last value for all items:
                        for k, v in train_outputs.items():
                            train_outputs[k] = v[-1]
                        logger.debug(f"Train outputs: {train_outputs.keys()}")

                        # Merge prune and grow fetches with last fetches:
                        # NOTE(review): png_data is only bound when the dense
                        # gradient outfeed exists, yet it is read below for
                        # every sparse layer - confirm that sparse layers
                        # cannot occur together with
                        # opts.disable_dense_grad_override.
                        if dequeue_prune_and_grow_outfeed is not None:
                            png_data = sess.run(dequeue_prune_and_grow_outfeed)
                            for k in png_data:
                                png_data[k] = png_data[k][-1]
                            logger.debug(
                                f"Prune and grow outputs: {png_data.keys()}")

                        steps = 1 + i + e * opts.steps_per_epoch
                        batches_processed = batches_per_step * steps
                        for name, fc in fc_layers.items():
                            if fc.is_sparse():
                                var_name = fc.get_values_var().name
                                logger.info(
                                    f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                                )
                                for slot_name in fc.sparse_slots:
                                    logger.info(
                                        f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                    )
                                # NOTE(review): this compares against
                                # opts.start_epoch even when not restoring
                                # (the loop then starts at 0) - confirm
                                # start_epoch is 0 in that case.
                                if i == 0 and e == opts.start_epoch:
                                    metainfo = sess.run(fc.get_metainfo_var())
                                else:
                                    metainfo = None
                                if not opts.disable_pruning:
                                    logger.info(
                                        f"Starting prune and grow for layer {name}"
                                    )
                                    t0 = time.perf_counter()
                                    prune_sched = prune_and_grow(
                                        name,
                                        fc,
                                        png_data,
                                        random_gen,
                                        steps,
                                        total_steps,
                                        opts,
                                        metainfo=metainfo)
                                    t1 = time.perf_counter()
                                    logger.info(
                                        f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                    )
                                    logger.info(
                                        f"Pruned proportion: {prune_sched}")
                                    if opts.use_wandb:
                                        wandb.log({'Prune Schedule': prune_sched},
                                                  commit=False)

                        if opts.log:
                            log_file.write(
                                f"{batches_processed} {train_outputs['acc']}\n")
                        if opts.use_wandb:
                            wandb.log(
                                {
                                    'Loss': train_outputs['mean_loss'],
                                    'Accuracy': train_outputs['acc'],
                                    'Throughput': throughput
                                },
                                commit=True)
                        progress.set_description(
                            f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                        )

                        # Only need to feed an updated sparsity representation if we are running rig-L:
                        if not opts.disable_pruning:
                            # Merge the feeds needed for all layers:
                            sparse_feed = {}
                            for fc in fc_layers.values():
                                if fc.is_sparse():
                                    sparse_feed.update(fc.feed_dict())
                            sess.run(update_representation,
                                     feed_dict=sparse_feed)

                    if e % opts.checkpoint_freq == 0:
                        logger.info(f"Saving...")
                        saver.save(sess, os.path.join(logpath, 'model.ckpt'))

            if opts.test_mode in ["all", "tests"]:
                logger.info(f"Testing...")
                sess.run(metrics_initializer)
                sess.run(infeed_test_queue.initializer)
                sess.run(test_loop)
                result = sess.run(dequeue_test_outfeed)

                test_loss = result['mean_loss'][-1]
                test_acc = result['acc'][-1]
                logger.info(
                    f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
                )
                if opts.use_wandb:
                    wandb.run.summary["Test Loss"] = test_loss
                    wandb.run.summary["Test Accuracy"] = test_acc
    finally:
        if log_file is not None:
            log_file.close()
def training_graph(model, opts, iterations_per_step=1):
    """Build the training graph, configure the IPU system and open a session.

    Args:
        model: Forwarded to ``training_step_with_infeeds_and_outfeeds``.
        opts: Mapping of options (e.g. 'distributed_cluster', 'precision',
            'replicas', 'shards', 'seed', 'profile', ...).
        iterations_per_step: Forwarded to
            ``training_step_with_infeeds_and_outfeeds``.

    Returns:
        A ``GraphOps`` bundling the graph, session, initialiser, the training
        op, placeholders, infeed, dequeued outfeed, saver and profile report.
    """
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        # The distribution strategy scope, when present, lives as long as the
        # graph-building context.
        if strategy:
            stack.enter_context(strategy.scope())

        # NOTE(review): str.split returns a list, so this comparison with the
        # string '16' is never true and float32 is always selected. It looks
        # like one element of the split was meant to be indexed - confirm
        # which before changing it.
        if opts["precision"].split('.') == '16':
            lr_dtype = tf.float16
        else:
            lr_dtype = tf.float32
        learning_rate = tf.placeholder(lr_dtype, shape=[])
        placeholders = {'learning_rate': learning_rate}

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train_op = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, value)
                for value in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    # A single available-memory-proportion entry is applied globally;
    # otherwise no global value is passed.
    amp = opts["available_memory_proportion"]
    global_amp = amp[0] if amp and len(amp) == 1 else None

    ipu_cfg = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=global_amp)
    ipu.utils.configure_ipu_system(ipu_cfg)

    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train_op],
                    placeholders, train_iterator, outfeed, train_saver,
                    profile_report)
def generic_graph(opts, data, trainFlag):
    """Build a training or validation graph plus the session that runs it.

    Args:
        opts: Options object (``batches_per_step``,
            ``validation_batches_per_step``, ``replication_factor``,
            ``dtypes``, ``compiler_report``, ``logs_path``,
            ``multiprocessing``, ``select_ipus``, ``prng``, ...).
        data: Object providing ``get_dataset(opts, mode=...)``.
        trainFlag: Mode selector; training when equal to
            ``util.Modes.TRAIN``, validation otherwise.

    Returns:
        A ``GraphOps`` holding the graph, session, initialiser, the fetch
        list (average loss when training, average RMSE otherwise, followed by
        the merged summary), placeholders (training only), infeed, saver,
        summary writer, optional event-trace op and ``trainFlag``.
    """
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'

    if training:
        batches_per_step = opts.batches_per_step
    else:
        batches_per_step = opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only
    # need to do 1/N batches in each stream. For this reason, batches_per_step
    # must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)

        # The replication factor is only passed on when it is not 1.
        if opts.replication_factor == 1:
            kwargs = {}
        else:
            kwargs = {'replication_factor': opts.replication_factor}
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, f"{mode_name}_dataset_infeed", **kwargs)

        with ipu_scope(f'/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    # The last column of the batch is the ground truth; the
                    # remaining columns are the observations.
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=placeholders['learning_rate'] if training else None,
                        mode=trainFlag)
                    if training:
                        # Make the accumulated outputs depend on the update.
                        with tf.control_dependencies([grad_op]):
                            return total_loss + loss, total_rmse + rmse
                    return total_loss + loss, total_rmse + rmse

                # Both accumulators start from the same zero constant.
                zero = tf.constant(0, getattr(np, opts.dtypes[0]))
                return loops.repeat(batches_per_step, body, [zero, zero], infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [total / batches_per_step for total in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None
        if opts.compiler_report:
            if training:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

    writer = tf.summary.FileWriter(
        opts.logs_path + f'/{mode_name}',
        graph=graph,
        flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their
    # devices as IPU:0
    if training or opts.multiprocessing:
        config = ipu_utils.create_ipu_config(
            profiling=training,
            use_poplar_text_report=True,
            max_cross_replica_sum_buffer_size=10000000,
            max_inter_ipu_copies_buffer_size=10000000)

        if opts.select_ipus == 'AUTO':
            config = ipu_utils.auto_select_ipus(config, [opts.replication_factor])
        else:
            # Index 0 is used when training, index 1 otherwise.
            config = ipu_utils.select_ipus(config, [opts.select_ipus[not training]])

        config = ipu_utils.set_compilation_options(
            config, {"prng.enable": str(opts.prng).lower()})
        ipu_utils.configure_ipu_system(config)

    graph_outputs = [avg_loss if training else avg_rmse, summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph,
                    sess,
                    init,
                    graph_outputs,
                    placeholders if training else None,
                    infeed,
                    saver,
                    writer,
                    report,
                    trainFlag)
# Lookup table from integer index back to character, and the corpus encoded
# as int32 indices.
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text]).astype(np.int32)

sequence_length = 100
batch_size = 16
replication_factor = 2

# Create training examples / targets: cut the text into fixed-length
# sequences, shuffle them, group them into batches and repeat forever.
ds = (
    tf.data.Dataset.from_tensor_slices(text_as_int)
    .batch(sequence_length, drop_remainder=True)
    .shuffle(batch_size * batch_size)
    .batch(batch_size, drop_remainder=True)
    .repeat()
)

# The host side queues
infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
    ds,
    feed_name="infeed",
    replication_factor=replication_factor,
)

# Set the learning rate
lr = 0.0001

# Create a momentum optimiser for replication
optimizer = cross_replica_optimizer.CrossReplicaOptimizer(
    tf.train.MomentumOptimizer(lr, 0.99))

# Create a host embedding object; its optimiser spec uses the same learning
# rate as the device-side optimiser.
embedding = embedding_ops.create_host_embedding(
    "char_embedding",
    shape=[256, 256],
    dtype=tf.float32,
    partition_strategy="TOKEN",
    optimizer_spec=embedding_ops.HostEmbeddingOptimizerSpec(lr),
)
def construct_graph(
        network_class: Type[InferenceNetwork],
        config: Path,
        checkpoint_dir: str,
        batch_size: int,
        batches_per_step: int,
        image_filenames: Tuple[str],
        loop: bool,
        preprocess_fn: Callable,
        num_ipus: int,
        mode: str,
        save_graph_pb: bool
) -> Tuple[tf.Operation, tf.Operation, tf.Operation]:
    """Create inference graph on the device, set up in-feeds and out-feeds, connect dataset iterator to the graph.

    When `save_graph_pb` is true, this function also exports the frozen graph into an event file,
    to be viewed in Tensorboard in the `<network_name>_graph` directory.

    Args:
        network_class: Class corresponding to chosen model.
        config: Path to config file.
        checkpoint_dir: Checkpoint location.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        image_filenames: Collection of path to images.
        loop: Run inference in a loop.
        preprocess_fn: Pre-process function to apply to the image before feeding into the graph.
        num_ipus: Number of ipus.
        mode: Inference mode. 'replicated' uses `num_ipus` replicas, 'sharded' shards the
            network across `num_ipus`; any other value runs a single un-sharded replica.
        save_graph_pb: If true, export frozen graph to event file to view in Tensorboard

    Returns:
        Compiled loop operator to run repeated inference over the dataset, infeed_queue initializer, outfeed op.

    Raises:
        yaml.YAMLError: If the config file cannot be parsed (the error is logged first).
        ValueError: If the config file does not contain a YAML mapping.
    """
    # Model specific config
    with config.open() as file_stream:
        try:
            config_dict = yaml.safe_load(file_stream)
        except yaml.YAMLError as exc:
            # Log, then propagate: continuing would fail later on an unbound config_dict.
            tf.logging.error(exc)
            raise
    if not isinstance(config_dict, dict):
        raise ValueError(f"Config file {config} must contain a YAML mapping, "
                         f"got {type(config_dict).__name__}")

    config_dict['network_name'] = config.stem
    config_dict.setdefault("dtype", 'float16')

    # Create inference optimized frozen graph definition
    network = network_class(input_shape=config_dict["input_shape"],
                            num_outputs=1000,
                            batch_size=batch_size,
                            data_type=config_dict['dtype'],
                            config=config_dict,
                            checkpoint_dir=checkpoint_dir)

    # Export frozen graph to event file to view in Tensorboard
    if save_graph_pb:
        log_dir = Path(f"{config_dict['network_name']}_graph")
        graph_filename = f"{log_dir}/{config_dict['network_name']}_graph.pb"
        log_dir.mkdir(exist_ok=True)
        with tf.io.gfile.GFile(graph_filename, "wb") as f:
            f.write(network.optimized_graph.SerializeToString())
        logging.info("%d ops in the final graph.", len(network.optimized_graph.node))
        import_to_tensorboard(graph_filename, log_dir=log_dir.as_posix())

    # Reset graph before creating one on the IPU
    tf.reset_default_graph()

    # Create dataset
    dataset = get_dataset(image_filenames,
                          batch_size,
                          loop=loop,
                          preprocess_fn=preprocess_fn,
                          img_width=config_dict["input_shape"][1],
                          img_height=config_dict["input_shape"][0],
                          dtype=config_dict['dtype'])

    # Set up graph on device, connect infeed and outfeed to the graph.
    num_replicas = num_ipus if mode == 'replicated' else 1
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset,
        device_ordinal=0,
        feed_name="infeed",
        replication_factor=num_replicas)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        device_ordinal=0,
        feed_name="outfeed",
        outfeed_all=True,
        replication_factor=num_replicas)

    def comp_fn():
        def body(img):
            def import_network():
                # Splice the frozen graph into the current graph with `img` as its input.
                return tf.import_graph_def(network.optimized_graph,
                                           input_map={network.graph_input: img},
                                           name="optimized",
                                           return_elements=[network.graph_output])[0]

            with scopes.ipu_scope('/device:IPU:0'):
                if mode == 'sharded':
                    with autoshard.ipu_autoshard():
                        probs = import_network()
                    autoshard.automatic_sharding(num_shards=num_ipus,
                                                 input_ts=img,
                                                 loss_ts=probs,
                                                 frozen_inference=True)
                    outfeed_op = outfeed_queue.enqueue(probs)
                    # Copy the sharding attribute of the op producing `probs` onto the enqueue op.
                    outfeed_op._set_attr(
                        sharding._XLA_SHARDING,
                        attr_value_pb2.AttrValue(s=probs.op.get_attr('_XlaSharding')))
                else:
                    probs = import_network()
                    outfeed_op = outfeed_queue.enqueue(probs)
                # Note that enqueue happens on the IPU.
                return outfeed_op

        return loops.repeat(batches_per_step, body, [], infeed_queue)

    loop_op = ipu_compiler.compile(comp_fn, [])

    # The dequeue of the outfeed needs to happen on the CPU.
    with tf.device('cpu'):
        outfeed_dequeue = outfeed_queue.dequeue()

    ipu_utils.move_variable_initialization_to_cpu()
    return loop_op, infeed_queue.initializer, outfeed_dequeue