def validation_graph(model, opts):
    """Build the validation graph, configure the IPU system and open a session.

    The accuracy produced by ``validation_graph_builder`` is averaged over
    ``opts["validation_batches_per_step"]`` batches taken from the validation
    infeed, averaged across all replicas and scaled by 100.

    Args:
        model: passed through unchanged to ``validation_graph_builder``.
        opts: dictionary of options. Keys read here include ``replicas``,
            ``shards``, ``validation_batches_per_step``,
            ``available_memory_proportion`` and the IPU configuration keys
            forwarded to ``get_config``.

    Returns:
        A ``train.GraphOps`` built from the graph, the session, the variable
        initialiser, ``[accuracy]``, the validation infeed and the saver. The
        remaining positional slots are passed as ``None``.
    """
    # The device is configured below with shards=1 and replicas * shards
    # replicas, and the infeed is replicated by the same factor.
    replication_factor = opts['replicas'] * opts['shards']
    batches_per_step = opts["validation_batches_per_step"]

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=replication_factor)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) / batches_per_step)

                accuracy = loops.repeat(int(batches_per_step),
                                        body,
                                        [tf.constant(0, tf.float32)],
                                        valid_iterator)
                # Guard on the real replication factor. The previous check,
                # opts['replicas'] > 1, skipped the reduction when
                # replicas == 1 and shards > 1 even though the graph is
                # replicated `shards` times in that case.
                if replication_factor > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / replication_factor
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()
        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    # A global available-memory proportion is only forwarded when exactly one
    # value was supplied.
    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=replication_factor,
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])
    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
def training_graph(model, opts, iterations_per_step=1):
    """Build the training graph, configure the IPU system and open a session.

    Args:
        model: passed through unchanged to
            ``training_step_with_infeeds_and_outfeeds``.
        opts: dictionary of options. Keys read here include ``replicas``,
            ``shards``, ``available_memory_proportion`` and the IPU
            configuration keys forwarded to ``get_config``.
        iterations_per_step: forwarded to
            ``training_step_with_infeeds_and_outfeeds``.

    Returns:
        A ``GraphOps`` built from the graph, the session, the variable
        initialiser, ``[train]``, the placeholder dictionary, the training
        infeed, the dequeued outfeed and the saver.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # The learning-rate dtype used to be chosen with
        # `opts["precision"].split('.') == '16'`. That compares a list with a
        # string and is always False, so the placeholder has always been
        # float32. The effective dtype is now explicit.
        # NOTE(review): if a float16 learning rate is wanted for 16-bit runs,
        # compare one element of the split (e.g. split('.')[0]) and confirm
        # the training step accepts a half-precision learning rate.
        placeholders['learning_rate'] = tf.placeholder(tf.float32, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    # A global available-memory proportion is only forwarded when exactly one
    # value was supplied.
    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver)
def validation_graph(model, opts):
    """Build the validation graph and open a session on it.

    The accuracy produced by ``validation_graph_builder`` is averaged over
    ``opts["validation_batches_per_step"]`` batches taken from the validation
    infeed, averaged across all replicas and scaled by 100. This function does
    not configure the IPU system itself.

    Args:
        model: passed through unchanged to ``validation_graph_builder``.
        opts: dictionary of options. Keys read here: ``replicas``, ``shards``
            and ``validation_batches_per_step``.

    Returns:
        A ``train.GraphOps`` built from the graph, the session, the variable
        initialiser, ``[accuracy]``, the validation infeed and the saver. The
        remaining positional slots are passed as ``None``.
    """
    # The validation infeed is replicated replicas * shards times.
    replication_factor = opts['replicas'] * opts['shards']
    batches_per_step = opts["validation_batches_per_step"]

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=replication_factor)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) / batches_per_step)

                accuracy = loops.repeat(int(batches_per_step),
                                        body,
                                        [tf.constant(0, tf.float32)],
                                        valid_iterator)
                # Guard on the real replication factor. The previous check,
                # opts['replicas'] > 1, skipped the reduction when
                # replicas == 1 and shards > 1 even though the infeed is
                # replicated `shards` times in that case.
                if replication_factor > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / replication_factor
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()
        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver)
def inference_run(exec_filename, ckpt_name, iteration, epoch, first_run, opts):
    """Run a saved executable from several threads and report the results.

    The executable at ``exec_filename`` is started through the embedded
    runtime and called ``opts['iterations']`` times from each of
    ``opts['num_inference_thread']`` threads. Per-call latency is collected
    in every thread, and so is per-call accuracy unless
    ``opts['generated_data']`` is set. The aggregated statistics are printed,
    written to the log/CSV, optionally sent to wandb, and returned.

    Returns:
        An ``OrderedDict`` with the keys ``name``, ``iteration``, ``epoch``,
        ``val_acc``, ``val_time``, ``val_size``, ``img_per_sec`` and
        ``latency``.
    """
    logging.mlperf_logging(key="EVAL_START",
                           log_type="start",
                           metadata={"epoch_num": round(epoch)})

    engine_name = "my_engine"
    ctx = embedded_runtime.embedded_runtime_start(exec_filename, [],
                                                  engine_name,
                                                  timeout=1000)

    batch_size = opts['micro_batch_size']
    image_shape = (batch_size, opts['image_size'], opts['image_size'], 3)
    use_generated_data = opts['generated_data']

    input_placeholder = tf.placeholder(tf.uint8, image_shape)
    num_iters = opts['iterations']

    if use_generated_data:
        # Images only: there are no labels to compare predictions against.
        placeholders = [input_placeholder]
        images = np.random.normal(size=image_shape).astype(np.uint8)
        labels = None
    else:
        label_placeholder = tf.placeholder(tf.int32, batch_size)
        placeholders = [input_placeholder, label_placeholder]
        # Materialise the dataset in a scratch graph so that its ops stay out
        # of the graph holding the placeholders.
        with tf.Graph().as_default():
            inference_dataset = dataset.data(
                opts, is_training=False).map(lambda x: {'data_dict': x})
            images, labels = dataset_to_list(inference_dataset,
                                             num_iters * batch_size)

    call_result = embedded_runtime.embedded_runtime_call(placeholders, ctx)

    ipu.config.reset_ipu_configuration()
    gc.collect()

    thread_queue = Queue()
    with tf.Session() as session:
        # do not include time of the first iteration in stats
        warmup_feed = prepare_feed_dict(placeholders, images, labels,
                                        batch_size, use_generated_data, 0)
        session.run(call_result, warmup_feed)

        def runner(session, thread_idx):
            thread_channel = pvti.createTraceChannel(f"Thread {thread_idx}")
            thread_latencies = []
            thread_accuracies = []
            for iter_idx in range(num_iters):
                feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              batch_size, use_generated_data,
                                              iter_idx)
                with pvti.Tracepoint(thread_channel, f"Iteration {iter_idx}"):
                    start_iter = time.time()
                    _, predictions = session.run(call_result, feed_dict)
                    end_iter = time.time()
                thread_latencies.append(end_iter - start_iter)
                if not use_generated_data:
                    expected = feed_dict[label_placeholder]
                    matches = np.equal(predictions, expected)
                    thread_accuracies.append(
                        np.mean(matches.astype(np.float32)))
            thread_queue.put((thread_latencies, thread_accuracies),
                             timeout=10)

        workers = [
            Thread(target=runner, args=(session, thread_idx))
            for thread_idx in range(opts['num_inference_thread'])
        ]

        inference_start = time.time()
        for idx, worker in enumerate(workers):
            worker.start()
            print(f"Thread {idx} started")

        for idx, worker in enumerate(workers):
            worker.join()
            print(f"Thread {idx} joined")
        val_time = time.time() - inference_start

    # Merge the per-thread results.
    latencies, accuracies = [], []
    while not thread_queue.empty():
        worker_latencies, worker_accuracies = thread_queue.get()
        latencies.extend(worker_latencies)
        accuracies.extend(worker_accuracies)

    if use_generated_data:
        total_accuracy = -1
    else:
        mean_accuracy = sum(accuracies) / len(accuracies)
        total_accuracy = mean_accuracy * 100

    # convert latencies to milliseconds
    latencies = [1000 * latency_s for latency_s in latencies]

    max_latency = max(latencies)
    mean_latency = np.mean(latencies)
    perc_99 = np.percentile(latencies, 99)
    perc_99_9 = np.percentile(latencies, 99.9)

    print(
        f"Latencies - avg: {mean_latency:8.4f}, 99th percentile: {perc_99:8.4f}, "
        f"99.9th percentile: {perc_99_9:8.4f}, max: {max_latency:8.4f}")

    valid_format = (
        "Validation top-1 accuracy [{name}] (iteration: {iteration:6d}, epoch: {epoch:6.2f}, "
        "img/sec: {img_per_sec:6.2f}, time: {val_time:8.6f}, "
        "latency (ms): {latency:8.4f}: {val_acc:6.3f}%")

    val_size = (num_iters * opts['num_inference_thread'] *
                opts['validation_total_batch_size'])

    stats = OrderedDict()
    stats['name'] = ckpt_name
    stats['iteration'] = iteration
    stats['epoch'] = epoch
    stats['val_acc'] = total_accuracy
    stats['val_time'] = val_time
    stats['val_size'] = val_size
    stats['img_per_sec'] = val_size / val_time
    stats['latency'] = mean_latency

    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
    if opts['wandb'] and opts['distributed_worker_index'] == 0:
        logging.log_to_wandb(stats)

    logging.mlperf_logging(key="EVAL_STOP",
                           log_type="stop",
                           metadata={"epoch_num": round(epoch)})
    logging.mlperf_logging(key="EVAL_ACCURACY",
                           value=float(stats['val_acc']) / 100,
                           metadata={"epoch_num": round(epoch)})
    return stats
def create_poplar_exec(model, opts, poplar_exec_path):
    """Compile the inference graph and write the executable to a file.

    A validation loop of ``opts['validation_batches_per_step']`` iterations
    is built on top of an infeed and compiled with
    ``experimental_application_compile_op`` using ``freeze_variables=True``
    and ``output_path=poplar_exec_path``. Variables are restored from a
    checkpoint when ``get_ckpt_filenames`` reports exactly one file;
    otherwise they keep their initial values and a warning is printed.
    """
    valid_graph = tf.Graph()

    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        if opts['generated_data']:
            # create dummy dataset with images only
            image_shape = (opts['micro_batch_size'], opts['image_size'],
                           opts['image_size'], 3)
            dummy_image = np.zeros(image_shape, dtype=np.uint8)
            inference_dataset = tf.data.Dataset.from_tensors(
                {"image": dummy_image})
        else:
            # create dataset with images and labels
            inference_dataset = dataset.data(opts, is_training=False)
        inference_dataset = inference_dataset.map(lambda x: {'data_dict': x})

        inference_infeed_iterator = ipu_infeed_queue.IPUInfeedQueue(
            inference_dataset, prefetch_depth=opts['prefetch_depth'])

        acc_queue = ipu_outfeed_queue.IPUOutfeedQueue()
        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(data_dict):
                    # Each iteration pushes its accuracy onto the outfeed.
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    return acc_queue.enqueue(accuracy)

                return loops.repeat(int(opts['validation_batches_per_step']),
                                    body, [], inference_infeed_iterator)

        filenames, _ = get_ckpt_filenames(opts)

        accuracy = application_compile_op.experimental_application_compile_op(
            comp_fn, output_path=poplar_exec_path, freeze_variables=True)

        # The dequeue op is created even though its result is not used here.
        outfeed = acc_queue.dequeue()
        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()

        with tf.Session(graph=valid_graph, config=tf.ConfigProto()) as sess:
            have_snapshot = len(filenames) == 1
            if have_snapshot:
                print("Restoring from a snapshot: ", filenames[0])
                sess.run(inference_infeed_iterator.initializer)
            else:
                print(
                    "Warning: no restore point found - randomly initialising weights instead"
                )

            init = tf.global_variables_initializer()
            sess.run(init)
            if have_snapshot:
                valid_saver.restore(sess, filenames[0])

            path = sess.run(accuracy)
            print(f"Poplar executable: {path}")

    valid_graph.finalize()
def build_graph(bert_config,
                opts,
                iterations_per_step=1,
                is_training=True,
                feed_name=None):
    """Build the BERT graph, configure the IPU system and open a session.

    Args:
        bert_config: configuration for the BERT model. ``dtype`` and
            ``hidden_layers_per_stage`` are read here.
        opts: a dictionary containing all global options.
        iterations_per_step: number of iterations per step.
        is_training (bool): forwarded to the dataset and to
            ``training_step_with_infeeds_and_outfeeds``.
        feed_name: prefix for the IPU infeed (``<feed_name>_in``) and outfeed
            (``<feed_name>_out``) names. When ``None``, ``"training"`` or
            ``"inference"`` is used depending on ``is_training``.

    Returns:
        A ``GraphOps`` built from the graph, the session, the variable
        initialiser, ``[train]``, the placeholder dictionary, the infeed, the
        dequeued outfeed, the dictionary of savers, the restore saver and the
        list of trainable variables.

    Raises:
        KeyError: if ``opts['embeddings_placement']`` is not one of
            ``'two_ipus'``, ``'same_ipu'`` or ``'same_as_hidden_layers'``.
    """
    if feed_name is None:
        # The signature advertises feed_name as optional, but the feed names
        # are built by string concatenation, which raised TypeError for None.
        feed_name = "training" if is_training else "inference"

    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                       shape=[])
        learning_rate = placeholders['learning_rate']

        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out",
            replication_factor=opts['replicas'])

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                bert_config,
                train_iterator,
                outfeed_queue,
                opts,
                learning_rate,
                iterations_per_step,
                is_training=is_training)

        outfeed = outfeed_queue.dequeue()

        bert_logging.print_trainable_variables(opts['logs_path'])

        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        model_and_optimiser_variables = tf.global_variables()

        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_ckpt'] else model_variables)

        # Both checkpoint savers store the same set of variables.
        saved_variables = (model_variables if opts['ckpt_model_only'] else
                           model_and_optimiser_variables)

        # We store two savers: one for the standard training and another one
        # for the best checkpoint
        savers = {
            "train_saver":
            tf.train.Saver(var_list=saved_variables,
                           name='latest',
                           max_to_keep=5),
            "best_saver":
            tf.train.Saver(var_list=saved_variables,
                           name='best',
                           max_to_keep=1)
        }

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate number of IPUs required for pretraining pipeline.
    num_embedding_ipu = {
        'two_ipus': 2,
        'same_ipu': 1,
        'same_as_hidden_layers': 0
    }[opts['embeddings_placement']]

    num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage)
    num_ipus_required = opts['replicas'] * next_power_of_two(
        num_hidden_layer_stages + num_embedding_ipu)

    # Configure the IPU options.
    ipu_options = get_ipu_config(
        fp_exceptions=opts["fp_exceptions"],
        stochastic_rounding=opts['stochastic_rounding'],
        xla_recompute=opts["xla_recompute"],
        available_memory_proportion=opts['available_memory_proportion'],
        disable_graph_outlining=opts["disable_graph_outlining"],
        num_ipus_required=num_ipus_required,
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        partials_type=opts['partials_type'])
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, savers, restore,
                    tvars)
def training_graph(model, opts, iterations_per_step=1):
    """Build the training graph, configure the IPU system and open a session.

    When ``opts['distributed_cluster']`` is set, the graph is built inside
    the scope of the strategy returned by ``configure_distribution`` and
    every outfeed value is mean-reduced through that strategy.

    Args:
        model: passed through unchanged to
            ``training_step_with_infeeds_and_outfeeds``.
        opts: dictionary of options. Keys read here include
            ``distributed_cluster``, ``replicas``, ``shards``,
            ``available_memory_proportion`` and the IPU configuration keys
            forwarded to ``get_config``.
        iterations_per_step: forwarded to
            ``training_step_with_infeeds_and_outfeeds``.

    Returns:
        A ``GraphOps`` built from the graph, the session, the variable
        initialiser, ``[train]``, the placeholder dictionary, the training
        infeed, the outfeed, the saver and the IPU event-trace op.
    """
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        # The strategy scope is only entered for distributed runs.
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # The learning-rate dtype used to be chosen with
        # `opts["precision"].split('.') == '16'`. That compares a list with a
        # string and is always False, so the placeholder has always been
        # float32. The effective dtype is now explicit.
        # NOTE(review): if a float16 learning rate is wanted for 16-bit runs,
        # compare one element of the split (e.g. split('.')[0]) and confirm
        # the training step accepts a half-precision learning rate.
        placeholders['learning_rate'] = tf.placeholder(tf.float32, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    # A global available-memory proportion is only forwarded when exactly one
    # value was supplied.
    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver,
                    profile_report)
def validation_graph(model, opts):
    """Build the validation graph and session, with optional extras.

    On top of the accuracy loop this variant can:

    * measure per-batch latency through a timestamp outfeed when
      ``opts['latency']`` is set;
    * create Horovod broadcast ops for the weights and for three scalar
      placeholders when ``opts['use_popdist']`` is set;
    * skip IPU (re)configuration and ``hvd.init()`` when
      ``opts['reuse_IPUs']`` is truthy.

    Returns:
        A ``train.GraphOps`` built from the finalized graph, the session, the
        variable initialiser, a dictionary of ops, a dictionary of
        placeholders, the validation infeed and the saver. Dictionary entries
        belonging to a disabled feature are ``None``.
    """
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if not opts['latency']:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                    # Send the batch timestamp out through the outfeed so
                    # that latency can be computed outside the loop.
                    timestamp_enqueue = timestamp_queue.enqueue(
                        data_dict['timestamp'])
                    return (total_accuracy +
                            (tf.cast(accuracy, tf.float32) /
                             opts["validation_batches_per_step"]),
                            timestamp_enqueue)

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)

                needs_reduction = (
                    opts['total_replicas'] * opts['shards'] > 1
                    and not opts.get('inference', False))
                if needs_reduction:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        latency_per_batch = None
        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            # One assign-from-broadcast op per global variable.
            broadcast_weights = [
                var.assign(hvd.broadcast(var, root_rank=0))
                for var in tf.global_variables()
            ]
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            broadcast_weights = None
            broadcast_global_batch_size = global_batch_size_ph = None
            broadcast_num_files = num_files_ph = None
            broadcast_iteration = iteration_ph = None

    # A global available-memory proportion is only forwarded when exactly one
    # value was supplied.
    amp = opts["available_memory_proportion"]
    globalAMP = amp[0] if amp and len(amp) == 1 else None

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        ipu_options = popdist.tensorflow.set_ipu_config(
            ipu_options, opts['shards'], configure_device=False)

    if opts['on_demand'] and reconfigure:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }

    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }

    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)