def configure_ipu(opts):
    """Set the IPU configuration based on execution options."""
    global_amp = None
    if opts['available_memory_proportion'] and len(
            opts['available_memory_proportion']) == 1:
        global_amp = opts['available_memory_proportion'][0]

    ipu_options = get_config(
        ipu_id=opts['select_ipu'],
        prng=not opts['no_stochastic_rounding'],
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts['max_cross_replica_buffer_size'],
        fp_exceptions=opts['fp_exceptions'],
        half_partials=opts['enable_half_partials'],
        conv_dithering=opts['enable_conv_dithering'],
        enable_recomputation=opts['enable_recomputation'],
        seed=opts['seed'],
        availableMemoryProportion=global_amp,
        stable_norm=opts['stable_norm'],
        compile_only=opts['compile_only'],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts['num_io_tiles'],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts['saturate_on_overflow'],
    )

    if opts['on_demand']:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    ipu_options.configure_ipu_system()
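# A minimal usage sketch for configure_ipu (not from the original source):
# the keys below are exactly those read by the function above, but every
# value is an illustrative assumption only.
example_opts = {
    'available_memory_proportion': [0.6],
    'select_ipu': -1,
    'no_stochastic_rounding': False,
    'shards': 1,
    'total_replicas': 1,
    'max_cross_replica_buffer_size': 0,
    'fp_exceptions': False,
    'enable_half_partials': True,
    'enable_conv_dithering': False,
    'enable_recomputation': False,
    'seed': None,
    'stable_norm': False,
    'compile_only': False,
    'internal_exchange_optimisation_target': None,
    'num_io_tiles': 0,
    'saturate_on_overflow': False,
    'on_demand': True,
    'BN_span': 1,
}
# configure_ipu(example_opts)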
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(model, image, label,
                                                        opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=opts['replicas'] * opts['shards'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])

    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # precision is given as e.g. '16.16' or '32.32'; the first component
        # selects the placeholder datatype
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver)
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # precision is given as e.g. '16.16' or '32.32'; the first component
        # selects the placeholder datatype
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()

        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver,
                    profile_report)
def build_graph(opts, is_training=True):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(tf.float32, shape=[]),
            'loss_scaling': tf.placeholder(tf.float32, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input, datasets must be defined outside the ipu device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))

        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling,
                                 is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)
        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_config = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        minimum_remote_tensor_size=opts['min_remote_tensor_size'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_config = popdist.tensorflow.set_ipu_config(ipu_config,
                                                       opts['shards'],
                                                       configure_device=False)

    # Do not acquire a device, compile only.
    if opts["compile_only"]:
        ipu_config.device_connection.version = "ipu2"
        ipu_config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows running executables on a graph without being online
        ipu_config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache dir, defaulting if not given
        if "TF_POPLAR_FLAGS" in os.environ:
            if "--executable_cache_path" not in os.environ["TF_POPLAR_FLAGS"]:
                print("Warning: --executable_cache_path in TF_POPLAR_FLAGS "
                      "(for 'poprun --mpi_local_args') not set. Setting to "
                      "default path: /tmp/tf_cache/")
                os.environ[
                    "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache"
        # Sometimes TF_POPLAR_FLAGS might not even exist
        else:
            print("Warning: TF_POPLAR_FLAGS environment variable (for 'poprun "
                  "--mpi_local_args') not set. --executable_cache_path must be "
                  "defined when using --compile-only. Setting to default path: "
                  "/tmp/tf_cache/")
            os.environ[
                "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache"

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver,
                    restore, tvars)
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        bert_config = bert_ipu.BertConfig.from_dict(
            opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        learning_rate = None
        opts['version_2_with_negative'] = False
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        if not should_be_pipeline_when_inference(opts):
            def bert_net():
                return build_infer_network_without_pipeline(
                    train_iterator,
                    outfeed_queue,
                    iterations_per_step,
                    bert_config=bert_config,
                    opts=opts)
        else:
            def bert_net():
                return build_network(train_iterator, outfeed_queue,
                                     iterations_per_step, bert_config, opts,
                                     learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            embedded = opts["embedded_runtime"]

            if embedded and is_training:
                raise ValueError(
                    "embedded_runtime is only to be used for inference.")

            train = ipu.ipu_compiler.compile(bert_net,
                                             []) if not embedded else None

        exec_path = None
        compile_op = None
        poplar_exec_filepath = get_exec_path(
            opts['seq_length'], opts['micro_batch_size'],
            opts['device_mapping'], should_be_pipeline_when_inference(opts))
        exec_path = os.path.join(poplar_exec_filepath)
        compile_op = application_compile_op.experimental_application_compile_op(
            bert_net, output_path=exec_path, freeze_variables=True)

        outfeed = outfeed_queue.dequeue()

        restore = tf.train.Saver(var_list=tf.global_variables())

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))

    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        partials_type=opts["partials_type"],
        available_memory_proportion=opts['available_memory_proportion'])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)
    _ = train_sess.run(train_init, [])

    # -----------------
    # Checkpoints restore and save
    init_checkpoint_path = opts['init_checkpoint']
    logger.info(f"At the checkpoint location {init_checkpoint_path}")
    if init_checkpoint_path:
        logger.info("Loading checkpoint...")
        if os.path.isfile(init_checkpoint_path):
            init_checkpoint_path = os.path.splitext(init_checkpoint_path)[0]
            logger.info(f"checkpoint path: {init_checkpoint_path}")

        (assignment_map, initialized_variable_names
         ) = bert_ipu.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint_path)

        for var in tvars:
            if var.name in initialized_variable_names:
                mark = "*"
            else:
                mark = " "
            logger.info("%-60s [%s]\t%s (%s)", var.name, mark, var.shape,
                        var.dtype.name)

        reader = tf.train.NewCheckpointReader(init_checkpoint_path)
        load_vars = reader.get_variable_to_shape_map()

        saver_restore = tf.train.Saver(assignment_map)
        saver_restore.restore(train_sess, init_checkpoint_path)
    # -----------------

    if compile_op is not None:
        logger.info(
            f"Compiling and saving Poplar executable to {poplar_exec_filepath}")
        _ = train_sess.run(compile_op, [])
    else:
        exec_path = None

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, restore, tvars,
                    exec_path), ipu_config
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                           shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        # Need to load the GLUE label list here
        label_list = opts["pass_in"][1]
        bert_config.num_lables = len(label_list)

        if opts['do_training'] and opts['current_mode'] == 'train':
            input_file = os.path.join(opts["output_dir"],
                                      f"train_{opts['task_type']}.tf_record")
        elif opts['do_eval'] and opts['current_mode'] == 'eval':
            input_file = os.path.join(opts["output_dir"],
                                      f"eval_{opts['task_type']}.tf_record")
        elif opts['do_predict'] and opts['current_mode'] == 'predict':
            input_file = os.path.join(
                opts["output_dir"], f"predict_{opts['task_type']}.tf_record")
        else:
            raise NotImplementedError()

        opts['input_file'] = input_file
        opts['drop_remainder'] = True

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def bert_net():
            return build_network(train_iterator, outfeed_queue,
                                 iterations_per_step, bert_config, opts,
                                 learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        log.print_trainable_variables(opts)

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))

    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        available_memory_proportion=opts["available_memory_proportion"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver,
                    restore, tvars)
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16
        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                           shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        if not should_be_pipeline_when_inference(opts):
            def bert_net():
                return build_infer_network_without_pipeline(
                    train_iterator,
                    outfeed_queue,
                    iterations_per_step,
                    bert_config=bert_config,
                    opts=opts)
        else:
            def bert_net():
                return build_network(train_iterator, outfeed_queue,
                                     iterations_per_step, bert_config, opts,
                                     learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))

    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        partials_type=opts["partials_type"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver,
                    restore, tvars)
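# Several build_graph variants above round num_ipus up to a power of two
# inline, while others call ipu_utils.next_power_of_two. A minimal sketch of
# such a helper, assuming it simply wraps the same rounding logic shown above
# (an illustrative assumption, not the original implementation):
import math


def next_power_of_two(x):
    # Return x unchanged if it is already a power of two, otherwise round up
    # to the next power of two.
    if x & (x - 1) == 0:
        return x
    return 2**int(math.ceil(math.log(x) / math.log(2)))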
def validation_graph(model, opts):
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if opts['latency']:
                        timestamp_enqueue = timestamp_queue.enqueue(
                            data_dict['timestamp'])
                        return (total_accuracy +
                                (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"]),
                                timestamp_enqueue)
                    else:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['total_replicas'] * opts['shards'] > 1 and not opts.get(
                        'inference', False):
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])
        else:
            latency_per_batch = None

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            broadcast_weights = []
            for var in tf.global_variables():
                broadcast_weights.append(
                    var.assign(hvd.broadcast(var, root_rank=0)))
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            broadcast_weights = None
            broadcast_global_batch_size, global_batch_size_ph = None, None
            broadcast_num_files, num_files_ph = None, None
            broadcast_iteration, iteration_ph = None, None

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    if opts['on_demand'] and reconfigure:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }
    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }
    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)
def build_graph(opts, is_training=True, feed_name=None):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        bert_config = bert_ipu.BertConfig.from_dict(opts)
        bert_config.dtype = tf.float32 if opts[
            "precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(bert_config.dtype, shape=[]),
            'loss_scaling': tf.placeholder(bert_config.dtype, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input, datasets must be defined outside the ipu device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.load(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])

        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out",
            replication_factor=opts['replicas'])

        # building networks with pipeline
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling,
                                 is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)
        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_options = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    ipu.utils.configure_ipu_system(ipu_options)

    # This is a workaround for the bug reported in
    # https://github.com/tensorflow/tensorflow/issues/23780
    from tensorflow.core.protobuf import rewriter_config_pb2
    sess_cfg = tf.ConfigProto()
    sess_cfg.graph_options.rewrite_options.memory_optimization = (
        rewriter_config_pb2.RewriterConfig.OFF)

    train_sess = tf.Session(graph=train_graph, config=sess_cfg)

    return GraphOps(train_graph, train_sess, train_init, [train],
                    placeholders, train_iterator, outfeed, train_saver,
                    restore, tvars)
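# GraphOps is constructed throughout the snippets above but never defined in
# this section. A plausible minimal sketch, assuming it is a plain namedtuple
# whose fields match the positional arguments passed in the last build_graph
# above; the field names are assumptions, and the other variants pass a
# different number of fields, so their definitions would differ accordingly.
from collections import namedtuple

GraphOps = namedtuple('GraphOps', [
    'graph', 'session', 'init', 'ops', 'placeholders', 'iterator', 'outfeed',
    'saver', 'restore', 'tvars'
])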