def create_popdist_strategy():
    """Build the distribution strategy used when running under popdist.

    The strategy is the Horovod-backed IPUMultiReplicaStrategy: Horovod
    performs the initial weight broadcast and any reductions requested on
    the host. The import is deferred into the function body so Horovod is
    only needed when this is actually called.
    """
    from tensorflow.python.ipu.horovod import popdist_strategy

    hvd.init()

    # Cross-replica reductions on the IPU are added explicitly by the
    # IPUOptimizer, so the strategy must not insert its own copy of them.
    strategy = popdist_strategy.IPUMultiReplicaStrategy(
        add_ipu_cross_replica_reductions=False)
    return strategy
def setUpClass(cls):
    """Initialise Horovod once for the whole test class (unittest setUpClass hook)."""
    hvd.init()
def ipu_prog(num_replicas, gradient_accumulation):
    """Train a one-weight linear Keras model on IPU and return the trained
    kernel weight scalar.

    Runs either under popdist (topology taken from the environment, data
    sharded per instance) or under a plain IPUStrategy, depending on
    whether the popdist environment variables are set.

    Args:
        num_replicas: Replica count used when popdist is NOT enabled.
        gradient_accumulation: Gradient accumulation steps per replica.

    Returns:
        The single trained kernel weight, ``model.get_weights()[0][0][0]``.
    """
    import logging
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    popdist_on = popdist.isPopdistEnvSet()

    # Under popdist the replica/instance topology comes from the popdist
    # environment; otherwise fall back to the explicit arguments.
    num_global_replicas = popdist.getNumTotalReplicas(
    ) if popdist_on else num_replicas
    num_instances = popdist.getNumInstances() if popdist_on else 1

    dataset_size = global_batch_size = 16
    # int() truncates; presumably the sizes divide evenly — TODO confirm.
    micro_batch_size = int(global_batch_size / num_global_replicas /
                           gradient_accumulation)

    X = np.arange(1, dataset_size + 1, 1, dtype=float)
    Y = [0] * dataset_size
    ds = tf.data.Dataset.from_tensor_slices((X, Y))
    if popdist_on:
        # Each instance consumes a disjoint shard of the dataset.
        ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
    ds = ds.batch(micro_batch_size, drop_remainder=True)
    ds = ds.repeat()

    cfg = ipu.config.IPUConfig()
    if popdist_on:
        cfg = popdist.tensorflow.set_ipu_config(
            cfg,
            ipus_per_replica=popdist.getNumIpusPerReplica(),
            configure_device=True)
        # Horovod is initialised only in the popdist path; set_ipu_config
        # runs first, matching the order used elsewhere in this file.
        hvd.init()
    else:
        cfg.auto_select_ipus = num_global_replicas
    cfg.configure_ipu_system()

    strategy = popdist_strategy.PopDistStrategy(
    ) if popdist_on else ipu.ipu_strategy.IPUStrategy()

    with strategy.scope():

        def get_model():
            # Single Dense unit, no bias, kernel initialised to constant 1.
            input_layer = tf.keras.Input(shape=1)
            kernel_initializer = tf.keras.initializers.Constant(1)
            x = tf.keras.layers.Dense(
                1,
                use_bias=False,
                kernel_initializer=kernel_initializer)(input_layer)
            return tf.keras.Model(input_layer, x)

        model = get_model()
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation)
        model.build(input_shape=(micro_batch_size, 1))

        if popdist_on:
            # Only divide by the accumulation factor here — presumably the
            # PopDistStrategy handles the cross-replica part; confirm.
            def gradient_normalizer(grads_and_vars):
                return [(grad / gradient_accumulation, var)
                        for grad, var in grads_and_vars]
        else:
            # Plain IPUStrategy: normalise over both the replica count and
            # the accumulation factor.
            def gradient_normalizer(grads_and_vars):
                return [
                    (grad / num_global_replicas / gradient_accumulation, var)
                    for grad, var in grads_and_vars
                ]

        optimizer = tf.keras.optimizers.SGD(
            learning_rate=1.0, gradient_transformers=[gradient_normalizer])

        loss_class = tf.keras.losses.MeanSquaredError
        loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
        # Wrap the loss so each computed value is also enqueued to the
        # outfeed, where OutFeedQueueCallback picks it up.
        loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
        loss = loss_class()

        micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
        # Largest multiple of a full weight update that fits in one pass
        # over the dataset.
        steps_per_execution = dataset_size // (
            micro_batch_size * micro_batches_per_weight_update
        ) * micro_batches_per_weight_update

        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=[tf.keras.losses.MSE],
                      steps_per_execution=steps_per_execution)

        callbacks = [
            OutFeedQueueCallback(queue=loss_outfeed_queue,
                                 name='average_loss')
        ]
        if num_instances > 1:
            # Multi-instance run: reduce metrics across instances so each
            # one reports the global value.
            callbacks += [AllReduceMetricsCallback()]
        callbacks += [LoggingCallback(1)]

        model.fit(ds, steps_per_epoch=steps_per_execution,
                  callbacks=callbacks)

        return model.get_weights()[0][0][0]
available_memory_proportion[0] / 100) if gather_conv_output: cfg.convolutions.poplar_options['gatherConvOutput'] = 'true' cfg.floating_point_behaviour.inv = fp_exceptions cfg.floating_point_behaviour.div0 = fp_exceptions cfg.floating_point_behaviour.oflo = fp_exceptions cfg.compilation_poplar_options[ 'target.deterministicWorkers'] = 'false' if seed is None else 'portable' if internal_exchange_optimization_target is not None: cfg.compilation_poplar_options[ 'opt.internalExchangeOptimisationTarget'] = internal_exchange_optimization_target if distributed_training: popdist.tensorflow.set_ipu_config( cfg, ipus_per_replica=num_ipus_per_replica, configure_device=True) hvd.init() else: cfg.auto_select_ipus = num_ipus_per_replica * num_replicas cfg.configure_ipu_system() set_seed(seed) batch_config = BatchConfig(micro_batch_size, num_replicas, gradient_accumulation_count, global_batch_size) logging.info(f'micro batch size {batch_config.micro_batch_size}') logging.info(f'global batch size {batch_config.global_batch_size}') logging.info( f'gradient accumulation {batch_config.gradient_accumulation_count}') logging.info(f'num replicas {batch_config.num_replicas}')
def validation_graph(model, opts):
    """Build the TF1-style validation graph, session and associated ops.

    Args:
        model: Model/builder object passed through to
            validation_graph_builder.
        opts: Option dict driving dataset, popdist, latency measurement and
            IPU configuration.

    Returns:
        train.GraphOps bundling the graph, session, initialiser op, the
        accuracy/broadcast/latency ops, the broadcast placeholders, the
        infeed iterator and a Saver.
    """
    # Skip Horovod init and IPU (re)configuration when devices are reused
    # from a previous run.
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            # Outfeed used to ship per-batch timestamps back to the host.
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict,
                                                        opts)
                    if opts['latency']:
                        # Enqueue the batch timestamp so latency can be
                        # computed host-side after dequeue.
                        timestamp_enqueue = timestamp_queue.enqueue(
                            data_dict['timestamp'])
                        return (total_accuracy +
                                (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"]),
                                timestamp_enqueue)
                    else:
                        # Accumulate the mean accuracy over the batches of
                        # one step.
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['total_replicas'] * opts['shards'] > 1 and not opts.get(
                        'inference', False):
                    # Average the accuracy across all replicas/shards.
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        # Convert the [0, 1] accuracy to a percentage.
        accuracy = 100 * accuracy

        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            # Host time at dequeue minus device enqueue time, flattened to
            # one latency value per batch.
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])
        else:
            latency_per_batch = None

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            # Ops that broadcast weights and run parameters from rank 0 so
            # all instances validate with identical state.
            broadcast_weights = []
            for var in tf.global_variables():
                broadcast_weights.append(
                    var.assign(hvd.broadcast(var, root_rank=0)))
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            # No popdist: broadcast ops and their placeholders are unused.
            broadcast_weights = None
            broadcast_global_batch_size, global_batch_size_ph = None, None
            broadcast_num_files, num_files_ph = None, None
            broadcast_iteration, iteration_ph = None, None

    # A single global availableMemoryProportion is only passed through when
    # exactly one value was given.
    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        # Layer the popdist device settings on top of the base config
        # without configuring the device here.
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    if opts['on_demand'] and reconfigure:
        # Attach to IPUs on demand and allow remote buffers in that mode.
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }

    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }

    # Freeze the graph: no ops may be added after this point.
    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)