def sparse_transformer_fwd_and_grad(transformer, input_activation):
    """Run one encoder layer forward and build its gradient/update ops.

    Returns a dict of tensors to stream off device: the layer output, the
    gradient of the loss w.r.t. the input, one entry per sparse weight
    gradient, plus the dense gradients collected by
    stream_dense_grads_from_device.
    """
    transformer.compute_dense_grad = True
    output_activation = transformer.encoder_layer(
        input_activation, mask=None, compute_dense_grad=True,
        debug_name="layer_0")
    loss = tf.reduce_sum(output_activation)

    # Wrap the optimizer (this would help manage the slot variables)
    wrapped_cls = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
    optimizer = wrapped_cls(
        learning_rate=1e-3,
        sparse_layers=transformer.sparse_layers.values())
    grads = optimizer.compute_gradients(loss)
    input_grad = tf.gradients(loss, input_activation)[0]

    # Make sure the input gradient is computed before weights are updated.
    with tf.control_dependencies([input_grad]):
        train_op = optimizer.apply_gradients(grads)

    with tf.control_dependencies([train_op]):
        stream_ops = {
            "output_activation": output_activation,
            "input_grad": input_grad,
        }
        # Sparse grads
        for grad, var in grads:
            stream_ops[var.op.name + "_grad"] = grad
        # Dense grads
        stream_dense_grads_from_device(transformer, loss, stream_ops)
    return stream_ops
def make_optimizer(lr, last_itr):
    """Compose the training optimizer from the wrappers selected in opts.

    The base optimizer class is wrapped (inner to outer) with sparse-layer
    support, a global-step update, and then — conditionally — loss scaling,
    gradient scaling for averaged accumulation, gradient-norm clipping and a
    selectable FP format for optimizer slots.
    """
    with tf.variable_scope("training", reuse=tf.AUTO_REUSE, use_resource=True):
        opt_cls, opt_kwargs = build_optimizer(opts.optimizer, opts.optimizer_arg)

        # Always-on wrappers: sparse layers + global-step increment.
        opt_cls = optimizers.SparseOptimizer(opt_cls)
        opt_cls = global_step_update_opt.GlobalStepUpdateOptimizer(opt_cls)

        if opts.loss_scale != 1:
            opt_cls = scaling_opt.LossScalingOptimizer(opt_cls)
            opt_kwargs['loss_scale'] = opts.loss_scale
            opt_kwargs['unscale_grad_pre_acc'] = opts.unscale_grad_pre_acc

        # NOTE(review): attribute is spelled 'grad_acculation_mode' on opts —
        # kept as-is to match the options object.
        if opts.grad_acculation_mode == 'Avg':
            opt_cls = scaling_opt.GradScalingOptimizer(opt_cls)
            opt_kwargs['grad_scale'] = 1 / opts.gradient_accumulation_count
            opt_kwargs['scale_grad_pre_acc'] = opts.scale_grad_pre_acc

        if opts.grad_norm_clip:
            opt_cls = grad_clip_opt.GradientClippingOptimizer(opt_cls)
            opt_kwargs['norm_clip_threshold'] = opts.grad_norm_clip

        if opts.slots_fp_type is not None and tf.as_dtype(opts.slots_fp_type) != opts.dtype:
            opt_cls = fp_slot_opt.SelectableSlotFPFormatOptimizer(opt_cls)
            opt_kwargs['slots_dtype'] = opts.slots_fp_type
            opt_kwargs['force_fp32_weight_update'] = opts.force_fp32_weight_update

        return opt_cls(
            learning_rate=lr,
            **opt_kwargs,
            sparse_layers=transformer.sparse_layers.values(),
            dense_gradient_condition=enable_dense_grad and last_itr,
            prune_and_grow_outfeed=dense_queue)
def graph_builder(opts, inputs):
    """Build the feed-forward DynsparseTransformer graph.

    Returns the scalar loss; when training, the returned tensor also carries
    control dependencies on the train op, the input gradient and (optionally)
    the streamed dense gradients so they are all executed.
    """
    input_activation = inputs["input_activation"]
    transformer = DynsparseTransformer(opts)
    transformer.compute_dense_grad = opts.compute_dense_grad and opts.train

    output_activation = transformer.feed_forward(input_activation)
    loss = tf.reduce_sum(output_activation)
    output = loss

    if opts.train:
        with tf.variable_scope("train", reuse=tf.AUTO_REUSE, use_resource=True):
            global_step = tf.train.get_or_create_global_step()
            optimizer = optimizers.SparseOptimizer(tf.train.AdamOptimizer)(
                learning_rate=1e-3,
                sparse_layers=transformer.sparse_layers.values())
            train_op = optimizer.minimize(loss, global_step=global_step)
            input_grad = tf.gradients(loss, input_activation)[0]

            dense_grads = []
            if opts.compute_dense_grad:
                streamed = transformer.streamDenseGradsFromDevice(
                    loss, optimizer, {})
                dense_grads = list(streamed.values())

            # Force everything to run before the loss is returned.
            with tf.control_dependencies(dense_grads + [train_op, input_grad]):
                output = tf.identity(loss)
    return output
def sparse_transformer_fwd_and_grad(transformer, input_activation):
    """Run self-attention forward and build its gradient/update ops.

    Returns a dict of tensors to stream off device: the attention output, the
    gradient w.r.t. the input, one entry per sparse weight gradient, plus the
    dense gradients collected by stream_dense_grads_from_device.
    """
    transformer.compute_dense_grad = True
    x = input_activation

    # Optional autoregressive mask: large negative bias above the diagonal.
    mask = None
    if transformer.use_autoregressive_mask_for_test:
        seq_len = transformer.source_sequence_length
        mask = np.triu(np.ones([seq_len, seq_len]), k=1) * -10000

    # Multi-head attention
    output_activation = transformer.attention(
        x, x, x, mask=mask, is_self_attention=True, compute_dense_grad=True)
    loss = tf.reduce_sum(output_activation)

    # Wrap the optimizer (this would help manage the slot variables)
    wrapped_cls = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
    optimizer = wrapped_cls(
        learning_rate=1e-3,
        sparse_layers=transformer.sparse_layers.values())
    grads = optimizer.compute_gradients(loss)
    input_grad = tf.gradients(loss, input_activation)[0]

    # Make sure the input gradient is computed before weights are updated.
    with tf.control_dependencies([input_grad]):
        train_op = optimizer.apply_gradients(grads)

    with tf.control_dependencies([train_op]):
        stream_ops = {
            "output_activation": output_activation,
            "input_grad": input_grad,
        }
        # Sparse grads
        for grad, var in grads:
            stream_ops[var.op.name + "_grad"] = grad
        # Dense grads
        stream_dense_grads_from_device(transformer, loss, stream_ops)
    return stream_ops
def optimizer_function(outputs):
    """Pipelining optimizer function.

    Builds the sparse optimizer from the closed-over class/kwargs and pairs
    it with the mean loss for `pipelining_ops`.
    """
    with tf.variable_scope("training", reuse=tf.AUTO_REUSE, use_resource=True):
        # Only gate dense-gradient computation when it is enabled at all.
        dense_cond = outputs['last_itr'] if dense_grad_enabled else None
        optimizer = optimizers.SparseOptimizer(opt_cls)(
            learning_rate=outputs['lr'],
            **opt_kws,
            name='optimise',
            sparse_layers=fc_layers.values(),
            dense_gradient_condition=dense_cond,
            prune_and_grow_outfeed=png_queue)
        return pipelining_ops.OptimizerFunctionOutput(
            optimizer, outputs['mean_loss'])
def graph_builder(layers, opts, inputs):
    """Build the train (or inference) graph for the sparse FC benchmark.

    Training returns the step-control variable gated on all ops having run;
    inference simply applies the layer.
    """
    layers['fc'] = layers['fc_gen']()

    if not opts.train:
        return layers['fc'](inputs["inputs"], inputs["cond"])

    # Need to check if this is the last iteration of the loop
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable(
            "iterations", shape=[], dtype=tf.int32,
            initializer=tf.zeros_initializer())
        mod_itrs = tf.math.floormod(itr_counter, opts.batches_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        inc = tf.assign_add(itr_counter, 1)

    z = layers['fc'](inputs["inputs"], last_itr)
    # Loss is the mean across output matrix:
    loss = tf.reduce_mean(z)

    with tf.variable_scope("train", reuse=tf.AUTO_REUSE, use_resource=True):
        # We need to ensure that the train op is executed as part of
        # the benchmarking loop by maintaining a step variable and
        # forcing a control dependency between it and the train op:
        global_step = tf.get_variable("step_control", dtype=tf.int32, shape=[])
        optimiser = optimizers.SparseOptimizer(tf.train.MomentumOptimizer)(
            learning_rate=0.01, momentum=0.0001, use_nesterov=True,
            name='optimise', sparse_layers=[layers['fc']])
        with tf.control_dependencies([global_step]):
            train_op = optimiser.minimize(loss)
        all_ops = tf.group(inc, train_op)
        with tf.control_dependencies([all_ops]):
            global_step = tf.identity(global_step)
        return global_step
def model(opts, use_ipu_function, input_x):
    """Build a chain of sparse FC layers and their gradient tensors.

    Applies `opts.n_layers` identically-shaped sparse linear layers to
    `input_x` (optionally outlined into a single IPU function), then builds
    an Adam-based sparse optimizer over them.

    Returns a dict with one activation per layer, one entry per sparse
    weight gradient, the dense grad per layer (when
    `opts.compute_dense_grad`), and the gradient w.r.t. the input.
    """
    sparse_layers = []

    # The outer function is just a Python function.
    def sparseLinear(x, dense_length, opts):
        x_shape = x.shape.with_rank(2).as_list()
        # Glorot-style init limit, rescaled by the layer density.
        limit = np.sqrt(6 / ((x_shape[-1] + dense_length) * opts.density))
        uniform_gen = partial(np.random.uniform, -limit, limit)
        indices_random_gen = np.random.default_rng(seed=0)
        sparse_layer = layers.SparseFcLayer.from_random_generator(
            hidden_size=dense_length,
            input_shape=x_shape,
            density=opts.density,
            block_size=1,
            values_initialiser_gen=uniform_gen,
            indices_initialiser_gen=indices_random_gen,
            name="sparse_layer",
            dtype=x.dtype,
            matmul_options=opts.sparse_matmul_options,
            use_bias=opts.use_bias,
            relu=True,
            disable_updating=opts.disable_updating,
            pooling_type="NONE")

        # Create placeholders on the host, outside XLA
        with tf.init_scope():  # escapes XLA
            with tf.device("cpu"):
                sparse_layer.create_placeholders()
        sparse_layers.append(sparse_layer)

        if use_ipu_function:
            @ipu.outlined_function
            def f(x):
                # Call the layer with the provided input
                return sparse_layer(x, opts.compute_dense_grad)
            return f(x)
        return sparse_layer(x, opts.compute_dense_grad)

    x = input_x
    outputs = {}
    # Loop through n_layers which all use the same shape,
    # and therefore the same ipu_function
    for i in range(opts.n_layers):
        with tf.variable_scope(f"sparse_{i}", use_resource=True):
            x = sparseLinear(x, opts.hidden_length, opts)
            outputs[f"activation_{i}"] = x

    loss = tf.reduce_sum(x)

    # Construct a sparse optimizer as usual
    optimizer = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
    optimizer = optimizer(sparse_layers=sparse_layers)
    g = optimizer.compute_gradients(loss)

    # Record the grads for comparison
    for grad, var in g:
        outputs[var.name + "_grad"] = grad

    if opts.compute_dense_grad:
        for i, layer in enumerate(sparse_layers):
            # BUG FIX: the key was the literal string "layer_{i}_gradW"
            # (missing f-prefix), so every layer overwrote the same entry
            # and only the last layer's dense grad survived.
            outputs[f"layer_{i}_gradW"] = layer.get_dense_grad_w(
                tf.reduce_sum(loss))

    outputs["input_grad"] = tf.gradients(loss, input_x)[0]
    return outputs
def forward_pass(opts, transformer, lr, iterations_per_step, is_training,
                 outfeed, png_outfeed, source, target):
    """Build one training/inference step of the sparse transformer classifier.

    Pads the input and output dims up to multiples of opts.block_size, runs
    position encoding -> sparse embedding projection -> one encoder layer ->
    sparse output projection, then accuracy + softmax cross-entropy.  When
    training, applies the sparse optimizer (with prune-and-grow streamed to
    png_outfeed) and enqueues metrics on the last iteration of the step;
    otherwise enqueues loss/accuracy every call.
    """
    # Input may require padding for some block-sizes as
    # the hidden dim must be a multiple of opts.block_size.
    B, S, H = source.shape.as_list()
    if (H % opts.block_size) != 0:
        pad_size = (1 + (H // opts.block_size)) * opts.block_size - H
        source = tf.pad(source, [[0, 0], [0, 0], [0, pad_size]])
    transformer.embedding_length = source.shape.as_list()[-1]

    # Counter used to detect the last iteration of each step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable("iterations", [], tf.int32)
        mod_itrs = tf.math.floormod(itr_counter, iterations_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        increment_counter = tf.assign_add(itr_counter, 1)

    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE, use_resource=True):
        # Dense grads are only computed on the last iteration (a tf condition).
        transformer.compute_dense_grad = last_itr
        # Add position embeddings to prevent permutation invariance
        x = transformer.position_encoder(source,
                                         transformer.source_sequence_length)
        # Project image to hidden dimension (to enable skip connects)
        with transformer.namescope("embedding_projection"):
            x = transformer.sparseLinear(x, transformer.sparsity,
                                         transformer.hidden_length, last_itr,
                                         use_bias=True)
        # Use a single encoder layer x [B, S, H]
        x = transformer.encoder_layer(x, mask=None, debug_name="encoder_layer0")

    # Each token position then produces logits independently.
    # The model logits is the sum over all output tokens
    with transformer.namescope("output"):
        # The output dimension may need to be padded to conform to block size
        T = transformer.target_vocab_length
        output_pad_size = 0
        if (T % opts.block_size) != 0:
            output_pad_size = (1 + (T // opts.block_size)) * opts.block_size - T
        # Compute output
        x = transformer.sparseLinear(x, transformer.sparsity,
                                     T + output_pad_size, last_itr,
                                     use_bias=False)
        # Remove padding
        if output_pad_size > 0:
            x = x[:, :, :T]
        model_output = tf.reduce_sum(x, axis=1)  # [B, S, 10] -> [B, 10]

    with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE, use_resource=True):
        predictions = tf.argmax(model_output, axis=1, output_type=tf.int32)
        acc, acc_op = tf.metrics.accuracy(target, predictions, name="accuracy")
        # Sparse softmax can lead to NaNs very easily in float16
        logits = model_output if model_output.dtype == tf.float32 else tf.cast(
            model_output, tf.float32)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                              logits=logits)
        mean_loss = tf.reduce_mean(loss, name='train_loss')

    if is_training:
        with tf.variable_scope("training", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimizer_class, optimizer_kwargs = build_optimizer(
                opts.optimizer, opts.optimizer_arg)
            optimizer = optimizers.SparseOptimizer(optimizer_class)
            optimizer = optimizer(
                learning_rate=lr, **optimizer_kwargs,
                sparse_layers=transformer.sparse_layers.values(),
                prune_and_grow_outfeed=png_outfeed,
                dense_gradient_condition=last_itr)
            train_op = optimizer.minimize(loss)

        metrics = {
            'mean_loss': mean_loss,
            'acc': acc,
            'iteration': itr_counter
        }

        # Only stream back final metrics:
        def true_fn():
            # Enqueue metrics and update accuracy before signalling done.
            with tf.control_dependencies([outfeed.enqueue(metrics), acc_op]):
                return tf.no_op()

        # Train first, then (conditionally) enqueue on the last iteration.
        with tf.control_dependencies([train_op]):
            condition = tf.cond(last_itr, true_fn, tf.no_op)
        output = tf.group(increment_counter, condition)
    else:
        # At inference time stream back the loss and accuracy
        with tf.control_dependencies([acc_op]):
            mean_loss = tf.reduce_mean(loss, name='test_loss')
        output = outfeed.enqueue({
            'mean_loss': mean_loss,
            'acc': acc,
            'iteration': itr_counter
        })
    return output
def forward_pass(opts, transformer, lr, iterations_per_step, is_training,
                 outfeed, source, target):
    """Build one training/inference step of the sparse transformer classifier.

    Runs position encoding -> sparse embedding projection -> one encoder
    layer -> sparse output projection, then accuracy + softmax cross-entropy.
    When training, applies the sparse optimizer and streams weights, slots
    and dense grads back on the last iteration of each step; otherwise
    enqueues loss/accuracy every call.
    """
    # Counter used to detect the last iteration of each step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable("iterations", [], tf.int32)
        mod_itrs = tf.math.floormod(itr_counter, iterations_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        increment_counter = tf.assign_add(itr_counter, 1)

    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE, use_resource=True):
        # Dense grads are only computed on the last iteration (a tf condition).
        transformer.compute_dense_grad = last_itr
        # Add position embeddings to prevent permutation invariance
        x = transformer.position_encoder(source,
                                         transformer.source_sequence_length)
        # Project image to hidden dimension (to enable skip connects)
        with transformer.namescope("embedding_projection"):
            x = transformer.sparseLinear(x, transformer.sparsity,
                                         transformer.hidden_length, last_itr,
                                         use_bias=True)
        # Use a single encoder layer x [B, S, H]
        x = transformer.encoder_layer(x, mask=None, debug_name="encoder_layer0")

    # Each token position then produces logits independently.
    # The model logits is the sum over all output tokens
    with transformer.namescope("output"):
        x = transformer.sparseLinear(x, transformer.sparsity,
                                     transformer.target_vocab_length,
                                     last_itr, use_bias=False)
        model_output = tf.reduce_sum(x, axis=1)  # [B, S, 10] -> [B, 10]

    with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE, use_resource=True):
        predictions = tf.argmax(model_output, axis=1, output_type=tf.int32)
        acc, acc_op = tf.metrics.accuracy(target, predictions, name="accuracy")
        # Sparse softmax can lead to NaNs very easily in float16
        logits = model_output if model_output.dtype == tf.float32 else tf.cast(
            model_output, tf.float32)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                              logits=logits)
        mean_loss = tf.reduce_mean(loss, name='train_loss')

    if is_training:
        with tf.variable_scope("training", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimizer_class, optimizer_kwargs = build_optimizer(
                opts.optimizer, opts.optimizer_arg)
            optimizer = optimizers.SparseOptimizer(optimizer_class)
            optimizer = optimizer(
                learning_rate=lr, **optimizer_kwargs,
                sparse_layers=transformer.sparse_layers.values())
            train_op = optimizer.minimize(loss)

        # Prepare tensors that should stream back to host
        streamOps = {'mean_loss': mean_loss, 'acc': acc}
        transformer.streamWeightsFromDevice(streamOps)
        transformer.streamOptimizerSlotsFromDevice(optimizer, streamOps)
        transformer.streamDenseGradsFromDevice(loss, streamOps)

        # Sparse weights will stream back to host at the end of
        # every iterations_per_step. We use a tf.cond to check whether
        # it is time to stream back
        def true_fn():
            with tf.control_dependencies([outfeed.enqueue(streamOps), acc_op]):
                return tf.no_op()

        condition = tf.cond(last_itr, true_fn, tf.no_op)
        output = tf.group(increment_counter, condition, train_op)
    else:
        # At inference time stream back the loss and accuracy
        with tf.control_dependencies([acc_op]):
            mean_loss = tf.reduce_mean(loss, name='test_loss')
        output = outfeed.enqueue({'mean_loss': mean_loss, 'acc': acc})

    print("XLA Output: ", output)
    return output
def model(fc_layers, droprate, lr, opt_cls, opt_kws, iterations_per_step,
          training: bool, last_outqueue, inputs, labels):
    """Build a two-layer sparse FC classifier step (train or test).

    fc1 -> dropout (train only) -> fc2 -> accuracy + softmax cross-entropy.
    When training, the sparse optimizer is applied and — only on the last
    iteration of each step — the sparse weights, dense grads and optimizer
    slots are enqueued on last_outqueue; at test time the loss/accuracy are
    enqueued every call.  Returns the enqueue/group op to run.
    """
    # Counter used to detect the last iteration of each step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable("iterations", shape=[], dtype=tf.int32,
                                      trainable=False,
                                      initializer=tf.zeros_initializer())
        mod_itrs = tf.math.floormod(itr_counter, iterations_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        inc = tf.assign_add(itr_counter, 1)

    fc1 = fc_layers['fc1']
    fc2 = fc_layers['fc2']
    relu1 = fc1(inputs, last_itr)

    # Use the IPU optimised version of dropout:
    if training:
        drop1 = rand_ops.dropout(relu1, rate=droprate)
    else:
        drop1 = relu1
    relu2 = fc2(drop1, last_itr)

    with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE, use_resource=True):
        acc, acc_op = tf.metrics.accuracy(
            labels=labels,
            predictions=tf.argmax(relu2, axis=1, output_type=tf.dtypes.int32),
            name="accuracy")
        # Cast logits to fp32 so the softmax is numerically stable.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels,
            logits=tf.cast(relu2, dtype=tf.float32, name="logits_to_fp32"))

    if training:
        with tf.variable_scope("training", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimiser = optimizers.SparseOptimizer(opt_cls)(
                learning_rate=lr, **opt_kws, name='optimise',
                sparse_layers=[fc1, fc2])
            train_op = optimiser.minimize(loss)
            slot_names = optimiser.get_slot_names()
            logger.debug(f"Optimiser slot names: {slot_names}")
        # Make sure the update and the accuracy op run before the loss.
        with tf.control_dependencies([train_op, acc_op]):
            mean_loss = tf.reduce_mean(loss, name='train_loss')
    else:
        with tf.control_dependencies([acc_op]):
            mean_loss = tf.reduce_mean(loss, name='test_loss')

    # Prepare results for feeds:
    last_results = {'mean_loss': mean_loss, 'acc': acc}
    for name, fc in fc_layers.items():
        if fc.is_sparse():
            weights_tensor = tf.convert_to_tensor(fc.get_values_var())
            last_results[name + '_non_zeros'] = weights_tensor
            if training:
                dense_grad_w = fc.get_dense_grad_w(loss)
                last_results[name + '_grad_w'] = tf.convert_to_tensor(
                    dense_grad_w)
                for slot_name in fc.sparse_slots:
                    last_results[name + f'_{slot_name}'] = \
                        tf.convert_to_tensor(
                            fc.sparse_slots[slot_name].tf_variable)

    # When training we only want to return the sparse
    # non-zero weight values on the last iteration.
    if training:
        def enqueue_last_itr():
            enqueue_weights = last_outqueue.enqueue(last_results)
            with tf.control_dependencies([enqueue_weights]):
                return tf.no_op()

        def nop():
            return tf.no_op()

        cond_op = tf.cond(last_itr, enqueue_last_itr, nop)
        enqueue_op = tf.group(inc, cond_op, train_op)
    else:
        enqueue_op = last_outqueue.enqueue({'mean_loss': mean_loss,
                                            'acc': acc})
    return enqueue_op