def sparse_transformer_fwd_and_grad(transformer, input_activation):
    """Build the forward pass and gradients for a single encoder layer.

    Returns a dict of tensors to stream back to the host: the layer
    output, the gradient w.r.t. the input, one entry per sparse
    variable gradient, and (via `stream_dense_grads_from_device`)
    the dense gradients.
    """
    transformer.compute_dense_grad = True
    activation = transformer.encoder_layer(input_activation,
                                           mask=None,
                                           compute_dense_grad=True,
                                           debug_name="layer_0")
    loss = tf.reduce_sum(activation)

    # Wrapping Adam in SparseOptimizer lets it manage the slot
    # variables belonging to the sparse layers.
    wrapped_cls = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
    optimizer = wrapped_cls(learning_rate=1e-3,
                            sparse_layers=transformer.sparse_layers.values())

    grads_and_vars = optimizer.compute_gradients(loss)
    input_grad = tf.gradients(loss, input_activation)[0]
    # Take the input gradient before the weight update is applied.
    with tf.control_dependencies([input_grad]):
        train_op = optimizer.apply_gradients(grads_and_vars)

    with tf.control_dependencies([train_op]):
        stream_ops = {
            "output_activation": activation,
            "input_grad": input_grad,
        }
        # One entry per sparse gradient.
        for grad, var in grads_and_vars:
            stream_ops[var.op.name + "_grad"] = grad
        # The helper adds the dense gradients in place.
        stream_dense_grads_from_device(transformer, loss, stream_ops)
        return stream_ops
# Example 2
 def make_optimizer(lr, last_itr):
     """Compose the training optimizer from stacked wrapper classes.

     Wrapping order (inner to outer): configured base optimizer ->
     SparseOptimizer -> GlobalStepUpdateOptimizer -> optional loss
     scaling, grad scaling, grad clipping, and slot FP-format wrappers.
     Relies on enclosing-scope names: `opts`, `transformer`,
     `enable_dense_grad`, `dense_queue` — defined outside this view.

     Args:
         lr: learning-rate tensor/value passed to the final optimizer.
         last_itr: bool tensor; dense grads are only produced when True.

     Returns:
         The fully constructed optimizer instance.
     """
     with tf.variable_scope("training", reuse=tf.AUTO_REUSE, use_resource=True):
         optimizer_class, optimizer_kwargs = build_optimizer(opts.optimizer, opts.optimizer_arg)
         # Manage the sparse layers' slot variables and prune/grow outfeed.
         optimizer_class = optimizers.SparseOptimizer(optimizer_class)
         optimizer_class = global_step_update_opt.GlobalStepUpdateOptimizer(optimizer_class)
         if opts.loss_scale != 1:
             optimizer_class = scaling_opt.LossScalingOptimizer(optimizer_class)
             optimizer_kwargs['loss_scale'] = opts.loss_scale
             optimizer_kwargs['unscale_grad_pre_acc'] = opts.unscale_grad_pre_acc
         # NOTE(review): 'grad_acculation_mode' looks like a typo for
         # 'grad_accumulation_mode', but it must match the opts attribute
         # defined elsewhere, so it is left unchanged here.
         if opts.grad_acculation_mode == 'Avg':
             # Average (rather than sum) accumulated gradients.
             optimizer_class = scaling_opt.GradScalingOptimizer(optimizer_class)
             optimizer_kwargs['grad_scale'] = 1 / opts.gradient_accumulation_count
             optimizer_kwargs['scale_grad_pre_acc'] = opts.scale_grad_pre_acc
         if opts.grad_norm_clip:
             optimizer_class = grad_clip_opt.GradientClippingOptimizer(optimizer_class)
             optimizer_kwargs['norm_clip_threshold'] = opts.grad_norm_clip
         # Keep optimizer slots in a different FP format than the weights.
         if opts.slots_fp_type is not None and tf.as_dtype(opts.slots_fp_type) != opts.dtype:
             optimizer_class = fp_slot_opt.SelectableSlotFPFormatOptimizer(optimizer_class)
             optimizer_kwargs['slots_dtype'] = opts.slots_fp_type
             optimizer_kwargs['force_fp32_weight_update'] = opts.force_fp32_weight_update
         optimizer = optimizer_class(learning_rate=lr, **optimizer_kwargs,
                                     sparse_layers=transformer.sparse_layers.values(),
                                     dense_gradient_condition=enable_dense_grad and last_itr,
                                     prune_and_grow_outfeed=dense_queue)
     return optimizer
# Example 3
def graph_builder(opts, inputs):
    """Build the transformer feed-forward graph.

    In training mode, also builds the optimizer update and returns the
    loss with control dependencies on the train op, the input grad and
    (optionally) the dense grads. Otherwise returns the plain loss.
    """
    x = inputs["input_activation"]
    transformer = DynsparseTransformer(opts)
    transformer.compute_dense_grad = opts.compute_dense_grad and opts.train
    loss = tf.reduce_sum(transformer.feed_forward(x))

    if not opts.train:
        return loss

    with tf.variable_scope("train", reuse=tf.AUTO_REUSE,
                           use_resource=True):
        step = tf.train.get_or_create_global_step()
        opt = optimizers.SparseOptimizer(tf.train.AdamOptimizer)(
            learning_rate=1e-3,
            sparse_layers=transformer.sparse_layers.values())
        train_op = opt.minimize(loss, global_step=step)
        input_grad = tf.gradients(loss, x)[0]

        # Dense gradients are only materialised when requested.
        dense_grads = []
        if opts.compute_dense_grad:
            dense_grads = list(
                transformer.streamDenseGradsFromDevice(
                    loss, opt, {}).values())

        # Force all gradient/update ops to run before the output.
        with tf.control_dependencies(dense_grads + [train_op, input_grad]):
            return tf.identity(loss)
# Example 4
def sparse_transformer_fwd_and_grad(transformer, input_activation):
    """Build fwd pass + gradients for a single self-attention block.

    Returns a dict ("streamOps") of tensors to stream back to the host:
    the attention output, the input gradient, per-variable sparse grads
    and (via `stream_dense_grads_from_device`) the dense grads.
    """
    transformer.compute_dense_grad = True
    x = input_activation

    # Optional autoregressive mask
    mask = None
    if transformer.use_autoregressive_mask_for_test:
        # Additive mask: large negative values strictly above the diagonal
        # suppress attention to future positions.
        mask = np.triu(np.ones([transformer.source_sequence_length, transformer.source_sequence_length]), k=1)*-10000

    # Multi-head attention (query = key = value = x, i.e. self-attention)
    output_activation = transformer.attention(x, x, x, mask=mask, is_self_attention=True, compute_dense_grad=True)
    loss = tf.reduce_sum(output_activation)

    # Wrap the optimizer (this would help manage the slot variables)
    optimizer = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
    optimizer = optimizer(learning_rate=1e-3, sparse_layers=transformer.sparse_layers.values())

    grads = optimizer.compute_gradients(loss)
    input_grad = tf.gradients(loss, input_activation)[0]
    # Compute the input gradient before the weight update is applied so it
    # is taken w.r.t. the pre-update weights.
    with tf.control_dependencies([input_grad]):
        train_op = optimizer.apply_gradients(grads)

    with tf.control_dependencies([train_op]):
        streamOps = {"output_activation": output_activation}
        streamOps["input_grad"] = input_grad
        # Sparse grads
        for grad, var in grads:
            streamOps[var.op.name + "_grad"] = grad
        # Dense grads
        stream_dense_grads_from_device(transformer, loss, streamOps)
        return streamOps
# Example 5
 def optimizer_function(outputs):
     """Build the optimizer for the pipelining API.

     Relies on enclosing-scope names: `opt_cls`, `opt_kws`, `fc_layers`,
     `dense_grad_enabled`, `png_queue`. Returns the wrapped optimizer
     paired with the mean loss from `outputs`.
     """
     with tf.variable_scope("training",
                            reuse=tf.AUTO_REUSE,
                            use_resource=True):
         # Dense gradients are only produced on the last iteration,
         # and only when the feature is enabled at all.
         dense_condition = outputs['last_itr'] if dense_grad_enabled else None
         optimizer = optimizers.SparseOptimizer(opt_cls)(
             learning_rate=outputs['lr'],
             **opt_kws,
             name='optimise',
             sparse_layers=fc_layers.values(),
             dense_gradient_condition=dense_condition,
             prune_and_grow_outfeed=png_queue)
     return pipelining_ops.OptimizerFunctionOutput(optimizer,
                                                   outputs['mean_loss'])
# Example 6
def graph_builder(layers, opts, inputs):
    """Build the benchmark graph around the sparse FC layer.

    Training mode builds an iteration counter plus a train op and returns
    a step-control tensor that carries a control dependency on both, so
    the update runs on every benchmark iteration. Inference mode simply
    applies the layer.
    """
    layers['fc'] = layers['fc_gen']()

    if opts.train:
        # Need to check if this is the last iteration of the loop
        with tf.variable_scope("counter",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            itr_counter = tf.get_variable("iterations",
                                          shape=[],
                                          dtype=tf.int32,
                                          initializer=tf.zeros_initializer())
            mod_itrs = tf.math.floormod(itr_counter, opts.batches_per_step)
            last_itr = tf.equal(mod_itrs, 0)
            inc = tf.assign_add(itr_counter, 1)
        z = layers['fc'](inputs["inputs"], last_itr)
        # Loss is the mean across output matrix:
        loss = tf.reduce_mean(z)
        with tf.variable_scope("train", reuse=tf.AUTO_REUSE,
                               use_resource=True):
            # We need to ensure that the train op is executed as part of
            # the benchmarking loop by maintaining a step variable and
            # forcing a control dependency between it and the train op:
            # (no explicit initializer: TF1 defaults int variables to zeros)
            global_step = tf.get_variable("step_control",
                                          dtype=tf.int32,
                                          shape=[])
            optimiser = optimizers.SparseOptimizer(tf.train.MomentumOptimizer)(
                learning_rate=0.01,
                momentum=0.0001,
                use_nesterov=True,
                name='optimise',
                sparse_layers=[layers['fc']])
            with tf.control_dependencies([global_step]):
                train_op = optimiser.minimize(loss)
        # Returned tensor depends on both the counter increment and the
        # weight update, so running it runs the whole step.
        all_ops = tf.group(inc, train_op)
        with tf.control_dependencies([all_ops]):
            global_step = tf.identity(global_step)
        return global_step

    else:
        return layers['fc'](inputs["inputs"], inputs["cond"])
# Example 7
def model(opts, use_ipu_function, input_x):
    """Build a stack of sparse FC layers and return fwd/grad tensors.

    Args:
        opts: options namespace (density, n_layers, hidden_length,
            compute_dense_grad, sparse_matmul_options, ...).
        use_ipu_function: if True, wrap each layer call in an
            `ipu.outlined_function` so the layers share one outlined body.
        input_x: rank-2 input activation tensor.

    Returns:
        dict of output tensors: per-layer activations, per-variable
        sparse grads, optional per-layer dense grads, and the gradient
        of the loss w.r.t. `input_x`.
    """
    sparse_layers = []

    # The outer function is just a Python function.
    def sparseLinear(x, dense_length, opts):
        x_shape = x.shape.with_rank(2).as_list()
        # Glorot-style init limit, scaled by the layer's density.
        limit = np.sqrt(6 / ((x_shape[-1] + dense_length) * opts.density))
        uniform_gen = partial(np.random.uniform, -limit, limit)
        indices_random_gen = np.random.default_rng(seed=0)

        sparse_layer = layers.SparseFcLayer.from_random_generator(
            hidden_size=dense_length,
            input_shape=x_shape,
            density=opts.density,
            block_size=1,
            values_initialiser_gen=uniform_gen,
            indices_initialiser_gen=indices_random_gen,
            name="sparse_layer",
            dtype=x.dtype,
            matmul_options=opts.sparse_matmul_options,
            use_bias=opts.use_bias,
            relu=True,
            disable_updating=opts.disable_updating,
            pooling_type="NONE")

        # Create placeholders on the host, outside XLA
        with tf.init_scope():  # escapes XLA
            with tf.device("cpu"):
                sparse_layer.create_placeholders()
        sparse_layers.append(sparse_layer)

        if use_ipu_function:
            @ipu.outlined_function
            def f(x):
                # Call the layer with the provided input
                x = sparse_layer(x, opts.compute_dense_grad)
                return x
            return f(x)
        else:
            return sparse_layer(x, opts.compute_dense_grad)

    x = input_x
    outputs = {}
    # Loop through n_layers which all use the same shape,
    # and therefore the same ipu_function
    for i in range(opts.n_layers):
        with tf.variable_scope(f"sparse_{i}", use_resource=True):
            x = sparseLinear(x, opts.hidden_length, opts)
            outputs[f"activation_{i}"] = x
    loss = tf.reduce_sum(x)
    # Construct a sparse optimizer as usual
    optimizer = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
    optimizer = optimizer(sparse_layers=sparse_layers)

    g = optimizer.compute_gradients(loss)

    # Record the grads for comparison
    for grad, var in g:
        outputs[var.name + "_grad"] = grad
    if opts.compute_dense_grad:
        for i, layer in enumerate(sparse_layers):
            # Bug fix: key was the literal "layer_{i}_gradW" (missing
            # f-prefix), so every layer overwrote the same dict entry.
            outputs[f"layer_{i}_gradW"] = layer.get_dense_grad_w(tf.reduce_sum(loss))
    outputs["input_grad"] = tf.gradients(loss, input_x)[0]

    return outputs
def forward_pass(opts, transformer, lr, iterations_per_step, is_training,
                 outfeed, png_outfeed, source, target):
    """Build one device-side step: forward pass, metrics and (when
    `is_training`) the optimizer update.

    Metrics are enqueued to `outfeed` only on the last iteration of each
    host step; prune-and-grow results stream through `png_outfeed` via
    the optimizer. Returns the op to run for this step.
    """
    # Input may require padding for some block-sizes as
    # the hidden dimension must be a multiple of opts.block_size.
    B, S, H = source.shape.as_list()
    if (H % opts.block_size) != 0:
        pad_size = (1 + (H // opts.block_size)) * opts.block_size - H
        source = tf.pad(source, [[0, 0], [0, 0], [0, pad_size]])
    transformer.embedding_length = source.shape.as_list()[-1]

    # Iteration counter used to detect the last iteration of a host step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable("iterations", [], tf.int32)
        mod_itrs = tf.math.floormod(itr_counter, iterations_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        increment_counter = tf.assign_add(itr_counter, 1)

    with tf.variable_scope("transformer",
                           reuse=tf.AUTO_REUSE,
                           use_resource=True):
        # Dense-gradient computation is gated on the last iteration.
        transformer.compute_dense_grad = last_itr

        # Add position embeddings to prevent permutation invariance
        x = transformer.position_encoder(source,
                                         transformer.source_sequence_length)

        # Project image to hidden dimension (to enable skip connects)
        with transformer.namescope("embedding_projection"):
            x = transformer.sparseLinear(x,
                                         transformer.sparsity,
                                         transformer.hidden_length,
                                         last_itr,
                                         use_bias=True)

        # Use a single encoder layer  x [B, S, H]
        x = transformer.encoder_layer(x,
                                      mask=None,
                                      debug_name="encoder_layer0")

        # Each token position then produces logits independently.
        # The model logits is the sum over all output tokens
        with transformer.namescope("output"):
            # The output dimension may need to be padded to conform to block size
            T = transformer.target_vocab_length
            output_pad_size = 0
            if (T % opts.block_size) != 0:
                output_pad_size = (
                    1 + (T // opts.block_size)) * opts.block_size - T

            # Compute output
            x = transformer.sparseLinear(x,
                                         transformer.sparsity,
                                         T + output_pad_size,
                                         last_itr,
                                         use_bias=False)

            # Remove padding
            if output_pad_size > 0:
                x = x[:, :, :T]

            model_output = tf.reduce_sum(x, axis=1)  # [B, S, 10] -> [B, 10]

    with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE, use_resource=True):
        predictions = tf.argmax(model_output, axis=1, output_type=tf.int32)
        acc, acc_op = tf.metrics.accuracy(target, predictions, name="accuracy")
        # Sparse softmax can lead to NaNs very easily in float16
        logits = model_output if model_output.dtype == tf.float32 else tf.cast(
            model_output, tf.float32)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                              logits=logits)
        mean_loss = tf.reduce_mean(loss, name='train_loss')

    if is_training:
        with tf.variable_scope("training",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimizer_class, optimizer_kwargs = build_optimizer(
                opts.optimizer, opts.optimizer_arg)
            # SparseOptimizer manages the sparse layers' slot variables
            # and streams prune-and-grow results to png_outfeed.
            optimizer = optimizers.SparseOptimizer(optimizer_class)
            optimizer = optimizer(
                learning_rate=lr,
                **optimizer_kwargs,
                sparse_layers=transformer.sparse_layers.values(),
                prune_and_grow_outfeed=png_outfeed,
                dense_gradient_condition=last_itr)
            train_op = optimizer.minimize(loss)

        metrics = {
            'mean_loss': mean_loss,
            'acc': acc,
            'iteration': itr_counter
        }

        # Only stream back final metrics:
        def true_fn():
            with tf.control_dependencies([outfeed.enqueue(metrics), acc_op]):
                return tf.no_op()

        # The enqueue (when it fires) runs after the weight update.
        with tf.control_dependencies([train_op]):
            condition = tf.cond(last_itr, true_fn, tf.no_op)
            output = tf.group(increment_counter, condition)

    else:
        # At inference time stream back the loss and accuracy
        with tf.control_dependencies([acc_op]):
            mean_loss = tf.reduce_mean(loss, name='test_loss')
        output = outfeed.enqueue({
            'mean_loss': mean_loss,
            'acc': acc,
            'iteration': itr_counter
        })
    return output
# Example 9
def forward_pass(opts, transformer, lr, iterations_per_step, is_training,
                 outfeed, source, target):
    """Build one step of the sparse transformer: forward pass, metrics
    and (when `is_training`) the optimizer update.

    Weights, optimizer slots and dense grads are enqueued to `outfeed`
    only on the last iteration of each host step. Returns the op to run.
    """
    # Iteration counter used to detect the last iteration of a host step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable("iterations", [], tf.int32)
        mod_itrs = tf.math.floormod(itr_counter, iterations_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        increment_counter = tf.assign_add(itr_counter, 1)

    with tf.variable_scope("transformer",
                           reuse=tf.AUTO_REUSE,
                           use_resource=True):
        # Dense-gradient computation is gated on the last iteration.
        transformer.compute_dense_grad = last_itr

        # Add position embeddings to prevent permutation invariance
        x = transformer.position_encoder(source,
                                         transformer.source_sequence_length)

        # Project image to hidden dimension (to enable skip connects)
        with transformer.namescope("embedding_projection"):
            x = transformer.sparseLinear(x,
                                         transformer.sparsity,
                                         transformer.hidden_length,
                                         last_itr,
                                         use_bias=True)

        # Use a single encoder layer  x [B, S, H]
        x = transformer.encoder_layer(x,
                                      mask=None,
                                      debug_name="encoder_layer0")

        # Each token position then produces logits independently.
        # The model logits is the sum over all output tokens
        with transformer.namescope("output"):
            x = transformer.sparseLinear(x,
                                         transformer.sparsity,
                                         transformer.target_vocab_length,
                                         last_itr,
                                         use_bias=False)
            model_output = tf.reduce_sum(x, axis=1)  # [B, S, 10] -> [B, 10]

    with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE, use_resource=True):
        predictions = tf.argmax(model_output, axis=1, output_type=tf.int32)
        acc, acc_op = tf.metrics.accuracy(target, predictions, name="accuracy")
        # Sparse softmax can lead to NaNs very easily in float16
        logits = model_output if model_output.dtype == tf.float32 else tf.cast(
            model_output, tf.float32)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                              logits=logits)
        mean_loss = tf.reduce_mean(loss, name='train_loss')

    if is_training:
        with tf.variable_scope("training",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            optimizer_class, optimizer_kwargs = build_optimizer(
                opts.optimizer, opts.optimizer_arg)
            # SparseOptimizer manages the sparse layers' slot variables.
            optimizer = optimizers.SparseOptimizer(optimizer_class)
            optimizer = optimizer(
                learning_rate=lr,
                **optimizer_kwargs,
                sparse_layers=transformer.sparse_layers.values())
            train_op = optimizer.minimize(loss)

        # Prepare tensors that should stream back to host
        streamOps = {'mean_loss': mean_loss, 'acc': acc}
        transformer.streamWeightsFromDevice(streamOps)
        transformer.streamOptimizerSlotsFromDevice(optimizer, streamOps)
        transformer.streamDenseGradsFromDevice(loss, streamOps)

        # Sparse weights will stream back to host at the end of
        # every iterations_per_step. We use a tf.cond to check whether
        # it is time to stream back
        def true_fn():
            with tf.control_dependencies([outfeed.enqueue(streamOps), acc_op]):
                return tf.no_op()

        condition = tf.cond(last_itr, true_fn, tf.no_op)
        output = tf.group(increment_counter, condition, train_op)

    else:
        # At inference time stream back the loss and accuracy
        with tf.control_dependencies([acc_op]):
            mean_loss = tf.reduce_mean(loss, name='test_loss')
        output = outfeed.enqueue({'mean_loss': mean_loss, 'acc': acc})
    # NOTE(review): leftover debug print — consider removing or using logging.
    print("XLA Output: ", output)
    return output
# Example 10
def model(fc_layers, droprate, lr, opt_cls, opt_kws, iterations_per_step, training: bool,
          last_outqueue, inputs, labels):
    """Build one step of a two-layer sparse FC classifier.

    In training mode the sparse weights, dense grads and optimizer
    slots are enqueued to `last_outqueue` only on the last iteration of
    each host step; in inference mode loss/accuracy are enqueued every
    call. Returns the enqueue/group op to run.
    """
    # Iteration counter used to detect the last iteration of a host step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        itr_counter = tf.get_variable("iterations", shape=[], dtype=tf.int32,
                                      trainable=False,
                                      initializer=tf.zeros_initializer())
        mod_itrs = tf.math.floormod(itr_counter, iterations_per_step)
        last_itr = tf.equal(mod_itrs, 0)
        inc = tf.assign_add(itr_counter, 1)

    fc1 = fc_layers['fc1']
    fc2 = fc_layers['fc2']

    relu1 = fc1(inputs, last_itr)

    # Use the IPU optimised version of dropout:
    if training:
        drop1 = rand_ops.dropout(relu1, rate=droprate)
    else:
        drop1 = relu1

    relu2 = fc2(drop1, last_itr)

    with tf.variable_scope("metrics", reuse=tf.AUTO_REUSE, use_resource=True):
        acc, acc_op = tf.metrics.accuracy(labels=labels,
                                          predictions=tf.argmax(
                                              relu2, axis=1, output_type=tf.dtypes.int32),
                                          name="accuracy")
        # Cast logits to fp32 before the softmax for numerical stability.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=tf.cast(relu2, dtype=tf.float32, name="logits_to_fp32"))

    if training:
        with tf.variable_scope("training", reuse=tf.AUTO_REUSE, use_resource=True):
            optimiser = optimizers.SparseOptimizer(opt_cls)(
                learning_rate=lr, **opt_kws, name='optimise',
                sparse_layers=[fc1, fc2])
            train_op = optimiser.minimize(loss)
            slot_names = optimiser.get_slot_names()
            logger.debug(f"Optimiser slot names: {slot_names}")
            # mean_loss runs after both the weight update and the
            # accuracy update.
            with tf.control_dependencies([train_op, acc_op]):
                mean_loss = tf.reduce_mean(loss, name='train_loss')
    else:
        with tf.control_dependencies([acc_op]):
            mean_loss = tf.reduce_mean(loss, name='test_loss')

    # Prepare results for feeds:
    last_results = {'mean_loss': mean_loss, 'acc': acc}
    for name, fc in fc_layers.items():
        if fc.is_sparse():
            weights_tensor = tf.convert_to_tensor(fc.get_values_var())
            last_results[name + '_non_zeros'] = weights_tensor
            if training:
                dense_grad_w = fc.get_dense_grad_w(loss)
                last_results[name + '_grad_w'] = tf.convert_to_tensor(dense_grad_w)

                # Also stream back each optimizer slot for this layer.
                for slot_name in fc.sparse_slots:
                    last_results[name + f'_{slot_name}'] = \
                        tf.convert_to_tensor(fc.sparse_slots[slot_name].tf_variable)

    # When training we only want to return the sparse
    # non-zero weight values on the last iteration.
    if training:
        def enqueue_last_itr():
            enqueue_weights = last_outqueue.enqueue(last_results)
            with tf.control_dependencies([enqueue_weights]):
                return tf.no_op()

        def nop():
            return tf.no_op()

        cond_op = tf.cond(last_itr, enqueue_last_itr, nop)
        enqueue_op = tf.group(inc, cond_op, train_op)
    else:
        enqueue_op = last_outqueue.enqueue({'mean_loss': mean_loss, 'acc': acc})

    return enqueue_op