Exemple #1
0
def graph_builder(opts, inputs):
    """Build the feed-forward graph and, when training, the update ops.

    Args:
        opts: Options object. This function reads ``opts.train`` and
            ``opts.compute_dense_grad``; the object is also handed to
            ``DynsparseTransformer``.
        inputs: Mapping that must contain the key ``"input_activation"``.

    Returns:
        The summed output activation (the loss). When ``opts.train`` is
        truthy the returned tensor is an identity of the loss that carries
        control dependencies on the train op, the gradient of the loss with
        respect to the input, and any streamed dense gradients.
    """
    activation_in = inputs["input_activation"]
    transformer = DynsparseTransformer(opts)
    transformer.compute_dense_grad = opts.compute_dense_grad and opts.train
    loss = tf.reduce_sum(transformer.feed_forward(activation_in))

    # Inference only: the loss itself is the fetch.
    if not opts.train:
        return loss

    with tf.variable_scope("train", reuse=tf.AUTO_REUSE, use_resource=True):
        step = tf.train.get_or_create_global_step()
        make_optimizer = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
        optimizer = make_optimizer(
            learning_rate=1e-3,
            sparse_layers=transformer.sparse_layers.values())
        train_op = optimizer.minimize(loss, global_step=step)
        input_grad = tf.gradients(loss, activation_in)[0]

        streamed_grads = (
            list(transformer.streamDenseGradsFromDevice(
                loss, optimizer, {}).values())
            if opts.compute_dense_grad else [])

        # Tie every training side effect to the fetched tensor so that
        # evaluating the result also runs the update and gradient ops.
        dependencies = streamed_grads + [train_op, input_grad]
        with tf.control_dependencies(dependencies):
            return tf.identity(loss)
def run_mnist(opts):
    """Load MNIST and run the training and/or testing phase on it.

    Args:
        opts: Options object. This function reads ``opts.random_seed``,
            ``opts.dtype`` (for its ``as_numpy_dtype``) and ``opts.mode``;
            the object is also handed to ``DynsparseTransformer``,
            ``run_training`` and ``run_testing``.
    """
    seed = opts.random_seed
    if seed is not None:
        utils.reset_ipu_seed(seed)

    # MNIST: scale pixel values by 1/255 and cast to the requested dtype;
    # labels are cast to int32.
    image_dtype = opts.dtype.as_numpy_dtype
    train_split, test_split = tf.keras.datasets.mnist.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split
    train_images = (train_images / 255.0).astype(image_dtype)
    test_images = (test_images / 255.0).astype(image_dtype)
    train_labels = train_labels.astype(np.int32)
    test_labels = test_labels.astype(np.int32)

    # Each phase gets its own transformer object (per the original note, it
    # does not build a graph until called).
    if opts.mode in ("all", "train"):
        run_training(opts, DynsparseTransformer(opts),
                     train_images, train_labels)

    if opts.mode in ("all", "test"):
        run_testing(opts, DynsparseTransformer(opts),
                    test_images, test_labels)
def run_language_model(opts):
    """Configure the IPU system, then run the train and/or test phase.

    Args:
        opts: Options object. This function reads ``random_seed``,
            ``pipeline``, ``num_shards``, ``compile_only``,
            ``compile_only_ipu_version``, ``recompute``,
            ``debug_dense_grad`` and ``mode``. Note that
            ``opts.num_shards`` is overwritten with 1 when
            ``opts.pipeline`` is falsy.

    Raises:
        AttributeError: If ``opts.compile_only`` is set but
            ``opts.compile_only_ipu_version`` is None.
    """
    seed = opts.random_seed
    if seed is not None:
        utils.reset_ipu_seed(seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        # FIX-ME enable sparse models using multiple shards
        opts.num_shards = 1

    # Whatever number of shards/stages is required, always request a power
    # of 2 IPUs (else attachment will fail): double until it is enough.
    num_ipus = 1
    while num_ipus < opts.num_shards:
        num_ipus *= 2
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")
    config = utils.create_ipu_config()

    if opts.compile_only:
        ipu_version = opts.compile_only_ipu_version
        if ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set."
            )

        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(
        config, allow_recompute=opts.recompute)

    # Enable stochastic rounding (esr); every other flag stays off.
    floating_point_flags = dict(
        inv=False, div0=False, oflo=False, esr=True, nanoo=False)
    config = utils.set_floating_point_behaviour_options(
        config, **floating_point_flags)

    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    # One transformer object is shared by both phases.
    transformer = DynsparseTransformer(opts)
    if opts.mode in ("all", "train"):
        run_training(opts, transformer)

    if opts.mode in ("all", "test"):
        run_testing(opts, transformer)
def _blocks_from_dense(dense_grad, block_size):
    """Return a copy of a 2-D array tiled into square blocks.

    The result has shape ``(nx, ny, block_size, block_size)`` where ``nx``
    and ``ny`` are the floor-divided number of blocks per axis. For
    ``block_size == 1`` the two trailing unit axes are dropped, giving
    ``(nx, ny)``. Only those block axes are ever removed: a block grid with
    a single row or column keeps its axis, so the result can always be
    indexed with ``[i, j]`` block coordinates.
    """
    nx = dense_grad.shape[0] // block_size
    ny = dense_grad.shape[1] // block_size
    strides = np.array(dense_grad.strides)  # strides are in bytes
    # Outer two strides step from block to block, inner two step within one.
    strides = tuple(strides * block_size) + tuple(strides)
    blocked = np.copy(np.lib.stride_tricks.as_strided(
        dense_grad, (nx, ny, block_size, block_size), strides))
    if block_size == 1:
        # Special case block size 1: collapse only the block axes. An
        # unqualified squeeze would also drop nx or ny when they equal 1.
        blocked = blocked.reshape(nx, ny)
    return blocked


def main(args):
    """Check that the sparse and dense transformers agree numerically.

    Runs the sparse model on random activations, saves its weights as a
    dense checkpoint, restores them into the dense model, runs that on the
    same activations, and compares output activations, input gradients and
    per-layer weight gradients. The checkpoint lives in a temporary
    directory that is removed once the dense model has restored from it.

    Args:
        args: Options object. This function reads ``random_seed``,
            ``batch_size``, ``source_sequence_length``, ``hidden_length``,
            ``profile`` and ``dtype``; the object is also handed to
            ``DynsparseTransformer`` and ``DenseTransformer``.

    Returns:
        Tuple ``(sparse_result, dense_result)`` holding the fetched results
        of the two runs.

    Raises:
        AssertionError: Raised by ``np.testing.assert_allclose`` when any
            compared pair of results differs beyond the tolerances.
    """
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)
    random_seed = args.random_seed

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=random_seed)
    activations_np = random_gen.uniform(-0.1,
                                        0.1,
                                        size=(args.batch_size,
                                              args.source_sequence_length,
                                              args.hidden_length))

    # Configure the IPU
    cfg = ipu.utils.create_ipu_config(profiling=args.profile,
                                      report_directory="./report/")
    cfg = ipu.utils.auto_select_ipus(cfg, 1)
    ipu.utils.configure_ipu_system(cfg)

    # Build IPU graphs
    sparse_decoder_graph = tf.Graph()
    sparse_transformer = DynsparseTransformer(args)
    with sparse_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weight placeholders are created inside sparse_transformer
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            sparse_decoder = partial(sparse_transformer_fwd_and_grad,
                                     sparse_transformer)
            sparse_decoder_fetches = ipu.ipu_compiler.compile(
                sparse_decoder, [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()

    # The checkpoint is only needed to hand the initial weights from the
    # sparse model to the dense one, so keep it in a directory that is
    # cleaned up automatically, including when a run fails.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")

        # sparse-decoder
        with tf.Session(graph=sparse_decoder_graph) as sess:
            # initialize weights
            sess.run(tf.global_variables_initializer())

            # Save the sparse weights to checkpoint as dense
            sparse_transformer.checkpointAsDense(checkpoint_path)

            # run sparse decoder
            sparse_result = sess.run(sparse_decoder_fetches,
                                     feed_dict={inputs_ph: activations_np})

        # Create a dense transformer and initialize the weights to the
        # values that the sparse model was initialized with originally
        dense_decoder_graph = tf.Graph()
        dense_transformer = DenseTransformer(args)
        with dense_decoder_graph.as_default():
            with tf.device("cpu"):
                # placeholder for activations
                # weights will get streamed from checkpoint
                inputs_ph = tf.placeholder(args.dtype, activations_np.shape)

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                dense_decoder = partial(dense_transformer_fwd_and_grad,
                                        dense_transformer)
                dense_decoder_fetches = ipu.ipu_compiler.compile(
                    dense_decoder, [inputs_ph])
                ipu.utils.move_variable_initialization_to_cpu()

            with tf.device("cpu"):
                # We will only load the trainable variables, not momentum etc.
                loader = tf.train.Saver(tf.trainable_variables())

        # dense-decoder
        with tf.Session(graph=dense_decoder_graph) as sess:
            # Initialize momentums which are not part of the checkpoint
            sess.run(tf.global_variables_initializer())
            # Restore saved trainable variables
            loader.restore(sess, checkpoint_path)
            dense_result = sess.run(dense_decoder_fetches,
                                    feed_dict={inputs_ph: activations_np})

    # TEST
    # Half precision gets looser tolerances.
    if args.dtype == tf.float16:
        rtol, atol = 1e-04, 1e-02
    else:
        rtol, atol = 1e-05, 1e-05

    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"],
                               dense_result["output_activation"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Output activations do not match.")

    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"],
                               dense_result["input_grad"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Grads wrt. inputs do not match")

    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(
            sparse_grad_w,
            dense_grad,
            atol=atol,
            rtol=rtol,
            err_msg=f"Dense grads for layer {name} do not match")

        # Compare the sparse grads: convert the padded non-zero values into
        # (row, column, value) triplets over the block grid.
        sparse_grad_padded = sparse_result[name +
                                           "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(
            sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(
            sparse_layer.weights.spec, sparse_grad_data,
            sparse_layer.weights.matmul_options)

        # Convert dense grads to blocks and pick out the non-zero ones
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        blocked_dense_grad = _blocks_from_dense(dense_grad, block_size)
        np.testing.assert_allclose(
            sparse_grad,
            blocked_dense_grad[i, j],
            atol=atol,
            rtol=rtol,
            err_msg=f"Sparse grads for layer {name} do not match")

    print("All results match.")
    return sparse_result, dense_result