import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

synk.fork()

s_init = np.ones(3, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
s_old = s
f = synk.function([x], updates={s: T.sum(x * s, axis=0)})
synk.distribute()

x_dat = np.array([[1., 1, 1],
                  [2, 2, 2],
                  [3, 3, 3],
                  [4, 4, 4]]).astype('float32')

print("\ns initial:\n", s.get_value())
f.as_theano(x_dat)
print("\ns after Theano call:\n", s.get_value())

s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)

synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)

s.set_value(s_init)
synk.broadcast(s)
f(x_dat)
synk.all_reduce(s, op="sum")
gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after local reset, broadcast, Synkhronos call, "
      "and all-reduce:\n", gathered_s)
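# A minimal hedged sketch (not part of the original script): a helper that
# checks every worker holds the same copy of a shared variable, built from the
# same collectives demonstrated above.
def assert_same_on_all_gpus(shared_var):
    # gather with nd_up=1 stacks each worker's copy along a new leading axis
    values = np.asarray(synk.gather(shared_var, nd_up=1))
    for worker_value in values[1:]:
        assert np.allclose(values[0], worker_value)

# After the all_reduce above, every GPU should hold the same (summed) value.
assert_same_on_all_gpus(s)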
def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Fork workers and initialize the GPUs before building any variables.
    synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)
    train_fn = synk.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    val_fn = synk.function([input_var, target_var], [test_loss, test_acc])

    # Send all functions and variables to workers (in the future, automatic)
    synk.distribute()

    # Write data into input shared memory (also applies to val_fn--same vars).
    X_train_synk, y_train_synk = train_fn.build_inputs(X_train, y_train)
    X_val_synk, y_val_synk = train_fn.build_inputs(X_val, y_val)
    X_test_synk, y_test_synk = train_fn.build_inputs(X_test, y_test)

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatch_indices(len(y_train), 500, shuffle=True):
            train_err += train_fn(X_train_synk, y_train_synk, batch=batch)
            synk.all_reduce(params)
            train_batches += 1
        mid_time = time.time()

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatch_indices(len(y_val), 500, shuffle=False):
            err, acc = val_fn(X_val_synk, y_val_synk, batch=batch, num_slices=1)
            val_err += err
            val_acc += acc
            val_batches += 1
        end_time = time.time()
        val_fn_time = end_time - mid_time
        train_fn_time = mid_time - start_time

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("Train function time: {:.3f}s".format(train_fn_time))
        print("Validation function time: {:.3f}s".format(val_fn_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatch_indices(len(y_test), 500, shuffle=False):
        err, acc = val_fn(X_test_synk, y_test_synk, batch=batch)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    # And load them again later on like this:
    with np.load('model.npz') as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, param_values)
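# The helper iterate_minibatch_indices() used above is not defined in this
# excerpt.  A hedged sketch of what it might look like (an assumption, modeled
# on Lasagne's iterate_minibatches but yielding index arrays instead of data
# slices, since Synkhronos functions accept a `batch` of indices):
def iterate_minibatch_indices(data_length, batch_size, shuffle=False):
    indices = np.arange(data_length)
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, data_length - batch_size + 1, batch_size):
        yield indices[start:start + batch_size]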
def train_minibatch(x_data, y_data, batch=None):
    train_loss = f_grad_shared(x_data, y_data, batch=batch)  # (synk_data)
    synk.all_reduce(grad_shared, op="avg")  # (assumes loss is an avg)
    f_param_update()
    return train_loss
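# f_grad_shared, grad_shared, and f_param_update above follow a "split update"
# pattern: one function writes fresh gradients into shared variables (so they
# can be all-reduced across GPUs), and a second applies the optimizer step from
# those stored gradients.  A hedged sketch of a Nesterov-momentum version of
# that split (the helper name and details are assumptions, not the original
# code; the update rule mirrors Lasagne's nesterov_momentum):
from collections import OrderedDict

def split_nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9):
    grads = theano.grad(loss, params)
    grad_shared = [theano.shared(p.get_value() * 0.) for p in params]
    # Updates for the gradient function: store the current gradients.
    grad_updates = OrderedDict(zip(grad_shared, grads))
    # Updates for the parameter function: Nesterov momentum from stored grads.
    param_updates = OrderedDict()
    for p, g in zip(params, grad_shared):
        velocity = theano.shared(p.get_value() * 0.)
        new_velocity = momentum * velocity - learning_rate * g
        param_updates[velocity] = new_velocity
        param_updates[p] = p + momentum * new_velocity - learning_rate * g
    return grad_updates, param_updates, grad_shared

# Hypothetical compilation, assuming input variables x_var, y_var exist:
grad_updates, param_updates, grad_shared = split_nesterov_momentum(loss, params)
f_grad_shared = synk.function([x_var, y_var], loss, updates=grad_updates)
f_param_update = synk.function([], updates=param_updates)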
f_theano = theano.function([x_mat, y_mat], z)
r_theano = f_theano(x_dat, y_dat)

r = f()
assert np.allclose(r, r_theano)
print("\nShared variable math test passed.")

r_slc = f(num_slices=2)  # slicing here works on the GPU variable s_x
assert np.allclose(r_slc, r_theano)
print("Shared variable sliced math test passed.")

# Further manipulations.
# Average values of shared variable on all GPUs:
x_values = synk.gather(s_x, nd_up=1)  # (x_values is on the GPU)
x_values = np.asarray(x_values)  # (now it's on the CPU)
synk.all_reduce(s_x, op="avg")  # (default op is "avg")
x_avg = x_values.mean(axis=0)
new_x_values = np.array(synk.gather(s_x, nd_up=1))
for i in range(n_gpus):
    assert np.allclose(x_avg, new_x_values[i])
print("\nValue on rank 1 after all_reduce:\n", synk.get_value(1, s_x))
print("All_reduce avg test passed.")

# Reset one of the GPUs to its previous value:
synk.set_value(rank=1, shared_vars=s_x, values=x_values[1])
print("\nReset the value on rank 1:\n", synk.get_value(1, s_x))

# Make all the GPUs have all the data (can't change ndim of variable)
synk.all_gather(s_x)
print("\nShapes of s_x on GPUs after all_gather: ", synk.get_shapes(s_x))
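# A small hedged follow-up (not in the original): inspect each worker's copy of
# s_x individually with the same per-rank accessor used above, e.g. to confirm
# that all_gather left every GPU holding the full data.
for rank in range(n_gpus):
    rank_value = np.asarray(synk.get_value(rank, s_x))
    print("rank {} s_x shape: {}".format(rank, rank_value.shape))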
def main(model='mlp', batch_size=500, num_epochs=10):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
    y_train = y_train.astype("int32")  # (some downstream type error on uint8)
    y_val = y_val.astype("int32")

    # Fork worker processes and initialize the GPUs before building variables.
    n_gpu = synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network = build_network(model, input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)
    # updates = lasagne.updates.nesterov_momentum(
    #     loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Make GPU variables to hold the data.
    s_input_train = theano.shared(X_train[:len(X_train) // n_gpu])
    s_target_train = theano.shared(y_train[:len(y_train) // n_gpu])
    s_input_val = theano.shared(X_val[:len(X_val) // n_gpu])
    s_target_val = theano.shared(y_val[:len(y_val) // n_gpu])

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_grad_fn = synk.function(
        inputs=[],
        outputs=loss,
        givens=[(input_var, s_input_train), (target_var, s_target_train)],
        sliceable_shareds=[s_input_train, s_target_train],
        updates=grad_updates)
    train_update_fn = synk.function([], updates=param_updates)
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = synk.function(
        inputs=[],
        givens=[(input_var, s_input_val), (target_var, s_target_val)],
        sliceable_shareds=[s_input_val, s_target_val],
        outputs=[test_loss, test_acc])
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Don't bother to put test data on the GPU ahead of time.
    test_fn = synk.function([input_var, target_var],
                            outputs=[test_loss, test_acc])

    # After building all functions, give them to workers.
    synk.distribute()

    # Put data into OS shared memory for worker access.
    X_test, y_test = test_fn.build_inputs(X_test, y_test)

    print("Scattering data to GPUs.")
    scatter_vars = [s_input_train, s_target_train, s_input_val, s_target_val]
    scatter_vals = [X_train, y_train, X_val, y_val]
    synk.scatter(scatter_vars, scatter_vals)

    train_worker_len = min(synk.get_lengths(s_target_train))
    worker_batch_size = batch_size // n_gpu

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        # for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        for batch in iterate_minibatch_indices(train_worker_len,
                                               worker_batch_size, shuffle=True):
            train_err += train_grad_fn(batch_s=batch)
            synk.all_reduce(grad_shared)  # (averages)
            train_update_fn()
            train_batches += 1

        # And a full pass over the validation data:
        # val_err = 0
        # val_acc = 0
        # val_batches = 0
        # for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     val_err += err
        #     val_acc += acc
        #     val_batches += 1
        val_err, val_acc = val_fn(num_slices=4)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(float(val_err)))
        print("  validation accuracy:\t\t{:.2f} %".format(
            float(val_acc) * 100))

    # After training, we compute and print the test error:
    # test_err = 0
    # test_acc = 0
    # test_batches = 0
    # for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    #     inputs, targets = batch
    #     err, acc = val_fn(inputs, targets)
    #     test_err += err
    #     test_acc += acc
    #     test_batches += 1
    test_err, test_acc = test_fn(X_test, y_test, num_slices=4)
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(float(test_err)))
    print("  test accuracy:\t\t{:.2f} %".format(float(test_acc) * 100))
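# build_network() used above is not defined in this excerpt.  A hedged sketch
# (an assumption): it presumably dispatches on the model string the same way
# the earlier example does with build_mlp() / build_cnn().
def build_network(model, input_var):
    if model == 'mlp':
        return build_mlp(input_var)
    elif model == 'cnn':
        return build_cnn(input_var)
    else:
        raise ValueError("Unrecognized model type %r." % model)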