import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

synk.fork()

s_init = np.ones(3, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
s_old = s
f = synk.function([x], updates={s: T.sum(x * s, axis=0)})
synk.distribute()

x_dat = np.array([[1., 1, 1],
                  [2, 2, 2],
                  [3, 3, 3],
                  [4, 4, 4]]).astype('float32')

print("\ns initial:\n", s.get_value())
f.as_theano(x_dat)
print("\ns after Theano call:\n", s.get_value())

s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)

synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)

s.set_value(s_init)
synk.broadcast(s)
f(x_dat)
synk.all_reduce(s, op="sum")
gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after local reset, broadcast, Synkhronos call, "
      "and all-reduce:\n", gathered_s)
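# A minimal hedged sketch (not part of the original script): a helper that
# checks every worker holds the same copy of a shared variable, built from the
# same collectives demonstrated above.
def assert_same_on_all_gpus(shared_var):
    # gather with nd_up=1 stacks each worker's copy along a new leading axis
    values = np.asarray(synk.gather(shared_var, nd_up=1))
    for worker_value in values[1:]:
        assert np.allclose(values[0], worker_value)

# After the all_reduce above, every GPU should hold the same (summed) value.
assert_same_on_all_gpus(s)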
def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Fork workers and initialize the GPUs before building any variables.
    synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)
    train_fn = synk.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    val_fn = synk.function([input_var, target_var], [test_loss, test_acc])

    # Send all functions and variables to workers (in the future, automatic)
    synk.distribute()

    # Write data into input shared memory (also applies to val_fn--same vars).
    X_train_synk, y_train_synk = train_fn.build_inputs(X_train, y_train)
    X_val_synk, y_val_synk = train_fn.build_inputs(X_val, y_val)
    X_test_synk, y_test_synk = train_fn.build_inputs(X_test, y_test)

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatch_indices(len(y_train), 500, shuffle=True):
            train_err += train_fn(X_train_synk, y_train_synk, batch=batch)
            synk.all_reduce(params)
            train_batches += 1
        mid_time = time.time()

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatch_indices(len(y_val), 500, shuffle=False):
            err, acc = val_fn(X_val_synk, y_val_synk, batch=batch, num_slices=1)
            val_err += err
            val_acc += acc
            val_batches += 1
        end_time = time.time()
        val_fn_time = end_time - mid_time
        train_fn_time = mid_time - start_time

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("Train function time: {:.3f}s".format(train_fn_time))
        print("Validation function time: {:.3f}s".format(val_fn_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatch_indices(len(y_test), 500, shuffle=False):
        err, acc = val_fn(X_test_synk, y_test_synk, batch=batch)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    # And load them again later on like this:
    with np.load('model.npz') as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, param_values)
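# The helper iterate_minibatch_indices() used above is not defined in this
# excerpt.  A hedged sketch of what it might look like (an assumption, modeled
# on Lasagne's iterate_minibatches but yielding index arrays instead of data
# slices, since Synkhronos functions accept a `batch` of indices):
def iterate_minibatch_indices(data_length, batch_size, shuffle=False):
    indices = np.arange(data_length)
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, data_length - batch_size + 1, batch_size):
        yield indices[start:start + batch_size]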
def train_minibatch(x_data, y_data, batch=None):
    train_loss = f_grad_shared(x_data, y_data, batch=batch)  # (synk_data)
    synk.all_reduce(grad_shared, op="avg")  # (assumes loss is an avg)
    f_param_update()
    return train_loss
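# f_grad_shared, grad_shared, and f_param_update above follow a "split update"
# pattern: one function writes fresh gradients into shared variables (so they
# can be all-reduced across GPUs), and a second applies the optimizer step from
# those stored gradients.  A hedged sketch of a Nesterov-momentum version of
# that split (the helper name and details are assumptions, not the original
# code; the update rule mirrors Lasagne's nesterov_momentum):
from collections import OrderedDict

def split_nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9):
    grads = theano.grad(loss, params)
    grad_shared = [theano.shared(p.get_value() * 0.) for p in params]
    # Updates for the gradient function: store the current gradients.
    grad_updates = OrderedDict(zip(grad_shared, grads))
    # Updates for the parameter function: Nesterov momentum from stored grads.
    param_updates = OrderedDict()
    for p, g in zip(params, grad_shared):
        velocity = theano.shared(p.get_value() * 0.)
        new_velocity = momentum * velocity - learning_rate * g
        param_updates[velocity] = new_velocity
        param_updates[p] = p + momentum * new_velocity - learning_rate * g
    return grad_updates, param_updates, grad_shared

# Hypothetical compilation, assuming input variables x_var, y_var exist:
grad_updates, param_updates, grad_shared = split_nesterov_momentum(loss, params)
f_grad_shared = synk.function([x_var, y_var], loss, updates=grad_updates)
f_param_update = synk.function([], updates=param_updates)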
f_theano = theano.function([x_mat, y_mat], z)
r_theano = f_theano(x_dat, y_dat)

r = f()
assert np.allclose(r, r_theano)
print("\nShared variable math test passed.")

r_slc = f(num_slices=2)  # slicing here works on the GPU variable s_x
assert np.allclose(r_slc, r_theano)
print("Shared variable sliced math test passed.")

# Further manipulations.
# Average values of shared variable on all GPUs:
x_values = synk.gather(s_x, nd_up=1)  # (x_values is on the GPU)
x_values = np.asarray(x_values)  # (now it's on the CPU)
synk.all_reduce(s_x, op="avg")  # (default op is "avg")
x_avg = x_values.mean(axis=0)
new_x_values = np.array(synk.gather(s_x, nd_up=1))
for i in range(n_gpus):
    assert np.allclose(x_avg, new_x_values[i])
print("\nValue on rank 1 after all_reduce:\n", synk.get_value(1, s_x))
print("All_reduce avg test passed.")

# Reset one of the GPUs to its previous value:
synk.set_value(rank=1, shared_vars=s_x, values=x_values[1])
print("\nReset the value on rank 1:\n", synk.get_value(1, s_x))

# Make all the GPUs have all the data (can't change ndim of variable)
synk.all_gather(s_x)
print("\nShapes of s_x on GPUs after all_gather: ", synk.get_shapes(s_x))
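# A small hedged follow-up (not in the original): inspect each worker's copy of
# s_x individually with the same per-rank accessor used above, e.g. to confirm
# that all_gather left every GPU holding the full data.
for rank in range(n_gpus):
    rank_value = np.asarray(synk.get_value(rank, s_x))
    print("rank {} s_x shape: {}".format(rank, rank_value.shape))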
def main(model='mlp', batch_size=500, num_epochs=10):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
    y_train = y_train.astype("int32")  # (some downstream type error on uint8)
    y_val = y_val.astype("int32")

    # Fork worker processes and initialize the GPUs before building variables.
    n_gpu = synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network = build_network(model, input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)
    # updates = lasagne.updates.nesterov_momentum(
    #     loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Make GPU variables to hold the data.
    s_input_train = theano.shared(X_train[:len(X_train) // n_gpu])
    s_target_train = theano.shared(y_train[:len(y_train) // n_gpu])
    s_input_val = theano.shared(X_val[:len(X_val) // n_gpu])
    s_target_val = theano.shared(y_val[:len(y_val) // n_gpu])

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_grad_fn = synk.function(
        inputs=[],
        outputs=loss,
        givens=[(input_var, s_input_train), (target_var, s_target_train)],
        sliceable_shareds=[s_input_train, s_target_train],
        updates=grad_updates)
    train_update_fn = synk.function([], updates=param_updates)
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = synk.function(
        inputs=[],
        givens=[(input_var, s_input_val), (target_var, s_target_val)],
        sliceable_shareds=[s_input_val, s_target_val],
        outputs=[test_loss, test_acc])
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Don't bother to put test data on the GPU ahead of time.
    test_fn = synk.function([input_var, target_var],
                            outputs=[test_loss, test_acc])

    # After building all functions, give them to workers.
    synk.distribute()

    # Put data into OS shared memory for worker access.
    X_test, y_test = test_fn.build_inputs(X_test, y_test)

    print("Scattering data to GPUs.")
    scatter_vars = [s_input_train, s_target_train, s_input_val, s_target_val]
    scatter_vals = [X_train, y_train, X_val, y_val]
    synk.scatter(scatter_vars, scatter_vals)

    train_worker_len = min(synk.get_lengths(s_target_train))
    worker_batch_size = batch_size // n_gpu

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        # for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        for batch in iterate_minibatch_indices(train_worker_len,
                                               worker_batch_size, shuffle=True):
            train_err += train_grad_fn(batch_s=batch)
            synk.all_reduce(grad_shared)  # (averages)
            train_update_fn()
            train_batches += 1

        # And a full pass over the validation data:
        # val_err = 0
        # val_acc = 0
        # val_batches = 0
        # for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     val_err += err
        #     val_acc += acc
        #     val_batches += 1
        val_err, val_acc = val_fn(num_slices=4)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(float(val_err)))
        print("  validation accuracy:\t\t{:.2f} %".format(
            float(val_acc) * 100))

    # After training, we compute and print the test error:
    # test_err = 0
    # test_acc = 0
    # test_batches = 0
    # for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    #     inputs, targets = batch
    #     err, acc = val_fn(inputs, targets)
    #     test_err += err
    #     test_acc += acc
    #     test_batches += 1
    test_err, test_acc = test_fn(X_test, y_test, num_slices=4)
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(float(test_err)))
    print("  test accuracy:\t\t{:.2f} %".format(float(test_acc) * 100))
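# build_network() used above is not defined in this excerpt.  A hedged sketch
# (an assumption): it presumably dispatches on the model string the same way
# the earlier example does with build_mlp() / build_cnn().
def build_network(model, input_var):
    if model == 'mlp':
        return build_mlp(input_var)
    elif model == 'cnn':
        return build_cnn(input_var)
    else:
        raise ValueError("Unrecognized model type %r." % model)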