def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Fork workers and initialize GPUs before building any variables.
    synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)
    train_fn = synk.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    val_fn = synk.function([input_var, target_var], [test_loss, test_acc])

    # Send all functions and variables to workers (in the future, automatic).
    synk.distribute()

    # Write data into input shared memory (also applies to val_fn--same vars).
    X_train_synk, y_train_synk = train_fn.build_inputs(X_train, y_train)
    X_val_synk, y_val_synk = train_fn.build_inputs(X_val, y_val)
    X_test_synk, y_test_synk = train_fn.build_inputs(X_test, y_test)

    # Finally, launch the training loop.
print("Starting training...") # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatch_indices(len(y_train), 500, shuffle=True): train_err += train_fn(X_train_synk, y_train_synk, batch=batch) synk.all_reduce(params) train_batches += 1 mid_time = time.time() # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatch_indices(len(y_val), 500, shuffle=False): err, acc = val_fn(X_val_synk, y_val_synk, batch=batch, num_slices=1) val_err += err val_acc += acc val_batches += 1 end_time = time.time() val_fn_time = end_time - mid_time train_fn_time = mid_time - start_time # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time)) print("Train function time: {:.3f}s".format(train_fn_time)) print("Validation function time: {:.3f}s".format(val_fn_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100)) # After training, we compute and print the test error: test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatch_indices(len(y_test), 500, shuffle=False): err, acc = val_fn(X_test_synk, y_test_synk, batch=batch) test_err += err test_acc += acc test_batches += 1 print("Final results:") print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100)) # Optionally, you could now dump the network weights to a file like this: np.savez('model.npz', *lasagne.layers.get_all_param_values(network)) # And load them again later on like this: with np.load('model.npz') as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(network, param_values)
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

synk.fork()

s_init = np.ones(3, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
s_old = s
f = synk.function([x], updates={s: T.sum(x * s, axis=0)})
synk.distribute()

x_dat = np.array([[1., 1, 1],
                  [2, 2, 2],
                  [3, 3, 3],
                  [4, 4, 4]]).astype('float32')

print("\ns initial:\n", s.get_value())
f.as_theano(x_dat)
print("\ns after Theano call:\n", s.get_value())

s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)

synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)

s.set_value(s_init)
synk.broadcast(s)
f(x_dat)
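
# --- Illustrative addition: inspect the state after the broadcast and the
#     extra Synkhronos call above (reuses the gather pattern shown earlier). ---
print("\nlocal s after broadcast and another Synkhronos call:\n", s.get_value())
gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after broadcast and call:\n", gathered_s)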
""" Demonstrate interactions with synkhronos.Data objects """ import numpy as np import theano # import theano.tensor as T import synkhronos as synk synk.fork() # (making data: after fork(), before and/or after distribute()) # Generate some random data sets x_dat_0 = np.random.randn(5, 4).astype(theano.config.floatX) x_dat_1 = np.random.randn(5, 4).astype(theano.config.floatX) x_dat_2 = np.random.randn(2000, 10).astype(theano.config.floatX) x_dat_3 = np.random.randn(2100, 10).astype(theano.config.floatX) x_dat_4 = np.random.randn(100, 8).astype(theano.config.floatX) x_dat_5 = np.random.randn(100, 8).astype("float64") # Create a Synkhronos data object to be used as data input to functions. x = synk.data(x_dat_0) print("\nSome information about x_data...") print("object: ", x) print("values:\n", x.data) # x.data: numpy array, underlying memory: OS-shared print("\nshape: ", x.shape) print("length: ", len(x)) print("allocation size (items): ", x.alloc_size) print("type(x.data): ", type(x.data)) print("\ndir(x): \n", dir(x)) # Reading and writing like a numpy array
""" Demonstrate interactions with Theano shared variables (GPU memory) """ import numpy as np import theano import theano.tensor as T import synkhronos as synk n_gpus = synk.fork() # Make data-parallel computation with Theano shared variable (exists on GPU). dtype = theano.config.floatX s_x = theano.shared(np.ones([100, 4], dtype=dtype), name='s_x') s_y = theano.shared(np.zeros([4, 5], dtype=dtype), name='s_y') s_unused = theano.shared(np.zeros([5, 5], dtype=dtype)) # (see note at bottom) z = T.mean(s_x.dot(s_y), axis=0) f = synk.function(inputs=[], sliceable_shareds=[s_x], outputs=z) synk.distribute() # (shared variable data sent to workers with function) # Inspect values of Theano shared variables--separate copy on each GPU. print("\nLengths of s_x on each GPU: ", synk.get_lengths(s_x)) print("Shapes of s_x on each GPU: ", synk.get_shapes(s_x)) x_dat = np.random.randn(8 * n_gpus, 4).astype(dtype) y_dat = np.random.randn(4, 5).astype(dtype) # Manipulate values of Theano shared variables across all GPUs. synk.scatter(s_x, x_dat) synk.broadcast(s_y, y_dat) # (without data arg, operates on existing var data)
import theano
import theano.tensor as T
import numpy as np
import synkhronos as synk

n_gpu = synk.fork()

# x = T.matrix('x')
x_dat = np.random.randn(100, 10).astype(theano.config.floatX)
y_dat = np.random.randn(10, 5).astype(theano.config.floatX)
x = theano.shared(x_dat, 'x_gpu')
y = theano.shared(y_dat, 'y_gpu')
z = T.mean(x.dot(y), axis=0)

f = synk.function(inputs=[], outputs=z, sliceable_shareds=[x])
synk.distribute()

full_x_dat = np.random.randn(n_gpu * 100, 10).astype(theano.config.floatX)
synk.scatter(x, full_x_dat)

r = f()
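
# --- Illustrative addition: check the distributed result against numpy.
#     Assumes the default "avg" output reduction; scatter() gives each GPU an
#     equal 100-row block, so the per-GPU means average to the full mean. ---
r_np = full_x_dat.dot(y_dat).mean(axis=0)
print("synk result:\n", r)
print("matches numpy: ", np.allclose(np.asarray(r), r_np, atol=1e-5))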
def main():
    B_SIZE = 10000
    MID = B_SIZE // 2

    synk.fork()
    import lasagne

    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network = build_mlp(input_var)
    # network = build_cnn(input_var)
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    grads = theano.grad(loss, wrt=params)
    flat_grad = T.concatenate(list(map(T.flatten, grads)))

    f_loss = synk.function([input_var, target_var], loss,
                           collect_modes=[None], reduce_ops="sum")
    f_grad = synk.function([input_var, target_var], flat_grad,
                           collect_modes=[None])
    synk.distribute()

    # (a possible stand-in for make_data is sketched after this function)
    x_data, y_data = make_data([1, 28, 28], B_SIZE)

    loss_1 = f_loss(x_data, y_data)
    grad_1 = f_grad(x_data, y_data)

    x_shmem, y_shmem = f_loss.get_input_shmems()
    x_dat_sh = x_shmem[:B_SIZE]
    y_dat_sh = y_shmem[:B_SIZE]
    x_data_1 = x_data[:MID]
    x_data_2 = x_data[MID:]
    y_data_1 = y_data[:MID]
    y_data_2 = y_data[MID:]

    ITERS = 10

    # Time the wrapped Theano function directly (two half-batches in serial).
    t0 = timer()
    for _ in range(ITERS):
        loss_i = f_loss.as_theano(x_data_1, y_data_1)
        loss_j = f_loss.as_theano(x_data_2, y_data_2)
    loss_time = timer() - t0
    print("theano loss_time: ", loss_time)

    t0 = timer()
    for _ in range(ITERS):
        grad_i = f_grad.as_theano(x_data_1, y_data_1)
        grad_j = f_grad.as_theano(x_data_2, y_data_2)
    grad_time = timer() - t0
    print("theano grad_time: ", grad_time)

    # Time Synkhronos calls on data already in the input shared memory.
    t0 = timer()
    for _ in range(ITERS):
        loss_i = f_loss(x_dat_sh, y_dat_sh)
    loss_time = timer() - t0
    print("synk shmem loss_time: ", loss_time)

    t0 = timer()
    for _ in range(ITERS):
        grad_i = f_grad(x_dat_sh, y_dat_sh)
    grad_time = timer() - t0
    print("synk shmem grad_time: ", grad_time)

    # Time Synkhronos calls on fresh numpy inputs (copied into shared memory).
    t0 = timer()
    for _ in range(ITERS):
        loss_i = f_loss(x_data, y_data)
    loss_time = timer() - t0
    print("synk new input loss_time: ", loss_time)

    t0 = timer()
    for _ in range(ITERS):
        grad_i = f_grad(x_data, y_data)
    grad_time = timer() - t0
    print("synk new input grad_time: ", grad_time)
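
# --- A possible stand-in for the make_data helper used above. Illustrative
#     only: the original helper is defined elsewhere in the example; this
#     sketch just produces MNIST-shaped random inputs and int32 labels, and
#     assumes numpy (np) and theano are imported at module level. ---
def make_data(shape, batch_size):
    x = np.random.randn(batch_size, *shape).astype(theano.config.floatX)
    y = np.random.randint(0, 10, size=batch_size).astype("int32")
    return x, y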
""" Demonstrate basic functionality: building functions, output reductions, function call slicing. """ import numpy as np import theano import theano.tensor as T import synkhronos as synk synk.fork() # processes forked, GPUs initialized # Build simple data-parallel computations (parallel across rows of "x") x = T.matrix('x') y = T.matrix('y') z_avg = T.mean(x.dot(y), axis=0) z_sum = T.sum(x.dot(y), axis=0) z_max = T.max(x.dot(y), axis=0) # Build Synk function. NOTES: # 1. bcast_input "y" will have the full value broadcast to all workers # 2. outputs have different reduce operations (default is "avg") f = synk.function(inputs=[x], bcast_inputs=[y], outputs=[z_avg, (z_sum, "sum"), (z_max, "max")]) synk.distribute() # worker GPUs receive all synk functions, prepare to execute # Generate random data and compute results x_dat = 0.01 * np.random.randn(1000, 10).astype(theano.config.floatX)
def main(model='mlp', batch_size=500, num_epochs=10):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
    y_train = y_train.astype("int32")  # (some downstream type error on uint8)
    y_val = y_val.astype("int32")

    # Fork worker processes and initialize GPUs before building variables.
    n_gpu = synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network = build_network(model, input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    # The helper below splits the optimizer into gradient-computation and
    # parameter-update phases (a possible form of it is sketched after this
    # function).
    params = lasagne.layers.get_all_params(network, trainable=True)
    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)
    # updates = lasagne.updates.nesterov_momentum(
    #     loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Make GPU variables to hold the data.
    s_input_train = theano.shared(X_train[:len(X_train) // n_gpu])
    s_target_train = theano.shared(y_train[:len(y_train) // n_gpu])
    s_input_val = theano.shared(X_val[:len(X_val) // n_gpu])
    s_target_val = theano.shared(y_val[:len(y_val) // n_gpu])

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_grad_fn = synk.function(
        inputs=[],
        outputs=loss,
        givens=[(input_var, s_input_train), (target_var, s_target_train)],
        sliceable_shareds=[s_input_train, s_target_train],
        updates=grad_updates)
    train_update_fn = synk.function([], updates=param_updates)
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = synk.function(
        inputs=[],
        givens=[(input_var, s_input_val), (target_var, s_target_val)],
        sliceable_shareds=[s_input_val, s_target_val],
        outputs=[test_loss, test_acc])
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Don't bother to put the test data on the GPUs ahead of time.
    test_fn = synk.function([input_var, target_var],
                            outputs=[test_loss, test_acc])

    # After building all functions, give them to the workers.
    synk.distribute()

    # Put data into OS shared memory for worker access.
    X_test, y_test = test_fn.build_inputs(X_test, y_test)

    print("Scattering data to GPUs.")
    scatter_vars = [s_input_train, s_target_train, s_input_val, s_target_val]
    scatter_vals = [X_train, y_train, X_val, y_val]
    synk.scatter(scatter_vars, scatter_vals)

    train_worker_len = min(synk.get_lengths(s_target_train))
    worker_batch_size = batch_size // n_gpu

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        # for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        for batch in iterate_minibatch_indices(train_worker_len,
                                               worker_batch_size,
                                               shuffle=True):
            train_err += train_grad_fn(batch_s=batch)
            synk.all_reduce(grad_shared)  # (averages gradients across GPUs)
            train_update_fn()
            train_batches += 1

        # And a full pass over the validation data:
        # val_err = 0
        # val_acc = 0
        # val_batches = 0
        # for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     val_err += err
        #     val_acc += acc
        #     val_batches += 1
        val_err, val_acc = val_fn(num_slices=4)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(float(val_err)))
        print("  validation accuracy:\t\t{:.2f} %".format(float(val_acc) * 100))

    # After training, we compute and print the test error:
    # test_err = 0
    # test_acc = 0
    # test_batches = 0
    # for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    #     inputs, targets = batch
    #     err, acc = val_fn(inputs, targets)
    #     test_err += err
    #     test_acc += acc
    #     test_batches += 1
    test_err, test_acc = test_fn(X_test, y_test, num_slices=4)
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(float(test_err)))
    print("  test accuracy:\t\t{:.2f} %".format(float(test_acc) * 100))
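
# --- A possible sketch of the `updates.nesterov_momentum` helper used above.
#     Illustrative only; the example's actual `updates` module may differ.
#     It splits the optimizer into (1) updates that write gradients into
#     dedicated shared variables, which can be all-reduced across GPUs, and
#     (2) updates that apply the Nesterov-momentum step from those gradients.
#     Assumes numpy (np) and theano are imported at module level. ---
from collections import OrderedDict


def nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9):
    grads = theano.grad(loss, wrt=params)
    grad_shared = [theano.shared(np.zeros_like(p.get_value()))
                   for p in params]
    grad_updates = OrderedDict(zip(grad_shared, grads))  # phase 1: store grads

    param_updates = OrderedDict()                        # phase 2: apply step
    for p, g in zip(params, grad_shared):
        v = theano.shared(np.zeros_like(p.get_value()))  # velocity
        v_new = momentum * v - learning_rate * g
        param_updates[v] = v_new
        param_updates[p] = p + momentum * v_new - learning_rate * g
    return grad_updates, param_updates, grad_shared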
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

synk.fork(2)

s_init = np.ones(2, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
f = synk.function([x], updates=[(s, T.sum(x * s, axis=0))])
synk.distribute()

x_dat = synk.data(np.array([[1, 1],
                            [2, 2],
                            [3, 3],
                            [4, 4]]).astype('float32'))

print("\ns initial:\n", s.get_value())
f.as_theano(x_dat.data)
print("\ns after Theano call:\n", s.get_value())

s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)

synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)

synk.broadcast(s, s_init)
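
# --- Illustrative addition: confirm the broadcast by gathering again; every
#     GPU's copy of s should now hold s_init. ---
gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after broadcast:\n", gathered_s)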