def __init__(self, network):
    '''The constructor of MVNetParamManager

    The constructor will associate the parameters with a multiverso array
    table. The initial value of the ArrayTableHandler will be the same as
    the parameters of the network. If different parameters are used in
    different processes, their average will be used as the initial value.
    '''
    self.shapes = []
    self.dtypes = []
    self.sizes = []
    self.all_param_list = []
    self.network = network
    for arr in lasagne.layers.get_all_param_values(self.network):
        self.shapes.append(arr.shape)
        # TODO: Only float32 is supported in multiverso for now, so all
        # parameters are stored in a single float32 array. This needs to
        # be revisited once other types are supported.
        assert np.dtype("float32") == arr.dtype
        self.dtypes.append(arr.dtype)
        self.sizes.append(arr.size)
        self.all_param_list.extend([i for i in np.nditer(arr)])
    self.all_param_list = np.array(self.all_param_list)

    self.tbh = mv.ArrayTableHandler(len(self.all_param_list),
                                    init_value=self.all_param_list)
    mv.barrier()  # add barrier to make sure the initial values have taken effect
    self.all_param_list = self.tbh.get()
    self._set_all_param_to_net()
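# sync_all_param and _set_all_param_to_net are referenced above but not shown.
# A minimal sketch of what they might look like, using only the
# ArrayTableHandler add/get calls exercised by the tests in this section
# (an illustration, not the library's exact implementation):
def sync_all_param(self):
    '''Sketch: commit local deltas, then fetch the aggregated parameters.'''
    cur_params = np.concatenate(
        [arr.reshape((-1,)) for arr in lasagne.layers.get_all_param_values(self.network)])
    self.tbh.add(cur_params - self.all_param_list)  # send only local progress
    self.all_param_list = self.tbh.get()            # fetch aggregated values
    self._set_all_param_to_net()

def _set_all_param_to_net(self):
    '''Sketch: slice the flat array back into per-layer shapes.'''
    offset = 0
    params = []
    for shape, size in zip(self.shapes, self.sizes):
        params.append(self.all_param_list[offset:offset + size].reshape(shape))
        offset += size
    lasagne.layers.set_all_param_values(self.network, params)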
def test_matrix(self):
    num_row = 11
    num_col = 10
    size = num_col * num_row
    workers_num = mv.workers_num()
    tbh = mv.MatrixTableHandler(num_row, num_col)
    mv.barrier()
    for count in xrange(1, 21):
        row_ids = [0, 1, 5, 10]
        tbh.add(range(size))
        tbh.add([range(rid * num_col, (1 + rid) * num_col) for rid in row_ids],
                row_ids)
        mv.barrier()
        data = tbh.get()
        mv.barrier()
        for i, row in enumerate(data):
            for j, actual in enumerate(row):
                expected = (i * num_col + j) * count * workers_num
                if i in row_ids:
                    expected += (i * num_col + j) * count * workers_num
                self.assertEqual(expected, actual)
        data = tbh.get(row_ids)
        mv.barrier()
        for i, row in enumerate(data):
            for j, actual in enumerate(row):
                expected = (row_ids[i] * num_col + j) * count * workers_num * 2
                self.assertEqual(expected, actual)
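# Why the expected values above hold: in each iteration every worker adds
# range(size) to the whole table (so cell (i, j) receives i * num_col + j),
# and then adds the same row values a second time for the rows in row_ids.
# After `count` iterations across workers_num workers, cell (i, j) holds
#     (i * num_col + j) * count * workers_num        for ordinary rows, and
#     (i * num_col + j) * count * workers_num * 2    for rows in row_ids.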
def __init__(self, svobj):
    '''Constructor of the MVSharedVariable

    The constructor will create an ArrayTableHandler and associate the
    shared variable with it. The initial value of the ArrayTableHandler
    will be the same as the value of the SharedVariable.

    *Notice*: Only the `init_value` from the master will be used!
    '''
    assert isinstance(svobj, SharedVariable)
    self._svobj = svobj
    self._mv_array = mv.ArrayTableHandler(self._svobj.get_value().size,
                                          init_value=self._svobj.get_value().reshape((-1,)))

    mv.barrier()  # add barrier to make sure the initial values have taken effect

    # _last_mv_data stores a copy of the value. It will be used to calculate
    # the update for multiverso when calling mv_sync.
    self._last_mv_data = self._mv_array.get().reshape(self._svobj.get_value().shape)
    self._svobj.set_value(self._last_mv_data, borrow=False)
def __init__(self, svobj):
    '''Constructor of the MVSharedVariable

    The constructor will create an ArrayTableHandler and associate the
    shared variable with it. The initial value of the ArrayTableHandler
    will be the same as the value of the SharedVariable. If different
    initial values are used in different processes, their average will be
    used as the initial value.
    '''
    assert isinstance(svobj, SharedVariable)
    self._svobj = svobj
    self._mv_array = mv.ArrayTableHandler(self._svobj.get_value().size,
                                          init_value=self._svobj.get_value().reshape((-1,)))

    mv.barrier()  # add barrier to make sure the initial values have taken effect

    # _last_mv_data stores a copy of the value. It will be used to calculate
    # the update for multiverso when calling mv_sync.
    self._last_mv_data = self._mv_array.get().reshape(self._svobj.get_value().shape)
    self._svobj.set_value(self._last_mv_data, borrow=False)
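# mv_sync is mentioned in the comments above but not shown. A minimal sketch
# of the idea, using the ArrayTableHandler add/get calls seen in the tests
# (an illustration, not the library's exact implementation):
def mv_sync(self):
    '''Sketch: commit the local delta, then install the aggregated value.'''
    # commit only the progress made locally since the last sync; the delta
    # is taken against _last_mv_data, the snapshot from the previous sync
    delta = self._svobj.get_value().reshape((-1,)) - self._last_mv_data.reshape((-1,))
    self._mv_array.add(delta)
    # fetch the aggregated value and install it in the shared variable
    self._last_mv_data = self._mv_array.get().reshape(self._svobj.get_value().shape)
    self._svobj.set_value(self._last_mv_data, borrow=False)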
def _test_array(self, size):
    tbh = mv.ArrayTableHandler(size)
    mv.barrier()

    for i in xrange(100):
        tbh.add(range(1, size + 1))
        tbh.add(range(1, size + 1))
        mv.barrier()
        for j, actual in enumerate(tbh.get()):
            self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(), actual)
        mv.barrier()
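# A minimal harness for running tests like the one above: multiverso must be
# initialized before any table handler is created and shut down at the end.
# This sketch is based on the mv.init / mv.shutdown lifecycle used in the
# examples later in this section (the layout here is illustrative):
if __name__ == '__main__':
    import unittest
    import multiverso as mv

    mv.init()
    unittest.main(exit=False)  # run the TestCase that defines _test_array
    mv.shutdown()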
def _test_sharedvar(self, row, col):
    W = sharedvar.mv_shared(value=np.zeros((row, col), dtype=theano.config.floatX),
                            name='W', borrow=True)
    delta = np.array(range(1, row * col + 1),
                     dtype=theano.config.floatX).reshape((row, col))
    train_model = theano.function([], updates=[(W, W + delta)])
    mv.barrier()

    for i in xrange(100):
        train_model()
        train_model()
        sharedvar.sync_all_mv_shared_vars()
        mv.barrier()
        # to get the newest value, we must sync again
        sharedvar.sync_all_mv_shared_vars()
        for j, actual in enumerate(W.get_value().reshape(-1)):
            self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(), actual)
        mv.barrier()
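# Note on the double sync above: the first sync_all_mv_shared_vars() call
# commits this worker's delta to the server, but other workers may commit
# theirs after it. Once every worker has passed the barrier, all deltas are
# in, so a second sync is needed to pull the fully aggregated value down
# before the assertions run.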
p_y_given_x = model(x, *params)
y = T.argmax(p_y_given_x, axis=1)
cost = T.mean(T.nnet.categorical_crossentropy(p_y_given_x, t))
updates = momentum(cost, params, learning_rate=0.01, momentum=0.9)

# compile theano functions
train = theano.function([x, t], cost, updates=updates, allow_input_downcast=True)
predict = theano.function([x], y, allow_input_downcast=True)

# MULTIVERSO: all the workers will synchronize at the place you call barrier
mv.barrier()

# train model
batch_size = 50
for i in range(50):
    for start in range(0, len(x_train), batch_size):
        # every process only trains the batches assigned to itself
        if start / batch_size % workers_num != worker_id:
            continue
        x_batch = x_train[start:start + batch_size]
        t_batch = t_train[start:start + batch_size]
        cost = train(x_batch, t_batch)
        # MULTIVERSO: sync values with multiverso after every batch
        sharedvar.sync_all_mv_shared_vars()
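# After the training loop, these examples usually synchronize, let only the
# master report results, and shut multiverso down. A sketch following the
# MNIST example later in this section (x_test / t_test are assumed to be
# defined alongside x_train / t_train):
mv.barrier()
if mv.is_master_worker():
    y_pred = predict(x_test)
    print('accuracy: {:.4f}'.format(np.mean(y_pred == t_test)))
mv.shutdown()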
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=80,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          validFreq=2500,
          dev_bleu_freq=20000,
          datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
          valid_datasets=('./data/dev/dev_en.tok', './data/dev/dev_fr.tok'),
          small_train_datasets=('./data/train/small_en-fr.en', './data/train/small_en-fr.fr',
                                './data/train/small_en-fr.fr'),
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',
          # Options below are from v-yanfa
          dump_before_train=True,
          plot_graph=None,
          vocab_filenames=('./data/dic/filtered_dic_en-fr.en.pkl',
                           './data/dic/filtered_dic_en-fr.fr.pkl'),
          map_filename='./data/dic/mapFullVocab2Top1MVocab.pkl',
          lr_discount_freq=80000,
          # Options of deeper encoder and decoder
          n_encoder_layers=1,
          n_decoder_layers=1,
          encoder_many_bidirectional=True,
          attention_layer_id=0,
          unit='gru',
          residual_enc=None,
          residual_dec=None,
          use_zigzag=False,
          initializer='orthogonal',
          given_embedding=None,
          dist_type=None,
          dist_recover_lr_iter=False,
          unit_size=2,
          cond_unit_size=2,
          given_imm=False,
          dump_imm=False,
          shuffle_data=False,
          decoder_all_attention=False,
          average_context=False,
          task='en-fr',
          fine_tune_patience=8,
          nccl=False,
          src_vocab_map_file=None,
          tgt_vocab_map_file=None,
          trg_attention_layer_id=None,
          fix_dp_bug=False,
          temperature=1.0,
          scale=1.0,
          gate_dropout=0.0,
          ):
    model_options = locals().copy()

    # Set distributed computing environment
    worker_id = 0
    if dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            from . import multiverso_ as mv
        worker_id = mv.worker_id()
    elif dist_type == 'mpi_reduce':
        from mpi4py import MPI
        mpi_communicator = MPI.COMM_WORLD
        worker_id = mpi_communicator.Get_rank()
        workers_cnt = mpi_communicator.Get_size()
        if nccl:
            nccl_comm = init_nccl_env(mpi_communicator)

    print 'Use {}, worker id: {}'.format(
        'multiverso' if dist_type == 'mv' else 'mpi' if dist_recover_lr_iter else 'none',
        worker_id)
    sys.stdout.flush()

    # Set logging file
    set_logging_file('log/complete/e{}d{}_res{}_att{}_worker{}_task{}_{}.txt'.format(
        n_encoder_layers, n_decoder_layers, residual_enc, attention_layer_id,
        worker_id, task, time.strftime('%m-%d-%H-%M-%S'),
    ))

    log('''\
Start Time = {}
'''.format(time.strftime('%c')))

    # Model options: load and save
    message('Top options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message('Done')
    sys.stdout.flush()

    # load_options(model_options, reload_, preload, src_vocab_map_file and tgt_vocab_map_file)
    check_options(model_options)
    model_options['cost_normalization'] = 1
    ada_alpha = 0.95
    if dist_type == 'mpi_reduce':
        model_options['cost_normalization'] = workers_cnt

    message('Model options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message()

    print 'Loading data'
    log('\n\n\nStart to prepare data\n@Current Time = {}'.format(time.time()))
    sys.stdout.flush()

    dataset_src, dataset_tgt = datasets[0], datasets[1]

    if shuffle_data:
        text_iterator_list = [None for _ in range(10)]
        text_iterator = None
    else:
        text_iterator_list = None
        text_iterator = TextIterator(
            dataset_src, dataset_tgt,
            vocab_filenames[0], vocab_filenames[1],
            batch_size, n_words_src, n_words, maxlen,
        )

    valid_iterator = TextIterator(
        valid_datasets[0], valid_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words,
    )

    small_train_iterator = TextIterator(
        small_train_datasets[0], small_train_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words,
    )

    print 'Building model'
    model = NMTModel(model_options)
    params = model.initializer.init_params()

    # Reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        load_params(preload, params,
                    src_map_file=src_vocab_map_file, tgt_map_file=tgt_vocab_map_file)
    sys.stdout.flush()

    # Given embedding
    if given_embedding is not None:
        print 'Loading given embedding...',
        load_embedding(params, given_embedding)
        print 'Done'

    print_params(params)
    model.init_tparams(params)

    # Build model, stochastic_mode = 0 (soft), 1 (stochastic), 2 (hard)
    trng, use_noise, stochastic_mode, hyper_param, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, test_cost, x_emb, stochastic_updates, _ = model.build_model()
    inps = [x, x_mask, y, y_mask]

    all_stochastic_updates = OrderedDictUpdates()
    for item1 in stochastic_updates:
        for item2 in item1:
            all_stochastic_updates.update(item2)

    print 'Building sampler'
    f_init, f_next = model.build_sampler(trng=trng, use_noise=use_noise, batch_mode=True,
                                         stochastic_mode=stochastic_mode, hyper_param=hyper_param)
    stochastic_mode.set_value(1)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile,
                                  updates=all_stochastic_updates)
    print 'Done'
    sys.stdout.flush()

    test_cost = test_cost.mean()  # FIXME: do not regularize test_cost here
    cost = cost.mean()
    cost = l2_regularization(cost, model.P, decay_c)
    cost = regularize_alpha_weights(cost, alpha_c, model_options, x_mask, y_mask, opt_ret)

    print 'Building f_cost...',
    f_cost = theano.function(inps, test_cost, profile=profile,
                             updates=all_stochastic_updates)
    print 'Done'

    if plot_graph is not None:
        print 'Plotting post-compile graph...',
        theano.printing.pydotprint(
            f_cost,
            outfile='pictures/post_compile_{}'.format(plot_graph),
            var_with_name_simple=True,
        )
        print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(model.P))

    clip_shared = theano.shared(np.array(clip_c, dtype=fX), name='clip_shared')
    if dist_type != 'mpi_reduce':
        # build grads clip into the computational graph
        grads, g2 = clip_grad_remove_nan(grads, clip_shared, model.P)
    else:
        # do the grads clip after gradient aggregation
        g2 = None

    # compile the optimizer; the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    given_imm_data = get_adadelta_imm_data(optimizer, given_imm, preload)
    if optimizer == 'adadelta':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2, given_imm_data=given_imm_data,
            alpha=ada_alpha, all_stochastic_updates=all_stochastic_updates)
    if optimizer == 'adam':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2, given_imm_data=given_imm_data,
            all_stochastic_updates=all_stochastic_updates)
    print 'Done'

    if dist_type == 'mpi_reduce':
        f_grads_clip = make_grads_clip_func(grads_shared=grads_shared, mt_tparams=model.P,
                                            clip_c_shared=clip_shared)

    print 'Optimization'
    log('Preparation Done\n@Current Time = {}'.format(time.time()))

    if dist_type == 'mv':
        mv.barrier()
    elif dist_type == 'mpi_reduce':
        # create receive buffers for mpi allreduce
        rec_grads = [np.zeros_like(p.get_value()) for p in model.P.itervalues()]

    estop = False
    history_errs = []
    best_bleu = -1.0
    best_valid_cost = 1e6
    best_p = None
    bad_counter = 0

    uidx = search_start_uidx(reload_, preload)

    epoch_n_batches = 0
    start_epoch = 0
    pass_batches = 0

    print 'worker', worker_id, 'uidx', uidx, 'l_rate', lrate, 'ada_alpha', ada_alpha, \
        'n_batches', epoch_n_batches, 'start_epoch', start_epoch, 'pass_batches', pass_batches

    start_uidx = uidx

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs, uidx=uidx, **unzip(model.P))
        save_options(model_options, uidx, saveto)
        print 'Done'
        sys.stdout.flush()

    stochastic_mode.set_value(0)
    valid_cost = validation(valid_iterator, f_cost, use_noise)
    small_train_cost = validation(small_train_iterator, f_cost, use_noise)
    message('Soft Valid cost {:.5f} Small train cost {:.5f}'.format(valid_cost, small_train_cost))
    stochastic_mode.set_value(1)

    # new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
    # best_bleu = new_bleu
    # message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
    sys.stdout.flush()

    commu_time_sum = 0.0
    cp_time_sum = 0.0
    reduce_time_sum = 0.0

    start_time = time.time()
    finetune_cnt = 0

    for eidx in xrange(start_epoch, max_epochs):
        if shuffle_data:
            text_iterator = load_shuffle_text_iterator(
                eidx, worker_id, text_iterator_list,
                datasets, vocab_filenames, batch_size, maxlen, n_words_src, n_words,
            )
        n_samples = 0
        if dist_type == 'mpi_reduce':
            mpi_communicator.Barrier()

        for i, (x, y) in enumerate(text_iterator):
            # ignore the first several batches when reloading
            if eidx == start_epoch and i < pass_batches:
                continue

            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            effective_uidx = uidx - start_uidx
            ud_start = time.time()

            # compute cost, grads
            if dist_type != 'mpi_reduce':
                cost, g2_value = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if dist_type == 'mpi_reduce':
                reduce_start = time.time()
                commu_time = 0
                gpucpu_cp_time = 0
                if not nccl:
                    commu_time, gpucpu_cp_time = all_reduce_params(grads_shared, rec_grads)
                else:
                    commu_time, gpucpu_cp_time = all_reduce_params_nccl(nccl_comm, grads_shared)
                reduce_time = time.time() - reduce_start
                commu_time_sum += commu_time
                reduce_time_sum += reduce_time
                cp_time_sum += gpucpu_cp_time

                g2_value = f_grads_clip()
                print '@Worker = {}, Reduce time = {:.5f}, Commu time = {:.5f}, Copy time = {:.5f}'.format(
                    worker_id, reduce_time, commu_time, gpucpu_cp_time)

            curr_lr = lrate if not dist_type or dist_recover_lr_iter < effective_uidx \
                else lrate * 0.05 + effective_uidx * lrate / dist_recover_lr_iter * 0.95
            if curr_lr < lrate:
                print 'Curr lr {:.3f}'.format(curr_lr)

            # do the update on parameters
            f_update(curr_lr)

            ud = time.time() - ud_start

            if np.isnan(g2_value) or np.isinf(g2_value):
                message('gradient NaN detected')
                sys.stdout.flush()

            if np.isnan(cost) or np.isinf(cost):
                message('cost NaN detected')
                model.save_model(saveto, history_errs, uidx)
                save_minibatch(x, y, saveto, uidx, vocab_filenames)
                sys.stdout.flush()
                return 1., 1., 1.

            # discount learning rate
            # FIXME: Do NOT enable this and fine-tune at the same time
            if lr_discount_freq > 0 and np.mod(effective_uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))

            # sync batch
            if dist_type == 'mv' and np.mod(uidx, dispFreq) == 0:
                comm_start = time.time()
                model.sync_tparams()
                message('@Comm time = {:.5f}'.format(time.time() - comm_start))

            # verbose
            if np.mod(effective_uidx, dispFreq) == 0:
                message('Worker {} Epoch {} Update {} Cost {:.5f} G2 {:.5f} UD {:.5f} Time {:.5f} s'.format(
                    worker_id, eidx, uidx, float(cost), float(g2_value), ud,
                    time.time() - start_time,
                ))
                sys.stdout.flush()

            if np.mod(effective_uidx, saveFreq) == 0 and worker_id == 0:
                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    model.save_model(saveto, history_errs, uidx)
                    print 'Done'
                    sys.stdout.flush()

                # save immediate data in adadelta
                saveto_imm_path = '{}_latest.npz'.format(os.path.splitext(saveto)[0])
                dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto_imm_path)

            if np.mod(effective_uidx, validFreq) == 0:
                stochastic_mode.set_value(0)
                valid_cost = validation(valid_iterator, f_cost, use_noise)
                small_train_cost = validation(small_train_iterator, f_cost, use_noise)
                message('Soft Valid cost {:.5f} Small train cost {:.5f}'.format(valid_cost, small_train_cost))
                # new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
                # message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
                sys.stdout.flush()

                # if new_bleu > best_bleu:
                #     print 'Saving the model at iteration {}...'.format(uidx),
                #     model.save_model(saveto, history_errs, uidx)
                #     print 'Done'
                #     best_bleu = new_bleu
                #     sys.stdout.flush()

                stochastic_mode.set_value(1)

                # Fine-tune based on dev cost
                if fine_tune_patience > 0:
                    if valid_cost < best_valid_cost:
                        bad_counter = 0
                        best_valid_cost = valid_cost
                        # dump the best model so far, including the immediate file
                        if worker_id == 0:
                            message('Dump the best model so far at uidx {}'.format(uidx))
                            model.save_model(saveto, history_errs)
                            # dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto)
                    else:
                        bad_counter += 1
                        if bad_counter >= fine_tune_patience:
                            print 'Fine tune:',
                            if finetune_cnt % 2 == 0:
                                lrate = np.float32(lrate * 0.5)
                                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))
                                if lrate <= 0.025:
                                    message('Learning rate decayed to {:.5f}, task completed'.format(lrate))
                                    return 1., 1., 1.
                            else:
                                clip_shared.set_value(np.float32(clip_shared.get_value() * 0.25))
                                message('Discount clip value to {} at iteration {}'.format(clip_shared.get_value(), uidx))
                            finetune_cnt += 1
                            bad_counter = 0

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after {} iterations!'.format(uidx)
                estop = True
                break

        print 'Seen {} samples'.format(n_samples)

        if estop:
            break

    if best_p is not None:
        zipp(best_p, model.P)

    use_noise.set_value(0.)

    return 0.
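# all_reduce_params (used in the mpi_reduce branch above) is defined
# elsewhere. A minimal sketch of the idea, assuming mpi4py and Theano shared
# variables for the gradients; the timing split mirrors how the caller
# reports communication vs. copy time, and averaging is handled separately
# by the caller via model_options['cost_normalization']:
import time

from mpi4py import MPI


def all_reduce_params_sketch(shared_grads, rec_buffers, comm=MPI.COMM_WORLD):
    commu_time = 0.0
    copy_time = 0.0
    for grad, rec in zip(shared_grads, rec_buffers):
        t0 = time.time()
        local = grad.get_value()                # GPU -> host copy
        copy_time += time.time() - t0

        t0 = time.time()
        comm.Allreduce(local, rec, op=MPI.SUM)  # sum gradients over workers
        commu_time += time.time() - t0

        t0 = time.time()
        grad.set_value(rec)                     # host -> GPU copy of summed grads
        copy_time += time.time() - t0
    return commu_time, copy_time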
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print("CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'.")
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d" % lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: LasagneParamManager is a parameter manager which can
    # synchronize the parameters of Lasagne with multiverso.
    lpm = param_manager.LasagneParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(loss, params, learning_rate=sh_lr, momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data among the processes
            examples_per_worker = X_train.shape[0] / workers_num
            start_index = worker_id * examples_per_worker
            train_indices = np.arange(start_index, start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train, rand_Y_train, batch_size,
                                             shuffle=True, augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters managed by LasagneParamManager and fetch the
                # latest parameters from the parameter server, call this
                # function to synchronize the values
                lpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs, time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: because ASGD and multiple GPUs are used, the learning
                # rate schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

        if mv.is_master_worker():
            # MULTIVERSO: update the parameters before saving the model
            lpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz',
                     *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate validation error of model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
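# main() above reads worker_id and workers_num as globals. A sketch of the
# module-level setup it assumes, based on the multiverso API used in the
# MNIST example later in this section:
import multiverso as mv

mv.init()
worker_id = mv.worker_id()
workers_num = mv.workers_num()

main()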
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print("CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'.")
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d" % lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: MVNetParamManager is a parameter manager which can
    # synchronize the parameters of Lasagne with multiverso.
    mvnpm = param_manager.MVNetParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(
            loss, params, learning_rate=sh_lr, momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data among the processes
            examples_per_worker = X_train.shape[0] / workers_num
            start_index = worker_id * examples_per_worker
            train_indices = np.arange(start_index, start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train, rand_Y_train, batch_size,
                                             shuffle=True, augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters managed by MVNetParamManager and fetch the
                # latest parameters from the parameter server, call this
                # function to synchronize the values
                mvnpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs, time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: because ASGD and multiple GPUs are used, the learning
                # rate schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

        if mv.is_master_worker():
            # MULTIVERSO: update the parameters before saving the model
            mvnpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz',
                     *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate validation error of model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(DIGITS + 1))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in range(LAYERS):
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))

# For each step of the output sequence, decide which character should be chosen
model.add(TimeDistributed(Dense(len(chars))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

mv.barrier()

# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # Add the MVCallback to update the parameters from multiverso
    model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1,
              verbose=(1 if mv.is_master_worker() else 0),
              validation_data=(X_val, y_val),
              callbacks=[MVCallback(model, freq=2)])
    ###
    # Select 10 samples from the validation set at random so we can visualize errors
    if mv.is_master_worker():
        for i in range(10):
            ind = np.random.randint(0, len(X_val))
            rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]
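# MVCallback comes from the multiverso Keras extension and its implementation
# is not shown here. A minimal sketch of the idea as a standard Keras
# callback; the param-manager object and its sync_all_param method are
# assumptions modeled on the MVNetParamManager used in the Lasagne examples:
from keras.callbacks import Callback


class MVCallbackSketch(Callback):
    '''Sync parameters with multiverso every `freq` epochs (illustrative).'''

    def __init__(self, param_manager, freq=1):
        super(MVCallbackSketch, self).__init__()
        self.param_manager = param_manager  # assumed manager, see lead-in
        self.freq = freq

    def on_epoch_end(self, epoch, logs=None):
        # commit local deltas and pull the aggregated weights every freq epochs
        if (epoch + 1) % self.freq == 0:
            self.param_manager.sync_all_param()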
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz', batch_size=600):
    """Demonstrate stochastic gradient descent optimization of a log-linear model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
        http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling other multiverso APIs
    mv.init()
    # MULTIVERSO: every process has a distinct worker id
    worker_id = mv.worker_id()
    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = T.matrix('x')   # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    validation_frequency = n_train_batches

    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train the batches that belong to itself.
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters produced by mv_shared and fetch the latest
                # parameters from the parameter server, call this function
                # to synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index
            # MULTIVERSO: only the master worker will output the model
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    mv.barrier()

    # MULTIVERSO: You should make sure only one process will output the result.
    # Otherwise the results will be output repeatedly.
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        print(('Optimization complete with validation score of %f %%, '
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code ran for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
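# These examples are typically launched with MPI, one process per multiverso
# worker. The exact command depends on your MPI installation; an illustrative
# example (script name assumed):
#
#   mpirun -np 4 python mnist_multiverso.py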