def setUpModule():
    mv.init()
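# A matching teardown is not shown in the snippet above; a minimal counterpart
# sketch (hypothetical, not from the original test file) would release Multiverso
# the same way the examples below do, via mv.shutdown():
def tearDownModule():
    mv.shutdown()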
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.signal import downsample

import multiverso as mv


def model(x, w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o):
    c1 = T.maximum(0, conv.conv2d(x, w_c1) + b_c1.dimshuffle('x', 0, 'x', 'x'))
    p1 = downsample.max_pool_2d(c1, (3, 3))
    c2 = T.maximum(0, conv.conv2d(p1, w_c2) + b_c2.dimshuffle('x', 0, 'x', 'x'))
    p2 = downsample.max_pool_2d(c2, (2, 2))
    p2_flat = p2.flatten(2)
    h3 = T.maximum(0, T.dot(p2_flat, w_h3) + b_h3)
    p_y_given_x = T.nnet.softmax(T.dot(h3, w_o) + b_o)
    return p_y_given_x


# MULTIVERSO: you should call mv.init before calling other multiverso APIs
mv.init()
worker_id = mv.worker_id()  # MULTIVERSO: every process has a distinct worker id
workers_num = mv.workers_num()

# `init_weights` is defined elsewhere in the original example (a hypothetical sketch follows below).
w_c1 = init_weights((4, 3, 3, 3), name="w_c1")
b_c1 = init_weights((4,), name="b_c1")
w_c2 = init_weights((8, 4, 3, 3), name="w_c2")
b_c2 = init_weights((8,), name="b_c2")
w_h3 = init_weights((8 * 4 * 4, 100), name="w_h3")
b_h3 = init_weights((100,), name="b_h3")
w_o = init_weights((100, 10), name="w_o")
b_o = init_weights((10,), name="b_o")
params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o]
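# Hypothetical sketch of the `init_weights` helper used above; it is not part of
# the original snippet. It assumes the Multiverso Theano extension exposes
# `sharedvar.mv_shared` as a drop-in replacement for `theano.shared` (the training
# loop later in this file commits deltas of such variables via
# `sharedvar.sync_all_mv_shared_vars()`). The exact signature is an assumption.
import numpy as np
from multiverso.theano_ext import sharedvar


def init_weights(shape, name):
    # Small random initial values, stored in a Multiverso-backed shared variable
    # so every worker's updates can be synchronized through the parameter server.
    value = np.asarray(np.random.randn(*shape) * 0.01, dtype='float32')
    return sharedvar.mv_shared(value=value, name=name)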
import argparse
import os
import sys

# NOTE: `Datasets` and `get_gpu_usage` are project helpers; they are assumed to be
# imported elsewhere in the original script.


def main():
    parser = argparse.ArgumentParser(
        description='Train the deep NMT model.',
        fromfile_prefix_chars='@',
    )
    parser.add_argument('-R', action="store_false", default=True, dest='reload',
                        help='Reload old model, defaults to True, set to False')
    parser.add_argument('-d', action='store_true', default=False, dest='dump_before_train',
                        help='Dump before train, defaults to False, set to True')
    parser.add_argument('--lr', action="store", metavar="learning_rate", dest="learning_rate",
                        type=float, default=1.0,
                        help='Start learning rate, default is %(default)s')
    parser.add_argument('--optimizer', action='store', default='adadelta')
    parser.add_argument('--plot', action='store', default=None,
                        help='Plot filename, default is None (no plot) (deprecated).')
    parser.add_argument('--save_freq', action='store', default=10000, type=int, dest='save_freq',
                        help='Model save frequency, default is %(default)s')
    parser.add_argument('--dev_bleu_freq', action='store', default=20000, type=int, dest='dev_bleu_freq',
                        help='Dev set BLEU evaluation frequency, default is %(default)s')
    parser.add_argument('--dim', action='store', default=512, type=int, dest='dim',
                        help='Dim of hidden units, default is %(default)s')
    parser.add_argument('--bs', action='store', default=128, type=int, dest='batch_size',
                        help='Train batch size, default is %(default)s')
    parser.add_argument('--valid_bs', action='store', default=128, type=int, dest='valid_batch_size',
                        help='Valid batch size, default is %(default)s')
    parser.add_argument('--dim_word', action='store', default=512, type=int, dest='dim_word',
                        help='Dim of word embedding, default is %(default)s')
    parser.add_argument('--maxlen', action='store', default=80, type=int, dest='maxlen',
                        help='Max sentence length, default is %(default)s')
    parser.add_argument('-S', action='store_false', default=True, dest='shuffle',
                        help='Shuffle data per epoch, defaults to True, set to False')
    parser.add_argument('--train1', action='store', metavar='filename', dest='train1', type=str,
                        default='filtered_en-fr.en',
                        help='Source train file, default is %(default)s')
    parser.add_argument('--train2', action='store', metavar='filename', dest='train2', type=str,
                        default='filtered_en-fr.fr',
                        help='Target train file, default is %(default)s')
    parser.add_argument('--small1', action='store', metavar='filename', dest='small1', type=str,
                        default='small_en-fr.en',
                        help='Source small train file, default is %(default)s')
    parser.add_argument('--small2', action='store', metavar='filename', dest='small2', type=str,
                        default='small_en-fr.fr',
                        help='Target small train file, default is %(default)s')
    parser.add_argument('--valid1', action='store', metavar='filename', dest='valid1', type=str,
                        default='dev_en.tok',
                        help='Source valid file, default is %(default)s')
    parser.add_argument('--valid2', action='store', metavar='filename', dest='valid2', type=str,
                        default='dev_fr.tok',
                        help='Target valid file, default is %(default)s')
    parser.add_argument('--dic1', action='store', metavar='filename', dest='dic1', type=str,
                        default='filtered_dic_en-fr.en.pkl',
                        help='Source dict file, default is %(default)s')
    parser.add_argument('--dic2', action='store', metavar='filename', dest='dic2', type=str,
                        default='filtered_dic_en-fr.fr.pkl',
                        help='Target dict file, default is %(default)s')
    parser.add_argument('--n_words_src', action='store', default=30000, type=int, dest='n_words_src',
                        help='Vocabulary size on the source side, default is %(default)s')
    parser.add_argument('--n_words_tgt', action='store', default=30000, type=int, dest='n_words_tgt',
                        help='Vocabulary size on the target side, default is %(default)s')
    parser.add_argument('model_file', nargs='?', default='model/baseline/baseline.npz',
                        help='Generated model file, default is "%(default)s"')
    parser.add_argument('pre_load_file', nargs='?', default='model/en2fr.iter160000.npz',
                        help='Pre-load model file, default is "%(default)s"')
    parser.add_argument('--src_vocab_map', action='store', metavar='filename', dest='src_vocab_map_file',
                        type=str, default=None,
                        help='The file containing source vocab mapping information, '
                             'used to initialize a model on a large dataset from a small one')
    parser.add_argument('--tgt_vocab_map', action='store', metavar='filename', dest='tgt_vocab_map_file',
                        type=str, default=None,
                        help='The file containing target vocab mapping information, '
                             'used to initialize a model on a large dataset from a small one')
    parser.add_argument('--enc', action='store', default=1, type=int, dest='n_encoder_layers',
                        help='Number of encoder layers, default is 1')
    parser.add_argument('--dec', action='store', default=1, type=int, dest='n_decoder_layers',
                        help='Number of decoder layers, default is 1')
    parser.add_argument('--conn', action='store', default=2, type=int, dest='connection_type',
                        help='Connection type, '
                             'default is 2 (bidirectional only in the first layer, other layers are forward); '
                             '1 is divided bidirectional GRU')
    parser.add_argument('--max_epochs', action='store', default=100, type=int, dest='max_epochs',
                        help='Maximum epochs, default is 100')
    parser.add_argument('--unit', action='store', metavar='unit', dest='unit', type=str, default='lstm',
                        help='The unit type, default is "lstm", can be set to "gru".')
    parser.add_argument('--attention', action='store', metavar='index', dest='attention_layer_id',
                        type=int, default=0,
                        help='Attention layer index, default is 0')
    parser.add_argument('--residual_enc', action='store', metavar='type', dest='residual_enc',
                        type=str, default=None,
                        help='Residual connection of encoder, default is None, candidates are "layer_wise", "last"')
    parser.add_argument('--residual_dec', action='store', metavar='type', dest='residual_dec',
                        type=str, default='layer_wise',
                        help='Residual connection of decoder, default is "layer_wise", candidates are None, "last"')
    parser.add_argument('-z', '--zigzag', action='store_false', default=True, dest='use_zigzag',
                        help='Use zigzag in encoder, defaults to True, set to False')
    parser.add_argument('--dropout', action="store", metavar="dropout", dest="dropout",
                        type=float, default=False,
                        help='Dropout rate, default is False (do not use dropout)')
    parser.add_argument('--unit_size', action='store', default=2, type=int, dest='unit_size',
                        help='Unit size, default is %(default)s')
    # TODO: rename this option to decoder_unit_size in the future
    parser.add_argument('--cond_unit_size', action='store', default=2, type=int, dest='cond_unit_size',
                        help='Decoder unit size (will be renamed in the future), default is %(default)s')
    parser.add_argument('--clip', action='store', metavar='clip', dest='clip', type=float, default=1.0,
                        help='Gradient clip rate, default is 1.0.')
    parser.add_argument('--manual', action='store_false', dest='auto', default=True,
                        help='Set dropout rate and gradient clip rate manually.')
    parser.add_argument('--emb', action='store', metavar='filename', dest='given_embedding',
                        type=str, default=None,
                        help='Given embedding model file, default is None')
    parser.add_argument('--lr_discount', action='store', metavar='freq', dest='lr_discount_freq',
                        type=int, default=-1,
                        help='The learning rate discount frequency, default is -1')
    parser.add_argument('--distribute', action='store', metavar='type', dest='dist_type',
                        type=str, default=None,
                        help='The distribution version, default is None (single GPU mode), '
                             'candidates are "mv", "mpi_reduce"')
    parser.add_argument('--nccl', action="store_true", default=False, dest='nccl',
                        help='Use NCCL in distributed mode, defaults to False, set to True')
    parser.add_argument('--clip_grads_local', action="store_true", default=False, dest='clip_grads_local',
                        help='Clip gradients locally in distributed mode, defaults to False, set to True')
    parser.add_argument('--recover_lr_iter', action='store', dest='dist_recover_lr', type=int, default=10000,
                        help='The mini-batch index at which to recover the learning rate in distributed mode, '
                             'default is 10000.')
    parser.add_argument('--all_att', action='store_true', dest='all_att', default=False,
                        help='Generate attention from all decoder layers, defaults to False, set to True')
    parser.add_argument('--avg_ctx', action='store_true', dest='avg_ctx', default=False,
                        help='Average all context vectors to get softmax, defaults to False, set to True')
    parser.add_argument('--dataset', action='store', dest='dataset', default='en-fr',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--gpu_map_file', action='store', metavar='filename', dest='gpu_map_file',
                        type=str, default=None,
                        help='The file containing gpu id mapping information, '
                             'each line is in the form physical_gpu_id\\theano_id')
    parser.add_argument('--ft_patience', action='store', metavar='N', dest='fine_tune_patience',
                        type=int, default=-1,
                        help='Fine tune patience, default is %(default)s, set 8 to enable it')
    parser.add_argument('--valid_freq', action='store', metavar='N', dest='valid_freq',
                        type=int, default=5000,
                        help='Validation frequency, default is 5000')
    parser.add_argument('--trg_att', action='store', metavar='N', dest='trg_attention_layer_id',
                        type=int, default=None,
                        help='Target attention layer id, default is None (do not use target attention)')
    parser.add_argument('--fix_dp_bug', action="store_true", default=False, dest='fix_dp_bug',
                        help='Fix previous dropout bug, defaults to False, set to True')
    parser.add_argument('--abandon_imm', action="store_true", default=False, dest='abandon_imm',
                        help='Load previous immediate params, defaults to True; pass this flag to disable it')
    parser.add_argument('--tp', action="store", metavar="temperature", dest="temperature",
                        type=float, default=1.0,
                        help='Temperature, default is %(default)s')
    parser.add_argument('--scale', action="store", metavar="scale", dest="scale",
                        type=float, default=1.0,
                        help='Scale, default is %(default)s')
    parser.add_argument('--gate_dp', action="store", metavar="gate_dropout", dest="gate_dropout",
                        type=float, default=1.0,
                        help='Gate dropout, default is %(default)s')

    args = parser.parse_args()
    print args

    if args.residual_enc == 'None':
        args.residual_enc = None
    if args.residual_dec == 'None':
        args.residual_dec = None
    if args.dist_type != 'mv' and args.dist_type != 'mpi_reduce':
        args.dist_type = None

    # FIXME: Auto mode
    if args.auto:
        if args.n_encoder_layers <= 2:
            args.dropout = False
            args.clip = 1.0
        else:
            args.dropout = 0.1
            args.clip = 5.0
        if args.n_encoder_layers <= 1:
            args.residual_enc = None
        if args.n_decoder_layers <= 1:
            args.residual_dec = None
            args.attention_layer_id = 0
        args.cond_unit_size = args.unit_size

    # If the dataset is not 'en-fr', old values of dataset options like 'args.train1' will be overridden
    if args.dataset != 'en-fr':
        args.train1, args.train2, args.small1, args.small2, args.valid1, args.valid2, args.valid3, \
            args.test1, args.test2, args.dic1, args.dic2 = Datasets[args.dataset]

    print 'Command line arguments:'
    print args
    sys.stdout.flush()

    # Init multiverso or MPI and set theano flags.
    if args.dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            import libs.multiverso_ as mv
        # FIXME: This must happen before the import of theano!
        mv.init(sync=True)
        worker_id = mv.worker_id()
        workers_cnt = mv.workers_num()
    elif args.dist_type == 'mpi_reduce':
        from mpi4py import MPI
        communicator = MPI.COMM_WORLD
        worker_id = communicator.Get_rank()
        workers_cnt = communicator.Get_size()

    if args.dist_type:
        available_gpus = get_gpu_usage(workers_cnt)
        gpu_maps_info = {idx: idx for idx in available_gpus}
        if args.gpu_map_file:
            for line in open(os.path.join('resources', args.gpu_map_file), 'r'):
                phy_id, theano_id = line.split()
                gpu_maps_info[int(phy_id)] = int(theano_id)
        theano_id = gpu_maps_info[available_gpus[worker_id]]
        print 'worker id:%d, using theano id:%d, physical id %d' % (worker_id, theano_id, available_gpus[worker_id])
        os.environ['THEANO_FLAGS'] = 'device=cuda{},floatX=float32'.format(theano_id)
        sys.stdout.flush()

    from libs.nmt import train

    train(
        max_epochs=args.max_epochs,
        saveto=args.model_file,
        preload=args.pre_load_file,
        reload_=args.reload,
        dim_word=args.dim_word,
        dim=args.dim,
        decay_c=0.,
        clip_c=args.clip,
        lrate=args.learning_rate,
        optimizer=args.optimizer,
        maxlen=args.maxlen,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        dispFreq=1,
        saveFreq=args.save_freq,
        validFreq=args.valid_freq,
        datasets=(r'data/train/{}'.format(args.train1),
                  r'data/train/{}'.format(args.train2)),
        valid_datasets=(r'data/dev/{}'.format(args.valid1),
                        r'data/dev/{}'.format(args.valid2)),
        small_train_datasets=(r'data/test/{}'.format(args.small1),
                              r'data/test/{}'.format(args.small2),
                              r'data/test/{}'.format(args.test2)),
        vocab_filenames=(r'data/dic/{}'.format(args.dic1),
                         r'data/dic/{}'.format(args.dic2)),
        task=args.dataset,
        use_dropout=args.dropout,
        overwrite=False,
        n_words=args.n_words_tgt,
        n_words_src=args.n_words_src,

        # Options from v-yanfa
        dump_before_train=args.dump_before_train,
        plot_graph=args.plot,
        lr_discount_freq=args.lr_discount_freq,

        n_encoder_layers=args.n_encoder_layers,
        n_decoder_layers=args.n_decoder_layers,
        encoder_many_bidirectional=args.connection_type == 1,
        attention_layer_id=args.attention_layer_id,
        unit=args.unit,
        residual_enc=args.residual_enc,
        residual_dec=args.residual_dec,
        use_zigzag=args.use_zigzag,
        given_embedding=args.given_embedding,

        unit_size=args.unit_size,
        cond_unit_size=args.cond_unit_size,

        given_imm=not args.abandon_imm,
        dump_imm=True,
        shuffle_data=args.shuffle,

        decoder_all_attention=args.all_att,
        average_context=args.avg_ctx,

        dist_type=args.dist_type,
        dist_recover_lr_iter=args.dist_recover_lr,

        fine_tune_patience=args.fine_tune_patience,
        nccl=args.nccl,
        src_vocab_map_file=args.src_vocab_map_file,
        tgt_vocab_map_file=args.tgt_vocab_map_file,

        trg_attention_layer_id=args.trg_attention_layer_id,
        dev_bleu_freq=args.dev_bleu_freq,
        fix_dp_bug=args.fix_dp_bug,
        temperature=args.temperature,
        scale=args.scale,
        gate_dropout=args.gate_dropout,
    )
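# Illustrative notes, not part of the original script:
# - The --gpu_map_file format follows from the parsing loop above: one
#   "physical_gpu_id theano_id" pair per line (whitespace separated), read from
#   the resources/ directory. For example, a file resources/gpu_map.txt containing
#       0 0
#       2 1
#   maps physical GPU 2 to theano device cuda1.
# - In "mpi_reduce" mode the worker count comes from the MPI launcher, so a
#   two-worker run might be started as (launch command and script name are
#   assumptions, adjust to your setup):
#       mpiexec -n 2 python train_nmt.py --distribute mpi_reduce --gpu_map_file gpu_map.txt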
args = parser.parse_args()

if args.gpus is None or args.gpus == '':
    args.gpus = '0'
args.cuda = args.gpus != '-1' and torch.cuda.is_available()
if args.cuda:
    devs = [int(i) for i in args.gpus.split(',')]

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Data-parallel training with Multiverso only when more than one GPU is requested.
args.parallel = len(devs) > 1 if args.cuda else False
if args.parallel:
    import multiverso as mv
    from multiverso.torch_ext import torchmodel
    mv.init(sync=True, updater=b"sgd")

kwargs = {'num_workers': 1, 'pin_memory': False} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10(
        '../data', train=True, download=True,
        transform=transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4),
            transforms.ToTensor(),
            transforms.Normalize(mean=[x / 255 for x in [125.3, 123.0, 113.9]],
                                 std=[x / 255 for x in [63.0, 62.1, 66.7]]),
        ])),
    batch_size=args.batch_size,
False)", default=128) parser.add_argument('-e', '--epoches', type=int, help="Number of epoches(default:\ 82)", default=82) args = parser.parse_args() print(args) # MULTIVERSO: import multiverso import multiverso as mv # MULTIVERSO: you should call mv.init before call multiverso apis mv.init(sync=args.sync) # MULTIVERSO: every process has distinct worker id worker_id = mv.worker_id() # MULTIVERSO: mv.workers_num will return the number of workers workers_num = mv.workers_num() # NOTICE: To use multiple gpus, we must set the environment before import theano. if "THEANO_FLAGS" not in os.environ: os.environ[ "THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id import numpy as np import theano import theano.tensor as T import lasagne from multiverso.theano_ext.lasagne_ext import param_manager
import torchvision
import torchvision.transforms as transforms
import os
import argparse

from models import *
from torch.autograd import Variable
import numpy as np

import multiverso as mv
from multiverso.torch_ext import torchmodel

mv.init(sync=False, updater=b"sgd")

parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18', type=str,
                    help='model architecture (default: resnet18)')
parser.add_argument('--lr', type=float, default=0.1, metavar='LR',
                    help='learning rate (default: 0.1)')
parser.add_argument('--resume', '-r',
0.1)", default=0.1) parser.add_argument('-s', '--sync', type=bool, help="run multiverso in sync \ mode (default: False)", default=False) parser.add_argument('-b', '--batch-size', type=int, help="batch size (default:\ False)", default=128) parser.add_argument('-e', '--epoches', type=int, help="Number of epoches(default:\ 82)", default=82) args = parser.parse_args() print(args) # MULTIVERSO: import multiverso import multiverso as mv # MULTIVERSO: you should call mv.init before call multiverso apis mv.init(sync=args.sync) # MULTIVERSO: every process has distinct worker id worker_id = mv.worker_id() # MULTIVERSO: mv.workers_num will return the number of workers workers_num = mv.workers_num() # NOTICE: To use multiple gpus, we must set the environment before import theano. if "THEANO_FLAGS" not in os.environ: os.environ["THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id import numpy as np import theano import theano.tensor as T import lasagne from multiverso.theano_ext.lasagne_ext import param_manager # for the larger networks (n>=9), we need to adjust pythons recursion limit
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz', batch_size=600):
    """Demonstrate stochastic gradient descent optimization of a log-linear model.

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling other multiverso APIs
    mv.init()
    # MULTIVERSO: every process has a distinct worker id
    worker_id = mv.worker_id()
    # MULTIVERSO: mv.workers_num returns the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = T.matrix('x')   # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as a 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compile a Theano function that computes the mistakes made by the model
    # on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compile a Theano function `train_model` that returns the cost and, at the
    # same time, updates the parameters of the model based on the rules defined
    # in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')

    validation_frequency = n_train_batches
    start_time = timeit.default_timer()
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train the batches that belong to it.
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the deltas of
                # parameters produced by mv_shared and fetch the latest
                # parameters from the parameter server, call this function to
                # synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index
            # MULTIVERSO: only the master worker will output the model
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    mv.barrier()

    # MULTIVERSO: make sure only one process outputs the result;
    # otherwise the results will be printed repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()
        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)
        print(('Optimization complete with validation score of %f %%, '
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code ran for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)
        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)

    # MULTIVERSO: you must call shutdown at the end of the file
    mv.shutdown()
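# How these Multiverso examples are launched is an assumption, not shown in the
# snippet above: each worker is a separate process, e.g. started through an MPI
# launcher such as
#     mpiexec -n 4 python mnist_sgd_mv.py
# (script name hypothetical). Every process then sees its own mv.worker_id(),
# trains only the minibatches assigned to it, and only the master worker writes
# model.pkl and prints the final scores.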