Example #1
0
def train(epoch):
    """Run one training epoch, processing only the batches assigned to this worker."""
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, labels) in enumerate(trainloader):
        # Round-robin data partitioning: skip batches owned by other workers.
        if batch_idx % mv.workers_num() != mv.worker_id():
            continue
        if use_cuda:
            data, labels = data.cuda(), labels.cuda()
        optimizer.zero_grad()
        data, labels = Variable(data), Variable(labels)
        outputs = net(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Multiverso sync operates on CPU tensors; move back to GPU afterwards.
        net.cpu()
        net.mv_sync()
        net.cuda()

        train_loss += loss.data[0]
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += predicted.eq(labels.data).cpu().sum()

        if (batch_idx / mv.workers_num()) % args.log_interval == 0:
            print(
                'Worker: {}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                .format(mv.worker_id(), epoch, batch_idx * len(data),
                        len(trainloader.dataset),
                        100. * batch_idx / len(trainloader), loss.data[0]))
Example #2
0
def paralleltrain(epoch):
    """Train one epoch with batches partitioned round-robin across multiverso workers."""
    model.train()
    scheduler.step()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Skip batches that belong to other workers.
        if batch_idx % mv.workers_num() != mv.worker_id():
            continue
        if args.cuda:
            data, target = data.cuda(device), target.cuda(device)
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Multiverso sync requires CPU tensors; move back to GPU afterwards.
        model.cpu()
        model.mv_sync()
        model.cuda(device)

        if (batch_idx / mv.workers_num()) % args.log_interval == 0:
            print(
                'Worker: {}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
                format(mv.worker_id(), epoch, batch_idx * len(data),
                       len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.data[0]))

    # If this worker handled fewer batches than some peers in the final
    # round-robin cycle, do one extra sync so every worker performs the same
    # number of mv_sync calls per epoch.
    # NOTE(review): relies on batch_idx surviving the loop above; raises
    # NameError if train_loader is empty — confirm loaders are never empty.
    if batch_idx % mv.workers_num() < mv.worker_id():
        optimizer.zero_grad()
        model.cpu()
        model.mv_sync()
        model.cuda(device)
Example #3
0
 def test_matrix(self):
     """Exercise MatrixTableHandler: whole-matrix and row-sliced add/get paths."""
     num_row = 11
     num_col = 10
     size = num_col * num_row
     workers_num = mv.workers_num()
     tbh = mv.MatrixTableHandler(num_row, num_col)
     mv.barrier()
     for count in xrange(1, 21):
         row_ids = [0, 1, 5, 10]
         # Add the full matrix once, then add the selected rows a second time.
         tbh.add(range(size))
         tbh.add(
             [range(rid * num_col, (1 + rid) * num_col) for rid in row_ids],
             row_ids)
         mv.barrier()
         data = tbh.get()
         mv.barrier()
         for i, row in enumerate(data):
             for j, actual in enumerate(row):
                 # Each worker contributed `count` whole-matrix adds of (i*num_col+j).
                 expected = (i * num_col + j) * count * workers_num
                 if i in row_ids:
                     # Rows in row_ids also received the row-sliced adds.
                     expected += (i * num_col + j) * count * workers_num
                 self.assertEqual(expected, actual)
         data = tbh.get(row_ids)
         mv.barrier()
         for i, row in enumerate(data):
             for j, actual in enumerate(row):
                 expected = (row_ids[i] * num_col +
                             j) * count * workers_num * 2
                 self.assertEqual(expected, actual)
 def test_matrix(self):
     """Exercise MatrixTableHandler: whole-matrix and row-sliced add/get paths."""
     num_row = 11
     num_col = 10
     size = num_col * num_row
     workers_num = mv.workers_num()
     handler = mv.MatrixTableHandler(num_row, num_col)
     mv.barrier()
     for count in xrange(1, 21):
         row_ids = [0, 1, 5, 10]
         # One whole-matrix add, then a second add restricted to row_ids.
         handler.add(range(size))
         sliced_rows = [range(num_col * rid, num_col * (rid + 1))
                        for rid in row_ids]
         handler.add(sliced_rows, row_ids)
         mv.barrier()
         whole = handler.get()
         mv.barrier()
         for i, row in enumerate(whole):
             for j, actual in enumerate(row):
                 base = (i * num_col + j) * count * workers_num
                 # Rows in row_ids received the extra sliced add as well.
                 expected = base * 2 if i in row_ids else base
                 self.assertEqual(expected, actual)
         sliced_values = handler.get(row_ids)
         mv.barrier()
         for i, row in enumerate(sliced_values):
             for j, actual in enumerate(row):
                 expected = (row_ids[i] * num_col + j) * count * workers_num * 2
                 self.assertEqual(expected, actual)
    def _test_array(self, size):
        """Add the same delta twice per round and verify the aggregated array."""
        handler = mv.ArrayTableHandler(size)
        mv.barrier()

        for round_idx in xrange(100):
            handler.add(range(1, size + 1))
            handler.add(range(1, size + 1))
            mv.barrier()
            for pos, value in enumerate(handler.get()):
                # Every worker added (pos + 1) twice in each completed round.
                expected = 2 * (pos + 1) * (round_idx + 1) * mv.workers_num()
                self.assertEqual(expected, value)
            mv.barrier()
Example #6
0
    def _test_array(self, size):
        """Add an identical delta twice per round and check the aggregated array."""
        tbh = mv.ArrayTableHandler(size)
        mv.barrier()

        for i in xrange(100):
            tbh.add(range(1, size + 1))
            tbh.add(range(1, size + 1))
            mv.barrier()
            for j, actual in enumerate(tbh.get()):
                # Every worker added (j + 1) twice per round, over (i + 1) rounds.
                self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(),
                                 actual)
            mv.barrier()
Example #7
0
    def _test_sharedvar(self, row, col):
        """Check that a multiverso shared variable accumulates updates from all workers."""
        W = sharedvar.mv_shared(value=np.zeros((row, col),
                                               dtype=theano.config.floatX),
                                name='W',
                                borrow=True)
        delta = np.array(range(1, row * col + 1),
                         dtype=theano.config.floatX).reshape((row, col))
        train_model = theano.function([], updates=[(W, W + delta)])
        mv.barrier()

        for i in xrange(100):
            train_model()
            train_model()
            sharedvar.sync_all_mv_shared_vars()
            mv.barrier()
            # to get the newest value, we must sync again
            sharedvar.sync_all_mv_shared_vars()
            for j, actual in enumerate(W.get_value().reshape(-1)):
                # Each worker applied delta twice per round, over (i + 1) rounds.
                self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(),
                                 actual)
            mv.barrier()
    def _test_sharedvar(self, row, col):
        """Verify a multiverso-backed shared variable aggregates updates across workers."""
        initial = np.zeros((row, col), dtype=theano.config.floatX)
        W = sharedvar.mv_shared(value=initial, name='W', borrow=True)
        step = np.arange(1, row * col + 1,
                         dtype=theano.config.floatX).reshape((row, col))
        train_model = theano.function([], updates=[(W, W + step)])
        mv.barrier()

        for round_idx in xrange(100):
            train_model()
            train_model()
            sharedvar.sync_all_mv_shared_vars()
            mv.barrier()
            # A second sync pulls the freshest aggregated value from the server.
            sharedvar.sync_all_mv_shared_vars()
            flat = W.get_value().reshape(-1)
            for pos, value in enumerate(flat):
                expected = (pos + 1) * (round_idx + 1) * 2 * mv.workers_num()
                self.assertEqual(expected, value)
            mv.barrier()
Example #9
0
def main():
    """Parse command-line options, set up the distributed backend, and launch NMT training."""
    parser = argparse.ArgumentParser(
        description='Train the deep NMT model.',
        fromfile_prefix_chars='@',
    )

    # --- General training options -------------------------------------------
    parser.add_argument('-R', action="store_false", default=True, dest='reload',
                        help='Reload old model, default to True, set to False')
    parser.add_argument('-d', action='store_true', default=False, dest='dump_before_train',
                        help='Dump before train default to False, set to True')
    parser.add_argument('--lr', action="store", metavar="learning_rate", dest="learning_rate", type=float, default=1.0,
                        help='Start learning rate, default is %(default)s')
    parser.add_argument('--optimizer', action='store', default='adadelta')
    parser.add_argument('--plot', action='store', default=None,
                        help='Plot filename, default is None (not plot) (deprecated).')
    parser.add_argument('--save_freq', action='store', default=10000, type=int, dest='save_freq',
                        help='Model save frequency, default is %(default)s')
    parser.add_argument('--dev_bleu_freq', action='store', default=20000, type=int, dest='dev_bleu_freq',
                        help='Get dev set BLEU frequency, default is %(default)s')
    parser.add_argument('--dim', action='store', default=512, type=int, dest='dim',
                        help='Dim of hidden units, default is %(default)s')
    parser.add_argument('--bs', action='store', default=128, type=int, dest='batch_size',
                        help='Train batch size, default is %(default)s')
    parser.add_argument('--valid_bs', action='store', default=128, type=int, dest='valid_batch_size',
                        help='Valid batch size, default is %(default)s')
    parser.add_argument('--dim_word', action='store', default=512, type=int, dest='dim_word',
                        help='Dim of word embedding, default is %(default)s')
    parser.add_argument('--maxlen', action='store', default=80, type=int, dest='maxlen',
                        help='Max sentence length, default is %(default)s')
    parser.add_argument('-S', action='store_false', default=True, dest='shuffle',
                        help='Shuffle data per epoch, default is True, set to False')
    # --- Dataset / vocabulary files -----------------------------------------
    parser.add_argument('--train1', action='store', metavar='filename', dest='train1', type=str,
                        default='filtered_en-fr.en',
                        help='Source train file, default is %(default)s')
    parser.add_argument('--train2', action='store', metavar='filename', dest='train2', type=str,
                        default='filtered_en-fr.fr',
                        help='Target train file, default is %(default)s')
    parser.add_argument('--small1', action='store', metavar='filename', dest='small1', type=str,
                        default='small_en-fr.en',
                        help='Source small train file, default is %(default)s')
    parser.add_argument('--small2', action='store', metavar='filename', dest='small2', type=str,
                        default='small_en-fr.fr',
                        help='Target small train file, default is %(default)s')
    parser.add_argument('--valid1', action='store', metavar='filename', dest='valid1', type=str,
                        default='dev_en.tok',
                        help='Source valid file, default is %(default)s')
    parser.add_argument('--valid2', action='store', metavar='filename', dest='valid2', type=str,
                        default='dev_fr.tok',
                        help='Target valid file, default is %(default)s')
    parser.add_argument('--dic1', action='store', metavar='filename', dest='dic1', type=str,
                        default='filtered_dic_en-fr.en.pkl',
                        help='Source dict file, default is %(default)s')
    parser.add_argument('--dic2', action='store', metavar='filename', dest='dic2', type=str,
                        default='filtered_dic_en-fr.fr.pkl',
                        help='Target dict file, default is %(default)s')
    parser.add_argument('--n_words_src', action='store', default=30000, type=int, dest='n_words_src',
                        help='Vocabularies in source side, default is %(default)s')
    parser.add_argument('--n_words_tgt', action='store', default=30000, type=int, dest='n_words_tgt',
                        help='Vocabularies in target side, default is %(default)s')

    parser.add_argument('model_file', nargs='?', default='model/baseline/baseline.npz',
                        help='Generated model file, default is "%(default)s"')
    parser.add_argument('pre_load_file', nargs='?', default='model/en2fr.iter160000.npz',
                        help='Pre-load model file, default is "%(default)s"')
    parser.add_argument('--src_vocab_map', action='store', metavar='filename', dest='src_vocab_map_file', type=str,
                        default=None, help='The file containing source vocab mapping information'
                                           'used to initialize a model on large dataset from small one')
    parser.add_argument('--tgt_vocab_map', action='store', metavar='filename', dest='tgt_vocab_map_file', type=str,
                        default=None, help='The file containing target vocab mapping information'
                                           'used to initialize a model on large dataset from small one')

    # --- Model architecture options -----------------------------------------
    parser.add_argument('--enc', action='store', default=1, type=int, dest='n_encoder_layers',
                        help='Number of encoder layers, default is 1')
    parser.add_argument('--dec', action='store', default=1, type=int, dest='n_decoder_layers',
                        help='Number of decoder layers, default is 1')
    parser.add_argument('--conn', action='store', default=2, type=int, dest='connection_type',
                        help='Connection type, '
                             'default is 2 (bidirectional only in first layer, other layers are forward);'
                             '1 is divided bidirectional GRU')
    parser.add_argument('--max_epochs', action='store', default=100, type=int, dest='max_epochs',
                        help='Maximum epoches, default is 100')
    parser.add_argument('--unit', action='store', metavar='unit', dest='unit', type=str, default='lstm',
                        help='The unit type, default is "lstm", can be set to "gru".')
    parser.add_argument('--attention', action='store', metavar='index', dest='attention_layer_id', type=int, default=0,
                        help='Attention layer index, default is 0')
    parser.add_argument('--residual_enc', action='store', metavar='type', dest='residual_enc', type=str, default=None,
                        help='Residual connection of encoder, default is None, candidates are "layer_wise", "last"')
    parser.add_argument('--residual_dec', action='store', metavar='type', dest='residual_dec', type=str,
                        default='layer_wise',
                        help='Residual connection of decoder, default is "layer_wise", candidates are None, "last"')
    parser.add_argument('-z', '--zigzag', action='store_false', default=True, dest='use_zigzag',
                        help='Use zigzag in encoder, default is True, set to False')
    parser.add_argument('--dropout', action="store", metavar="dropout", dest="dropout", type=float, default=False,
                        help='Dropout rate, default is False (not use dropout)')
    parser.add_argument('--unit_size', action='store', default=2, type=int, dest='unit_size',
                        help='Number of unit size, default is %(default)s')
    # TODO: rename this option to decoder_unit_size in future
    parser.add_argument('--cond_unit_size', action='store', default=2, type=int, dest='cond_unit_size',
                        help='Number of decoder unit size (will rename in future), default is %(default)s')
    parser.add_argument('--clip', action='store', metavar='clip', dest='clip', type=float, default=1.0,
                        help='Gradient clip rate, default is 1.0.')
    parser.add_argument('--manual', action='store_false', dest='auto', default=True,
                        help='Set dropout rate and grad clip rate manually.')
    parser.add_argument('--emb', action='store', metavar='filename', dest='given_embedding', type=str, default=None,
                        help='Given embedding model file, default is None')
    parser.add_argument('--lr_discount', action='store', metavar='freq', dest='lr_discount_freq', type=int,
                        default=-1, help='The learning rate discount frequency, default is -1')

    # --- Distributed-training options ---------------------------------------
    parser.add_argument('--distribute', action = 'store', metavar ='type', dest = 'dist_type', type = str, default= None,
                        help = 'The distribution version, default is None (singe GPU mode), candiates are "mv", "mpi_reduce"')
    parser.add_argument('--nccl', action="store_true", default=False, dest='nccl',
                        help='Use NCCL in distributed mode, default to False, set to True')
    parser.add_argument('--clip_grads_local', action="store_true", default=False, dest='clip_grads_local',
                        help='Whether to clip grads in distributed mode, default to False, set to True')
    parser.add_argument('--recover_lr_iter', action='store', dest='dist_recover_lr', type = int, default=10000,
                        help='The mini-batch index to recover lrate in distributed mode, default is 10000.')

    parser.add_argument('--all_att', action='store_true', dest='all_att', default=False,
                        help='Generate attention from all decoder layers, default is False, set to True')
    parser.add_argument('--avg_ctx', action='store_true', dest='avg_ctx', default=False,
                        help='Average all context vectors to get softmax, default is False, set to True')
    parser.add_argument('--dataset', action='store', dest='dataset', default='en-fr',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--gpu_map_file', action='store', metavar='filename', dest='gpu_map_file', type=str,
                        default=None, help='The file containing gpu id mapping information, '
                                           'each line is in the form physical_gpu_id\\theano_id')
    parser.add_argument('--ft_patience', action='store', metavar='N', dest='fine_tune_patience', type=int, default=-1,
                        help='Fine tune patience, default is %(default)s, set 8 to enable it')
    parser.add_argument('--valid_freq', action='store', metavar='N', dest='valid_freq', type=int, default=5000,
                        help='Validation frequency, default is 5000')
    parser.add_argument('--trg_att', action='store', metavar='N', dest='trg_attention_layer_id', type=int, default=None,
                        help='Target attention layer id, default is None (not use target attention)')
    parser.add_argument('--fix_dp_bug', action="store_true", default=False, dest='fix_dp_bug',
                        help='Fix previous dropout bug, default to False, set to True')
    parser.add_argument('--abandon_imm', action="store_true", default=False, dest='abandon_imm',
                        help='Whether to load previous immediate params, default to True, set to False')
    parser.add_argument('--tp', action="store", metavar="temperature", dest="temperature", type=float, default=1.0,
                        help='temperature, default is %(default)s')
    parser.add_argument('--scale', action="store", metavar="scale", dest="scale", type=float, default=1.0,
                        help='scale, default is %(default)s')
    parser.add_argument('--gate_dp', action="store", metavar="gate_dropout", dest="gate_dropout", type=float, default=1.0,
                        help='gate_dropout, default is %(default)s')

    args = parser.parse_args()
    print args

    # Normalize string "None" sentinels and unknown distribution types.
    if args.residual_enc == 'None':
        args.residual_enc = None
    if args.residual_dec == 'None':
        args.residual_dec = None
    if args.dist_type != 'mv' and args.dist_type != 'mpi_reduce':
        args.dist_type = None

    # FIXME: Auto mode
    # Derive dropout/clip/residual settings from the network depth.
    if args.auto:
        if args.n_encoder_layers <= 2:
            args.dropout = False
            args.clip = 1.0
        else:
            args.dropout = 0.1
            args.clip = 5.0

        if args.n_encoder_layers <= 1:
            args.residual_enc = None
        if args.n_decoder_layers <= 1:
            args.residual_dec = None
            args.attention_layer_id = 0

        args.cond_unit_size = args.unit_size

    # If dataset is not 'en-fr', old value of dataset options like 'args.train1' will be omitted
    if args.dataset != 'en-fr':
        args.train1, args.train2, args.small1, args.small2, args.valid1, args.valid2, args.valid3, args.test1, args.test2, args.dic1, args.dic2 = \
            Datasets[args.dataset]

    print 'Command line arguments:'
    print args
    sys.stdout.flush()

    # Init multiverso or mpi and set theano flags.
    if args.dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            import libs.multiverso_ as mv

        # FIXME: This must before the import of theano!
        mv.init(sync=True)
        worker_id = mv.worker_id()
        workers_cnt = mv.workers_num()
    elif args.dist_type == 'mpi_reduce':
        from mpi4py import MPI

        communicator = MPI.COMM_WORLD
        worker_id = communicator.Get_rank()
        workers_cnt = communicator.Get_size()

    # Map this worker to a GPU before theano is imported.
    # NOTE(review): worker_id/workers_cnt are only bound inside the two
    # branches above; this block is guarded by args.dist_type, so the
    # single-GPU path never reaches them.
    if args.dist_type:
        available_gpus = get_gpu_usage(workers_cnt)
        gpu_maps_info = {idx: idx for idx in available_gpus}
        if args.gpu_map_file:
            for line in open(os.path.join('resources', args.gpu_map_file), 'r'):
                phy_id, theano_id = line.split()
                gpu_maps_info[int(phy_id)] = int(theano_id)
        theano_id = gpu_maps_info[available_gpus[worker_id]]
        print 'worker id:%d, using theano id:%d, physical id %d' % (worker_id, theano_id, available_gpus[worker_id])
        os.environ['THEANO_FLAGS'] = 'device=cuda{},floatX=float32'.format(theano_id)
        sys.stdout.flush()

    # Deferred import: THEANO_FLAGS must be set before theano loads.
    from libs.nmt import train

    train(
        max_epochs= args.max_epochs,
        saveto=args.model_file,
        preload=args.pre_load_file,
        reload_=args.reload,
        dim_word=args.dim_word,
        dim=args.dim,
        decay_c=0.,
        clip_c=args.clip,
        lrate=args.learning_rate,
        optimizer=args.optimizer,
        maxlen=args.maxlen,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        dispFreq=1,
        saveFreq=args.save_freq,
        validFreq=args.valid_freq,
        datasets=(r'data/train/{}'.format(args.train1),
                  r'data/train/{}'.format(args.train2)),
        valid_datasets=(r'data/dev/{}'.format(args.valid1),
                        r'data/dev/{}'.format(args.valid2)),
        small_train_datasets=(r'data/test/{}'.format(args.small1),r'data/test/{}'.format(args.small2),
                              r'data/test/{}'.format(args.test2)),
        vocab_filenames=(r'data/dic/{}'.format(args.dic1),
                         r'data/dic/{}'.format(args.dic2)),
        task=args.dataset,
        use_dropout=args.dropout,
        overwrite=False,
        n_words=args.n_words_tgt,
        n_words_src=args.n_words_src,

        # Options from v-yanfa
        dump_before_train=args.dump_before_train,
        plot_graph=args.plot,
        lr_discount_freq=args.lr_discount_freq,

        n_encoder_layers=args.n_encoder_layers,
        n_decoder_layers=args.n_decoder_layers,
        encoder_many_bidirectional=args.connection_type == 1,

        attention_layer_id=args.attention_layer_id,
        unit=args.unit,
        residual_enc=args.residual_enc,
        residual_dec=args.residual_dec,
        use_zigzag=args.use_zigzag,
        given_embedding=args.given_embedding,

        unit_size=args.unit_size,
        cond_unit_size=args.cond_unit_size,

        given_imm = not args.abandon_imm,
        dump_imm=True,
        shuffle_data=args.shuffle,

        decoder_all_attention=args.all_att,
        average_context=args.avg_ctx,

        dist_type=args.dist_type,
        dist_recover_lr_iter = args.dist_recover_lr,

        fine_tune_patience=args.fine_tune_patience,
        nccl= args.nccl,
        src_vocab_map_file= args.src_vocab_map_file,
        tgt_vocab_map_file= args.tgt_vocab_map_file,

        trg_attention_layer_id=args.trg_attention_layer_id,
        dev_bleu_freq = args.dev_bleu_freq,
        fix_dp_bug= args.fix_dp_bug,
        temperature=args.temperature,
        scale=args.scale,
        gate_dropout=args.gate_dropout,
    )
Example #10
0
    # NOTE(review): fragment — the enclosing function's `def` header (and the
    # definitions of p1, w_c2, b_c2, ...) are outside this chunk.
    c2 = T.maximum(0,
                   conv.conv2d(p1, w_c2) + b_c2.dimshuffle('x', 0, 'x', 'x'))
    p2 = downsample.max_pool_2d(c2, (2, 2))

    p2_flat = p2.flatten(2)
    h3 = T.maximum(0, T.dot(p2_flat, w_h3) + b_h3)
    p_y_given_x = T.nnet.softmax(T.dot(h3, w_o) + b_o)
    return p_y_given_x


# MULTIVERSO: you should call mv.init before call multiverso apis
mv.init()
worker_id = mv.worker_id()
# MULTIVERSO: every process has distinct worker id
workers_num = mv.workers_num()

# Convolutional and fully-connected parameter tensors for the small CNN above.
w_c1 = init_weights((4, 3, 3, 3), name="w_c1")
b_c1 = init_weights((4, ), name="b_c1")
w_c2 = init_weights((8, 4, 3, 3), name="w_c2")
b_c2 = init_weights((8, ), name="b_c2")
w_h3 = init_weights((8 * 4 * 4, 100), name="w_h3")
b_h3 = init_weights((100, ), name="b_h3")
w_o = init_weights((100, 10), name="w_o")
b_o = init_weights((10, ), name="b_o")

params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o]

p_y_given_x = model(x, *params)
y = T.argmax(p_y_given_x, axis=1)
# NOTE(review): the following two lines are a truncated scrape fragment — the
# opening of this parser.add_argument call is missing from this chunk.
 False)", default=128)
parser.add_argument('-e', '--epoches', type=int, help="Number of epoches(default:\
 82)", default=82)
args = parser.parse_args()
print(args)


# MULTIVERSO: import multiverso
import multiverso as mv

# MULTIVERSO: you should call mv.init before call multiverso apis
mv.init(sync=args.sync)
# MULTIVERSO: every process has distinct worker id
worker_id = mv.worker_id()
# MULTIVERSO: mv.workers_num will return the number of workers
workers_num = mv.workers_num()
# NOTICE: To use multiple gpus, we must set the environment before import theano.
if "THEANO_FLAGS" not in os.environ:
    os.environ["THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id

import numpy as np
import theano
import theano.tensor as T
import lasagne
from multiverso.theano_ext.lasagne_ext import param_manager

# for the larger networks (n>=9), we need to adjust pythons recursion limit
sys.setrecursionlimit(10000)

# ##################### Load data from CIFAR-10 dataset #######################
# this code assumes the cifar dataset from 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
Example #12
0
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before call multiverso apis
    mv.init()
    # MULTIVERSO: every process has distinct worker id
    worker_id = mv.worker_id()

    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    validation_frequency = n_train_batches
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train batches belonged to itself
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the delta of
                # parameters produced by mv_shared and update the latest
                # parameters from parameter server, you can call this function to
                # synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            # NOTE: this local name shadows the builtin iter().
            iter = (epoch - 1) * n_train_batches + minibatch_index

            # MULTIVERSO: only master worker will output the model
            if mv.is_master_worker() and (iter +
                                          1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))
        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

    # MULTIVERSO: You should make sure only one process will output the result.
    # Otherwise results will be outputted repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        # NOTE(review): validation_loss is only bound inside the validation
        # branch above; if the master never hit a validation step this print
        # raises NameError — confirm n_epochs/validation_frequency guarantee
        # at least one validation.
        print(('Optimization complete with validation score of %f %%,'
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code run for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % ((end_time - start_time))),
              file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)
    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()