Example #1
    def __init__(self, network):
        '''The constructor of MVNetParamManager

        The constructor will associate the parameters with a multiverso array
        table. The initial value of the ArrayTableHandler will be the same as
        the parameters of the network. If different parameters are used in
        different processes, their average will be used as the initial value.
        '''
        self.shapes = []
        self.dtypes = []
        self.sizes = []
        self.all_param_list = []
        self.network = network
        for arr in lasagne.layers.get_all_param_values(self.network):
            self.shapes.append(arr.shape)
            # TODO: Only float32 is supported in multiverso for now, so all
            # parameters are stored in a float32 array. This needs to be
            # revisited once other types are supported.
            assert(np.dtype("float32") == arr.dtype)
            self.dtypes.append(arr.dtype)
            self.sizes.append(arr.size)
            self.all_param_list.extend([i for i in np.nditer(arr)])
        self.all_param_list = np.array(self.all_param_list)

        self.tbh = mv.ArrayTableHandler(len(self.all_param_list), init_value=self.all_param_list)
        mv.barrier()  # add barrier to make sure the initial values have taken effect
        self.all_param_list = self.tbh.get()
        self._set_all_param_to_net()
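The manager above assumes multiverso has already been initialized. A minimal usage sketch, with `build_network` standing in for any function that returns a Lasagne network and the `param_manager` import path assumed from a typical multiverso install:

import multiverso as mv
from multiverso.theano_ext.lasagne_ext import param_manager  # assumed import path

mv.init()                   # must come before any other multiverso call
network = build_network()   # hypothetical: builds and returns a Lasagne network
mvnpm = param_manager.MVNetParamManager(network)

# ... run local training updates on `network` ...
mvnpm.sync_all_param()      # commit local deltas, fetch the averaged parameters

mv.barrier()
mv.shutdown()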
Example #2
    def test_matrix(self):
        num_row = 11
        num_col = 10
        size = num_col * num_row
        workers_num = mv.workers_num()
        tbh = mv.MatrixTableHandler(num_row, num_col)
        mv.barrier()
        for count in xrange(1, 21):
            row_ids = [0, 1, 5, 10]
            tbh.add(range(size))
            tbh.add(
                [range(rid * num_col, (1 + rid) * num_col) for rid in row_ids],
                row_ids)
            mv.barrier()
            data = tbh.get()
            mv.barrier()
            for i, row in enumerate(data):
                for j, actual in enumerate(row):
                    expected = (i * num_col + j) * count * workers_num
                    if i in row_ids:
                        expected += (i * num_col + j) * count * workers_num
                    self.assertEqual(expected, actual)
            data = tbh.get(row_ids)
            mv.barrier()
            for i, row in enumerate(data):
                for j, actual in enumerate(row):
                    expected = (row_ids[i] * num_col +
                                j) * count * workers_num * 2
                    self.assertEqual(expected, actual)
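The test above exercises the additive semantics of MatrixTableHandler: each worker's add() is accumulated on the server, and get() after a barrier returns the aggregated table, optionally restricted to given rows. A minimal sketch of the same calls outside a test class, with an illustrative 4x3 table:

import multiverso as mv

mv.init()
num_row, num_col = 4, 3
tbh = mv.MatrixTableHandler(num_row, num_col)
mv.barrier()

# every worker adds the same delta; the server sums the contributions,
# so after the barrier each cell holds its delta times mv.workers_num()
tbh.add(range(num_row * num_col))
mv.barrier()

all_rows = tbh.get()         # the whole table, row by row
some_rows = tbh.get([0, 2])  # only rows 0 and 2

mv.barrier()
mv.shutdown()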
Example #4
    def __init__(self, svobj):
        '''Constructor of the MVSharedVariable

        The constructor will create an ArrayTableHandler and associate the
        shared variable with it. The initial value of the ArrayTableHandler
        will be the same as the value of the SharedVariable.
        *Notice*: Only the `init_value` from the master will be used!
        '''
        assert(isinstance(svobj, SharedVariable))
        self._svobj = svobj
        self._mv_array = mv.ArrayTableHandler(self._svobj.get_value().size,
                                              init_value=self._svobj.get_value().reshape((-1,)))

        mv.barrier()  # add barrier to make sure the initial values have taken effect
        # _last_mv_data stores a copy of the value. It will be used to calculate
        # the update for multiverso when calling mv_sync
        self._last_mv_data = self._mv_array.get().reshape(self._svobj.get_value().shape)
        self._svobj.set_value(self._last_mv_data, borrow=False)
Example #5
    def __init__(self, svobj):
        '''Constructor of the MVSharedVariable

        The constructor will create an ArrayTableHandler and associate the
        shared variable with it. The initial value of the ArrayTableHandler
        will be the same as the value of the SharedVariable. If different
        initial values are used in different processes, their average will be
        used as the initial value.
        '''
        assert(isinstance(svobj, SharedVariable))
        self._svobj = svobj
        self._mv_array = mv.ArrayTableHandler(self._svobj.get_value().size,
                                              init_value=self._svobj.get_value().reshape((-1,)))

        mv.barrier()  # add barrier to make sure the initial values have taken effect
        # _last_mv_data stores a copy of the value. It will be used to calculate
        # the update for multiverso when calling mv_sync
        self._last_mv_data = self._mv_array.get().reshape(self._svobj.get_value().shape)
        self._svobj.set_value(self._last_mv_data, borrow=False)
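Given the constructor above, an MVSharedVariable wraps an existing Theano shared variable, and its comments refer to an mv_sync method that commits the delta. A hedged sketch of direct use (where exactly MVSharedVariable is imported from is an assumption; the sharedvar.mv_shared helper shown in the later examples is the more common entry point):

import numpy as np
import theano
import multiverso as mv

mv.init()
W = theano.shared(np.zeros((3, 3), dtype=theano.config.floatX), name='W')
mv_W = MVSharedVariable(W)   # the class defined above; import path assumed
# no extra barrier needed here: the constructor already calls mv.barrier()

# ... run Theano updates that modify W ...
mv_W.mv_sync()               # commit the local delta and pull the merged value

mv.shutdown()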
Example #7
    def _test_array(self, size):
        tbh = mv.ArrayTableHandler(size)
        mv.barrier()

        for i in xrange(100):
            tbh.add(range(1, size + 1))
            tbh.add(range(1, size + 1))
            mv.barrier()
            for j, actual in enumerate(tbh.get()):
                self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(),
                                 actual)
            mv.barrier()
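The array test relies on the same additive behaviour: every worker's add() is summed on the server, and get() after a barrier returns the total. A minimal sketch with an illustrative size of 5:

import multiverso as mv

mv.init()
size = 5
tbh = mv.ArrayTableHandler(size)
mv.barrier()

tbh.add(range(1, size + 1))   # each worker contributes [1, 2, 3, 4, 5]
mv.barrier()                  # wait until every worker has added its delta

values = tbh.get()            # element j is now (j + 1) * mv.workers_num()

mv.barrier()
mv.shutdown()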
Example #8
    def _test_sharedvar(self, row, col):
        W = sharedvar.mv_shared(value=np.zeros((row, col),
                                               dtype=theano.config.floatX),
                                name='W',
                                borrow=True)
        delta = np.array(range(1, row * col + 1),
                         dtype=theano.config.floatX).reshape((row, col))
        train_model = theano.function([], updates=[(W, W + delta)])
        mv.barrier()

        for i in xrange(100):
            train_model()
            train_model()
            sharedvar.sync_all_mv_shared_vars()
            mv.barrier()
            # to get the newest value, we must sync again
            sharedvar.sync_all_mv_shared_vars()
            for j, actual in enumerate(W.get_value().reshape(-1)):
                self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(),
                                 actual)
            mv.barrier()
Example #10
p_y_given_x = model(x, *params)
y = T.argmax(p_y_given_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(p_y_given_x, t))

updates = momentum(cost, params, learning_rate=0.01, momentum=0.9)

# compile theano functions
train = theano.function([x, t],
                        cost,
                        updates=updates,
                        allow_input_downcast=True)
predict = theano.function([x], y, allow_input_downcast=True)

# MULTIVERSO: all the workers will synchronize at the place you call barrier
mv.barrier()

# train model
batch_size = 50

for i in range(50):
    for start in range(0, len(x_train), batch_size):
        # every process only trains batches assigned to itself
        if start / batch_size % workers_num != worker_id:
            continue
        x_batch = x_train[start:start + batch_size]
        t_batch = t_train[start:start + batch_size]
        cost = train(x_batch, t_batch)

        # MULTIVERSO: sync value with multiverso after every batch
        sharedvar.sync_all_mv_shared_vars()
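The snippet above uses worker_id and workers_num without defining them and assumes its parameters were created with sharedvar.mv_shared. A sketch of the setup it relies on, following the pattern of the other examples (the sharedvar import path is assumed):

import multiverso as mv
from multiverso.theano_ext import sharedvar  # assumed import path

# MULTIVERSO: initialize before any other multiverso call
mv.init()
worker_id = mv.worker_id()
workers_num = mv.workers_num()

# parameters must be multiverso-backed shared variables so that
# sharedvar.sync_all_mv_shared_vars() can commit their deltas, e.g.
# W = sharedvar.mv_shared(value=initial_W, name='W', borrow=True)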
Example #11
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=80,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          validFreq=2500,
          dev_bleu_freq=20000,
          datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
          valid_datasets=('./data/dev/dev_en.tok',
                          './data/dev/dev_fr.tok'),
          small_train_datasets=('./data/train/small_en-fr.en','./data/train/small_en-fr.fr',
                                './data/train/small_en-fr.fr'),
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',

          # Options below are from v-yanfa
          dump_before_train=True,
          plot_graph=None,
          vocab_filenames=('./data/dic/filtered_dic_en-fr.en.pkl',
                           './data/dic/filtered_dic_en-fr.fr.pkl'),
          map_filename='./data/dic/mapFullVocab2Top1MVocab.pkl',
          lr_discount_freq=80000,

          # Options of deeper encoder and decoder
          n_encoder_layers=1,
          n_decoder_layers=1,
          encoder_many_bidirectional=True,

          attention_layer_id=0,
          unit='gru',
          residual_enc=None,
          residual_dec=None,
          use_zigzag=False,

          initializer='orthogonal',
          given_embedding=None,

          dist_type=None,
          dist_recover_lr_iter=False,

          unit_size=2,
          cond_unit_size=2,

          given_imm=False,
          dump_imm=False,
          shuffle_data=False,

          decoder_all_attention=False,
          average_context=False,
          task='en-fr',

          fine_tune_patience=8,
          nccl=False,
          src_vocab_map_file=None,
          tgt_vocab_map_file=None,

          trg_attention_layer_id=None,
          fix_dp_bug=False,
          temperature=1.0,
          scale=1.0,
          gate_dropout=0.0,
          ):
    model_options = locals().copy()

    # Set distributed computing environment
    worker_id = 0
    if dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            from . import multiverso_ as mv

        worker_id = mv.worker_id()
    elif dist_type == 'mpi_reduce':
        from mpi4py import MPI
        mpi_communicator = MPI.COMM_WORLD
        worker_id = mpi_communicator.Get_rank()
        workers_cnt = mpi_communicator.Get_size()

        if nccl:
            nccl_comm = init_nccl_env(mpi_communicator)

    print 'Use {}, worker id: {}'.format('multiverso' if dist_type == 'mv' else 'mpi' if dist_type == 'mpi_reduce' else 'none', worker_id)
    sys.stdout.flush()

    # Set logging file
    set_logging_file('log/complete/e{}d{}_res{}_att{}_worker{}_task{}_{}.txt'.format(
        n_encoder_layers, n_decoder_layers, residual_enc, attention_layer_id,
        worker_id, task, time.strftime('%m-%d-%H-%M-%S'),
    ))

    log('''\
Start Time = {}
'''.format(
        time.strftime('%c'),
    ))

    # Model options: load and save
    message('Top options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message('Done')
    sys.stdout.flush()

    #load_options(model_options, reload_, preload, src_vocab_map_file and tgt_vocab_map_file)
    check_options(model_options)
    model_options['cost_normalization'] = 1
    ada_alpha = 0.95
    if dist_type == 'mpi_reduce':
        model_options['cost_normalization'] = workers_cnt

    message('Model options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message()

    print 'Loading data'
    log('\n\n\nStart to prepare data\n@Current Time = {}'.format(time.time()))
    sys.stdout.flush()

    dataset_src, dataset_tgt = datasets[0], datasets[1]

    if shuffle_data:
        text_iterator_list = [None for _ in range(10)]
        text_iterator = None
    else:
        text_iterator_list = None

        text_iterator = TextIterator(
            dataset_src, dataset_tgt,
            vocab_filenames[0], vocab_filenames[1],
            batch_size, n_words_src, n_words, maxlen
        )

    valid_iterator = TextIterator(
        valid_datasets[0], valid_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words
    )

    small_train_iterator = TextIterator(
        small_train_datasets[0], small_train_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words
    )

    print 'Building model'
    model = NMTModel(model_options)

    params = model.initializer.init_params()

    # Reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        load_params(preload, params,
                    src_map_file=src_vocab_map_file, tgt_map_file=tgt_vocab_map_file)
    sys.stdout.flush()

    # Given embedding
    if given_embedding is not None:
        print 'Loading given embedding...',
        load_embedding(params, given_embedding)
        print 'Done'

    print_params(params)

    model.init_tparams(params)

    # Build model, stochastic_mode = 0(soft), 1(stochastic), 2(hard)
    trng, use_noise, stochastic_mode, hyper_param,\
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, test_cost, x_emb, stochastic_updates,_ = model.build_model()
    inps = [x, x_mask, y, y_mask]

    all_stochastic_updates = OrderedDictUpdates()
    for item1 in stochastic_updates:
        for item2 in item1:
            all_stochastic_updates.update(item2)

    print 'Building sampler'
    f_init, f_next = model.build_sampler(trng=trng, use_noise=use_noise, batch_mode=True, stochastic_mode=stochastic_mode, hyper_param=hyper_param)
    stochastic_mode.set_value(1)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile, updates=all_stochastic_updates)
    print 'Done'
    sys.stdout.flush()
    test_cost = test_cost.mean()  # FIXME: do not regularize test_cost here

    cost = cost.mean()

    cost = l2_regularization(cost, model.P, decay_c)

    cost = regularize_alpha_weights(cost, alpha_c, model_options, x_mask, y_mask, opt_ret)

    print 'Building f_cost...',
    f_cost = theano.function(inps, test_cost, profile=profile, updates=all_stochastic_updates)
    print 'Done'

    if plot_graph is not None:
        print 'Plotting post-compile graph...',
        theano.printing.pydotprint(
            f_cost,
            outfile='pictures/post_compile_{}'.format(plot_graph),
            var_with_name_simple=True,
        )
        print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(model.P))

    clip_shared = theano.shared(np.array(clip_c, dtype=fX), name='clip_shared')

    if dist_type != 'mpi_reduce':  # build gradient clipping into the computational graph
        grads, g2 = clip_grad_remove_nan(grads, clip_shared, model.P)
    else:  # do the gradient clipping after gradient aggregation
        g2 = None

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',

    given_imm_data = get_adadelta_imm_data(optimizer, given_imm, preload)

    if optimizer == 'adadelta':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2, given_imm_data=given_imm_data,
            alpha=ada_alpha, all_stochastic_updates=all_stochastic_updates)

    if optimizer == 'adam':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2, given_imm_data=given_imm_data,
            all_stochastic_updates=all_stochastic_updates)

    print 'Done'

    if dist_type == 'mpi_reduce':
        f_grads_clip = make_grads_clip_func(grads_shared=grads_shared,
                                            mt_tparams=model.P,
                                            clip_c_shared=clip_shared)

    print 'Optimization'
    log('Preparation Done\n@Current Time = {}'.format(time.time()))

    if dist_type == 'mv':
        mv.barrier()
    elif dist_type == 'mpi_reduce':
        # create receive buffers for MPI allreduce
        rec_grads = [np.zeros_like(p.get_value()) for p in model.P.itervalues()]

    estop = False
    history_errs = []
    best_bleu = -1.0
    best_valid_cost = 1e6
    best_p = None
    bad_counter = 0
    uidx = search_start_uidx(reload_, preload)

    epoch_n_batches = 0
    start_epoch = 0
    pass_batches = 0

    print 'worker', worker_id, 'uidx', uidx, 'l_rate', lrate, 'ada_alpha', ada_alpha, 'n_batches', epoch_n_batches, 'start_epoch', start_epoch, 'pass_batches', pass_batches

    start_uidx = uidx

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(
            os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs,
                 uidx=uidx, **unzip(model.P))
        save_options(model_options, uidx, saveto)
        print 'Done'
        sys.stdout.flush()

    stochastic_mode.set_value(0)
    valid_cost = validation(valid_iterator, f_cost, use_noise)
    small_train_cost = validation(small_train_iterator, f_cost, use_noise)
    message('Soft Valid cost {:.5f} Small train cost {:.5f}'.format(valid_cost, small_train_cost))
    stochastic_mode.set_value(1)
    #new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
    #best_bleu = new_bleu
    #message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
    sys.stdout.flush()
    
    commu_time_sum = 0.0
    cp_time_sum = 0.0
    reduce_time_sum = 0.0

    start_time = time.time()
    finetune_cnt = 0

    for eidx in xrange(start_epoch, max_epochs):
        if shuffle_data:
            text_iterator = load_shuffle_text_iterator(
                eidx, worker_id, text_iterator_list,
                datasets, vocab_filenames, batch_size, maxlen, n_words_src, n_words
            )
        n_samples = 0
        if dist_type == 'mpi_reduce':
            mpi_communicator.Barrier()

        for i, (x, y) in enumerate(text_iterator):
            if eidx == start_epoch and i < pass_batches:  # ignore the first several batches when reloading
                continue
            n_samples += len(x)
            uidx += 1

            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            effective_uidx = uidx - start_uidx
            ud_start = time.time()

            # compute cost, grads
            if dist_type != 'mpi_reduce':
                cost, g2_value = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if dist_type == 'mpi_reduce':
                reduce_start = time.time()
                commu_time = 0
                gpucpu_cp_time = 0
                if not nccl:
                    commu_time, gpucpu_cp_time = all_reduce_params(grads_shared, rec_grads)
                else:
                    commu_time, gpucpu_cp_time = all_reduce_params_nccl(nccl_comm, grads_shared)
                reduce_time = time.time() - reduce_start
                commu_time_sum += commu_time
                reduce_time_sum += reduce_time
                cp_time_sum += gpucpu_cp_time

                g2_value = f_grads_clip()
                print '@Worker = {}, Reduce time = {:.5f}, Commu time = {:.5f}, Copy time = {:.5f}'.format(worker_id, reduce_time, commu_time, gpucpu_cp_time)

            curr_lr = lrate if not dist_type or dist_recover_lr_iter < effective_uidx else lrate * 0.05 + effective_uidx * lrate / dist_recover_lr_iter * 0.95
            if curr_lr < lrate:
                print 'Curr lr {:.3f}'.format(curr_lr)

            # do the update on parameters
            f_update(curr_lr)

            ud = time.time() - ud_start

            if np.isnan(g2_value) or np.isinf(g2_value):
                message('gradient NaN detected')
                sys.stdout.flush()
                
            if np.isnan(cost) or np.isinf(cost):
                message('cost NaN detected')
                model.save_model(saveto, history_errs, uidx)
                save_minibatch(x, y, saveto, uidx, vocab_filenames)
                sys.stdout.flush()

                return 1., 1., 1.

            # discount learning rate
            # FIXME: Do NOT enable this and fine-tune at the same time
            if lr_discount_freq > 0 and np.mod(effective_uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))

            # sync batch
            if dist_type == 'mv' and np.mod(uidx, dispFreq) == 0:
                comm_start = time.time()
                model.sync_tparams()
                message('@Comm time = {:.5f}'.format(time.time() - comm_start))

            # verbose
            if np.mod(effective_uidx, dispFreq) == 0:
                message('Worker {} Epoch {} Update {} Cost {:.5f} G2 {:.5f} UD {:.5f} Time {:.5f} s'.format(
                    worker_id, eidx, uidx, float(cost), float(g2_value), ud, time.time() - start_time,
                ))
                sys.stdout.flush()

            if np.mod(effective_uidx, saveFreq) == 0 and worker_id == 0:
                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    model.save_model(saveto, history_errs, uidx)
                    print 'Done'
                    sys.stdout.flush()

                # save immediate data in adadelta
                saveto_imm_path = '{}_latest.npz'.format(os.path.splitext(saveto)[0])
                dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto_imm_path)

            if np.mod(effective_uidx, validFreq) == 0:
                stochastic_mode.set_value(0)
                valid_cost = validation(valid_iterator, f_cost, use_noise)
                small_train_cost = validation(small_train_iterator, f_cost, use_noise)
                message('Soft Valid cost {:.5f} Small train cost {:.5f}'.format(valid_cost, small_train_cost))
                #new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
                #message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
                sys.stdout.flush()

                #if new_bleu > best_bleu:
                #    print 'Saving the model at iteration {}...'.format(uidx),
                #    model.save_model(saveto, history_errs, uidx)
                #    print 'Done'
                #    best_bleu = new_bleu
                #    sys.stdout.flush()
                    
                stochastic_mode.set_value(1)


                # Fine-tune based on dev cost
                if fine_tune_patience > 0:
                    if valid_cost < best_valid_cost:
                        bad_counter = 0
                        best_valid_cost = valid_cost
                        # dump the best model so far, including the immediate file
                        if worker_id == 0:
                            message('Dump the best model so far at uidx {}'.format(uidx))
                            model.save_model(saveto, history_errs)
                            # dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto)
                    else:
                        bad_counter += 1
                        if bad_counter >= fine_tune_patience:
                            print 'Fine tune:',
                            if finetune_cnt % 2 == 0:
                                lrate = np.float32(lrate * 0.5)
                                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))
                                if lrate <= 0.025:
                                    message('Learning rate decayed to {:.5f}, task completed'.format(lrate))
                                    return 1., 1., 1.
                            else:
                                clip_shared.set_value(np.float32(clip_shared.get_value() * 0.25))
                                message('Discount clip value to {} at iteration {}'.format(clip_shared.get_value(), uidx))
                            finetune_cnt += 1
                            bad_counter = 0


            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after {} iterations!'.format(uidx)
                estop = True
                break

        print 'Seen {} samples'.format(n_samples)

        if estop:
            break

    if best_p is not None:
        zipp(best_p, model.P)

    use_noise.set_value(0.)

    return 0.
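train() above is usually driven by a small launcher script. A sketch of a call that enables the multiverso code path; every path and size below is a placeholder, not a value from the original project:

if __name__ == '__main__':
    train(
        dist_type='mv',                                    # synchronize through multiverso
        datasets=('train.en.tok', 'train.fr.tok'),         # placeholder corpus paths
        valid_datasets=('dev.en.tok', 'dev.fr.tok'),       # placeholder dev paths
        vocab_filenames=('vocab.en.pkl', 'vocab.fr.pkl'),  # placeholder vocabularies
        batch_size=16,
        max_epochs=10,
    )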
Example #12
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d" %
          lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: LasagneParamManager is a parameter manager which can
    # synchronize the parameters of a Lasagne network with multiverso.
    lpm = param_manager.LasagneParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(
            prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(loss,
                                           params,
                                           learning_rate=sh_lr,
                                           momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data among the different processes
            examples_per_worker = X_train.shape[0] / workers_num
            start_index = worker_id * (examples_per_worker)
            train_indices = np.arange(start_index,
                                      start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train,
                                             rand_Y_train,
                                             batch_size,
                                             shuffle=True,
                                             augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters managed by LasagneParamManager and fetch the latest
                # parameters from the parameter server, you can call this
                # function to synchronize the values
                lpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test,
                                                 Y_test,
                                                 500,
                                                 shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs,
                    time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err /
                                                          train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err /
                                                            val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: because ASGD and multiple GPUs are used, the learning
                # rate schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()
        if mv.is_master_worker():
            # MULTIVERSO: update the parameters before saving the model
            lpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz',
                     *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate validation error of model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches *
                                                    100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
Example #13
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print("CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'.")
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d" % lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: MVNetParamManager is a parameter manager which can
    # synchronize the parameters of a Lasagne network with multiverso.
    mvnpm = param_manager.MVNetParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(
                loss, params, learning_rate=sh_lr, momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data among the different processes
            examples_per_worker = X_train.shape[0] / workers_num
            start_index = worker_id * (examples_per_worker)
            train_indices = np.arange(start_index, start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train, rand_Y_train, batch_size, shuffle=True, augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters managed by MVNetParamManager and fetch the latest
                # parameters from the parameter server, you can call this
                # function to synchronize the values
                mvnpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs, time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: because ASGD and multiple GPUs are used, the learning
                # rate schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:"+str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()
        if mv.is_master_worker():
            # MULTIVERSO: update the parameters before saving the model
            mvnpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz', *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate validation error of model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(
            test_acc / test_batches * 100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
Example #14
p_y_given_x = model(x, *params)
y = T.argmax(p_y_given_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(p_y_given_x, t))

updates = momentum(cost, params, learning_rate=0.01, momentum=0.9)


# compile theano functions
train = theano.function([x, t], cost, updates=updates, allow_input_downcast=True)
predict = theano.function([x], y, allow_input_downcast=True)


# MULTIVERSO: all the workers will synchronize at the place you call barrier
mv.barrier()


# train model
batch_size = 50

for i in range(50):
    for start in range(0, len(x_train), batch_size):
        # every process only trains batches assigned to itself
        if start / batch_size % workers_num != worker_id:
            continue
        x_batch = x_train[start:start + batch_size]
        t_batch = t_train[start:start + batch_size]
        cost = train(x_batch, t_batch)

        # MULTIVERSO: sync value with multiverso after every batch
        sharedvar.sync_all_mv_shared_vars()
Example #15
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(DIGITS + 1))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in range(LAYERS):
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))

# For each step of the output sequence, decide which character should be chosen
model.add(TimeDistributed(Dense(len(chars))))
model.add(Activation('softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

mv.barrier()

# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # Add the MVCallback to update the parameters from multiverso
    model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1,
              verbose=(1 if mv.is_master_worker() else 0),
              validation_data=(X_val, y_val),
              callbacks=[MVCallback(model, freq=2)])
    ###
    # Select 10 samples from the validation set at random so we can visualize errors
    if mv.is_master_worker():
        for i in range(10):
            ind = np.random.randint(0, len(X_val))
            rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]
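The Keras snippet above assumes the usual multiverso lifecycle around it. A sketch of the surrounding calls; where MVCallback is imported from depends on the multiverso install, so it is only indicated in a comment:

import multiverso as mv
# MVCallback comes from multiverso's Keras extension; the exact import path
# is an assumption and depends on how multiverso is installed.

mv.init()        # before building or training the model
# ... build and compile `model`, prepare X_train / y_train / X_val / y_val ...
mv.barrier()     # the barrier shown above, right before the training loop
# ... model.fit(..., callbacks=[MVCallback(model, freq=2)]) ...
mv.shutdown()    # after all workers have finished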
Example #16
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling other multiverso APIs
    mv.init()
    # MULTIVERSO: every process has a distinct worker id
    worker_id = mv.worker_id()

    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    validation_frequency = n_train_batches
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train the batches assigned to itself
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters produced by mv_shared and fetch the latest
                # parameters from the parameter server, you can call this
                # function to synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # MULTIVERSO: only the master worker will output the validation results
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))
        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

    # MULTIVERSO: You should make sure only one process will output the result.
    # Otherwise results will be outputted repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        print(('Optimization complete with validation score of %f %%, '
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code run for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % ((end_time - start_time))),
              file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)
    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
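Taken together, the examples follow one skeleton: initialize multiverso, find out which worker we are, train only our share of the minibatches while syncing after each update, and let the master worker report and save. A minimal sketch of that skeleton using only the calls shown above:

import multiverso as mv

mv.init()
worker_id = mv.worker_id()
workers_num = mv.workers_num()

# ... build the model; train only the minibatches where
#     batch_index % workers_num == worker_id, syncing after each update ...

mv.barrier()
if mv.is_master_worker():
    pass  # validate, print the results, save the model
mv.shutdown()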