def adadelta(lr, tparams, grads, inp, cost):
    # Adadelta accumulators: last gradients, running avg. of squared updates,
    # and running avg. of squared gradients, one shared variable per parameter.
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in six.iteritems(tparams)]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in six.iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in six.iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    # evaluates the cost and stores gradients / accumulator updates
    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up,
                                    name='adadelta')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    # applies the Adadelta step; lr is unused (kept for a uniform interface)
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
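# For reference, a minimal NumPy sketch of the update rule the adadelta()
# variants above build symbolically (rho=0.95 and eps=1e-6, matching the
# constants in the code). This is an illustration added for clarity, not
# part of the original source.
import numpy as np

def adadelta_step(p, g, rg2, ru2, rho=0.95, eps=1e-6):
    """One Adadelta step for a single parameter array.

    rg2 -- running average of squared gradients (running_grads2)
    ru2 -- running average of squared updates   (running_up2)
    """
    rg2 = rho * rg2 + (1. - rho) * g ** 2
    ud = -np.sqrt(ru2 + eps) / np.sqrt(rg2 + eps) * g
    ru2 = rho * ru2 + (1. - rho) * ud ** 2
    return p + ud, rg2, ru2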
def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [theano.shared(p.get_value() * numpy.float32(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams),
                                                  updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in six.iteritems(tparams)]
    running_grads = [theano.shared(p.get_value() * np.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in six.iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in six.iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rgup + rg2up,
                                    name='rmsprop')

    updir = [theano.shared(p.get_value() * np.float32(0.),
                           name='%s_updir' % k)
             for k, p in six.iteritems(tparams)]
    # momentum 0.9 with a hard-coded step size of 1e-4; the lr input of
    # f_update is unused, hence on_unused_input='ignore'
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams),
                                                  updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
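# For reference, a minimal NumPy sketch of the RMSProp-with-momentum rule
# the rmsprop() variants above express symbolically (0.95/0.05 running
# averages, momentum 0.9, step size 1e-4 as hard-coded above).
# Illustration only, not part of the original source.
import numpy as np

def rmsprop_step(p, g, rg, rg2, ud, step=1e-4, rho=0.95, mom=0.9, eps=1e-4):
    rg = rho * rg + (1. - rho) * g            # running mean of gradients
    rg2 = rho * rg2 + (1. - rho) * g ** 2     # running mean of squared grads
    ud = mom * ud - step * g / np.sqrt(rg2 - rg ** 2 + eps)
    return p + ud, rg, rg2, ud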
def sgd(lr, tparams, grads, x, mask, y, cost):
    gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=profile)

    return f_grad_shared, f_update
def sgd(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * numpy.float32(0.),
                             name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=False)

    return f_grad_shared, f_update
def sgd(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in six.iteritems(tparams)]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup, name='sgd')

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup)

    return f_grad_shared, f_update, []
def sgd(lr, tparams, grads, inp, cost, use_noise, **kwargs):
    print 'Using SGD'
    gshared = [theano.shared(p.get_value() * floatx(0.), name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    # note: this variant accumulates gradients into the shared variables
    # (gs + g) instead of overwriting them
    gsup = [(gs, gs + g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost,
                                    givens={use_noise: numpy.float32(1.)},
                                    on_unused_input='warn',
                                    updates=gsup,
                                    allow_input_downcast=True)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup,
                               allow_input_downcast=True)

    return f_grad_shared, f_update, gshared
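# Usage sketch: every optimizer factory in this section follows the same
# two-step interface used by the training loops further below --
# f_grad_shared evaluates the cost and stores the gradients in shared
# variables, f_update then applies the parameter update for a given
# learning rate. Roughly (illustration only):
#
#     lr = tensor.scalar(name='lr')
#     grads = tensor.grad(cost, wrt=itemlist(tparams))
#     f_grad_shared, f_update = adadelta(lr, tparams, grads, inps, cost)
#     ...
#     cost_val = f_grad_shared(x, x_mask)   # forward + backward pass
#     f_update(lrate)                       # apply the accumulated update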
def adadelta(lr, tparams, grads, inp, cost, errors):
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())
    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.),
                            name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [sharedX(p.get_value() * numpy.float32(0.),
                           name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.),
                              name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rg2up)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    upnorm = get_norms(updir)
    f_update = theano.function([lr], [upnorm], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    gshared = [sharedX(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)
    f_grad_shared = theano.function([x, mask, y],
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup, profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    upnorm = lr * gnorm
    f_update = theano.function([lr], [upnorm], updates=pup, profile=profile)

    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, inp, cost, errors):
    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.),
                            name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [sharedX(p.get_value() * numpy.float32(0.),
                             name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.),
                              name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)
    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [sharedX(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    # the denominator is floored at 1e-8 to avoid division by (near) zero
    updir_new = [(ud, 0.9 * ud - lr * zg /
                  tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams),
                                                  updir_new)]

    upnorm = get_norms(updir_new)
    f_update = theano.function([lr], [upnorm],
                               updates=updir_new + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
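# sharedX(), get_norms() and itemlist() are used throughout this section
# but not defined in it. Plausible implementations consistent with their
# call sites are sketched below; these are assumptions, not the original
# helpers.
import numpy
import theano
import theano.tensor as tensor

def sharedX(value, name=None):
    # float32 shared variable (assumed equivalent of a casting theano.shared)
    return theano.shared(numpy.asarray(value, dtype='float32'), name=name)

def itemlist(tparams):
    # ordered list of the parameter tensors (assumed)
    return [vv for kk, vv in tparams.iteritems()]

def get_norms(variables):
    # global L2 norm over a list of tensors (assumed)
    return tensor.sqrt(sum((v ** 2).sum() for v in variables))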
def train(dim_word=100, # word vector dimensionality dim=1000, # the number of GRU units encoder='gru', patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 weight decay penalty lrate=0.01, n_words=100000, # vocabulary size vocab_dim=100000, # Size of M, C memory_dim=1000, # Dimension of memory memory_size=15, # n_back to attend maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz', valid_dataset='../data/dev/newstest2011.en.tok', dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/' 'wiki.tok.txt.gz.pkl', use_dropout=False, reload_=False): # Model options model_options = locals().copy() # Theano random stream trng = RandomStreams(1234) # load dictionary with open(dictionary, 'rb') as f: worddicts = pkl.load(f) # invert dictionary worddicts_r = dict() for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print 'Loading data' train = TextIterator(dataset, dictionary, n_words_source=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_dataset, dictionary, n_words_source=n_words, batch_size=valid_batch_size, maxlen=maxlen) # initialize RMN rmn_ = RMN(model_options) print 'Building model' rmn_.init_params() # reload parameters if reload_ and os.path.exists(saveto): rmn_.load_params(saveto) # create shared variables for parameters tparams = rmn_.tparams # build the symbolic computational graph use_noise, x, x_mask, opt_ret, cost = rmn_.build_model() inps = [x, x_mask] print 'Buliding sampler' f_next = rmn_.build_sampler(trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', optimizer = getattr(importlib.import_module('optimizer'), optimizer) f_grad_shared, f_update = optimizer(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] uidx = 0 estop = False bad_counter = 0 # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) uidx = numpy.load(saveto)['uidx'] best_p = None if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size # Training loop for eidx in xrange(max_epochs): n_samples = 0 for x in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) 
# pad batch and create mask x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1. # verbose if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(5): sample, score = rmn_.gen_sample(tparams, f_next, trng=trng, maxlen=30, argmax=False) print 'Sample ', jj, ': ', ss = sample for vv in ss: if vv == 0: break if vv in worddicts_r: print worddicts_r[vv], else: print 'UNK', print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = rmn_.pred_probs(valid, f_log_probs, prepare_data) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() print 'Valid ', valid_err # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = rmn_.pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, uidx=uidx, **params) return valid_err
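# prepare_data() is called above but not defined in this section. A minimal
# sketch of what such a helper typically does for these language-model
# training loops -- pad a list of token sequences into a (time, batch)
# matrix plus a float32 mask, dropping sequences longer than maxlen -- is
# given below. This is an assumption for illustration, not the original
# helper.
import numpy

def prepare_data_sketch(seqs, maxlen=None, n_words=100000):
    lengths = [len(s) for s in seqs]
    if maxlen is not None:
        seqs = [s for s, l in zip(seqs, lengths) if l < maxlen]
        lengths = [len(s) for s in seqs]
        if len(seqs) == 0:
            return None, None
    n_samples = len(seqs)
    max_len = max(lengths) + 1                 # room for the 0 (EOS) padding
    x = numpy.zeros((max_len, n_samples)).astype('int64')
    x_mask = numpy.zeros((max_len, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = [w if w < n_words else 1 for w in s]
        x_mask[:lengths[idx] + 1, idx] = 1.
    return x, x_mask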
def train( random_seed=1234, dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set dim=1000, # the number of LSTM units n_layers_out=1, n_layers_init=1, encoder='none', encoder_dim=100, prev2out=False, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, selector=False, n_words=100000, maxlen=100, # maximum length of the description optimizer='adadelta', clip_c=2., batch_size=64, valid_batch_size=64, save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/', validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates metric='blue', dataset='youtube2text', video_feature='googlenet', use_dropout=False, reload_=False, from_dir=None, K=10, OutOf=240, verbose=True, debug=True): rng_numpy, rng_theano = utils.get_two_rngs() model_options = locals().copy() if 'self' in model_options: del model_options['self'] with open('%smodel_options.pkl' % save_model_dir, 'wb') as f: pkl.dump(model_options, f) # instance model layers = Layers() model = Model() print 'Loading data' engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K, OutOf) model_options['ctx_dim'] = engine.ctx_dim model_options['n_words'] = engine.n_words print 'n_words:', model_options['n_words'] # set test values, for debugging idx = engine.kf_train[0] [x_tv, mask_tv, ctx_tv, ctx_mask_tv ] = data_engine.prepare_data(engine, [engine.train[index] for index in idx]) print 'init params' t0 = time.time() params = model.init_params(model_options) # reloading if reload_: model_saved = from_dir + '/model_best_so_far.npz' assert os.path.isfile(model_saved) print "Reloading model params..." params = utils.load_params(model_saved, params) tparams = utils.init_tparams(params) trng, use_noise, \ x, mask, ctx, mask_ctx, \ cost, extra = \ model.build_model(tparams, model_options) alphas = extra[1] betas = extra[2] print 'buliding sampler' f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost, profile=False, on_unused_input='ignore') cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(-1).mean() cost += alpha_reg if alpha_entropy_r > 0: alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r), name='alpha_entropy_r') alpha_reg_2 = alpha_entropy_r * (-tensor.sum( alphas * tensor.log(alphas + 1e-8), axis=-1)).sum(-1).mean() cost += alpha_reg_2 else: alpha_reg_2 = tensor.zeros_like(cost) print 'building f_alpha' f_alpha = theano.function([x, mask, ctx, mask_ctx], [alphas, betas], name='f_alpha', on_unused_input='ignore') print 'compute grad' grads = tensor.grad(cost, wrt=utils.itemlist(tparams)) if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'build train fns' f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, [x, mask, ctx, mask_ctx], cost, extra + grads) print 'compilation took %.4f sec' % (time.time() - t0) print 'Optimization' history_errs = [] # reload history if reload_: print 'loading history error...' history_errs = numpy.load( from_dir + 'model_best_so_far.npz')['history_errs'].tolist() bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in engine.kf_train: tags = [engine.train[index] for index in idx] n_samples += len(tags) uidx += 1 use_noise.set_value(1.) pd_start = time.time() x, mask, ctx, ctx_mask = data_engine.prepare_data(engine, tags) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() rvals = f_grad_shared(x, mask, ctx, ctx_mask) cost = rvals[0] probs = rvals[1] alphas = rvals[2] betas = rvals[3] grads = rvals[4:] grads, NaN_keys = utils.grad_nan_report(grads, tparams) if len(grads_record) >= 5: del grads_record[0] grads_record.append(grads) if NaN_keys != []: print 'grads contain NaN' import pdb pdb.set_trace() if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected in cost' import pdb pdb.set_trace() # update params f_update(lrate) ud_duration = time.time() - ud_start if eidx == 0: train_error = cost else: train_error = train_error * 0.95 + cost * 0.05 train_costs.append(cost) if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \ train_error, 'fetching data time spent (sec)', pd_duration, \ 'update time spent (sec)', ud_duration, 'save_dir', save_model_dir alphas, betas = f_alpha(x, mask, ctx, ctx_mask) counts = mask.sum(0) betas_mean = (betas * mask).sum(0) / counts betas_mean = betas_mean.mean() print 'alpha ratio %.3f, betas mean %.3f' % ( alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean) l = 0 for vv in x[:, 0]: if vv == 0: break if vv in engine.word_idict: print '(', numpy.round(betas[l, 0], 3), ')', engine.word_idict[vv], else: print '(', numpy.round(betas[l, 0], 3), ')', 'UNK', l += 1 print '(', numpy.round(betas[l, 0], 3), ')' if numpy.mod(uidx, saveFreq) == 0: pass if numpy.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) 
print '------------- sampling from train ----------' x_s = x mask_s = mask ctx_s = ctx ctx_mask_s = ctx_mask model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, trng) print '------------- sampling from valid ----------' idx = engine.kf_valid[numpy.random.randint( 1, len(engine.kf_valid) - 1)] tags = [engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s = data_engine.prepare_data( engine, tags) model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, mask_ctx_s, trng) if validFreq != -1 and numpy.mod(uidx, validFreq) == 0: t0_valid = time.time() alphas, _ = f_alpha(x, mask, ctx, ctx_mask) ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean() alphas_ratio.append(ratio) numpy.savetxt(save_model_dir + 'alpha_ratio.txt', alphas_ratio) current_params = utils.unzip(tparams) numpy.savez(save_model_dir + 'model_current.npz', history_errs=history_errs, **current_params) use_noise.set_value(0.) train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 if not debug: # first compute train cost if 0: print 'computing cost on trainset' train_err, train_perp = model.pred_probs( engine, 'train', f_log_probs, verbose=model_options['verbose']) else: train_err = 0. train_perp = 0. if 1: print 'validating...' valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose'], ) else: valid_err = 0. valid_perp = 0. if 1: print 'testing...' test_err, test_perp = model.pred_probs( engine, 'test', f_log_probs, verbose=model_options['verbose']) else: test_err = 0. test_perp = 0. mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score( model_type='attention', model_archive=current_params, options=model_options, engine=engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_next=f_next, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) history_errs.append([ eidx, uidx, train_err, train_perp, valid_perp, test_perp, valid_err, test_err, valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1, test_B2, test_B3, test_B4, test_meteor, test_Rouge, test_Cider ]) numpy.savetxt(save_model_dir + 'train_valid_test.txt', history_errs, fmt='%.3f') print 'save validation results to %s' % save_model_dir # save best model according to the best blue or meteor if len(history_errs) > 1 and \ valid_B4 
> numpy.array(history_errs)[:-1,11].max(): print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_best_blue_or_meteor.npz', history_errs=history_errs, **best_p) if len(history_errs) > 1 and \ valid_err < numpy.array(history_errs)[:-1,6].min(): best_p = utils.unzip(tparams) bad_counter = 0 best_valid_err = valid_err uidx_best_valid_err = uidx print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_best_so_far.npz', history_errs=history_errs, **best_p) with open('%smodel_options.pkl' % save_model_dir, 'wb') as f: pkl.dump(model_options, f) print 'Done' elif len(history_errs) > 1 and \ valid_err >= numpy.array(history_errs)[:-1,6].min(): bad_counter += 1 print 'history best ', numpy.array(history_errs)[:, 6].min() print 'bad_counter ', bad_counter print 'patience ', patience if bad_counter > patience: print 'Early Stop!' estop = True break if test_B4 > 0.52 and test_meteor > 0.32: print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_' + str(uidx) + '.npz', history_errs=history_errs, **current_params) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec' % (time.time() - t0_valid) # end of validatioin if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f' % ( n_samples, numpy.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' if best_p is not None: utils.zipp(best_p, tparams) use_noise.set_value(0.) valid_err = 0 test_err = 0 if not debug: #if valid: valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose']) #if test: #test_err, test_perp = self.pred_probs( # 'test', f_log_probs, # verbose=model_options['verbose']) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err)) params = copy.copy(best_p) numpy.savez(save_model_dir + 'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = numpy.asarray(history_errs) best_valid_idx = history[:, 6].argmin() numpy.savetxt(save_model_dir + 'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
def train( experiment_id, data_base_path, output_base_path, model_options, data_options, validation_options, patience, # early stopping patience max_epochs, finish_after, # finish after this many updates clip_c, # gradient clipping threshold lrate, # learning rate optimizer, saveto, valid_freq, time_limit, save_freq, # save the parameters after every saveFreq updates sample_freq, # generate some samples after every sampleFreq verbose, reload_from=None, pretrained_word_emb=None): start_time = time.time() def join_data_base_path(data_base, options): for kk, vv in six.iteritems(options): if kk in [ 'src', 'trg', 'input_vocab', 'label_vocab', 'valid_src', 'valid_trg' ]: options[kk] = os.path.join(data_base, options[kk]) return options data_options = join_data_base_path(data_base_path, data_options) validation_options = join_data_base_path(data_base_path, validation_options) worddicts_r, train_stream, valid_stream = load_data(**data_options) model_options['n_input_tokens'] = len(worddicts_r[0]) model_options['n_labels'] = len(worddicts_r[1]) if model_options['label_type'] == 'binary': model_options['n_bins'] = len(worddicts_r[1]) model_options['n_labels'] = 2 max_sample_length = len(worddicts_r[1]) else: max_sample_length = data_options['max_label_length'] LOGGER.info('Building model') params = init_params(model_options) # reload parameters best_filename = '{}/{}.{}.best.npz'.format(output_base_path, experiment_id, saveto) if pretrained_word_emb and os.path.exists(pretrained_word_emb): assert model_options['input_token_level'] == 'word' LOGGER.info('Loading pretrained word embeddings from {}'.format( pretrained_word_emb)) pretrained_emb = load_pretrained_embeddings(pretrained_word_emb) # TODO check if the size of the pretrained word embedding equals # the size of the initialized word embeddings # Also, check whether or not the vocabulary in the pretrained word # embeddings is identical to the vocabulary in the model. 
pvocab = pretrained_emb['vocab'] # (idx, word) # XXX if the assertians passed, then load the pretrained embeddings assert pretrained_emb['Wemb'].dtype == numpy.float32, \ 'The pretrained word embeddings should be float32\n' assert pretrained_emb['Wemb'].shape[1] == params['Wemb'].shape[1], \ '{} does not match {}\n'.format(pretrained_emb['Wemb'].shape[1], params['Wemb'].shape[1]) pretrained_word2id = {word: idx for (idx, word) in pvocab} param_indices, indices = [], [] for ii in xrange(len(worddicts_r[0])): if ii >= data_options['n_input_tokens']: break word = worddicts_r[0][ii] if word in pretrained_word2id: word_idx = pretrained_word2id[word] indices.append(word_idx) param_indices.append(ii) assert len(indices) <= data_options['n_input_tokens'] params['Wemb'][param_indices] = pretrained_emb['Wemb'][indices] # normalize word embeddings params['Wemb'] = params['Wemb'] / \ numpy.sqrt((params['Wemb']**2).sum(axis=1)[:, None]) if reload_from and os.path.exists(reload_from): LOGGER.info('Loading parameters from {}'.format(reload_from)) params = load_params(reload_from, params) LOGGER.info('Initializing parameters') tparams = init_tparams(params) # use_noise is for dropout trng, use_noise, encoder_vars, decoder_vars, \ opt_ret, costs = build_model(tparams, model_options) inps = encoder_vars + decoder_vars LOGGER.info('Building sampler') f_sample_inits, f_sample_nexts \ = build_sampler(tparams, model_options, trng, use_noise) # before any regularizer LOGGER.info('Building functions to compute log prob') f_log_probs = [ theano.function(inps, cost_, name='f_log_probs_%s' % cost_.name, on_unused_input='ignore') for cost_ in costs ] assert len(costs) == 1 cost = costs[0] ''' for cost_ in costs[1:]: cost += cost_ ''' cost = cost.mean() LOGGER.info('Computing gradient') grads = tensor.grad(cost, wrt=itemlist(tparams)) # apply gradient clipping here if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') LOGGER.info('Building optimizers') f_grad_shared, f_update, optimizer_state = \ getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) optimizer_state = name_dict(optimizer_state) # TODO set_value optimizer_state if reload_from and os.path.exists(reload_from): LOGGER.info('Loading optimizer state from {}'.format(reload_from)) optimizer_state = load_params(reload_from, optimizer_state, theano_var=True) LOGGER.info('Optimization') log = Logger( filename='{}/{}.log.jsonl.gz'.format(output_base_path, experiment_id)) best_valid_err = float('inf') best_model = None total_nsamples = 0 uidx = 0 uidx_restore = [0] estop = False if reload_from and os.path.exists(reload_from): rmodel = numpy.load(reload_from) if 'uidx' in rmodel: uidx_restore = rmodel['uidx'] if 'best_valid_err' in rmodel: best_valid_err = rmodel['best_valid_err'] if 'total_nsamples' in rmodel and rmodel['total_nsamples'] > 0: total_nsamples = rmodel['total_nsamples'] best_model = [unzip(tparams), unzip(optimizer_state), uidx_restore] train_start = time.clock() max_updates_per_epoch = total_nsamples / data_options['batch_size'] try: for epoch in xrange(0, max_epochs): if total_nsamples > 0 and \ uidx + max_updates_per_epoch < uidx_restore[0]: uidx += max_updates_per_epoch continue n_samples = 0 for x, x_mask, \ y, y_mask in train_stream.get_epoch_iterator(): n_samples += len(x) uidx += 1 if uidx < uidx_restore[0]: continue x_length = x_mask.sum(1).mean() if model_options['label_type'] == 'binary': old_y = y y, y_mask = mul2bin(y, y_mask, model_options['n_bins']) y, y_mask = y.T, y_mask.T if data_options['input_token_level'] == 'character': x, x_mask = prepare_character_tensor(x) else: x, x_mask = x.T, x_mask.T unk_token_ratio = (x == 1).sum(0) / x_mask.sum(0) non_empty_insts = unk_token_ratio <= 0.5 y = y[:, non_empty_insts] y_mask = y_mask[:, non_empty_insts] x = x[:, non_empty_insts] x_mask = x_mask[:, non_empty_insts] if x.shape[1] == 0: continue encoder_inps = [x, x_mask] decoder_inps = [y, y_mask] inps = encoder_inps + decoder_inps use_noise.set_value(1.) log_entry = {'iteration': uidx, 'epoch': epoch} # compute cost, grads and copy grads to shared variables update_start = time.clock() cost = f_grad_shared(*inps) f_update(lrate) if verbose: log_entry['cost'] = float(cost) log_entry['average_source_length'] = \ float(x_length) log_entry['average_target_length'] = \ float(y_mask.sum(0).mean()) log_entry['update_time'] = time.clock() - update_start log_entry['train_time'] = time.clock() - train_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if not numpy.isfinite(cost): LOGGER.error('NaN detected') return 1., 1., 1. # validate model on validation set and early stop if necessary if numpy.mod(uidx, valid_freq) == 0: use_noise.set_value(0.) 
valid_errs = [ numpy.mean(pred_probs(f_, model_options, valid_stream)) for f_ in f_log_probs ] for f_, err_ in zip(f_log_probs, valid_errs): log_entry['validation_%s' % f_.name] = float(err_) valid_scores = do_validation(f_sample_inits, f_sample_nexts, tparams, max_sample_length, trng, model_options, valid_stream) for eval_type, score in valid_scores.items(): log_entry['validation_%s' % eval_type] = score for f_, err_ in zip(f_log_probs, valid_errs): if not numpy.isfinite(err_): raise RuntimeError(('NaN detected in validation ' 'error of %s') % f_.name) valid_err = numpy.array(valid_errs).sum() if valid_err < best_valid_err: best_valid_err = valid_err best_model = [ unzip(tparams), unzip(optimizer_state), [uidx] ] # save the best model so far if numpy.mod(uidx, save_freq) == 0 and \ uidx > uidx_restore[0]: LOGGER.info('Saving best model so far') if best_model is not None: params, opt_state, save_at_uidx = best_model else: params = unzip(tparams) opt_state = unzip(optimizer_state) save_at_uidx = [uidx] # save params to exp_id.npz and symlink model.npz to it params_and_state = merge( params, opt_state, {'uidx': save_at_uidx}, {'best_valid_err': best_valid_err}, {'total_nsamples': total_nsamples}) save_params(params_and_state, best_filename) # generate some samples with the model and display them if sample_freq > 0 and numpy.mod(uidx, sample_freq) == 0: # FIXME: random selection? log_entry['samples'] = [] if data_options['input_token_level'] == 'character': batch_size = x.shape[2] else: batch_size = x.shape[1] for jj in xrange(numpy.minimum(5, batch_size)): stats = [('source', ''), ('truth', ''), ('sample', ''), ('align_sample', '')] log_entry['samples'].append(OrderedDict(stats)) if data_options['input_token_level'] == 'character': sample_encoder_inps = [ x[:, :, jj][:, :, None], x_mask[:, :, jj][:, :, None] ] else: sample_encoder_inps = [ x[:, jj][:, None], x_mask[:, jj][:, None] ] solutions = gen_sample(tparams, f_sample_inits, f_sample_nexts, sample_encoder_inps, model_options, trng=trng, k=12, max_label_len=max_sample_length, argmax=False) sample = solutions['samples'] alignment = solutions['alignments'] score = solutions['scores'] score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] alignment = alignment[score.argmin()] if model_options['label_type'] == 'binary': # print(y[0], y.shape, old_y.shape) y = old_y.T assert type(ss) == list assert len(ss) == max_sample_length new_ss = [ tidx for tidx, s in enumerate(ss) if s == 1 ] # print(len(ss), numpy.sum(ss), new_ss) ss = new_ss assert type(ss) == list if len(ss) == 0: ss.append(0) if ss[0] == 0: # if the first token is <EOS> ss = ss[1:] + ss[:1] if data_options['input_token_level'] == 'character': num_src_words = int( (x_mask[:, :, jj].sum(0) > 0).sum()) num_chars, num_words, num_samples = x.shape for widx in xrange(num_words): if x_mask[:, widx, jj].sum() == 0: break for cidx in xrange(num_chars): cc = x[cidx, widx, jj] if cc == 0: break if cc in worddicts_r[0]: token = worddicts_r[0][cc] else: token = UNK_TOKEN log_entry['samples'][-1]['source'] \ += token log_entry['samples'][-1]['source'] += \ ' ' else: num_src_words = int(x_mask[:, jj].sum()) num_words, num_samples = x.shape for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: token = worddicts_r[0][vv] else: token = UNK_TOKEN log_entry['samples'][-1]['source'] \ += token + ' ' for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: token = worddicts_r[1][vv] else: token = UNK_TOKEN log_entry['samples'][-1]['truth'] += token + ' ' for 
tidx, vv in enumerate(ss): if vv == 0: break if vv in worddicts_r[1]: token = worddicts_r[1][vv] else: token = UNK_TOKEN assert tidx >= 0 and tidx < len(alignment), \ '%d\t%d' % (tidx, len(alignment)) align_src_word_idx = \ (alignment[tidx][ :num_src_words-1]).argmax() aligned_token = '%s_<%d>' % \ (token, align_src_word_idx) log_entry['samples'][-1]['sample'] += token + ' ' log_entry['samples'][-1]['align_sample'] \ += aligned_token + ' ' # finish after this many updates if uidx >= finish_after: LOGGER.info('Finishing after {} iterations'.format(uidx)) estop = True break if time_limit > 0 and \ (time.time() - start_time > time_limit * 60): LOGGER.info( 'Time limit {} mins is over'.format(time_limit)) estop = True break if verbose and len(log_entry) > 2: log.log(log_entry) LOGGER.info('Completed epoch, seen {} samples'.format(n_samples)) if total_nsamples == 0: total_nsamples = n_samples if estop: log.log(log_entry) break if best_model is not None: assert len(best_model) == 3 best_p, best_state, best_uidx = best_model zipp(best_p, tparams) zipp(best_state, optimizer_state) ''' use_noise.set_value(0.) LOGGER.info('Calculating validation cost') valid_errs = do_validation(f_log_probs, model_options, valid_stream) ''' if not best_model: best_p = unzip(tparams) best_state = unzip(optimizer_state) best_uidx = [uidx] best_p = copy.copy(best_p) best_state = copy.copy(best_state) params_and_state = merge(best_p, best_state, {'uidx': best_uidx}, {'best_valid_err': best_valid_err}, {'total_nsamples': total_nsamples}) save_params(params_and_state, best_filename) except Exception: LOGGER.error(traceback.format_exc()) best_valid_err = -1. else: # XXX add something needed print('Training Done') return best_valid_err
def train(dim_word=100, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', decoder='gru_cond', patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., diag_c=0., clip_c=-1., lrate=0.01, n_words_src=100000, n_words=100000, maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size = 16, valid_batch_size = 16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq updates datasets=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], valid_datasets=['../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok'], dictionaries=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], use_dropout=False, reload_=False): # Model options model_options = locals().copy() worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) print 'Loading data' train = TextIterator(datasets[0], datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_datasets[0], valid_datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Buliding sampler' f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:,None]- opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' print 'Building f_grad...', f_grad = theano.function(inps, grads, profile=profile) print 'Done' if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 for x, y in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, n_words_src=n_words_src, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(numpy.minimum(5,x.shape[1])): stochastic = True sample, score = gen_sample(tparams, f_init, f_next, x[:,jj][:,None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=False) print 'Source ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: print worddicts_r[0][vv], else: print 'UNK', print print 'Truth ', jj, ' : ', for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): import ipdb; ipdb.set_trace() print 'Valid ', valid_err print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
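# The clip_c blocks in the train() functions rescale the whole gradient by
# its global L2 norm whenever that norm exceeds clip_c. A NumPy sketch of
# the same rule (illustration only):
import numpy

def clip_by_global_norm(grads, clip_c):
    g2 = sum((g ** 2).sum() for g in grads)
    if g2 > clip_c ** 2:
        grads = [g / numpy.sqrt(g2) * clip_c for g in grads]
    return grads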
def train(dim_word=100, # word vector dimensionality dim=1000, # the number of GRU units encoder='gru', patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 weight decay penalty lrate=0.01, n_words=100000, # vocabulary size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz', valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok', dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl', use_dropout=False, reload_=False): # Model options model_options = locals().copy() # load dictionary with open(dictionary, 'rb') as f: worddicts = pkl.load(f) # invert dictionary worddicts_r = dict() for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print 'Loading data' train = TextIterator(dataset, dictionary, n_words_source=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_dataset, dictionary, n_words_source=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) # create shared variables for parameters tparams = init_tparams(params) # build the symbolic computational graph trng, use_noise, \ x, x_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask] print 'Buliding sampler' f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size # Training loop uidx = 0 estop = False bad_counter = 0 for eidx in xrange(max_epochs): n_samples = 0 for x in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) 
# pad batch and create mask x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1. # verbose if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(5): sample, score = gen_sample(tparams, f_next, model_options, trng=trng, maxlen=30, argmax=False) print 'Sample ', jj, ': ', ss = sample for vv in ss: if vv == 0: break if vv in worddicts_r: print worddicts_r[vv], else: print 'UNK', print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() print 'Valid ', valid_err # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
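# The validation blocks above implement patience-based early stopping:
# training stops once the validation error has failed to improve on the
# best value recorded at least `patience` checks earlier. A plain-Python
# sketch of the same bookkeeping (illustration only):
def early_stop_check(history_errs, valid_err, bad_counter, patience):
    history_errs.append(valid_err)
    if valid_err <= min(history_errs):
        bad_counter = 0        # new best model so far
    if len(history_errs) > patience and \
            valid_err >= min(history_errs[:-patience]):
        bad_counter += 1       # no improvement over the best seen
                               # `patience` validations ago
    return bad_counter > patience, bad_counter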
def train(model_options, dataset_name = 'MSVD', cnn_name = 'ResNet50', train_data_ids_path = config.MSVD_DATA_IDS_TRAIN_PATH, val_data_ids_path = config.MSVD_DATA_IDS_VAL_PATH, test_data_ids_path = config.MSVD_DATA_IDS_TEST_PATH, vocab_path = config.MSVD_VOCAB_PATH, reverse_vocab_path = config.MSVD_REVERSE_VOCAB_PATH, mb_size_train = 64, mb_size_test = 128, train_caps_path = config.MSVD_VID_CAPS_TRAIN_PATH, val_caps_path = config.MSVD_VID_CAPS_VAL_PATH, test_caps_path = config.MSVD_VID_CAPS_TEST_PATH, feats_dir = config.MSVD_FEATS_DIR, save_dir = config.SAVE_DIR_PATH, word_dim = 512, # word embeddings size ctx_dim = 2048, # video cnn feature dimension lstm_dim = 512, # lstm unit size patience = 20, max_epochs = 500, decay_c = 1e-4, alpha_entropy_r = 0., alpha_c = 0.70602, clip_c = 10., lrate = 0.0001, vocab_size = 20000, # n_words maxlen_caption = 30, # max length of the descprition optimizer = 'adadelta', batch_size = 64, # for trees use 25 metric = 'everything', # set to perplexity on DVS # blue, meteor, or both use_dropout = True, selector = True, ctx2out = True, prev2out = True, dispFreq = 10, validFreq = 2000, saveFreq = -1, # save the parameters after every saveFreq updates sampleFreq = 100, # generate some samples after every sampleFreq updates verbose = True, debug = False, reload_model = False, from_dir = '', ctx_frames = 28, # 26 when compare random_seed = 1234, beam_search = True ): tf.set_random_seed(random_seed) model = Model() print 'loading data' engine = data_engine.Movie2Caption(dataset_name,cnn_name,train_data_ids_path, val_data_ids_path, test_data_ids_path, vocab_path, reverse_vocab_path, mb_size_train, mb_size_test, maxlen_caption, train_caps_path, val_caps_path, test_caps_path, feats_dir) model_options['ctx_dim'] = engine.ctx_dim ctx_dim = engine.ctx_dim model_options['vocab_size'] = engine.vocab_size vocab_size = engine.vocab_size print 'n_words:', model_options['vocab_size'] print 'ctx_dim:', model_options['ctx_dim'] utils.write_to_json(model_options, '%smodel_options.json'%save_dir) # set test values, for debugging idx = engine.kf_train[0] x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_pca_tv = data_engine.prepare_data(engine, [engine.train_data_ids[index] for index in idx], mode="train") print 'init params' t0 = time.time() params = model.init_params(model_options) k_centers = 3 # description string: #words x #samples X = tf.placeholder(tf.int32, shape=(None, None), name='word_seq_x') # word seq input (t,m) MASK = tf.placeholder(tf.float32, shape=(None, None), name='word_seq_mask') # (t,m) # context: #samples x #annotations x dim CTX = tf.placeholder(tf.float32, shape=(None, ctx_frames, ctx_dim), name='ctx') CTX_MASK = tf.placeholder(tf.float32, shape=(None, ctx_frames), name='ctx_mask') CTX_PCA = tf.placeholder(tf.float32, shape=(None, k_centers, ctx_dim), name='ctx_pca') CTX_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames, ctx_dim), name='ctx_sampler') CTX_MASK_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames), name='ctx_mask_sampler') CTX_PCA_SAMPLER = tf.placeholder(tf.float32, shape=(k_centers, ctx_dim), name='ctx_pca_sampler') X_SAMPLER = tf.placeholder(tf.int32, shape=(None,), name='x_sampler') # DOUBT 1 or None ? 
BO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='bo_init_state_sampler') TO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='to_init_state_sampler') BO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='bo_init_memory_sampler') TO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='to_init_memory_sampler') # create tensorflow variables print 'buliding model' tfparams = utils.init_tfparams(params) use_noise, COST, extra = model.build_model(tfparams, model_options, X, MASK, CTX, CTX_MASK, CTX_PCA) ALPHAS = extra[1] # (t,64,28) BETAS = extra[2] # (t,64) print 'buliding sampler' f_init, f_next = model.build_sampler(tfparams, model_options, use_noise, CTX_SAMPLER, CTX_MASK_SAMPLER, CTX_PCA_SAMPLER, X_SAMPLER, BO_INIT_STATE_SAMPLER, TO_INIT_STATE_SAMPLER, BO_INIT_MEMORY_SAMPLER, TO_INIT_MEMORY_SAMPLER) print 'building f_log_probs' f_log_probs = -COST print 'check trainables' wrt = utils.itemlist(tfparams, model_options) trainables = tf.trainable_variables() print len(wrt),len(trainables) # assert len(wrt)==len(trainables) COST = tf.reduce_mean(COST, name="LOSS") if decay_c > 0.: decay_c = tf.Variable(np.float32(decay_c), trainable=False, name='decay_c') weight_decay = 0. for vv in wrt: weight_decay += tf.reduce_sum(vv ** 2) weight_decay *= decay_c COST += weight_decay if alpha_c > 0.: alpha_c = tf.Variable(np.float32(alpha_c), trainable=False, name='alpha_c') alpha_reg = alpha_c * tf.reduce_mean(tf.reduce_sum(((1.-tf.reduce_sum(ALPHAS, axis=0))**2), axis=-1)) COST += alpha_reg if alpha_entropy_r > 0: alpha_entropy_r = tf.Variable(np.float32(alpha_entropy_r), name='alpha_entropy_r') alpha_reg_2 = alpha_entropy_r * tf.reduce_mean(tf.reduce_sum((-tf.add(ALPHAS * tf.log(ALPHAS+1e-8),axis=-1)), axis=-1)) COST += alpha_reg_2 else: alpha_reg_2 = tf.zeros_like(COST) print 'building f_alpha' f_alpha = [ALPHAS, BETAS] print 'build train fns' UPDATE_OPS = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(UPDATE_OPS): # optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06).minimize(loss=COST, var_list=wrt) optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06) # optimizer = tf.train.AdamOptimizer() gradients, variables = zip(*optimizer.compute_gradients(loss=COST, var_list=wrt)) gradients, _ = tf.clip_by_global_norm(gradients, clip_c) capped_grads_and_vars = zip(gradients, variables) TRAIN_OP = optimizer.apply_gradients(capped_grads_and_vars) # Initialize all variables var_init = tf.global_variables_initializer() # Ops to save and restore all the variables. saver = tf.train.Saver() print 'compilation took %.4f sec'%(time.time()-t0) print 'Optimization' history_errs = [] # reload history if reload_model: print 'loading history error...' history_errs = np.load(from_dir+'model_best_so_far.npz')['history_errs'].tolist() bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False # best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 # Launch the graph with tf.Session() as sess: sess.run(var_init) if reload_model: print 'restoring model...' 
saver.restore(sess, from_dir+"model_best_so_far.ckpt") for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] for idx in engine.kf_train: tags = [engine.train_data_ids[index] for index in idx] n_samples += len(tags) uidx += 1 sess.run(tf.assign(use_noise, True)) pd_start = time.time() x, mask, ctx, ctx_mask, ctx_pca = data_engine.prepare_data(engine, tags, mode="train") pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue # writer = tf.summary.FileWriter("graph_cost", sess.graph) cost, alphas, betas = sess.run([COST,ALPHAS,BETAS], feed_dict={ X: x, MASK: mask, CTX: ctx, CTX_PCA: ctx_pca, CTX_MASK: ctx_mask}) ud_start = time.time() sess.run(TRAIN_OP, feed_dict={ X: x, MASK: mask, CTX: ctx, CTX_PCA: ctx_pca, CTX_MASK: ctx_mask}) ud_duration = time.time() - ud_start # writer.close() if np.isnan(cost) or np.isinf(cost): print 'NaN detected in cost' import pdb; pdb.set_trace() if eidx == 0: train_error = cost else: train_error = train_error * 0.95 + cost * 0.05 train_costs.append(cost) if np.mod(uidx, dispFreq) == 0: print 'Epoch: ', eidx, \ ', Update: ', uidx, \ ', train cost mean so far: ', train_error, \ ', fetching data time spent (sec): ', pd_duration, \ ', update time spent (sec): ', ud_duration, \ ', save_dir: ', save_dir, '\n' alphas, betas = sess.run(f_alpha, feed_dict={ X: x, MASK: mask, CTX: ctx, CTX_PCA: ctx_pca, CTX_MASK: ctx_mask}) counts = mask.sum(0) betas_mean = (betas * mask).sum(0) / counts betas_mean = betas_mean.mean() print 'alpha ratio %.3f, betas mean %.3f\n'%( alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean) l = 0 for vv in x[:, 0]: if vv == 0: # eos break if vv in engine.reverse_vocab: print '(', np.round(betas[l, 0], 3), ')', engine.reverse_vocab[vv], else: print '(', np.round(betas[l, 0], 3), ')', 'UNK', print ",", l += 1 print '(', np.round(betas[l, 0], 3), ')\n' if np.mod(uidx, saveFreq) == 0: pass if np.mod(uidx, sampleFreq) == 0: sess.run(tf.assign(use_noise, False)) print '------------- sampling from train ----------' x_s = x # (t,m) mask_s = mask # (t,m) ctx_s = ctx # (m,28,2048) ctx_mask_s = ctx_mask # (m,28) ctx_pca_s = ctx_pca model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s) # print '------------- sampling from valid ----------' # idx = engine.kf_val[np.random.randint(1, len(engine.kf_val) - 1)] # tags = [engine.val_data_ids[index] for index in idx] # x_s, mask_s, ctx_s, mask_ctx_s, ctx_pca_s = data_engine.prepare_data(engine, tags,"val") # model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s) # print "" if validFreq != -1 and np.mod(uidx, validFreq) == 0: t0_valid = time.time() alphas, _ = sess.run(f_alpha, feed_dict={ X: x, MASK: mask, CTX: ctx, CTX_PCA: ctx_pca, CTX_MASK: ctx_mask}) ratio = alphas.min(-1).mean()/(alphas.max(-1)).mean() alphas_ratio.append(ratio) np.savetxt(save_dir+'alpha_ratio.txt',alphas_ratio) np.savez(save_dir+'model_current.npz', history_errs=history_errs) saver.save(sess, save_dir+'model_current.ckpt') sess.run(tf.assign(use_noise, False)) train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 if not debug: # first compute train cost if 0: print 'computing cost on trainset' train_err, train_perp = model.pred_probs(sess, engine, 'train', f_log_probs, verbose=model_options['verbose']) else: train_err = 0. train_perp = 0. if 1: print 'validating...' 
valid_err, valid_perp = model.pred_probs(sess, engine, 'val', f_log_probs, verbose=model_options['verbose']) else: valid_err = 0. valid_perp = 0. if 0: print 'testing...' test_err, test_perp = model.pred_probs(sess, engine, 'test', f_log_probs, verbose=model_options['verbose']) else: test_err = 0. test_perp = 0. mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score(sess=sess, model_type='attention', model_archive=None, options=model_options, engine=engine, save_dir=save_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_next=f_next, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) history_errs.append([eidx, uidx, train_err, train_perp, valid_perp, test_perp, valid_err, test_err, valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1, test_B2, test_B3, test_B4, test_meteor, test_Rouge, test_Cider]) np.savetxt(save_dir+'train_valid_test.txt', history_errs, fmt='%.3f') print 'save validation results to %s'%save_dir # save best model according to the best blue or meteor if len(history_errs) > 1 and \ valid_B4 > np.array(history_errs)[:-1,11].max(): print 'Saving to %s...'%save_dir, np.savez( save_dir+'model_best_blue_or_meteor.npz', history_errs=history_errs) saver.save(sess, save_dir+'model_best_blue_or_meteor.ckpt') # DOUBT if len(history_errs) > 1 and \ valid_err < np.array(history_errs)[:-1,6].min(): # best_p = utils.unzip(tparams) # DOUBT bad_counter = 0 best_valid_err = valid_err uidx_best_valid_err = uidx print 'Saving to %s...'%save_dir, np.savez(save_dir+'model_best_so_far.npz', history_errs=history_errs) saver.save(sess, save_dir+'model_best_so_far.ckpt') utils.write_to_json(model_options, '%smodel_options.json'%save_dir) print 'Done' elif len(history_errs) > 1 and \ valid_err >= np.array(history_errs)[:-1,6].min(): bad_counter += 1 print 'history best ',np.array(history_errs)[:,6].min() print 'bad_counter ',bad_counter print 'patience ',patience if bad_counter > patience: print 'Early Stop!' 
estop = True break if test_B4>0.52 and test_meteor>0.32: print 'Saving to %s...'%save_dir, np.savez( save_dir+'model_'+str(uidx)+'.npz', history_errs=history_errs) saver.save(sess, save_dir+'model_'+str(uidx)+'.ckpt') print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec'%(time.time() - t0_valid) # end of validation if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f'%( n_samples, np.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' print 'stopped at epoch %d, minibatch %d, '\ 'current Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,np.mean(train_err),np.mean(valid_err),np.mean(test_err)) if history_errs != []: history = np.asarray(history_errs) best_valid_idx = history[:,6].argmin() np.savetxt(save_dir+'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] np.savez( save_dir+'model_train_end.npz', history_errs=history_errs) saver.save(sess, save_dir+'model_train_end.ckpt') return
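# --- Illustrative sketch (assumes TensorFlow 1.x, as the train() above does):
# the clipped Adadelta update built above with compute_gradients /
# clip_by_global_norm / apply_gradients, reduced to a toy variable and loss so
# it runs on its own. The toy loss and variable are made up.
import tensorflow as tf

clip_c = 10.
w = tf.Variable([1.0, -2.0], name='w')
COST = tf.reduce_sum(w ** 2)                      # stand-in for the model cost

optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-6)
gradients, variables = zip(*optimizer.compute_gradients(loss=COST, var_list=[w]))
gradients, global_norm = tf.clip_by_global_norm(gradients, clip_c)
TRAIN_OP = optimizer.apply_gradients(zip(gradients, variables))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(3):
        _, c = sess.run([TRAIN_OP, COST])
        print 'cost', c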
def train( dim_word=100, # word vector dimensionality dim=1000, # the number of GRU units encoder='gru', patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 weight decay penalty lrate=0.01, n_words=100000, # vocabulary size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz', valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok', dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl', use_dropout=False, reload_=False): # Model options model_options = locals().copy() # load dictionary with open(dictionary, 'rb') as f: worddicts = pkl.load(f) # invert dictionary worddicts_r = dict() for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print 'Loading data' train = TextIterator(dataset, dictionary, n_words_source=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_dataset, dictionary, n_words_source=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) # create shared variables for parameters tparams = init_tparams(params) # build the symbolic computational graph trng, use_noise, \ x, x_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask] print 'Buliding sampler' f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size if sampleFreq == -1: sampleFreq = len(train[0]) / batch_size f = open('workfile', 'w') # Training loop # generate some samples with the model and display them for jj in xrange(5000): sample, score = gen_sample(tparams, f_next, model_options, trng=trng, maxlen=30, argmax=False) print 'Sample ', jj, ': ', ss = sample for vv in ss: if vv == 0: break if vv in worddicts_r: print worddicts_r[vv], f.write(worddicts_r[vv]) f.write(' ') else: print 'UNK', print f.write('UNK') f.write(' ') print '\n' f.write('\n') f.close()
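# --- Illustrative sketch: how the sampling loop above turns a list of word ids
# back into text with the inverted dictionary (worddicts_r), stopping at the
# end-of-sentence id 0 and printing 'UNK' for ids outside the vocabulary.
# The toy vocabulary below is made up.
worddicts = {'the': 1, 'cat': 2, 'sat': 3}
worddicts_r = dict((vv, kk) for kk, vv in worddicts.iteritems())

def ids_to_words(sample, worddicts_r):
    words = []
    for vv in sample:
        if vv == 0:          # 0 is the eos marker in these scripts
            break
        words.append(worddicts_r.get(vv, 'UNK'))
    return words

print ' '.join(ids_to_words([1, 2, 3, 7, 0, 2], worddicts_r))  # -> 'the cat sat UNK'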
def train(dim_word=100, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', decoder='gru_cond', n_words_src=30000, n_words=30000, patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=1., # learning rate maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, saveto='model.npz', saveFreq=1000, # save the parameters after every saveFreq updates datasets=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], picked_train_idxes_file=r'', use_dropout=False, reload_=False, overwrite=False, preload='', sort_by_len=False, convert_embedding=True, dump_before_train=False, ): # Model options model_options = locals().copy() if reload_: lrate *= 0.5 # load dictionaries and invert them # reload options if reload_ and os.path.exists(preload): print 'Reloading model options' with open(r'.\model\en2fr.iter160000.npz.pkl', 'rb') as f: model_options = pkl.load(f) print 'Configuration from fy' vocab_en_filename = './data/dic/en2fr_en_vocabs_top1M.pkl' vocab_fr_filename = './data/dic/en2fr_fr_vocabs_top1M.pkl' map_filename = './data/dic/mapFullVocab2Top1MVocab.pkl' lr_discount_freq = 80000 print 'Done' print 'Loading data' text_iterator = TextIterator( datasets[0], datasets[1], vocab_en_filename, vocab_fr_filename, batch_size, maxlen, n_words_src, n_words, ) # sys.stdout.flush() # train_data_x = pkl.load(open(datasets[0], 'rb')) # train_data_y = pkl.load(open(datasets[1], 'rb')) # # if len(picked_train_idxes_file) != 0: # picked_idxes = pkl.load(open(picked_train_idxes_file, 'rb')) # train_data_x = [train_data_x[id] for id in picked_idxes] # train_data_y = [train_data_y[id] for id in picked_idxes] # # print 'Total train:', len(train_data_x) # print 'Max len:', max([len(x) for x in train_data_x]) # sys.stdout.flush() # # if sort_by_len: # slen = np.array([len(s) for s in train_data_x]) # sidx = slen.argsort() # # _sbuf = [train_data_x[i] for i in sidx] # _tbuf = [train_data_y[i] for i in sidx] # # train_data_x = _sbuf # train_data_y = _tbuf # print len(train_data_x[0]), len(train_data_x[-1]) # sys.stdout.flush() # train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=False) # else: # train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=True) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(preload): print 'Reloading model parameters' params = load_params(preload, params) # for k, v in params.iteritems(): # print '>', k, v.shape, v.dtype # Only convert parameters when reloading if convert_embedding: # ================= # Convert input and output embedding parameters with a exist word embedding # ================= print 'Convert input and output embedding' temp_Wemb = params['Wemb'] orig_emb_mean = np.mean(temp_Wemb, axis=0) params['Wemb'] = np.tile(orig_emb_mean, [params['Wemb'].shape[0], 1]) # Load vocabulary map dicts and do mapping with open(map_filename, 'rb') as map_file: map_en = pkl.load(map_file) map_fr = pkl.load(map_file) for full, top in map_en.iteritems(): emb_size = temp_Wemb.shape[0] if full < emb_size and top < emb_size: params['Wemb'][top] = temp_Wemb[full] print 'Convert input embedding done' temp_ff_logit_W = params['ff_logit_W'] temp_Wemb_dec = params['Wemb_dec'] temp_b 
= params['ff_logit_b'] orig_ff_logit_W_mean = np.mean(temp_ff_logit_W, axis=1) orig_Wemb_dec_mean = np.mean(temp_Wemb_dec, axis=0) orig_b_mean = np.mean(temp_b) params['ff_logit_W'] = np.tile(orig_ff_logit_W_mean, [params['ff_logit_W'].shape[1], 1]).T params['ff_logit_b'].fill(orig_b_mean) params['Wemb_dec'] = np.tile(orig_Wemb_dec_mean, [params['Wemb_dec'].shape[0], 1]) for full, top in map_en.iteritems(): emb_size = temp_Wemb.shape[0] if full < emb_size and top < emb_size: params['ff_logit_W'][:, top] = temp_ff_logit_W[:, full] params['ff_logit_b'][top] = temp_b[full] params['Wemb_dec'][top] = temp_Wemb[full] print 'Convert output embedding done' # for k, v in params.iteritems(): # print '>', k, v.shape, v.dtype # ================ # End Convert # ================ tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost, x_emb = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) f_x_emb = theano.function([x, x_mask], x_emb, profile=profile) print 'Done' sys.stdout.flush() cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(np.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' sys.stdout.flush() # apply gradient clipping here if clip_c > 0.: g2 = 0. for g in grads: g2 += (g ** 2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' best_p = None bad_counter = 0 uidx = 0 if reload_: m = re.search('.+iter(\d+?)\.npz', preload) if m: uidx = int(m.group(1)) print 'uidx', uidx, 'l_rate', lrate estop = False history_errs = [] # reload history if dump_before_train: print 'Dumping before train...', saveto_uidx = '{}.iter{}.npz'.format( os.path.splitext(saveto)[0], uidx) np.savez(saveto_uidx, history_errs=history_errs, uidx=uidx, **unzip(tparams)) print 'Done' if saveFreq == -1: saveFreq = len(train[0]) / batch_size for eidx in xrange(max_epochs): n_samples = 0 # for i, batch_idx in train_batch_idx: # # x = [train_data_x[id] for id in batch_idx] # y = [train_data_y[id] for id in batch_idx] for i, (x, y) in enumerate(text_iterator): n_samples += len(x) uidx += 1 use_noise.set_value(1.) 
x, x_mask, y, y_mask = prepare_data(x, y) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask, y, y_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if np.isnan(cost) or np.isinf(cost): print 'NaN detected' return 1., 1., 1. # discount reward if lr_discount_freq > 0 and np.mod(uidx, lr_discount_freq) == 0: lrate *= 0.5 print 'Discount learning rate to {} at iteration {}'.format(lrate, uidx) # verbose if np.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud sys.stdout.flush() if np.mod(uidx, saveFreq) == 0: # save with uidx if not overwrite: # print 'Saving the model at iteration {}...'.format(uidx), saveto_uidx = '{}.iter{}.npz'.format( os.path.splitext(saveto)[0], uidx) np.savez(saveto_uidx, history_errs=history_errs, uidx=uidx, **unzip(tparams)) # print 'Done' # sys.stdout.flush() # generate some samples with the model and display them # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) return 0.
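# --- Illustrative sketch: the two bookkeeping rules from the loop above, namely
# halving the learning rate every `lr_discount_freq` updates and writing a
# numbered checkpoint every `saveFreq` updates. The parameter dict here is a
# dummy stand-in for unzip(tparams); the frequencies are made-up small values.
import os
import numpy as np

lrate, lr_discount_freq, saveFreq = 1.0, 4, 3
saveto = 'model.npz'
params = {'Wemb': np.zeros((5, 3), dtype='float32')}   # dummy parameters
history_errs = []

for uidx in xrange(1, 10):
    if lr_discount_freq > 0 and np.mod(uidx, lr_discount_freq) == 0:
        lrate *= 0.5
        print 'Discount learning rate to {} at iteration {}'.format(lrate, uidx)
    if np.mod(uidx, saveFreq) == 0:
        saveto_uidx = '{}.iter{}.npz'.format(os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs, uidx=uidx, **params)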
def train(worker, model_options, data_options, patience, # early stopping patience max_epochs, finish_after, # finish after this many updates decay_c, # L2 regularization penalty alpha_c, # alignment regularization clip_c, # gradient clipping threshold lrate, # learning rate optimizer, saveto, valid_freq, train_len, valid_sync, save_freq, # save the parameters after every saveFreq updates sample_freq, # generate some samples after every sampleFreq control_port, batch_port, log_port, reload_): LOGGER.info('Connecting to data socket ({}) and loading validation data' .format(batch_port)) worker.init_mb_sock(batch_port) _, _, valid_stream = load_data(**data_options) LOGGER.info('Building model') params = init_params(model_options) # reload parameters experiment_id = worker.send_req('experiment_id') model_filename = '{}.model.npz'.format(experiment_id) saveto_filename = '{}.npz'.format(saveto) if reload_ and os.path.exists(saveto_filename): LOGGER.info('Loading parameters from {}'.format(saveto_filename)) params = load_params(saveto_filename, params) LOGGER.info('Initializing parameters') tparams = init_tparams(params) alpha = worker.send_req('alpha') worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(alpha)) # use_noise is for dropout trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] LOGGER.info('Building sampler') f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer LOGGER.info('Building f_log_probs') f_log_probs = theano.function(inps, cost, profile=False) cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in six.iteritems(tparams): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((tensor.cast( y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean() cost += alpha_reg # Not used? # after all regularizers - compile the computational graph for cost # LOGGER.info('Building f_cost') # f_cost = theano.function(inps, cost, profile=False) LOGGER.info('Computing gradient') grads = tensor.grad(cost, wrt=itemlist(tparams)) # apply gradient clipping here if clip_c > 0.: g2 = 0. for g in grads: g2 += (g ** 2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt( g2) * clip_c, g)) grads = new_grads # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') LOGGER.info('Building optimizers') f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) LOGGER.info('Optimization') log = RemoteLogger(port=log_port) train_start = time.clock() best_p = None # Making sure that the worker start training with the most recent params worker.copy_to_local() uidx = 0 while True: step = worker.send_req('next') LOGGER.debug('Received command: {}'.format(step)) if step == 'train': use_noise.set_value(1.) 
for i in xrange(train_len): x, x_mask, y, y_mask = worker.recv_mb() uidx += 1 log_entry = {'iteration': uidx} # compute cost, grads and copy grads to shared variables update_start = time.clock() cost = f_grad_shared(x, x_mask, y, y_mask) f_update(lrate) log_entry['cost'] = float(cost) log_entry['average_source_length'] = \ float(x_mask.sum(0).mean()) log_entry['average_target_length'] = \ float(y_mask.sum(0).mean()) log_entry['update_time'] = time.clock() - update_start log_entry['train_time'] = time.clock() - train_start log_entry['time'] = time.time() log.log(log_entry) step = worker.send_req({'done': train_len}) LOGGER.debug("Syncing with global params") worker.sync_params(synchronous=True) if step == 'valid': if valid_sync: worker.copy_to_local() use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, model_options, valid_stream) valid_err = float(valid_errs.mean()) res = worker.send_req({'valid_err': valid_err}) log.log({'validation_cost': valid_err, 'train_time': time.clock() - train_start, 'time': time.time()}) if res == 'best' and saveto: best_p = unzip(tparams) save_params(best_p, model_filename, saveto_filename) if valid_sync: worker.copy_to_local() if step == 'stop': break # Release all shared ressources. worker.close()
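# --- Illustrative sketch: the controller/worker protocol that drives the EASGD
# train() above, with a stub in place of the real platoon worker so the control
# flow can be read (and run) on its own. The stub class and its canned replies
# are made up; only the 'train' / 'valid' / 'stop' commands mirror the code above,
# and the real loop reassigns `step` from the {'done': train_len} response.
class StubWorker(object):
    def __init__(self):
        self._steps = ['train', 'valid', 'train', 'stop']
    def send_req(self, req):
        if req == 'next':
            return self._steps.pop(0)
        return 'ok'                       # e.g. acknowledging {'done': train_len}
    def sync_params(self, synchronous=True):
        pass                              # would merge with the central params

worker = StubWorker()
while True:
    step = worker.send_req('next')
    if step == 'train':
        # ... run `train_len` minibatches with f_grad_shared / f_update ...
        worker.send_req({'done': 1})
        worker.sync_params(synchronous=True)
    if step == 'valid':
        # ... compute the validation cost with f_log_probs ...
        pass
    if step == 'stop':
        break
print 'worker finished'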
def train(dim_word_desc=400,# word vector dimensionality dim_word_q=400, dim_word_ans=600, dim_proj=300, dim=400,# the number of LSTM units encoder_desc='lstm', encoder_desc_word='lstm', encoder_desc_sent='lstm', use_dq_sims=False, eyem=None, learn_h0=False, use_desc_skip_c_g=False, debug=False, encoder_q='lstm', patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., clip_c=-1., lrate=0.01, n_words_q=49145, n_words_desc=115425, n_words_ans=409, pkl_train_files=None, pkl_valid_files=None, maxlen=2000, # maximum length of the description optimizer='rmsprop', batch_size=2, vocab=None, valid_batch_size=16, use_elu_g=False, saveto='model.npz', model_dir=None, ms_nlayers=3, validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates datasets=[None], truncate=400, momentum=0.9, use_bidir=False, cost_mask=None, valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5', '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'], dropout_rate=0.5, use_dropout=True, reload_=True, **opt_ds): ensure_dir_exists(model_dir) mpath = os.path.join(model_dir, saveto) mpath_best = os.path.join(model_dir, prfx("best", saveto)) mpath_last = os.path.join(model_dir, prfx("last", saveto)) mpath_stats = os.path.join(model_dir, prfx("stats", saveto)) # Model options model_options = locals().copy() model_options['use_sent_reps'] = opt_ds['use_sent_reps'] stats = defaultdict(list) del model_options['eyem'] del model_options['cost_mask'] if cost_mask is not None: cost_mask = sharedX(cost_mask) # reload options and parameters if reload_: print "Reloading the model." if os.path.exists(mpath_best): print "Reloading the best model from %s." % mpath_best with open(os.path.join(mpath_best, '%s.pkl' % mpath_best), 'rb') as f: models_options = pkl.load(f) params = init_params(model_options) params = load_params(mpath_best, params) elif os.path.exists(mpath): print "Reloading the model from %s." % mpath with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f: models_options = pkl.load(f) params = init_params(model_options) params = load_params(mpath, params) else: raise IOError("Couldn't open the file.") else: print "Couldn't reload the models initializing from scratch." 
params = init_params(model_options) if datasets[0]: print "Short dataset", datasets[0] print 'Loading data' print 'Building model' if pkl_train_files is None or pkl_valid_files is None: train, valid, test = load_data(path=datasets[0], valid_path=valid_datasets[0], test_path=valid_datasets[1], batch_size=batch_size, **opt_ds) else: train, valid, test = load_pkl_data(train_file_paths=pkl_train_files, valid_file_paths=pkl_valid_files, batch_size=batch_size, vocab=vocab, eyem=eyem, **opt_ds) tparams = init_tparams(params) trng, use_noise, inps_d, \ opt_ret, \ cost, errors, ent_errors, ent_derrors, probs = \ build_model(tparams, model_options, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, valid, cost_mask=cost_mask) alphas = opt_ret['dec_alphas'] if opt_ds['use_sent_reps']: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], inps_d['slen'], inps_d['qlen'],\ inps_d['ent_mask'] ] else: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], \ inps_d['qlen'], \ inps_d['ent_mask']] outs = [cost, errors, probs, alphas] if ent_errors: outs += [ent_errors] if ent_derrors: outs += [ent_derrors] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, outs, profile=profile) print 'Done' # Apply weight decay on the feed-forward connections if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): if "logit" in kk or "ff" in kk: weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Computing gradient...', grads = safe_grad(cost, itemlist(tparams)) print 'Done' # Gradient clipping: if clip_c > 0.: g2 = get_norms(grads) for p, g in grads.iteritems(): grads[p] = tensor.switch(g2 > (clip_c**2), (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g) inps.pop() if optimizer.lower() == "adasecant": learning_rule = Adasecant(delta_clip=25.0, use_adagrad=True, grad_clip=0.25, gamma_clip=0.) elif optimizer.lower() == "rmsprop": learning_rule = RMSPropMomentum(init_momentum=momentum) elif optimizer.lower() == "adam": learning_rule = Adam() elif optimizer.lower() == "adadelta": learning_rule = AdaDelta() lr = tensor.scalar(name='lr') print 'Building optimizers...', learning_rule = None if learning_rule: f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr, grads=grads, inp=inps, cost=cost, errors=errors) else: f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, errors) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(mpath): history_errs = list(numpy.load(mpath)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size best_found = False uidx = 0 estop = False train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() for eidx in xrange(max_epochs): n_samples = 0 if train.done: train.reset() for d_, q_, a, em in train: n_samples += len(a) uidx += 1 use_noise.set_value(1.) if opt_ds['use_sent_reps']: # To mask the description and the question. 
d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(d_, q_) if d is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask, q, q_mask, a, dlen, slen, qlen) else: d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_) if d is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask, q, q_mask, a, dlen, qlen) upnorm = f_update(lrate) ud = time.time() - ud_start # Collect the running ave train stats. train_cost_ave = running_ave(train_cost_ave, cost) train_err_ave = running_ave(train_err_ave, errors) train_gnorm_ave = running_ave(train_gnorm_ave, gnorm) if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' import ipdb; ipdb.set_trace() if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, ' Update ', uidx, \ ' Cost ', cost, ' UD ', ud, \ ' UpNorm ', upnorm[0].tolist(), \ ' GNorm ', gnorm, \ ' Pnorm ', pnorm, 'Terrors ', errors if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None and best_found: numpy.savez(mpath_best, history_errs=history_errs, **best_p) pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb')) else: params = unzip(tparams) numpy.savez(mpath, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % mpath, 'wb')) pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb')) print 'Done' print_param_norms(tparams) if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) if valid.done: valid.reset() valid_costs, valid_errs, valid_probs, \ valid_alphas, error_ent, error_dent = eval_model(f_log_probs, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, model_options, valid, use_sent_rep=opt_ds['use_sent_reps']) valid_alphas_ = numpy.concatenate([va.argmax(0) for va in valid_alphas.tolist()], axis=0) valid_err = valid_errs.mean() valid_cost = valid_costs.mean() valid_alpha_ent = -negentropy(valid_alphas) mean_valid_alphas = valid_alphas_.mean() std_valid_alphas = valid_alphas_.std() mean_valid_probs = valid_probs.argmax(1).mean() std_valid_probs = valid_probs.argmax(1).std() history_errs.append([valid_cost, valid_err]) stats['train_err_ave'].append(train_err_ave) stats['train_cost_ave'].append(train_cost_ave) stats['train_gnorm_ave'].append(train_gnorm_ave) stats['valid_errs'].append(valid_err) stats['valid_costs'].append(valid_cost) stats['valid_err_ent'].append(error_ent) stats['valid_err_desc_ent'].append(error_dent) stats['valid_alphas_mean'].append(mean_valid_alphas) stats['valid_alphas_std'].append(std_valid_alphas) stats['valid_alphas_ent'].append(valid_alpha_ent) stats['valid_probs_mean'].append(mean_valid_probs) stats['valid_probs_std'].append(std_valid_probs) if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 1].min(): best_p = unzip(tparams) bad_counter = 0 best_found = True else: bst_found = False if numpy.isnan(valid_err): import ipdb; ipdb.set_trace() print "============================" print '\t>>>Valid error: ', valid_err, \ ' Valid cost: ', valid_cost print '\t>>>Valid pred mean: ', mean_valid_probs, \ ' Valid pred std: ', std_valid_probs print '\t>>>Valid alphas mean: ', mean_valid_alphas, \ ' Valid alphas std: ', std_valid_alphas, \ ' Valid alpha negent: ', valid_alpha_ent, \ ' Valid error ent: ', error_ent, \ ' Valid error desc ent: ', error_dent print "============================" print "Running average train stats " print '\t>>>Train error: ', train_err_ave, \ ' Train cost: 
', train_cost_ave, \ ' Train grad norm: ', train_gnorm_ave print "============================" train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid.reset() valid_cost, valid_error, valid_probs, \ valid_alphas, error_ent = eval_model(f_log_probs, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, model_options, valid, use_sent_rep=opt_ds['use_sent_reps']) print " Final eval results: " print 'Valid error: ', valid_error.mean() print 'Valid cost: ', valid_cost.mean() print '\t>>>Valid pred mean: ', valid_probs.mean(), \ ' Valid pred std: ', valid_probs.std(), \ ' Valid error ent: ', error_ent params = copy.copy(best_p) numpy.savez(mpath_last, zipped_params=best_p, history_errs=history_errs, **params) return valid_err, valid_cost
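# --- Illustrative sketch: the kind of running-average statistic kept above for
# the train cost, error and gradient norm. The real running_ave() and
# reset_train_vals() helpers are defined elsewhere, so the exponential moving
# average below is only an assumption about what they track; its 0.95/0.05 form
# copies the `train_error = train_error * 0.95 + cost * 0.05` update in the
# TensorFlow train() earlier in this file.
def exp_moving_average(ave, new_value, decay=0.95):
    if ave is None:                     # first minibatch initialises the average
        return new_value
    return decay * ave + (1. - decay) * new_value

train_cost_ave = None
for cost in [4.0, 3.5, 3.2, 3.0]:       # fake per-minibatch costs
    train_cost_ave = exp_moving_average(train_cost_ave, cost)
print 'running train cost:', train_cost_ave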
def train(options, data, load_params=False, start_epoc=0): print "OPTIONS: ", options print 'Setting up model with options:' options = set_defaults(options) for kk, vv in options.iteritems(): print kk, vv print "model seed: ", options['model_seed'] print "fold: ", options['fold'] print 'seed: ', options['seed'] rng = numpy.random.RandomState(options['model_seed'] + 100 * options.get('fold', 99) + options.get('seed', 99)) params, operators = init_params(options, rng) print 'done...' if load_params: loaded = load_par(options) start_epoc = resume_epoc(options) # Check that we've loaded the correct parameters... for kk, vv in loaded.iteritems(): assert params[kk].shape == vv.shape assert type(params[kk]) == type(vv) params = loaded tparams = init_tparams(params) trng, use_noise, inps, out = build_model(tparams, options, rng) y = tensor.imatrix('y') cost = nll(out, y) f_eval = theano.function([inps, y], cost, givens={use_noise: numpy.float32(0.)}, on_unused_input='ignore') reg = 0. for k, v in tparams.iteritems(): if k[:6] == 'hidden' or k[-3:] == 'W_h': reg += options['l1'] * tensor.sum(abs(v)) reg += options['l2'] * tensor.sum((v)**2) cost += reg grads = tensor.grad(cost, wrt=itemlist(tparams)) lr = tensor.scalar(name='lr', dtype=theano.config.floatX) opt = get_optim(options['opt']) print 'Compiling functions' f_grad_shared, f_update, gshared = opt(lr, tparams, grads, [inps, y], cost, use_noise) f_out = theano.function([inps], out, givens={use_noise: numpy.float32(0.)}, on_unused_input='ignore', allow_input_downcast=True) best = numpy.inf print 'Starting training' train = list_update(data[0], f_eval, options['batch_size'], rng=rng) test = list_update(data[-1], f_eval, options['batch_size'], rng=rng) starting = (train, test) print 'Pre-training. test: %f, train: %f' % (test, train) print 'Training' lr = options['lr'] max_itr = options['max_itr'] grad_norm = 0. train_scores = 50 * [0.] try: for epoch in xrange(max_itr): start_time = time.time() for g in gshared: # manually set gradients to 0 because we accumulate in list update g.set_value(0.0 * g.get_value()) use_noise.set_value(1.) train_cost, n_obs = list_update(data[0], f_grad_shared, batchsize=options['batch_size'], rng=rng, return_n_obs=True) use_noise.set_value(0.) for g in gshared: g.set_value(floatx(g.get_value() / float(n_obs))) f_update(lr) apply_proximity(tparams, operators) train = list_update(data[0], f_eval, options['batch_size'], rng=rng) elapsed_time = time.time() - start_time if train < best: # early stopping on training set test = list_update(data[-1], f_eval) best_par = unzip(tparams) best_perf = (train, test) best = train test = list_update(data[-1], f_eval) if (epoch % 50) == 0: # Save progress.... save_progress(options, tparams, epoch, best_perf) print 'Epoch: %d, cost: %f, train: %f, test: %f, lr:%f, time: %f' % ( epoch, train_cost, train, test, lr, elapsed_time) # Check if we're diverging... train_ave = running_ave(train_scores, train, epoch) if epoch > 1000: # Only exit if we're diverging after 1000 iterations if train_ave > 1.03 * best_perf[0]: print "Diverged..." break except KeyboardInterrupt: print "Interrupted" # check that we're outputing prob distributions X = data[0][(3, 3)][0] assert abs( f_out(X.reshape(X.shape[0], 2, 3, 3)).sum() - float(X.shape[0])) < 1e-4 print "Best performance:" print "train, test" print "%f,%f" % best_perf return best_perf, best_par
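# --- Illustrative sketch (Theano): the L1/L2 penalty assembled in the train()
# above, applied only to parameters whose names pass the same test as in that
# code. The parameter names, shapes and option values below are made up.
import numpy
import theano
import theano.tensor as tensor

options = {'l1': 1e-4, 'l2': 1e-4}
tparams = {
    'hidden_W': theano.shared(numpy.ones((3, 3), dtype='float32'), name='hidden_W'),
    'out_b': theano.shared(numpy.zeros((3,), dtype='float32'), name='out_b'),
}

reg = 0.
for k, v in tparams.iteritems():
    if k[:6] == 'hidden' or k[-3:] == 'W_h':       # same name test as above
        reg += options['l1'] * tensor.sum(abs(v))
        reg += options['l2'] * tensor.sum(v ** 2)

f_reg = theano.function([], reg)
print 'regularisation term:', f_reg()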
def train( dim_word=100, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', decoder='gru_cond', patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., diag_c=0., clip_c=-1., lrate=0.01, n_words_src=100000, n_words=100000, maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq updates datasets=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok' ], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl' ], use_dropout=False, reload_=False): # Model options model_options = locals().copy() worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) print 'Loading data' train = TextIterator(datasets[0], datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_datasets[0], valid_datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Buliding sampler' f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' print 'Building f_grad...', f_grad = theano.function(inps, grads, profile=profile) print 'Done' if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size if sampleFreq == -1: sampleFreq = len(train[0]) / batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 for x, y in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, n_words_src=n_words_src, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(numpy.minimum(5, x.shape[1])): stochastic = True sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=False) print 'Source ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: print worddicts_r[0][vv], else: print 'UNK', print print 'Truth ', jj, ' : ', for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= numpy.array( history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Valid ', valid_err print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
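# --- Illustrative sketch (Theano): the global-norm gradient clipping applied
# just before building the optimizer in the train() functions above, pulled out
# into a small helper with a toy quadratic cost so it runs on its own.
import numpy
import theano
import theano.tensor as tensor

def clip_by_global_norm(grads, clip_c):
    """Rescale all gradients when their joint L2 norm exceeds clip_c."""
    g2 = 0.
    for g in grads:
        g2 += (g ** 2).sum()
    return [tensor.switch(g2 > (clip_c ** 2),
                          g / tensor.sqrt(g2) * clip_c,
                          g) for g in grads]

w = theano.shared(numpy.asarray([3., 4.], dtype='float32'), name='w')
cost = (w ** 2).sum()
grads = tensor.grad(cost, wrt=[w])
clipped = clip_by_global_norm(grads, clip_c=1.)
f = theano.function([], clipped)
print 'clipped gradient:', f()[0]   # raw gradient [6, 8] has norm 10 -> rescaled to norm 1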
def train( dim_word_desc=400, # word vector dimensionality dim_word_q=400, dim_word_ans=600, dim_proj=300, dim=400, # the number of LSTM units encoder_desc='lstm', encoder_desc_word='lstm', encoder_desc_sent='lstm', use_dq_sims=False, eyem=None, learn_h0=False, use_desc_skip_c_g=False, debug=False, encoder_q='lstm', patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., clip_c=-1., lrate=0.01, n_words_q=49145, n_words_desc=115425, n_words_ans=409, pkl_train_files=None, pkl_valid_files=None, maxlen=2000, # maximum length of the description optimizer='rmsprop', batch_size=2, vocab=None, valid_batch_size=16, use_elu_g=False, saveto='model.npz', model_dir=None, ms_nlayers=3, validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates datasets=[None], truncate=400, momentum=0.9, use_bidir=False, cost_mask=None, valid_datasets=[ '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5', '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5' ], dropout_rate=0.5, use_dropout=True, reload_=True, **opt_ds): ensure_dir_exists(model_dir) mpath = os.path.join(model_dir, saveto) mpath_best = os.path.join(model_dir, prfx("best", saveto)) mpath_last = os.path.join(model_dir, prfx("last", saveto)) mpath_stats = os.path.join(model_dir, prfx("stats", saveto)) # Model options model_options = locals().copy() model_options['use_sent_reps'] = opt_ds['use_sent_reps'] stats = defaultdict(list) del model_options['eyem'] del model_options['cost_mask'] if cost_mask is not None: cost_mask = sharedX(cost_mask) # reload options and parameters if reload_: print "Reloading the model." if os.path.exists(mpath_best): print "Reloading the best model from %s." % mpath_best with open(os.path.join(mpath_best, '%s.pkl' % mpath_best), 'rb') as f: models_options = pkl.load(f) params = init_params(model_options) params = load_params(mpath_best, params) elif os.path.exists(mpath): print "Reloading the model from %s." % mpath with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f: models_options = pkl.load(f) params = init_params(model_options) params = load_params(mpath, params) else: raise IOError("Couldn't open the file.") else: print "Couldn't reload the models initializing from scratch." 
params = init_params(model_options) if datasets[0]: print "Short dataset", datasets[0] print 'Loading data' print 'Building model' if pkl_train_files is None or pkl_valid_files is None: train, valid, test = load_data(path=datasets[0], valid_path=valid_datasets[0], test_path=valid_datasets[1], batch_size=batch_size, **opt_ds) else: train, valid, test = load_pkl_data(train_file_paths=pkl_train_files, valid_file_paths=pkl_valid_files, batch_size=batch_size, vocab=vocab, eyem=eyem, **opt_ds) tparams = init_tparams(params) trng, use_noise, inps_d, \ opt_ret, \ cost, errors, ent_errors, ent_derrors, probs = \ build_model(tparams, model_options, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, valid, cost_mask=cost_mask) alphas = opt_ret['dec_alphas'] if opt_ds['use_sent_reps']: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], inps_d['slen'], inps_d['qlen'],\ inps_d['ent_mask'] ] else: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], \ inps_d['qlen'], \ inps_d['ent_mask']] outs = [cost, errors, probs, alphas] if ent_errors: outs += [ent_errors] if ent_derrors: outs += [ent_derrors] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, outs, profile=profile) print 'Done' # Apply weight decay on the feed-forward connections if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): if "logit" in kk or "ff" in kk: weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Computing gradient...', grads = safe_grad(cost, itemlist(tparams)) print 'Done' # Gradient clipping: if clip_c > 0.: g2 = get_norms(grads) for p, g in grads.iteritems(): grads[p] = tensor.switch(g2 > (clip_c**2), (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g) inps.pop() if optimizer.lower() == "adasecant": learning_rule = Adasecant(delta_clip=25.0, use_adagrad=True, grad_clip=0.25, gamma_clip=0.) elif optimizer.lower() == "rmsprop": learning_rule = RMSPropMomentum(init_momentum=momentum) elif optimizer.lower() == "adam": learning_rule = Adam() elif optimizer.lower() == "adadelta": learning_rule = AdaDelta() lr = tensor.scalar(name='lr') print 'Building optimizers...', learning_rule = None if learning_rule: f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr, grads=grads, inp=inps, cost=cost, errors=errors) else: f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, errors) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(mpath): history_errs = list(numpy.load(mpath)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size best_found = False uidx = 0 estop = False train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() for eidx in xrange(max_epochs): n_samples = 0 if train.done: train.reset() for d_, q_, a, em in train: n_samples += len(a) uidx += 1 use_noise.set_value(1.) if opt_ds['use_sent_reps']: # To mask the description and the question. 
d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents( d_, q_) if d is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost, errors, gnorm, pnorm = f_grad_shared( d, d_mask, q, q_mask, a, dlen, slen, qlen) else: d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_) if d is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost, errors, gnorm, pnorm = f_grad_shared( d, d_mask, q, q_mask, a, dlen, qlen) upnorm = f_update(lrate) ud = time.time() - ud_start # Collect the running ave train stats. train_cost_ave = running_ave(train_cost_ave, cost) train_err_ave = running_ave(train_err_ave, errors) train_gnorm_ave = running_ave(train_gnorm_ave, gnorm) if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' import ipdb ipdb.set_trace() if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, ' Update ', uidx, \ ' Cost ', cost, ' UD ', ud, \ ' UpNorm ', upnorm[0].tolist(), \ ' GNorm ', gnorm, \ ' Pnorm ', pnorm, 'Terrors ', errors if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None and best_found: numpy.savez(mpath_best, history_errs=history_errs, **best_p) pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb')) else: params = unzip(tparams) numpy.savez(mpath, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % mpath, 'wb')) pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb')) print 'Done' print_param_norms(tparams) if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) if valid.done: valid.reset() valid_costs, valid_errs, valid_probs, \ valid_alphas, error_ent, error_dent = eval_model(f_log_probs, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, model_options, valid, use_sent_rep=opt_ds['use_sent_reps']) valid_alphas_ = numpy.concatenate( [va.argmax(0) for va in valid_alphas.tolist()], axis=0) valid_err = valid_errs.mean() valid_cost = valid_costs.mean() valid_alpha_ent = -negentropy(valid_alphas) mean_valid_alphas = valid_alphas_.mean() std_valid_alphas = valid_alphas_.std() mean_valid_probs = valid_probs.argmax(1).mean() std_valid_probs = valid_probs.argmax(1).std() history_errs.append([valid_cost, valid_err]) stats['train_err_ave'].append(train_err_ave) stats['train_cost_ave'].append(train_cost_ave) stats['train_gnorm_ave'].append(train_gnorm_ave) stats['valid_errs'].append(valid_err) stats['valid_costs'].append(valid_cost) stats['valid_err_ent'].append(error_ent) stats['valid_err_desc_ent'].append(error_dent) stats['valid_alphas_mean'].append(mean_valid_alphas) stats['valid_alphas_std'].append(std_valid_alphas) stats['valid_alphas_ent'].append(valid_alpha_ent) stats['valid_probs_mean'].append(mean_valid_probs) stats['valid_probs_std'].append(std_valid_probs) if uidx == 0 or valid_err <= numpy.array( history_errs)[:, 1].min(): best_p = unzip(tparams) bad_counter = 0 best_found = True else: bst_found = False if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print "============================" print '\t>>>Valid error: ', valid_err, \ ' Valid cost: ', valid_cost print '\t>>>Valid pred mean: ', mean_valid_probs, \ ' Valid pred std: ', std_valid_probs print '\t>>>Valid alphas mean: ', mean_valid_alphas, \ ' Valid alphas std: ', std_valid_alphas, \ ' Valid alpha negent: ', valid_alpha_ent, \ ' Valid error ent: ', error_ent, \ ' Valid error desc ent: ', error_dent print "============================" print "Running average train stats " print '\t>>>Train error: ', train_err_ave, \ ' Train 
cost: ', train_cost_ave, \ ' Train grad norm: ', train_gnorm_ave print "============================" train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid.reset() valid_cost, valid_error, valid_probs, \ valid_alphas, error_ent = eval_model(f_log_probs, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, model_options, valid, use_sent_rep=opt_ds['use_sent_reps']) print " Final eval results: " print 'Valid error: ', valid_error.mean() print 'Valid cost: ', valid_cost.mean() print '\t>>>Valid pred mean: ', valid_probs.mean(), \ ' Valid pred std: ', valid_probs.std(), \ ' Valid error ent: ', error_ent params = copy.copy(best_p) numpy.savez(mpath_last, zipped_params=best_p, history_errs=history_errs, **params) return valid_err, valid_cost
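# --- Illustrative sketch: the best-model test in the validation block above,
# where history_errs stores [valid_cost, valid_err] rows and a model counts as
# the new best when its error matches or beats the minimum of column 1.
# The validation numbers are made up; the real code then snapshots
# best_p = unzip(tparams) and resets bad_counter.
import numpy

history_errs = []
for uidx, (valid_cost, valid_err) in enumerate([(3.1, 0.52), (2.8, 0.47), (2.9, 0.49)]):
    history_errs.append([valid_cost, valid_err])
    best_found = (uidx == 0 or
                  valid_err <= numpy.array(history_errs)[:, 1].min())
    print 'valid_err', valid_err, 'best_found', best_found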
def train(experiment_id, model_options, data_options, validation_options,
          patience,  # early stopping patience
          max_epochs,
          finish_after,  # finish after this many updates
          decay_c,  # L2 regularization penalty
          alpha_c,  # alignment regularization
          clip_c,  # gradient clipping threshold
          lrate,  # learning rate
          optimizer,
          saveto,
          valid_freq,
          eval_intv,  # time interval for evaluation in minutes
          save_freq,  # save the parameters after every saveFreq updates
          sample_freq,  # generate some samples after every sampleFreq
          reload_=False):

    worddicts_r, train_stream, valid_stream = load_data(**data_options)

    LOGGER.info('Building model')
    params = init_params(model_options)

    # reload parameters
    model_filename = '{}.model.npz'.format(experiment_id)
    model_option_filename = '{}.config.json'.format(experiment_id)
    saveto_filename = '{}.npz'.format(saveto)
    if reload_ and os.path.exists(saveto_filename):
        LOGGER.info('Loading parameters from {}'.format(saveto_filename))
        params = load_params(saveto_filename, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)

    # use_noise is for dropout
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    LOGGER.info('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    LOGGER.info('Building f_log_probs')
    f_log_probs = theano.function(inps, cost, profile=False)

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in six.iteritems(tparams):
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # Not used?
    # after all regularizers - compile the computational graph for cost
    # LOGGER.info('Building f_cost')
    # f_cost = theano.function(inps, cost, profile=False)

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    LOGGER.info('Building optimizers')
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    LOGGER.info('Optimization')

    log = Logger(filename='{}.log.jsonl.gz'.format(experiment_id))

    # evaluation score will be stored into the following queue
    valid_ret_queue = Queue.Queue()
    process_queue = Queue.Queue()

    rt = prepare_validation_timer(tparams, process_queue, model_filename,
                                  model_option_filename, eval_intv,
                                  valid_ret_queue, **validation_options)
    rt.start()

    def _timer_signal_handler(signum, frame):
        LOGGER.info('Received SIGINT')
        LOGGER.info('Now attempting to stop the timer')
        rt.stop()

        LOGGER.info('Please wait for terminating all child processes')
        while not process_queue.empty():
            proc = process_queue.get()
            if proc.poll() is None:  # check if the process has terminated
                # child process is still working
                # LOGGER.info('Attempt to kill', proc.pid)
                # terminate it by sending an interrupt signal
                proc.send_signal(signal.SIGINT)
                # wait for child process while avoiding deadlock
                # ignore outputs
                proc.communicate()

        sys.exit(130)

    signal.signal(signal.SIGINT, _timer_signal_handler)

    train_start = time.clock()
    best_p = None
    best_score = 0
    bad_counter = 0

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, x_mask, y, y_mask in train_stream.get_epoch_iterator():
            n_samples += len(x)
            x, x_mask, y, y_mask = x.T, x_mask.T, y.T, y_mask.T

            use_noise.set_value(1.)
            uidx += 1
            log_entry = {'iteration': uidx, 'epoch': eidx}

            # compute cost, grads and copy grads to shared variables
            update_start = time.clock()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)

            log_entry['cost'] = float(cost)
            log_entry['average_source_length'] = float(x_mask.sum(0).mean())
            log_entry['average_target_length'] = float(y_mask.sum(0).mean())
            log_entry['update_time'] = time.clock() - update_start
            log_entry['train_time'] = time.clock() - train_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if not numpy.isfinite(cost):
                LOGGER.error('NaN detected')
                return 1., 1., 1.

            # save the best model so far
            if numpy.mod(uidx, save_freq) == 0:
                LOGGER.info('Saving best model so far')

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)

                # save params to exp_id.npz and symlink model.npz to it
                save_params(params, model_filename, saveto_filename)

            # generate some samples with the model and display them
            if numpy.mod(uidx, sample_freq) == 0:
                # FIXME: random selection?
                log_entry['samples'] = []
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    log_entry['samples'].append({'source': '',
                                                 'truth': '',
                                                 'sample': ''})
                    stochastic = True
                    sample, _, score = gen_sample(tparams, f_init, f_next,
                                                  x[:, jj][:, None],
                                                  model_options, trng=trng,
                                                  k=1, maxlen=30,
                                                  stochastic=stochastic,
                                                  argmax=False)
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            token = worddicts_r[0][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['source'] += token + ' '
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            token = worddicts_r[1][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['truth'] += token + ' '
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            token = worddicts_r[1][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['sample'] += token + ' '

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, valid_freq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, model_options,
                                        valid_stream)
                valid_err = valid_errs.mean()

                log_entry['validation_cost'] = float(valid_err)

                if not numpy.isfinite(valid_err):
                    raise RuntimeError('NaN detected in validation error')

            # collect validation scores (e.g., BLEU) from the child thread
            if not valid_ret_queue.empty():
                (ret_model, scores) = valid_ret_queue.get()

                valid_bleu = scores[0]
                # LOGGER.info('BLEU on the validation set: %.2f' % valid_bleu)
                log_entry['validation_bleu'] = valid_bleu

                if valid_bleu > best_score:
                    best_p = ret_model
                    best_score = valid_bleu
                    bad_counter = 0
                else:
                    bad_counter += 1
                    if bad_counter > patience:
                        estop = True
                        break

            # finish after this many updates
            if uidx >= finish_after:
                LOGGER.info('Finishing after {} iterations'.format(uidx))
                estop = True
                break

            log.log(log_entry)

        LOGGER.info('Completed epoch, seen {} samples'.format(n_samples))

        if estop:
            log.log(log_entry)
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    LOGGER.info('Calculating validation cost')
    valid_err = pred_probs(f_log_probs, model_options, valid_stream).mean()

    if not best_p:
        best_p = unzip(tparams)

    params = copy.copy(best_p)
    save_params(params, model_filename, saveto_filename)

    rt.stop()

    return valid_err
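save_params is referenced above but not defined in this excerpt; the inline comment ("save params to exp_id.npz and symlink model.npz to it") suggests roughly the following behavior. This is a minimal sketch under that assumption, not the project's actual helper:

# Hypothetical sketch of save_params; assumes `params` is a dict of
# name -> numpy ndarray, as produced by unzip(tparams).
import os
import numpy

def save_params(params, model_filename, saveto_filename):
    # Write the parameter archive.
    numpy.savez(saveto_filename, **params)
    # Re-point the stable model filename at the freshly written archive.
    if os.path.lexists(model_filename):
        os.remove(model_filename)
    os.symlink(os.path.abspath(saveto_filename), model_filename)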
def train():
    if prm.optimizer.lower() == 'adam':
        optimizer = adam
    elif prm.optimizer.lower() == 'sgd':
        optimizer = sgd
    elif prm.optimizer.lower() == 'rmsprop':
        optimizer = rmsprop
    elif prm.optimizer.lower() == 'adadelta':
        optimizer = adadelta

    options = locals().copy()

    print 'parameters:', str(options)
    prm_k = vars(prm).keys()
    prm_d = vars(prm)
    prm_k.sort()
    for x in prm_k:
        if not x.startswith('__'):
            print x, '=', prm_d[x]

    print 'loading Vocabulary...'
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab

    options['vocabinv'] = {}
    for k, v in vocab.items():
        options['vocabinv'][v] = k
    print options

    print 'Loading Environment...'
    if prm.engine.lower() == 'lucene':
        import lucene_search
        options['engine'] = lucene_search.LuceneSearch()
    elif prm.engine.lower() == 'elastic':
        import elastic_search
        options['engine'] = elastic_search.ElasticSearch()

    print 'Loading Dataset...'
    dh5 = dataset_hdf5.DatasetHDF5(prm.dataset_path)
    qi_train = dh5.get_queries(dset='train')
    dt_train = dh5.get_doc_ids(dset='train')
    qi_valid = dh5.get_queries(dset='valid')
    dt_valid = dh5.get_doc_ids(dset='valid')
    qi_test = dh5.get_queries(dset='test')
    dt_test = dh5.get_doc_ids(dset='test')

    if prm.train_size == -1:
        train_size = len(qi_train)
    else:
        train_size = min(prm.train_size, len(qi_train))

    if prm.valid_size == -1:
        valid_size = len(qi_valid)
    else:
        valid_size = min(prm.valid_size, len(qi_valid))

    if prm.test_size == -1:
        test_size = len(qi_test)
    else:
        test_size = min(prm.test_size, len(qi_test))

    print '%d train examples' % len(qi_train)
    print '%d valid examples' % len(qi_valid)
    print '%d test examples' % len(qi_test)

    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = utils.init_params(options)

    if prm.wordemb_path:
        print 'loading pre-trained word embeddings'
        params = utils.load_wemb(params, vocab)
        options['W'] = params['W']

    if prm.reload_model:
        utils.load_params(prm.reload_model, params)

    print 'Building model'
    # This creates Theano Shared Variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copies of the weights.
    tparams = utils.init_tparams(params)
    for kk, value in tparams.iteritems():
        tparams[kk] = theano.shared(value, name=kk)

    iin, out, updates, f_pred, consider_constant \
        = build_model(tparams, options)

    # get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems()
                            if kk not in exclude_params])

    grads = tensor.grad(out[0], wrt=utils.itemlist(tparams_),
                        consider_constant=consider_constant)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out,
                                        updates)

    history_errs = []
    best_p = None

    if prm.validFreq == -1:
        validFreq = len(qi_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(qi_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()

    print 'Optimization'

    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = utils.get_minibatches_idx(len(qi_train),
                                           prm.batch_size_train,
                                           shuffle=True)

            for _, train_index in kf:
                st = time.time()
                uidx += 1

                qi, qi_i, qi_lst, D_gt_id, D_gt_url = get_samples(
                    qi_train, dt_train, train_index, options)

                # share the current queries with the search engine.
                options['current_queries'] = qi_lst

                n_samples += len(qi)
                is_train = 1.

                out = f_grad_shared(qi_i, D_gt_id, is_train)
                cost = out.pop(0)
                cost_ent = out.pop(0)
                lr_t = f_update(prm.lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                print "options['reformulated_queries']", \
                    options['reformulated_queries']

                if np.mod(uidx, prm.dispFreq) == 0:
                    print '\n================================================================================'
                    print 'Epoch', eidx, 'Update', uidx, 'Cost', cost, \
                        'LR_t', lr_t
                    print 'Time Minibatch Update: ' + str(time.time() - st)
                    print 'Input Query: ', qi[0].replace('\n', '\\n')
                    print
                    print 'Target Docs: ', str(D_gt_url[0])
                    print
                    print 'Input Query Vocab: ', utils.idx2text(
                        qi_i[0], options['vocabinv'])

                    for ii in range(prm.n_iterations):
                        prob = out.pop(0)
                        ans = out.pop(0)
                        metrics = out.pop(0)
                        bl = out.pop(0)
                        cost_bl = out.pop(0)
                        D_id = out.pop(0)

                        print "prob", prob
                        print "ans", ans
                        print "bl", bl
                        print "cost_bl", cost_bl
                        print "D_id", D_id
                        print("current_queries",
                              len(options['current_queries']),
                              options['current_queries'])
                        print
                        print 'Iteration', ii
                        print 'Baseline Value', bl.mean(), 'Cost', cost_bl
                        print ' '.join(prm.metrics_map.keys())
                        print metrics.mean(0)
                        print
                        i = 7
                        print 'Retrieved Docs: ', str([
                            options['engine'].id_title_map[d_id]
                            for d_id in D_id[i]
                        ])
                        print
                        print 'Reformulated Query:', \
                            options['reformulated_queries'][ii][i]
                        print 'Current queries:', options['current_queries'][i]
                        print
                        print 'Query ANS: ',
                        for kk, word in enumerate(
                                options['current_queries'][i][:ans.shape[1]]):
                            print "kk, word", kk, word
                            if word not in options['vocab'] and word != '':
                                word += '<unk>'
                            if ans[0, kk] == 1:
                                word = word.upper()
                            print str(word),
                        print
                        print
                        print 'prob[:,:,0].max(1).mean(), prob[:,:,0].mean(), prob[:,:,0].min(1).mean()', \
                            prob[:, :, 0].max(1).mean(), \
                            prob[:, :, 0].mean(), \
                            prob[:, :, 0].min(1).mean()
                        print 'prob[:,:,1].max(1).mean(), prob[:,:,1].mean(), prob[:,:,1].min(1).mean()', \
                            prob[:, :, 1].max(1).mean(), \
                            prob[:, :, 1].mean(), \
                            prob[:, :, 1].min(1).mean()
                    print '==================================================================================\n'

                if np.mod(uidx, validFreq) == 0 or uidx == 1:
                    kf_train = utils.get_minibatches_idx(
                        len(qi_train), prm.batch_size_pred,
                        shuffle=True, max_samples=train_size)
                    kf_valid = utils.get_minibatches_idx(
                        len(qi_valid), prm.batch_size_pred,
                        shuffle=True, max_samples=valid_size)
                    kf_test = utils.get_minibatches_idx(
                        len(qi_test), prm.batch_size_pred,
                        shuffle=True, max_samples=test_size)

                    print '\nEvaluating - Training Set'
                    train_metrics = pred_error(f_pred, qi_train, dt_train,
                                               options, kf_train)
                    # exit()  # debug leftover: would stop training after the
                    #         # first training-set evaluation; kept commented out
                    print '\nEvaluating - Validation Set'
                    valid_metrics = pred_error(f_pred, qi_valid, dt_valid,
                                               options, kf_valid)
                    print '\nEvaluating - Test Set'
                    test_metrics = pred_error(f_pred, qi_test, dt_test,
                                              options, kf_test)

                    his = [train_metrics, valid_metrics, test_metrics]
                    history_errs.append(his)
                    metric_idx = prm.metrics_map[prm.reward.upper()]
                    if (uidx == 0 or
                            valid_metrics[-1, metric_idx] >=
                            np.array(history_errs)[:, 1, -1, metric_idx].max()):
                        best_p = utils.unzip(tparams)
                        bad_counter = 0

                    print '====================================================================================================='
                    print ' '.join(prm.metrics_map.keys())
                    print
                    print 'Train:'
                    print train_metrics
                    print
                    print 'Valid:'
                    print valid_metrics
                    print
                    print 'Test:'
                    print test_metrics
                    print
                    print '====================================================================================================='

                    if (len(history_errs) > prm.patience and
                            valid_metrics[-1, metric_idx] <=
                            np.array(history_errs)[:-prm.patience, 1, -1,
                                                   metric_idx].max()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = utils.unzip(tparams)

                    np.savez(prm.saveto, history_errs=history_errs, **params)
                    print 'Done'

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interrupted"

    return
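Both training loops above lean on a get_minibatches_idx-style helper to produce shuffled minibatch indices, optionally capped by max_samples; it is not defined in this excerpt. A minimal sketch, assuming that interface (a list of (minibatch_index, example_indices) pairs, as consumed by `for _, train_index in kf:`):

# Hypothetical sketch of the assumed minibatch-index helper.
import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False, max_samples=None):
    # Build the pool of example indices, optionally shuffled and capped.
    idx_list = np.arange(n, dtype='int64')
    if shuffle:
        np.random.shuffle(idx_list)
    if max_samples is not None:
        idx_list = idx_list[:max_samples]
    # Slice the pool into contiguous minibatches.
    minibatches = []
    for start in range(0, len(idx_list), minibatch_size):
        minibatches.append(idx_list[start:start + minibatch_size])
    return list(enumerate(minibatches))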