def trainer(data='coco', margin=0.2, dim=1024, dim_image=4096, dim_word=300, max_epochs=15, encoder='lstm', dispFreq=10, grad_clip=2.0, maxlen_w=150, batch_size=128, saveto='vse/coco', validFreq=100, early_stop=20, lrate=0.0002, reload_=False): # Model options model_options = {} model_options['data'] = data model_options['margin'] = margin model_options['dim'] = dim model_options['dim_image'] = dim_image model_options['dim_word'] = dim_word model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['grad_clip'] = grad_clip model_options['maxlen_w'] = maxlen_w model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['validFreq'] = validFreq model_options['lrate'] = lrate model_options['reload_'] = reload_ print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' + saveto with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) # Load training and development sets print 'loading dataset' train, dev = load_dataset(data) # Create and save dictionary print 'Create dictionary' worddict = build_dictionary(train[0] + dev[0])[0] n_words = len(worddict) model_options['n_words'] = n_words print 'Dictionary size: ' + str(n_words) with open('%s.dictionary.pkl' % saveto, 'wb') as f: pkl.dump(worddict, f) # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' model_options['worddict'] = worddict model_options['word_idict'] = word_idict # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData([train[0], train[1]], batch_size=batch_size, maxlen=maxlen_w) img_sen_model = ImgSenRanking(model_options) img_sen_model = img_sen_model.cuda() loss_fn = PairwiseRankingLoss(margin=margin) loss_fn = loss_fn.cuda() params = filter(lambda p: p.requires_grad, img_sen_model.parameters()) optimizer = torch.optim.Adam(params, lrate) uidx = 0 curr = 0.0 n_samples = 0 # For Early-stopping best_r1, best_r5, best_r10, best_medr = 0.0, 0.0, 0.0, 0 best_r1i, best_r5i, best_r10i, best_medri = 0.0, 0.0, 0.0, 0 best_step = 0 for eidx in xrange(max_epochs): print 'Epoch ', eidx for x, im in train_iter: n_samples += len(x) uidx += 1 x, im = homogeneous_data.prepare_data(x, im, worddict, maxlen=maxlen_w, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue x = Variable(torch.from_numpy(x).cuda()) im = Variable(torch.from_numpy(im).cuda()) # Update x, im = img_sen_model(x, im) cost = loss_fn(im, x) optimizer.zero_grad() cost.backward() torch.nn.utils.clip_grad_norm(params, grad_clip) optimizer.step() if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, '\tUpdate ', uidx, '\tCost ', cost.data.cpu( ).numpy()[0] if numpy.mod(uidx, validFreq) == 0: print 'Computing results...' 
curr_model = {} curr_model['options'] = model_options curr_model['worddict'] = worddict curr_model['word_idict'] = word_idict curr_model['img_sen_model'] = img_sen_model ls, lim = encode_sentences(curr_model, dev[0]), encode_images( curr_model, dev[1]) r_time = time.time() (r1, r5, r10, medr) = i2t(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) print "Cal Recall@K using %ss" % (time.time() - r_time) curr_step = uidx / validFreq currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore best_r1, best_r5, best_r10, best_medr = r1, r5, r10, medr best_r1i, best_r5i, best_r10i, best_medri = r1i, r5i, r10i, medri best_step = curr_step # Save model print 'Saving model...', pkl.dump( model_options, open('%s_params_%s.pkl' % (saveto, encoder), 'wb')) torch.save(img_sen_model.state_dict(), '%s_model_%s.pkl' % (saveto, encoder)) print 'Done' if curr_step - best_step > early_stop: print 'Early stopping ...' print "Image to text: %.1f, %.1f, %.1f, %.1f" % ( best_r1, best_r5, best_r10, best_medr) print "Text to image: %.1f, %.1f, %.1f, %.1f" % ( best_r1i, best_r5i, best_r10i, best_medri) return 0 print 'Seen %d samples' % n_samples
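# A minimal sketch of the PairwiseRankingLoss used above, assuming the standard
# VSE max-margin objective (sum of hinge violations over contrastive image and
# caption pairs); the actual class in this codebase may differ in details such
# as hardest-negative mining or sum vs. mean reduction.
import torch
import torch.nn as nn

class PairwiseRankingLossSketch(nn.Module):
    def __init__(self, margin=0.2):
        super(PairwiseRankingLossSketch, self).__init__()
        self.margin = margin

    def forward(self, im, s):
        # im, s: (batch, dim) image and sentence embeddings; row i of each is a true pair
        scores = torch.mm(im, s.t())              # similarity of every image/caption pair
        diagonal = scores.diag().view(-1, 1)      # scores of the true pairs
        # hinge over wrong captions per image (rows) and wrong images per caption (columns)
        cost_s = (self.margin + scores - diagonal).clamp(min=0)
        cost_im = (self.margin + scores - diagonal.t()).clamp(min=0)
        # every diagonal term is exactly `margin`, so drop them from the sums
        n = scores.size(0)
        return cost_s.sum() + cost_im.sum() - 2 * self.margin * n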
def trainer(X, C, stmodel, dimctx=4800, #vector dimensionality dim_word=620, # word vector dimensionality dim=1600, # the number of GRU units encoder='gru', decoder='gru', doutput=False, max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=40000, maxlen_w=100, optimizer='adam', batch_size = 16, saveto='adventuremodel.npz', dictionary='/home/jm7432/tell-tall-tales/decoding/adventure_dict_final.pkl', embeddings=None, saveFreq=1000, sampleFreq=100, reload_=False): # Model options model_options = {} model_options['dimctx'] = dimctx model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['decoder'] = decoder model_options['doutput'] = doutput model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['embeddings'] = embeddings model_options['saveFreq'] = saveFreq model_options['sampleFreq'] = sampleFreq model_options['reload_'] = reload_ print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' + saveto with open('%s.pkl'%saveto, 'rb') as f: models_options = pkl.load(f) # load dictionary print 'Loading dictionary...' worddict = load_dictionary(dictionary) # Load pre-trained embeddings, if applicable if embeddings != None: print 'Loading embeddings...' with open(embeddings, 'rb') as f: embed_map = pkl.load(f) dim_word = len(embed_map.values()[0]) model_options['dim_word'] = dim_word preemb = norm_weight(n_words, dim_word) pz = defaultdict(lambda : 0) for w in embed_map.keys(): pz[w] = 1 for w in worddict.keys()[:n_words-2]: if pz[w] > 0: preemb[worddict[w]] = embed_map[w] else: preemb = None # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options, preemb=preemb) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, inps, cost = build_model(tparams, model_options) print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Done' print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData([X,C], batch_size=batch_size, maxlen=maxlen_w) uidx = 0 lrate = 0.01 for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for x, c in train_iter: n_samples += len(x) uidx += 1 x, mask, ctx = homogeneous_data.prepare_data(x, c, worddict, stmodel, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, mask, ctx) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', params = unzip(tparams) numpy.savez(saveto, history_errs=[], **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' if numpy.mod(uidx, sampleFreq) == 0: x_s = x mask_s = mask ctx_s = ctx for jj in xrange(numpy.minimum(10, len(ctx_s))): sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj].reshape(1, model_options['dimctx']), model_options, trng=trng, k=1, maxlen=100, stochastic=False, use_unk=False) print 'Truth ',jj,': ', for vv in x_s[:,jj]: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample[0]]): print 'Sample (', kk,') ', jj, ': ', for vv in ss: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print print 'Seen %d samples'%n_samples
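# The gradient-clipping block above rescales the whole gradient set by its
# global norm whenever that norm exceeds grad_clip (the tensor.switch on g2).
# A plain-numpy restatement of the same rule, for illustration only:
import numpy

def clip_by_global_norm(grads, grad_clip):
    g2 = sum((g ** 2).sum() for g in grads)      # squared global norm over all gradients
    if g2 > grad_clip ** 2:
        scale = grad_clip / numpy.sqrt(g2)       # same factor the Theano graph applies
        grads = [g * scale for g in grads]
    return grads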
def trainer( X, dim_word=620, # word vector dimensionality dim=2400, # the number of GRU units encoder='gru', decoder='gru', max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=474000, maxlen_w=30, optimizer='adam', batch_size=64, saveto='/data/embeddingModel.npz', dictionary='dictionary.pkl', saveFreq=1000, reload_=False): # Model options model_options = {} model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['decoder'] = decoder model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['saveFreq'] = saveFreq model_options['reload_'] = reload_ print(model_options) # reload options if reload_ and os.path.exists(saveto): print('reloading...' + saveto) with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) # load dictionary print('Loading dictionary...') worddict = load_dictionary(dictionary) # Inverse dictionary word_idict = dict() for kk, vv in worddict.items(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print('Building model') params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, x, x_mask, y, y_mask, z, z_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask, z, z_mask] # before any regularizer print('Building f_log_probs...') f_log_probs = theano.function(inps, cost, profile=False) print('Done') # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.items(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print('Building f_cost...') f_cost = theano.function(inps, cost, profile=False) print('Done') print('Done') print('Building f_grad...') grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k, t in tparams.items()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print('Building optimizers...', end='') # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print('Optimization') # Each sentence in the minibatch have same length (for encoder) trainX = homogeneous_data.grouper(X) train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w) uidx = 0 lrate = 0.01 for eidx in range(max_epochs): n_samples = 0 print('Epoch ', eidx) for x, y, z in train_iter: n_samples += len(x) uidx += 1 x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data( x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) if x is None: print('Minibatch with zero sample under length ', maxlen_w) uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print('NaN detected') return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud) if numpy.mod(uidx, saveFreq) == 0: print('Saving...', end='') params = unzip(tparams) numpy.savez(saveto, history_errs=[], **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print('Done') print('Seen %d samples' % n_samples)
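# Sketch of the contract expected from eval(optimizer)(lr, tparams, grads, inps, cost)
# in the trainers above: f_grad_shared runs the forward/backward pass and caches the
# gradients in shared variables, and f_update then applies them at a given learning
# rate. Shown here as plain dl4mt-style SGD, not the actual adam implementation.
import theano

def sgd(lr, tparams, grads, inps, cost):
    # one shared gradient buffer per parameter, in the same order as tparams
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    # returns the cost and stores the gradients as a side effect
    f_grad_shared = theano.function(inps, cost, updates=gsup, profile=False)
    # applies the stored gradients to the parameters
    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=False)
    return f_grad_shared, f_update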
def trainer(data='coco', #f8k, f30k, coco margin=0.2, dim=1024, dim_image=4096, dim_word=300, encoder='gru', # gru OR bow max_epochs=15, dispFreq=10, decay_c=0., grad_clip=2., maxlen_w=100, optimizer='adam', batch_size = 128, saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz', validFreq=100, lrate=0.0002, reload_=False): # Model options model_options = {} model_options['data'] = data model_options['margin'] = margin model_options['dim'] = dim model_options['dim_image'] = dim_image model_options['dim_word'] = dim_word model_options['encoder'] = encoder model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['validFreq'] = validFreq model_options['lrate'] = lrate model_options['reload_'] = reload_ print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' + saveto with open('%s.pkl'%saveto, 'rb') as f: models_options = pkl.load(f) # Load training and development sets print 'Loading dataset' train, dev = load_dataset(data)[:2] # Create and save dictionary print 'Creating dictionary' worddict = build_dictionary(train[0]+dev[0])[0] n_words = len(worddict) model_options['n_words'] = n_words print 'Dictionary size: ' + str(n_words) with open('%s.dictionary.pkl'%saveto, 'wb') as f: pkl.dump(worddict, f) # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, inps, cost = build_model(tparams, model_options) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Building sentence encoder' trng, inps_se, sentences = build_sentence_encoder(tparams, model_options) f_senc = theano.function(inps_se, sentences, profile=False) print 'Building image encoder' trng, inps_ie, images = build_image_encoder(tparams, model_options) f_ienc = theano.function(inps_ie, images, profile=False) print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData([train[0], train[1]], batch_size=batch_size, maxlen=maxlen_w) uidx = 0 curr = 0. 
n_samples = 0 for eidx in xrange(max_epochs): print 'Epoch ', eidx for x, im in train_iter: n_samples += len(x) uidx += 1 x, mask, im = homogeneous_data.prepare_data(x, im, worddict, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, validFreq) == 0: print 'Computing results...' curr_model = {} curr_model['options'] = model_options curr_model['worddict'] = worddict curr_model['word_idict'] = word_idict curr_model['f_senc'] = f_senc curr_model['f_ienc'] = f_ienc ls = encode_sentences(curr_model, dev[0]) lim = encode_images(curr_model, dev[1]) (r1, r5, r10, medr) = i2t(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore # Save model print 'Saving...', params = unzip(tparams) numpy.savez(saveto, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' print 'Seen %d samples'%n_samples
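# Sketch of the Recall@K / median-rank metric reported by i2t above, assuming one
# caption per image and dot-product scores; the real evaluation in this codebase
# pairs each image with five captions.
import numpy

def i2t_sketch(images, captions):
    # images, captions: (n, dim) embeddings where row i of each is a true pair
    scores = numpy.dot(images, captions.T)
    ranks = numpy.zeros(len(images))
    for i in range(len(images)):
        order = numpy.argsort(scores[i])[::-1]       # best-scoring caption first
        ranks[i] = numpy.where(order == i)[0][0]     # rank of the true caption
    r1 = 100.0 * numpy.mean(ranks < 1)
    r5 = 100.0 * numpy.mean(ranks < 5)
    r10 = 100.0 * numpy.mean(ranks < 10)
    medr = numpy.floor(numpy.median(ranks)) + 1
    return r1, r5, r10, medr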
def trainer(Xs, Xs_val, dim_word=620, # word vector dimensionality dim=2400, # the number of GRU units encoder='gru', decoder='gru', max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=20000, maxlen_w=30, optimizer='adam', batch_size = 64, saveto='/u/rkiros/research/semhash/models/toy.npz', dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', embeddings=None, saveFreq=1000, reload_=False): # Model options model_options = {} model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['decoder'] = decoder model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['embeddings'] = embeddings model_options['saveFreq'] = saveFreq model_options['reload_'] = reload_ print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' + saveto with open('%s.pkl'%saveto, 'rb') as f: models_options = pkl.load(f) # load dictionary print 'Loading dictionary...' worddict = load_dictionary(dictionary) # Load pre-trained embeddings, if applicable if embeddings: print 'Loading embeddings...' from gensim.models import Word2Vec as word2vec embed_map = word2vec.load_word2vec_format(embeddings, binary=True) model_options['dim_word'] = dim_word = embed_map.vector_size preemb = norm_weight(n_words, dim_word) preemb_mask = numpy.ones((n_words, 1), dtype='float32') for w,i in worddict.items()[:n_words-2]: if w in embed_map: preemb[i] = embed_map[w] preemb_mask[i] = 0 # don't propagate gradients into pretrained embs else: preemb = None # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options, preemb=preemb) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, x, x_mask, y, y_mask, z, z_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask, z, z_mask] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Done' print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads if embeddings: param_preemb_mask = theano.shared(preemb_mask, name='preemb_mask', broadcastable=(False, True)) grads[0] *= param_preemb_mask lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) if type(Xs[0]) is not list: Xs = [Xs] if type(Xs_val[0]) is not list: Xs_val = [Xs_val] trainXs = map(hd.grouper, Xs) valXs = map(hd.grouper, Xs_val) train_iters = [hd.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w) for trainX in trainXs] val_iters = [hd.HomogeneousData(valX, batch_size=batch_size, maxlen=maxlen_w) for valX in valXs] f_progress = open('%s_progress.txt' % saveto, 'w', 1) uidx = 0 lrate = 0.01 for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for train_iter in train_iters: for x, y, z in train_iter: n_samples += len(x) uidx += 1 x, x_mask, y, y_mask, z, z_mask = hd.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: val_logprob = n_val_samples = 0 for val_iter in val_iters: for x, y, z in val_iter: n_val_samples += len(x) x, x_mask, y, y_mask, z, z_mask = hd.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) val_logprob += f_log_probs(x, x_mask, y, y_mask, z, z_mask) val_logprob /= n_val_samples print 'LOGPROB: %s' % val_logprob f_progress.write('%s\n' % val_logprob) print 'Saving...', params = unzip(tparams) numpy.savez('%s_%.3f' % (saveto, val_logprob), history_errs=[], **params) pkl.dump(model_options, open('%s_%.3f.pkl'%(saveto, val_logprob), 'wb')) print 'Done' print 'Seen %d samples'%n_samples
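# Why grads[0] *= param_preemb_mask above keeps the pretrained word vectors fixed:
# the mask is (n_words, 1) with zeros on rows copied from word2vec, so broadcasting
# zeroes those rows' embedding gradients while the remaining rows keep theirs.
# A numpy shape check of that broadcast, for illustration only:
import numpy
emb_grad = numpy.ones((5, 3), dtype='float32')                        # gradient w.r.t. embeddings
mask = numpy.array([[0.], [0.], [1.], [1.], [1.]], dtype='float32')   # 0 = pretrained row
masked_grad = emb_grad * mask                                         # rows 0-1 receive no update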
def main(): model_config = configuration.ModelConfig() model_config.data = FLAGS.input_dataset_name #loading dataset print('Loading dataset ...') (train_caps, train_ims), (test_caps, test_ims), _ = load_dataset(name=model_config.data, load_train=True) train_nic_ims = train_ims[:, 1536:] test_nic_ims = test_ims[:, 1536:] train_ims[:, 1536:] = preprocessing.scale(train_nic_ims) test_ims[:, 1536:] = preprocessing.scale(test_nic_ims) test_vgg_feature = test_ims[:, :1536] test_NIC_feature = test_ims[:, 1536:] #create and save dictionary print('creating dictionary') worddict = build_dictionary(train_caps + test_caps)[0] n_words = len(worddict) model_config.n_words = n_words model_config.worddict = worddict print('dictionary size: ' + str(n_words)) with open('f8k.dictionary.pkl', 'wb') as f: pkl.dump(worddict, f) #Building the model print('Building the model ...') model = LTS(model_config) model.build() config = tf.ConfigProto() config.gpu_options.allow_growth = True saver = tf.train.Saver(max_to_keep=model_config.max_checkpoints_to_keep) #sess = tf.Session(config=config) print('start embedding training') curr = 0. uidx = 0. train_iter = homogeneous_data.HomogeneousData( data=[train_caps, train_ims], batch_size=model_config.batch_size, maxlen=model_config.maxlen_w) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) for epoch in range(model_config.max_epochs): # Train G print('Epoch ', epoch) if epoch == 15: model_config.lrate = model_config.lrate / 10 for x, im in train_iter: uidx += 1 ls, mask, im = homogeneous_data.prepare_data( caps=x, features=im, worddict=worddict, maxlen=model_config.maxlen_w, n_words=model_config.n_words) vgg_feature = im[:, :1536] NIC_feature = im[:, 1536:] #embedding training _, cost = sess.run( [model.updates, model.embedding_loss], feed_dict={ model.VGG_pred_data: vgg_feature, model.NIC_pred_data: NIC_feature, model.ls_pred_data: ls.T, model.input_mask: mask.T, model.keep_prob: 0.5, model.phase: 1, model.learning_rate: model_config.lrate }) if np.mod(uidx, 10) == 0: print('Epoch ', epoch, 'Update ', uidx, 'Cost ', cost) if np.mod(uidx, 100) == 0: print('test ...') # encode images into the text embedding space images = getTestImageFeature(sess, model, test_vgg_feature, test_NIC_feature) features = getTestTextFeature(sess, model, model_config, test_caps) (r1, r5, r10, medr) = recall.i2t(images, features) print("Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)) (r1i, r5i, r10i, medri) = recall.t2i(images, features) print("Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)) currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore # Save model print('Saving...') saver.save(sess, "checkpoint_files/model.ckpt", global_step=int(uidx + 1)) print('done.') sess = tf.Session() model_path = tf.train.latest_checkpoint("checkpoint_files/") if not model_path: print("Skipping testing. 
No checkpoint found in: %s" % FLAGS.checkpoint_dir) return print("Loading model from checkpoint: %s" % model_path) saver.restore(sess, model_path) print("Successfully loaded checkpoint: %s" % model_path) images = getTestImageFeature(sess, model, test_vgg_feature, test_NIC_feature) # encode sentences into the text embedding space features = getTestTextFeature(sess, model, model_config, test_caps) (r1, r5, r10, medr) = recall.i2t(images, features) print("Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)) (r1i, r5i, r10i, medri) = recall.t2i(images, features) print("Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri))
def trainer(X, dim_word=620, # word vector dimensionality dim=2400, # the number of GRU units encoder='gru', decoder='gru', max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=20000, maxlen_w=30, optimizer='adam', batch_size = 64, saveto='/u/rkiros/research/semhash/models/toy.npz', dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', saveFreq=1000, reload_=False): # Model options model_options = {} model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['decoder'] = decoder model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['saveFreq'] = saveFreq model_options['reload_'] = reload_ print model_options # reload options # TODO: if loading old parameters you need to make sure you are using them # in the rest of the code # if reload_ and os.path.exists(saveto): # print 'reloading...' + saveto # with open('%s.pkl'%saveto, 'rb') as f: # model_options = pkl.load(f) # load dictionary print 'Loading dictionary...' worddict = load_dictionary(dictionary) # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto + '.npz', params) tparams = init_tparams(params) trng, x, x_mask, y, y_mask, z, z_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask, z, z_mask] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Done' print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) trainX = homogeneous_data.grouper(X) train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w) uidx = 0 lrate = 0.01 for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for x, y, z in train_iter: n_samples += len(x) uidx += 1 x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', params = unzip(tparams) numpy.savez(saveto, history_errs=[], **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' print 'Seen %d samples'%n_samples
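# Hypothetical invocation of the trainer above; X is simply a list of tokenised
# sentences (whitespace-separated strings) and the paths below are placeholders,
# not files shipped with the code.
#
#   X = load_corpus('books.txt')          # hypothetical helper: one sentence per line
#   trainer(X,
#           dim_word=620, dim=2400,
#           n_words=20000, maxlen_w=30, batch_size=64,
#           saveto='models/skip_thoughts.npz',
#           dictionary='models/book_dictionary_large.pkl',
#           saveFreq=1000, max_epochs=1)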
def trainer(X, C, stmodel, dimctx=4800, #vector dimensionality dim_word=620, # word vector dimensionality dim=1600, # the number of GRU units encoder='gru', decoder='gru', doutput=False, max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=40000, maxlen_w=100, optimizer='adam', batch_size = 16, saveto='/u/rkiros/research/semhash/models/toy.npz', dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', embeddings=None, saveFreq=1000, sampleFreq=100, reload_=False): # Model options model_options = {} model_options['dimctx'] = dimctx model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['decoder'] = decoder model_options['doutput'] = doutput model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['embeddings'] = embeddings model_options['saveFreq'] = saveFreq model_options['sampleFreq'] = sampleFreq model_options['reload_'] = reload_ print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' + saveto with open('%s.pkl'%saveto, 'rb') as f: models_options = pkl.load(f) # load dictionary print 'Loading dictionary...' worddict = load_dictionary(dictionary) # Load pre-trained embeddings, if applicable if embeddings != None: print 'Loading embeddings...' with open(embeddings, 'rb') as f: embed_map = pkl.load(f) dim_word = len(embed_map.values()[0]) model_options['dim_word'] = dim_word preemb = norm_weight(n_words, dim_word) pz = defaultdict(lambda : 0) for w in embed_map.keys(): pz[w] = 1 for w in worddict.keys()[:n_words-2]: if pz[w] > 0: preemb[worddict[w]] = embed_map[w] else: preemb = None # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options, preemb=preemb) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, inps, cost = build_model(tparams, model_options) print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Done' print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData([X,C], batch_size=batch_size, maxlen=maxlen_w) uidx = 0 lrate = 0.01 for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for x, c in train_iter: n_samples += len(x) uidx += 1 x, mask, ctx = homogeneous_data.prepare_data(x, c, worddict, stmodel, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, mask, ctx) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', params = unzip(tparams) numpy.savez(saveto, history_errs=[], **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' if numpy.mod(uidx, sampleFreq) == 0: x_s = x mask_s = mask ctx_s = ctx for jj in xrange(numpy.minimum(10, len(ctx_s))): sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj].reshape(1, model_options['dimctx']), model_options, trng=trng, k=1, maxlen=100, stochastic=False, use_unk=False) print 'Truth ',jj,': ', for vv in x_s[:,jj]: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample[0]]): print 'Sample (', kk,') ', jj, ': ', for vv in ss: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print print 'Seen %d samples'%n_samples
def trainer(data='coco', margin=0.2, dim=1024, dim_image=4096, dim_word=300, encoder='gru', max_epochs=15, dispFreq=10, decay_c=0.0, grad_clip=2.0, maxlen_w=150, batch_size=128, saveto='vse/coco', validFreq=100, lrate=0.0002, concat=True, reload_=False): hyper_params = { 'data': data, 'encoder': encoder, 'batch_size': batch_size, 'time': cur_time, 'lrate': lrate, 'concat': concat, } i2t_r1 = dict([('i2t_recall', 'r1')] + hyper_params.items()) i2t_r5 = dict([('i2t_recall', 'r5')] + hyper_params.items()) i2t_r10 = dict([('i2t_recall', 'r10')] + hyper_params.items()) t2i_r1 = dict([('t2i_recall', 'r1')] + hyper_params.items()) t2i_r5 = dict([('t2i_recall', 'r5')] + hyper_params.items()) t2i_r10 = dict([('t2i_recall', 'r10')] + hyper_params.items()) i2t_med = dict([('i2t_med', 'i2t_med')] + hyper_params.items()) t2i_med = dict([('t2i_med', 't2i_med')] + hyper_params.items()) agent = Agent(port=5020) i2t_r1_agent = agent.register(i2t_r1, 'recall', overwrite=True) i2t_r5_agent = agent.register(i2t_r5, 'recall', overwrite=True) i2t_r10_agent = agent.register(i2t_r10, 'recall', overwrite=True) t2i_r1_agent = agent.register(t2i_r1, 'recall', overwrite=True) t2i_r5_agent = agent.register(t2i_r5, 'recall', overwrite=True) t2i_r10_agent = agent.register(t2i_r10, 'recall', overwrite=True) i2t_med_agent = agent.register(i2t_med, 'median', overwrite=True) t2i_med_agent = agent.register(t2i_med, 'median', overwrite=True) # Model options model_options = {} model_options['data'] = data model_options['margin'] = margin model_options['dim'] = dim model_options['dim_image'] = dim_image model_options['dim_word'] = dim_word model_options['encoder'] = encoder model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['maxlen_w'] = maxlen_w model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['validFreq'] = validFreq model_options['lrate'] = lrate model_options['reload_'] = reload_ model_options['concat'] = concat print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' 
+ saveto with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) # Load training and development sets print 'loading dataset' train, dev = load_dataset(data)[:2] # Create and save dictionary print 'Create dictionary' worddict = build_dictionary(train[0] + dev[0])[0] n_words = len(worddict) model_options['n_words'] = n_words print 'Dictionary size: ' + str(n_words) with open('%s.dictionary.pkl' % saveto, 'wb') as f: pkl.dump(worddict, f) # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' model_options['worddict'] = worddict model_options['word_idict'] = word_idict # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData([train[0], train[1]], batch_size=batch_size, maxlen=maxlen_w) img_sen_model = ImgSenRanking(model_options) img_sen_model = img_sen_model.cuda() loss_fn = PairwiseRankingLoss(margin=margin) loss_fn = loss_fn.cuda() params = filter(lambda p: p.requires_grad, img_sen_model.parameters()) optimizer = torch.optim.Adam(params, lrate) uidx = 0 curr = 0.0 n_samples = 0 for eidx in xrange(max_epochs): print 'Epoch ', eidx for x, im in train_iter: n_samples += len(x) uidx += 1 x_id, im = homogeneous_data.prepare_data(x, im, worddict, maxlen=maxlen_w, n_words=n_words) if x_id is None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue x_id = Variable(torch.from_numpy(x_id).cuda()) im = Variable(torch.from_numpy(im).cuda()) # Update ud_start = time.time() x, im = img_sen_model(x_id, im, x) cost = loss_fn(im, x) optimizer.zero_grad() cost.backward() torch.nn.utils.clip_grad_norm(params, grad_clip) optimizer.step() ud = time.time() - ud_start if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost.data.cpu( ).numpy()[0], 'UD ', ud if numpy.mod(uidx, validFreq) == 0: print 'Computing results...' curr_model = {} curr_model['options'] = model_options curr_model['worddict'] = worddict curr_model['word_idict'] = word_idict curr_model['img_sen_model'] = img_sen_model ls, lim = encode_sentences(curr_model, dev[0]), encode_images( curr_model, dev[1]) r1, r5, r10, medr = 0.0, 0.0, 0.0, 0 r1i, r5i, r10i, medri = 0.0, 0.0, 0.0, 0 r_time = time.time() if data == 'arch' or data == 'arch_small': (r1, r5, r10, medr) = i2t_arch(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i_arch(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % ( r1i, r5i, r10i, medri) else: (r1, r5, r10, medr) = i2t(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % ( r1i, r5i, r10i, medri) print "Cal Recall@K using %ss" % (time.time() - r_time) record_num = uidx / validFreq agent.append(i2t_r1_agent, record_num, r1) agent.append(i2t_r5_agent, record_num, r5) agent.append(i2t_r10_agent, record_num, r10) agent.append(t2i_r1_agent, record_num, r1i) agent.append(t2i_r5_agent, record_num, r5i) agent.append(t2i_r10_agent, record_num, r10i) agent.append(i2t_med_agent, record_num, medr) agent.append(t2i_med_agent, record_num, medri) currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore # Save model print 'Saving model...', pkl.dump( model_options, open('%s_params_%s.pkl' % (saveto, encoder), 'wb')) torch.save(img_sen_model.state_dict(), '%s_model_%s.pkl' % (saveto, encoder)) print 'Done' print 'Seen %d samples' % n_samples
# Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, inps, cost = build_model(tparams, model_options) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Building sentence encoder' trng, inps_se, sentences = build_sentence_encoder(tparams, model_options) f_senc = theano.function(inps_se, sentences, profile=False) print 'Building image encoder' trng, inps_ie, images = build_image_encoder(tparams, model_options) f_ienc = theano.function(inps_ie, images, profile=False) print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData([train[0], train[1]], batch_size=batch_size, maxlen=maxlen_w) uidx = 0 curr = 0. n_samples = 0 for eidx in xrange(max_epochs): print 'Epoch ', eidx for x, im in train_iter: n_samples += len(x) uidx += 1 x, mask, im = homogeneous_data.prepare_data(x, im, worddict, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, validFreq) == 0: print 'Computing results...' curr_model = {} curr_model['options'] = model_options curr_model['worddict'] = worddict curr_model['word_idict'] = word_idict curr_model['f_senc'] = f_senc curr_model['f_ienc'] = f_ienc ls = encode_sentences(curr_model, dev[0]) lim = encode_images(curr_model, dev[1]) (r1, r5, r10, medr) = i2t(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore # Save model print 'Saving...', params = unzip(tparams) numpy.savez(saveto, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' print 'Seen %d samples'%n_samples if __name__ == '__main__': pass
def trainer(X, dim_word=620, # word vector dimensionality dim=2400, # the number of GRU units encoder='gru', num_neg=4, gamma=1.0, max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=20000, maxlen_w=30, optimizer='adam', batch_size = 64, saveto='/u/rkiros/research/semhash/models/toy.npz', dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', saveFreq=1000, reload_=False): # Model options model_options = {} model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['num_neg'] = num_neg model_options['gamma'] = gamma model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['saveFreq'] = saveFreq model_options['reload_'] = reload_ print model_options # reload options if reload_ and os.path.exists(saveto): print 'reloading...' + saveto with open('%s.pkl'%saveto, 'rb') as f: models_options = pkl.load(f) # load dictionary print 'Loading dictionary...' worddict = load_dictionary(dictionary) # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, x, x_mask, p_f, p_f_mask, p_b, p_b_mask, \ ns_list, ns_masks, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, p_f, p_f_mask, p_b, p_b_mask] + ns_list + ns_masks # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Done' print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) trainX = homogeneous_data.grouper(X) train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, num_neg=num_neg, maxlen=maxlen_w) uidx = 0 lrate = 0.01 for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for x, p_f, p_b, ns in train_iter: n_samples += len(x) uidx += 1 # ns input is list of num_neg negative sentences, # output ns is list of num_neg (batchsize, neg_len) negative sentences x, x_mask, p_f, p_f_mask, p_b, p_b_mask, ns_list, ns_masks = homogeneous_data.prepare_data(x, p_f, p_b, ns, worddict, maxlen=maxlen_w, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue ud_start = time.time() args = [x, x_mask, p_f, p_f_mask, p_b, p_b_mask] + ns_list + ns_masks cost = f_grad_shared(*args) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', saveto_idx = saveto.format(uidx) params = unzip(tparams) numpy.savez(saveto_idx, history_errs=[], **params) pkl.dump(model_options, open('%s.pkl'%saveto_idx, 'wb')) print 'Done' print 'Seen %d samples'%n_samples
def trainer(data='f30k', margin=0.2, dim=1024, dim_image=4096, dim_word=300, max_epochs=15, encoder='lstm', dispFreq=10, grad_clip=2.0, maxlen_w=150, batch_size=128, saveto='vse/f30K', validFreq=100, early_stop=20, lrate=1e-3, reload_=False): # Model options model_options = {} model_options['data'] = data model_options['margin'] = margin model_options['dim'] = dim model_options['dim_image'] = dim_image model_options['dim_word'] = dim_word model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['grad_clip'] = grad_clip model_options['maxlen_w'] = maxlen_w model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['validFreq'] = validFreq model_options['lrate'] = lrate model_options['reload_'] = reload_ logging.info(model_options) # reload options if reload_ and os.path.exists(saveto): logging.info('reloading...' + saveto) with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) # Load training and development sets logging.info('loading dataset') titles, album_ims, artist, genre = load_dataset(data) artist_string = artist genre_string = genre # Create and save dictionary if os.path.exists('%s.dictionary.pkl' % saveto): logging.info('loading dict from...' + saveto) with open('%s.dictionary.pkl' % saveto, 'rb') as wdict: worddict = pkl.load(wdict) n_words = len(worddict) model_options['n_words'] = n_words logging.info('Dictionary size: ' + str(n_words)) else: logging.info('Create dictionary') worddict = build_dictionary(titles + artist + genre)[0] n_words = len(worddict) model_options['n_words'] = n_words logging.info('Dictionary words: ' + str(n_words)) with open('%s.dictionary.pkl' % saveto, 'wb') as f: pkl.dump(worddict, f) # Inverse dictionary word_idict = dict() for kk, vv in worddict.items(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' model_options['worddict'] = worddict model_options['word_idict'] = word_idict # Each sentence in the minibatch have same length (for encoder) train_iter = homogeneous_data.HomogeneousData( [titles, album_ims, artist, genre], batch_size=batch_size, maxlen=maxlen_w) img_sen_model = Img_Sen_Artist_Ranking(model_options) # todo code to load saved model dict if os.path.exists('%s_model_%s.pkl' % (saveto, encoder)): logging.info('Loading model...') # pkl.dump(model_options, open('%s_params_%s.pkl' % (saveto, encoder), 'wb')) img_sen_model.load_state_dict( torch.load('%s_model_%s.pkl' % (saveto, encoder))) logging.info('Done') img_sen_model = img_sen_model.cuda() loss_fn = PairwiseRankingLoss(margin=margin).cuda() params = filter(lambda p: p.requires_grad, img_sen_model.parameters()) optimizer = torch.optim.Adam(params, lr=lrate) scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=40, mode='min', verbose=True, threshold=1e-8) uidx = 0 curr = 0.0 n_samples = 0 # For Early-stopping best_r1, best_r5, best_r10, best_medr = 0.0, 0.0, 0.0, 0 best_step = 0 writer = SummaryWriter() for eidx in range(max_epochs): for x, im, artist, genre in train_iter: n_samples += len(x) uidx += 1 x, im, artist, genre = homogeneous_data.prepare_data( x, im, artist, genre, worddict, maxlen=maxlen_w, n_words=n_words) if x is None: logging.info('Minibatch with zero sample under length ', maxlen_w) uidx -= 1 continue x = Variable(torch.from_numpy(x).cuda()) im = Variable(torch.from_numpy(im).cuda()) artist = Variable(torch.from_numpy(artist).cuda()) genre = Variable(torch.from_numpy(genre).cuda()) # Update x1, im1, artist, genre = img_sen_model(x, im, artist, genre) #make validation 
on the input before the trainer sees it if numpy.mod(uidx, validFreq) == 0: img_sen_model.eval() with torch.no_grad(): print('Epoch ', eidx, '\tUpdate@ ', uidx, '\tCost ', cost.data.item()) writer.add_scalar('Evaluation/Validation_Loss', cost.data.item(), uidx) (r1, r5, r10, medr) = i2t(im1, x) #distances with l2norm logging.info("Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)) (r1g, r5g, r10g, medrg) = i2t(im1, genre) logging.info("Image to genre: %.1f, %.1f, %.1f, %.1f" % (r1g, r5g, r10g, medrg)) (r1a, r5a, r10a, medra) = i2t(im1, artist) logging.info("Cal Recall@K ") writer.add_scalars('Validation Recall/Image2Album', { 'r@1': r1, 'r@5': r5, 'r@10': r10 }, uidx) writer.add_scalars('Validation Recall/Image2Genres', { 'r@1': r1g, 'r@5': r5g, 'r@10': r10g }, uidx) writer.add_scalars('Validation Recall/Image2Artist', { 'r@1': r1a, 'r@5': r5a, 'r@10': r10a }, uidx) curr_step = uidx / validFreq currscore = r1 + r5 + r10 + r1a + r5a + r10a + r1g + r5g + r10g - medr - medrg - medra if currscore > curr: curr = currscore best_r1, best_r5, best_r10, best_medr = r1, r5, r10, medr best_r1g, best_r5g, best_r10g, best_medrg = r1g, r5g, r10g, medrg best_step = curr_step # Save model logging.info('Saving model...') pkl.dump( model_options, open('%s_params_%s.pkl' % (saveto, encoder), 'wb')) torch.save(img_sen_model.state_dict(), '%s_model_%s.pkl' % (saveto, encoder)) logging.info('Done') if curr_step - best_step > early_stop: logging.info('early stopping, jumping now...') logging.info("Image to text: %.1f, %.1f, %.1f, %.1f" % (best_r1, best_r5, best_r10, best_medr)) logging.info( "Image to genre: %.1f, %.1f, %.1f, %.1f" % (best_r1g, best_r5g, best_r10g, best_medrg)) #return 0 lrate = 1e-4 for param_group in optimizer.param_groups: param_group['lr'] = lrate img_sen_model.train() cost = loss_fn(im1, x1, artist, genre) writer.add_scalar('Evaluation/training_Loss', cost, uidx) optimizer.zero_grad() cost.backward() torch.nn.utils.clip_grad_norm_(params, grad_clip) scheduler.step(cost.data.item()) optimizer.step() #scheduler.step(cost.data.item()) logging.info('Seen %d samples' % n_samples)
def trainer( X, dim_word=620, # word vector dimensionality dim=2400, # the number of GRU units encoder='gru', decoder='gru', max_epochs=5, dispFreq=1, decay_c=0., grad_clip=5., n_words=20000, maxlen_w=30, optimizer='adam', batch_size=512, saveto='/u/rkiros/research/semhash/models/toy.npz', dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', saveFreq=5000, reload_=False, reload_path='output_books_full/model_ae_full_bsz_64_iter_313000.npz', SICK_eval=False): # Model options model_options = {} model_options['dim_word'] = dim_word model_options['dim'] = dim model_options['encoder'] = encoder model_options['decoder'] = decoder model_options['max_epochs'] = max_epochs model_options['dispFreq'] = dispFreq model_options['decay_c'] = decay_c model_options['grad_clip'] = grad_clip model_options['n_words'] = n_words model_options['maxlen_w'] = maxlen_w model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['saveto'] = saveto model_options['dictionary'] = dictionary model_options['saveFreq'] = saveFreq model_options['reload_'] = reload_ model_options['reload_path'] = reload_path print model_options # reload options if reload_ and os.path.exists(reload_path): print 'reloading...' + reload_path with open('%s.pkl' % reload_path, 'rb') as f: models_options = pkl.load(f) reload_idx = int(reload_path.split('_')[-1].split('.')[0]) # load dictionary print 'Loading dictionary...' worddict = load_dictionary(dictionary) # Inverse dictionary word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(reload_path): params = load_params(reload_path, params) tparams = init_tparams(params) trng, x, x_mask, y, y_mask, z, z_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask, z, z_mask] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) print 'Done' # weight decay, if applicable if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=False) print 'Done' print 'Done' print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k, t in tparams.iteritems()], profile=False) if grad_clip > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (grad_clip**2), g / tensor.sqrt(g2) * grad_clip, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' # Each sentence in the minibatch have same length (for encoder) trainX = homogeneous_data.grouper(X) train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w) if not reload_: uidx = 0 else: uidx = reload_idx lrate = 0.01 for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for x, y, z in train_iter: n_samples += len(x) uidx += 1 x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data( x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', saveto_iternum = saveto.format(uidx) params = unzip(tparams) numpy.savez(saveto_iternum, history_errs=[], **params) pkl.dump(model_options, open('%s.pkl' % saveto_iternum, 'wb')) print 'Done' if SICK_eval: print "Evaluating SICK Test performance" embed_map = tools.load_googlenews_vectors() model = tools.load_model(path_to_model=saveto_iternum, embed_map=embed_map) yhat, pr, sr, mse = eval_sick.evaluate(model, evaltest=True) del (model) del (embed_map) print pr, sr, mse res_save_file = saveto.format('ALL').split( '.')[0] + '_SICK_EVAL.txt' with open(res_save_file, 'a') as rsf: cur_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) rsf.write('\n \n {}'.format(cur_time)) rsf.write('\n{}, {}, {}, {}'.format(uidx, pr, sr, mse)) print "Done" print 'Seen %d samples' % n_samples