def __init__(self):
    """Register the known layer types and create the random generators.

    ``self.layers`` maps a layer name to a pair of strings
    ``(parameter initializer, feedforward)``; the strings spell out
    attribute references such as ``'self.fflayer'``.
    """
    registry = {}
    registry['ff'] = ('self.param_init_fflayer', 'self.fflayer')
    registry['lstm'] = ('self.param_init_lstm', 'self.lstm_layer')
    registry['lstm_cond'] = ('self.param_init_lstm_cond',
                             'self.lstm_cond_layer')
    self.layers = registry

    # get_two_rngs() returns a pair; presumably a NumPy generator and a
    # Theano one, judging by the attribute names -- confirm in its definition.
    numpy_rng, theano_rng = get_two_rngs()
    self.rng_numpy = numpy_rng
    self.rng_theano = theano_rng
def __init__(self, state, data_engine, channel): print 'Init ', self.__class__ self.random_seed = state.random_seed self.rng_numpy, self.rng_theano = utils.get_two_rngs( seed=self.random_seed) self.state = state self.data_engine = data_engine self.save_model_path = state.save_model_path self.channel = channel config_model = state.DeepOrderlessNADE self.n_visible = config_model.n_in self.n_hidden = config_model.n_hidden # number of hidden layers besides the first and last self.n_layers = config_model.n_layers self.hidden_act = config_model.hidden_act self.tied_weights = config_model.tied_weights self.use_mask = config_model.use_mask self.init_mean_field = config_model.init_mean_field self.cost_from_last = config_model.cost_from_last self.init_weights = config_model.init_weights self.center_v = config_model.center_v config_train = state.DeepOrderlessNADE.train self.valid_freq = config_train.valid_freq self.n_orderings = config_train.n_orderings self.sgd_type = config_train.sgd_type self.n_epochs = config_train.n_epochs self.minibatch_size = config_train.minibatch_size self.momentum = config_train.momentum self.lr = config_train.lr self.l2 = config_train.l2 # number of variationa inference to do self.k = self.state.DeepOrderlessNADE.train.k self.verbose = config_train.verbose self.fine_tune_n_epochs = config_train.fine_tune.n_epochs self.fine_tune_activate = config_train.fine_tune.activate assert self.n_layers >= 1 # used in training, save also to txt file. self.LL_valid_test = [] # for dataset self.trainset, _, self.validset, _, self.testset, _ = utils.load_mnist( ) self.marginal = numpy.mean(numpy.concatenate( [self.trainset, self.validset], axis=0), axis=0) # for tracking the costs for both pretrain and finetune self.costs = [] self.costs_steps = [] # decrease learning rate self.lr_decrease = self.lr / self.n_epochs
def train( random_seed=1234, dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set dim=1000, # the number of LSTM units n_layers_out=1, n_layers_init=1, encoder='none', encoder_dim=100, prev2out=False, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, selector=False, n_words=100000, maxlen=100, # maximum length of the description optimizer='adadelta', clip_c=2., batch_size=64, valid_batch_size=64, save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/', validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates metric='blue', dataset='youtube2text', video_feature='googlenet', use_dropout=False, reload_=False, from_dir=None, K=10, OutOf=240, verbose=True, debug=True): rng_numpy, rng_theano = utils.get_two_rngs() model_options = locals().copy() if 'self' in model_options: del model_options['self'] with open('%smodel_options.pkl' % save_model_dir, 'wb') as f: pkl.dump(model_options, f) # instance model layers = Layers() model = Model() print 'Loading data' engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K, OutOf) model_options['ctx_dim'] = engine.ctx_dim model_options['n_words'] = engine.n_words print 'n_words:', model_options['n_words'] # set test values, for debugging idx = engine.kf_train[0] [x_tv, mask_tv, ctx_tv, ctx_mask_tv ] = data_engine.prepare_data(engine, [engine.train[index] for index in idx]) print 'init params' t0 = time.time() params = model.init_params(model_options) # reloading if reload_: model_saved = from_dir + '/model_best_so_far.npz' assert os.path.isfile(model_saved) print "Reloading model params..." 
params = utils.load_params(model_saved, params) tparams = utils.init_tparams(params) trng, use_noise, \ x, mask, ctx, mask_ctx, \ cost, extra = \ model.build_model(tparams, model_options) alphas = extra[1] betas = extra[2] print 'buliding sampler' f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost, profile=False, on_unused_input='ignore') cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(-1).mean() cost += alpha_reg if alpha_entropy_r > 0: alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r), name='alpha_entropy_r') alpha_reg_2 = alpha_entropy_r * (-tensor.sum( alphas * tensor.log(alphas + 1e-8), axis=-1)).sum(-1).mean() cost += alpha_reg_2 else: alpha_reg_2 = tensor.zeros_like(cost) print 'building f_alpha' f_alpha = theano.function([x, mask, ctx, mask_ctx], [alphas, betas], name='f_alpha', on_unused_input='ignore') print 'compute grad' grads = tensor.grad(cost, wrt=utils.itemlist(tparams)) if clip_c > 0.: g2 = 0. for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'build train fns' f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, [x, mask, ctx, mask_ctx], cost, extra + grads) print 'compilation took %.4f sec' % (time.time() - t0) print 'Optimization' history_errs = [] # reload history if reload_: print 'loading history error...' 
history_errs = numpy.load( from_dir + 'model_best_so_far.npz')['history_errs'].tolist() bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in engine.kf_train: tags = [engine.train[index] for index in idx] n_samples += len(tags) uidx += 1 use_noise.set_value(1.) pd_start = time.time() x, mask, ctx, ctx_mask = data_engine.prepare_data(engine, tags) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() rvals = f_grad_shared(x, mask, ctx, ctx_mask) cost = rvals[0] probs = rvals[1] alphas = rvals[2] betas = rvals[3] grads = rvals[4:] grads, NaN_keys = utils.grad_nan_report(grads, tparams) if len(grads_record) >= 5: del grads_record[0] grads_record.append(grads) if NaN_keys != []: print 'grads contain NaN' import pdb pdb.set_trace() if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected in cost' import pdb pdb.set_trace() # update params f_update(lrate) ud_duration = time.time() - ud_start if eidx == 0: train_error = cost else: train_error = train_error * 0.95 + cost * 0.05 train_costs.append(cost) if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \ train_error, 'fetching data time spent (sec)', pd_duration, \ 'update time spent (sec)', ud_duration, 'save_dir', save_model_dir alphas, betas = f_alpha(x, mask, ctx, ctx_mask) counts = mask.sum(0) betas_mean = (betas * mask).sum(0) / counts betas_mean = betas_mean.mean() print 'alpha ratio %.3f, betas mean %.3f' % ( alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean) l = 0 for vv in x[:, 0]: if vv == 0: break if vv in engine.word_idict: print '(', numpy.round(betas[l, 0], 3), ')', 
engine.word_idict[vv], else: print '(', numpy.round(betas[l, 0], 3), ')', 'UNK', l += 1 print '(', numpy.round(betas[l, 0], 3), ')' if numpy.mod(uidx, saveFreq) == 0: pass if numpy.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) print '------------- sampling from train ----------' x_s = x mask_s = mask ctx_s = ctx ctx_mask_s = ctx_mask model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, trng) print '------------- sampling from valid ----------' idx = engine.kf_valid[numpy.random.randint( 1, len(engine.kf_valid) - 1)] tags = [engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s = data_engine.prepare_data( engine, tags) model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, mask_ctx_s, trng) if validFreq != -1 and numpy.mod(uidx, validFreq) == 0: t0_valid = time.time() alphas, _ = f_alpha(x, mask, ctx, ctx_mask) ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean() alphas_ratio.append(ratio) numpy.savetxt(save_model_dir + 'alpha_ratio.txt', alphas_ratio) current_params = utils.unzip(tparams) numpy.savez(save_model_dir + 'model_current.npz', history_errs=history_errs, **current_params) use_noise.set_value(0.) train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 if not debug: # first compute train cost if 0: print 'computing cost on trainset' train_err, train_perp = model.pred_probs( engine, 'train', f_log_probs, verbose=model_options['verbose']) else: train_err = 0. train_perp = 0. if 1: print 'validating...' valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose'], ) else: valid_err = 0. valid_perp = 0. if 1: print 'testing...' test_err, test_perp = model.pred_probs( engine, 'test', f_log_probs, verbose=model_options['verbose']) else: test_err = 0. test_perp = 0. 
mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score( model_type='attention', model_archive=current_params, options=model_options, engine=engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_next=f_next, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) history_errs.append([ eidx, uidx, train_err, train_perp, valid_perp, test_perp, valid_err, test_err, valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1, test_B2, test_B3, test_B4, test_meteor, test_Rouge, test_Cider ]) numpy.savetxt(save_model_dir + 'train_valid_test.txt', history_errs, fmt='%.3f') print 'save validation results to %s' % save_model_dir # save best model according to the best blue or meteor if len(history_errs) > 1 and \ valid_B4 > numpy.array(history_errs)[:-1,11].max(): print 'Saving to %s...' 
% save_model_dir, numpy.savez(save_model_dir + 'model_best_blue_or_meteor.npz', history_errs=history_errs, **best_p) if len(history_errs) > 1 and \ valid_err < numpy.array(history_errs)[:-1,6].min(): best_p = utils.unzip(tparams) bad_counter = 0 best_valid_err = valid_err uidx_best_valid_err = uidx print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_best_so_far.npz', history_errs=history_errs, **best_p) with open('%smodel_options.pkl' % save_model_dir, 'wb') as f: pkl.dump(model_options, f) print 'Done' elif len(history_errs) > 1 and \ valid_err >= numpy.array(history_errs)[:-1,6].min(): bad_counter += 1 print 'history best ', numpy.array(history_errs)[:, 6].min() print 'bad_counter ', bad_counter print 'patience ', patience if bad_counter > patience: print 'Early Stop!' estop = True break if test_B4 > 0.52 and test_meteor > 0.32: print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_' + str(uidx) + '.npz', history_errs=history_errs, **current_params) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec' % (time.time() - t0_valid) # end of validatioin if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f' % ( n_samples, numpy.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' if best_p is not None: utils.zipp(best_p, tparams) use_noise.set_value(0.) 
valid_err = 0 test_err = 0 if not debug: #if valid: valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose']) #if test: #test_err, test_perp = self.pred_probs( # 'test', f_log_probs, # verbose=model_options['verbose']) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err)) params = copy.copy(best_p) numpy.savez(save_model_dir + 'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = numpy.asarray(history_errs) best_valid_idx = history[:, 6].argmin() numpy.savetxt(save_model_dir + 'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
def train(random_seed=1234, dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set dim=1000, # the number of LSTM units n_layers_out=1, n_layers_init=1, encoder='none', encoder_dim=100, prev2out=False, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, selector=False, n_words=100000, maxlen=100, # maximum length of the description optimizer='adadelta', clip_c=2., batch_size = 64, valid_batch_size = 64, save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/', validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates metric='blue', dataset='youtube2text', video_feature='googlenet', use_dropout=False, reload_=False, from_dir=None, K1=10, K2=10, OutOf=240, verbose=True, debug=True ): rng_numpy, rng_theano = utils.get_two_rngs() model_options = locals().copy() model_options_c = locals().copy() if 'self' in model_options: del model_options['self'] with open('model_files/model_options.pkl', 'wb') as f: pkl.dump(model_options, f) with open('model_files/model_options_c3d.pkl', 'wb') as f: pkl.dump(model_options_c, f) # instance model layers = Layers() model = Model() model_c = Model() print 'Loading data' engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K1, K2, OutOf) model_options['ctx_dim'] = engine.ctx_dim model_options_c['ctx_dim'] = engine.ctx_dim_c model_options['n_words'] = engine.n_words model_options_c['n_words'] = engine.n_words print 'n_words:', model_options['n_words'] print model_options_c['dim'],model_options_c['ctx_dim'] # set test values, for debugging idx = engine.kf_train[0] [x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_tv_c, ctx_mask_tv_c] = data_engine.prepare_data( engine, [engine.train[index] for index in idx]) print 'init params' t0 = time.time() params = 
model.init_params(model_options) params_c = model_c.init_params(model_options_c) # reloading model_saved = 'model_files/model_resnet.npz' model_saved_c = 'model_files/model_c3d.npz' assert os.path.isfile(model_saved) print "Reloading model params..." params = utils.load_params(model_saved, params) params_c = utils.load_params(model_saved_c, params_c) tparams = utils.init_tparams(params) tparams_c = utils.init_tparams(params_c) trng, use_noise, \ x, mask, ctx, mask_ctx, \ cost, extra = \ model.build_model(tparams, model_options) alphas = extra[1] betas = extra[2] trng_c, use_noise_c, \ x_c, mask_c, ctx_c, mask_ctx_c, \ cost_c, extra_c = \ model_c.build_model(tparams_c, model_options_c) alphas_c = extra_c[1] betas_c = extra_c[2] print 'buliding sampler' f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng) f_init_c, f_next_c = model_c.build_sampler(tparams_c, model_options_c, use_noise_c, trng_c) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost, profile=False, on_unused_input='ignore') f_log_probs_c = theano.function([x_c, mask_c, ctx_c, mask_ctx_c], -cost_c, profile=False, on_unused_input='ignore') bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in engine.kf_train: tags = [engine.train[index] for index in idx] n_samples += len(tags) use_noise.set_value(1.) 
pd_start = time.time() x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c = data_engine.prepare_data( engine, tags) #print 'x:',x.shape,'ctx:',ctx.shape,'ctx_c:',ctx_c.shape pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue if numpy.mod(uidx, saveFreq) == 0: pass if numpy.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) print '------------- sampling from train ----------' x_s = x mask_s = mask ctx_s = ctx ctx_s_c = ctx_c ctx_mask_s = ctx_mask ctx_mask_s_c = ctx_mask_c model.sample_execute_ensemble(engine, model_options,model_options_c, tparams,tparams_c, f_init,f_init_c, f_next,f_next_c, x_s, ctx_s, ctx_mask_s, ctx_s_c, ctx_mask_s_c, trng) print '------------- sampling from valid ----------' idx = engine.kf_valid[numpy.random.randint(1, len(engine.kf_valid) - 1)] tags = [engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s, ctx_s_c,mask_ctx_s_c = data_engine.prepare_data(engine, tags) model.sample_execute_ensemble(engine, model_options,model_options_c, tparams,tparams_c, f_init, f_init_c, f_next, f_next_c, x_s, ctx_s, mask_ctx_s, ctx_s_c, mask_ctx_s_c, trng) if validFreq != -1 and numpy.mod(uidx, validFreq) == 0: current_params = utils.unzip(tparams) use_noise.set_value(0.) 
train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score_ensemble( model_type='attention', model_archive=current_params, options=model_options, options_c=model_options_c, engine=engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_init_c=f_init_c, f_next=f_next, f_next_c= f_next_c, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) if test_B4>0.52 and test_meteor>0.32: print 'Saving to %s...'%save_model_dir, numpy.savez( save_model_dir+'model_'+str(uidx)+'.npz', **current_params) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec'%(time.time() - t0_valid) # end of validatioin sys.exit() if debug: break if estop: break if debug: break # end 
for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f'%( n_samples, numpy.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' if best_p is not None: utils.zipp(best_p, tparams) use_noise.set_value(0.) valid_err = 0 test_err = 0 if not debug: #if valid: valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose']) #if test: #test_err, test_perp = self.pred_probs( # 'test', f_log_probs, # verbose=model_options['verbose']) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err)) params = copy.copy(best_p) numpy.savez(save_model_dir+'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = numpy.asarray(history_errs) best_valid_idx = history[:,6].argmin() numpy.savetxt(save_model_dir+'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
def train(
        self,
        random_seed=1234,
        dim_word=256,  # word vector dimensionality
        use_w2v=False,
        w2v_embs=None,
        ctx_dim=-1,  # context vector dimensionality, auto set
        obj_ctx_dim=-1,
        cond_dim=1024,  # the number of LSTM units
        fc_dim=1000,
        obj_fc_dim=512,
        n_words_out=1,
        n_layers_init=1,
        n_loops=1,
        encoder='none',
        encoder_dim=1000,
        patience=30,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        alpha_entropy_r=0.,
        lrate=0.01,
        selector=False,
        n_words=6500,
        maxlen=50,  # maximum length of the description
        optimizer=adadelta,
        clip_c=2.,
        batch_size=64,
        valid_batch_size=64,
        save_model_dir='./snapshots/',
        save_file_prefix='lstm',
        validFreq=10,
        saveFreq=1500,  # save the parameters after every saveFreq updates
        sampleFreq=1000,  # generate some samples after every sampleFreq updates
        video_feature='vggnet',
        saveto='model_best_so_far.npz',
        use_dropout=False,
        reload_=False,
        from_dir=None,
        verbose=True,
        debug=True):
    """Train the model with early stopping and return the final errors.

    All arguments are snapshotted into ``model_options``; ``ydim`` is added
    (largest value in ``train[2]`` plus one), ``obj_fc_dim`` is added to
    ``ctx_dim``, and the result is passed through ``validate_options``
    before being pickled next to the checkpoints.

    Args read directly by this function:
        patience: governs early stopping (both the look-back window on the
            validation history and the allowed number of bad validations).
        max_epochs: upper bound on the number of epochs.
        dispFreq, saveFreq, validFreq: act every that many updates; -1 for
            ``saveFreq``/``validFreq`` means once per epoch.
        decay_c, alpha_c, alpha_entropy_r: regularizer weights; each term is
            only added to the cost when its weight is positive.
        lrate: learning rate passed to the update function.
        optimizer: callable building ``(f_grad_shared, f_update)``.
        clip_c: gradient-norm clipping threshold (disabled when <= 0).
        batch_size, valid_batch_size: minibatch sizes.
        save_model_dir, save_file_prefix, saveto: the checkpoint is written
            to ``save_model_dir/<save_file_prefix>_<saveto>``.
        reload_, from_dir: when ``reload_`` is true, parameters are restored
            from ``from_dir/model_best_so_far.npz``.
    The remaining arguments are only stored in ``model_options``.

    Returns:
        ``(train_err, valid_err, test_err)`` measured with the best
        parameters found, or ``(1., 1., 1.)`` as soon as a NaN/inf training
        cost is observed.
    """
    self.rng_numpy, self.rng_theano = utils.get_two_rngs()

    # NOTE: the snapshot must be taken before any further local exists.
    model_options = locals().copy()
    if 'self' in model_options:
        del model_options['self']

    print('Loading data')
    # answer word dict, inverted so that indices map back to words
    with open('./data/ansdict.pkl', 'rb') as f:
        ans_word_dict = pickle.load(f)
    self.ans_words = dict((i, w) for w, i in ans_word_dict.items())
    # gif id -> list of frames
    with open('./data/tgif_key_dict.pkl', 'rb') as f:
        self.gif_dict = pickle.load(f)
    # gif id -> list of frames -> list of objects(regions)
    self.load_obj_dict('./data')

    saveto = os.path.join(save_model_dir, save_file_prefix + '_' + saveto)
    train, valid, test = load_data()
    ydim = numpy.max(train[2]) + 1

    model_options['ydim'] = ydim
    model_options['ctx_dim'] += model_options['obj_fc_dim']
    model_options = validate_options(model_options)
    model_options_file = save_file_prefix + '_model_options.pkl'
    with open('%s/%s' % (save_model_dir, model_options_file), 'wb') as f:
        pickle.dump(model_options, f)

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print('init params')
    t0 = time.time()
    params = self.init_params(model_options)

    # reloading
    if reload_:
        # os.path.join keeps this working whether or not from_dir ends
        # with a path separator.
        model_saved = os.path.join(from_dir, 'model_best_so_far.npz')
        assert os.path.isfile(model_saved)
        print("Reloading model params...")
        params = load_params(model_saved, params)

    tparams = init_tparams(params)

    (trng, use_noise, x, mask, ctx, mask_ctx, obj_ctx, mask_obj_ctx, y,
     alphas, cost, extra, f_pred_prob, f_pred,
     f_debug) = self.build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for vv in tparams.values():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    if alpha_entropy_r > 0:
        alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r),
                                        name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * (-tensor.sum(
            alphas * tensor.log(alphas + 1e-8), axis=-1)).sum(0).mean()
        cost += alpha_reg_2
    else:
        alpha_reg_2 = tensor.zeros_like(cost)

    symbolic_inputs = [x, mask, ctx, mask_ctx, obj_ctx, mask_obj_ctx, y]

    print('building f_alpha')
    f_alpha = theano.function(
        symbolic_inputs, [alphas, alpha_reg_2],
        name='f_alpha', on_unused_input='ignore')

    f_cost = theano.function(symbolic_inputs, cost, name='f_cost')

    print('compute grad')
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    if clip_c > 0.:
        # Rescale all gradients when their global L2 norm exceeds clip_c.
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        grads = [tensor.switch(g2 > (clip_c**2),
                               g / tensor.sqrt(g2) * clip_c,
                               g)
                 for g in grads]
    f_grad = theano.function(symbolic_inputs, grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, symbolic_inputs,
                                        cost)

    print('Optimization')
    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    alphas_ratio = []
    best_alpha_ratio = None
    best_p = None
    bad_counter = 0
    pred_error = self.pred_error

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    eidx = -1  # so that the epoch count below is 0 when no epoch ran
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size,
                                     shuffle=True)

            for _, train_index in kf:
                uidx += 1
                n_samples += len(train_index)
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                (x_args, y, _) = self.prepare_all(
                    prepare_data, train, train_index, split='train')
                all_args = x_args + [y]

                cost = f_grad_shared(*all_args)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    alphas, reg = f_alpha(*all_args)
                    print(
                        'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost,
                        'Alpha ratio %.3f, reg %.3f' %
                        (alphas.min(-1).mean() / (alphas.max(-1).mean()),
                         reg))

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print('Saving...')
                    # Nothing to write before the first validation has
                    # recorded a best alpha-ratio history.
                    if best_alpha_ratio is not None:
                        alpha_saveto = os.path.join(
                            save_model_dir,
                            save_file_prefix + '_alpha_ratio.txt')
                        numpy.savetxt(alpha_saveto, best_alpha_ratio)
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    with open('%s.pkl' % saveto, 'wb') as f:
                        pickle.dump(model_options, f, -1)
                    print('Done')

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    alphas, _ = f_alpha(*all_args)
                    ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean()
                    alphas_ratio.append(ratio)

                    train_err = pred_error(f_pred, prepare_data, train, kf,
                                           'train', f_debug)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid, 'val', f_debug)
                    test_err = pred_error(f_pred, prepare_data, test,
                                          kf_test, 'test', f_debug)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or valid_err <=
                            numpy.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        # Snapshot the history; keeping a reference would
                        # let later appends leak into the "best" record.
                        best_alpha_ratio = list(alphas_ratio)
                        bad_counter = 0

                    print(('Train ', train_err, 'Valid ', valid_err,
                           'Test ', test_err))
                    logging.info(
                        'Epoch: %d, Update: %d, Train %s, Valid %s, Test %s'
                        % (eidx, uidx, train_err, valid_err, test_err))

                    if (len(history_errs) > patience and valid_err >=
                            numpy.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interupted")

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted,
                           'train', f_debug)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid, 'val',
                           f_debug)
    test_err = pred_error(f_pred, prepare_data, test, kf_test, 'test',
                          f_debug)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)
    if saveto:
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    epochs_run = eidx + 1
    print('The code run for %d epochs, with %f sec/epochs' %
          (epochs_run,
           (end_time - start_time) / (1. * max(epochs_run, 1))))
    print(('Training took %.1fs' % (end_time - start_time)),
          file=sys.stderr)
    return train_err, valid_err, test_err