def train( dim_out=500, # hidden layer dim for outputs ctx_dim=1024, # context vector dimensionality dim=1024, # the number of LSTM units n_actions=3101, # number of actions to predict n_layers_att=1, n_layers_out=1, n_layers_init=1, ctx2out=False, patience=50, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., temperature_inverse=1.0, lrate=0.001, selector=False, maxlen=5, # maximum length of the video optimizer='sgd', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates dataset='flickr8k', # dummy dataset, replace with video ones dictionary=None, # word dictionary use_dropout=False, reload_=False, training_stride=1, testing_stride=8, last_n=16, fps=30): # Model options model_options = locals().copy() #model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print '-----' print 'Booting up all data handlers' data_pb = TrainProto(batch_size, maxlen, training_stride, dataset, fps) dh = DataHandler(data_pb) dataset_size = dh.GetDatasetSize() num_train_batches = dataset_size / batch_size if dataset_size % batch_size != 0: num_train_batches += 1 valid = True # not None test = True # not None data_test_train_pb = TestTrainProto(valid_batch_size, maxlen, testing_stride, dataset, fps) dh_test_train = DataHandler(data_test_train_pb) test_train_dataset_size = dh_test_train.GetDatasetSize() num_test_train_batches = test_train_dataset_size / valid_batch_size if test_train_dataset_size % valid_batch_size != 0: num_test_train_batches += 1 data_test_valid_pb = TestValidProto(valid_batch_size, maxlen, testing_stride, dataset, fps) dh_test_valid = DataHandler(data_test_valid_pb) test_valid_dataset_size = dh_test_valid.GetDatasetSize() num_test_valid_batches = test_valid_dataset_size / valid_batch_size if test_valid_dataset_size % valid_batch_size != 0: num_test_valid_batches += 1 data_test_test_pb = TestTestProto(valid_batch_size, maxlen, testing_stride, dataset, fps) dh_test_test = DataHandler(data_test_test_pb) test_test_dataset_size = dh_test_test.GetDatasetSize() num_test_test_batches = test_test_dataset_size / valid_batch_size if test_test_dataset_size % valid_batch_size != 0: num_test_test_batches += 1 print 'Data handlers ready' print '-----' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ inps,\ cost, \ opts_out, preds, i_gate = \ build_model(tparams, model_options) ''' get_i_gate = theano.function(inps[0:2], i_gate, profile=False, on_unused_input='ignore') print 'build get_i_gate felished' x, vid, n_ex = dh_test_train.GetBatch(data_test_train_pb) mask = numpy.ones((maxlen, batch_size)).astype('float32') if n_ex != batch_size: mask[:,n_ex:] = numpy.zeros((maxlen, batch_size-n_ex)).astype('float32') i_gate_np = get_i_gate(x,mask) print len(i_gate_np) print len(i_gate_np[0]) print len(i_gate_np[0][0]) print i_gate_np[0][0][0].shape weig = numpy.zeros((7,7,30,batch_size)) for i in xrange(7): for j in xrange(7): for k in xrange(30): weig[i,j,k,:] = numpy.mean(i_gate_np[k][j][i],axis=1) dic = {'weig':weig, 'vid':vid} sio.savemat('weig.mat', {'dic':dic}) train_err = 0 valid_err = 0 test_err = 0 ''' # before any regularizer f_log_probs = theano.function(inps, -cost, profile=False) f_preds = theano.function(inps, preds, profile=False, on_unused_input='ignore') cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay cost += 0.0001 * i_gate.sum() #if alpha_c > 0.: # alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') # alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() # cost += alpha_reg # gradient computation grads = tensor.grad(cost, wrt=itemlist(tparams)) lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = numpy.load(saveto)['history_errs'].tolist() best_p = None bad_count = 0 uidx = 0 try: for epochidx in xrange(max_epochs): # If the input sequences are of variable length get mask from the data loader instead of setting them all to one mask = numpy.ones((maxlen, batch_size)).astype('float32') print 'Epoch ', epochidx n_examples_seen = 0 estop = False if epochidx > 0: dh.Reset() for tbidx in xrange(num_train_batches): n_examples_seen += batch_size uidx += 1 use_noise.set_value(1.) pd_start = time.time() x, y, n_ex = dh.GetBatch(data_pb) if n_ex != batch_size: mask[:, n_ex:] = numpy.zeros( (maxlen, batch_size - n_ex)).astype('float32') pd_duration = time.time() - pd_start if x == None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() cost = f_grad_shared(x, mask, y) if uidx == 1: print 'Original Cost ', cost / x.shape[3] f_update(lrate) ud_duration = time.time() - ud_start if n_ex != batch_size: mask[:, n_ex:] = numpy.ones( (maxlen, batch_size - n_ex)).astype('float32') if numpy.isnan(cost): print 'NaN detected in cost' return 1., 1., 1. if numpy.isinf(cost): print 'INF detected in cost' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', epochidx, 'Update ', uidx, 'Cost ', cost / x.shape[ 3], 'PD ', pd_duration, 'UD ', ud_duration if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p != None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)' #train_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False) if valid is not None: valid_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_valid_pb, dh_test_valid, test_valid_dataset_size, num_test_valid_batches, last_n, test=True) #if test is not None: # test_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True) history_errs.append([valid_err, test_err]) if epochidx == 0 or valid_err >= numpy.array( history_errs)[:, 0].max(): best_p = unzip( tparams) # p for min valid err / max valid acc print 'Accuracy: Train', train_err, 'Valid', valid_err, 'Test', test_err if n_ex == batch_size: print 'Seen %d training examples' % (n_examples_seen) else: print 'Seen %d training examples' % (n_examples_seen - batch_size + n_ex) use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)' #train_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False) if valid is not None: valid_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_valid_pb, dh_test_valid, test_valid_dataset_size, num_test_valid_batches, last_n, test=True) if test is not None: test_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True) history_errs.append([valid_err, test_err]) if epochidx == 0 or valid_err >= numpy.array( history_errs)[:, 0].max(): best_p = unzip(tparams) # p for min valid err / max valid acc print 'Accuracy: Train', train_err, 'Valid', valid_err, 'Test', test_err finally: #except KeyboardInterrupt: if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)' #train_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False) if valid is not None: valid_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_valid_pb, dh_test_valid, test_valid_dataset_size, num_test_valid_batches, last_n, test=True) if test is not None: test_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True) print 'Accuracy: Train', train_err, 'Valid', valid_err, 'Test', test_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) print model_options return train_err, valid_err, test_err
def train( dim_out=100, # hidden layer dim for outputs ctx_dim=512, # context vector dimensionality dim=1000, # the number of LSTM units n_actions=3, # number of actions to predict n_layers_att=1, n_layers_out=1, n_layers_init=1, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., temperature_inverse=1.0, lrate=0.01, selector=False, maxlen=30, # maximum length of the video optimizer='adam', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates dataset='flickr8k', # dummy dataset, replace with video ones dictionary=None, # word dictionary use_dropout=False, reload_=False, training_stride=1, testing_stride=8, last_n=16, fps=100, data_dir='/home/pmorerio/datasets/IIT_IFM/'): # Model options model_options = locals().copy() #model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print '-----' print 'Booting up all data handlers' print 'Training set for actual training (randomized)' data_pb = TrainProto(batch_size, maxlen, training_stride, dataset, data_dir, fps) dh = DataHandler(data_pb) dataset_size = dh.GetDatasetSize() num_train_batches = dataset_size / batch_size if dataset_size % batch_size != 0: num_train_batches += 1 print num_train_batches, ' batches' valid = None # not None test = True # not None print 'Training set for training accuracy' # the training set is loaded twice: for actual training and for computing training error data_test_train_pb = TestTrainProto(valid_batch_size, maxlen, testing_stride, dataset, data_dir, fps) dh_test_train = DataHandler(data_test_train_pb) test_train_dataset_size = dh_test_train.GetDatasetSize() num_test_train_batches = test_train_dataset_size / valid_batch_size if test_train_dataset_size % valid_batch_size != 0: num_test_train_batches += 1 print num_test_train_batches, ' batches' if valid == True: print 'Validation set for validation accuracy' data_test_valid_pb = TestValidProto(valid_batch_size, maxlen, testing_stride, dataset, data_dir, fps) dh_test_valid = DataHandler(data_test_valid_pb) test_valid_dataset_size = dh_test_valid.GetDatasetSize() num_test_valid_batches = test_valid_dataset_size / valid_batch_size if test_valid_dataset_size % valid_batch_size != 0: num_test_valid_batches += 1 print num_test_valid_batches, ' batches' print 'Test set for test accuracy' data_test_test_pb = TestTestProto(valid_batch_size, maxlen, testing_stride, dataset, data_dir, fps) dh_test_test = DataHandler(data_test_test_pb) test_test_dataset_size = dh_test_test.GetDatasetSize() num_test_test_batches = test_test_dataset_size / valid_batch_size if test_test_dataset_size % valid_batch_size != 0: num_test_test_batches += 1 print num_test_test_batches, ' batches' print 'Data handlers ready' print '-----' print 'Building model' params = init_params(model_options) # actual parameter initialization # reload parameters if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) # simply initializes Theano shared variable according to param # numpy arrays -> theano shared variables tparams = init_tparams(params) # In order, we get: # 1) trng - theano random number generator # 2) use_noise - flag that turns on dropout # 3) inps - inputs for f_grad_shared # 4) alphas - the attention weigths # 4) cost - log likelihood for each sentence # 5) opts_out - optional outputs (e.g selector) # 6) preds - the computed labels trng, use_noise, \ inps, alphas, \ cost, \ opts_out, preds = \ build_model(tparams, model_options) # builds the whole computation graph # before any regularizer f_log_probs = theano.function(inps, -cost, profile=False) f_preds = theano.function(inps, preds, profile=False, on_unused_input='ignore') cost = cost.mean() # add L2 regularization costs if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # add attention penalty to the cost #if alpha_c > 0.: #alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') #alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() #cost += alpha_reg # add ATTENTION FOCUS to the cost if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = -alpha_c * ( alphas * tensor.log(alphas + 1e-8)).sum(0).sum(0).mean() cost += alpha_reg # Backpropagation # gradient computation grads = tensor.grad(cost, wrt=itemlist(tparams)) # f_grad_shared computes the cost and updates adaptive learning rate variables # f_update updates the weights of the model lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' history_acc = [] # reload history if reload_ and os.path.exists(saveto): history_acc = numpy.load(saveto)['history_acc'].tolist() best_p = None bad_count = 0 uidx = 0 train_acc = 0 valid_acc = 0 test_acc = 0 for epochidx in xrange(max_epochs): # If the input sequences are of variable length get mask from the data loader instead of setting them all to one mask = numpy.ones((maxlen, batch_size)).astype('float32') print 'Epoch ', epochidx n_examples_seen = 0 estop = False # not used #if epochidx > 0: dh.Reset() # training data is shuffled at each epoch in Reset() udtime = 0 pdtime = 0 for tbidx in xrange(num_train_batches): n_examples_seen += batch_size uidx += 1 use_noise.set_value(1.) pd_start = time.time() x, y, n_ex = dh.GetBatch( data_pb ) # looks really slow. this is maybe why also predictions are slow (must get batches for all train/test/valid) if n_ex != batch_size: mask[:, n_ex:] = numpy.zeros( (maxlen, batch_size - n_ex)).astype('float32') pdtime += time.time() - pd_start # pd stands for prepare data? #if x == None: # this gives a Warning. Replaced with -> if x is None: if x is None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() cost = f_grad_shared(x, mask, y) f_update(lrate) udtime += time.time() - ud_start # ud stands for use data? if n_ex != batch_size: mask[:, n_ex:] = numpy.ones( (maxlen, batch_size - n_ex)).astype('float32') if numpy.isnan(cost): print 'NaN detected in cost' return 1., 1., 1. if numpy.isinf(cost): print 'INF detected in cost' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', epochidx, ' Update', uidx, ' Cost', cost, ' PD', pdtime / float( dispFreq), ' UD', udtime / float(dispFreq) pdtime = 0 udtime = 0 if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p != None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_acc=history_acc, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_acc = 0 valid_acc = 0 test_acc = 0 print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)' train_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False) if valid is not None: valid_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_valid_pb, dh_test_valid, test_valid_dataset_size, num_test_valid_batches, last_n, test=True) if test is not None: test_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True) history_acc.append([train_acc, valid_acc, test_acc]) if uidx == 0 or valid_acc >= numpy.array(history_acc)[:, 1].max(): best_p = unzip( tparams) # p for min valid err / max valid acc print 'Accuracy: Train', train_acc, 'Valid', valid_acc, 'Test', test_acc #here ends the cycle over the batches if n_ex == batch_size: print 'Seen %d training examples' % (n_examples_seen) else: print 'Seen %d training examples' % (n_examples_seen - batch_size + n_ex) use_noise.set_value(0.) train_acc = 0 valid_acc = 0 test_acc = 0 print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)' train_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False) if valid is not None: valid_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_valid_pb, dh_test_valid, test_valid_dataset_size, num_test_valid_batches, last_n, test=True) if test is not None: test_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True) history_acc.append([train_acc, valid_acc, test_acc]) if epochidx == 0 or valid_acc >= numpy.array(history_acc)[:, 1].max(): best_p = unzip(tparams) # p for min valid err / max valid acc print 'Accuracy: Train', train_acc, 'Valid', valid_acc, 'Test', test_acc # here ends the cycle over the epochs # use the best parameters for final checkpoint (if they exist) if best_p is not None: zipp(best_p, tparams) # if best param were found with validation, calculate accuracy with them if valid is not None: use_noise.set_value(0.) train_acc = 0 valid_acc = 0 test_acc = 0 print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)' train_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False) valid_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_valid_pb, dh_test_valid, test_valid_dataset_size, num_test_valid_batches, last_n, test=True) if test is not None: test_acc = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True) print 'Accuracy: Train', train_acc, 'Valid', valid_acc, 'Test', test_acc params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, train_acc=train_acc, valid_acc=valid_acc, test_acc=test_acc, history_acc=history_acc, **params) print model_options return train_acc, valid_acc, test_acc