def sgd(lr, tparams, grads, inp, cost):
    """Stochastic gradient descent.

    f_grad_shared runs the forward/backward pass and stores the gradients in
    shared variables; f_update then applies p <- p - lr * g to every parameter.
    """
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in six.iteritems(tparams)]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inp, cost, updates=gsup, name='sgd')

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup)

    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, inp, cost):
    """RMSProp variant with momentum on the update direction.

    Keeps running averages of the gradient and of the squared gradient, and
    rescales a momentum-smoothed step by the estimated gradient standard
    deviation sqrt(E[g^2] - E[g]^2 + eps).  The step size 1e-4 is hard-coded,
    so the `lr` input of f_update is unused.
    """
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in six.iteritems(tparams)]
    running_grads = [theano.shared(p.get_value() * np.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in six.iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in six.iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop')

    updir = [theano.shared(p.get_value() * np.float32(0.),
                           name='%s_updir' % k)
             for k, p in six.iteritems(tparams)]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    # udn[1] is the freshly computed update direction for each parameter
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]
    # `lr` does not appear in the updates, hence on_unused_input='ignore'
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, inp, cost):
    """Adadelta.

    Scales each gradient by the ratio of the running RMS of previous updates
    to the running RMS of previous gradients, so no explicit learning rate is
    needed; the `lr` argument is kept only for a uniform interface.
    """
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in six.iteritems(tparams)]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in six.iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in six.iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up,
                                    name='adadelta')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    # `lr` does not appear in the updates, hence on_unused_input='ignore'
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
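# ---------------------------------------------------------------------------
# Usage sketch (not part of the original code): all three factories above
# share the same calling convention, which the train() functions below rely
# on.  f_grad_shared runs a forward/backward pass and caches the gradients in
# shared variables, f_update then applies one parameter update.  The toy
# regression problem and the function name below are ours; the sketch assumes
# the repo's `itemlist` helper (already used by sgd/rmsprop/adadelta above)
# is in scope.
def _optimizer_usage_sketch():
    from collections import OrderedDict
    import numpy as np
    import theano
    import theano.tensor as tensor

    # toy model: fit y = w * x + b with float32 parameters
    tparams = OrderedDict()
    tparams['w'] = theano.shared(np.zeros((1,), dtype='float32'), name='w')
    tparams['b'] = theano.shared(np.zeros((1,), dtype='float32'), name='b')

    x = tensor.fvector('x')
    y = tensor.fvector('y')
    pred = tparams['w'][0] * x + tparams['b'][0]
    cost = ((pred - y) ** 2).mean()
    grads = tensor.grad(cost, wrt=list(tparams.values()))

    lr = tensor.fscalar('lr')
    f_grad_shared, f_update = sgd(lr, tparams, grads, [x, y], cost)

    data_x = np.arange(10).astype('float32')
    data_y = (2. * data_x + 1.).astype('float32')
    for _ in range(200):
        c = f_grad_shared(data_x, data_y)   # cost + gradient computation
        f_update(np.float32(0.01))          # p <- p - lr * g
    return c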
def train(dim_word=100, # word vector dimensionality ctx_dim=512, # context vector dimensionality dim=1000, # the number of LSTM units attn_type='stochastic', # [see section 4 from paper] n_layers_att=1, # number of layers used to compute the attention weights n_layers_out=1, # number of layers used to compute logit n_layers_lstm=1, # number of lstm layers n_layers_init=1, # number of layers to initialize LSTM at time 0 lstm_encoder=False, # if True, run bidirectional LSTM on input units prev2out=False, # Feed previous word into logit ctx2out=False, # Feed attention weighted ctx into logit alpha_entropy_c=0.002, # hard attn param RL_sumCost=True, # hard attn param semi_sampling_p=0.5, # hard attn param temperature=1., # hard attn param patience=10, max_epochs=5000, dispFreq=100, decay_c=0., # weight decay coeff alpha_c=0., # doubly stochastic coeff lrate=0.01, # used only for SGD selector=False, # selector (see paper) n_words=10000, # vocab size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size = 16, valid_batch_size = 16, saveto='model.npz', # relative path of saved model file validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq updates data_path='./data', # path to find data dataset='flickr8k', dictionary=None, # word dictionary use_dropout=False, # setting this true turns on dropout at various points use_dropout_lstm=False, # dropout on lstm gates reload_=False, save_per_epoch=False): # this saves down the model every epoch # hyperparam dict model_options = locals().copy() model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl'%saveto, 'rb') as f: model_options = pkl.load(f) print "Using the following parameters:" print model_options print 'Loading data' load_data, prepare_data = get_dataset(dataset) train, valid, test, worddict = load_data(path=data_path) if dataset == 'coco': valid, _ = valid # the second one contains all the validation data # index 0 and 1 always code for the end of sentence and unknown token word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # Initialize (or reload) the parameters using 'model_options' # then build the Theano graph print 'Building model' params = init_params(model_options) if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) # numpy arrays -> theano shared variables tparams = init_tparams(params) # In order, we get: # 1) trng - theano random number generator # 2) use_noise - flag that turns on dropout # 3) inps - inputs for f_grad_shared # 4) cost - log likelihood for each sentence # 5) opts_out - optional outputs (e.g selector) trng, use_noise, \ inps, alphas, alphas_sample,\ cost, \ opt_outs = \ build_model(tparams, model_options) # To sample, we use beam search: 1) f_init is a function that initializes # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over # words and also the new "initial state/memory" see equation print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, use_noise, trng) # we want the cost without any the regularizers # define the log probability f_log_probs = theano.function(inps, -cost, profile=False, updates=opt_outs['attn_updates'] if model_options['attn_type']=='stochastic' else None, allow_input_downcast=True) # Define the cost function + 
Regularization cost = cost.mean() # add L2 regularization costs if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # Doubly stochastic regularization if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() cost += alpha_reg hard_attn_updates = [] # Backprop! if model_options['attn_type'] == 'deterministic': grads = tensor.grad(cost, wrt=itemlist(tparams)) else: # shared variables for hard attention baseline_time = theano.shared(numpy.float32(0.), name='baseline_time') opt_outs['baseline_time'] = baseline_time alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c') alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean() # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule] if model_options['RL_sumCost']: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:(baseline_time-opt_outs['masked_cost'].mean(0))[None,:,None]/10.* (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) else: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:opt_outs['masked_cost'][:,:,None]/10.* (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) # [equation on bottom left of page 5] hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())] # updates from scan hard_attn_updates += opt_outs['attn_updates'] # to getthe cost after regularization or the gradients, use this # f_cost = theano.function([x, mask, ctx], cost, profile=False) # f_grad = theano.function([x, mask, ctx], grads, profile=False) # f_grad_shared computes the cost and updates adaptive learning rate variables # f_update updates the weights of the model lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates) print 'Optimization' # [See note in section 4.3 of paper] train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen) if valid: kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False) if test: kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False) # history_errs is a bare-bones training log that holds the validation and test error history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = numpy.load(saveto)['history_errs'].tolist() best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for caps in train_iter: n_samples += len(caps) uidx += 1 # turn on dropout use_noise.set_value(1.) 
# preprocess the caption, recording the # time spent to help detect bottlenecks pd_start = time.time() x, mask, ctx = prepare_data(caps, train[1], worddict, maxlen=maxlen, n_words=n_words) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue # get the cost for the minibatch, and update the weights ud_start = time.time() cost = f_grad_shared(x, mask, ctx) f_update(lrate) ud_duration = time.time() - ud_start # some monitoring for each mini-batch # Numerical stability check if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration # Checkpoint if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' # Print a generated sample as a sanity check if numpy.mod(uidx, sampleFreq) == 0: # turn off dropout first use_noise.set_value(0.) x_s = x mask_s = mask ctx_s = ctx # generate and decode the a subset of the current training batch for jj in xrange(numpy.minimum(10, len(caps))): sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options, trng=trng, k=5, maxlen=30, stochastic=False) # Decode the sample from encoding back to words print 'Truth ',jj,': ', for vv in x_s[:,jj]: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample[0]]): print 'Sample (', kk,') ', jj, ': ', for vv in ss: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print # Log validation loss + checkpoint the model with the best validation log likelihood if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 if valid: valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean() if test: test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean() history_errs.append([valid_err, test_err]) # the model with the best validation long likelihood is saved seperately with a different name if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min(): best_p = unzip(tparams) print 'Saving model with best validation ll' params = copy.copy(best_p) params = unzip(tparams) numpy.savez(saveto+'_bestll', history_errs=history_errs, **params) bad_counter = 0 # abort training if perplexity has been increasing for too long if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience,0].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err print 'Seen %d samples' % n_samples if estop: break if save_per_epoch: numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams)) # use the best nll parameters for final checkpoint (if they exist) if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) 
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict,
                                prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict,
                               prepare_data, test, kf_test)
    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err,
                history_errs=history_errs, **params)

    return train_err, valid_err, test_err
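# ---------------------------------------------------------------------------
# Invocation sketch (an assumption, not part of the original script): the
# stochastic "hard" attention trainer above is typically launched from a thin
# driver like the one below.  Only the keyword names come from the train()
# signature; the dataset, paths and values are placeholders, and the wrapper
# assumes this definition of train() is the one in scope.
def _run_hard_attention_example():
    return train(attn_type='stochastic',   # Section 4.1, "hard" attention
                 dataset='flickr8k',
                 data_path='./data',
                 optimizer='rmsprop',
                 batch_size=16,
                 valid_batch_size=16,
                 maxlen=100,
                 lrate=0.01,
                 saveto='model.npz',
                 reload_=False)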
def train(dim_word=100, # word vector dimensionality ctx_dim=512, # context vector dimensionality dim=1000, # the number of LSTM units attn_type='deterministic', # [see section 4 from paper] n_layers_att=1, # number of layers used to compute the attention weights n_layers_out=1, # number of layers used to compute logit n_layers_lstm=1, # number of lstm layers n_layers_init=1, # number of layers to initialize LSTM at time 0 lstm_encoder=False, # if True, run bidirectional LSTM on input units prev2out=False, # Feed previous word into logit ctx2out=False, # Feed attention weighted ctx into logit alpha_entropy_c=0.002, # hard attn param RL_sumCost=False, # hard attn param semi_sampling_p=0.5, # hard attn param temperature=1., # hard attn param patience=10, max_epochs=5000, dispFreq=100, decay_c=0., # weight decay coeff alpha_c=0., # doubly stochastic coeff lrate=0.01, # used only for SGD selector=False, # selector (see paper) n_words=10000, # vocab size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size = 16, valid_batch_size = 2,#change from 16 saveto='model.npz', # relative path of saved model file validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=5, # generate some samples after every sampleFreq updates data_path='./data', # path to find data dataset='flickr30k', dictionary=None, # word dictionary use_dropout=False, # setting this true turns on dropout at various points use_dropout_lstm=False, # dropout on lstm gates reload_=False, save_per_epoch=False): # this saves down the model every epoch # hyperparam dict model_options = locals().copy() model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl'%saveto, 'rb') as f: model_options = pkl.load(f) print "Using the following parameters:" print model_options print 'Loading data' load_data, prepare_data = get_dataset(dataset) train, valid, test, worddict = load_data(path=data_path) # index 0 and 1 always code for the end of sentence and unknown token word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # Initialize (or reload) the parameters using 'model_options' # then build the Theano graph print 'Building model' params = init_params(model_options) if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) # numpy arrays -> theano shared variables tparams = init_tparams(params) # In order, we get: # 1) trng - theano random number generator # 2) use_noise - flag that turns on dropout # 3) inps - inputs for f_grad_shared # 4) cost - log likelihood for each sentence # 5) opts_out - optional outputs (e.g selector) trng, use_noise, \ inps, alphas, alphas_sample,\ cost, \ opt_outs = \ build_model(tparams, model_options) # To sample, we use beam search: 1) f_init is a function that initializes # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over # words and also the new "initial state/memory" see equation print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, use_noise, trng) # we want the cost without any the regularizers # define the log probability f_log_probs = theano.function(inps, -cost, profile=False, updates=opt_outs['attn_updates'] if model_options['attn_type']=='stochastic' else None, allow_input_downcast=True) # Define the cost function + Regularization cost = cost.mean() # add L2 regularization costs if decay_c 
> 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # Doubly stochastic regularization if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() cost += alpha_reg hard_attn_updates = [] # Backprop! if model_options['attn_type'] == 'deterministic': grads = tensor.grad(cost, wrt=itemlist(tparams)) else: # shared variables for hard attention baseline_time = theano.shared(numpy.float32(0.), name='baseline_time') opt_outs['baseline_time'] = baseline_time alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c') alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean() # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule] if model_options['RL_sumCost']: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:(baseline_time-opt_outs['masked_cost'].mean(0))[None,:,None]/10.* (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) else: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:opt_outs['masked_cost'][:,:,None]/10.* (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) # [equation on bottom left of page 5] hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())] # updates from scan hard_attn_updates += opt_outs['attn_updates'] # to getthe cost after regularization or the gradients, use this # f_cost = theano.function([x, mask, ctx], cost, profile=False) # f_grad = theano.function([x, mask, ctx], grads, profile=False) # f_grad_shared computes the cost and updates adaptive learning rate variables # f_update updates the weights of the model lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates) print 'Optimization' # [See note in section 4.3 of paper] train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen) if valid: kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False) if test: kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False) # history_errs is a bare-bones training log that holds the validation and test error history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = numpy.load(saveto)['history_errs'].tolist() best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for caps in train_iter: n_samples += len(caps) uidx += 1 # turn on dropout use_noise.set_value(1.) 
# preprocess the caption, recording the # time spent to help detect bottlenecks pd_start = time.time() x, mask, ctx = prepare_data(caps, train[1], worddict, maxlen=maxlen, n_words=n_words) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue # get the cost for the minibatch, and update the weights ud_start = time.time() cost = f_grad_shared(x, mask, ctx) f_update(lrate) ud_duration = time.time() - ud_start # some monitoring for each mini-batch # Numerical stability check if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration # Checkpoint if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' # Print a generated sample as a sanity check if numpy.mod(uidx, sampleFreq) == 0: # turn off dropout first use_noise.set_value(0.) x_s = x mask_s = mask ctx_s = ctx # generate and decode the a subset of the current training batch for jj in xrange(numpy.minimum(10, len(caps))): sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options, trng=trng, k=5, maxlen=30, stochastic=False) # Decode the sample from encoding back to words print 'Truth ',jj,': ', for vv in x_s[:,jj]: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample[0]]): print 'Sample (', kk,') ', jj, ': ', for vv in ss: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print # Log validation loss + checkpoint the model with the best validation log likelihood if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 if valid: valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean() if test: test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean() history_errs.append([valid_err, test_err]) # the model with the best validation long likelihood is saved seperately with a different name if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min(): best_p = unzip(tparams) print 'Saving model with best validation ll' params = copy.copy(best_p) params = unzip(tparams) numpy.savez(saveto+'_bestll', history_errs=history_errs, **params) bad_counter = 0 # abort training if perplexity has been increasing for too long if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience,0].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err print 'Seen %d samples' % n_samples if estop: break if save_per_epoch: numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams)) # use the best nll parameters for final checkpoint (if they exist) if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) 
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict,
                                prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict,
                               prepare_data, test, kf_test)
    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err,
                history_errs=history_errs, **params)

    return train_err, valid_err, test_err
def train(args, model_args): #model_id = '/data/lisatmp4/lambalex/lsun_walkback/walkback_' model_id = '/data/lisatmp4/anirudhg/cifar_walk_back/walkback_' model_dir = create_log_dir(args, model_id) model_id2 = 'logs/walkback_' model_dir2 = create_log_dir(args, model_id2) print model_dir print model_dir2 + '/' + 'log.jsonl.gz' logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None) # TODO batches_per_epoch should not be hard coded lrate = args.lr import sys sys.setrecursionlimit(10000000) args, model_args = parse_args() #trng = RandomStreams(1234) if args.resume_file is not None: print "Resuming training from " + args.resume_file from blocks.scripts import continue_training continue_training(args.resume_file) ## load the training data if args.dataset == 'MNIST': print 'loading MNIST' from fuel.datasets import MNIST dataset_train = MNIST(['train'], sources=('features', )) dataset_test = MNIST(['test'], sources=('features', )) n_colors = 1 spatial_width = 28 elif args.dataset == 'CIFAR10': from fuel.datasets import CIFAR10 dataset_train = CIFAR10(['train'], sources=('features', )) dataset_test = CIFAR10(['test'], sources=('features', )) n_colors = 3 spatial_width = 32 elif args.dataset == "lsun" or args.dataset == "lsunsmall": print "loading lsun class!" from load_lsun import load_lsun print "loading lsun data!" if args.dataset == "lsunsmall": dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True) spatial_width = 32 else: dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False) spatial_width = 64 n_colors = 3 elif args.dataset == "celeba": print "loading celeba data" from fuel.datasets.celeba import CelebA dataset_train = CelebA(which_sets=['train'], which_format="64", sources=('features', ), load_in_memory=False) dataset_test = CelebA(which_sets=['test'], which_format="64", sources=('features', ), load_in_memory=False) spatial_width = 64 n_colors = 3 tr_scheme = SequentialScheme(examples=dataset_train.num_examples, batch_size=args.batch_size) ts_scheme = SequentialScheme(examples=dataset_test.num_examples, batch_size=args.batch_size) train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme) test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme) dataset_train = train_stream dataset_test = test_stream #epoch_it = train_stream.get_epoch_iterator() elif args.dataset == 'Spiral': print 'loading SPIRAL' train_set = Spiral(num_examples=100000, classes=1, cycles=2., noise=0.01, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) else: raise ValueError("Unknown dataset %s." % args.dataset) model_options = locals().copy() if args.dataset != 'lsun' and args.dataset != 'celeba': train_stream = Flatten( DataStream.default_stream( dataset_train, iteration_scheme=ShuffledScheme( examples=dataset_train.num_examples - (dataset_train.num_examples % args.batch_size), batch_size=args.batch_size))) else: train_stream = dataset_train test_stream = dataset_test print "Width", WIDTH, spatial_width shp = next(train_stream.get_epoch_iterator())[0].shape print "got epoch iterator" # make the training data 0 mean and variance 1 # TODO compute mean and variance on full dataset, not minibatch Xbatch = next(train_stream.get_epoch_iterator())[0] scl = 1. 
/ np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2)) shft = -np.mean(Xbatch * scl) # scale is applied before shift #train_stream = ScaleAndShift(train_stream, scl, shft) #test_stream = ScaleAndShift(test_stream, scl, shft) print 'Building model' params = init_params(model_options) if args.reload_: print "Trying to reload parameters" if os.path.exists(args.saveto_filename): print 'Reloading Parameters' print args.saveto_filename params = load_params(args.saveto_filename, params) tparams = init_tparams(params) print tparams ''' x = T.matrix('x', dtype='float32') temp = T.scalar('temp', dtype='float32') f=transition_operator(tparams, model_options, x, temp) for data in train_stream.get_epoch_iterator(): print data[0] a = f([data[0], 1.0, 1]) #ipdb.set_trace() ''' x, cost, start_temperature = build_model(tparams, model_options) inps = [x, start_temperature] x_Data = T.matrix('x_Data', dtype='float32') temperature = T.scalar('temperature', dtype='float32') forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature) #print 'Building f_cost...', #f_cost = theano.function(inps, cost) #print 'Done' print tparams grads = T.grad(cost, wrt=itemlist(tparams)) #get_grads = theano.function(inps, grads) for j in range(0, len(grads)): grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j]) # compile the optimizer, the actual computational graph is compiled here lr = T.scalar(name='lr') print 'Building optimizers...', optimizer = args.optimizer f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' for param in tparams: print param print tparams[param].get_value().shape print 'Buiding Sampler....' f_sample = sample(tparams, model_options) print 'Done' uidx = 0 estop = False bad_counter = 0 max_epochs = 4000 batch_index = 1 print 'Number of steps....' print args.num_steps print "Number of metasteps...." print args.meta_steps print 'Done' count_sample = 1 for eidx in xrange(max_epochs): if eidx % 20 == 0: params = unzip(tparams) save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz') n_samples = 0 print 'Starting Next Epoch ', eidx for data in train_stream.get_epoch_iterator(): if args.dataset == 'CIFAR10': if data[0].shape[0] == args.batch_size: data_use = (data[0].reshape(args.batch_size, 3 * 32 * 32), ) else: continue t0 = time.time() batch_index += 1 n_samples += len(data_use[0]) uidx += 1 if data_use[0] is None: print 'No data ' uidx -= 1 continue ud_start = time.time() t1 = time.time() data_run = data_use[0] temperature_forward = args.temperature meta_cost = [] for meta_step in range(0, args.meta_steps): meta_cost.append(f_grad_shared(data_run, temperature_forward)) f_update(lrate) if args.meta_steps > 1: data_run, sigma, _, _ = forward_diffusion( [data_run, temperature_forward, 1]) temperature_forward *= args.temperature_factor cost = sum(meta_cost) / len(meta_cost) ud = time.time() - ud_start #gradient_updates_ = get_grads(data_use[0],args.temperature) if np.isnan(cost) or np.isinf(cost): print 'NaN detected' return 1. 
t1 = time.time() #print time.time() - t1, "time to get grads" t1 = time.time() logger.log({ 'epoch': eidx, 'batch_index': batch_index, 'uidx': uidx, 'training_error': cost }) #'Norm_1': np.linalg.norm(gradient_updates_[0]), #'Norm_2': np.linalg.norm(gradient_updates_[1]), #'Norm_3': np.linalg.norm(gradient_updates_[2]), #'Norm_4': np.linalg.norm(gradient_updates_[3])}) #print time.time() - t1, "time to log" #print time.time() - t0, "total time in batch" t5 = time.time() if batch_index % 20 == 0: print batch_index, "cost", cost if batch_index % 200 == 0: count_sample += 1 temperature = args.temperature * (args.temperature_factor**( args.num_steps * args.meta_steps - 1)) temperature_forward = args.temperature for num_step in range(args.num_steps * args.meta_steps): print "Forward temperature", temperature_forward if num_step == 0: x_data, sampled, sampled_activation, sampled_preactivation = forward_diffusion( [data_use[0], temperature_forward, 1]) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) x_temp = x_data.reshape(args.batch_size, n_colors, WIDTH, WIDTH) plot_images( x_temp, model_dir + '/' + "batch_" + str(batch_index) + '_corrupted' + 'epoch_' + str(count_sample) + '_time_step_' + str(num_step)) else: x_data, sampled, sampled_activation, sampled_preactivation = forward_diffusion( [x_data, temperature_forward, 1]) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) x_temp = x_data.reshape(args.batch_size, n_colors, WIDTH, WIDTH) plot_images( x_temp, model_dir + '/batch_' + str(batch_index) + '_corrupted' + '_epoch_' + str(count_sample) + '_time_step_' + str(num_step)) temperature_forward = temperature_forward * args.temperature_factor x_temp2 = data_use[0].reshape(args.batch_size, n_colors, WIDTH, WIDTH) plot_images( x_temp2, model_dir + '/' + 'orig_' + 'epoch_' + str(eidx) + '_batch_index_' + str(batch_index)) temperature = args.temperature * (args.temperature_factor**( args.num_steps * args.meta_steps - 1)) for i in range(args.num_steps * args.meta_steps + args.extra_steps): x_data, sampled, sampled_activation, sampled_preactivation = f_sample( [x_data, temperature, 0]) print 'On backward step number, using temperature', i, temperature reverse_time( scl, shft, x_data, model_dir + '/' + "batch_" + str(batch_index) + '_samples_backward_' + 'epoch_' + str(count_sample) + '_time_step_' + str(i)) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor if args.noise == "gaussian": x_sampled = np.random.normal( 0.5, 2.0, size=(args.batch_size, INPUT_SIZE)).clip(0.0, 1.0) else: s = np.random.binomial(1, 0.5, INPUT_SIZE) temperature = args.temperature * (args.temperature_factor**( args.num_steps * args.meta_steps - 1)) x_data = np.asarray(x_sampled).astype('float32') for i in range(args.num_steps * args.meta_steps + args.extra_steps): x_data, sampled, sampled_activation, sampled_preactivation = f_sample( [x_data, temperature, 0]) print 'On step number, using temperature', i, temperature reverse_time( scl, shft, x_data, model_dir + '/batch_index_' + str(batch_index) + '_inference_' + 'epoch_' + str(count_sample) + '_step_' + str(i)) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor ipdb.set_trace()
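# ---------------------------------------------------------------------------
# Temperature-schedule sketch (illustration only): the sampling loops above
# anneal a temperature downwards.  It starts at
# temperature * factor**(num_steps * meta_steps - 1), is divided by `factor`
# after every backward step, and is clamped at the base temperature.  The
# helper below just reproduces that schedule for inspection; the function
# name is ours, not the repo's.
def _walkback_temperature_schedule(base, factor, num_steps, meta_steps,
                                   extra_steps):
    temps = []
    temperature = base * (factor ** (num_steps * meta_steps - 1))
    for _ in range(num_steps * meta_steps + extra_steps):
        temps.append(temperature)
        if temperature != base:     # mirrors the clamp in the loops above
            temperature /= factor
    return temps

# e.g. _walkback_temperature_schedule(1.0, 2.0, 3, 2, 1)
# -> [32.0, 16.0, 8.0, 4.0, 2.0, 1.0, 1.0]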
def train(args, model_args): model_id = '/data/lisatmp4/anirudhg/spiral_walk_back/walkback_' model_dir = create_log_dir(args, model_id) model_id2 = 'logs/walkback_' model_dir2 = create_log_dir(args, model_id2) print model_dir print model_dir2 + '/' + 'log.jsonl.gz' logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None) # TODO batches_per_epoch should not be hard coded lrate = args.lr import sys sys.setrecursionlimit(10000000) args, model_args = parse_args() #trng = RandomStreams(1234) if args.resume_file is not None: print "Resuming training from " + args.resume_file from blocks.scripts import continue_training continue_training(args.resume_file) ## load the training data if args.dataset == 'MNIST': print 'loading MNIST' from fuel.datasets import MNIST dataset_train = MNIST(['train'], sources=('features', )) dataset_test = MNIST(['test'], sources=('features', )) n_colors = 1 spatial_width = 28 elif args.dataset == 'CIFAR10': from fuel.datasets import CIFAR10 dataset_train = CIFAR10(['train'], sources=('features', )) dataset_test = CIFAR10(['test'], sources=('features', )) n_colors = 3 spatial_width = 32 elif args.dataset == "lsun" or args.dataset == "lsunsmall": print "loading lsun class!" from load_lsun import load_lsun print "loading lsun data!" if args.dataset == "lsunsmall": dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True) spatial_width = 32 else: dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False) spatial_width = 64 n_colors = 3 elif args.dataset == "celeba": print "loading celeba data" from fuel.datasets.celeba import CelebA dataset_train = CelebA(which_sets=['train'], which_format="64", sources=('features', ), load_in_memory=False) dataset_test = CelebA(which_sets=['test'], which_format="64", sources=('features', ), load_in_memory=False) spatial_width = 64 n_colors = 3 tr_scheme = SequentialScheme(examples=dataset_train.num_examples, batch_size=args.batch_size) ts_scheme = SequentialScheme(examples=dataset_test.num_examples, batch_size=args.batch_size) train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme) test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme) dataset_train = train_stream dataset_test = test_stream #epoch_it = train_stream.get_epoch_iterator() elif args.dataset == 'Spiral': print 'loading SPIRAL' train_set = Spiral(num_examples=20000, classes=1, cycles=1., noise=0.01, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) elif args.dataset == 'Circle': print 'loading Circle' train_set = Circle(num_examples=20000, classes=1, cycles=1., noise=0.0, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) iter_per_epoch = train_set.num_examples else: raise ValueError("Unknown dataset %s." % args.dataset) model_options = locals().copy() train_stream = dataset_train shp = next(train_stream.get_epoch_iterator())[0].shape print "got epoch iterator" # make the training data 0 mean and variance 1 # TODO compute mean and variance on full dataset, not minibatch Xbatch = next(train_stream.get_epoch_iterator())[0] scl = 1. 
/ np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2)) shft = -np.mean(Xbatch * scl) # scale is applied before shift #train_stream = ScaleAndShift(train_stream, scl, shft) #test_stream = ScaleAndShift(test_stream, scl, shft) print 'Building model' params = init_params(model_options) if args.reload_: print "Trying to reload parameters" if os.path.exists(args.saveto_filename): print 'Reloading Parameters' print args.saveto_filename params = load_params(args.saveto_filename, params) tparams = init_tparams(params) print tparams x, cost, start_temperature = build_model(tparams, model_options) inps = [x, start_temperature] x_Data = T.matrix('x_Data', dtype='float32') temperature = T.scalar('temperature', dtype='float32') forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature) #print 'Building f_cost...', #f_cost = theano.function(inps, cost) #print 'Done' print tparams grads = T.grad(cost, wrt=itemlist(tparams)) #get_grads = theano.function(inps, grads) for j in range(0, len(grads)): grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j]) # compile the optimizer, the actual computational graph is compiled here lr = T.scalar(name='lr') print 'Building optimizers...', optimizer = args.optimizer f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Buiding Sampler....' f_sample = sample(tparams, model_options) print 'Done' uidx = 0 estop = False bad_counter = 0 max_epochs = 4000 batch_index = 0 print 'Number of steps....', args.num_steps print 'Done' count_sample = 1 batch_index = 0 for eidx in xrange(max_epochs): if eidx % 20 == 0: params = unzip(tparams) save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz') if eidx == 30: ipdb.set_trace() n_samples = 0 print 'Starting Next Epoch ', eidx for data in train_stream.get_epoch_iterator(): batch_index += 1 n_samples += len(data[0]) uidx += 1 if data[0] is None: print 'No data ' uidx -= 1 continue data_run = data[0] temperature_forward = args.temperature meta_cost = [] for meta_step in range(0, args.meta_steps): meta_cost.append(f_grad_shared(data_run, temperature_forward)) f_update(lrate) if args.meta_steps > 1: data_run, sigma, _, _ = forward_diffusion( data_run, temperature_forward) temperature_forward *= args.temperature_factor cost = sum(meta_cost) / len(meta_cost) if np.isnan(cost) or np.isinf(cost): print 'NaN detected' return 1. 
logger.log({ 'epoch': eidx, 'batch_index': batch_index, 'uidx': uidx, 'training_error': cost }) empty = [] spiral_x = [empty for i in range(args.num_steps)] spiral_corrupted = [] spiral_sampled = [] grad_forward = [] grad_back = [] x_data_time = [] x_tilt_time = [] if batch_index % 8 == 0: count_sample += 1 temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) temperature_forward = args.temperature for num_step in range(args.num_steps): if num_step == 0: x_data_time.append(data[0]) plot_images( data[0], model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) x_data, mu_data, _, _ = forward_diffusion( data[0], temperature_forward) plot_images( x_data, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(num_step)) x_data_time.append(x_data) temp_grad = np.concatenate( (x_data_time[-2], x_data_time[-1]), axis=1) grad_forward.append(temp_grad) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) spiral_corrupted.append(x_data) mu_data = np.asarray(mu_data).astype( 'float32').reshape(args.batch_size, INPUT_SIZE) mu_data = mu_data.reshape(args.batch_size, 2) else: x_data_time.append(x_data) x_data, mu_data, _, _ = forward_diffusion( x_data, temperature_forward) plot_images( x_data, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(num_step)) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) spiral_corrupted.append(x_data) mu_data = np.asarray(mu_data).astype( 'float32').reshape(args.batch_size, INPUT_SIZE) mu_data = mu_data.reshape(args.batch_size, 2) x_data_time.append(x_data) temp_grad = np.concatenate( (x_data_time[-2], x_data_time[-1]), axis=1) grad_forward.append(temp_grad) temperature_forward = temperature_forward * args.temperature_factor mean_sampled = x_data.mean() var_sampled = x_data.var() x_temp2 = data[0].reshape(args.batch_size, 2) plot_2D( spiral_corrupted, args.num_steps, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) plot_2D( x_temp2, 1, model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index)) plot_grad( grad_forward, model_dir + '/' + 'grad_forward_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) for i in range(args.num_steps + args.extra_steps): x_tilt_time.append(x_data) x_data, sampled_mean = f_sample(x_data, temperature) plot_images( x_data, model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(i)) x_tilt_time.append(x_data) temp_grad = np.concatenate( (x_tilt_time[-2], x_tilt_time[-1]), axis=1) grad_back.append(temp_grad) ###print 'Recons, On step number, using temperature', i, temperature x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor plot_grad( grad_back, model_dir + '/' + 'grad_back_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) plot_2D( x_tilt_time, args.num_steps, model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) s = np.random.normal(mean_sampled, var_sampled, [args.batch_size, 2]) x_sampled = s temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) x_data = np.asarray(x_sampled).astype('float32') for i in 
range(args.num_steps + args.extra_steps): x_data, sampled_mean = f_sample(x_data, temperature) spiral_sampled.append(x_data) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor plot_2D( spiral_sampled, args.num_steps, model_dir + '/' + 'inference_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) ipdb.set_trace()
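# ---------------------------------------------------------------------------
# Normalization sketch (illustration only): near the top of this trainer,
# scl = 1 / std(X) and shft = -mean(X * scl) are computed so that
# x * scl + shft would have roughly zero mean and unit variance on the
# minibatch (the ScaleAndShift stream itself is commented out, but scl/shft
# are still passed to reverse_time).  The helpers below restate that
# transform and its inverse, which is presumably what reverse_time applies;
# the names are ours, and they rely on the module-level numpy import (`np`).
def _scale_and_shift_params(Xbatch):
    scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch)) ** 2))
    shft = -np.mean(Xbatch * scl)
    return scl, shft

def _to_data_space(x_norm, scl, shft):
    # invert x_norm = x * scl + shft
    return (x_norm - shft) / scl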
def build_model(options, tparams):
    """Build up the whole computation graph.

    Input is the features extracted from GoogLeNet.
    """
    last_n = options['last_n']
    actionNum = options['actions']
    decay_c = options['decay_c']
    use_dropout = options['use_dropout']
    use_wta = options['use_wta']
    featureMaps = options['featureMaps']
    videosize = options['videosize']

    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # combine model
    x = T.ftensor3('x')
    n_steps = x.shape[0]
    n_samples = x.shape[1]
    mask = T.fmatrix('mask')
    y = T.ftensor3('y')  # one-hot targets, n_steps * n_samples * actionNum

    _x = x.reshape([n_steps * n_samples, 3, videosize, videosize])
    _x = _x + use_noise * trng.normal(_x.shape, avg=0, std=0.1, dtype=_x.dtype)
    feature = CNN_block(_x)
    feature = feature.reshape([n_steps, n_samples, featureMaps])
    if use_dropout:
        feature = dropout_layer(feature, use_noise, trng)

    f0 = ff_build(tparams, feature, prefix="recog", name='fc0', active="tanh")
    f1 = ff_build(tparams, f0, prefix="recog", name='fc1', active="tanh")
    # if use_dropout: f0 = dropout_layer(f0, use_noise, trng)
    lin = ff_build(tparams, f1, prefix="recog", name='output',
                   active="linear")  # n_steps * n_samples * actionNum
    probs = T.nnet.softmax(lin.reshape([-1, actionNum]))
    probs = probs.reshape([n_steps, n_samples, actionNum])

    # compute cost
    cost = 0
    # cross entropy
    entropy_cost = -y * T.log(probs)
    entropy_cost = (entropy_cost.sum(2) * mask).mean(0).sum()
    cost += entropy_cost
    # weight decay
    weight_decay = 0.
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        for vv in itemlist(tparams):
            # if vv.ndim == 1:
            #     weight_decay += (vv ** 2).sum()
            # else:
            #     weight_decay += (vv.sum() - vv.shape[0]) ** 2
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # predictions: sum class probabilities over the last `last_n` time steps,
    # then take the argmax for each sample
    preds = T.sum(probs[-last_n:, :, :], axis=0)
    preds = T.argmax(preds, axis=1)  # n_samples
    # preds = T.argmax(probs[-last_n:, :, :], axis=2)

    return cost, preds, [], [x, mask, y], use_noise
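# ---------------------------------------------------------------------------
# Compilation sketch (assumption): the training script that drives this
# build_model() is not shown here, so the helper below only illustrates how
# its outputs would typically be compiled, following the same pattern as the
# other trainers in this collection.  `optimizers` is assumed to be the
# module holding the sgd/rmsprop/adadelta factories above; the wrapper name
# is ours.
def _compile_action_model(tparams, options):
    cost, preds, _, inps, use_noise = build_model(options, tparams)
    grads = T.grad(cost, wrt=itemlist(tparams))
    lr = T.scalar(name='lr')
    f_grad_shared, f_update = getattr(optimizers, 'rmsprop')(lr, tparams,
                                                             grads, inps, cost)
    # predictions do not depend on y, hence on_unused_input='ignore'
    f_pred = theano.function(inps, preds, on_unused_input='ignore')
    use_noise.set_value(np.float32(1.))  # enable input noise / dropout
    return f_grad_shared, f_update, f_pred, use_noise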
def train(args, model_args, lrate): model_id = '/data/lisatmp4/anirudhg/minst_walk_back/walkback_' model_dir = create_log_dir(args, model_id) model_id2 = 'walkback_' model_dir2 = create_log_dir(args, model_id2) print model_dir logger = mimir.Logger(filename=model_dir2 + '/' + model_id2 + 'log.jsonl.gz', formatter=None) # TODO batches_per_epoch should not be hard coded lrate = args.lr import sys sys.setrecursionlimit(10000000) args, model_args = parse_args() #trng = RandomStreams(1234) if args.resume_file is not None: print "Resuming training from " + args.resume_file from blocks.scripts import continue_training continue_training(args.resume_file) ## load the training data if args.dataset == 'MNIST': print 'loading MNIST' from fuel.datasets import MNIST dataset_train = MNIST(['train'], sources=('features', )) dataset_test = MNIST(['test'], sources=('features', )) n_colors = 1 spatial_width = 28 elif args.dataset == 'CIFAR10': from fuel.datasets import CIFAR10 dataset_train = CIFAR10(['train'], sources=('features', )) dataset_test = CIFAR10(['test'], sources=('features', )) n_colors = 3 spatial_width = 32 elif args.dataset == 'Spiral': print 'loading SPIRAL' train_set = Spiral(num_examples=100000, classes=1, cycles=2., noise=0.01, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) else: raise ValueError("Unknown dataset %s." % args.dataset) model_options = locals().copy() train_stream = Flatten( DataStream.default_stream(dataset_train, iteration_scheme=ShuffledScheme( examples=dataset_train.num_examples, batch_size=args.batch_size))) shp = next(train_stream.get_epoch_iterator())[0].shape # make the training data 0 mean and variance 1 # TODO compute mean and variance on full dataset, not minibatch Xbatch = next(train_stream.get_epoch_iterator())[0] scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2)) shft = -np.mean(Xbatch * scl) # scale is applied before shift #train_stream = ScaleAndShift(train_stream, scl, shft) #test_stream = ScaleAndShift(test_stream, scl, shft) print 'Building model' params = init_params(model_options) if args.reload_ and os.path.exists(args.saveto_filename): print 'Reloading Parameters' print args.saveto_filename params = load_params(args.saveto_filename, params) tparams = init_tparams(params) ''' x = T.matrix('x', dtype='float32') f=transition_operator(tparams, model_options, x, 1) for data in train_stream.get_epoch_iterator(): print data[0] a = f(data[0]) print a ipdb.set_trace() ''' x, cost = build_model(tparams, model_options) inps = [x] x_Data = T.matrix('x_Data', dtype='float32') temperature = T.scalar('temperature', dtype='float32') forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature) print 'Building f_cost...', f_cost = theano.function(inps, cost) print 'Done' print tparams grads = T.grad(cost, wrt=itemlist(tparams)) get_grads = theano.function(inps, grads) for j in range(0, len(grads)): grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j]) # compile the optimizer, the actual computational graph is compiled here lr = T.scalar(name='lr') print 'Building optimizers...', optimizer = args.optimizer f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Buiding Sampler....' f_sample = sample(tparams, model_options) print 'Done' uidx = 0 estop = False bad_counter = 0 max_epochs = 4000 batch_index = 0 print 'Number of steps....' 
print args.num_steps print 'Done' count_sample = 1 for eidx in xrange(max_epochs): n_samples = 0 print 'Starting Next Epoch ', eidx for data in train_stream.get_epoch_iterator(): batch_index += 1 n_samples += len(data[0]) uidx += 1 if data[0] is None: print 'No data ' uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(data[0]) f_update(lrate) ud = time.time() - ud_start if batch_index % 1 == 0: print 'Cost is this', cost count_sample += 1 from impainting import change_image, inpainting train_temp = data[0] print data[0].shape change_image(train_temp.reshape(args.batch_size, 1, 28, 28), 3) train_temp = train_temp.reshape(args.batch_size, 784) output = inpainting(train_temp) change_image(output.reshape(args.batch_size, 1, 28, 28), 1) reverse_time( scl, shft, output, model_dir + '/' + 'impainting_orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index)) x_data = np.asarray(output).astype('float32') temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) temperature = args.temperature #* (args.temperature_factor ** (args.num_steps -1 )) orig_impainted_data = np.asarray(data[0]).astype('float32') for i in range(args.num_steps + args.extra_steps + 5): x_data, sampled, sampled_activation, sampled_preactivation = f_sample( x_data, temperature) print 'Impainting using temperature', i, temperature x_data = do_half_image(x_data, orig_impainted_data) reverse_time( scl, shft, x_data, model_dir + '/' + 'impainting_orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index) + 'step_' + str(i)) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature = temperature #temperature /= args.temperature_factor ipdb.set_trace()
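# ---------------------------------------------------------------------------
# Entry-point sketch (assumption): the inpainting trainer above takes the
# parsed command-line arguments plus a learning rate, and re-parses the
# arguments internally anyway, so a minimal driver only forwards them.  The
# wrapper name is ours.
def _walkback_main():
    args, model_args = parse_args()
    return train(args, model_args, args.lr)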
def train(dim_word=300,  # word vector dimensionality
          ctx_dim=300,  # context vector dimensionality
          semantic_dim=300,
          dim=1000,  # the number of LSTM units
          cnn_dim=4096,  # CNN feature dimension
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=True,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          cutoff=10,
          patience=5,
          max_epochs=30,
          dispFreq=500,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=1e-4,  # used only for SGD
          selector=False,  # selector (see paper)
          maxlen=30,  # maximum length of the description
          optimizer='rmsprop',
          pretrained='',
          batch_size=256,
          saveto='model',  # relative path of saved model file
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          embedding='../Data/GloVe/vocab_glove.pkl',
          cnn_type='vgg',
          prefix='../Data',  # path to find data
          dataset='coco',
          criterion='Bleu_4',
          switch_test_val=False,
          use_cnninit=True,
          use_dropout=True,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from the data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(
            path=osp.join(model_options['prefix'], model_options['dataset']),
            options=model_options, load_train=True, load_test=True)
    else:
        train, valid, worddict = load_data(
            path=osp.join(model_options['prefix'], model_options['dataset']),
            options=model_options, load_train=True, load_val=True)

    # Automatically calculate the update frequency
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)

    # Initialize the word embedding with GloVe
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(model_options['embedding']).astype('float32')

    # If a previous run with the same save path exists, reload it instead of
    # the pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        # Only pretrain the language model
        params = load_params(pretrained, params, False)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
        inps, alphas, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # Load evaluator to calculate the BLEU score
    evaluator = cocoEvaluation(model_options['dataset'])

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=None,
                                  allow_input_downcast=True)

    # Define the cost function + regularization
    cost = cost.mean()

    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([alpha_c * ((1. - alpha.sum(0)) ** 2).sum(0).mean()
                         for alpha in alphas])
        cost += alpha_reg

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # to get the cost after regularization or the gradients, use this
    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    # the optimizer name is resolved with eval(), so it must match one of the
    # optimizer builders defined in this file
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost)

    print 'Optimization'

    train_iter = HomogeneousData(train, batch_size=batch_size,
                                 maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log; reload history if it exists
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2],
                                                   worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', model_options['maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)
            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)
            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, \
                    'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(
                        f_init, f_next, ctx_s[jj], cnn_feats[jj],
                        model_options, trng=trng,
                        maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Evaluate on the validation set + checkpoint the model with the
            # best validation score
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on the validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1],
                                        valid[2], word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] + eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation score is saved separately
                # under a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation score'
                    params = copy.copy(best_p)
                    numpy.savez(saveto + '_bestll', history_bleu=history_bleu, **params)
                    bad_counter = 0

                # abort training if the validation score has not improved for too long
                if len(history_bleu) > model_options['patience'] and \
                        history_bleu[-1] <= max(history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' BLEU-4 score ', history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1),
                        history_bleu=history_bleu, **unzip(tparams))

    # use the best validation parameters for the final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)
        params = copy.copy(best_p)
        numpy.savez(saveto, zipped_params=best_p, history_bleu=history_bleu, **params)
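

# ---------------------------------------------------------------------------
# Note: a minimal sketch (not part of the original script) of how the string
# in model_options['optimizer'] could be resolved to an optimizer builder
# without eval(). It assumes the builders (e.g. sgd, rmsprop, adadelta) all
# share the (lr, tparams, grads, inp, cost) signature used above and live
# either in the current module or in a dict passed in by the caller.
def get_optimizer(name, namespace):
    """Return the optimizer builder called `name` from `namespace`
    (a module or a dict), raising a clear error for unknown names."""
    table = namespace if isinstance(namespace, dict) else vars(namespace)
    if name not in table:
        raise ValueError('Unknown optimizer: %s' % name)
    return table[name]

# hypothetical usage, mirroring the eval() call above:
#   builder = get_optimizer(model_options['optimizer'], globals())
#   f_grad_shared, f_update = builder(lr, tparams, grads, inps, cost)
# ---------------------------------------------------------------------------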
def train():
    model = 'theano'
    if model == 'theano':
        model_options = locals().copy()

        # online (trained) network: actor output m, value v, Q-value q
        params = init_params(model_options)
        tparams = init_tparams(params)
        x, u, m, v, q = build_model(tparams, model_options)

        reward = T.matrix('rewards')
        gamma_t = T.scalar('reward')
        cost = T.sqr(q - (reward + gamma_t * v))
        inps = [x, u, q, reward, gamma_t, v]
        inps_cost = [q, reward, gamma_t, v]

        _mu = theano.function([x], m)
        mu = lambda x: _mu(x)
        model = theano.function([x, u], q)

        # target network used to compute the bootstrapped value V(s')
        params_target = init_params_target(model_options)
        tparams_target = init_tparams(params_target)
        x, u, m, v, q = build_model_target(tparams_target, model_options)
        _V = theano.function([x], v)
        V = lambda x: _V(x)

        lr = T.scalar(name='lr')
        f_cost = theano.function(inps_cost, cost, on_unused_input='warn')
        grads = T.grad(cost.mean(), wrt=itemlist(tparams))
        optimizer = 'adam'
        f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)

        # initialize the target parameters as a copy of the online parameters
        for key, key2 in zip(tparams_target.items(), tparams.items()):
            a, b = key
            c, d = key2
            params_target[a] = params[c]

    # replay-style buffers; env, episodes, max_timesteps, gamma, tau and the
    # other hyper-parameters below are expected to come from the surrounding
    # script / configuration
    prestates = []
    actions = []
    rewards = []
    poststates = []
    terminals = []

    total_reward = 0
    lrate = optimizer_lr
    for i_episode in xrange(episodes):
        observation = env.reset()
        print "initial state:", observation, max_timesteps
        episode_reward = 0
        for t in xrange(max_timesteps):
            if display:
                env.render()

            # act with exploration noise added to the actor output
            x = np.array([observation])
            u = mu(x)
            if noise_decay == 'linear':
                noise = 1. / (i_episode + 1)
            elif noise_decay == 'exp':
                noise = 10 ** -i_episode
            elif noise_decay == 'fixed':
                noise = fixed_noise
            else:
                assert False
            action = u[0] + np.random.randn(num_actuators) * noise

            prestates.append(observation)
            actions.append(action)

            observation, reward, done, info = env.step(action)
            episode_reward += reward

            rewards.append(reward)
            poststates.append(observation)
            terminals.append(done)

            if len(prestates) > min_train:
                loss = 0
                for k in xrange(train_repeat):
                    if len(prestates) > batch_size:
                        indexes = np.random.choice(len(prestates), size=batch_size)
                    else:
                        indexes = range(len(prestates))

                    v = V(np.array(poststates)[indexes])
                    y = np.array(rewards)[indexes] + gamma * np.squeeze(v)
                    q = model(np.array(prestates)[indexes],
                              np.array(actions)[indexes])
                    out = f_cost(
                        q,
                        np.array(rewards)[indexes].reshape((y.shape[0], 1)).astype('float32'),
                        gamma,
                        np.squeeze(v).reshape((y.shape[0], 1)))
                    loss += out.mean()
                    f_grad_shared(
                        np.array(prestates)[indexes],
                        np.array(actions)[indexes],
                        q,
                        np.array(rewards)[indexes].reshape((y.shape[0], 1)).astype('float32'),
                        gamma,
                        np.squeeze(v).reshape((y.shape[0], 1)))
                    f_update(lrate)

                    # soft update of the target-network parameters
                    for key, key2 in zip(tparams_target.items(), tparams.items()):
                        a, b = key
                        c, d = key2
                        params_target[a] = params[c] * tau + params_target[a] * (1. - tau)

            if done:
                break

        episode_reward = episode_reward / float(t + 1)
        print "Episode {} finished after {} timesteps, average reward {}".format(
            i_episode + 1, t + 1, episode_reward)
        total_reward += episode_reward

    print "Average reward per episode {}".format(total_reward / episodes)
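

# ---------------------------------------------------------------------------
# Note: a small numpy sketch (not part of the original script) of the soft
# target-network update performed in the inner loop above:
#     params_target <- tau * params + (1 - tau) * params_target
# It assumes both arguments are dicts of numpy arrays indexed by matching
# parameter names (the script itself zips two differently keyed dicts).
import numpy as np

def soft_update(params, params_target, tau):
    """Blend the online parameters into the target copy with mixing rate tau."""
    for name in params_target:
        params_target[name] = tau * params[name] + (1. - tau) * params_target[name]
    return params_target

# hypothetical usage: soft_update(params, params_target, tau=1e-3)
# ---------------------------------------------------------------------------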
def train(args, model_args):
    model_id = '/data/lisatmp3/anirudhg/spiral_walk_back/walkback_'
    model_dir = create_log_dir(args, model_id)
    model_id2 = 'walkback_'
    model_dir2 = create_log_dir(args, model_id2)
    print model_dir, model_dir2
    logger = mimir.Logger(filename=model_dir2 + '/' + model_id2 + 'log.jsonl.gz',
                          formatter=None)

    # TODO batches_per_epoch should not be hard coded
    lrate = args.lr
    import sys
    sys.setrecursionlimit(10000000)
    args, model_args = parse_args()

    #trng = RandomStreams(1234)

    if args.resume_file is not None:
        print "Resuming training from " + args.resume_file
        from blocks.scripts import continue_training
        continue_training(args.resume_file)

    ## load the training data
    if args.dataset == 'MNIST':
        print 'loading MNIST'
        from fuel.datasets import MNIST
        dataset_train = MNIST(['train'], sources=('features', ))
        dataset_test = MNIST(['test'], sources=('features', ))
        n_colors = 1
        spatial_width = 28
    elif args.dataset == 'Spiral':
        print 'loading SPIRAL'
        train_set = Spiral(num_examples=20000, classes=1, cycles=1., noise=0.0,
                           sources=('features', ))
        dataset_train = DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size))
        iter_per_epoch = train_set.num_examples
        print iter_per_epoch
    elif args.dataset == 'Circle':
        print 'loading CIRCLE'
        train_set = Circle(num_examples=20000, classes=1, cycles=1., noise=0.0,
                           sources=('features', ))
        dataset_train = DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size))
        iter_per_epoch = train_set.num_examples
        print iter_per_epoch
    else:
        raise ValueError("Unknown dataset %s." % args.dataset)

    model_options = locals().copy()

    train_stream = dataset_train
    shp = next(train_stream.get_epoch_iterator())[0].shape

    # make the training data 0 mean and variance 1
    # TODO compute mean and variance on full dataset, not minibatch
    Xbatch = next(train_stream.get_epoch_iterator())[0]
    scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch)) ** 2))
    shft = -np.mean(Xbatch * scl)

    print 'Building model'
    params = init_params(model_options)
    if args.reload_ and os.path.exists(args.saveto_filename):
        print 'Reloading Parameters'
        print args.saveto_filename
        params = load_params(args.saveto_filename, params)
    tparams = init_tparams(params)

    x = T.matrix('x', dtype='float32')
    x, cost, cost_each_step = build_model(tparams, model_options)
    inps = [x]

    x_Data = T.matrix('x_Data', dtype='float32')
    temperature = T.scalar('temperature', dtype='float32')
    forward_diffusion, debug_function = one_step_diffusion(x_Data, model_options,
                                                           tparams, temperature)

    print 'Building f_cost...',
    f_cost = theano.function(inps, cost)
    f_cost_each_step = theano.function(inps, cost_each_step)
    print 'Done'

    grads = T.grad(cost, wrt=itemlist(tparams))
    get_grads = theano.function(inps, grads)
    # replace NaN gradients with zeros before they reach the optimizer
    for j in range(0, len(grads)):
        grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j])

    # compile the optimizer, the actual computational graph is compiled here
    lr = T.scalar(name='lr')
    print 'Building optimizers...',
    optimizer = args.optimizer
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Building Sampler....'
    f_sample = sample(tparams, model_options)
    print 'Done'

    uidx = 0
    estop = False
    bad_counter = 0
    max_epochs = 4000
    batch_index = 0
    print 'Number of steps....', args.num_steps
    print 'Done'
    count_sample = 1

    for eidx in xrange(max_epochs):
        if eidx % 20 == 0:
            params = unzip(tparams)
            save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz')
        if eidx == 30:
            ipdb.set_trace()
        n_samples = 0
        batch_index = 0
        print 'Starting Next Epoch ', eidx
        for data in train_stream.get_epoch_iterator():
            batch_index += 1
            n_samples += len(data[0])
            uidx += 1
            if data[0] is None:
                print 'No data '
                uidx -= 1
                continue

            plt.plot(*zip(*data[0]))
            plt.show()

            ud_start = time.time()
            cost = f_grad_shared(data[0])
            output_cost = f_cost_each_step(data[0])
            output_cost = np.asarray(output_cost)
            #cost_time_step_1, cost_time_step_2 = (output_cost[0, :].sum() * 1.0) / args.batch_size, (output_cost[1, :].sum() * 1.0) / args.batch_size
            print 'Loss is', cost
            f_update(lrate)
            ud = time.time() - ud_start

            if np.isnan(cost) or np.isinf(cost):
                print 'NaN detected'
                return 1.

            gradient_updates_ = get_grads(data[0])
            logger.log({
                'epoch': eidx,
                'batch_index': batch_index,
                'uidx': uidx,
                'training_error': cost
            })

            empty = []
            spiral_x = [empty for i in range(args.num_steps)]
            spiral_corrupted = []
            spiral_sampled = []
            grad_forward = []
            grad_back = []
            x_data_time = []
            x_tilt_time = []

            if eidx % 10 == 0:
                count_sample += 1
                temperature = args.temperature * (args.temperature_factor **
                                                  (args.num_steps - 1))
                temperature_forward = args.temperature

                # forward (corruption) chain
                for num_step in range(args.num_steps):
                    if num_step == 0:
                        x_data_time.append(data[0])
                        plot_images(data[0],
                                    model_dir + '/' + 'orig_' + 'epoch_' +
                                    str(count_sample) + '_batch_' + str(batch_index))
                        x_data, mu_data = forward_diffusion(data[0], temperature_forward)
                        plot_images(x_data,
                                    model_dir + '/' + 'corrupted_' + 'epoch_' +
                                    str(count_sample) + '_batch_' + str(batch_index) +
                                    '_time_step_' + str(num_step))
                        x_data_time.append(x_data)
                        temp_grad = np.concatenate((x_data_time[-2], x_data_time[-1]), axis=1)
                        grad_forward.append(temp_grad)
                        x_data = np.asarray(x_data).astype('float32').reshape(
                            args.batch_size, INPUT_SIZE)
                        spiral_corrupted.append(x_data)
                        mu_data = np.asarray(mu_data).astype('float32').reshape(
                            args.batch_size, INPUT_SIZE)
                        mu_data = mu_data.reshape(args.batch_size, 2)
                    else:
                        x_data_time.append(x_data)
                        x_data, mu_data = forward_diffusion(x_data, temperature_forward)
                        plot_images(x_data,
                                    model_dir + '/' + 'corrupted_' + 'epoch_' +
                                    str(count_sample) + '_batch_' + str(batch_index) +
                                    '_time_step_' + str(num_step))
                        x_data = np.asarray(x_data).astype('float32').reshape(
                            args.batch_size, INPUT_SIZE)
                        spiral_corrupted.append(x_data)
                        mu_data = np.asarray(mu_data).astype('float32').reshape(
                            args.batch_size, INPUT_SIZE)
                        mu_data = mu_data.reshape(args.batch_size, 2)
                        x_data_time.append(x_data)
                        temp_grad = np.concatenate((x_data_time[-2], x_data_time[-1]), axis=1)
                        grad_forward.append(temp_grad)
                    temperature_forward = temperature_forward * args.temperature_factor

                x_temp2 = data[0].reshape(args.batch_size, 2)
                plot_2D(spiral_corrupted, args.num_steps,
                        model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) +
                        '_batch_' + str(batch_index))
                plot_2D(x_temp2, 1,
                        model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) +
                        '_batch_index_' + str(batch_index))
                plot_grad(grad_forward,
                          model_dir + '/' + 'grad_forward_' + 'epoch_' + str(count_sample) +
                          '_batch_' + str(batch_index))

                # reverse (reconstruction) chain starting from the corrupted data
                for i in range(args.num_steps + args.extra_steps):
                    x_tilt_time.append(x_data)
                    x_data, sampled_mean = f_sample(x_data, temperature)
                    plot_images(x_data,
                                model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) +
                                '_batch_' + str(batch_index) + '_time_step_' + str(i))
                    x_tilt_time.append(x_data)
                    temp_grad = np.concatenate((x_tilt_time[-2], x_tilt_time[-1]), axis=1)
                    grad_back.append(temp_grad)
                    ###print 'Recons, On step number, using temperature', i, temperature
                    x_data = np.asarray(x_data).astype('float32')
                    x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                    if temperature != args.temperature:
                        temperature /= args.temperature_factor

                plot_grad(grad_back,
                          model_dir + '/' + 'grad_back_' + 'epoch_' + str(count_sample) +
                          '_batch_' + str(batch_index))
                np.savez('grad_backward.npz', grad_back)
                plot_2D(x_tilt_time, args.num_steps,
                        model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) +
                        '_batch_' + str(batch_index))

                # sampling chain starting from Gaussian noise
                s = np.random.normal(0, 1, [args.batch_size, 2])
                x_sampled = s
                temperature = args.temperature * (args.temperature_factor **
                                                  (args.num_steps - 1))
                x_data = np.asarray(x_sampled).astype('float32')
                for i in range(args.num_steps + args.extra_steps):
                    x_data, sampled_mean = f_sample(x_data, temperature)
                    spiral_sampled.append(x_data)
                    x_data = np.asarray(x_data).astype('float32')
                    x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                    if temperature != args.temperature:
                        temperature /= args.temperature_factor
                plot_2D(spiral_sampled, args.num_steps,
                        model_dir + '/' + 'inference_' + 'epoch_' + str(count_sample) +
                        '_batch_' + str(batch_index))

    ipdb.set_trace()
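

# ---------------------------------------------------------------------------
# Note: an illustrative sketch (not part of the original script) of the
# geometric temperature schedule used by the sampling loops above: the chain
# starts at temperature * temperature_factor ** (num_steps - 1) and divides by
# temperature_factor each step until it reaches the base temperature, which is
# then held for any extra steps.
def temperature_schedule(base_temperature, factor, num_steps, extra_steps=0):
    """Return the list of temperatures visited by the reverse/sampling chain."""
    temps = []
    t = base_temperature * factor ** (num_steps - 1)
    for _ in range(num_steps + extra_steps):
        temps.append(t)
        if t != base_temperature:
            t /= factor
    return temps

# hypothetical usage:
#   temperature_schedule(args.temperature, args.temperature_factor,
#                        args.num_steps, args.extra_steps)
# ---------------------------------------------------------------------------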
def train(args, model_args):
    #model_id = '/data/lisatmp4/lambalex/lsun_walkback/walkback_'
    model_id = '/data/lisatmp4/anirudhg/cifar_walk_back/walkback_'
    model_dir = create_log_dir(args, model_id)
    model_id2 = 'logs/walkback_'
    model_dir2 = create_log_dir(args, model_id2)
    print model_dir
    print model_dir2 + '/' + 'log.jsonl.gz'
    logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None)

    # TODO batches_per_epoch should not be hard coded
    lrate = args.lr
    import sys
    sys.setrecursionlimit(10000000)
    args, model_args = parse_args()

    #trng = RandomStreams(1234)

    if args.resume_file is not None:
        print "Resuming training from " + args.resume_file
        from blocks.scripts import continue_training
        continue_training(args.resume_file)

    ## load the training data
    if args.dataset == 'MNIST':
        print 'loading MNIST'
        from fuel.datasets import MNIST
        dataset_train = MNIST(['train'], sources=('features', ))
        dataset_test = MNIST(['test'], sources=('features', ))
        n_colors = 1
        spatial_width = 28
    elif args.dataset == 'CIFAR10':
        from fuel.datasets import CIFAR10
        dataset_train = CIFAR10(['train'], sources=('features', ))
        dataset_test = CIFAR10(['test'], sources=('features', ))
        n_colors = 3
        spatial_width = 32
    elif args.dataset == "lsun" or args.dataset == "lsunsmall":
        print "loading lsun class!"
        from load_lsun import load_lsun
        print "loading lsun data!"
        if args.dataset == "lsunsmall":
            dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True)
            spatial_width = 32
        else:
            dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False)
            spatial_width = 64
        n_colors = 3
    elif args.dataset == "celeba":
        print "loading celeba data"
        from fuel.datasets.celeba import CelebA
        dataset_train = CelebA(which_sets=['train'], which_format="64",
                               sources=('features', ), load_in_memory=False)
        dataset_test = CelebA(which_sets=['test'], which_format="64",
                              sources=('features', ), load_in_memory=False)
        spatial_width = 64
        n_colors = 3

        tr_scheme = SequentialScheme(examples=dataset_train.num_examples,
                                     batch_size=args.batch_size)
        ts_scheme = SequentialScheme(examples=dataset_test.num_examples,
                                     batch_size=args.batch_size)
        train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme)
        test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme)
        dataset_train = train_stream
        dataset_test = test_stream
        #epoch_it = train_stream.get_epoch_iterator()
    elif args.dataset == 'Spiral':
        print 'loading SPIRAL'
        train_set = Spiral(num_examples=100000, classes=1, cycles=2., noise=0.01,
                           sources=('features', ))
        dataset_train = DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size))
    else:
        raise ValueError("Unknown dataset %s." % args.dataset)

    model_options = locals().copy()

    if args.dataset != 'lsun' and args.dataset != 'celeba':
        train_stream = Flatten(
            DataStream.default_stream(
                dataset_train,
                iteration_scheme=ShuffledScheme(
                    examples=dataset_train.num_examples -
                    (dataset_train.num_examples % args.batch_size),
                    batch_size=args.batch_size)))
    else:
        train_stream = dataset_train
        test_stream = dataset_test

    print "Width", WIDTH, spatial_width

    shp = next(train_stream.get_epoch_iterator())[0].shape
    print "got epoch iterator"

    # make the training data 0 mean and variance 1
    # TODO compute mean and variance on full dataset, not minibatch
    Xbatch = next(train_stream.get_epoch_iterator())[0]
    scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch)) ** 2))
    shft = -np.mean(Xbatch * scl)

    print 'Building model'
    params = init_params(model_options)
    if args.reload_:
        print "Trying to reload parameters"
        if os.path.exists(args.saveto_filename):
            print 'Reloading Parameters'
            print args.saveto_filename
            params = load_params(args.saveto_filename, params)
    tparams = init_tparams(params)
    print tparams

    x, cost, start_temperature, step_chain = build_model(tparams, model_options)
    inps = [x.astype('float32'), start_temperature, step_chain]

    x_Data = T.matrix('x_Data', dtype='float32')
    temperature = T.scalar('temperature', dtype='float32')
    step_chain_part = T.scalar('step_chain_part', dtype='int32')
    forward_diffusion = one_step_diffusion(x_Data, model_options, tparams,
                                           temperature, step_chain_part)
    print tparams

    grads = T.grad(cost, wrt=itemlist(tparams))
    #get_grads = theano.function(inps, grads)
    # replace NaN gradients with zeros before they reach the optimizer
    for j in range(0, len(grads)):
        grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j])

    # compile the optimizer, the actual computational graph is compiled here
    lr = T.scalar(name='lr')
    print 'Building optimizers...',
    optimizer = args.optimizer
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    #for param in tparams:
    #    print param
    #    print tparams[param].get_value().shape

    print 'Building Sampler....'
    f_sample = sample(tparams, model_options)
    print 'Done'

    uidx = 0
    estop = False
    bad_counter = 0
    max_epochs = 4000
    batch_index = 1
    print 'Number of steps....'
    print args.num_steps
    print "Number of metasteps...."
    print args.meta_steps
    print 'Done'
    count_sample = 1
    save_data = []

    for eidx in xrange(1):
        print 'Starting Next Epoch ', eidx
        for data_ in range(500):  #train_stream.get_epoch_iterator():
            if args.noise == "gaussian":
                x_sampled = np.random.normal(
                    0.5, 2.0, size=(args.batch_size, INPUT_SIZE)).clip(0.0, 1.0)
            else:
                s = np.random.binomial(1, 0.5, INPUT_SIZE)

            temperature = args.temperature * (args.temperature_factor **
                                              (args.num_steps * args.meta_steps - 1))
            x_data = np.asarray(x_sampled).astype('float32')
            for i in range(args.num_steps * args.meta_steps + args.extra_steps):
                x_data, sampled, sampled_activation, sampled_preactivation = f_sample(
                    x_data.astype('float32'), temperature,
                    args.num_steps * args.meta_steps - i - 1)
                print 'On step number, using temperature', i, temperature
                #reverse_time(scl, shft, x_data, model_dir + '/batch_index_' + str(batch_index) + '_inference_' + 'epoch_' + str(count_sample) + '_step_' + str(i))
                x_data = np.asarray(x_data).astype('float32')
                x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                if temperature != args.temperature:
                    temperature /= args.temperature_factor

            count_sample = count_sample + 1
            save_data.append(x_data)
            fname = model_dir + '/batch_index_' + str(batch_index) + \
                '_inference_' + 'epoch_' + str(count_sample) + '_step_' + str(i)
            np.savez(fname + '.npz', x_data)

    save2_data = np.asarray(save_data).astype('float32')
    fname = model_dir + '/generated_images_50000'  #+ args.saveto_filename
    np.savez(fname + '.npz', save_data)
    ipdb.set_trace()
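

# ---------------------------------------------------------------------------
# Note: an illustrative sketch (not part of the original scripts) of the
# scl/shft standardisation computed from a single minibatch above: x * scl +
# shft has roughly zero mean and unit variance, and the inverse map is what a
# reverse/undo step would apply before saving or plotting samples. The helper
# names below are hypothetical.
import numpy as np

def standardise(x, scl, shft):
    """Map raw data into the roughly zero-mean, unit-variance training space."""
    return x * scl + shft

def unstandardise(x, scl, shft):
    """Invert standardise(), recovering the original data range."""
    return (x - shft) / scl

# hypothetical usage, mirroring the statistics computed in train():
#   Xbatch = next(train_stream.get_epoch_iterator())[0]
#   scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch)) ** 2))
#   shft = -np.mean(Xbatch * scl)
#   Xstd = standardise(Xbatch, scl, shft)
# ---------------------------------------------------------------------------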