def explore_MN(burnin_steps=2, test_steps=2):
    M_arr = []
    N_arr = []
    N = 100
    #N = 50
    for M in np.linspace(1, 1e6, 5):
    #for M in np.linspace(1, 1e3, 4):
        M_arr.append(int(M))
        N_arr.append(int(N))
    M = 1e6
    #M = 1e3
    for N in np.linspace(1, 200, 5):
    #for N in np.linspace(1, 50, 4):
        M_arr.append(int(M))
        N_arr.append(int(N))
    T_arr = []
    for ii in range(len(M_arr)):
        M = M_arr[ii]
        N = N_arr[ii]
        print "case %d of %d, M=%g, N=%g" % (ii + 1, len(M_arr), M, N)
        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df, model.theta_init, model.subfunction_references, display=1)
        # burn in the optimizer, to make sure the subspace has e.g. reached its full size
        optimizer.optimize(num_passes=burnin_steps)
        # time spent in the optimizer during burn-in
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)
        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)
        actual_test_steps = float(steps1 - steps0) / float(N)
        T_arr.append(t_diff / actual_test_steps)
        print T_arr[-1]
    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
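
# Usage sketch (an addition, not part of the original script): run the scaling
# experiment and plot the per-pass optimizer overhead against M and N. Assumes
# matplotlib.pyplot is imported as plt, as in the other examples in this file;
# the first five measurements vary M at fixed N=100, the last five vary N at
# fixed M=1e6.
if __name__ == '__main__':
    M_arr, N_arr, T_arr = explore_MN(burnin_steps=2, test_steps=2)
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(M_arr[:5], T_arr[:5], 'o-')
    plt.xlabel('number of parameters M')
    plt.ylabel('optimizer overhead per pass (s)')
    plt.subplot(1, 2, 2)
    plt.plot(N_arr[5:], T_arr[5:], 'o-')
    plt.xlabel('number of subfunctions N')
    plt.tight_layout()
    plt.show()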
def main(shape, spacing, origin, nbl, space_order, xs, xr, tn, f0, npasses, batch_size, **kwargs):
    # Get true model
    true_model = get_true_model(shape, spacing, origin, nbl, space_order)

    # Get smooth model
    smooth_model = get_smooth_model(shape, spacing, origin, nbl, space_order)

    # Compute initial Born perturbation from m - m0
    dm = (true_model.vp.data**(-2) - smooth_model.vp.data**(-2))

    # Geometry
    nsrc = xs.shape[0]
    nrec = xr.shape[0]
    geometry0 = set_geometry(smooth_model, nsrc, nrec, f0, tn, t0=0)

    # Compute observed data in parallel (inverse crime).
    # In real life we would read the SEG-Y data here.
    futures = []
    for i in range(geometry0.nsrc):
        args = [dm, i, smooth_model, geometry0, space_order]
        futures.append(forward_modeling.remote(*args))
    dobs = np.zeros((geometry0.nt * geometry0.nrec, geometry0.nsrc), dtype=np.float32)
    for i in range(geometry0.nsrc):
        dobs[:, i] = ray.get(futures[i])

    # List containing an identifying element for each subfunction
    sub_refs = set_subreferences(dobs, geometry0, batch_size)

    # Initial guess
    theta_init = np.zeros(smooth_model.shape, dtype=np.float32)

    # initialize the optimizer
    optimizer = SFO(f_df_multi_shots, theta_init, sub_refs, [geometry0, smooth_model, space_order])

    # run the optimizer for npasses passes through the data
    theta = optimizer.optimize(num_passes=npasses)

    # Write inverted reflectivity to disk
    file = open('output/dvel-final.bin', "wb")
    scopy = theta.reshape(smooth_model.shape).astype(np.float32).copy(order='C')
    file.write(scopy)
    file.close()

    # Create a plot with the minibatch function values
    plt.plot(np.array(optimizer.hist_f_flat))
    plt.xlabel('Iteration')
    plt.ylabel('Minibatch Function Value')
    plt.title('Convergence Trace')
    plt.savefig('output/history_sfo.png')
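
# Hedged sketch of the set_subreferences helper used above (an assumption --
# the real implementation lives elsewhere in this project and is not shown
# here): one plausible way to build SFO subfunction references is to group the
# shot indices into minibatches of batch_size and pair each group with its
# columns of the observed data.
def set_subreferences_sketch(dobs, geometry, batch_size):
    sub_refs = []
    for start in range(0, geometry.nsrc, batch_size):
        shot_ids = list(range(start, min(start + batch_size, geometry.nsrc)))
        # each reference carries the shot indices and the matching observed traces
        sub_refs.append((shot_ids, dobs[:, shot_ids]))
    return sub_refs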
sample_T = theano.function([muW0T, muW1T, muW2T, mub0T, mub1T, mub2T,
                            covW0T, covW1T, covW2T, covb0T, covb1T, covb2T],
                           samplesT, allow_input_downcast=True)


def sample(params):
    out = sample_T(params[0], params[1], params[2], params[3], params[4], params[5],
                   params[6], params[7], params[8], params[9], params[10], params[11])
    return out


# Creating the optimizer
optimizer = SFO(f_df, init_params, subfuncs)

# Running the optimization
init_loss = f_df(init_params, subfuncs[0])[0]
print init_loss

keyin = ''
while keyin != 'y':
    opt_params = optimizer.optimize(num_passes=24*4)
    end_loss = f_df(opt_params, subfuncs[0])[0]
    print 'Current loss: ', end_loss
    W = opt_params[0]
    pp.scatter(W[0, :], W[1, :])
    pp.show()
    keyin = raw_input('End optimization? (y)')

samples = sample(opt_params)
pp.scatter(samples[:, 0], samples[:, 1])
pp.show()
def optim_vae_sfo(model, x, v_init, w_init, n_batch, n_passes, hook,
                  n_resample=20, resample_keepmem=False, bernoulli_x=False, display=0):

    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # create minibatches
    n_tot = x.itervalues().next().shape[1]
    minibatches = []
    n_minibatches = n_tot / n_batch
    if (n_tot % n_batch) != 0:
        raise Exception()

    # Divide into minibatches
    def make_minibatch(i):
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x:
            _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    for i in range(n_minibatches):
        minibatches.append(make_minibatch(i))

    L = [0.]
    n_L = [0]

    def f_df(w, minibatch):
        i_minibatch = minibatch[0]
        x_minibatch = minibatch[1]
        eps_minibatch = minibatch[2]
        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch, eps_minibatch)
        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + float(n_batch) / n_tot * gv_prior[i] for i in gv}
        gw = {i: gw[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw}
        f = (logpx.sum() + logpz.sum() - logqz.sum())
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1
        f += float(n_batch) / n_tot * logpv
        f += float(n_batch) / n_tot * logpw
        for i in gv:
            gv[i] *= -1. / n_batch
        for i in gw:
            gw[i] *= -1. / n_batch
        f *= -1. / n_batch
        #print 'norms gv:'
        #ndict.pNorm(gv)
        #print 'norms gw'
        #ndict.pNorm(gw)
        return f, {'v': gv, 'w': gw}

    w_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)
    #optimizer.check_grad()

    # loop
    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i % n_resample == j % n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem, minibatches[j])

    print "Finished!"
def train(self, images, batch_size=50, num_epochs=20, method='SGD',
          train_means=False, train_top_layer=False, momentum=0.9,
          learning_rate=1., decay1=0.9, decay2=0.999, precondition=True):
    """
    @type  images: C{ndarray}/C{list}
    @param images: an array or a list of images
    """

    print 'Preprocessing...'

    inputs, outputs = self._preprocess(images)

    if precondition:
        print 'Preconditioning...'
        # remove correlations
        inputs, outputs = self._precondition(inputs, outputs)

    # indicates which layers will be trained
    train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

    print 'Creating SLSTMs...'

    # create SLSTMs
    for l in range(self.num_layers):
        self.slstm[l] = SLSTM(
            num_rows=inputs.shape[1],
            num_cols=inputs.shape[2],
            num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
            num_hiddens=self.num_hiddens,
            batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
            nonlinearity=self.nonlinearity,
            extended=self.extended,
            slstm=self.slstm[l],
            verbosity=self.verbosity)

    # compute loss function and its gradient
    def f_df(params, idx):
        # set model parameters
        for l in train_layers:
            self.slstm[l].set_parameters(params['slstm'][l])
        self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means})

        # select batch and compute hidden activations
        Y = outputs[idx:idx + batch_size]
        H = inputs[idx:idx + batch_size]

        for l in range(self.num_layers):
            H = self.slstm[l].forward(H)

        # form inputs to MCGSM
        H_flat = H.reshape(-1, self.num_hiddens).T
        Y_flat = Y.reshape(-1, self.num_channels).T

        norm_const = -H_flat.shape[1]

        # compute gradients
        df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*H.shape) / norm_const

        # ignore bottom-right pixel (BSDS300)
        df_dh[:, -1, -1] = 0.

        # average negative log-likelihood
        f = sum(loglik) / norm_const

        df_dtheta = {}
        df_dtheta['slstm'] = [0.] * self.num_layers

        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            if l > min(train_layers):
                # derivative with respect to inputs of layer l are derivatives
                # of hidden states of layer l - 1
                df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh, force_backward=True)
                df_dh = df_dtheta['slstm'][l]['inputs']
                del df_dtheta['slstm'][l]['inputs']
            else:
                # no need to compute derivatives with respect to input units
                df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

        # compute gradient of MCGSM
        df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(
            H_flat, Y_flat,
            parameters={'train_means': train_means}) * log(2.) * self.mcgsm.dim_out

        return f, df_dtheta

    # collect current parameters
    params = {}
    params['slstm'] = [0.] * self.num_layers
    for l in range(self.num_layers)[::-1]:
        if l not in train_layers:
            break
        params['slstm'][l] = self.slstm[l].parameters()
    params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

    # a start index for each batch
    start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

    print 'Training...'

    if method.upper() == 'SFO':
        try:
            # optimize using sum-of-functions optimizer
            optimizer = SFO(f_df, params, start_indices, display=self.verbosity)
            params_opt = optimizer.optimize(num_passes=num_epochs)

            # set model parameters
            for l in range(self.num_layers):
                self.slstm[l].set_parameters(params_opt['slstm'][l])
            self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means})
        except KeyboardInterrupt:
            pass

        return optimizer.hist_f_flat

    elif method.upper() == 'SGD':
        loss = []
        diff = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}

        for l in train_layers:
            diff['slstm'][l] = {}
            for key in params['slstm'][l]:
                diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

        for n in range(num_epochs):
            for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # update SLSTM parameters
                for l in train_layers:
                    for key in params['slstm'][l]:
                        diff['slstm'][l][key] = momentum * diff['slstm'][l][key] - df['slstm'][l][key]
                        params['slstm'][l][key] = params['slstm'][l][key] + learning_rate * diff['slstm'][l][key]

                # update MCGSM parameters
                diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                params['mcgsm'] = params['mcgsm'] + learning_rate * diff['mcgsm']

                if self.verbosity > 0:
                    print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):]))

        return loss

    elif method.upper() == 'ADAM':
        loss = []
        diff_mean = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}
        diff_sqrd = {
            'slstm': [0.] * self.num_layers,
            'mcgsm': zeros_like(params['mcgsm'])}

        for l in train_layers:
            diff_mean['slstm'][l] = {}
            diff_sqrd['slstm'][l] = {}
            for key in params['slstm'][l]:
                diff_mean['slstm'][l][key] = zeros_like(params['slstm'][l][key])
                diff_sqrd['slstm'][l][key] = zeros_like(params['slstm'][l][key])

        # step counter
        t = 1

        for n in range(num_epochs):
            for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
                # compute gradients
                f, df = f_df(params, b)

                loss.append(f)

                # include bias correction in step width
                step_width = learning_rate / (1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
                t += 1

                # update SLSTM parameters
                for l in train_layers:
                    for key in params['slstm'][l]:
                        diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
                            + (1. - decay1) * df['slstm'][l][key]
                        diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
                            + (1. - decay2) * square(df['slstm'][l][key])
                        params['slstm'][l][key] = params['slstm'][l][key] - \
                            step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

                # update MCGSM parameters
                diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (1. - decay1) * df['mcgsm']
                diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (1. - decay2) * square(df['mcgsm'])
                params['mcgsm'] = params['mcgsm'] - \
                    step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                if self.verbosity > 0:
                    print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                        n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):]))

        return loss

    else:
        raise ValueError('Unknown method \'{0}\'.'.format(method))
M = 20      # number visible units
J = 10      # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.)  # number minibatches

# generate random training data
v = randn(M, D)

# create the array of subfunction specific arguments
sub_refs = []
for i in range(N):
    # extract a single minibatch of training data.
    sub_refs.append(v[:, i::N])

# initialize parameters
theta_init = {'W': randn(J, M), 'b_h': randn(J, 1), 'b_v': randn(M, 1)}

# initialize the optimizer
optimizer = SFO(f_df, theta_init, sub_refs)

# # uncomment the following line to test the gradient of f_df
# optimizer.check_grad()

# run the optimizer for 1 pass through the data
theta = optimizer.optimize(num_passes=1)

# continue running the optimizer for another 20 passes through the data
theta = optimizer.optimize(num_passes=20)

# plot the convergence trace
plt.plot(np.array(optimizer.hist_f_flat))
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()
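
# Hedged placeholder for the f_df referenced above (an assumption: the original
# demo defines an RBM objective elsewhere, which is not shown here). This
# stand-in uses a linear-autoencoder reconstruction loss with the same
# parameter shapes so the script can run end to end; it would need to appear
# before the SFO(...) call above, and optimizer.check_grad() can verify it.
def f_df(theta, v_minibatch):
    W, b_h, b_v = theta['W'], theta['b_h'], theta['b_v']
    D_mb = v_minibatch.shape[1]
    H = np.dot(W, v_minibatch) + b_h            # hidden activations, J x D_mb
    R = np.dot(W.T, H) + b_v                    # linear reconstruction, M x D_mb
    G = (R - v_minibatch) / D_mb                # residual, scaled by minibatch size
    f = 0.5 * np.sum(G * (R - v_minibatch))     # mean squared reconstruction error
    dW = np.dot(H, G.T) + np.dot(np.dot(W, G), v_minibatch.T)
    db_h = np.dot(W, G).sum(axis=1, keepdims=True)
    db_v = G.sum(axis=1, keepdims=True)
    return f, {'W': dW, 'b_h': db_h, 'b_v': db_v}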
#pp.hist(np.sqrt(np.sum(samples[-1]**2, axis=1)), 50, normed=True, color='r')

#pp.figure(8)
#pp.suptitle(r'Learned $\beta$ Schedule')
#pp.axes(xlabel='t', ylabel=r'$\beta$')
#pp.plot(np.arange(nsteps), (1.0/(1.0+np.exp(-opt_params[-1])))*beta_max)

pp.show()
exit()

if automate_training:
    optimizer = SFO(f_df, init_params, subfuncs)
    end_loss = 99.0
    while end_loss > -2.50:
        linalgerror = False
        try:
            opt_params = optimizer.optimize(num_passes=2)
            end_loss = f_df(opt_params, fdata)[0]
        except np.linalg.linalg.LinAlgError:
            linalgerror = True

        if np.isnan(end_loss) or linalgerror:
            mu_centers = (np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
            mu_spreads = (np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
            mu_biases = np.zeros(nhid_mu).astype(np.float32)
            mu_M = (np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
            mu_b = np.zeros((ntgates, nx)).astype(np.float32)
            cov_centers = (np.random.randn(nx, nhid_cov)*1.0).astype(np.float32)
            cov_spreads = (np.zeros((nx, nhid_cov))-1.0).astype(np.float32)
            cov_biases = np.zeros(nhid_cov).astype(np.float32)
            cov_M = (np.random.randn(nhid_cov, ntgates*nx)*0.01).astype(np.float32)
            cov_b = np.zeros(ntgates).astype(np.float32)
def fit(self, train_X, optimizer, param_init=None, sample_every=None):
    self.opt = optimizer
    n_train, n_vis = train_X.shape
    batch_size = self.batch_size

    if sample_every is None:
        sample_every = 10000000

    #theano.config.profile = True
    #theano.config.exception_verbosity = 'high'

    assert(n_vis == self.nv)

    train_X = self.shared_dataset(train_X)
    n_batches = np.ceil(n_train / float(batch_size)).astype('int')

    # theano variables for managing data (index minibatches, n examples in batch)
    index, n_ex = T.iscalars('batch_index', 'n_ex')
    batch_start = index * batch_size
    batch_stop = T.minimum(n_ex, (index + 1) * batch_size)
    effective_batch_size = batch_stop - batch_start

    # theano variables for learning
    lr = T.scalar('lr', dtype=theano.config.floatX)
    mom = T.scalar('mom', dtype=theano.config.floatX)

    if self.k == 1:
        # this one is for scanning over a batch and getting connectivity for each example
        # return grads too because T.grad through scan is awful
        # takes ~3x longer, but can experiment with connectivity
        #K, grads = self.mpf.rbm_K2G(self.X, effective_batch_size)
        # this tiles out the minibatch matrix into a 3D tensor to compute connectivity
        #K, offs, y, y1, z = self.mpf.rbm_K(self.X, effective_batch_size)
        K = self.mpf.rbm_K(self.X, effective_batch_size)
    elif self.k == 2:
        if DEBUG:
            return_values = self.mpf.debug_rbm_K_2wise(self.X, effective_batch_size)
            K = return_values[-1]
        else:
            K = self.mpf.rbm_K_2wise(self.X, effective_batch_size)
    else:
        raise NotImplementedError()

    reg = self.L1_reg * self.mpf.L1 + self.L2_reg * self.mpf.L2
    reg_grad = T.grad(reg, self.mpf.theta)

    # if not scan (tile out matrix into tensor)
    cost = K + reg
    grads = T.grad(cost, self.mpf.theta)
    # otherwise
    #grads = grads + reg_grad

    if param_init is None:
        self.mpf.theta.set_value(random_theta(D, DH, k=self.k))
    else:
        self.mpf.theta.set_value(np.asarray(np.concatenate(param_init), dtype=theano.config.floatX))

    if optimizer == 'sgd':
        updates = []
        theta = self.mpf.theta
        theta_update = self.mpf.theta_update

        upd = mom * theta_update - lr * grads
        updates.append((theta_update, upd))
        updates.append((theta, theta + upd))

        print 'compiling theano function'
        if DEBUG:
            return_values = list(return_values)
            return_values.append(cost)
            return_values.append(grads)
            train_model = theano.function(inputs=[index, n_ex, lr, mom],
                                          outputs=return_values, updates=updates,
                                          givens={self.X: train_X[batch_start:batch_stop]})
        else:
            train_model = theano.function(inputs=[index, n_ex, lr, mom],
                                          outputs=cost, updates=updates,
                                          givens={self.X: train_X[batch_start:batch_stop]})

        self.current_epoch = 0
        start = time.time()
        learning_rate_init = self.learning_rate
        while self.current_epoch < self.n_epochs:
            print 'epoch:', self.current_epoch
            self.current_epoch += 1
            effective_mom = self.final_momentum if self.current_epoch > self.momentum_switchover else self.initial_momentum

            avg_epoch_cost = 0
            last_debug = None
            for minibatch_idx in xrange(n_batches):
                avg_cost = train_model(minibatch_idx, n_train, self.learning_rate, effective_mom)
                #print '\t\t', np.isnan(gr).sum(), np.isnan(yy).sum(), np.isnan(yy1).sum(), np.isnan(zz).sum()
                if DEBUG:
                    return_values, avg_cost, gradients = avg_cost[:-2], avg_cost[-2], avg_cost[-1]
                    print_debug(return_values, last_debug)
                    last_debug = return_values
                avg_epoch_cost += avg_cost
                #print '\t', minibatch_idx, avg_cost
            print '\t avg epoch cost:', avg_epoch_cost / n_batches

            self.learning_rate *= self.learning_rate_decay

            theta_fit = split_theta(self.mpf.theta.get_value(), self.mpf.n_visible, self.mpf.n_hidden, k=self.mpf.k)
            if (self.current_epoch % sample_every == 0):
                sample_and_save(theta_fit, self.mpf.n_hidden, self.current_epoch, learning_rate_init, self.mpf.k, self.opt)

        theta_opt = self.mpf.theta.get_value()
        end = time.time()

    elif optimizer == 'cg' or optimizer == 'bfgs':
        print "compiling theano functions"
        get_batch_size = theano.function([index, n_ex], effective_batch_size, name='get_batch_size')
        batch_cost_grads = theano.function([index, n_ex], [cost, grads],
                                           givens={self.X: train_X[batch_start:batch_stop, :]},
                                           name='batch_cost_grads')
        batch_cost = theano.function([index, n_ex], cost,
                                     givens={self.X: train_X[batch_start:batch_stop, :]},
                                     name='batch_cost')
        batch_grads = theano.function([index, n_ex], grads,
                                      givens={self.X: train_X[batch_start:batch_stop, :]},
                                      name='batch_grads')

        def train_fn_cost_grads(theta_value):
            print 'nbatches', n_batches
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_losses_grads = [batch_cost_grads(i, n_train) for i in xrange(n_batches)]
            train_losses = [i[0] for i in train_losses_grads]
            train_grads = [i[1] for i in train_losses_grads]
            train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]
            print len(train_losses), len(train_grads)
            print train_losses[0].shape, train_grads[0].shape
            returns = np.average(train_losses, weights=train_batch_sizes), np.average(train_grads, weights=train_batch_sizes, axis=0)
            return returns

        def train_fn_cost(theta_value):
            print 'nbatches', n_batches
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_costs = [batch_cost(i, n_train) for i in xrange(n_batches)]
            train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]
            return np.average(train_costs, weights=train_batch_sizes)

        def train_fn_grads(theta_value):
            print 'nbatches', n_batches
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_grads = [batch_grads(i, n_train) for i in xrange(n_batches)]
            train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]
            return np.average(train_grads, weights=train_batch_sizes, axis=0)

        ###############
        # TRAIN MODEL #
        ###############

        def my_callback():
            print 'wtf'

        from scipy.optimize import minimize
        from scipy.optimize import fmin_bfgs, fmin_l_bfgs_b

        if optimizer == 'cg':
            pass
        elif optimizer == 'bfgs':
            print 'using bfgs'
            #theta_opt, f_theta_opt, info = fmin_l_bfgs_b(train_fn, self.mpf.theta.get_value(), iprint=1, maxfun=self.n_epochs)
            start = time.time()
            disp = True
            print 'ready to minimize'
            #result_obj = minimize(train_fn, self.mpf.theta.get_value(), jac=True, method='BFGS', options={'maxiter': self.n_epochs, 'disp': disp}, callback=my_callback())
            #theta_opt = fmin_bfgs(f=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
            theta_opt, fff, ddd = fmin_l_bfgs_b(func=train_fn_cost, x0=self.mpf.theta.get_value(),
                                                fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
            print 'done minimizing'
            end = time.time()

    elif optimizer == 'sof':
        print "compiling theano functions"
        batch_cost_grads = theano.function([index, n_ex], [cost, grads],
                                           givens={self.X: train_X[batch_start:batch_stop, :]},
                                           name='batch_cost_grads')
        batch_cost = theano.function([index, n_ex], cost,
                                     givens={self.X: train_X[batch_start:batch_stop, :]},
                                     name='batch_cost')
        batch_grads = theano.function([index, n_ex], grads,
                                      givens={self.X: train_X[batch_start:batch_stop, :]},
                                      name='batch_grads')

        def train_fn(theta_value, i):
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_losses, train_grads = batch_cost_grads(i, n_train)
            return train_losses, train_grads

        ###############
        # TRAIN MODEL #
        ###############

        if param_init is None:
            self.mpf.theta.set_value(random_theta(D, DH))
        else:
            w0, bh0, bv0 = param_init
            self.mpf.theta.set_value(np.asarray(np.concatenate((w0, bh0, bv0)), dtype=theano.config.floatX))

        print 'using sof'
        sys.path.append('/export/mlrg/ebuchman/Programming/Sum-of-Functions-Optimizer')
        from sfo import SFO
        print 'n batches', n_batches
        print 'n epochs', self.n_epochs
        optimizer = SFO(train_fn, self.mpf.theta.get_value(), np.arange(n_batches))
        start = time.time()
        theta_opt = optimizer.optimize(num_passes=self.n_epochs)
        end = time.time()
        self.mpf.theta.set_value(theta_opt.astype(theano.config.floatX), borrow=True)

    return end - start
samplesT, allow_input_downcast=True)


def sample(params):
    out = sample_T(params[0], params[1], params[2], params[3], params[4], params[5],
                   params[6], params[7], params[8], params[9])
    return out


if automate_training:
    optimizer = SFO(f_df, init_params, subfuncs)
    end_loss = 99.0
    while end_loss > -2.50:
        linalgerror = False
        try:
            opt_params = optimizer.optimize(num_passes=2)
            end_loss = f_df(opt_params, fdata)[0]
        except np.linalg.linalg.LinAlgError:
            linalgerror = True

        if np.isnan(end_loss) or linalgerror:
            mu_centers = (np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
            mu_spreads = (np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
            mu_biases = np.zeros(nhid_mu).astype(np.float32)
            mu_M = (np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
            mu_b = np.zeros((ntgates, nx)).astype(np.float32)
            cov_centers = (np.random.randn(nx, nhid_cov)*1.0).astype(np.float32)
            cov_spreads = (np.zeros((nx, nhid_cov))-1.0).astype(np.float32)
            cov_biases = np.zeros(nhid_cov).astype(np.float32)
            cov_M = (np.random.randn(nhid_cov, ntgates*nx)*0.01).astype(np.float32)
            cov_b = np.zeros(ntgates).astype(np.float32)
samplesT, allow_input_downcast=True)


def sample(params):
    out = sample_T(params[0], params[1], params[2], params[3], params[4], params[5],
                   params[6], params[7], params[8], params[9], params[10], params[11],
                   params[12], params[13])
    return out


if automate_training:
    optimizer = SFO(f_df, init_params, subfuncs)
    end_loss = 99.0
    while end_loss > -2.50:
        linalgerror = False
        try:
            opt_params = optimizer.optimize(num_passes=2)
            end_loss = f_df(opt_params, fdata)[0]
        except np.linalg.linalg.LinAlgError:
            linalgerror = True

        if np.isnan(end_loss) or linalgerror:
            mu_centers = (np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
            mu_spreads = (np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
            mu_biases = np.zeros(nhid_mu).astype(np.float32)
            mu_M = (np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
            mu_b = np.zeros((ntgates, nx)).astype(np.float32)
            cov_centers = (np.random.randn(nx, nhid_cov)*1.0).astype(np.float32)
            cov_spreads = (np.zeros((nx, nhid_cov))-1.0).astype(np.float32)
            cov_biases = np.zeros(nhid_cov).astype(np.float32)
            cov_M = (np.random.randn(nhid_cov, ntgates*nx)*0.01).astype(np.float32)
            cov_b = np.zeros(ntgates).astype(np.float32)
def train(
    self,
    images,
    batch_size=50,
    num_epochs=20,
    method="SGD",
    train_means=False,
    train_top_layer=False,
    momentum=0.9,
    learning_rate=1.0,
    decay1=0.9,
    decay2=0.999,
    precondition=True,
):
    """
    Train model via stochastic gradient descent (SGD) or sum-of-functions optimizer (SFO).

    @type  images: C{ndarray}/C{list}
    @param images: an array or a list of training images (e.g., Nx32x32x3)

    @type  batch_size: C{int}
    @param batch_size: batch size used by SGD

    @type  num_epochs: C{int}
    @param num_epochs: number of passes through the training set

    @type  method: C{str}
    @param method: either 'SGD', 'SFO', or 'ADAM'

    @type  train_means: C{bool}
    @param train_means: whether or not to optimize the mean parameters of the MCGSM

    @type  train_top_layer: C{bool}
    @param train_top_layer: if true, only the MCGSM and the spatial LSTM at the top layer are trained

    @type  momentum: C{float}
    @param momentum: momentum rate used by SGD

    @type  learning_rate: C{float}
    @param learning_rate: learning rate used by SGD

    @type  decay1: C{float}
    @param decay1: hyperparameter used by ADAM

    @type  decay2: C{float}
    @param decay2: hyperparameter used by ADAM

    @type  precondition: C{bool}
    @param precondition: whether or not to perform conditional whitening

    @rtype: C{list}
    @return: evolution of negative log-likelihood (bits per pixel) over the training
    """

    if images.shape[1] < self.input_mask.shape[0] or images.shape[2] < self.input_mask.shape[1]:
        raise ValueError("Images too small.")

    if self.verbosity > 0:
        print "Preprocessing..."

    inputs, outputs = self._preprocess(images)

    if precondition:
        if self.verbosity > 0:
            print "Preconditioning..."

        # remove correlations
        inputs, outputs = self._precondition(inputs, outputs)

    # indicates which layers will be trained
    train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

    if self.verbosity > 0:
        print "Creating SLSTMs..."

    # create SLSTMs
    for l in range(self.num_layers):
        self.slstm[l] = SLSTM(
            num_rows=inputs.shape[1],
            num_cols=inputs.shape[2],
            num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
            num_hiddens=self.num_hiddens,
            batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
            nonlinearity=self.nonlinearity,
            extended=self.extended,
            slstm=self.slstm[l],
            verbosity=self.verbosity,
        )

    # compute loss function and its gradient
    def f_df(params, idx):
        # set model parameters
        for l in train_layers:
            self.slstm[l].set_parameters(params["slstm"][l])
        self.mcgsm._set_parameters(params["mcgsm"], {"train_means": train_means})

        # select batch and compute hidden activations
        Y = outputs[idx:idx + batch_size]
        H = inputs[idx:idx + batch_size]

        for l in range(self.num_layers):
            H = self.slstm[l].forward(H)

        # form inputs to MCGSM
        H_flat = H.reshape(-1, self.num_hiddens).T
        Y_flat = Y.reshape(-1, self.num_channels).T

        norm_const = -H_flat.shape[1]

        # compute gradients
        df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*H.shape) / norm_const

        # average negative log-likelihood
        f = sum(loglik) / norm_const

        df_dtheta = {}
        df_dtheta["slstm"] = [0.0] * self.num_layers

        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            if l > min(train_layers):
                # derivatives with respect to the inputs of layer l are derivatives
                # of the hidden states of layer l - 1
                df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh, force_backward=True)
                df_dh = df_dtheta["slstm"][l]["inputs"]
                del df_dtheta["slstm"][l]["inputs"]
            else:
                # no need to compute derivatives with respect to input units
                df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh)

        # compute gradient of MCGSM
        df_dtheta["mcgsm"] = (
            self.mcgsm._parameter_gradient(H_flat, Y_flat, parameters={"train_means": train_means})
            * log(2.0) * self.mcgsm.dim_out
        )

        return f, df_dtheta

    # collect current parameters
    params = {}
    params["slstm"] = [0.0] * self.num_layers
    for l in range(self.num_layers)[::-1]:
        if l not in train_layers:
            break
        params["slstm"][l] = self.slstm[l].parameters()
    params["mcgsm"] = self.mcgsm._parameters({"train_means": train_means})

    # a start index for each batch
    start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

    if self.verbosity > 0:
        print "Training..."
if method.upper() == "SFO": try: # optimize using sum-of-functions optimizer optimizer = SFO(f_df, params, start_indices, display=self.verbosity) params_opt = optimizer.optimize(num_passes=num_epochs) # set model parameters for l in range(self.num_layers): self.slstm[l].set_parameters(params_opt["slstm"][l]) self.mcgsm._set_parameters(params_opt["mcgsm"], {"train_means": train_means}) except KeyboardInterrupt: pass return optimizer.hist_f_flat elif method.upper() == "SGD": loss = [] diff = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])} for l in train_layers: diff["slstm"][l] = {} for key in params["slstm"][l]: diff["slstm"][l][key] = zeros_like(params["slstm"][l][key]) for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f / log(2.0) / self.num_channels) # update SLSTM parameters for l in train_layers: for key in params["slstm"][l]: diff["slstm"][l][key] = momentum * diff["slstm"][l][key] - df["slstm"][l][key] params["slstm"][l][key] = params["slstm"][l][key] + learning_rate * diff["slstm"][l][key] # update MCGSM parameters diff["mcgsm"] = momentum * diff["mcgsm"] - df["mcgsm"] params["mcgsm"] = params["mcgsm"] + learning_rate * diff["mcgsm"] if self.verbosity > 0: print "{0:>5} {1:>10.4f} {2:>10.4f}".format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :]) ) return loss elif method.upper() == "ADAM": loss = [] diff_mean = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])} diff_sqrd = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])} for l in train_layers: diff_mean["slstm"][l] = {} diff_sqrd["slstm"][l] = {} for key in params["slstm"][l]: diff_mean["slstm"][l][key] = zeros_like(params["slstm"][l][key]) diff_sqrd["slstm"][l][key] = zeros_like(params["slstm"][l][key]) # step counter t = 1 for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f / log(2.0) / self.num_channels) # include bias correction in step width step_width = learning_rate / (1.0 - power(decay1, t)) * sqrt(1.0 - power(decay2, t)) t += 1 # update SLSTM parameters for l in train_layers: for key in params["slstm"][l]: diff_mean["slstm"][l][key] = ( decay1 * diff_mean["slstm"][l][key] + (1.0 - decay1) * df["slstm"][l][key] ) diff_sqrd["slstm"][l][key] = decay2 * diff_sqrd["slstm"][l][key] + (1.0 - decay2) * square( df["slstm"][l][key] ) params["slstm"][l][key] = params["slstm"][l][key] - step_width * diff_mean["slstm"][l][ key ] / (1e-8 + sqrt(diff_sqrd["slstm"][l][key])) # update MCGSM parameters diff_mean["mcgsm"] = decay1 * diff_mean["mcgsm"] + (1.0 - decay1) * df["mcgsm"] diff_sqrd["mcgsm"] = decay2 * diff_sqrd["mcgsm"] + (1.0 - decay2) * square(df["mcgsm"]) params["mcgsm"] = params["mcgsm"] - step_width * diff_mean["mcgsm"] / ( 1e-8 + sqrt(diff_sqrd["mcgsm"]) ) if self.verbosity > 0: print "{0:>5} {1:>10.4f} {2:>10.4f}".format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :]) ) return loss else: raise ValueError("Unknown method '{0}'.".format(method))
sample_T = theano.function([muW0T, muW1T, muW2T, mub0T, mub1T, mub2T,
                            covW0T, covW1T, covW2T, covb0T, covb1T, covb2T],
                           samplesT, allow_input_downcast=True)


def sample(params):
    out = sample_T(params[0], params[1], params[2], params[3], params[4], params[5],
                   params[6], params[7], params[8], params[9], params[10], params[11])
    return out


# Creating the optimizer
optimizer = SFO(f_df, init_params, subfuncs)

# Running the optimization
init_loss = f_df(init_params, subfuncs[0])[0]
print init_loss

keyin = ''
while keyin != 'y':
    opt_params = optimizer.optimize(num_passes=12)
    end_loss = f_df(opt_params, subfuncs[0])[0]
    print 'Current loss: ', end_loss
    W = opt_params[0]
    pp.scatter(W[0, :], W[1, :])
    pp.show()
    keyin = raw_input('End optimization? (y)')

samples = sample(opt_params)
pp.scatter(samples[:, 0], samples[:, 1])
pp.show()