def get_elbo(pred, targ, weights, logdets, weight, dataset_size,
             prior=log_normal, lbda=0, output_type='categorical'):
    """Negative ELBO, an upper bound on the NLL."""
    logqw = -logdets
    # Originally:
    #   logqw = -(0.5*(ep**2).sum(1) + 0.5*T.log(2*np.pi)*num_params + logdets)
    # the constants are neglected in this wrapper.
    logpw = prior(weights, 0., -T.log(lbda)).sum(1)
    # Normal prior centered at zero; lbda is the inverse of the variance.
    kl = (logqw - logpw).mean()
    if output_type == 'categorical':
        logpyx = -cc(pred, targ).mean()
    elif output_type == 'real':
        logpyx = -se(pred, targ).mean()  # assumes the output is a vector!
    else:
        assert False
    loss = -(logpyx - weight * kl / T.cast(dataset_size, floatX))
    return loss, [logpyx, logpw, logqw]
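# A minimal usage sketch for get_elbo, assuming `y`, `target_var`, `weights`,
# and `logdets` were built by a hypernet as in the training scripts below.
# Note that `lbda` must be positive, since the prior receives -T.log(lbda)
# as its log-variance.
def _elbo_usage_sketch(y, target_var, weights, logdets, dataset_size):
    loss, monitored = get_elbo(T.clip(y, 0.001, 0.999),  # clip for stability
                               target_var, weights, logdets,
                               weight=1.0, dataset_size=dataset_size,
                               lbda=np.float32(1.0))
    logpyx, logpw, logqw = monitored
    return loss, logpyx, logpw, logqw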
def __init__(self, softmax=softmax):
    self.inpv = T.matrix('inpv')
    self.outv = T.imatrix('outv')  # indices
    self.ep = T.matrix('ep')
    self.w = T.scalar('w')
    self.n = self.inpv.shape[0]

    self.enc_m = get_encoder()
    self.enc_s = get_encoder()
    self.dec = get_decoder()

    self.mu = get_output(self.enc_m, self.inpv)
    self.log_s = get_output(self.enc_s, self.inpv)
    self.log_v = 2 * self.log_s
    self.sigma = T.exp(self.log_s)
    self.var = T.exp(self.log_s * 2)
    self.z = self.mu + self.sigma * self.ep

    self.rec_linear = get_output(self.dec, self.z)
    self.rec_reshaped_ln = self.rec_linear.reshape((self.n * d2, 256))
    self.rec_reshaped = softmax(self.rec_reshaped_ln)
    self.out_onehot = T.extra_ops.to_one_hot(
        self.outv.reshape((self.n * d2, )), 256)

    # categorical cross-entropy over a 256-way softmax per output dimension
    self.rec_losses_reshaped = cc(self.rec_reshaped, self.out_onehot)
    self.rec_losses = self.rec_losses_reshaped.reshape((self.n, d2)).sum(1)
    # closed-form KL(N(mu, var) || N(0, 1)), per dimension
    self.klss = -0.5 * (1 + self.log_v) + 0.5 * (self.mu ** 2 + self.var)
    self.kls = self.klss.sum(1)
    self.rec_loss = self.rec_losses.mean()
    self.kl = self.kls.mean()
    self.loss = self.rec_loss + self.kl * self.w

    self.params = get_all_params(self.enc_m) + \
                  get_all_params(self.enc_s) + \
                  get_all_params(self.dec)
    self.updates = lasagne.updates.adam(self.loss, self.params, lr)

    print '\tgetting train func'
    self.train_func = theano.function(
        [self.inpv, self.outv, self.ep, self.w],
        [self.loss.mean(), self.rec_loss.mean(), self.kl.mean()],
        updates=self.updates)

    print '\tgetting other useful funcs'
    self.recon = theano.function(
        [self.inpv, self.ep],
        self.rec_reshaped.argmax(1).reshape((self.n, d2)))
    self.recon_ = theano.function(
        [self.inpv, self.ep],
        self.rec_reshaped.reshape((self.n, d2, 256)))
    self.project = theano.function([self.inpv, self.ep], self.z)
    self.get_mu = theano.function([self.inpv], self.mu)
    self.get_var = theano.function([self.inpv], self.var)
    self.get_klss = theano.function([self.inpv], self.klss)
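# Sanity check (numpy-only, illustrative) of the closed-form KL used above:
# KL(N(mu, var) || N(0, 1)) = -0.5*(1 + log var) + 0.5*(mu^2 + var),
# verified per dimension against a Monte Carlo estimate.
def _check_vae_kl_closed_form():
    import numpy as np
    rng = np.random.RandomState(0)
    mu, log_v = 0.7, -0.3
    var = np.exp(log_v)
    closed = -0.5 * (1 + log_v) + 0.5 * (mu ** 2 + var)
    z = mu + np.sqrt(var) * rng.randn(1000000)
    log_q = -0.5 * np.log(2 * np.pi * var) - 0.5 * (z - mu) ** 2 / var
    log_p = -0.5 * np.log(2 * np.pi) - 0.5 * z ** 2
    assert abs(closed - (log_q - log_p).mean()) < 1e-2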
def _get_elbo(self):
    """Negative ELBO, an upper bound on the NLL."""
    # TODO (TF version of the bias KL, kept for reference):
    #   kldiv_bias = tf.reduce_sum(
    #       .5 * self.pvar_bias - .5 * self.logvar_bias +
    #       ((tf.exp(self.logvar_bias) + tf.square(self.mu_bias)) /
    #        (2 * tf.exp(self.pvar_bias))) - .5)

    # eqn 14
    kl_q_w_z_p = 0
    for mu, sig, z_T_f in zip(self.mus, self.sigs, self.z_T_fs):
        kl_q_w_z_p += (sig ** 2).sum() - T.log(
            sig ** 2).sum() + mu ** 2 * z_T_f ** 2  # leaving off the -1
    kl_q_w_z_p *= 0.5

    # eqn 15
    self.log_r_z_T_f_W = 0
    print '\n \n eqn15'
    for mu, sig, z_T_b, c, b_mu, b_logsig in zip(
            self.mus, self.sigs, self.z_T_bs, self.cs, self.b_mus,
            self.b_logsigs):
        # we compute this separately for every layer's W
        print 'eqn15'
        print [tt.shape for tt in [mu, sig, z_T_b, c, b_mu, b_logsig]]
        # reparametrization trick for eqn 9/10
        cTW_mu = T.dot(c, mu)
        cTW_sig = T.dot(c, sig ** 2) ** .5
        the_scalar = T.tanh(
            cTW_mu + cTW_sig * self.srng.normal(cTW_sig.shape)).sum()
        # TODO: double check (does the sum belong here??)
        # scaling b by the_scalar
        mu_tilde = (b_mu * the_scalar).squeeze()
        log_sig_tilde = (b_logsig * the_scalar).squeeze()
        self.log_r_z_T_f_W += (-.5 * T.exp(log_sig_tilde) *
                               (z_T_b - mu_tilde) ** 2 -
                               .5 * T.log(2 * np.pi) +
                               .5 * log_sig_tilde).sum()
    self.log_r_z_T_f_W += self.logdets_z_T_b

    # -eqn 13
    self.kl = (-self.logdets + kl_q_w_z_p -
               self.log_r_z_T_f_W).sum()  # TODO: why do I need the mean/sum??

    if self.output_type == 'categorical':
        self.logpyx = -cc(self.y, self.target_var).mean()
    elif self.output_type == 'real':
        self.logpyx = -se(self.y, self.target_var).mean()
    else:
        assert False
    # FIXME: not a scalar!?
    self.loss = -(self.logpyx -
                  self.weight * self.kl / T.cast(self.dataset_size, floatX))

    # DK - extra monitoring
    params = self.params
    ds = self.dataset_size
    self.monitored = []
def _get_elbo(self):
    """Negative ELBO, an upper bound on the NLL."""
    self.logpyx = -cc(self.y, self.target_var).mean()
    self.loss = -(self.logpyx -
                  self.weight * self.kl / T.cast(self.dataset_size, floatX))

    # DK - extra monitoring
    params = self.params
    ds = self.dataset_size
    self.logpyx_grad = flatten_list(
        T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
    self.logpw_grad = flatten_list(
        T.grad(-self.logpw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.logqw_grad = flatten_list(
        T.grad(self.logqw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.monitored = [
        self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
        self.logpw_grad, self.logqw_grad
    ]
def _get_elbo(self):
    """Negative ELBO, an upper bound on the NLL."""
    logdets = self.logdets
    self.logqw = -logdets
    # Originally:
    #   logqw = -(0.5*(ep**2).sum(1) + 0.5*T.log(2*np.pi)*num_params + logdets)
    # the constants are neglected in this wrapper.
    self.logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
    # Normal prior centered at zero; lbda is the inverse of the variance.
    self.kl = (self.logqw - self.logpw).mean()
    if self.output_type == 'categorical':
        self.logpyx = -cc(self.y, self.target_var).mean()
    elif self.output_type == 'real':
        self.logpyx = -se(self.y, self.target_var).mean()
    else:
        assert False
    self.loss = -(self.logpyx -
                  self.weight * self.kl / T.cast(self.dataset_size, floatX))

    # DK - extra monitoring
    params = self.params
    ds = self.dataset_size
    self.logpyx_grad = flatten_list(
        T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
    self.logpw_grad = flatten_list(
        T.grad(-self.logpw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.logqw_grad = flatten_list(
        T.grad(self.logqw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.monitored = [
        self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
        self.logpw_grad, self.logqw_grad
    ]
def _get_elbo(self):
    """Negative ELBO, an upper bound on the NLL."""
    logdets = self.logdets
    logqw = -logdets
    # Originally:
    #   logqw = -(0.5*(ep**2).sum(1) + 0.5*T.log(2*np.pi)*num_params + logdets)
    # the constants are neglected in this wrapper.
    logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
    # Normal prior centered at zero; lbda is the inverse of the variance.
    kl = (logqw - logpw).mean()
    logpyx = -cc(self.y, self.target_var).mean()
    self.loss = -(logpyx - kl / T.cast(self.dataset_size, floatX))
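# Illustrative numpy check that passing -log(lbda) as the log-variance makes
# lbda the precision (inverse variance), assuming log_normal takes
# (x, mean, log-variance) as the calls above suggest.
def _check_lbda_is_precision():
    import numpy as np
    lbda, w = 4.0, 0.3
    logvar = -np.log(lbda)  # i.e. variance = 1/lbda
    log_p = -0.5 * (np.log(2 * np.pi) + logvar) - 0.5 * w ** 2 / np.exp(logvar)
    var = 1.0 / lbda
    ref = -0.5 * np.log(2 * np.pi * var) - 0.5 * w ** 2 / var
    assert abs(log_p - ref) < 1e-8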
def _get_elbo(self):
    # NTS: is the KL waaay too big??
    self.kl = KL(self.prior_mean, self.prior_log_var, self.mean,
                 self.log_var).sum(-1).mean()
    if self.output_type == 'categorical':
        self.logpyx = -cc(self.y, self.target_var).mean()
    elif self.output_type == 'real':
        self.logpyx = -se(self.y, self.target_var).mean()
    else:
        assert False
    self.loss = -(self.logpyx -
                  self.weight * self.kl / T.cast(self.dataset_size, floatX))

    # DK - extra monitoring
    params = self.params
    ds = self.dataset_size
    self.logpyx_grad = flatten_list(
        T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
    self.monitored = [self.logpyx, self.logpyx_grad, self.kl]  # , self.target_var]
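# Numpy reference for the KL(...) call above, assuming KL computes the
# per-dimension divergence between diagonal Gaussians q = N(mean, v) and
# p = N(prior_mean, v0); the Theano version is assumed to match this form.
def _kl_diag_gaussians_reference(prior_mean, prior_log_var, mean, log_var):
    import numpy as np
    v, v0 = np.exp(log_var), np.exp(prior_log_var)
    return 0.5 * (prior_log_var - log_var +
                  (v + (mean - prior_mean) ** 2) / v0 - 1.)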
    weight = weights[:, t:t + num_param].reshape((wd1, ) + ws)
    inputs[w_layer] = weight
    layer = stochasticDenseLayer([layer, w_layer], ws[1])
    t += num_param

layer.nonlinearity = nonlinearities.softmax
y = get_output(layer, inputs)
# y = T.clip(y, 0.00001, 0.99999)  # stability

###########################
# loss and grad
logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
logqw = -(0.5 * (ep ** 2).sum(1) +
          0.5 * T.log(2 * np.pi) * num_params + logdets)
logpw = log_stdnormal(weights).sum(1)
logpyx = -cc(y, target_var).mean()
kl = (logqw - logpw).mean()
ds = T.cast(dataset_size, floatX)
loss = -(logpyx - kl / ds)
params = lasagne.layers.get_all_params([h_layer, layer])
grads = T.grad(loss, params)

###########################
# extra monitoring
nll_grads = flatten_list(
    T.grad(-logpyx, params, disconnected_inputs='warn')).norm(2)
prior_grads = flatten_list(
    T.grad(-logpw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
entropy_grads = flatten_list(
    T.grad(logqw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
outputs = [loss, -logpyx, -logpw / ds, logqw / ds,
           nll_grads, prior_grads, entropy_grads, logdets]  # logdets is "legacy"
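# Illustrative numpy check of the change-of-variables identity behind logqw:
# for w = a*ep + b (as in LinearFlowLayer), log q(w) = log N(ep; 0, I) - logdet
# with logdet = sum(log|a|); this matches the direct density N(w; b, a^2).
def _check_flow_logdensity():
    import numpy as np
    rng = np.random.RandomState(0)
    a, b = np.array([2.0, 0.5]), np.array([1.0, -1.0])
    ep = rng.randn(2)
    w = a * ep + b
    logdet = np.sum(np.log(np.abs(a)))
    log_q_ep = -0.5 * (ep ** 2).sum() - 0.5 * np.log(2 * np.pi) * ep.size
    ref = (-0.5 * np.log(2 * np.pi * a ** 2) -
           0.5 * (w - b) ** 2 / a ** 2).sum()
    assert abs((log_q_ep - logdet) - ref) < 1e-8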
def main():
    """
    MNIST example
    weight norm reparameterized MLP with prior on rescaling parameters
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--perdatapoint', action='store_true')
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=0.01, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    perdatapoint = args.perdatapoint
    coupling = 1  # args.coupling
    lr0 = args.lr0
    lrdecay = args.lrdecay
    lbda = np.cast[floatX](args.lbda)
    bs = args.bs
    size = max(10, min(50000, args.size))
    clip_grad = 100
    max_norm = 100

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    input_var = T.matrix('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # 784 -> 200 -> 10
    weight_shapes = [(784, 200), (200, 10)]
    # one rescaling parameter per output unit (weight norm)
    num_params = sum(ws[1] for ws in weight_shapes)
    if perdatapoint:
        wd1 = input_var.shape[0]
    else:
        wd1 = 1

    # stochastic hypernet
    ep = srng.normal(std=0.01, size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 784])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = ws[1]
        w_layer = lasagne.layers.InputLayer((None, ws[1]))
        weight = weights[:, t:t + num_param].reshape((wd1, ws[1]))
        inputs[w_layer] = weight
        layer = stochasticDenseLayer2([layer, w_layer], ws[1])
        print layer.output_shape
        t += num_param
    layer.nonlinearity = nonlinearities.softmax
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 * (ep ** 2).sum(1) +
              0.5 * T.log(2 * np.pi) * num_params + logdets)
    # logpw = log_normal(weights, 0., -T.log(lbda)).sum(1)
    logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.adam(cgrads, params, learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss, updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict, train_x[:size], train_y[:size],
                          valid_x, valid_y, lr0, lrdecay, bs)
def main():
    """
    MNIST example
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--perdatapoint', action='store_true')
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=10, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    perdatapoint = args.perdatapoint
    coupling = args.coupling
    size = max(10, min(50000, args.size))
    clip_grad = 10
    max_norm = 1000

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    input_var = T.matrix('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # 784 -> 20 -> 20 -> 10
    weight_shapes = [(784, 20), (20, 20), (20, 10)]
    num_params = sum(np.prod(ws) for ws in weight_shapes)
    if perdatapoint:
        wd1 = input_var.shape[0]
    else:
        wd1 = 1

    # stochastic hypernet
    ep = srng.normal(size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledConv1DLayer(h_layer, 16, 5)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledConv1DLayer(h_layer, 16, 5)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 784])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = np.prod(ws)
        print t, t + num_param
        w_layer = lasagne.layers.InputLayer((None, ) + ws)
        weight = weights[:, t:t + num_param].reshape((wd1, ) + ws)
        inputs[w_layer] = weight
        layer = stochasticDenseLayer([layer, w_layer], ws[1])
        t += num_param
    layer.nonlinearity = nonlinearities.softmax
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 * (ep ** 2).sum(1) +
              0.5 * T.log(2 * np.pi) * num_params + logdets)
    logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.nesterov_momentum(cgrads, params,
                                                learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss, updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict, train_x[:size], train_y[:size],
                          valid_x, valid_y)

    # Monte Carlo evaluation: average the stochastic softmax outputs
    output_probs = theano.function([input_var], y)
    MCt = np.zeros((100, 1000, 10))
    MCv = np.zeros((100, 1000, 10))
    for i in range(100):
        MCt[i] = output_probs(train_x[:1000])
        MCv[i] = output_probs(valid_x[:1000])

    tr = np.equal(MCt.mean(0).argmax(-1), train_y[:1000].argmax(-1)).mean()
    va = np.equal(MCv.mean(0).argmax(-1), valid_y[:1000].argmax(-1)).mean()
    print "train perf=", tr
    print "valid perf=", va

    for ii in range(15):
        print np.round(MCt[ii][0] * 1000)
def main():
    """
    MNIST example
    CNN with hypernet-generated filters and a weight norm reparameterized
    softmax output layer
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=0.01, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    coupling = args.coupling
    lr0 = args.lr0
    lrdecay = args.lrdecay
    lbda = np.cast[floatX](args.lbda)
    bs = args.bs
    size = max(10, min(50000, args.size))
    clip_grad = 5
    max_norm = 10

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)
    train_x = train_x.reshape(50000, 1, 28, 28)
    valid_x = valid_x.reshape(10000, 1, 28, 28)
    test_x = test_x.reshape(10000, 1, 28, 28)

    input_var = T.tensor4('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    weight_shapes = [
        (16, 1, 5, 5),   # -> (None, 16, 14, 14)
        (16, 16, 5, 5),  # -> (None, 16, 7, 7)
        (16, 16, 5, 5)   # -> (None, 16, 4, 4)
    ]
    # all conv filter weights, plus 10 parameters for the output layer
    num_params = sum(np.prod(ws) for ws in weight_shapes) + 10
    wd1 = 1

    # stochastic hypernet
    ep = srng.normal(std=0.01, size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 1, 28, 28])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = np.prod(ws)
        weight = weights[:, t:t + num_param].reshape(ws)
        num_filters = ws[0]
        filter_size = ws[2]
        stride = 2
        pad = 'same'
        layer = stochasticConv2DLayer([layer, weight],
                                      num_filters, filter_size, stride, pad)
        print layer.output_shape
        t += num_param

    w_layer = lasagne.layers.InputLayer((None, 10))
    weight = weights[:, t:t + 10].reshape((wd1, 10))
    inputs[w_layer] = weight
    layer = stochasticDenseLayer2([layer, w_layer], 10,
                                  nonlinearity=nonlinearities.softmax)
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 * (ep ** 2).sum(1) +
              0.5 * T.log(2 * np.pi) * num_params + logdets)
    logpw = log_normal(weights, 0., -T.log(lbda)).sum(1)
    # logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([layer])[1:]  # excluding rand state
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.adam(cgrads, params, learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss, updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict, train_x[:size], train_y[:size],
                          valid_x, valid_y, lr0, lrdecay, bs)
    output_probs = theano.function([input_var], y)
    MCt = np.zeros((100, 1000, 10))
    MCv = np.zeros((100, 1000, 10))
    for i in range(100):
        MCt[i] = output_probs(train_x[:1000])
        MCv[i] = output_probs(valid_x[:1000])

    tr = np.equal(MCt.mean(0).argmax(-1), train_y[:1000].argmax(-1)).mean()
    va = np.equal(MCv.mean(0).argmax(-1), valid_y[:1000].argmax(-1)).mean()
    print "train perf=", tr
    print "valid perf=", va

    for ii in range(15):
        print np.round(MCt[ii][0] * 1000)
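# Convenience wrapper (illustrative) around the Monte Carlo averaging above:
# each call to output_probs resamples ep inside the graph, so averaging the
# softmax outputs approximates the predictive distribution.
def mc_predict(output_probs, x, n_samples=100):
    import numpy as np
    return np.array([output_probs(x) for _ in range(n_samples)]).mean(0)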
def __init__(
        self,
        srng=RandomStreams(seed=427),
        prior_mean=0,
        prior_log_var=0,
        n_hiddens=2,
        n_units=800,
        n_inputs=784,
        n_classes=10,
        output_type='categorical',
        random_biases=1,
        # dataset_size=None,
        opt='adam',
        # weight=1.,  # the weight of the KL term
        **kargs):
    self.__dict__.update(locals())

    # TODO
    self.dataset_size = T.scalar('dataset_size')
    self.weight = T.scalar('weight')
    self.learning_rate = T.scalar('learning_rate')

    self.weight_shapes = []
    if n_hiddens > 0:
        self.weight_shapes.append((n_inputs, n_units))
        # self.params.append((theano.shared()))
        for i in range(1, n_hiddens):
            self.weight_shapes.append((n_units, n_units))
        self.weight_shapes.append((n_units, n_classes))
    else:
        self.weight_shapes = [(n_inputs, n_classes)]

    if self.random_biases:
        self.num_params = sum(
            (ws[0] + 1) * ws[1] for ws in self.weight_shapes)
    else:
        self.num_params = sum(ws[0] * ws[1] for ws in self.weight_shapes)
    self.wd1 = 1

    self.X = T.matrix()
    self.y = T.matrix()
    self.mean = ts(self.num_params)
    self.log_var = ts(self.num_params, scale=1e-6, bias=-1e8)
    self.params = [self.mean, self.log_var]
    self.ep = self.srng.normal(size=(self.num_params, ), dtype=floatX)
    # NB: scales the noise by exp(log_var), not exp(.5 * log_var)
    self.weights = self.mean + (T.exp(self.log_var) +
                                np.float32(.000001)) * self.ep

    t = 0
    acts = self.X
    for nn, ws in enumerate(self.weight_shapes):
        if self.random_biases:
            num_param = (ws[0] + 1) * ws[1]
            weight_and_bias = self.weights[t:t + num_param]
            weight = weight_and_bias[:ws[0] * ws[1]].reshape((ws[0], ws[1]))
            bias = weight_and_bias[ws[0] * ws[1]:].reshape((ws[1], ))
            acts = T.dot(acts, weight) + bias
        else:
            assert False  # TODO

        if nn < len(self.weight_shapes) - 1:
            acts = (acts > 0.) * acts  # ReLU
        else:
            acts = T.nnet.softmax(acts)
        t += num_param

    y_hat = acts
    # y_hat = T.clip(y_hat, 0.001, 0.999)  # stability
    self.y_hat = y_hat

    self.kl = KL(self.prior_mean, self.prior_log_var, self.mean,
                 self.log_var).sum(-1).mean()
    self.logpyx = -cc(self.y_hat, self.y).mean()
    self.logpyx = -se(self.y_hat, self.y).mean()  # NB: overrides the line above
    self.loss = -(self.logpyx -
                  self.weight * self.kl / T.cast(self.dataset_size, floatX))
    self.loss = se(self.y_hat, self.y).mean()  # NB: overrides the ELBO above
    self.logpyx_grad = flatten_list(
        T.grad(-self.logpyx, self.params,
               disconnected_inputs='warn')).norm(2)
    self.monitored = [self.logpyx, self.logpyx_grad, self.kl]

    # def _get_useful_funcs(self):
    self.predict_proba = theano.function([self.X], self.y_hat)
    self.predict = theano.function([self.X], self.y_hat.argmax(1))
    self.predict_fixed_mask = theano.function([self.X, self.weights],
                                              self.y_hat)
    self.sample_weights = theano.function([], self.weights)
    self.monitor_fn = theano.function([self.X, self.y], self.monitored)
    # , (self.predict(x) == y).sum()

    # def _get_grads(self):
    grads = T.grad(self.loss, self.params)
    # mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=self.max_norm)
    # cgrads = [T.clip(g, -self.clip_grad, self.clip_grad) for g in mgrads]
    cgrads = grads
    if self.opt == 'adam':
        self.updates = lasagne.updates.adam(
            cgrads, self.params, learning_rate=self.learning_rate)
    elif self.opt == 'momentum':
        self.updates = lasagne.updates.nesterov_momentum(
            cgrads, self.params, learning_rate=self.learning_rate)
    elif self.opt == 'sgd':
        self.updates = lasagne.updates.sgd(
            cgrads, self.params, learning_rate=self.learning_rate)

    # def _get_train_func(self):
    inputs = [self.X, self.y, self.dataset_size, self.learning_rate,
              self.weight]
    train = theano.function(inputs, self.loss, updates=self.updates,
                            on_unused_input='warn')
    self.train_func_ = train
    # DK - putting this here because it doesn't get overwritten by subclasses
    self.monitor_func = theano.function(
        [self.X, self.y, self.dataset_size, self.learning_rate],
        self.monitored,
        on_unused_input='warn')
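# Minimal training-loop sketch for the class above (illustrative only):
# train_func_ takes (X, y, dataset_size, learning_rate, weight), where
# `weight` rescales the KL term.
def _train_sketch(model, train_x, train_y, n_epochs=10, bs=50, lr=0.001):
    import numpy as np
    n = len(train_x)
    for epoch in range(n_epochs):
        for i in range(0, n, bs):
            model.train_func_(train_x[i:i + bs], train_y[i:i + bs],
                              np.float32(n), np.float32(lr), np.float32(1.0))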
def __init__(self,
             arch=None,
             lbda=1,
             perdatapoint=False,
             srng=RandomStreams(seed=427),
             prior=log_normal,
             opt='adam',
             coupling=4,
             coupling_dim=200,
             pad='same',
             stride=2,
             pool=None,
             uncoupled_init=0,
             convex_combination=0):

    # NB: weight_shapes/args are only defined here for arch == 'Riashat'
    if arch == 'Riashat':
        kernel_width = 3
        self.kernel_width = kernel_width
        stride = 1
        self.stride = stride
        pad = 'valid'
        self.pad = pad
        self.weight_shapes = [
            (32, 1, kernel_width, kernel_width),
            (32, 32, kernel_width, kernel_width)
        ]
        self.args = [[32, kernel_width, stride, pad, rectify, 'none'],
                     [32, kernel_width, stride, pad, rectify, 'max']]
        self.pool_size = 5
    else:
        self.pool_size = 2

    self.n_kernels = np.array(self.weight_shapes)[:, 1].sum()
    self.kernel_shape = self.weight_shapes[0][:1] + self.weight_shapes[0][2:]
    print "kernel_shape", self.kernel_shape
    self.kernel_size = np.prod(self.weight_shapes[0])

    self.num_classes = 10
    if arch == 'Riashat':
        self.num_hids = 256
    else:
        self.num_hids = 128
    self.num_mlp_layers = 1
    self.num_mlp_params = self.num_classes + \
                          self.num_hids * self.num_mlp_layers
    # one rescaling parameter per filter (weight norm)
    self.num_cnn_params = np.sum(np.array(self.weight_shapes)[:, 0])
    self.num_params = self.num_mlp_params + self.num_cnn_params

    self.coupling = coupling
    self.extra_l2 = 0
    self.convex_combination = convex_combination
    self.lbda = lbda
    self.perdatapoint = perdatapoint
    self.srng = srng
    self.prior = prior
    self.__dict__.update(locals())

    # def _get_theano_variables(self):
    self.input_var = T.tensor4('input_var')  # 4D input for the CNN
    self.target_var = T.matrix('target_var')
    self.dataset_size = T.scalar('dataset_size')
    self.learning_rate = T.scalar('learning_rate')

    if perdatapoint:
        self.wd1 = self.input_var.shape[0]
    else:
        self.wd1 = 1

    # def _get_hyper_net(self):
    # initial random noise
    print self.num_params
    ep = self.srng.normal(size=(self.wd1, self.num_params), dtype=floatX)
    logdets_layers = []
    h_net = lasagne.layers.InputLayer([None, self.num_params])

    # mean and variance of the initial noise
    layer_temp = LinearFlowLayer(h_net)
    h_net = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if self.coupling:
        layer_temp = CoupledWNDenseLayer(h_net, coupling_dim,
                                         uncoupled_init=uncoupled_init)
        h_net = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        for c in range(self.coupling - 1):
            h_net = PermuteLayer(h_net, self.num_params)
            layer_temp = CoupledWNDenseLayer(h_net, coupling_dim,
                                             uncoupled_init=uncoupled_init)
            h_net = IndexLayer(layer_temp, 0)
            logdets_layers.append(IndexLayer(layer_temp, 1))

    if self.convex_combination:
        layer_temp = ConvexBiasLayer(
            h_net, upweight_primary=self.convex_combination)
        h_net = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    self.h_net = h_net
    self.weights = lasagne.layers.get_output(h_net, ep)
    self.logdets = sum([get_output(ld, ep) for ld in logdets_layers])

    # def _get_primary_net(self):
    t = np.cast['int32'](0)
    if 1:  # self.dataset == 'mnist':
        p_net = lasagne.layers.InputLayer([None, 1, 28, 28])
    print p_net.output_shape
    inputs = {p_net: self.input_var}
    # logpw = np.float32(0.)
    for ws, args in zip(self.weight_shapes, self.args):
        num_filters = ws[0]
        # TO-DO: generalize to have multiple samples?
        weight = self.weights[0, t:t + num_filters].dimshuffle(
            0, 'x', 'x', 'x')
        num_filters = args[0]
        filter_size = args[1]
        stride = args[2]
        pad = args[3]
        nonl = args[4]
        p_net = lasagne.layers.Conv2DLayer(p_net, num_filters, filter_size,
                                           stride, pad, nonlinearity=nonl)
        p_net = stochastic_weight_norm(p_net, weight)
        if args[5] == 'max':
            p_net = lasagne.layers.MaxPool2DLayer(p_net, self.pool_size)
        # print p_net.output_shape
        t += num_filters

    for layer in range(self.num_mlp_layers):
        weight = self.weights[:, t:t + self.num_hids].reshape(
            (self.wd1, self.num_hids))
        p_net = lasagne.layers.DenseLayer(p_net, self.num_hids,
                                          nonlinearity=rectify)
        p_net = stochastic_weight_norm(p_net, weight)
        if self.extra_l2:
            self.l2_penalty = \
                lasagne.regularization.regularize_layer_params_weighted(
                    {p_net: 3.5 / 128}, lasagne.regularization.l2)
        t += self.num_hids

    weight = self.weights[:, t:t + self.num_classes].reshape(
        (self.wd1, self.num_classes))
    p_net = lasagne.layers.DenseLayer(p_net, self.num_classes,
                                      nonlinearity=nonlinearities.softmax)
    p_net = stochastic_weight_norm(p_net, weight)
    y = T.clip(get_output(p_net, inputs), 0.001, 0.999)  # stability

    self.p_net = p_net
    self.y = y

    # def _get_params(self):
    params = lasagne.layers.get_all_params([self.h_net, self.p_net])
    self.params = list()
    for param in params:
        if type(param) is not RSSV:
            self.params.append(param)

    params0 = lasagne.layers.get_all_param_values([self.h_net, self.p_net])
    params = lasagne.layers.get_all_params([self.h_net, self.p_net])
    updates = {p: p0 for p, p0 in zip(params, params0)}
    self.reset = theano.function([], None, updates=updates)
    self.add_reset('init')

    # def _get_elbo(self):
    logdets = self.logdets
    self.logqw = -logdets
    self.logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
    self.kl = (self.logqw - self.logpw).mean()
    self.kl_term = self.kl / T.cast(self.dataset_size, floatX)
    self.logpyx = -cc(self.y, self.target_var).mean()
    self.loss = -self.logpyx + self.kl_term

    # DK - extra monitoring (TODO)
    params = self.params
    ds = self.dataset_size
    self.logpyx_grad = flatten_list(
        T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
    self.logpw_grad = flatten_list(
        T.grad(-self.logpw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.logqw_grad = flatten_list(
        T.grad(self.logqw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.monitored = [
        self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
        self.logpw_grad, self.logqw_grad
    ]

    # def _get_grads(self):
    grads = T.grad(self.loss, self.params)
    mgrads = lasagne.updates.total_norm_constraint(grads,
                                                   max_norm=self.max_norm)
    cgrads = [T.clip(g, -self.clip_grad, self.clip_grad) for g in mgrads]
    if self.opt == 'adam':
        self.updates = lasagne.updates.adam(
            cgrads, self.params, learning_rate=self.learning_rate)
    elif self.opt == 'momentum':
        self.updates = lasagne.updates.nesterov_momentum(
            cgrads, self.params, learning_rate=self.learning_rate)
    elif self.opt == 'sgd':
        self.updates = lasagne.updates.sgd(
            cgrads, self.params, learning_rate=self.learning_rate)

    # def _get_train_func(self):
    train = theano.function([
        self.input_var, self.target_var, self.dataset_size,
        self.learning_rate
    ], self.loss, updates=self.updates)
    self.train_func = train
    # DK - putting this here because it doesn't get overwritten by subclasses
    self.monitor_func = theano.function([
        self.input_var, self.target_var, self.dataset_size,
        self.learning_rate
    ], self.monitored, on_unused_input='warn')

    # def _get_useful_funcs(self):
    self.predict_proba = theano.function([self.input_var], self.y)
    self.predict = theano.function([self.input_var], self.y.argmax(1))
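# Illustrative usage of the CNN class above; `HyperCNN` is a hypothetical
# stand-in for whatever class this __init__ belongs to. Inputs are assumed to
# be MNIST images of shape (N, 1, 28, 28) with one-hot targets.
def _cnn_usage_sketch(HyperCNN, train_x, train_y):
    import numpy as np
    model = HyperCNN(arch='Riashat', lbda=1, coupling=4)  # hypothetical name
    loss = model.train_func(train_x[:50], train_y[:50],
                            np.float32(len(train_x)), np.float32(0.001))
    return loss, model.predict(train_x[:50])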
def __init__(self,
             n_hiddens,
             n_units,
             n_inputs=784,
             dropout=False,
             flow='IAF',
             norm_type='WN',
             coupling=0,
             n_units_h=200,
             static_bias=True,
             prior=log_normal,
             lbda=1,
             srng=RandomStreams(seed=427),
             max_norm=10,
             clip_grad=5):
    """
    flow: if None, just a regular MLE estimate of the parameters.
        `IAF` or `RealNVP` approximates the rescaling parameters (and shift)
        of weightnorm or batchnorm.
    coupling: number of `IAF` / `RealNVP` transformation layers, used when
        flow is not None.
    dropout: dropout layer after each activation.
    static_bias: whether the hypernet should output the shifting parameters
        of WN/BN when flow is not None.
    """
    layer = lasagne.layers.InputLayer([None, n_inputs])
    self.n_hiddens = n_hiddens
    self.n_units = n_units
    self.weight_shapes = list()
    self.weight_shapes.append((n_inputs, n_units))
    for i in range(1, n_hiddens):
        self.weight_shapes.append((n_units, n_units))
    self.weight_shapes.append((n_units, 10))
    self.num_params = sum(ws[1] for ws in self.weight_shapes)

    self.flow = flow
    self.norm_type = norm_type
    self.coupling = coupling
    self.dropout = dropout
    self.static_bias = static_bias
    self.prior = prior
    self.lbda = lbda
    self.max_norm = max_norm
    self.clip_grad = clip_grad

    for j, ws in enumerate(self.weight_shapes):
        layer = lasagne.layers.DenseLayer(
            layer, ws[1], nonlinearity=lasagne.nonlinearities.rectify)
        if dropout:
            if j != len(self.weight_shapes) - 1:
                layer = lasagne.layers.dropout(layer)
    layer.nonlinearity = lasagne.nonlinearities.softmax

    self.input_var = T.matrix('input_var')
    self.target_var = T.matrix('target_var')
    self.learning_rate = T.scalar('learning_rate')
    self.inputs = [self.input_var, self.target_var, self.learning_rate]
    self.layer = layer

    if flow is None:
        self.output_var = get_output(layer, self.input_var)
        self.output_var_det = get_output(layer, self.input_var,
                                         deterministic=True)
        losses = cc(self.output_var, self.target_var)
        self.loss = losses.mean()
        self.prints = []
    elif flow == 'IAF' or flow == 'RealNVP':
        self.dataset_size = T.scalar('dataset_size')
        self.beta = T.scalar('beta')  # annealing weight
        self.inputs = [
            self.input_var, self.target_var, self.dataset_size,
            self.learning_rate, self.beta
        ]
        copies = 1 if self.static_bias else 2
        hnet, ld, num_params = hypernet(layer, n_units_h, coupling, flow,
                                        copies=copies)
        static_bias = theano.shared(np.zeros(
            (num_params)).astype(floatX)) if self.static_bias else None

        ep = srng.normal(size=(1, num_params), dtype=floatX)
        output_var = N_get_output(layer,
                                  self.input_var,
                                  hnet,
                                  ep,
                                  norm_type=norm_type,
                                  static_bias=static_bias)
        weights = get_output(hnet, ep)
        logdets = get_output(ld, ep)

        self.num_params = num_params
        self.N_bias = static_bias
        self.hnet = hnet
        self.ep = ep
        self.output_var_ = output_var
        if norm_type == 'BN' and flow is not None:
            print 'BN test time uses running avg'
            # self.output_var = N_get_output(layer,
            #                                self.input_var, hnet, ep,
            #                                norm_type=norm_type,
            #                                static_bias=static_bias,
            #                                test_time=True)
            self.output_var = self.output_var_
        else:
            self.output_var = self.output_var_
        self.weights = weights
        self.logdets = logdets

        loss, prints = get_elbo(
            T.clip(output_var, 0.001, 0.999),  # stability
            self.target_var,
            self.weights,
            self.logdets,
            self.beta,
            self.dataset_size,
            prior=self.prior,
            lbda=self.lbda,
            output_type='categorical')
        self.loss = loss
        self.prints = prints

    self.params = lasagne.layers.get_all_params(self.layer)
    if hasattr(self, 'hnet'):  # i.e. flow is not None
        self.params += lasagne.layers.get_all_params(self.hnet)
    if hasattr(self, 'N_bias'):
        if self.N_bias is not None:
            self.params.append(self.N_bias)

    self.grads = stable_grad(self.loss, self.params, self.clip_grad,
                             self.max_norm)
    self.updates = lasagne.updates.adam(self.grads, self.params,
                                        self.learning_rate)

    print '\tgetting train_func'
    if len(self.inputs) == 3:
        self.train_func_ = theano.function(self.inputs, [
            self.loss,
        ] + self.prints, updates=self.updates)
        self.train_func = lambda x, y, n, lr, w: self.train_func_(x, y, lr)
    elif len(self.inputs) == 5:
        self.train_func = theano.function(self.inputs, [
            self.loss,
        ] + self.prints, updates=self.updates)

    print '\tgetting useful_funcs'
    self.predict_proba = theano.function([self.input_var], self.output_var)
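# Illustrative training loop for the flow == 'IAF'/'RealNVP' branch above:
# inputs are (X, y, dataset_size, learning_rate, beta), where beta anneals
# the KL weight; the linear warm-up schedule here is an assumption.
def _flow_mlp_train_sketch(model, train_x, train_y, n_epochs=10, bs=50,
                           lr=0.001):
    import numpy as np
    n = len(train_x)
    step, total = 0, max(1, n_epochs * (n // bs))
    for epoch in range(n_epochs):
        for i in range(0, n, bs):
            beta = min(1.0, step / float(max(1, total // 2)))  # warm-up
            model.train_func(train_x[i:i + bs], train_y[i:i + bs],
                             np.float32(n), np.float32(lr), np.float32(beta))
            step += 1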