def rmsprop(cost, incomings, params, options):
    ''' incomings should be a list '''
    lr = options.get('lr', 1e-2)
    clip_norm = options.get('clip_norm', None)
    dr = options.get('dr', 0.95)  # decay rate
    epsilon = options.get('epsilon', 1e-8)
    grads = T.grad(cost, params.values())
    if clip_norm is not None:
        l2_norm = [T.sqrt(T.sum(g ** 2)) for g in grads]
        for i in range(len(grads)):
            grads[i] = T.switch(l2_norm[i] > clip_norm,
                                grads[i] * clip_norm / l2_norm[i],
                                grads[i])
    cache = [theano.shared(cast_floatX(np.zeros_like(p.get_value())))
             for p in params.values()]
    updates = []

    def c_update(c, g):
        return dr * c + (1 - dr) * T.sqr(g)

    for p, g, c in zip(params.values(), grads, cache):
        updates.append([p, p - lr * g / (T.sqrt(c_update(c, g)) + epsilon)])
        updates.append([c, c_update(c, g)])
    if isinstance(lr, T.TensorVariable):
        return theano.function(incomings + [lr], cost, updates=updates,
                               allow_input_downcast=True)
    else:
        return theano.function(incomings, cost, updates=updates,
                               allow_input_downcast=True)
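# For reference, the per-parameter update encoded by the graph above can be
# written as a plain NumPy step (a checking aid only; the function name and
# array arguments are illustrative, not part of this codebase):
import numpy as np

def rmsprop_step_np(p, g, cache, lr=1e-2, dr=0.95, epsilon=1e-8):
    """One RMSProp step on NumPy arrays, mirroring c_update above."""
    cache = dr * cache + (1 - dr) * g ** 2           # leaky average of squared gradients
    p = p - lr * g / (np.sqrt(cache) + epsilon)      # per-parameter scaled step
    return p, cache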
def add_param(shape, params, name=None, val=None, initializer=init.HeUniform()):
    # registers a parameter in the params dict and returns its name;
    # if the name is already present, nothing is added
    if name not in params:
        if name is None:
            name = name_suf(name, '_%d' % len(params))
        if isinstance(val, theano.tensor.sharedvar.TensorSharedVariable):
            # an existing shared variable is stored as-is
            assert shape == val.get_value().shape
            assert val.dtype == theano.config.floatX
            # if val.dtype != theano.config.floatX:
            #     val = val.astype(theano.config.floatX)
            params[name] = val
            return name
        if val is None:
            val = cast_floatX(initializer(shape))
        else:
            val = cast_floatX(val)
            assert val.shape == shape
        params[name] = theano.shared(val)
        return name
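# A minimal usage sketch of add_param together with the rmsprop trainer above,
# fitting a tiny linear regression. The setup (names 'W'/'b', shapes, data) is
# illustrative and assumes add_param, rmsprop, init and cast_floatX from this
# codebase are importable in the current namespace.
from collections import OrderedDict
import numpy as np
import theano.tensor as T

params = OrderedDict()                         # name -> theano shared variable
x = T.matrix('x')
y = T.vector('y')
W = params[add_param((5, 1), params, name='W')]                     # HeUniform by default
b = params[add_param((1,), params, name='b', val=np.zeros((1,)))]   # explicit initial value
pred = T.dot(x, W).flatten() + b[0]
cost = T.mean(T.sqr(pred - y))
train_fn = rmsprop(cost, [x, y], params, {'lr': 1e-3, 'clip_norm': 5.0})
loss = train_fn(np.random.randn(32, 5), np.random.randn(32))        # one update step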
def adam(cost, incomings, params, options):
    ''' incomings should be a list '''
    lr = options.get('lr', 1e-2)
    beta1 = options.get('beta1', 0.9)
    beta2 = options.get('beta2', 0.999)
    epsilon = options.get('epsilon', 1e-8)
    grads = T.grad(cost, params.values())
    velocity = [theano.shared(cast_floatX(np.zeros_like(p.get_value())))
                for p in params.values()]
    momentum = [theano.shared(cast_floatX(np.zeros_like(p.get_value())))
                for p in params.values()]
    beta1_run = theano.shared(cast_floatX(beta1))
    beta2_run = theano.shared(cast_floatX(beta2))
    # cannot use beta2_run = theano.shared(beta2).astype(theano.config.floatX),
    # since the target of an update in theano.function has to be a shared
    # variable, not an Elemwise{Cast{float32}}.0
    updates = []

    def m_update(m, g):
        return beta1 * m + (1 - beta1) * g

    def mb(m):
        return m / (1 - beta1_run)
        # return m

    def v_update(v, g):
        return beta2 * v + (1 - beta2) * T.sqr(g)

    def vb(v):
        return v / (1 - beta2_run)
        # return v

    for p, g, v, m in zip(params.values(), grads, velocity, momentum):
        # theano computes every update expression from the pre-update value of p,
        # so the new value of p has to be written out explicitly
        updates.append([p, p - lr * mb(m_update(m, g))
                        / (T.sqrt(vb(v_update(v, g))) + epsilon)])
        updates.append([v, v_update(v, g)])
        updates.append([m, m_update(m, g)])
    updates.append([beta1_run, beta1_run * beta1])
    updates.append([beta2_run, beta2_run * beta2])
    if isinstance(lr, T.TensorVariable):
        return theano.function(incomings + [lr], cost, updates=updates,
                               allow_input_downcast=True)
    else:
        return theano.function(incomings, cost, updates=updates,
                               allow_input_downcast=True)
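# The bias-corrected moments that mb/vb compute can be spelled out as a plain
# NumPy reference step; t is the 1-based step count, so beta1 ** t plays the
# role of beta1_run above (illustrative, not part of this codebase):
import numpy as np

def adam_step_np(p, g, m, v, t, lr=1e-2, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """One Adam step on NumPy arrays, mirroring m_update/v_update/mb/vb above."""
    m = beta1 * m + (1 - beta1) * g                  # first moment estimate
    v = beta2 * v + (1 - beta2) * g ** 2             # second moment estimate
    m_hat = m / (1 - beta1 ** t)                     # bias correction (beta1_run)
    v_hat = v / (1 - beta2 ** t)                     # bias correction (beta2_run)
    p = p - lr * m_hat / (np.sqrt(v_hat) + epsilon)
    return p, m, v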
def DropoutLayer(incoming, use_noise, p):
    """
    T.switch acts as an if statement on the value of the theano shared
    variable use_noise (a 0/1 flag): when it is on, the incoming tensor is
    multiplied by a random binary mask; when it is off, the activations are
    scaled by the keep probability p instead. use_noise is toggled on during
    training and off during testing.
    """
    trng = RandomStreams(1234)
    incoming, input_shape = incoming
    output_shape = input_shape
    proj = T.switch(use_noise,
                    incoming * trng.binomial(incoming.shape, p=p, n=1,
                                             dtype=incoming.dtype),
                    # trng.binomial(incoming.shape, p=p, n=1, dtype=theano.config.floatX),
                    incoming * cast_floatX(p))
    # return (proj.astype(theano.config.floatX), output_shape)
    return (proj, output_shape)
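# A minimal sketch of how DropoutLayer is meant to be used, toggling use_noise
# between training and evaluation. The shapes and the surrounding graph are
# illustrative; DropoutLayer and its RandomStreams import are assumed to come
# from the module above.
import numpy as np
import theano
import theano.tensor as T

use_noise = theano.shared(np.float32(1.))      # 1 = random masking, 0 = rescale by p
x = T.matrix('x')
h, h_shape = DropoutLayer((x, (None, 128)), use_noise, p=0.5)   # p is the keep probability
f = theano.function([x], h, allow_input_downcast=True)

use_noise.set_value(np.float32(1.))            # stochastic masking at train time
train_out = f(np.random.randn(4, 128))
use_noise.set_value(np.float32(0.))            # deterministic scaling at test time
test_out = f(np.random.randn(4, 128))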
def adagrad(cost, incomings, params, options):
    ''' incomings should be a list '''
    lr = options.get('lr', 1e-2)
    epsilon = options.get('epsilon', 1e-8)
    grads = T.grad(cost, params.values())
    cache = [theano.shared(cast_floatX(np.zeros_like(p.get_value())))
             for p in params.values()]
    updates = []

    def c_update(c, g):
        return c + T.sqr(g)

    for p, g, c in zip(params.values(), grads, cache):
        updates.append([p, p - lr * g / (T.sqrt(c_update(c, g)) + epsilon)])
        updates.append([c, c_update(c, g)])
    if isinstance(lr, T.TensorVariable):
        return theano.function(incomings + [lr], cost, updates=updates,
                               allow_input_downcast=True)
    else:
        return theano.function(incomings, cost, updates=updates,
                               allow_input_downcast=True)
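# AdaGrad differs from the RMSProp variant above only in the cache update: the
# squared gradients accumulate without decay, so the effective step size
# shrinks monotonically. A plain NumPy reference (illustrative only):
import numpy as np

def adagrad_step_np(p, g, cache, lr=1e-2, epsilon=1e-8):
    """One AdaGrad step on NumPy arrays; the cache only ever grows."""
    cache = cache + g ** 2                           # no decay, unlike RMSProp
    p = p - lr * g / (np.sqrt(cache) + epsilon)
    return p, cache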
def nesterov_momentum(cost, incomings, params, options):
    ''' incomings should be a list '''
    lr = options.get('lr', 1e-2)
    mu = options.get('mu', 0.9)
    grads = T.grad(cost, params.values())
    velocity = [theano.shared(cast_floatX(np.zeros_like(p.get_value())))
                for p in params.values()]
    updates = []

    def v_update(v, g):
        return mu * v - lr * g

    for p, g, v in zip(params.values(), grads, velocity):
        # look-ahead form: p_new = p - mu * v_old + (1 + mu) * v_new
        updates.append([p, p - mu * v + (1 + mu) * v_update(v, g)])
        updates.append([v, v_update(v, g)])
    if isinstance(lr, T.TensorVariable):
        return theano.function(incomings + [lr], cost, updates=updates,
                               allow_input_downcast=True)
    else:
        return theano.function(incomings, cost, updates=updates,
                               allow_input_downcast=True)
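# The parameter update above is the usual look-ahead rewrite of Nesterov
# momentum, in which the gradient is taken at the stored (look-ahead)
# parameters and the step is expressed through the old and new velocity.
# A plain NumPy reference of the same step (illustrative only):
import numpy as np

def nesterov_step_np(p, g, v, lr=1e-2, mu=0.9):
    """One Nesterov momentum step in look-ahead form, mirroring the updates above."""
    v_new = mu * v - lr * g                    # v_update
    p = p - mu * v + (1 + mu) * v_new          # look-ahead parameter step
    return p, v_new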
def momentum(cost, incomings, params, options):
    ''' incomings should be a list '''
    lr = options.get('lr', 1e-2)
    mu = options.get('mu', 0.9)
    grads = T.grad(cost, params.values())
    velocity = [theano.shared(cast_floatX(np.zeros_like(p.get_value())))
                for p in params.values()]
    updates = []

    def v_update(v, g):
        return mu * v - lr * g

    for p, g, v in zip(params.values(), grads, velocity):
        # theano computes every update expression from the pre-update value of p,
        # so the new value of p has to be written out explicitly
        updates.append([p, p + v_update(v, g)])
        updates.append([v, v_update(v, g)])
    if isinstance(lr, T.TensorVariable):
        return theano.function(incomings + [lr], cost, updates=updates,
                               allow_input_downcast=True)
    else:
        return theano.function(incomings, cost, updates=updates,
                               allow_input_downcast=True)
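# All of the trainers above compile the learning rate into the graph unless
# options['lr'] is a symbolic tensor, in which case the compiled function
# expects the learning rate as its last argument. A sketch of that branch,
# reusing cost/params/x/y from the add_param example earlier (the decay
# schedule and the batch variables x_batch/y_batch are illustrative):
import theano.tensor as T

lr = T.scalar('lr')                                    # symbolic learning rate
train_fn = momentum(cost, [x, y], params, {'lr': lr, 'mu': 0.9})
for step in range(100):
    step_lr = 1e-2 * 0.95 ** (step // 10)              # any schedule, computed in Python
    loss = train_fn(x_batch, y_batch, step_lr)         # lr is fed at call time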
# 6 GB for 2
pool_sizes = [3, 4]
n = sum([l_from_network(x, pool=p) for p in pool_sizes]) / len(pool_sizes)
train_loss = (n + l)  # .norm(2)/np.sqrt(48).astype("float32")
# train_loss = -0.001*(network_output[0, interesting_features] ** cfg.network_power).norm(2) \
#     + (T.nnet.categorical_crossentropy(network_output, interesting_features_one_hot) ** cfg.network_power).mean() + l
# train_loss = -(2*network_output[0, 628]) \
#     + rect(network_output[0, :628]).sum()/628. \
#     + rect(network_output[0, 628:]).sum()/372. \
#     + np.float32(cfg.prior_strength)*l.mean()
#     #+ 100*(network_output)

learning_rate = theano.shared(utils.cast_floatX(cfg.learning_rate))

if hasattr(cfg, 'build_updates'):
    updates, resets = cfg.build_updates(train_loss, all_params, learning_rate)
else:
    updates = nn.updates.sgd(train_loss, all_params, learning_rate)
    resets = []

givens = {
    # target_var: T.sqr(y),
    model.input.input_var: x - cfg.mean_img
}

print "Compiling"
idx = T.lscalar('idx')
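# For this script, cfg.build_updates is expected to return both the update
# rules and a list of shared variables to reset. A minimal sketch of such a
# hook; the Adam choice and the empty reset list are assumptions, and lasagne
# is what the script above appears to alias as nn:
import lasagne

def build_updates(train_loss, all_params, learning_rate):
    """Example cfg.build_updates hook returning (updates, resets)."""
    updates = lasagne.updates.adam(train_loss, all_params, learning_rate)
    resets = []   # shared variables (e.g. optimizer state) to clear between runs
    return updates, resets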
def sample(self, shape):
    # uniform initializer: draws from [a, b)
    return cast_floatX(np.random.rand(*shape) * (self.b - self.a) + self.a)
def sample(self, shape):
    # Gaussian initializer: sigma * (standard normal + mu)
    return cast_floatX(self.sigma * (np.random.randn(*shape) + self.mu))
def sample(self, shape):
    # constant initializer: every entry equals val
    return cast_floatX(np.ones(shape) * self.val)
mu = theano.shared(np.load("prior_mean.npy").astype("float32"))
l = T.sqr(z - mu.dimshuffle("x", 0, "x", "x"))
l = T.sqr((z - mu.dimshuffle("x", 0, "x", "x"))[:, :-1])
return l.mean()

pool_sizes = [1, 4, 8, 16, 32]
l = np.float32(cfg.prior_strength) * sum(
    [l_with_meanpool_student(x, pool=p) for p in pool_sizes]) / len(pool_sizes)
pool_sizes = [1]
n = sum([l_from_network(x, pool=p) for p in pool_sizes]) / len(pool_sizes)
train_loss = (n + l)

learning_rate = theano.shared(utils.cast_floatX(cfg.learning_rate))
if hasattr(cfg, 'build_updates'):
    updates, resets = cfg.build_updates(train_loss, all_params, learning_rate)
else:
    updates = nn.updates.sgd(train_loss, all_params, learning_rate)
    resets = []
givens = {
    # target_var: T.sqr(y),
    model.input.input_var: x - cfg.mean_img
}
import theano
from theano import tensor as T
from theano.tensor import nnet
import numpy as np

from utils import cast_floatX

w_initializer = lambda shape: cast_floatX(0.01 * np.random.randn(*shape))
b_initializer = lambda shape: cast_floatX(np.zeros(shape))


class Initializer(object):
    def __call__(self, shape):
        return self.sample(shape)

    def sample(self, shape):
        raise NotImplementedError()


class Const(Initializer):
    def __init__(self, val=0.):
        self.val = val

    def sample(self, shape):
        return cast_floatX(np.ones(shape) * self.val)


class Gaussian(Initializer):
    def __init__(self, mu=0., sigma=1.):
        self.mu = mu
        self.sigma = sigma

    def sample(self, shape):
        return cast_floatX(self.sigma * (np.random.randn(*shape) + self.mu))
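# A short sketch of how these initializers are used: every instance is
# callable and returns a floatX NumPy array of the requested shape. The
# shapes and the shared-variable wrapping below are illustrative.
import theano

W = theano.shared(Gaussian(mu=0., sigma=0.01).sample((128, 64)))   # explicit sample()
b = theano.shared(Const(0.)((64,)))            # __call__ forwards to sample
W2 = theano.shared(w_initializer((128, 64)))   # plain lambda variant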
#train_loss = nn.objectives.categorical_crossentropy(
#    T.clip(nn.layers.get_output(model.out), 1e-15, 1 - 1e-15), targets_batch)
train_loss = log_loss(nn.layers.get_output(model.out), targets_batch)
train_loss = train_loss.mean()

givens = {
    # obj.target_var: targets_batch,
    model.input.input_var: x[idx * cfg.batch_size:(idx + 1) * cfg.batch_size]
}

all_params = nn.layers.get_all_params(model.out, trainable=True)
learning_rate = theano.shared(utils.cast_floatX(cfg.learning_rate))

using_micro = False
if hasattr(cfg, 'build_updates_with_micro'):
    using_micro = True
    updates, micro_updates = cfg.build_updates_with_micro(
        train_loss, all_params, learning_rate)
else:
    if hasattr(cfg, 'build_updates'):
        updates = cfg.build_updates(train_loss, all_params, learning_rate)
    else:
        updates = nn.updates.adam(train_loss, all_params, learning_rate)

mask = nn.utils.shared_empty(dim=2)
mask_batch = T.matrix("mask_batch")
# l_mask = nn.layers.InputLayer(shape=(cfg.batch_size, cfg.seq_length),
#                               input_var=mask_batch)
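# With the givens slice above, the compiled training function only needs the
# batch index. A minimal sketch of compiling and calling it (the function
# name, cfg.ntrain, cfg.n_epochs and the plain epoch loop are illustrative;
# the mask handling and the micro-update path are ignored here):
import theano

iter_train = theano.function([idx], train_loss, givens=givens,
                             updates=updates, allow_input_downcast=True)

n_batches = cfg.ntrain // cfg.batch_size       # cfg.ntrain is an assumed attribute
for epoch in range(cfg.n_epochs):              # cfg.n_epochs is an assumed attribute
    for b in range(n_batches):
        batch_loss = iter_train(b)             # rows b*batch_size:(b+1)*batch_size via givens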
config.weight_decay = 1e-7
config.max_grad_norm = 10
config.num_steps = 35
config.max_epoch = 20      # number of epochs after which learning rate decay starts
config.drop_x = 0.25       # variational dropout rate over input word embeddings
config.drop_i = 0.75       # variational dropout rate over inputs of the RHN layer(s), applied separately in each layer
config.drop_s = 0.25       # variational dropout rate over the recurrent state
config.drop_o = 0.75       # variational dropout rate over outputs of the RHN layer(s), applied before the classification layer
config.vocab_size = 10000

print("Data loading")
train_data, valid_data, test_data, _ = ptb_raw_data(config.data_path)

print('Compiling model')
_is_training = T.iscalar('is_training')
_lr = theano.shared(cast_floatX(config.learning_rate), 'lr')
_input_data = T.imatrix('input_data')  # (batch_size, num_steps)
_noise_x = T.matrix('noise_x')         # (batch_size, num_steps)

# model
_theano_rng = RandomStreams(config.seed // 2 + 321)  # generates random numbers directly on the GPU
flat_probs, params, rhn_updates, hidden_states = stacked.model(
    _input_data, _noise_x, _lr, _is_training, config, _theano_rng)

# loss
_targets = T.imatrix('targets')  # (batch_size, num_steps)
flat_targets = _targets.T.flatten()
xentropies = T.nnet.categorical_crossentropy(flat_probs, flat_targets)  # (batch_size * num_steps,)
pred_loss = xentropies.sum() / config.batch_size
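# pred_loss sums the per-token cross-entropies of a batch divided by
# batch_size, i.e. num_steps times the average per-word loss, so word-level
# perplexity follows by exponentiating the per-word average. A hedged helper
# (the name and the equal-size-batch assumption are illustrative):
import numpy as np

def perplexity(batch_losses, num_steps):
    """Word-level perplexity from the pred_loss values returned per batch."""
    return np.exp(np.sum(batch_losses) / (num_steps * len(batch_losses)))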
# build loss
if hasattr(config, 'build_objective'):
    obj = config.build_objective(model)
    obj_valid = config.build_objective(model, deterministic=True)
else:
    raise NotImplementedError

# build updates
learning_rate = theano.shared(np.float32(0.0))
if hasattr(config, 'build_updates'):
    updates = config.build_updates(obj.loss, all_params, learning_rate)
else:
    updates = lasagne.updates.adam(obj.loss, all_params, learning_rate)

# load data to GPU
xtrain_shared = theano.shared(utils.cast_floatX(config.x_train))
xvalid_shared = theano.shared(utils.cast_floatX(config.x_valid))

idxs = T.ivector('idx')
givens = {model.l_in.input_var: xtrain_shared[idxs]}
train = theano.function([idxs], obj.loss, givens=givens, updates=updates,
                        allow_input_downcast=True)
eval_valid = theano.function([], obj_valid.loss,
                             givens={model.l_in.input_var: xvalid_shared})
eval_train = theano.function([], obj_valid.loss,
                             givens={model.l_in.input_var: xtrain_shared})

train_data_iter = DataIterator(config.ntrain, config.batch_size)

print 'Train model'
print
train_batches_per_epoch = config.ntrain / config.batch_size
max_niter = config.max_epoch * train_batches_per_epoch
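# A hedged sketch of the main loop suggested by the functions compiled above.
# The DataIterator interface (assumed here to yield int32 index arrays via
# next()), config.learning_rate, and the halving schedule are assumptions,
# not taken from the source.
import numpy as np

learning_rate.set_value(np.float32(config.learning_rate))   # assumed config attribute
for it in range(max_niter):
    batch_idxs = train_data_iter.next()          # assumed: indices of one minibatch
    batch_loss = train(batch_idxs)
    if (it + 1) % train_batches_per_epoch == 0:  # end of an epoch
        print 'epoch %d  train %.4f  valid %.4f' % (
            (it + 1) // train_batches_per_epoch, eval_train(), eval_valid())
        learning_rate.set_value(learning_rate.get_value() * np.float32(0.5))  # assumed decay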