Example no. 1
def rmsprop(cost, incomings, params, options):
  '''
  incomings should be a list
  '''
  lr = options.get('lr', 1e-2)
  clip_norm = options.get('clip_norm', None)
  dr = options.get('dr', 0.95) #decay rate
  epsilon = options.get('epsilon', 1e-8)

  grads = T.grad(cost, params.values())
  if clip_norm is not None:
    l2_norm = [T.sqrt(T.sum(g**2)) for g in grads]
    for i in range(len(grads)):
      grads[i] = T.switch(l2_norm[i] > clip_norm,
          grads[i] * clip_norm / l2_norm[i],
          grads[i])

  cache = [theano.shared(cast_floatX(np.zeros_like(p.get_value()))) for p in params.values()]
  updates = []
  def c_update(c, g):
    return dr * c + (1 - dr) * T.sqr(g)
  for p, g, c in zip(params.values(), grads, cache):
    updates.append([p, p - lr * g / (T.sqrt(c_update(c, g)) + epsilon)])
    updates.append([c, c_update(c, g)])
  if isinstance(lr, T.TensorVariable):
    return theano.function(incomings + [lr], cost, updates=updates, allow_input_downcast=True)
  else:
    return theano.function(incomings, cost, updates=updates, allow_input_downcast=True)
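For orientation, here is a minimal usage sketch of the compiled update function this helper returns. The toy model, the parameter dict and all data names below are illustrative assumptions, not part of the original snippet, and it assumes rmsprop() above (and the cast_floatX helper it relies on) is importable in the same scope.

# Hypothetical usage sketch for rmsprop() above (toy linear model).
from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX

params = OrderedDict()
params['W'] = theano.shared(np.asarray(0.01 * np.random.randn(5, 1), dtype=floatX), name='W')

x = T.matrix('x')
y = T.matrix('y')
cost = T.mean(T.sqr(T.dot(x, params['W']) - y))

train_fn = rmsprop(cost, [x, y], params, {'lr': 1e-2, 'clip_norm': 5.0})
loss = train_fn(np.random.randn(8, 5), np.random.randn(8, 1))  # one RMSProp step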
Example no. 2
def add_param(shape, params, name=None, val=None, initializer=init.HeUniform()):
  if name not in params:
    if name is None:
      name = name_suf(name, '_%d' % len(params))
    if isinstance(val, theano.tensor.sharedvar.TensorSharedVariable):
      assert(shape == val.get_value().shape)
      assert(val.dtype == theano.config.floatX)
      '''
      if val.dtype != theano.config.floatX:
        val = val.astype(theano.config.floatX)
      '''
      params[name] = val
      return name
    if val is None:
      val = cast_floatX(initializer(shape))
    else:
      val = cast_floatX(val)
    assert(val.shape == shape)
    params[name] = theano.shared(val)
  return name
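A small, hedged sketch of how add_param() might be driven when registering layer weights; the parameter dict, shapes and names are illustrative assumptions, and it presumes numpy plus the init module and cast_floatX helper used above are importable.

# Hypothetical usage sketch for add_param() above.
from collections import OrderedDict
import numpy as np

params = OrderedDict()
w_name = add_param((128, 64), params, name='dense_W')                    # sampled from init.HeUniform()
b_name = add_param((64,), params, name='dense_b', val=np.zeros((64,)))   # explicit initial value
W, b = params[w_name], params[b_name]                                    # both are theano shared variables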
Example no. 3
def adam(cost, incomings, params, options):
  '''
  incomings should be a list
  '''
  lr = options.get('lr', 1e-2)
  beta1 = options.get('beta1', 0.9)
  beta2 = options.get('beta2', 0.999)
  epsilon = options.get('epsilon', 1e-8)

  grads = T.grad(cost, params.values())
  velocity = [theano.shared(cast_floatX(np.zeros_like(p.get_value()))) for p in params.values()]
  momentum = [theano.shared(cast_floatX(np.zeros_like(p.get_value()))) for p in params.values()]
  beta1_run = theano.shared(cast_floatX(beta1))
  beta2_run = theano.shared(cast_floatX(beta2))
  # cannot use beta2_run = theano.shared(beta2).astype(theano.config.floatX)
  # since the target of an update in theano.function has to be a shared variable, not an Elemwise{Cast{float32}}.0
  updates = []
  def m_update(m, g):
    return beta1 * m + (1 - beta1) * g
  def mb(m):
    return m / (1 - beta1_run)
    #return m
  def v_update(v, g):
    return beta2 * v + (1 - beta2) * T.sqr(g)
  def vb(v):
    return v / (1 - beta2_run)
    #return v
  for p, g, v, m in zip(params.values(), grads, velocity, momentum):
    # since Theano applies updates using p's value from before the call,
    # we need to express the update of p explicitly
    updates.append([p, p - lr * mb(m_update(m, g)) / (T.sqrt(vb(v_update(v, g))) + epsilon)])
    updates.append([v, v_update(v, g)])
    updates.append([m, m_update(m, g)])
  updates.append([beta1_run, beta1_run * beta1])
  updates.append([beta2_run, beta2_run * beta2])
  if isinstance(lr, T.TensorVariable):
    return theano.function(incomings + [lr], cost, updates=updates, allow_input_downcast=True)
  else:
    return theano.function(incomings, cost, updates=updates, allow_input_downcast=True)
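The beta1_run/beta2_run shareds track beta1**t and beta2**t so that mb() and vb() apply Adam's bias correction. Here is a plain NumPy sketch of the same bookkeeping for a single scalar parameter, just to make the correction explicit (all values are illustrative):

# NumPy sketch of the bias-corrected moment estimates used above.
import numpy as np

beta1, beta2, eps, lr = 0.9, 0.999, 1e-8, 1e-2
m = v = 0.0
beta1_run, beta2_run = beta1, beta2      # hold beta1**t, beta2**t at step t
p = 1.0

for t in range(1, 4):
    g = 2.0 * p                          # gradient of cost = p**2
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g ** 2
    m_hat = m / (1 - beta1_run)          # bias-corrected first moment
    v_hat = v / (1 - beta2_run)          # bias-corrected second moment
    p -= lr * m_hat / (np.sqrt(v_hat) + eps)
    beta1_run *= beta1                   # advance to beta1**(t+1), beta2**(t+1)
    beta2_run *= beta2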
Example no. 4
def add_param(shape,
              params,
              name=None,
              val=None,
              initializer=init.HeUniform()):
    if name not in params:
        if name is None:
            name = name_suf(name, '_%d' % len(params))
        if isinstance(val, theano.tensor.sharedvar.TensorSharedVariable):
            assert (shape == val.get_value().shape)
            assert (val.dtype == theano.config.floatX)
            '''
            if val.dtype != theano.config.floatX:
                val = val.astype(theano.config.floatX)
            '''
            params[name] = val
            return name
        if val is None:
            val = cast_floatX(initializer(shape))
        else:
            val = cast_floatX(val)
        assert (val.shape == shape)
        params[name] = theano.shared(val)
    return name
Example no. 5
def DropoutLayer(incoming, use_noise, p):
  """
  T.switch acts like an if statement on the value of the Theano shared
  variable use_noise (a 0/1 flag): when it is on, the incoming tensor is
  dropped out with a random binary mask; when it is off, the deterministic
  activation scaled by p is used instead. use_noise is toggled on during
  training and off during testing.
  """
  trng = RandomStreams(1234)
  incoming, input_shape = incoming
  output_shape = input_shape
  proj = T.switch(use_noise,
                  incoming *
                  trng.binomial(incoming.shape, p=p, n=1, dtype=incoming.dtype),
                  #trng.binomial(incoming.shape, p=p, n=1, dtype=theano.config.floatX),
                  incoming * cast_floatX(p))
  #return (proj.astype(theano.config.floatX), output_shape)
  return (proj, output_shape)
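A hedged sketch of how the use_noise flag might be toggled between training and evaluation; the input shape and data are assumptions, and it relies on DropoutLayer() above (and the cast_floatX helper it uses) being importable.

# Hypothetical usage sketch for DropoutLayer() above.
import numpy as np
import theano
import theano.tensor as T

use_noise = theano.shared(np.asarray(1., dtype=theano.config.floatX))  # 1 = train, 0 = test
x = T.matrix('x')
out, out_shape = DropoutLayer((x, (None, 256)), use_noise, p=0.5)
f = theano.function([x], out)

use_noise.set_value(1.)   # training mode: multiply by a random binary mask (keep-probability p)
train_out = f(np.random.randn(4, 256).astype(theano.config.floatX))

use_noise.set_value(0.)   # evaluation mode: deterministic scaling by p instead
test_out = f(np.random.randn(4, 256).astype(theano.config.floatX))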
Example no. 6
def DropoutLayer(incoming, use_noise, p):
    """
    T.switch acts like an if statement on the value of the Theano shared
    variable use_noise (a 0/1 flag): when it is on, the incoming tensor is
    dropped out with a random binary mask; when it is off, the deterministic
    activation scaled by p is used instead. use_noise is toggled on during
    training and off during testing.
    """
    trng = RandomStreams(1234)
    incoming, input_shape = incoming
    output_shape = input_shape
    proj = T.switch(
        use_noise,
        incoming *
        trng.binomial(incoming.shape, p=p, n=1, dtype=incoming.dtype),
        #trng.binomial(incoming.shape, p=p, n=1, dtype=theano.config.floatX),
        incoming * cast_floatX(p))
    #return (proj.astype(theano.config.floatX), output_shape)
    return (proj, output_shape)
Example no. 7
def adagrad(cost, incomings, params, options):
  '''
  incomings should be a list
  '''
  lr = options.get('lr', 1e-2)
  epsilon = options.get('epsilon', 1e-8)

  grads = T.grad(cost, params.values())
  cache = [theano.shared(cast_floatX(np.zeros_like(p.get_value()))) for p in params.values()]
  updates = []
  def c_update(c, g):
    return c + T.sqr(g)
  for p, g, c in zip(params.values(), grads, cache):
    updates.append([p, p - lr * g / (T.sqrt(c_update(c, g)) + epsilon)])
    updates.append([c, c_update(c, g)])
  if isinstance(lr, T.TensorVariable):
    return theano.function(incomings + [lr], cost, updates=updates, allow_input_downcast=True)
  else:
    return theano.function(incomings, cost, updates=updates, allow_input_downcast=True)
Example no. 8
def nesterov_momentum(cost, incomings, params, options):
  '''
  incomings should be a list
  '''
  lr = options.get('lr', 1e-2)
  mu = options.get('mu', 0.9)

  grads = T.grad(cost, params.values())
  velocity = [theano.shared(cast_floatX(np.zeros_like(p.get_value()))) for p in params.values()]
  updates = []
  def v_update(v, g):
    return mu * v - lr * g
  for p, g, v in zip(params.values(), grads, velocity):
    updates.append([p, p - mu * v + (1 + mu) * v_update(v, g)])
    updates.append([v, v_update(v, g)])
  if isinstance(lr, T.TensorVariable):
    return theano.function(incomings + [lr], cost, updates=updates, allow_input_downcast=True)
  else:
    return theano.function(incomings, cost, updates=updates, allow_input_downcast=True)
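The p update here uses the usual reformulation of Nesterov momentum in terms of the current ("lookahead") parameters, so the gradient can be taken at p itself. A small NumPy check, on an illustrative quadratic, that this form tracks the classic two-step formulation:

# NumPy sketch: the reformulated update above matches classic Nesterov momentum.
import numpy as np

mu, lr = 0.9, 0.1
grad = lambda w: w                     # gradient of f(w) = 0.5 * w**2

phi, v = 1.0, 0.0                      # classic Nesterov state
theta, u = 1.0, 0.0                    # reformulated state, theta = phi + mu*v

for _ in range(10):
    v_new = mu * v - lr * grad(phi + mu * v)   # classic: gradient at the lookahead point
    phi, v = phi + v_new, v_new

    u_new = mu * u - lr * grad(theta)          # reformulated: gradient at theta itself
    theta, u = theta - mu * u + (1 + mu) * u_new, u_new

    assert np.isclose(theta, phi + mu * v)     # the two trajectories stay in sync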
Example no. 9
def momentum(cost, incomings, params, options):
  '''
  incomings should be a list
  '''
  lr = options.get('lr', 1e-2)
  mu = options.get('mu', 0.9)

  grads = T.grad(cost, params.values())
  velocity = [theano.shared(cast_floatX(np.zeros_like(p.get_value()))) for p in params.values()]
  updates = []
  def v_update(v, g):
    return mu * v - lr * g
  for p, g, v in zip(params.values(), grads, velocity):
    # since Theano applies updates using p's value from before the call,
    # we need to express the update of p explicitly
    updates.append([p, p + v_update(v, g)])
    updates.append([v, v_update(v, g)])
  if isinstance(lr, T.TensorVariable):
    return theano.function(incomings + [lr], cost, updates=updates, allow_input_downcast=True)
  else:
    return theano.function(incomings, cost, updates=updates, allow_input_downcast=True)
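The comment above matters because Theano computes every update expression from the values the shared variables held before the call, not sequentially. A tiny self-contained sketch of that semantics (toy values only):

# Sketch: updates in a theano.function are evaluated on the *old* shared values.
import numpy as np
import theano

a = theano.shared(np.float32(1.0))
b = theano.shared(np.float32(0.0))

step = theano.function([], [], updates=[(a, a + 1), (b, a)])
step()
assert a.get_value() == 2.0 and b.get_value() == 1.0   # b received the old a, not the new one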
Example no. 10
# 6 GB for 2
pool_sizes = [3,4]
n = sum([l_from_network(x, pool=p) for p in pool_sizes]) / len(pool_sizes)

train_loss = (n+l)
    #.norm(2)/np.sqrt(48).astype("float32")

#train_loss = -0.001*( network_output[0,interesting_features] ** cfg.network_power).norm(2) + ( T.nnet.categorical_crossentropy(network_output, interesting_features_one_hot) ** cfg.network_power).mean() + l

#train_loss = -( 2*network_output[0,628] ) \
#             + rect(network_output[0,:628]).sum()/628. \
#             + rect(network_output[0,628:]).sum()/372. \
#             + np.float32(cfg.prior_strength)*l.mean() #+ 100*(network_output)

learning_rate = theano.shared(utils.cast_floatX(cfg.learning_rate))

if hasattr(cfg, 'build_updates'):
    updates, resets = cfg.build_updates(train_loss, all_params, learning_rate)
else:
    updates = nn.updates.sgd(train_loss, all_params, learning_rate)
    resets = []

givens = {
    # target_var: T.sqr(y),
    model.input.input_var: x-cfg.mean_img
}

print "Compiling"
idx = T.lscalar('idx')
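The snippet stops right before compilation, so the original train function is not shown. Here is a self-contained sketch of the givens-plus-index pattern it is setting up; all names and toy data below are illustrative and not taken from the original repository.

# Self-contained sketch of the givens + idx mini-batch pattern.
import numpy as np
import theano
import theano.tensor as T

data = theano.shared(np.random.randn(100, 5).astype(theano.config.floatX))  # dataset kept on the device
w = theano.shared(np.zeros(5, dtype=theano.config.floatX))
xb = T.matrix('xb')
loss = T.mean(T.sqr(T.dot(xb, w) - 1.0))
updates = [(w, w - 0.01 * T.grad(loss, w))]

batch_size = 10
idx = T.lscalar('idx')
train = theano.function([idx], loss, updates=updates,
                        givens={xb: data[idx * batch_size:(idx + 1) * batch_size]})
for i in range(100 // batch_size):
    train(i)   # each call trains on one mini-batch selected by idx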
Example no. 11
 def sample(self, shape):
     return cast_floatX(np.random.rand(*shape) * (self.b - self.a) + self.a)
Example no. 12
 def sample(self, shape):
     return cast_floatX(self.sigma * (np.random.randn(*shape) + self.mu))
Example no. 13
 def sample(self, shape):
     return cast_floatX(np.ones(shape) * self.val)
Example no. 14
 def sample(self, shape):
   return cast_floatX(np.random.rand(*shape) * (self.b - self.a) + self.a)
Example no. 15
 def sample(self, shape):
   return cast_floatX(np.ones(shape) * self.val)
Example no. 16
    mu = theano.shared(np.load("prior_mean.npy").astype("float32"))
    l = T.sqr(z - mu.dimshuffle("x", 0, "x", "x"))
    # note: the line above is immediately overwritten; only the channel-sliced version below is used
    l = T.sqr((z - mu.dimshuffle("x", 0, "x", "x"))[:, :-1])
    return l.mean()


pool_sizes = [1, 4, 8, 16, 32]
l = np.float32(cfg.prior_strength) * sum(
    [l_with_meanpool_student(x, pool=p) for p in pool_sizes]) / len(pool_sizes)

pool_sizes = [1]
n = sum([l_from_network(x, pool=p) for p in pool_sizes]) / len(pool_sizes)

train_loss = (n + l)

learning_rate = theano.shared(utils.cast_floatX(cfg.learning_rate))

if hasattr(cfg, 'build_updates'):
    updates, resets = cfg.build_updates(train_loss, all_params, learning_rate)
else:
    updates = nn.updates.sgd(
        train_loss,
        all_params,
        learning_rate,
    )
    resets = []

givens = {
    # target_var: T.sqr(y),
    model.input.input_var: x - cfg.mean_img
}
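The hasattr branch above expects an optional cfg.build_updates hook returning (updates, resets). Purely as a hedged sketch of what such a hook could look like under those assumptions (nn is the Lasagne-style module already used above; the body is illustrative, not the original config's code):

# Hypothetical cfg.build_updates hook matching the (updates, resets) contract above.
def build_updates(train_loss, all_params, learning_rate):
    # illustrative: any rule from nn.updates with this call signature would do
    updates = nn.updates.adam(train_loss, all_params, learning_rate)
    resets = []   # shared variables to reset between runs, if the config needs any
    return updates, resets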
Example no. 17
import theano
from theano import tensor as T
from theano.tensor import nnet
import numpy as np
from utils import cast_floatX

w_initializer = lambda shape: cast_floatX(0.01*np.random.randn(*shape))
b_initializer = lambda shape: cast_floatX(np.zeros(shape))

class Initializer(object):
  def __call__(self, shape):
    return self.sample(shape)

  def sample(self, shape):
    raise NotImplementedError()


class Const(Initializer): 
  def __init__(self, val=0.):
    self.val = val

  def sample(self, shape):
    return cast_floatX(np.ones(shape) * self.val)


class Gaussian(Initializer):
  def __init__(self, mu=0., sigma=1.):
    self.mu = mu
    self.sigma = sigma

  def sample(self, shape):
    return cast_floatX(self.sigma * (np.random.randn(*shape) + self.mu))
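Assuming this runs in the same module (so theano, numpy and cast_floatX are in scope), a short illustrative use of these initializer classes; the shapes are arbitrary.

# Illustrative usage of the initializers defined above.
w = Gaussian(mu=0., sigma=0.01)((256, 128))   # __call__ delegates to sample()
b = Const(0.)((128,))
assert w.dtype == theano.config.floatX and b.shape == (128,)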
Example no. 18
#train_loss = nn.objectives.categorical_crossentropy(T.clip( nn.layers.get_output(model.out), 1e-15, 1 - 1e-15),
#                                                   targets_batch)

train_loss = log_loss(nn.layers.get_output(model.out), targets_batch)
train_loss = train_loss.mean()

givens = {
    # obj.target_var: targets_batch,
    model.input.input_var: x[idx*cfg.batch_size:(idx+1)*cfg.batch_size]
}

all_params = nn.layers.get_all_params(model.out, trainable=True)



learning_rate = theano.shared(utils.cast_floatX(cfg.learning_rate))

using_micro = False
if hasattr(cfg, 'build_updates_with_micro'):
    using_micro = True
    updates, micro_updates = cfg.build_updates_with_micro(train_loss, all_params, learning_rate)
else:
    if hasattr(cfg, 'build_updates'):
        updates = cfg.build_updates(train_loss, all_params, learning_rate)
    else:
        updates = nn.updates.adam( train_loss, all_params, learning_rate)


mask = nn.utils.shared_empty(dim=2)
mask_batch = T.matrix("mask_batch")
# l_mask = nn.layers.InputLayer(shape=(cfg.batch_size, cfg.seq_length), input_var=mask_batch)
Example no. 19
 def sample(self, shape):
   return cast_floatX(self.sigma * (np.random.randn(*shape) + self.mu))
Example no. 20
config.weight_decay = 1e-7
config.max_grad_norm = 10
config.num_steps = 35
config.max_epoch = 20  # number of epochs after which learning decay starts
config.drop_x = 0.25  # variational dropout rate over input word embeddings
config.drop_i = 0.75  # variational dropout rate over inputs of RHN layer(s), applied separately in each RHN layer
config.drop_s = 0.25  # variational dropout rate over recurrent state
config.drop_o = 0.75  # variational dropout rate over outputs of RHN layer(s), applied before classification layer
config.vocab_size = 10000

print("Data loading")
train_data, valid_data, test_data, _ = ptb_raw_data(config.data_path)

print('Compiling model')
_is_training = T.iscalar('is_training')
_lr = theano.shared(cast_floatX(config.learning_rate), 'lr')
_input_data = T.imatrix('input_data')  # (batch_size, num_steps)
_noise_x = T.matrix('noise_x')  # (batch_size, num_steps)

# model
_theano_rng = RandomStreams(config.seed // 2 +
                            321)  # generates random numbers directly on GPU
flat_probs, params, rhn_updates, hidden_states = stacked.model(
    _input_data, _noise_x, _lr, _is_training, config, _theano_rng)

# loss
_targets = T.imatrix('targets')  # (batch_size, num_steps)
flat_targets = _targets.T.flatten()
xentropies = T.nnet.categorical_crossentropy(
    flat_probs, flat_targets)  # (batch_size * num_steps,)
pred_loss = xentropies.sum() / config.batch_size
Example no. 21
# build loss
if hasattr(config, 'build_objective'):
    obj = config.build_objective(model)
    obj_valid = config.build_objective(model, deterministic=True)
else:
    raise NotImplementedError

# build updates
learning_rate = theano.shared(np.float32(0.0))
if hasattr(config, 'build_updates'):
    updates = config.build_updates(obj.loss, all_params, learning_rate)
else:
    updates = lasagne.updates.adam(obj.loss, all_params, learning_rate)

# load data to GPU
xtrain_shared = theano.shared(utils.cast_floatX(config.x_train))
xvalid_shared = theano.shared(utils.cast_floatX(config.x_valid))

idxs = T.ivector('idx')
givens = {model.l_in.input_var: xtrain_shared[idxs]}

train = theano.function([idxs], obj.loss, givens=givens, updates=updates, allow_input_downcast=True)
eval_valid = theano.function([], obj_valid, givens={model.l_in.input_var: xvalid_shared})
eval_train = theano.function([], obj_valid, givens={model.l_in.input_var: xtrain_shared})

train_data_iter = DataIterator(config.ntrain, config.batch_size)

print 'Train model'
print
train_batches_per_epoch = config.ntrain / config.batch_size
max_niter = config.max_epoch * train_batches_per_epoch
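A hedged sketch of the training loop these pieces are building toward; DataIterator is assumed to yield per-batch index arrays, and the epoch bookkeeping below is illustrative rather than taken from the original script.

# Hypothetical training-loop sketch driving the functions compiled above.
for epoch in range(config.max_epoch):
    for batch_idxs in train_data_iter:          # assumed to yield index arrays of length batch_size
        batch_loss = train(batch_idxs)
    print('epoch %d  train %.4f  valid %.4f' % (epoch, eval_train(), eval_valid()))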
Example no. 22
import theano
from theano import tensor as T
from theano.tensor import nnet
import numpy as np
from utils import cast_floatX

w_initializer = lambda shape: cast_floatX(0.01 * np.random.randn(*shape))
b_initializer = lambda shape: cast_floatX(np.zeros(shape))


class Initializer(object):
    def __call__(self, shape):
        return self.sample(shape)

    def sample(self, shape):
        raise NotImplementedError()


class Const(Initializer):
    def __init__(self, val=0.):
        self.val = val

    def sample(self, shape):
        return cast_floatX(np.ones(shape) * self.val)


class Gaussian(Initializer):
    def __init__(self, mu=0., sigma=1.):
        self.mu = mu
        self.sigma = sigma