def recurrence(self, state_below, mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        # Pick the n-th block of size `dim` from the concatenated pre-activations.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, self.U)
        preact += x_

        # Input, forget and output gates plus the candidate cell state are
        # stored side by side in a single pre-activation matrix.
        i = tensor.nnet.sigmoid(_slice(preact, 0, self.option[Option.OUTPUT_DIM]))
        f = tensor.nnet.sigmoid(_slice(preact, 1, self.option[Option.OUTPUT_DIM]))
        o = tensor.nnet.sigmoid(_slice(preact, 2, self.option[Option.OUTPUT_DIM]))
        c = tensor.tanh(_slice(preact, 3, self.option[Option.OUTPUT_DIM]))

        c = f * c_ + i * c
        if m_ is not None:
            # For padded positions (mask == 0) keep the previous cell state.
            c1 = m_[:, None] * c
            c2 = (1. - m_)[:, None] * c_
            c = c1 + c2

        h = o * tensor.tanh(c)
        if m_ is not None:
            # Likewise, carry the previous hidden state through padded positions.
            h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return tensor.cast(h, th.config.floatX), tensor.cast(c, th.config.floatX)

    # Project the input once for all time steps: W x_t + b.
    state_below = tensor.dot(state_below, self.W) + self.b

    dim_proj = self.option[Option.OUTPUT_DIM]
    rval, updates = th.scan(_step,
                            sequences=[mask, state_below],
                            outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                       n_samples, dim_proj),
                                          tensor.alloc(numpy_floatX(0.),
                                                       n_samples, dim_proj)],
                            name=_p(self.id, '_layers'),
                            n_steps=nsteps)
    return rval[0]  # return the sequence of hidden states
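# A minimal NumPy sketch of a single LSTM step, illustrating the layout the
# recurrence above relies on: the input, forget and output gates and the candidate
# cell state live side by side in one pre-activation matrix of width 4 * dim, and
# _slice picks out each block. The names and shapes below are illustrative
# assumptions, not part of the original code.
import numpy as np

def lstm_step_numpy(x_t, h_prev, c_prev, U, dim):
    """One LSTM step; x_t is assumed to be already projected by W and shifted by b."""
    preact = h_prev.dot(U) + x_t                    # shape: (n_samples, 4 * dim)
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    i = sigmoid(preact[:, 0 * dim:1 * dim])         # input gate
    f = sigmoid(preact[:, 1 * dim:2 * dim])         # forget gate
    o = sigmoid(preact[:, 2 * dim:3 * dim])         # output gate
    c_tilde = np.tanh(preact[:, 3 * dim:4 * dim])   # candidate cell state
    c = f * c_prev + i * c_tilde
    h = o * np.tanh(c)
    return h, c

# Tiny example: batch of 2 samples, hidden size 3.
rng = np.random.RandomState(0)
dim, n_samples = 3, 2
h, c = lstm_step_numpy(rng.randn(n_samples, 4 * dim),
                       np.zeros((n_samples, dim)),
                       np.zeros((n_samples, dim)),
                       rng.randn(dim, 4 * dim),
                       dim)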
def compile(self):
    # Propagate configuration into the wrapped core layer and initialise its parameters.
    if self.mask is not None:
        self.core_layer.mask = self.mask
    self.core_layer.id = self.id
    self.core_layer.init_params()

    # Number of time steps (currently not used below).
    if isinstance(self.input, list):
        time_steps = len(self.input)
    else:
        time_steps = self.input.shape[0]

    def _step(_m, prev):
        # `prev` is required by scan's interface but not used by the core layer.
        self.core_layer.input = _m
        self.core_layer.compile(init_params=False)
        return self.core_layer.output

    result, updates = th.scan(
        fn=_step,
        outputs_info=tensor.alloc(numpy_floatX(0.),
                                  self.input.shape[1],
                                  self.core_layer.option[Option.OUTPUT_DIM]),
        sequences=[self.input])
    # self.output = result.swapaxes(0, 1)
    self.output = result
    self.params = self.core_layer.params
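# A self-contained theano.scan sketch (assuming Theano is available) of the pattern
# used in compile() above: `sequences` feeds one time slice per step and
# `outputs_info` seeds the recurrence with a zero tensor of the right shape. The
# toy step function and variable names here are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as tensor

x3 = tensor.tensor3('x3')                       # (time_steps, n_samples, n_features)

def _toy_step(x_t, acc_prev):
    return acc_prev + x_t                       # running sum over time, per sample

acc0 = tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                    x3.shape[1], x3.shape[2])
scanned, _ = theano.scan(fn=_toy_step, sequences=[x3], outputs_info=acc0)

f_scan = theano.function([x3], scanned[-1])     # cumulative sum at the last step
print(f_scan(np.ones((4, 2, 3), dtype=theano.config.floatX)))   # every entry is 4.0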
def add_layer(self, l):
    if isinstance(l, list):
        for ll in l:
            self.add_layer(ll)
    else:
        self.layers.append(l)
        if isinstance(l, Dropout):
            self.use_noise = th.shared(numpy_floatX(0.))
def __init__(self,
             idx="0",
             input_value_type=config.floatX,
             prediction_type='vector',
             prediction_value_type=config.floatX,
             use_mask=False,
             use_noise=False):
    '''
    :param idx: id of the model
    :param input_value_type: dtype of the input matrix
    :param prediction_type: 'vector' or 'matrix'
    :param prediction_value_type: dtype of the prediction and gold variables
    :param use_mask: set to True when working with sequences of different sizes
    :param use_noise: set to True to create a shared noise switch (used by dropout)
    :return:
    '''
    self.input = tensor.matrix('x', dtype=input_value_type)
    self.output = None
    self.prediction_type = prediction_type
    if prediction_type == 'vector':
        self.prediction = tensor.ivector('y')
        self.gold = tensor.ivector('g')
    if prediction_type == 'matrix':
        self.prediction = tensor.matrix('y', dtype=prediction_value_type)
        self.gold = tensor.matrix('g', dtype=prediction_value_type)
    self.params = OrderedDict()
    self.layers = []
    self.optimizer = None
    self.cost = None
    self.id = idx
    self.option = Option()  # options for the model
    if use_noise:
        self.use_noise = th.shared(numpy_floatX(0.))
    else:
        self.use_noise = None
    self.use_mask = use_mask  # can be set to True when working with sequences of different sizes
    self.input_mask = None
    self.output_mask = None
    self.f_cost = None
    self.f_grad = None
    self.lr = tensor.scalar(name='lr')
    self.f_grad_shared = None
    self.f_update = None
def adadelta(lr, tparams, grads, x, y, cost, input_mask=None, output_mask=None):
    """
    An adaptive learning rate optimizer.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. parameters
    x : Theano variable
        Model inputs
    y : Theano variable
        Targets
    cost : Theano variable
        Objective function to minimize
    input_mask : Theano variable, optional
        Input sequence mask
    output_mask : Theano variable, optional
        Output sequence mask

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning Rate
       Method*, arXiv:1212.5701.
    """
    # Running accumulators: gradients, squared updates and squared gradients.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    if input_mask is not None and output_mask is not None:
        f_grad_shared = theano.function([x, y, input_mask, output_mask], cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')
    else:
        f_grad_shared = theano.function([x, y], cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
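# A minimal usage sketch for the adadelta function above, under the assumption
# that it is importable from this module. The toy linear model, variable names
# and data are illustrative only: f_grad_shared computes the cost and refreshes
# the running gradient statistics, f_update then applies the ADADELTA step
# (the learning-rate argument is accepted but unused).
import numpy as np
import theano
import theano.tensor as tensor
from collections import OrderedDict

def numpy_floatX(data):
    return np.asarray(data, dtype=theano.config.floatX)

x = tensor.matrix('x')
y = tensor.matrix('y')
W = theano.shared(numpy_floatX(np.zeros((3, 2))), name='W')
tparams = OrderedDict([('W', W)])
cost = ((tensor.dot(x, W) - y) ** 2).mean()
grads = tensor.grad(cost, wrt=list(tparams.values()))
lr = tensor.scalar(name='lr')

f_grad_shared, f_update = adadelta(lr, tparams, grads, x, y, cost)

data_x = numpy_floatX(np.random.randn(8, 3))
data_y = numpy_floatX(np.random.randn(8, 2))
for epoch in range(10):
    c = f_grad_shared(data_x, data_y)   # cost + update of the running accumulators
    f_update(0.01)                      # apply the parameter update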
def rmsprop(lr, tparams, grads, x, y, cost, input_mask=None, output_mask=None):
    """
    A variant of SGD that scales the step size by a running average of the
    recent gradient magnitudes.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. parameters
    x : Theano variable
        Model inputs
    y : Theano variable
        Targets
    cost : Theano variable
        Objective function to minimize
    input_mask : Theano variable, optional
        Input sequence mask
    output_mask : Theano variable, optional
        Output sequence mask

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    # Running accumulators: gradients, their running mean and running squared mean.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    if input_mask is not None and output_mask is not None:
        f_grad_shared = theano.function([x, y, input_mask, output_mask], cost,
                                        updates=zgup + rgup + rg2up,
                                        name='rmsprop_f_grad_shared')
    else:
        f_grad_shared = theano.function([x, y], cost,
                                        updates=zgup + rgup + rg2up,
                                        name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.items()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
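# A small NumPy illustration (all names illustrative) of the scaling term in the
# rmsprop update above: running averages E[g] and E[g^2] yield the centred RMS
# estimate sqrt(E[g^2] - E[g]^2 + 1e-4), so the step -1e-4 * g / rms shrinks
# wherever gradients are large or noisy.
import numpy as np

rng = np.random.RandomState(0)
rg, rg2 = 0.0, 0.0                      # running E[g] and E[g^2]
for _ in range(200):
    g = rng.normal(loc=0.5, scale=2.0)  # a stream of noisy scalar gradients
    rg = 0.95 * rg + 0.05 * g
    rg2 = 0.95 * rg2 + 0.05 * g ** 2

rms = np.sqrt(rg2 - rg ** 2 + 1e-4)     # denominator of the rmsprop step
print(rms)                              # roughly the gradient standard deviation (~2)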