def get_gradients(self, model, data, **kwargs):
    """
    Combine the gradients of every component cost, weighting each
    component's gradient dictionary by its coefficient.

    Parameters
    ----------
    model : Model
        The model whose parameters the gradients are taken with
        respect to.
    data : batch
        Flat data in this cost's composite data specs; it is re-nested
        so each sub-cost receives its own piece.
    kwargs : dict
        Forwarded to each sub-cost's `get_gradients`.

    Returns
    -------
    grads : OrderedDict
        Maps each model parameter to the coefficient-weighted sum of
        the sub-costs' gradients for that parameter.
    updates : OrderedDict
        Union of the sub-costs' update dictionaries. Keys must be
        disjoint across sub-costs and must not be model parameters.

    Raises
    ------
    ValueError
        If a sub-cost returns a gradient for a shared variable that is
        not one of the model's parameters.
    """
    indiv_results = []
    composite_specs, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)
    for cost, cost_data in safe_zip(self.costs, nested_data):
        result = cost.get_gradients(model, cost_data, **kwargs)
        indiv_results.append(result)

    grads = OrderedDict()
    updates = OrderedDict()
    params = model.get_params()

    for coeff, packed in zip(self.coeffs, indiv_results):
        g, u = packed
        for param in g:
            if param not in params:
                # Fixed message: previously read "appeared a cost
                # gradient dictionary" (missing "in").
                raise ValueError("A shared variable (" +
                                 str(param) +
                                 ") that is not a parameter appeared "
                                 "in a cost gradient dictionary.")
        for param in g:
            assert param.ndim == g[param].ndim
            v = coeff * g[param]
            # Accumulate weighted contributions across sub-costs.
            if param not in grads:
                grads[param] = v
            else:
                grads[param] = grads[param] + v
            assert grads[param].ndim == param.ndim
        # Sub-costs may not contend over the same update targets, and
        # updates may not target model parameters.
        assert not any([state in updates for state in u])
        assert not any([state in params for state in u])
        updates.update(u)

    return grads, updates
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Gather the monitoring channels of every component cost, plus one
    channel per component holding that cost's scalar expression.

    Parameters
    ----------
    model : Model
        The model being monitored.
    data : batch
        Flat data in this cost's composite data specs.
    kwargs : dict
        Forwarded to each sub-cost.

    Returns
    -------
    OrderedDict
        All sub-cost channels, plus a 'term_<i>' entry for each
        sub-cost whose expression is not None.
    """
    self.get_data_specs(model)[0].validate(data)
    channels = OrderedDict()
    _, mapping = self.get_composite_specs_and_mapping(model)
    per_cost_data = mapping.nest(data)

    for idx, (cost, cost_data) in enumerate(zip(self.costs,
                                                per_cost_data)):
        try:
            channels.update(
                cost.get_monitoring_channels(model, cost_data, **kwargs))
        except TypeError:
            reraise_as(Exception('SumOfCosts.get_monitoring_channels '
                                 'encountered TypeError while calling {0}'
                                 '.get_monitoring_channels'.format(
                                     type(cost))))

        term = cost.expr(model, cost_data, **kwargs)
        if term is not None:
            suffix = ''
            if getattr(term, 'name', None) is not None:
                suffix = '_' + term.name
            channels['term_' + str(idx) + suffix] = term

    return channels
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Collect the monitoring channels of each component cost into one
    dictionary, and additionally expose each component's scalar cost
    expression as a 'term_<i>' channel.

    Parameters
    ----------
    model : Model
        The model being monitored.
    data : batch
        Flat data in this cost's composite data specs; re-nested so
        each sub-cost receives its own piece.
    kwargs : dict
        Forwarded to each sub-cost.

    Returns
    -------
    OrderedDict
        All sub-cost channels, plus one 'term_<i>[_<name>]' entry per
        sub-cost whose expression is not None.
    """
    self.get_data_specs(model)[0].validate(data)
    rval = OrderedDict()
    composite_specs, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)
    for i, cost in enumerate(self.costs):
        cost_data = nested_data[i]
        try:
            channels = cost.get_monitoring_channels(
                model, cost_data, **kwargs)
            rval.update(channels)
        except TypeError:
            # Re-raise with context so the failing sub-cost is named.
            reraise_as(
                Exception('SumOfCosts.get_monitoring_channels '
                          'encountered TypeError while calling {0}'
                          '.get_monitoring_channels'.format(type(cost))))
        value = cost.expr(model, cost_data, **kwargs)
        if value is not None:
            # Suffix the channel with the expression's name if it has one.
            name = ''
            if hasattr(value, 'name') and value.name is not None:
                name = '_' + value.name
            rval['term_' + str(i) + name] = value
    return rval
def get_gradients(self, model, data, **kwargs):
    """
    Sum the gradients of each component cost, weighted by its
    coefficient.

    Parameters
    ----------
    model : Model
        The model whose parameters the gradients are taken with
        respect to.
    data : batch
        Flat data in this cost's composite data specs; re-nested per
        sub-cost.
    kwargs : dict
        Forwarded to each sub-cost's `get_gradients`.

    Returns
    -------
    grads : OrderedDict
        Maps each model parameter to the coefficient-weighted sum of
        the sub-costs' gradients for that parameter.
    updates : OrderedDict
        Union of the sub-costs' update dictionaries. Keys must be
        disjoint across sub-costs and must not be model parameters.

    Raises
    ------
    ValueError
        If a sub-cost returns a gradient for a shared variable that is
        not one of the model's parameters.
    """
    indiv_results = []
    composite_specs, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)
    for cost, cost_data in safe_zip(self.costs, nested_data):
        result = cost.get_gradients(model, cost_data, **kwargs)
        indiv_results.append(result)

    grads = OrderedDict()
    updates = OrderedDict()
    params = model.get_params()

    for coeff, packed in zip(self.coeffs, indiv_results):
        g, u = packed
        for param in g:
            if param not in params:
                # Fixed message: previously read "appeared a cost
                # gradient dictionary" (missing "in").
                raise ValueError("A shared variable (" +
                                 str(param) +
                                 ") that is not a parameter appeared "
                                 "in a cost gradient dictionary.")
        for param in g:
            assert param.ndim == g[param].ndim
            v = coeff * g[param]
            # Accumulate weighted contributions across sub-costs.
            if param not in grads:
                grads[param] = v
            else:
                grads[param] = grads[param] + v
            assert grads[param].ndim == param.ndim
        # Sub-costs may not contend over the same update targets, and
        # updates may not target model parameters.
        assert not any([state in updates for state in u])
        assert not any([state in params for state in u])
        updates.update(u)

    return grads, updates
def get_gradients(self, model, data, **kwargs):
    """
    Differentiate this cost with respect to the model parameters,
    treating the sampler's persistent particles as constants.

    Parameters
    ----------
    model : Model
        The model whose parameters are differentiated.
    data : batch
        Data passed through to the cost expression.
    kwargs : dict
        Forwarded to the cost expression.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its symbolic gradient.
    updates : OrderedDict
        The sampler's update dictionary (e.g. for refreshing the
        negative-phase particles).
    """
    objective = self._cost(model, data, **kwargs)
    model_params = list(model.get_params())
    # Particles are held fixed during differentiation;
    # disconnected parameters get a zero gradient rather than an error.
    symbolic_grads = T.grad(objective, model_params,
                            disconnected_inputs='ignore',
                            consider_constant=[self.sampler.particles])
    gradients = OrderedDict(izip(model_params, symbolic_grads))

    updates = OrderedDict()
    updates.update(self.sampler.updates())

    return gradients, updates
def get_lr_scalers(self):
    """
    Gather per-parameter learning-rate scale factors from every layer.

    Returns
    -------
    OrderedDict
        Maps model parameters to float scale factors. Each parameter
        may be scaled by at most one layer, and only actual model
        parameters may be scaled.
    """
    scalers = OrderedDict()
    params = self.get_params()
    all_layers = self.hidden_layers + [self.visible_layer]
    for layer in all_layers:
        layer_scalers = layer.get_lr_scalers()
        # A parameter may be claimed by at most one layer.
        assert not any(key in scalers for key in layer_scalers)
        # Layers may only scale actual model parameters.
        assert all(key in params for key in layer_scalers)
        scalers.update(layer_scalers)
    assert all(isinstance(val, float) for val in scalers.values())
    return scalers
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    Compile the theano functions used by this batch gradient descent /
    nonlinear conjugate gradient optimizer.

    Parameters
    ----------
    objective : theano scalar
        Symbolic expression to minimize; `tol` defaults depend on
        its dtype.
    params : list
        Theano shared variables to optimize over.
    inputs : list, optional
        Symbolic inputs taken by the compiled functions. Defaults
        to no inputs.
    param_constrainers : list or tuple, optional
        Callables applied to the "go to alpha" update dictionary to
        enforce parameter constraints.
    max_iter : int, optional
        Stored on self via locals(); used outside this constructor.
    lr_scalers : dict, optional
        Per-parameter multipliers applied to the step size alpha.
    verbose : int, optional
        Controls the amount of log output during compilation.
    tol : float, optional
        Convergence tolerance; if None, chosen from objective.dtype.
    init_alpha : tuple, optional
        Initial step sizes for the line search; defaults depend on
        line_search_mode.
    min_init_alpha : float, optional
        Stored on self; used by the line search elsewhere.
    reset_alpha : bool, optional
        Stored on self; used outside this constructor.
    conjugate : bool, optional
        If True, additionally compile the functions that turn raw
        gradients into Polak-Ribiere conjugate directions.
    reset_conjugate : bool, optional
        Stored on self; used outside this constructor.
    gradients : dict, optional
        Precomputed symbolic gradients per parameter; parameters not
        present fall back to grad(objective, param).
    gradient_updates : dict, optional
        Extra updates performed when gradients are computed.
    line_search_mode : str, optional
        None or 'exhaustive'.
    accumulate : bool, optional
        If True, build Accumulator-based callables instead of plain
        theano functions.
    theano_function_mode : theano compilation mode, optional
        Mode used for every function compiled here.
    """
    # Stash all constructor arguments as attributes.
    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)

    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    # For each parameter: get (or derive) its symbolic gradient and
    # allocate a shared variable to hold the computed gradient value.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        logger.info('batch gradient class compiling gradient function')
    t1 = time.time()

    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.info('done. Took {0}'.format(t2 - t1))

    if self.verbose:
        logger.info('batch gradient class compiling objective function')
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        logger.info('done')

    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()

    # Build the "cache current params" and "step to params - alpha*grad"
    # update dictionaries, honoring per-parameter lr scalers.
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff

    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function([alpha], updates=goto_updates,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent._goto_alpha')

    # Euclidean norm of the full gradient vector (across all params).
    norm = T.sqrt(
        sum([
            T.sqr(elem).sum()
            for elem in self.param_to_grad_shared.values()
        ]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = \
        self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

    self._normalize_grad = \
        function([], norm, updates=normalize_grad_updates,
                 mode=self.theano_function_mode,
                 name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = \
                sharedX(elem.get_value(), 'old_'+elem.name)

        self._store_old_grad = \
            function([norm],
                     updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                          for g_ in grad_to_old_grad]),
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        def dot_product(x, y):
            # Sum of elementwise products across the whole param list.
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        beta_pr = (dot_product(grad_ordered, grad_ordered) -
                   dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        # beta_pr is the Polak-Ribiere formula for beta.
        # According to wikipedia, the beta to use for NCG is "a matter of
        # heuristics or taste" but max(0, beta_pr) is "a popular choice...
        # which provides direction reset automatically." (ie, it is meant
        # to revert to steepest descent when you have traveled far enough
        # that the objective function is behaving non-quadratically enough
        # that the conjugate gradient formulas aren't working anymore)
        # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        # NOTE(review): this compares the imported `grad` function against
        # the gradient shared variables -- looks vestigial; confirm intent.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = \
            [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate var ' +
                    var_descriptor(v) + '\n')
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate update ' +
                    var_descriptor(u) + '\n')

        self._make_conjugate = \
            function([], updates=make_conjugate_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate output ' +
                    var_descriptor(output) + '\n')

    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
class Recurrent(Layer):
    """
    A recurrent neural network layer using the hyperbolic tangent
    activation function, passing on all hidden states or a selection
    of them to the next layer.

    The hidden state is initialized to zeros.

    Parameters
    ----------
    dim : int
        The number of elements in the hidden layer
    layer_name : str
        The name of the layer. All layers in an MLP must have a unique name.
    irange : float
        The input-to-hidden weight matrix is initialized with weights in
        the uniform interval (-irange, irange). The hidden-to-hidden
        matrix weights are sampled in the same manner, unless the argument
        svd is set to True (see below).
    indices : slice, list of integers or integer, optional
        If specified this layer will return only the given hidden
        states. If an integer is given, it will not return a
        SequenceSpace. Otherwise, it will return a SequenceSpace of
        fixed length. Note that a SequenceSpace of fixed length
        can be flattened by using the FlattenerLayer.
        Note: For now only [-1] is supported.
    init_bias : float, optional
        Set an initial bias to be added at each time step. Defaults to 0.
    nonlinearity : theano.function, optional
        Activation applied to the pre-activation at each step;
        defaults to tensor.tanh.
    weight_noise : bool, optional
        Additive Gaussian noise applied to parameters
    """

    def __init__(self, dim, layer_name, irange, indices=None,
                 init_bias=0., nonlinearity=tensor.tanh,
                 weight_noise=False, **kwargs):
        # Std dev of the optional weight noise; popped from kwargs so it
        # is not forwarded to the parent constructor.
        self._std_dev = kwargs.pop('noise_std_dev', .075)
        self.rnn_friendly = True
        # Updates produced by scan (e.g. from random streams) are stored
        # here and merged into the training updates in _modify_updates.
        self._scan_updates = OrderedDict()
        self.__dict__.update(locals())
        del self.self
        super(Recurrent, self).__init__()
        if not self.weight_noise:
            self._std_dev = None

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if ((not isinstance(space, SequenceSpace) and
             not isinstance(space, SequenceDataSpace)) or
                not isinstance(space.space, VectorSpace)):
            raise ValueError("Recurrent layer needs a SequenceSpace("
                             "VectorSpace) or SequenceDataSpace(VectorSpace)\
 as input but received %s instead" % (space))

        self.input_space = space

        if self.indices is not None:
            if len(self.indices) > 1:
                raise ValueError("Only indices = [-1] is supported right now")
                # NOTE(review): unreachable -- the raise above always fires
                # before this assignment.
                self.output_space = CompositeSpace(
                    [VectorSpace(dim=self.dim) for _
                     in range(len(self.indices))]
                )
            else:
                assert self.indices == [-1], "Only indices = [-1] works now"
                self.output_space = VectorSpace(dim=self.dim)
        else:
            if isinstance(self.input_space, SequenceSpace):
                self.output_space = SequenceSpace(VectorSpace(dim=self.dim))
            elif isinstance(self.input_space, SequenceDataSpace):
                self.output_space =\
                    SequenceDataSpace(VectorSpace(dim=self.dim))

        # Initialize the parameters
        rng = self.mlp.rng
        if self.irange is None:
            raise ValueError("Recurrent layer requires an irange value in "
                             "order to initialize its weight matrices")
        input_dim = self.input_space.dim

        # W is the input-to-hidden matrix
        W = rng.uniform(-self.irange, self.irange, (input_dim, self.dim))

        # U is the hidden-to-hidden transition matrix, orthogonalized
        # via QR decomposition of a Gaussian random matrix.
        U = rng.randn(self.dim, self.dim)
        U, _ = scipy.linalg.qr(U)

        # b is the bias
        b = np.zeros((self.dim,))

        self._params = [
            sharedX(W, name=(self.layer_name + '_W')),
            sharedX(U, name=(self.layer_name + '_U')),
            sharedX(b + self.init_bias, name=(self.layer_name + '_b'))
        ]

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        # Norm statistics of the weight matrices are always reported;
        # activation statistics only when a state (or input) is given.
        W, U, b = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        row_norms = tensor.sqrt(sq_W.sum(axis=1))
        col_norms = tensor.sqrt(sq_W.sum(axis=0))
        u_row_norms = tensor.sqrt(sq_U.sum(axis=1))
        u_col_norms = tensor.sqrt(sq_U.sum(axis=0))

        rval = OrderedDict([('W_row_norms_min', row_norms.min()),
                            ('W_row_norms_mean', row_norms.mean()),
                            ('W_row_norms_max', row_norms.max()),
                            ('W_col_norms_min', col_norms.min()),
                            ('W_col_norms_mean', col_norms.mean()),
                            ('W_col_norms_max', col_norms.max()),
                            ('U_row_norms_min', u_row_norms.min()),
                            ('U_row_norms_mean', u_row_norms.mean()),
                            ('U_row_norms_max', u_row_norms.max()),
                            ('U_col_norms_min', u_col_norms.min()),
                            ('U_col_norms_mean', u_col_norms.mean()),
                            ('U_col_norms_max', u_col_norms.max())])

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)

            # SequenceSpace fprop returns (state, mask); drop the mask.
            if isinstance(self.input_space, SequenceSpace):
                state, _ = state
                state_below, _ = state_below

            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            rval['range_x_max_u'] = rg.max()
            rval['range_x_mean_u'] = rg.mean()
            rval['range_x_min_u'] = rg.min()

            rval['max_x_max_u'] = mx.max()
            rval['max_x_mean_u'] = mx.mean()
            rval['max_x_min_u'] = mx.min()

            rval['mean_x_max_u'] = mean.max()
            rval['mean_x_mean_u'] = mean.mean()
            rval['mean_x_min_u'] = mean.min()

            rval['min_x_max_u'] = mn.max()
            rval['min_x_mean_u'] = mn.mean()
            rval['min_x_min_u'] = mn.min()

        return rval

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # When random variables are used in the scan function the updates
        # dictionary returned by scan might not be empty, and needs to be
        # added to the updates dictionary before compiling the training
        # function
        if any(key in updates for key in self._scan_updates):
            # Don't think this is possible, but let's check anyway
            raise ValueError("A single shared variable is being updated by "
                             "multiple scan functions")
        updates.update(self._scan_updates)

    def add_noise(self, param):
        """
        A function that adds additive Gaussian noise

        Parameters
        ----------
        param : sharedX
            model parameter to be regularized

        Returns
        -------
        param : sharedX
            model parameter with additive noise
        """
        param += self.mlp.theano_rng.normal(size=param.shape,
                                            avg=0.,
                                            std=self._std_dev,
                                            dtype=param.dtype)
        return param

    @wraps(Layer.fprop)
    def fprop(self, state_below, return_all=False):
        # A tuple input carries a mask (SequenceSpace); otherwise no mask.
        if isinstance(state_below, tuple):
            state_below, mask = state_below
        else:
            mask = None

        # z0 is the initial hidden state which is (batch size, output dim)
        z0 = tensor.alloc(np.cast[config.floatX](0), state_below.shape[1],
                          self.dim)
        if self.dim == 1:
            # This should fix the bug described in Theano issue #1772
            z0 = tensor.unbroadcast(z0, 1)

        # Later we will add a noise function
        W, U, b = self._params
        if self.weight_noise:
            W = self.add_noise(W)
            U = self.add_noise(U)

        # It is faster to do the input-to-hidden matrix multiplications
        # outside of scan
        state_below = tensor.dot(state_below, W) + b

        if mask is not None:
            z, updates = scan(fn=self.fprop_step_mask,
                              sequences=[state_below, mask],
                              outputs_info=[z0],
                              non_sequences=[U])
        else:
            z, updates = scan(fn=self.fprop_step,
                              sequences=[state_below],
                              outputs_info=[z0],
                              non_sequences=[U])
        self._scan_updates.update(updates)

        if self.indices is not None:
            if len(self.indices) > 1:
                return [z[i] for i in self.indices]
            else:
                return z[self.indices[0]]
        else:
            return (z, mask)

    def fprop_step_mask(self, state_below, mask, state_before, U):
        """
        Scan function for case using masks

        Parameters
        ----------
        state_below : TheanoTensor
            Precomputed input-to-hidden projection for this time step.
        mask : TheanoTensor
            Per-example mask for this time step; masked entries keep
            the previous hidden state.
        state_before : TheanoTensor
            Hidden state from the previous time step.
        U : TheanoTensor
            Hidden-to-hidden transition matrix.
        """
        z = self.nonlinearity(state_below +
                              tensor.dot(state_before, U))
        # Only update the state for non-masked data, otherwise
        # just carry on the previous state until the end
        z = mask[:, None] * z + (1 - mask[:, None]) * state_before
        return z

    def fprop_step(self, state_below, state_before, U):
        """
        Scan function for case without masks

        Parameters
        ----------
        state_below : TheanoTensor
            Precomputed input-to-hidden projection for this time step.
        state_before : TheanoTensor
            Hidden state from the previous time step.
        U : TheanoTensor
            Hidden-to-hidden transition matrix.
        """
        z = self.nonlinearity(state_below +
                              tensor.dot(state_before, U))
        return z
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    Compile the theano functions used by this batch gradient descent /
    nonlinear conjugate gradient optimizer.

    Parameters
    ----------
    objective : theano scalar
        Symbolic expression to minimize; `tol` defaults depend on
        its dtype.
    params : list
        Theano shared variables to optimize over.
    inputs : list, optional
        Symbolic inputs taken by the compiled functions. Defaults
        to no inputs.
    param_constrainers : list or tuple, optional
        Callables applied to the "go to alpha" update dictionary to
        enforce parameter constraints.
    max_iter : int, optional
        Stored on self via locals(); used outside this constructor.
    lr_scalers : dict, optional
        Per-parameter multipliers applied to the step size alpha.
    verbose : int, optional
        Controls the amount of log output during compilation.
    tol : float, optional
        Convergence tolerance; if None, chosen from objective.dtype.
    init_alpha : tuple, optional
        Initial step sizes for the line search; defaults depend on
        line_search_mode.
    min_init_alpha : float, optional
        Stored on self; used by the line search elsewhere.
    reset_alpha : bool, optional
        Stored on self; used outside this constructor.
    conjugate : bool, optional
        If True, additionally compile the functions that turn raw
        gradients into Polak-Ribiere conjugate directions.
    reset_conjugate : bool, optional
        Stored on self; used outside this constructor.
    gradients : dict, optional
        Precomputed symbolic gradients per parameter; parameters not
        present fall back to grad(objective, param).
    gradient_updates : dict, optional
        Extra updates performed when gradients are computed.
    line_search_mode : str, optional
        None or 'exhaustive'.
    accumulate : bool, optional
        If True, build Accumulator-based callables instead of plain
        theano functions.
    theano_function_mode : theano compilation mode, optional
        Mode used for every function compiled here.
    """
    # Stash all constructor arguments as attributes.
    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)

    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    # For each parameter: get (or derive) its symbolic gradient and
    # allocate a shared variable to hold the computed gradient value.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        logger.info('batch gradient class compiling gradient function')
    t1 = time.time()

    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.info('done. Took {0}'.format(t2-t1))

    if self.verbose:
        logger.info('batch gradient class compiling objective function')
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        logger.info('done')

    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()

    # Build the "cache current params" and "step to params - alpha*grad"
    # update dictionaries, honoring per-parameter lr scalers.
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff

    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function(
        [alpha],
        updates=goto_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._goto_alpha')

    # Euclidean norm of the full gradient vector (across all params).
    norm = T.sqrt(sum([T.sqr(elem).sum()
                       for elem in self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = \
        self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

    self._normalize_grad = \
        function([], norm, updates=normalize_grad_updates,
                 mode=self.theano_function_mode,
                 name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = \
                sharedX(elem.get_value(), 'old_'+elem.name)

        self._store_old_grad = \
            function([norm],
                     updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                          for g_ in grad_to_old_grad]),
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        def dot_product(x, y):
            # Sum of elementwise products across the whole param list.
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        beta_pr = (dot_product(grad_ordered, grad_ordered) -
                   dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        # beta_pr is the Polak-Ribiere formula for beta.
        # According to wikipedia, the beta to use for NCG is "a matter of
        # heuristics or taste" but max(0, beta_pr) is "a popular choice...
        # which provides direction reset automatically." (ie, it is meant
        # to revert to steepest descent when you have traveled far enough
        # that the objective function is behaving non-quadratically enough
        # that the conjugate gradient formulas aren't working anymore)
        # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        # NOTE(review): this compares the imported `grad` function against
        # the gradient shared variables -- looks vestigial; confirm intent.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = \
            [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate var ' +
                    var_descriptor(v) + '\n')
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate update ' +
                    var_descriptor(u) + '\n')

        self._make_conjugate = \
            function([], updates=make_conjugate_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate output ' +
                    var_descriptor(output) + '\n')

    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
class Recurrent(Layer):
    """
    A recurrent neural network layer using the hyperbolic tangent
    activation function, passing on all hidden states or a selection
    of them to the next layer.

    The hidden state is initialized to zeros.

    Parameters
    ----------
    dim : int
        The number of elements in the hidden layer
    layer_name : str
        The name of the layer. All layers in an MLP must have a unique name.
    irange : float
        The input-to-hidden weight matrix is initialized with weights in
        the uniform interval (-irange, irange). The hidden-to-hidden
        matrix weights are sampled in the same manner, unless the argument
        svd is set to True (see below).
    indices : slice, list of integers or integer, optional
        If specified this layer will return only the given hidden
        states. If an integer is given, it will not return a
        SequenceSpace. Otherwise, it will return a SequenceSpace of
        fixed length. Note that a SequenceSpace of fixed length
        can be flattened by using the FlattenerLayer.
        Note: For now only [-1] is supported.
    init_bias : float, optional
        Set an initial bias to be added at each time step. Defaults to 0.
    nonlinearity : theano.function, optional
        Activation applied to the pre-activation at each step;
        defaults to tensor.tanh.
    weight_noise : bool, optional
        Additive Gaussian noise applied to parameters
    """

    def __init__(self, dim, layer_name, irange, indices=None,
                 init_bias=0., nonlinearity=tensor.tanh,
                 weight_noise=False, **kwargs):
        # Std dev of the optional weight noise; popped from kwargs so it
        # is not forwarded to the parent constructor.
        self._std_dev = kwargs.pop('noise_std_dev', .075)
        self.rnn_friendly = True
        # Updates produced by scan (e.g. from random streams) are stored
        # here and merged into the training updates in _modify_updates.
        self._scan_updates = OrderedDict()
        self.__dict__.update(locals())
        del self.self
        super(Recurrent, self).__init__()
        if not self.weight_noise:
            self._std_dev = None

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if ((not isinstance(space, SequenceSpace) and
             not isinstance(space, SequenceDataSpace)) or
                not isinstance(space.space, VectorSpace)):
            raise ValueError("Recurrent layer needs a SequenceSpace("
                             "VectorSpace) or SequenceDataSpace(VectorSpace)\
 as input but received %s instead" % (space))

        self.input_space = space

        if self.indices is not None:
            if len(self.indices) > 1:
                raise ValueError("Only indices = [-1] is supported right now")
                # NOTE(review): unreachable -- the raise above always fires
                # before this assignment.
                self.output_space = CompositeSpace([
                    VectorSpace(dim=self.dim)
                    for _ in range(len(self.indices))
                ])
            else:
                assert self.indices == [-1], "Only indices = [-1] works now"
                self.output_space = VectorSpace(dim=self.dim)
        else:
            if isinstance(self.input_space, SequenceSpace):
                self.output_space = SequenceSpace(VectorSpace(dim=self.dim))
            elif isinstance(self.input_space, SequenceDataSpace):
                self.output_space =\
                    SequenceDataSpace(VectorSpace(dim=self.dim))

        # Initialize the parameters
        rng = self.mlp.rng
        if self.irange is None:
            raise ValueError("Recurrent layer requires an irange value in "
                             "order to initialize its weight matrices")
        input_dim = self.input_space.dim

        # W is the input-to-hidden matrix
        W = rng.uniform(-self.irange, self.irange, (input_dim, self.dim))

        # U is the hidden-to-hidden transition matrix, orthogonalized
        # via QR decomposition of a Gaussian random matrix.
        U = rng.randn(self.dim, self.dim)
        U, _ = scipy.linalg.qr(U)

        # b is the bias
        b = np.zeros((self.dim, ))

        self._params = [
            sharedX(W, name=(self.layer_name + '_W')),
            sharedX(U, name=(self.layer_name + '_U')),
            sharedX(b + self.init_bias, name=(self.layer_name + '_b'))
        ]

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        # Norm statistics of the weight matrices are always reported;
        # activation statistics only when a state (or input) is given.
        W, U, b = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        row_norms = tensor.sqrt(sq_W.sum(axis=1))
        col_norms = tensor.sqrt(sq_W.sum(axis=0))
        u_row_norms = tensor.sqrt(sq_U.sum(axis=1))
        u_col_norms = tensor.sqrt(sq_U.sum(axis=0))

        rval = OrderedDict([('W_row_norms_min', row_norms.min()),
                            ('W_row_norms_mean', row_norms.mean()),
                            ('W_row_norms_max', row_norms.max()),
                            ('W_col_norms_min', col_norms.min()),
                            ('W_col_norms_mean', col_norms.mean()),
                            ('W_col_norms_max', col_norms.max()),
                            ('U_row_norms_min', u_row_norms.min()),
                            ('U_row_norms_mean', u_row_norms.mean()),
                            ('U_row_norms_max', u_row_norms.max()),
                            ('U_col_norms_min', u_col_norms.min()),
                            ('U_col_norms_mean', u_col_norms.mean()),
                            ('U_col_norms_max', u_col_norms.max())])

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)

            # NOTE(review): unpacking is unconditional here -- this copy
            # assumes fprop always returns (state, mask); confirm against
            # the supported input spaces.
            state, _ = state
            state_below, _ = state_below

            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            rval['range_x_max_u'] = rg.max()
            rval['range_x_mean_u'] = rg.mean()
            rval['range_x_min_u'] = rg.min()

            rval['max_x_max_u'] = mx.max()
            rval['max_x_mean_u'] = mx.mean()
            rval['max_x_min_u'] = mx.min()

            rval['mean_x_max_u'] = mean.max()
            rval['mean_x_mean_u'] = mean.mean()
            rval['mean_x_min_u'] = mean.min()

            rval['min_x_max_u'] = mn.max()
            rval['min_x_mean_u'] = mn.mean()
            rval['min_x_min_u'] = mn.min()

        return rval

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # When random variables are used in the scan function the updates
        # dictionary returned by scan might not be empty, and needs to be
        # added to the updates dictionary before compiling the training
        # function
        if any(key in updates for key in self._scan_updates):
            # Don't think this is possible, but let's check anyway
            raise ValueError("A single shared variable is being updated by "
                             "multiple scan functions")
        updates.update(self._scan_updates)

    def add_noise(self, param):
        """
        A function that adds additive Gaussian noise

        Parameters
        ----------
        param : sharedX
            model parameter to be regularized

        Returns
        -------
        param : sharedX
            model parameter with additive noise
        """
        # NOTE(review): this copy reads self.theano_rng (not
        # self.mlp.theano_rng) -- confirm the attribute exists on Layer.
        param += self.theano_rng.normal(size=param.shape,
                                        avg=0.,
                                        std=self._std_dev,
                                        dtype=param.dtype)
        return param

    @wraps(Layer.fprop)
    def fprop(self, state_below, return_all=False):
        # A tuple input carries a mask (SequenceSpace); otherwise no mask.
        if isinstance(state_below, tuple):
            state_below, mask = state_below
        else:
            mask = None

        # z0 is the initial hidden state which is (batch size, output dim)
        z0 = tensor.alloc(np.cast[config.floatX](0), state_below.shape[1],
                          self.dim)
        if self.dim == 1:
            # This should fix the bug described in Theano issue #1772
            z0 = tensor.unbroadcast(z0, 1)

        # Later we will add a noise function
        W, U, b = self._params
        if self.weight_noise:
            W = self.add_noise(W)
            U = self.add_noise(U)

        # It is faster to do the input-to-hidden matrix multiplications
        # outside of scan
        state_below = tensor.dot(state_below, W) + b

        if mask is not None:
            def fprop_step(state_below, mask, state_before, U):
                # Step with masking: masked entries carry the old state.
                z = self.nonlinearity(state_below +
                                      tensor.dot(state_before, U))

                # Only update the state for non-masked data, otherwise
                # just carry on the previous state until the end
                z = mask[:, None] * z + (1 - mask[:, None]) * state_before

                return z

            z, updates = scan(fn=fprop_step,
                              sequences=[state_below, mask],
                              outputs_info=[z0],
                              non_sequences=[U])
        else:
            def fprop_step(state_below, state_before, U):
                # Step without masking.
                z = self.nonlinearity(state_below +
                                      tensor.dot(state_before, U))

                return z

            z, updates = scan(fn=fprop_step,
                              sequences=[state_below],
                              outputs_info=[z0],
                              non_sequences=[U])

        self._scan_updates.update(updates)

        if self.indices is not None:
            if len(self.indices) > 1:
                return [z[i] for i in self.indices]
            else:
                return z[self.indices[0]]
        else:
            return (z, mask)
def get_lr_scalers(self, model_idx=-1):
    """
    Merge the learning-rate scalers of every model in the ensemble.

    Parameters
    ----------
    model_idx : int, optional
        Unused by this implementation; kept for interface
        compatibility. Defaults to -1.

    Returns
    -------
    OrderedDict
        Union of each model's scaler dictionary; on key collisions,
        entries from later models overwrite earlier ones.
    """
    merged = OrderedDict()
    for member in self.models:
        merged.update(member.get_lr_scalers())
    return merged