def get_gradients(self, model, data, **kwargs):
    """
    Return the gradients of the cost with respect to the model parameters.

    The values returned are not required to be those that
    theano.tensor.grad would produce: a subclass may deliberately return
    approximate (or even intentionally incorrect) gradients.

    Parameters
    ----------
    model : a pylearn2 Model instance
    data : a batch in cost.get_data_specs() form
    kwargs : dict
        Optional extra arguments, forwarded unchanged to `self.expr`.

    Returns
    -------
    gradients : OrderedDict
        Maps each parameter of the model to its gradient. This default
        implementation applies T.grad to the value returned by `expr`;
        subclasses may substitute something else, e.g. a sampling-based
        approximation for an intractable cost.
    updates : OrderedDict
        Maps shared variables to the updates that must be applied every
        time the gradients are computed (useful for sampling-based
        approximations). Always empty here. Model parameters should never
        appear in it: that would mean computing a gradient changes the
        parameter, making the gradient outdated.

    Raises
    ------
    NotImplementedError
        If `self.expr` returns None.
    """
    try:
        cost = self.expr(model=model, data=data, **kwargs)
    except TypeError:
        # Surface the failing cost class in the error message.
        # reraise_as is defined elsewhere; presumably it re-raises while
        # keeping the original traceback -- confirm.
        reraise_as(TypeError("Error while calling " + str(type(self)) +
                             ".expr"))

    if cost is None:
        raise NotImplementedError(
            str(type(self)) + " represents an intractable cost and "
            "does not provide a gradient "
            "approximation scheme.")

    params = list(model.get_params())
    # Parameters the cost does not depend on are tolerated.
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    return OrderedDict(izip(params, grads)), OrderedDict()
def get_gradients(self, model, data, **kwargs):
    """
    Combine the gradients of every sub-cost into one weighted sum.

    Parameters
    ----------
    model : the model whose parameters are differentiated
    data : flat batch; it is nested with the mapping returned by
        `self.get_composite_specs_and_mapping` and split across the costs
    kwargs : dict
        Forwarded unchanged to each sub-cost's `get_gradients`.

    Returns
    -------
    grads : OrderedDict
        Maps each parameter to the sum over costs of coeff * gradient.
    updates : OrderedDict
        Union of the update dictionaries of all sub-costs.

    Raises
    ------
    ValueError
        If a sub-cost reports a gradient for something that is not one of
        `model.get_params()`.
    """
    _, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)

    # One (gradients, updates) pair per sub-cost, in cost order.
    per_cost = [sub_cost.get_gradients(model, sub_data, **kwargs)
                for sub_cost, sub_data in safe_zip(self.costs, nested_data)]

    params = model.get_params()
    grads = OrderedDict()
    updates = OrderedDict()

    for coeff, (cost_grads, cost_updates) in zip(self.coeffs, per_cost):
        # Validate the whole dictionary before accumulating anything.
        for param in cost_grads:
            if param not in params:
                raise ValueError("A shared variable (" +
                                 str(param) +
                                 ") that is not a parameter appeared "
                                 "a cost gradient dictionary.")

        for param in cost_grads:
            grad = cost_grads[param]
            assert param.ndim == grad.ndim
            weighted = coeff * grad
            if param in grads:
                grads[param] = grads[param] + weighted
            else:
                grads[param] = weighted
            assert grads[param].ndim == param.ndim

        # No shared state may be updated by two costs, and no parameter
        # may be updated as a side effect of computing a gradient.
        assert not any([state in updates for state in cost_updates])
        assert not any([state in params for state in cost_updates])
        updates.update(cost_updates)

    return grads, updates
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Collect the monitoring channels of every sub-cost, plus one channel
    per sub-cost holding the value of that cost term.

    Parameters
    ----------
    model : the model being monitored
    data : flat batch matching `self.get_data_specs(model)`
    kwargs : dict
        Forwarded unchanged to the sub-costs.

    Returns
    -------
    rval : OrderedDict
        Channel name -> expression. The term of cost number i is stored
        under 'term_<i>' (with '_<name>' appended when the expression has
        a non-None `name`); costs whose `expr` returns None add no term.
    """
    self.get_data_specs(model)[0].validate(data)
    _, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)

    channels = OrderedDict()
    for index, sub_cost in enumerate(self.costs):
        sub_data = nested_data[index]
        try:
            channels.update(
                sub_cost.get_monitoring_channels(model, sub_data, **kwargs))
        except TypeError:
            reraise_as(
                Exception('SumOfCosts.get_monitoring_channels '
                          'encountered TypeError while calling {0}'
                          '.get_monitoring_channels'.format(type(sub_cost))))

        term = sub_cost.expr(model, sub_data, **kwargs)
        if term is None:
            continue
        is_named = hasattr(term, 'name') and term.name is not None
        suffix = '_' + term.name if is_named else ''
        channels['term_' + str(index) + suffix] = term

    return channels
def __init__(self, base_learning_rule, decay=0.9):
    """
    Wrap `base_learning_rule` and prepare the per-parameter storage.

    Parameters
    ----------
    base_learning_rule : the learning rule being wrapped
    decay : float, optional
        Stored as `self.decay`.
    """
    self.base = base_learning_rule
    self.decay = decay
    self.mean_updates = OrderedDict()

    # hack to allow MomentumAdjustor to access momentum value
    if hasattr(base_learning_rule, 'momentum'):
        self.momentum = base_learning_rule.momentum
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Gather monitoring channels from all sub-costs and add one 'term_<i>'
    channel per sub-cost whose expression is not None.

    Parameters
    ----------
    model : the model being monitored
    data : flat batch matching `self.get_data_specs(model)`
    kwargs : dict
        Forwarded unchanged to the sub-costs.

    Returns
    -------
    rval : OrderedDict
        Channel name -> expression.
    """
    self.get_data_specs(model)[0].validate(data)
    result = OrderedDict()

    _, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)

    for position, term_cost in enumerate(self.costs):
        term_data = nested_data[position]

        try:
            sub_channels = term_cost.get_monitoring_channels(
                model, term_data, **kwargs)
            result.update(sub_channels)
        except TypeError:
            reraise_as(Exception('SumOfCosts.get_monitoring_channels '
                                 'encountered TypeError while calling {0}'
                                 '.get_monitoring_channels'.format(
                                     type(term_cost))))

        expression = term_cost.expr(model, term_data, **kwargs)
        if expression is not None:
            # Append the expression's own name when it has one.
            if hasattr(expression, 'name') and expression.name is not None:
                label = '_' + expression.name
            else:
                label = ''
            result['term_' + str(position) + label] = expression

    return result
def test_spatiotemporal_cubes():
    """
    Check that enough random patches cover every voxel of every clip,
    both for plain (frames, rows, cols) clips and for clips carrying an
    extra trailing colour-channel dimension.
    """
    patch_shape = (5, 7, 7)

    def make_files(extra_dims=()):
        # Boolean "videos"; a voxel is set once some patch has covered it.
        return OrderedDict(
            file1=numpy.zeros((10, 30, 21) + extra_dims, dtype=bool),
            file2=numpy.zeros((15, 25, 28) + extra_dims, dtype=bool),
            file3=numpy.zeros((7, 18, 22) + extra_dims, dtype=bool),
        )

    def check_patch_coverage(files):
        rng = numpy.random.RandomState(1)
        inputs = [(name, array.shape)
                  for name, array in six.iteritems(files)]
        patches = spatiotemporal_cubes(inputs, patch_shape, 50000, rng)
        for fname, index in patches:
            video = files[fname]
            cube = video[index]
            if len(video.shape) == 3:
                assert cube.shape == patch_shape
            else:
                # Extra trailing dimensions are left unsliced.
                assert cube.shape[:3] == patch_shape[:3]
            cube[...] = True
        for fname, array in six.iteritems(files):
            assert array.all()

    check_patch_coverage(make_files())
    # Check that stuff still works with an extra color channel dimension.
    check_patch_coverage(make_files((3,)))
def get_gradients(self, model, data, **kwargs):
    """
    Return the coefficient-weighted sum of the sub-costs' gradients and
    the merged update dictionaries of all sub-costs.

    Parameters
    ----------
    model : the model whose parameters are differentiated
    data : flat batch, nested and split across `self.costs`
    kwargs : dict
        Forwarded unchanged to each sub-cost's `get_gradients`.

    Returns
    -------
    grads : OrderedDict
        parameter -> sum over costs of coeff * gradient
    updates : OrderedDict
        shared variable -> update, merged across costs

    Raises
    ------
    ValueError
        If a sub-cost returns a gradient keyed by a non-parameter.
    """
    _, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)

    results = []
    for term, term_data in safe_zip(self.costs, nested_data):
        results.append(term.get_gradients(model, term_data, **kwargs))

    total_grads = OrderedDict()
    total_updates = OrderedDict()
    params = model.get_params()

    for weight, result in zip(self.coeffs, results):
        term_grads, term_updates = result

        for key in term_grads:
            if key not in params:
                raise ValueError("A shared variable (" +
                                 str(key) +
                                 ") that is not a parameter appeared "
                                 "a cost gradient dictionary.")

        for key in term_grads:
            assert key.ndim == term_grads[key].ndim
            contribution = weight * term_grads[key]
            if key not in total_grads:
                total_grads[key] = contribution
            else:
                total_grads[key] = total_grads[key] + contribution
            assert total_grads[key].ndim == key.ndim

        # Updates must not clash across costs nor touch a parameter.
        assert not any([state in total_updates for state in term_updates])
        assert not any([state in params for state in term_updates])
        total_updates.update(term_updates)

    return total_grads, total_updates
def __init__(self, dim, layer_name, irange, indices=None, init_bias=0.,
             nonlinearity=tensor.tanh, weight_noise=False, **kwargs):
    """
    Store the constructor arguments on the instance and set up the
    weight-noise standard deviation.

    Parameters
    ----------
    dim, layer_name, irange, indices, init_bias, nonlinearity, weight_noise
        Stored verbatim as attributes of the same name.
    kwargs : dict
        May contain 'noise_std_dev' (default .075), which is removed from
        the dict; the remaining dict is stored as `self.kwargs`.
    """
    # Optional setting, popped so it is not kept inside self.kwargs.
    self._std_dev = kwargs.pop('noise_std_dev', .075)
    self.rnn_friendly = True
    self._scan_updates = OrderedDict()
    # Copies EVERY local name (all arguments, `kwargs` and `self`) into
    # the instance dictionary. Any local variable created above this line
    # would become an attribute as well, so keep this function free of
    # temporaries.
    self.__dict__.update(locals())
    # `self` was copied too; drop the self-reference.
    del self.self
    super(Recurrent, self).__init__()
    # Without weight noise the standard deviation is disabled.
    if not self.weight_noise:
        self._std_dev = None
def __init__(self, model):
    """
    Create one shared "mean" variable per model parameter and compile
    `self.avg`, a no-argument function that folds the current parameter
    values into those running means.

    Parameters
    ----------
    model : object exposing `get_params()`
    """
    step = sharedX(1.)
    running_updates = OrderedDict()
    self.param_to_mean = OrderedDict()

    for param in model.get_params():
        mean = sharedX(param.get_value())
        assert type(mean) == type(param)
        self.param_to_mean[param] = mean
        # Incremental mean: mean <- mean - (mean - param) / step
        running_updates[mean] = mean - (mean - param) / step
        # Advance the step counter by one per call of self.avg.
        running_updates[step] = step + 1.

    self.avg = function([], updates=running_updates)
def get_gradients(self, model, data, **kwargs):
    """
    Differentiate the cost returned by `self._cost` with respect to the
    model parameters, treating `neg_v` (the second value returned by
    `self._cost`) as a constant.

    Returns
    -------
    gradients : OrderedDict
        parameter -> gradient
    updates : OrderedDict
        Always empty.
    """
    cost, neg_v = self._cost(model, data, **kwargs)
    params = list(model.get_params())

    grad_options = dict(disconnected_inputs='ignore',
                        consider_constant=[neg_v])
    grads = T.grad(cost, params, **grad_options)

    return OrderedDict(izip(params, grads)), OrderedDict()
def get_gradients(self, model, data, **kwargs):
    """
    Differentiate the cost returned by `self._cost` with respect to the
    model parameters, treating `self.sampler.particles` as a constant.

    Returns
    -------
    gradients : OrderedDict
        parameter -> gradient
    updates : OrderedDict
        The updates returned by `self.sampler.updates()`.
    """
    cost = self._cost(model, data, **kwargs)
    params = list(model.get_params())

    grad_options = dict(disconnected_inputs='ignore',
                        consider_constant=[self.sampler.particles])
    grads = T.grad(cost, params, **grad_options)
    gradients = OrderedDict(izip(params, grads))

    updates = OrderedDict()
    updates.update(self.sampler.updates())

    return gradients, updates
def __init__(self, inputs, outputs=None, updates=None):
    """
    Compile a function whose outputs and updates are each scaled by
    batch_size / total_examples, where batch_size is the first dimension
    of `inputs[0]` and total_examples is an extra scalar input appended
    after the non-shared inputs.

    Parameters
    ----------
    inputs : list of theano variables
        Elements having a `get_value` attribute are treated as shared.
    outputs : theano expression or list of expressions, optional
    updates : dict, optional
        shared variable -> per-batch contribution. When given,
        `self._clear` is compiled to set every such variable to
        0. * itself.
    """
    total_examples = T.scalar()
    batch_size = T.cast(inputs[0].shape[0], 'float32')
    # Weight of this batch within the whole set of examples.
    fraction = batch_size / total_examples

    self.has_updates = updates is not None
    accumulated = OrderedDict()
    if self.has_updates:
        self._clear = function([],
                               updates=[(var, 0. * var) for var in updates])
        for var in updates:
            accumulated[var] = var + fraction * updates[var]

    self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
    true_inputs = self._true_inputs(inputs)
    self._shared = self._shared_inputs(inputs)

    if outputs is not None:
        if not isinstance(outputs, list):
            outputs = [outputs]
        outputs = [output * fraction for output in outputs]

    self._func = function(true_inputs + [total_examples],
                          outputs=outputs,
                          updates=accumulated)
def get_monitoring_channels(self, data):
    """
    Return the monitoring channels of this model (none by default).

    Parameters
    ----------
    data : tensor_like, or (possibly nested) tuple of tensor_likes,
        The data on which the monitored quantities are computed (e.g. a
        validation set). It is validated against the space returned by
        `self.get_monitoring_data_specs()`.

    Returns
    -------
    channels : OrderedDict
        Maps channel names (strings) to symbolic values depending on the
        variables in `data`. This default implementation returns an empty
        dictionary.

    Notes
    -----
    Subclasses may add any channel they consider worth monitoring during
    training. Choose names that are unlikely to collide with those made
    by the training Cost, and consider controlling which channels are
    added with a configuration option of the model.
    """
    specs = self.get_monitoring_data_specs()
    space, _ = specs
    space.validate(data)
    channels = OrderedDict()
    return channels
def get_monitoring_channels(self, data):
    """
    Collect the monitoring channels of every model in the cascade.

    Parameters
    ----------
    data : sequence
        `data[i]` is the input given to `self.models[i]`. When
        `self.monitor_targets` is true, `data[-1]` holds the targets.

    Returns
    -------
    rval : OrderedDict
        Every channel of model i is stored under 'cascade_<i>_<name>'.
        When targets are available, a 'misclass' channel is added as
        well.
    """
    rval = OrderedDict()

    # Resolve the targets once, before the loop, so that the check below
    # is well defined even when the cascade contains no model.
    Y = data[-1] if self.monitor_targets else None

    for i, sub_model in enumerate(self.models):
        ch = sub_model.get_monitoring_channels((data[i], Y))
        for key in ch:
            rval["cascade_" + str(i) + '_' + key] = ch[key]

    if Y is not None:
        state = self.fprop(data[0:-1])
        # Threshold Y_hat at 0.5.
        prediction = T.gt(state, 0.5)
        # If even one feature is wrong for a given training example,
        # it's considered incorrect, so we max over columns.
        incorrect = T.neq(Y, prediction).max(axis=1)
        rval['misclass'] = T.cast(incorrect, config.floatX).mean()

    return rval
def on_monitor(self, model, dataset, algorithm):
    """
    Make sure Polyak-averaged model gets monitored.
    Save the model if necessary.

    On the call where `self._count == self.start`, a `_PolyakWorker` is
    created and appended to `algorithm.update_callbacks`. On later calls
    (every `self.save_freq` calls, when `self.save_path` is set) the
    model is saved with its parameters temporarily replaced by their
    running means.

    Parameters
    ----------
    model : a Model instance
    dataset : Dataset
        Not used by this method.
    algorithm : object
        Must expose `update_callbacks` and `monitoring_dataset`.
    """
    if self._count == self.start:
        self._worker = _PolyakWorker(model)
        algorithm.update_callbacks.append(self._worker)
        # HACK: best effort -- models without `add_polyak_channels` are
        # simply not given the extra channels.
        try:
            model.add_polyak_channels(self._worker.param_to_mean,
                                      algorithm.monitoring_dataset)
        except AttributeError:
            pass
    elif self.save_path is not None and self._count > self.start and \
            self._count % self.save_freq == 0:
        saved_params = OrderedDict()
        try:
            for param in model.get_params():
                saved_params[param] = param.get_value()
                param.set_value(
                    self._worker.param_to_mean[param].get_value())
            serial.save(self.save_path, model)
        finally:
            # Always put the real parameter values back, even when saving
            # fails; otherwise training would silently continue from the
            # averaged parameters.
            for param, value in saved_params.items():
                param.set_value(value)
    self._count += 1
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return the monitoring channels of this layer (currently none).

    The transformer must expose exactly one parameter, and that
    parameter must have three dimensions. All arguments are ignored.

    Returns
    -------
    channels : OrderedDict
        Always empty.
    """
    (weights,) = self.transformer.get_params()
    assert weights.ndim == 3
    channels = OrderedDict()
    return channels
def __call__(self, inputs):
    """
    Run one sampling step of `self.dbm` with the visible layer clamped
    to `inputs`, and return the upward state of the last layer.

    Parameters
    ----------
    inputs : batch in the DBM's input space

    Returns
    -------
    rval : the result of `upward_state` applied to the sampled state of
        the last layer returned by `self.dbm.get_all_layers()`
    """
    dbm = self.dbm
    num_examples = dbm.get_input_space().batch_size(inputs)
    last_layer = dbm.get_all_layers()[-1]

    chains = dbm.make_layer_to_symbolic_state(num_examples,
                                              self.theano_rng)
    # The examples are used to initialize the visible layer's chains
    chains[dbm.visible_layer] = inputs
    clamped = OrderedDict([(dbm.visible_layer, True)])

    chains = dbm.sampling_procedure.sample(layer_to_state=chains,
                                           theano_rng=self.theano_rng,
                                           layer_to_clamp=clamped,
                                           num_steps=1)

    return last_layer.upward_state(chains[last_layer])
def __init__(self, dim, layer_name, irange, indices=None, init_bias=0.,
             svd=True, nonlinearity=tensor.tanh):
    """
    Store every constructor argument as an attribute of the same name.

    Parameters
    ----------
    dim, layer_name, irange, indices, init_bias, svd, nonlinearity
        Stored verbatim on the instance.
    """
    self.rnn_friendly = True
    self._scan_updates = OrderedDict()
    # Copies EVERY local name (all arguments and `self`) into the
    # instance dictionary. Any local variable created above this line
    # would become an attribute as well, so keep this function free of
    # temporaries.
    self.__dict__.update(locals())
    # `self` was copied too; drop the self-reference.
    del self.self
    super(Recurrent, self).__init__()
def get_learn_func(self):
    """
    Build the learning function of the agent.

    Returns
    -------
    rval : theano function
        Takes an action (integer scalar) and a reward (scalar), and
        updates `self.estimated_rewards` and `self.observation_counts`
        for that action based on this experience.
    """
    action = T.iscalar()
    reward = T.scalar()

    previous_estimate = self.estimated_rewards[action]
    previous_count = self.observation_counts[action]

    # Incremental mean of the rewards observed for this action.
    count = previous_count + 1.
    error = reward - previous_estimate
    estimate = previous_estimate + error / count

    new_estimated_rewards = T.set_subtensor(self.estimated_rewards[action],
                                            estimate)
    new_observation_counts = T.set_subtensor(self.observation_counts[action],
                                             count)

    updates = OrderedDict()
    updates[self.estimated_rewards] = new_estimated_rewards
    updates[self.observation_counts] = new_observation_counts

    return function([action, reward], updates=updates)
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Return a dictionary mapping channel names to expressions for channel
    values. This default implementation defines no channel.

    .. todo::

        how do you do prereqs in this setup? (I think PL changed
        it, not sure if there still is a way in this context)

    Parameters
    ----------
    model : Model
        the model to use to compute the monitoring channels
    data : batch
        (a member of self.get_data_specs()[0])
        symbolic expressions for the monitoring data; it is validated
        against that space
    kwargs : dict
        used so that custom algorithms can use extra variables
        for monitoring.

    Returns
    -------
    rval : dict
        Maps channels names to expressions for channel values.
    """
    data_space = self.get_data_specs(model)[0]
    data_space.validate(data)
    channels = OrderedDict()
    return channels
def monitoring_channels_from_prior_params(self):
    """
    Return the monitoring channels derived from the parameters of the
    prior distribution.

    Returns
    -------
    channels : OrderedDict
        Empty by default: no monitoring channel is computed.
    """
    no_channels = OrderedDict()
    return no_channels
def get_monitoring_channels(self, data):
    """
    Return the monitoring channels of the model.

    Notes
    -----
    Monitors the reconstruction mean squared error together with
    quantities related to the approximate posterior parameters phi and
    the conditional and prior parameters theta.
    """
    space, _ = self.get_monitoring_data_specs()
    space.validate(data)

    channels = OrderedDict()

    X = data
    # Encode, sample a latent code, decode.
    noise = self.sample_from_epsilon(shape=(X.shape[0], self.nhid))
    phi = self.encode_phi(X)
    z = self.sample_from_q_z_given_x(epsilon=noise, phi=phi)
    theta = self.decode_theta(z)
    reconstruction = self.means_from_theta(theta)

    channels["reconstruction_mse"] = T.sqr(X - reconstruction).mean()

    safe_update(
        channels,
        self.posterior.monitoring_channels_from_conditional_params(phi))
    safe_update(
        channels,
        self.conditional.monitoring_channels_from_conditional_params(theta))
    safe_update(channels,
                self.prior.monitoring_channels_from_prior_params())

    return channels
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Provide monitoring of the individual costs that are being added
    together.

    .. todo::

        WRITEME properly

    This is a very useful method to subclass if you need to monitor
    more things about the model.

    Returns
    -------
    rval : OrderedDict
        Empty when there is at most one cost; otherwise holds
        'reconstruction_cost' (cost 0) and 'classification_cost'
        (cost 1).
    """
    self.get_data_specs(model)[0].validate(data)
    channels = OrderedDict()

    # if there's only 1 cost, then no need to split up the costs
    if len(self.costs) <= 1:
        return channels

    output = self._get_samples_from_model(model, data)
    for index, label in ((0, 'reconstruction_cost'),
                         (1, 'classification_cost')):
        channels[label] = self._get_total_for_cost(
            index, self.costs[index][2], data, output)

    return channels
def spatiotemporal_cubes(file_tuples, shape, n_patches=numpy.inf, rng=None):
    """
    Generator function that yields a stream of (filename, slicetuple)
    representing a spatiotemporal patch of that file.

    Parameters
    ----------
    file_tuples : list of tuples
        Each element should be a 2-tuple consisting of a filename
        (or arbitrary identifier) and a (length, height, width)
        shape tuple of the dimensions (number of frames in the video,
        height and width of each frame).
    shape : tuple
        A shape tuple consisting of the desired (length, height, width)
        of each spatiotemporal patch.
    n_patches : int, optional
        The number of patches to generate. By default, generates patches
        infinitely.
    rng : RandomState object or seed, optional
        The random number generator (or seed) to use. Defaults to None,
        meaning it will be seeded from /dev/urandom or the clock. Every
        random draw (frame, row and column) comes from this generator, so
        a fixed seed reproduces the same stream of patches.

    Returns
    -------
    generator : generator object
        A generator that yields a stream of (filename, slicetuple) tuples.
        The slice tuple is such that it indexes into a 3D array containing
        the entire clip with frames indexed along the first axis, rows
        along the second and columns along the third.

    Notes
    -----
    Frames too close to the end of their clip are rejected and redrawn;
    if no clip is at least as long as the requested patch, the loop never
    yields.
    """
    frame_lookup = FrameLookup([(a, b[0]) for a, b in file_tuples])
    file_lookup = OrderedDict(file_tuples)
    patch_length, patch_height, patch_width = shape
    done = 0
    rng = make_np_rng(rng, which_method="random_integers")
    while done < n_patches:
        frame = rng.random_integers(0, len(frame_lookup) - 1)
        filename, file_length, frame_no = frame_lookup[frame]
        # Check that there is a contiguous block of frames starting at
        # frame_no that is at least as long as our desired cube length.
        if file_length - frame_no < patch_length:
            continue
        _, video_height, video_width = file_lookup[filename][:3]
        # The last row and column in which a patch could "start" to still
        # fall within frame.
        last_row = video_height - patch_height
        last_col = video_width - patch_width
        # Draw from `rng`, not from the global numpy.random state, so
        # that the caller's seed controls the whole patch position.
        row = rng.random_integers(0, last_row)
        col = rng.random_integers(0, last_col)
        patch_slice = (slice(frame_no, frame_no + patch_length),
                       slice(row, row + patch_height),
                       slice(col, col + patch_width))
        done += 1
        yield filename, patch_slice
def enforce_constraints(self):
    """
    Enforce all constraints encoded by self.modify_updates.

    An identity update (param -> param) is built for every parameter,
    handed to `self.modify_updates`, and the resulting updates are then
    compiled and applied once.
    """
    params = self.get_params()
    identity_updates = OrderedDict(izip_no_length_check(params, params))
    self.modify_updates(identity_updates)
    apply_constraints = function([], updates=identity_updates)
    apply_constraints()
def get_monitoring_channels(self, data):
    """
    Return the monitoring channels of the model.

    Channels of the visible layer are prefixed with 'vis_', those of
    each hidden layer with the layer name, and those computed from the
    final mean-field state with 'mf_<layer name>_'. When the mean-field
    history holds more than one step, channels measuring how much the
    variational parameters changed during the last step are added.

    Parameters
    ----------
    data : batch matching `self.get_monitoring_data_specs()`

    Returns
    -------
    rval : OrderedDict
        Channel name -> expression.
    """
    space, _ = self.get_monitoring_data_specs()
    space.validate(data)

    history = self.mf(data, return_history=True)
    q = history[-1]

    rval = OrderedDict()

    def add_with_prefix(prefix, channels):
        # Copy `channels` into rval, prepending `prefix` to every name.
        for key in channels:
            rval[prefix + key] = channels[key]

    add_with_prefix('vis_', self.visible_layer.get_monitoring_channels())

    for state, layer in safe_zip(q, self.hidden_layers):
        add_with_prefix(layer.layer_name + '_',
                        layer.get_monitoring_channels())
        add_with_prefix('mf_' + layer.layer_name + '_',
                        layer.get_monitoring_channels_from_state(state))

    if len(history) <= 1:
        return rval

    prev_q = history[-2]

    # Largest absolute change of any variational parameter.
    largest = None
    for new, old in safe_zip(flatten(q), flatten(prev_q)):
        change = abs(new - old).max()
        if new is old:
            logger.error('{0} is {1}'.format(new, old))
            assert False
        largest = change if largest is None else T.maximum(largest, change)

    rval['max_var_param_diff'] = largest

    # Mean absolute change, layer by layer.
    for layer, new, old in safe_zip(self.hidden_layers, q, prev_q):
        sum_diff = 0.
        for sub_new, sub_old in safe_zip(flatten(new), flatten(old)):
            sum_diff += abs(sub_new - sub_old).sum()
        total_dim = layer.get_total_state_space().get_total_dimension()
        denom = np.cast[config.floatX](self.batch_size * total_dim)
        rval['mean_'+layer.layer_name+'_var_param_diff'] = sum_diff / denom

    return rval
def monitoring_channels_from_conditional_params(self, conditional_params):
    """
    Return min / max / mean / std channels of sigma = exp(log_sigma).

    Parameters
    ----------
    conditional_params : pair
        Unpacked as (mu, log_sigma); only log_sigma is used.

    Returns
    -------
    rval : OrderedDict
        Keys are `self.name` followed by '_sigma_min', '_sigma_max',
        '_sigma_mean' and '_sigma_std', in that order.
    """
    _, log_sigma = conditional_params
    channels = OrderedDict()
    for statistic in ('min', 'max', 'mean', 'std'):
        sigma = T.exp(log_sigma)
        channels[self.name + '_sigma_' + statistic] = \
            getattr(sigma, statistic)()
    return channels
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Collect the monitoring channels of the inner layers, unless
    monitoring is switched off.

    Parameters
    ----------
    state_below : input of the first inner layer
    state : ignored (recomputed by forward propagation)
    targets : optional, passed to the last inner layer only

    Returns
    -------
    rval : OrderedDict
        Empty when `self.use_monitoring_channels` is false. Otherwise
        every channel of an inner layer is stored under
        '<layer name>_<channel name>'.

    Raises
    ------
    TypeError
        If an inner layer does not return an OrderedDict.
    """
    rval = OrderedDict()
    if not self.use_monitoring_channels:
        return rval

    state = state_below
    x = state
    state_conc = None

    for layer in self.layers:
        # We don't go through all the inner layers recursively
        state_below = state
        is_first = layer is self.layers[0]
        is_last = layer is self.layers[-1]

        if self.x_shortcut and not is_first and not is_last:
            state = self.create_shortcut_batch(state, x, 2, 1)

        if self.y_shortcut and is_last:
            state = layer.fprop(state_conc)
        else:
            state = layer.fprop(state)

        if self.y_shortcut and not is_last:
            if is_first:
                state_conc = state
            else:
                state_conc = self.create_shortcut_batch(state_conc,
                                                        state, 2)

        args = [state_below, state]
        if is_last and targets is not None:
            args.append(targets)
        ch = layer.get_layer_monitoring_channels(*args)
        if not isinstance(ch, OrderedDict):
            raise TypeError(str((type(ch), layer.layer_name)))

        for key in ch:
            value = ch[key]
            doc = get_monitor_doc(value)
            if doc is None:
                doc = str(type(layer)) + \
                    ".get_monitoring_channels_from_state did" + \
                    " not provide any further documentation for" + \
                    " this channel."
            # Record which layer the channel came from.
            value.__doc__ = 'This channel came from a layer called "' + \
                layer.layer_name + '" of an MLP.\n' + doc
            rval[layer.layer_name + '_' + key] = value

    return rval
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return norm statistics of the two weight matrices and, when a state
    is available, statistics of the layer's activations.

    Parameters
    ----------
    state_below : optional
        Layer input; only used to compute `state` when `state` is None.
    state : optional
        Layer output. When `self.input_space` is a SequenceSpace it is a
        pair whose first element is the monitored value.
    targets : ignored

    Returns
    -------
    rval : OrderedDict
        '<W|U>_<row|col>_norms_<min|mean|max>' for the first two
        parameters, plus '<range|max|mean|min>_x_<max|mean|min>_u' when
        `state` or `state_below` is given.
    """
    # The third parameter is not monitored.
    W, U, _ = self._params
    sq_W = tensor.sqr(W)
    sq_U = tensor.sqr(U)

    norms = [('W_row_norms', tensor.sqrt(sq_W.sum(axis=1))),
             ('W_col_norms', tensor.sqrt(sq_W.sum(axis=0))),
             ('U_row_norms', tensor.sqrt(sq_U.sum(axis=1))),
             ('U_col_norms', tensor.sqrt(sq_U.sum(axis=0)))]

    rval = OrderedDict()
    for prefix, norm in norms:
        for statistic in ('min', 'mean', 'max'):
            rval[prefix + '_' + statistic] = getattr(norm, statistic)()

    if state is None and state_below is None:
        return rval

    if state is None:
        state = self.fprop(state_below)
    if isinstance(self.input_space, SequenceSpace):
        # Only the first element of the pair is monitored. `state_below`
        # is not needed past this point, so it is deliberately not
        # unpacked: doing so failed when only `state` was supplied.
        state, _ = state

    mx = state.max(axis=0)
    mean = state.mean(axis=0)
    mn = state.min(axis=0)
    rg = mx - mn

    for label, per_unit in (('range', rg), ('max', mx),
                            ('mean', mean), ('min', mn)):
        for statistic in ('max', 'mean', 'min'):
            rval[label + '_x_' + statistic + '_u'] = \
                getattr(per_unit, statistic)()

    return rval
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Monitor the two terms of the model's log-likelihood lower bound.

    Parameters
    ----------
    model : object exposing `log_likelihood_lower_bound`
    data : batch matching `self.get_data_specs(model)`
    kwargs : dict
        Ignored.

    Returns
    -------
    rval : OrderedDict
        'kl_divergence_term' is the mean of the first term returned by
        the model and 'expectation_term' is minus the mean of the second.
    """
    space, _ = self.get_data_specs(model)
    space.validate(data)

    terms = model.log_likelihood_lower_bound(data, self.num_samples,
                                             return_individual_terms=True)

    channels = OrderedDict()
    channels['kl_divergence_term'] = terms[0].mean()
    channels['expectation_term'] = -terms[1].mean()
    return channels
def __init__(self, model):
    """
    Initialise an empty monitor attached to `model`.

    Parameters
    ----------
    model : the model being monitored
    """
    self.model = model
    self.training_succeeded = False
    self.channels = OrderedDict()
    self.on_channel_conflict = 'error'

    # Progress counters.
    self._num_batches_seen = 0
    self._examples_seen = 0
    self._epochs_seen = 0

    # Per-dataset settings; each attribute gets its own list.
    for list_attr in ('_datasets', '_iteration_mode', '_batch_size',
                      '_num_batches', '_rng_seed'):
        setattr(self, list_attr, [])

    self._dirty = True
    self.names_to_del = ['theano_function_mode']
    self.theano_function_mode = None
    self.t0 = time.time()

    # Initialize self._nested_data_specs, self._data_specs_mapping,
    # and self._flat_data_specs
    self._build_data_specs()
def get_lr_scalers(self):
    """
    Specify how to rescale the learning rate on each parameter.

    Returns
    -------
    lr_scalers : OrderedDict
        A dictionary mapping the parameters of the model to floats. The
        learning rate is multiplied by the float for each parameter; a
        parameter missing from the dictionary uses the global learning
        rate with no scaling. This default implementation scales nothing
        and returns an empty dictionary.
    """
    no_scaling = OrderedDict()
    return no_scaling
def get_lr_scalers(self):
    """
    Merge the learning-rate scalers of all hidden layers and of the
    visible layer (in that order).

    Returns
    -------
    rval : OrderedDict
        parameter -> float multiplier

    Notes
    -----
    Asserts that no parameter is scaled by two layers, that only
    parameters of the model are scaled, and that every scaler is a
    float.
    """
    params = self.get_params()
    merged = OrderedDict()

    for layer in self.hidden_layers + [self.visible_layer]:
        layer_scalers = layer.get_lr_scalers()

        # No two layers can contend to scale a parameter
        assert not any([key in merged for key in layer_scalers])
        # Don't try to scale anything that's not a parameter
        assert all([key in params for key in layer_scalers])

        merged.update(layer_scalers)

    assert all([isinstance(val, float) for val in merged.values()])

    return merged
def __init__(self, decrease_rate=0.5, increase_rate=1.2, min_rate=1e-6,
             max_rate=50):
    """
    Parameters
    ----------
    decrease_rate : float, optional
        Must be smaller than 1; stored as a shared variable named
        'decrease_rate'.
    increase_rate : float, optional
        Must be greater than 1; stored as a shared variable named
        'increase_rate'.
    min_rate : float, optional
        Stored as `self.min_rate`.
    max_rate : float, optional
        Stored as `self.max_rate`.
    """
    assert increase_rate > 1.
    assert decrease_rate < 1.

    self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
    self.increase_rate = sharedX(increase_rate, 'increase_rate')
    self.min_rate, self.max_rate = min_rate, max_rate
    self.zeros = OrderedDict()
def __init__(self, decrease_rate=0.5, increase_rate=1.2, min_rate=1e-6,
             max_rate=50, switching_threshold=1e-6):
    """
    Parameters
    ----------
    decrease_rate : float, optional
        Must be smaller than 1; stored as a shared variable named
        'decrease_rate'.
    increase_rate : float, optional
        Must be greater than 1; stored as a shared variable named
        'increase_rate'.
    min_rate : float, optional
        Stored as `self.min_rate`.
    max_rate : float, optional
        Stored as `self.max_rate`.
    switching_threshold : float, optional
        Stored as `self.switching_threshold`.
    """
    assert increase_rate > 1.
    assert decrease_rate < 1.

    self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
    self.increase_rate = sharedX(increase_rate, 'increase_rate')
    self.min_rate, self.max_rate = min_rate, max_rate
    self.switching_threshold = switching_threshold

    # Per-parameter bookkeeping; each attribute gets its own dictionary.
    for dict_attr in ('epsilons', 'gt_epsilons', 'lt_epsilons',
                      'eq_epsilons'):
        setattr(self, dict_attr, OrderedDict())
def __init__(self, model):
    """
    Initialise an empty monitor attached to `model`.

    Parameters
    ----------
    model : the model being monitored
    """
    self.model = model
    self.training_succeeded = False
    self.channels = OrderedDict()

    # Nothing has been seen yet.
    self._num_batches_seen = self._examples_seen = self._epochs_seen = 0

    # Per-dataset settings, filled in as datasets are added.
    self._datasets, self._iteration_mode = [], []
    self._batch_size, self._num_batches = [], []
    self._rng_seed = []

    self._dirty = True
    self.theano_function_mode = None
    self.names_to_del = ['theano_function_mode']
    self.t0 = time.time()

    # Initialize self._nested_data_specs, self._data_specs_mapping,
    # and self._flat_data_specs
    self._build_data_specs()
class RMSProp(LearningRule):
    """
    Implements the RMSProp learning rule.

    The RMSProp learning rule is described by Hinton in `lecture 6
    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`
    of the Coursera Neural Networks for Machine Learning course.

    In short, Hinton suggests "[the] magnitude of the gradient can be very
    different for different weights and can change during learning. This
    makes it hard to choose a global learning rate." RMSProp solves this
    problem by "[dividing] the learning rate for a weight by a running
    average of the magnitudes of recent gradients for that weight."

    Parameters
    ----------
    decay : float, optional
        Decay constant similar to that used in AdaDelta and Momentum methods.
    max_scaling: float, optional
        Restrict the RMSProp gradient scaling coefficient to values
        below `max_scaling`.

    Notes
    -----
    An instance of this LearningRule should only be used with one
    TrainingAlgorithm, and its get_updates method should be called
    only once. This is required in order to make the monitoring
    channels correctly report the moving averages.
    """

    def __init__(self, decay=0.9, max_scaling=1e5):
        assert 0. <= decay < 1.
        assert max_scaling > 0
        self.decay = sharedX(decay, 'decay')
        # Lower bound applied to the RMS of the gradient, so that the
        # scaling coefficient never exceeds max_scaling.
        self.epsilon = 1. / max_scaling
        self.mean_square_grads = OrderedDict()

    @wraps(LearningRule.add_channels_to_monitor)
    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        The channels added are the min, mean, and max of the
        mean_square_grad of each parameter.
        """
        channel_mapping = {
            '_min': T.min,
            '_max': T.max,
            '_mean': T.mean
        }

        for mean_square_grad in self.mean_square_grads.values():
            for suffix, op in channel_mapping.items():
                monitor.add_channel(
                    name=(mean_square_grad.name + suffix),
                    ipt=None,
                    val=op(mean_square_grad),
                    data_specs=(NullSpace(), ''),
                    dataset=monitoring_dataset)

    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Provides the symbolic (theano) description of the updates needed to
        perform this learning rule. See Notes for side-effects.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Parameters missing from it, and all
            parameters when it is None, use a multiplier of 1.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.

        Raises
        ------
        ValueError
            If a parameter has no name.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """
        # The signature advertises None as the default; treat it as
        # "no per-parameter scaling" instead of failing on None.get().
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()
        for param in grads:

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = lr_scalers.get(param, 1.) * learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = - scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
class UpdateNormMonitorLearningRule(LearningRule):
    """
    Wraps an existing pylearn2 learning rule and adds monitor channels
    for the norms of the gradient based updates calculated during
    learning.
    """

    def __init__(self, base_learning_rule, decay=0.9):
        self.base = base_learning_rule
        self.decay = decay
        self.mean_updates = OrderedDict()
        # hack to allow MomentumAdjustor to access momentum value
        if hasattr(base_learning_rule, 'momentum'):
            self.momentum = base_learning_rule.momentum

    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        Add norm channels for every stored mean update, then let the
        wrapped learning rule add its own channels.

        Raises
        ------
        ValueError
            If a mean update has more than four dimensions.
        """
        channel_mapping = {
            '_min': T.min,
            '_max': T.max,
            '_mean': T.mean
        }

        def add(name, val):
            # All channels here are computed without any input data.
            monitor.add_channel(name=name,
                                ipt=None,
                                val=val,
                                data_specs=(NullSpace(), ''),
                                dataset=monitoring_dataset)

        def l2_norm(tensor_var, **sum_kwargs):
            return T.sqrt(T.sum(T.sqr(tensor_var), **sum_kwargs))

        for mean_update in self.mean_updates.values():
            ndim = mean_update.ndim
            base_name = mean_update.name

            if ndim in (3, 4):
                # rank-4 tensor (assuming stack of rank-3 convolutional
                # kernels) or rank-3 tensor (assuming stack of rank-2
                # conv layer biases): one norm per leading index.
                label = "_kernel_norm" if ndim == 4 else "_norm"
                knl_norm_vals = l2_norm(mean_update,
                                        axis=tuple(range(1, ndim)))
                for suffix, op in channel_mapping.items():
                    add(base_name + label + suffix, op(knl_norm_vals))
            elif ndim == 2:
                # rank-2 tensor (matrix)
                col_norm_vals = l2_norm(mean_update, axis=0)
                row_norm_vals = l2_norm(mean_update, axis=1)
                mtx_norm_val = l2_norm(mean_update)
                for suffix, op in channel_mapping.items():
                    add(base_name + "_col_norm" + suffix, op(col_norm_vals))
                    add(base_name + "_row_norm" + suffix, op(row_norm_vals))
                add(base_name + "_norm", mtx_norm_val)
            elif ndim == 1:
                # rank-1 tensor (vector)
                add(base_name + "_norm", l2_norm(mean_update, axis=0))
            elif ndim == 0:
                # rank-0 tensor (scalar)
                add(base_name + "_norm", mean_update)
            else:
                # not sure which axes to sum over in this case
                raise ValueError(
                    'Mean update {0} has unexpected number of dimensions {1} ({2})'
                    .format(mean_update, mean_update.ndim, mean_update.shape))

        self.base.add_channels_to_monitor(monitor, monitoring_dataset)

    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Return the updates of the wrapped learning rule, extended with a
        decayed running mean of each parameter's update.

        Raises
        ------
        ValueError
            If a parameter has no name.
        """
        updates = self.base.get_updates(learning_rate, grads, lr_scalers)

        for param, _ in six.iteritems(grads):
            running_mean = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            running_mean.name = 'mean_update_' + param.name

            if param.name in self.mean_updates:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)

            # Store variable in self.mean_updates for monitoring.
            self.mean_updates[param.name] = running_mean

            # Accumulate updates
            step = updates[param] - param
            # Apply update
            updates[running_mean] = (self.decay * running_mean +
                                     (1 - self.decay) * step)

        return updates
def get_lr_scalers(self, model_idx=-1):
    """
    Merge the learning-rate scalers of every model in `self.models`.

    Parameters
    ----------
    model_idx : int, optional
        Accepted but not used by this implementation.

    Returns
    -------
    OrderedDict
        Union of the dictionaries returned by each model's
        `get_lr_scalers()`; when the same key appears more than once the
        value from the later model wins.
    """
    merged = OrderedDict()
    for sub_model in self.models:
        sub_scalers = sub_model.get_lr_scalers()
        merged.update(sub_scalers)
    return merged
class DROP_RPROP(LearningRule):
    """
    Sign-based (RPROP-style) learning rule with optional gradient masks.

    Every parameter element has its own step size `delta`. The step size is
    multiplied by `increase_rate` when the current and stored gradients
    agree in sign, by `decrease_rate` when they disagree, and clipped to
    `[min_rate, max_rate]`. Parameters move by `-delta * sign(grad)`.

    Parameters
    ----------
    decrease_rate : float, optional
        Factor (< 1) applied to the step size after a gradient sign change.
    increase_rate : float, optional
        Factor (> 1) applied to the step size when the sign is unchanged.
    min_rate : float, optional
        Lower clipping bound for the step size.
    max_rate : float, optional
        Upper clipping bound for the step size.
    """

    def __init__(
            self,
            decrease_rate=0.5,
            increase_rate=1.2,
            min_rate=1e-6,
            max_rate=50
    ):
        assert increase_rate > 1.
        assert decrease_rate < 1.
        self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
        self.increase_rate = sharedX(increase_rate, 'increase_rate')
        self.min_rate = min_rate
        self.max_rate = max_rate
        # Maps parameter -> shared counter of how often each element saw a
        # gradient of exactly zero.
        self.zeros = OrderedDict()

    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        Add channels for the two rates and for each zero-gradient counter.

        Parameters
        ----------
        monitor : Monitor
            Monitor the channels are added to.
        monitoring_dataset : Dataset
            Dataset passed through to `monitor.add_channel`.
        """
        monitor.add_channel(
            'rprop_decrease_rate',
            ipt=None,
            val=self.decrease_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        monitor.add_channel(
            'rprop_increase_rate',
            ipt=None,
            val=self.increase_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        for zero in self.zeros.values():
            monitor.add_channel(
                zero.name,
                ipt=None,
                val=T.sum(zero),
                dataset=monitoring_dataset,
                data_specs=(NullSpace(), '')
            )

    def get_updates(self, learning_rate, grads, lr_scalers=None,
                    global_error=None, masks=None):
        """
        Build the update dictionary for all parameters in `grads`.

        Parameters
        ----------
        learning_rate : shared variable
            Its current value is the initial step size for parameters that
            have no entry in `lr_scalers`.
        grads : dict
            Maps parameters to their gradients.
        lr_scalers : dict, optional
            Maps parameters to their initial step size. Treated as empty
            when None.
        global_error : object, optional
            Accepted but not used.
        masks : dict, optional
            Maps a layer name to a mask matrix. For a parameter named
            `<layer>_W` whose layer has a mask, the stored gradient is only
            refreshed where `mask.T . (mask . grad) > 0`. Treated as empty
            when None.

        Returns
        -------
        updates : OrderedDict
            Updates for each parameter and its step size, stored gradient
            and zero-gradient counter.
        """
        # The signature advertises None defaults, so they must be usable.
        if lr_scalers is None:
            lr_scalers = {}
        if masks is None:
            masks = {}

        updates = OrderedDict()
        # `.items()` works on both Python 2 and 3 (`iteritems` is 2-only).
        for param, grad in grads.items():
            # Create required shared variables
            lr = lr_scalers.get(param, learning_rate.get_value())
            zero_like_param = np.zeros_like(param.get_value())
            delta = sharedX(zero_like_param + lr, borrow=True)
            previous_grad = sharedX(np.zeros_like(zero_like_param),
                                    borrow=True)
            zeros = sharedX(np.zeros_like(zero_like_param), borrow=True)

            # Unnamed parameters cannot be matched to a layer mask.
            masked_grad = 1.
            if param.name is not None and re.match(r'.*_W$', param.name):
                layer_name = re.sub('_W$', '', param.name)
                if layer_name in masks:
                    mask = masks[layer_name]
                    masked_grad = T.gt(
                        T.dot(mask.T, T.dot(mask, grad)), 0.)

            # Name variables according to the parameter name. Only named
            # parameters are monitored, because the channel name is taken
            # from the counter's name.
            if param.name is not None:
                delta.name = 'delta_' + param.name
                zeros.name = 'zeros_' + param.name
                previous_grad.name = 'previous_grad_' + param.name
                self.zeros[param] = zeros

            # Positive: same sign as last time; negative: sign flipped;
            # zero: no stored gradient (or zero gradient).
            temp = grad * previous_grad
            delta_inc = T.switch(
                T.neq(grad, 0.),
                T.clip(
                    T.switch(
                        T.eq(temp, 0.),
                        delta,
                        T.switch(
                            T.lt(temp, 0.),
                            delta * self.decrease_rate,
                            delta * self.increase_rate
                        )
                    ),
                    self.min_rate,
                    self.max_rate
                ),
                delta
            )

            # Remember the gradient unless its sign just flipped, in which
            # case 0 is stored so the next step is not adapted again.
            # The comparison must be `>=`: the stored gradient starts at 0,
            # so with a strict `>` it could never become non-zero and the
            # step size would never adapt.
            previous_grad_inc = T.switch(
                T.gt(masked_grad, 0.),
                T.switch(
                    T.ge(temp, 0.),
                    grad,
                    0.
                ),
                previous_grad
            )

            # Calculate updates of parameters
            updated_inc = T.switch(
                T.neq(grad, 0.),
                - delta_inc * T.sgn(grad),
                0.
            )

            # Count, per element, the steps with an exactly-zero gradient.
            new_zeros = zeros + T.switch(T.neq(grad, 0.), 0, 1)

            # Compile the updates
            updates[param] = param + updated_inc
            updates[delta] = delta_inc
            updates[previous_grad] = previous_grad_inc
            updates[zeros] = new_zeros

        return updates
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)

    Parameters
    ----------
    model : `pylearn2.models.model.Model`

    Attributes
    ----------
    on_channel_conflict : string
        `error` : this is a behavior when there is conflict
            on creating a channel twice
        `copy_history` : this is a behavior when creating a
            new channel and transferring history of old_monitor
        `overwrite` : this is a behavior when creating a
            new channel without taking an account of old_monitor
    """

    def __init__(self, model):
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        # Parallel lists: entry k of each describes monitoring dataset k.
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None
        self.on_channel_conflict = 'error'

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Computes a nested data_specs for input and all channels

        Also computes the mapping to flatten it. This function is
        called from redo_theano.
        """
        # Ask the model what it needs
        m_space, m_source = self.model.get_monitoring_data_specs()
        input_spaces = [m_space]
        input_sources = [m_source]
        for channel in self.channels.values():
            space = channel.data_specs[0]
            assert isinstance(space, Space)
            input_spaces.append(space)
            input_sources.append(channel.data_specs[1])

        nested_space = CompositeSpace(input_spaces)
        nested_source = tuple(input_sources)

        self._nested_data_specs = (nested_space, nested_source)
        self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

        flat_space = self._data_specs_mapping.flatten(nested_space,
                                                      return_tuple=True)
        flat_source = self._data_specs_mapping.flatten(nested_source,
                                                       return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        .. todo::

            WRITEME

        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be
            compiled and run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, optional
            Optional. The seed to be used for random iteration modes.
        """
        # The user can omit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            # Build an iterator now only to validate the parameters.
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                reraise_as(ValueError("invalid iteration parameters in " +
                                      "Monitor.add_dataset: " + str(exc)))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, six.string_types):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)

            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had %d examples total, but at "
                                       "runtime it gave us %d." %
                                       (ne, actual_ne))
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequisite functions" on a batch of data. Always
        called right before computing the monitoring channels on that
        batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring
            functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on
        (assuming that the learning code has been calling
        Monitor.report_batch correctly).
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming
            that the learning code has been calling Monitor.report_batch
            correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of
        examples. Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another "epoch" of
        learning. We regard one pass through Dataset.iterator as one
        epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        All channels are compiled as part of the same theano function
        so that the theano optimizations can eliminate subexpressions
        that are shared between multiple channels.

        Raises
        ------
        ValueError
            If a channel's dataset would be iterated over zero batches
            or zero examples.
        TypeError
            If a channel's shared variable and the expression driving
            it have different dtypes.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        # Attributes created below are registered for deletion before
        # pickling; remember which ones existed beforehand.
        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry'
            )
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args,)
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = []
        for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size):
            it.append(d.iterator(mode=i, num_batches=n, batch_size=b,
                                 data_specs=self._flat_data_specs,
                                 return_tuple=True))
        self.num_examples = [i.num_examples for i in it]
        # One givens / updates dictionary per dataset.
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                # Check the dataset this channel is computed on. The
                # previous code tested a variable left over from the
                # iterator loop above, i.e. always the *last* dataset's
                # num_batches, whichever dataset the channel used.
                if self._num_batches[index] == 0 or cur_num_examples == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                # Weight each batch by its share of the examples so the
                # accumulated value is the mean over the dataset.
                val = T.cast(channel.val * T.cast(batch_size, 'float64')
                             / cur_num_examples, config.floatX)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=self.theano_function_mode,
                                           name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the __getstate__ method replaces the dataset
        field with the dataset's yaml source. This is not a perfect
        solution because it won't work with job resuming, which would
        require saving the state of the dataset's random number
        generator.

        Like in the Model class, we also need to avoid saving any
        Theano functions, so we delete everything that can be
        regenerated with `redo_theano` by deleting the fields in
        `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, six.string_types):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        # Shallow copy taken while _datasets holds the yaml sources; the
        # live object gets its real datasets back right after.
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt
        """
        if six.PY3:
            numeric = (float, int)
        else:
            numeric = (float, int, long)  # noqa

        if isinstance(val, numeric):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt,)

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and
                        dataset_source is not None and
                        (not isinstance(dataset_source, tuple) or
                            len(dataset_source) == 1) and
                        'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2 and
                        dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s"
                                     % (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt,)
        # Every non-shared, non-constant input of `val` must be one of the
        # declared inputs, otherwise it could not be fed at monitoring time.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel '+name+'\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            reraise_as(ValueError("The dataset specified is not one of the " +
                                  "monitor's datasets"))

        if ((self.on_channel_conflict not in
             ('error', 'copy_history', 'overwrite'))):
            raise ValueError("on_channel_conflict should be either 'error', " +
                             "'copy_history', or 'overwrite'")

        if name in self.channels and self.on_channel_conflict == 'error':
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        elif ((name in self.channels and
               self.on_channel_conflict == 'copy_history')):
            self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                                 dataset, prereqs,
                                                 self.channels[name])
        elif ((name not in self.channels or
               self.on_channel_conflict == 'overwrite')):
            self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                                 dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term
        fix. I think it requires making theano graphs serializable
        first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified
            in `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        batch_size : int
            The size of the batches used for monitoring
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        .. todo::

            WRITEME

        Returns
        -------
        num_batches : int
            The number of batches used for monitoring
        """
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method. The dictionary is not
            modified.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()`
            (but not for the extra_costs).
        """

        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)
        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            # Work on a copy: the main cost is stored under '' below, and
            # writing it into the caller's dictionary would make a second
            # call with the same dictionary fail the assertion.
            costs = dict(extra_costs)
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(space.make_theano_batch(name='monitor_%s' % source,
                                            batch_size=None)
                    for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
                # Only the main cost receives the extra keyword arguments,
                # as documented above.
                channel_kwargs = cost_monitoring_args
            else:
                prefix = cost_name + '_'
                channel_kwargs = {}
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt,
                                                        **channel_kwargs)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
def __init__(self, decay=0.9, max_scaling=1e5):
    """
    Store the hyper-parameters and create the empty gradient-statistics map.

    Parameters
    ----------
    decay : float, optional
        Must lie in [0, 1). Kept as a shared variable named 'decay'.
    max_scaling : float, optional
        Must be positive. Its reciprocal is stored as `self.epsilon`.
    """
    assert 0. <= decay and decay < 1.
    assert max_scaling > 0
    self.decay = sharedX(decay, 'decay')
    reciprocal = 1. / max_scaling
    self.epsilon = reciprocal
    self.mean_square_grads = OrderedDict()
class DRPROP(LearningRule):
    """
    Sign-based (RPROP-style) learning rule with small-gradient damping.

    Every parameter element has its own step size `delta`, adapted as in
    RPROP: multiplied by `increase_rate` when the current and stored
    gradients agree in sign, by `decrease_rate` when they disagree, and
    clipped to `[min_rate, max_rate]`. In addition, each element counts
    (up to 10) the consecutive steps whose gradient magnitude was below
    `switching_threshold`; while that counter is non-zero the step is
    divided by `2 ** (counter + 1)` and its direction depends on whether
    the gradient magnitude grew relative to the stored gradient.

    Parameters
    ----------
    decrease_rate : float, optional
        Factor (< 1) applied to the step size after a gradient sign change.
    increase_rate : float, optional
        Factor (> 1) applied to the step size when the sign is unchanged.
    min_rate : float, optional
        Lower clipping bound for the step size.
    max_rate : float, optional
        Upper clipping bound for the step size.
    switching_threshold : float, optional
        Gradient magnitude below which an element counts as "small".
    """

    def __init__(
            self,
            decrease_rate=0.5,
            increase_rate=1.2,
            min_rate=1e-6,
            max_rate=50,
            switching_threshold=1e-6
    ):
        assert increase_rate > 1.
        assert decrease_rate < 1.
        self.decrease_rate = sharedX(decrease_rate, 'decrease_rate')
        self.increase_rate = sharedX(increase_rate, 'increase_rate')
        self.min_rate = min_rate
        self.max_rate = max_rate
        self.switching_threshold = switching_threshold
        # Maps parameter -> shared per-element small-gradient counter.
        self.epsilons = OrderedDict()
        # Not populated by this class; kept so existing code that reads
        # these attributes keeps working.
        self.gt_epsilons = OrderedDict()
        self.lt_epsilons = OrderedDict()
        self.eq_epsilons = OrderedDict()

    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        Add channels for the two rates and for each small-gradient counter.

        For every tracked counter three channels are added: its sum,
        minimum and maximum.

        Parameters
        ----------
        monitor : Monitor
            Monitor the channels are added to.
        monitoring_dataset : Dataset
            Dataset passed through to `monitor.add_channel`.
        """
        monitor.add_channel(
            'rprop_decrease_rate',
            ipt=None,
            val=self.decrease_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        monitor.add_channel(
            'rprop_increase_rate',
            ipt=None,
            val=self.increase_rate,
            dataset=monitoring_dataset,
            data_specs=(NullSpace(), '')
        )
        for epsilon in self.epsilons.values():
            monitor.add_channel(
                epsilon.name + '_sum',
                ipt=None,
                val=T.sum(epsilon),
                dataset=monitoring_dataset,
                data_specs=(NullSpace(), '')
            )
            monitor.add_channel(
                epsilon.name + '_min',
                ipt=None,
                val=T.min(epsilon),
                dataset=monitoring_dataset,
                data_specs=(NullSpace(), '')
            )
            monitor.add_channel(
                epsilon.name + '_max',
                ipt=None,
                val=T.max(epsilon),
                dataset=monitoring_dataset,
                data_specs=(NullSpace(), '')
            )

    def get_updates(self, learning_rate, grads, lr_scalers=None,
                    global_error=None, dropout_mask=None):
        """
        Build the update dictionary for all parameters in `grads`.

        Parameters
        ----------
        learning_rate : shared variable
            Its current value is the initial step size for parameters that
            have no entry in `lr_scalers`.
        grads : dict
            Maps parameters to their gradients.
        lr_scalers : dict, optional
            Maps parameters to their initial step size. Treated as empty
            when None.
        global_error : object, optional
            Accepted but not used.
        dropout_mask : object, optional
            Accepted but not used.

        Returns
        -------
        updates : OrderedDict
            Updates for each parameter and its step size, stored gradient
            and small-gradient counter.
        """
        # The signature advertises a None default, so it must be usable.
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()
        # `.items()` works on both Python 2 and 3 (`iteritems` is 2-only).
        for param, grad in grads.items():
            # Create required shared variables
            lr = lr_scalers.get(param, learning_rate.get_value())
            zero_like_param = np.zeros_like(param.get_value())
            delta = sharedX(zero_like_param + lr, borrow=True)
            previous_grad = sharedX(np.zeros_like(zero_like_param),
                                    borrow=True)
            epsilons = sharedX(np.zeros_like(zero_like_param), borrow=True)

            # Name variables according to the parameter name. Only named
            # parameters are monitored, because the channel names are
            # derived from the counter's name.
            if param.name is not None:
                delta.name = 'delta_' + param.name
                epsilons.name = 'epsilons_' + param.name
                previous_grad.name = 'previous_grad_' + param.name
                self.epsilons[param] = epsilons

            # Positive: same sign as last time; negative: sign flipped;
            # zero: no stored gradient (or zero gradient).
            temp = grad * previous_grad

            # Consecutive small-gradient steps, reset by any larger
            # gradient and capped at 10.
            new_epsilons = T.clip(
                T.switch(
                    T.lt(T.abs_(grad), self.switching_threshold),
                    epsilons + 1.,
                    0.
                ),
                0.,
                10
            )

            delta_inc = T.switch(
                T.neq(grad, 0.),
                T.clip(
                    T.switch(
                        T.eq(temp, 0.),
                        delta,
                        T.switch(
                            T.lt(temp, 0.),
                            delta * self.decrease_rate,
                            delta * self.increase_rate
                        )
                    ),
                    self.min_rate,
                    self.max_rate
                ),
                delta
            )

            # Remember the gradient unless its sign just flipped, in which
            # case 0 is stored so the next step is not adapted again.
            # The comparison must be `>=`: the stored gradient starts at 0,
            # so with a strict `>` it could never become non-zero and the
            # step size would never adapt.
            previous_grad_inc = T.switch(
                T.neq(grad, 0.),
                T.switch(
                    T.ge(temp, 0.),
                    grad,
                    T.zeros_like(grad)
                ),
                previous_grad
            )

            unscaled_update = - delta_inc * T.sgn(grad)

            # Calculate updates of parameters: the plain sign step while
            # the counter is zero, otherwise a damped step whose direction
            # is reversed when the gradient magnitude grew.
            updated_inc = T.switch(
                T.lt(new_epsilons, 0.1),
                unscaled_update,
                T.switch(
                    T.gt(T.abs_(grad), T.abs_(previous_grad)),
                    - unscaled_update / (2 ** (new_epsilons + 1.)),
                    unscaled_update / (2 ** (new_epsilons + 1.))
                )
            )

            # Compile the updates
            updates[param] = param + updated_inc
            updates[delta] = delta_inc
            updates[previous_grad] = previous_grad_inc
            updates[epsilons] = new_epsilons

        return updates
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    Build the compiled functions used to minimize `objective` by
    batch gradient descent with a line search over step sizes.

    Every argument is first stored on the instance under its own name
    (via ``locals()``); a few attributes are then overwritten below.

    Parameters
    ----------
    objective : symbolic expression
        The quantity to minimize. It is differentiated with `grad`,
        compiled as `self.obj`, and its `dtype` selects the default
        `tol`.
    params : iterable of shared variables
        The variables to optimize. Each must provide `get_value()` and
        a `name` attribute (which may be None).
    inputs : list, optional
        Inputs of the compiled gradient and objective functions.
        An empty list is used when None.
    param_constrainers : list or tuple of callables, optional
        Each is called with the OrderedDict of parameter updates used
        by `_goto_alpha`, before that function is compiled.
    max_iter : int, optional
        Stored on the instance; not used in this method.
    lr_scalers : dict, optional
        Maps a parameter to a factor multiplying the step size `alpha`
        for that parameter.
    verbose : int or bool, optional
        When truthy, compilation progress is logged.
    tol : float, optional
        Stored as `self.tol`. When None, 1e-6 is used if
        `objective.dtype` is "float32" and 3e-7 otherwise.
    init_alpha : iterable of float, optional
        Step sizes, stored as a tuple of floats in `self.init_alpha`.
        When None the default depends on `line_search_mode`.
    min_init_alpha : float, optional
        Stored on the instance; not used in this method.
    reset_alpha : bool, optional
        Stored on the instance; not used in this method.
    conjugate : bool, optional
        When truthy, the functions `_store_old_grad` and
        `_make_conjugate` are also compiled.
    reset_conjugate : bool, optional
        Stored on the instance; not used in this method.
    gradients : dict, optional
        Maps a parameter to a gradient expression to use instead of
        `grad(objective, param)`.
    gradient_updates : dict, optional
        Extra updates merged into the updates of `_compute_grad`.
    line_search_mode : str, optional
        Either None or 'exhaustive' (enforced with an assertion).
    accumulate : bool, optional
        When truthy, `_compute_grad` and `obj` are built with
        `Accumulator` instead of `function`.
    theano_function_mode : object, optional
        Passed as `mode` to every call to `function`.
    """

    # Snapshot all arguments as attributes. Locals rebound after this
    # point (e.g. `inputs`, `param_constrainers`) do NOT change the
    # attribute of the same name unless it is assigned explicitly.
    self.__dict__.update(locals())
    del self.self

    # Pick the default candidate step sizes for the line search.
    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)
    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    # NOTE(review): param_to_grad_sym is filled in below but never read
    # in this method -- confirm whether it can be dropped.
    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    # For each parameter, allocate a shared buffer with the parameter's
    # shape (initialized to zeros) and schedule an update that writes
    # the gradient expression into it.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    # _compute_grad: evaluates the gradients on `inputs` and stores them
    # in the shared gradient buffers.
    if self.verbose:
        logger.info('batch gradient class compiling gradient function')
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.info('done. Took {0}'.format(t2-t1))

    # obj: evaluates the objective on `inputs`.
    if self.verbose:
        logger.info('batch gradient class compiling objective function')
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        logger.info('done')

    # Each parameter gets a cache holding a copy of its value.
    # _cache_values refreshes the caches from the parameters;
    # _goto_alpha(alpha) sets each parameter to
    # cache - alpha * lr_scaler * stored_gradient.
    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    # Constrainers receive the update dictionary before compilation;
    # presumably they modify it in place -- their return value is
    # ignored here.
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function(
        [alpha],
        updates=goto_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._goto_alpha')

    # _normalize_grad: returns the L2 norm taken over all stored
    # gradients and divides every stored gradient by that norm.
    norm = T.sqrt(sum([T.sqr(elem).sum() for elem in
                       self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    # ave_grad_size is updated to a weighted average of the new norm
    # and its previous value, with weight `new_weight` on the new norm.
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = \
        self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

    self._normalize_grad = \
        function([],
                 norm,
                 updates=normalize_grad_updates,
                 mode=self.theano_function_mode,
                 name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        # One "old gradient" shared variable per stored gradient.
        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = \
                sharedX(elem.get_value(), 'old_'+elem.name)

        # _store_old_grad(norm): saves stored_gradient * norm into the
        # matching old-gradient variable. The symbolic `norm` expression
        # itself is used as the function's input variable.
        self._store_old_grad = \
            function([norm],
                     updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                         for g_ in grad_to_old_grad]),
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        # Sum of elementwise products over two parallel lists of tensors.
        def dot_product(x, y):
            return sum([(x_elem * y_elem).sum()
                       for x_elem, y_elem in safe_zip(x, y)])

        # The 1e-7 term keeps the denominator away from zero.
        beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        # beta_pr is the Polak-Ribiere formula for beta.
        # According to wikipedia, the beta to use for NCG is "a matter of
        # heuristics or taste" but max(0, beta_pr) is "a popular choice...
        # which provides direction reset automatically." (ie, it is meant
        # to revert to steepest descent when you have traveled far enough
        # that the objective function is behaving non-quadratically enough
        # that the conjugate gradient formulas aren't working anymore)

        # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        # NOTE(review): `grad` is the module-level name called above as
        # grad(objective, param), not one of the stored gradients, so
        # this assertion looks vacuous -- confirm the intent.
        assert grad not in grad_to_old_grad

        # _make_conjugate: replaces each stored gradient g by
        # g + beta * old_g.
        make_conjugate_updates = \
            [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

        # When the mode object exposes a `record` attribute, log a
        # description of every updated variable and update expression.
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate var '
                    + var_descriptor(v) + '\n')
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate update '
                    + var_descriptor(u) + '\n')

        self._make_conjugate = \
            function([], updates=make_conjugate_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate output '
                    + var_descriptor(output) + '\n')

    # Default tolerance depends on the precision of the objective.
    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    # Shared scalars initialized to 0; they are not updated in this
    # method (presumably monitoring statistics maintained elsewhere).
    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
class Recurrent(Layer):
    """
    A recurrent neural network layer, passing on all hidden states or a
    selection of them to the next layer. The activation function
    defaults to the hyperbolic tangent.

    The hidden state is initialized to zeros.

    Parameters
    ----------
    dim : int
        The number of elements in the hidden layer
    layer_name : str
        The name of the layer. All layers in an MLP must have a unique name.
    irange : float
        The input-to-hidden weight matrix is initialized with weights in
        the uniform interval (-irange, irange). The hidden-to-hidden
        matrix is initialized differently: it is the orthogonal factor
        of the QR decomposition of a matrix of standard normal samples.
    indices : list of integers, optional
        If specified this layer will return only the given hidden
        states instead of the whole sequence.
        Note: For now only the list [-1] is supported, in which case the
        output space is a VectorSpace.
    init_bias : float, optional
        Set an initial bias to be added at each time step. Defaults to 0.
    nonlinearity : callable, optional
        Applied to the pre-activation at every time step. Defaults to
        `tensor.tanh`.
    weight_noise : bool, optional
        Additive Gaussian noise applied to parameters
    noise_std_dev : float, optional
        Keyword-only. Standard deviation of the weight noise; defaults
        to .075 and is ignored (reset to None) unless `weight_noise` is
        set.
    """
    def __init__(self, dim, layer_name, irange, indices=None,
                 init_bias=0., nonlinearity=tensor.tanh,
                 weight_noise=False, **kwargs):
        self._std_dev = kwargs.pop('noise_std_dev', .075)
        self.rnn_friendly = True
        self._scan_updates = OrderedDict()
        # Store every constructor argument (and the remaining `kwargs`)
        # as an attribute of the same name. No new local may be
        # introduced above this line, or it would be stored as well.
        self.__dict__.update(locals())
        del self.self
        super(Recurrent, self).__init__()
        if not self.weight_noise:
            self._std_dev = None

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        # Validate the input space, derive the output space and
        # allocate the parameters W, U and b.
        if (not isinstance(space, (SequenceSpace, SequenceDataSpace)) or
                not isinstance(space.space, VectorSpace)):
            # Adjacent literals are used instead of a backslash inside
            # the string, which would embed the continuation line's
            # indentation in the message.
            raise ValueError("Recurrent layer needs a SequenceSpace("
                             "VectorSpace) or SequenceDataSpace(VectorSpace) "
                             "as input but received %s instead"
                             % (space,))

        self.input_space = space

        if self.indices is not None:
            if len(self.indices) > 1:
                raise ValueError("Only indices = [-1] is supported right now")
            assert self.indices == [-1], "Only indices = [-1] works now"
            self.output_space = VectorSpace(dim=self.dim)
        elif isinstance(self.input_space, SequenceSpace):
            self.output_space = SequenceSpace(VectorSpace(dim=self.dim))
        elif isinstance(self.input_space, SequenceDataSpace):
            self.output_space = SequenceDataSpace(VectorSpace(dim=self.dim))

        # Initialize the parameters
        rng = self.mlp.rng
        if self.irange is None:
            raise ValueError("Recurrent layer requires an irange value in "
                             "order to initialize its weight matrices")

        input_dim = self.input_space.dim

        # W is the input-to-hidden matrix
        W = rng.uniform(-self.irange, self.irange, (input_dim, self.dim))

        # U is the hidden-to-hidden transition matrix: the orthogonal
        # factor Q of the QR decomposition of a Gaussian matrix
        U = rng.randn(self.dim, self.dim)
        U, _ = scipy.linalg.qr(U)

        # b is the bias
        b = np.zeros((self.dim,))

        self._params = [
            sharedX(W, name=(self.layer_name + '_W')),
            sharedX(U, name=(self.layer_name + '_U')),
            sharedX(b + self.init_bias,
                    name=(self.layer_name + '_b'))
        ]

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        # Channels: min/mean/max of the row and column norms of W and U
        # and, when a state is available, statistics of the hidden
        # state reduced first over axis 0 and then over what remains.
        W, U, b = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        row_norms = tensor.sqrt(sq_W.sum(axis=1))
        col_norms = tensor.sqrt(sq_W.sum(axis=0))
        u_row_norms = tensor.sqrt(sq_U.sum(axis=1))
        u_col_norms = tensor.sqrt(sq_U.sum(axis=0))

        rval = OrderedDict([('W_row_norms_min',  row_norms.min()),
                            ('W_row_norms_mean', row_norms.mean()),
                            ('W_row_norms_max',  row_norms.max()),
                            ('W_col_norms_min',  col_norms.min()),
                            ('W_col_norms_mean', col_norms.mean()),
                            ('W_col_norms_max',  col_norms.max()),
                            ('U_row_norms_min', u_row_norms.min()),
                            ('U_row_norms_mean', u_row_norms.mean()),
                            ('U_row_norms_max', u_row_norms.max()),
                            ('U_col_norms_min', u_col_norms.min()),
                            ('U_col_norms_mean', u_col_norms.mean()),
                            ('U_col_norms_max', u_col_norms.max())])

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)
            # fprop returns a (states, mask) pair only when `indices` is
            # None, so unpack based on what was actually received rather
            # than on the type of the input space. `state_below` is not
            # needed past this point and may legitimately be None, so it
            # is no longer unpacked.
            if isinstance(state, tuple):
                state, _ = state

            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            rval['range_x_max_u'] = rg.max()
            rval['range_x_mean_u'] = rg.mean()
            rval['range_x_min_u'] = rg.min()

            rval['max_x_max_u'] = mx.max()
            rval['max_x_mean_u'] = mx.mean()
            rval['max_x_min_u'] = mx.min()

            rval['mean_x_max_u'] = mean.max()
            rval['mean_x_mean_u'] = mean.mean()
            rval['mean_x_min_u'] = mean.min()

            rval['min_x_max_u'] = mn.max()
            rval['min_x_mean_u'] = mn.mean()
            rval['min_x_min_u'] = mn.min()

        return rval

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # When random variables are used in the scan function the updates
        # dictionary returned by scan might not be empty, and needs to be
        # added to the updates dictionary before compiling the training
        # function
        if any(key in updates for key in self._scan_updates):
            # Don't think this is possible, but let's check anyway
            raise ValueError("A single shared variable is being updated by "
                             "multiple scan functions")
        updates.update(self._scan_updates)

    def add_noise(self, param):
        """
        A function that adds additive Gaussian noise

        The noise is drawn from `self.mlp.theano_rng` with zero mean and
        standard deviation `self._std_dev`, with the shape and dtype of
        `param`.

        Parameters
        ----------
        param : sharedX
            model parameter to be regularized

        Returns
        -------
        param : sharedX
            model parameter with additive noise
        """
        # NOTE(review): presumably `+=` on a symbolic variable rebinds
        # the local name to a new expression rather than mutating the
        # shared variable -- confirm.
        param += self.mlp.theano_rng.normal(size=param.shape,
                                            avg=0.,
                                            std=self._std_dev,
                                            dtype=param.dtype)

        return param

    @wraps(Layer.fprop)
    def fprop(self, state_below, return_all=False):
        # `return_all` is accepted but not used by this implementation.
        # Returns z[-1]-style selections when `indices` is set, and the
        # pair (all hidden states, mask) otherwise; mask is None when
        # `state_below` was not given as a (data, mask) tuple.
        if isinstance(state_below, tuple):
            state_below, mask = state_below
        else:
            mask = None

        # z0 is the initial hidden state which is (batch size, output dim)
        z0 = tensor.alloc(np.cast[config.floatX](0), state_below.shape[1],
                          self.dim)

        if self.dim == 1:
            # This should fix the bug described in Theano issue #1772
            z0 = tensor.unbroadcast(z0, 1)

        # Later we will add a noise function
        W, U, b = self._params
        if self.weight_noise:
            W = self.add_noise(W)
            U = self.add_noise(U)

        # It is faster to do the input-to-hidden matrix multiplications
        # outside of scan
        state_below = tensor.dot(state_below, W) + b

        if mask is not None:
            z, updates = scan(fn=self.fprop_step_mask,
                              sequences=[state_below, mask],
                              outputs_info=[z0],
                              non_sequences=[U])
        else:
            z, updates = scan(fn=self.fprop_step,
                              sequences=[state_below],
                              outputs_info=[z0],
                              non_sequences=[U])

        # Remember scan's updates so _modify_updates can merge them into
        # the training updates.
        self._scan_updates.update(updates)

        if self.indices is not None:
            if len(self.indices) > 1:
                return [z[i] for i in self.indices]
            else:
                return z[self.indices[0]]
        else:
            return (z, mask)

    def fprop_step_mask(self, state_below, mask, state_before, U):
        """
        Scan function for case using masks

        Parameters
        ----------
        state_below : TheanoTensor
            Input-to-hidden pre-activation for the current time step
            (the product with W and the bias are already applied).
        mask : TheanoTensor
            Mask for the current time step, indexed as `mask[:, None]`
            to multiply the state; 1 means "update", 0 means "keep the
            previous state".
        state_before : TheanoTensor
            Hidden state from the previous time step.
        U : TheanoTensor
            Hidden-to-hidden transition matrix.

        Returns
        -------
        z : TheanoTensor
            Hidden state for the current time step.
        """

        z = self.nonlinearity(state_below +
                              tensor.dot(state_before, U))

        # Only update the state for non-masked data, otherwise
        # just carry on the previous state until the end
        z = mask[:, None] * z + (1 - mask[:, None]) * state_before
        return z

    def fprop_step(self, state_below, state_before, U):
        """
        Scan function for case without masks

        Parameters
        ----------
        state_below : TheanoTensor
            Input-to-hidden pre-activation for the current time step
            (the product with W and the bias are already applied).
        state_before : TheanoTensor
            Hidden state from the previous time step.
        U : TheanoTensor
            Hidden-to-hidden transition matrix.

        Returns
        -------
        z : TheanoTensor
            Hidden state for the current time step.
        """

        z = self.nonlinearity(state_below +
                              tensor.dot(state_before, U))
        return z