def get_monitoring_channels(self, model, data, **kwargs):
    """
    Build monitoring channels for the individual cost terms.

    The reconstruction and classification costs are reported as
    separate channels rather than only through their sum. Subclasses can
    override this method to monitor more things about the model.

    Parameters
    ----------
    model : object
        Forwarded to `get_data_specs` and `_get_samples_from_model`.
    data : object
        Batch validated against the space returned by
        `self.get_data_specs(model)`.
    kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    channels : OrderedDict
        Empty when there is at most one cost; otherwise it holds the
        keys 'reconstruction_cost' and 'classification_cost'.
    """
    data_space = self.get_data_specs(model)[0]
    data_space.validate(data)

    channels = OrderedDict()
    # With a single cost there is nothing to split up.
    if len(self.costs) <= 1:
        return channels

    samples = self._get_samples_from_model(model, data)
    labelled_terms = (('reconstruction_cost', 0),
                      ('classification_cost', 1))
    for channel_name, index in labelled_terms:
        channels[channel_name] = self._get_total_for_cost(
            index, self.costs[index][2], data, samples)
    return channels
def get_learn_func(self):
    """
    Compile the function that updates the agent from one experience.

    Returns
    -------
    learn : theano function
        Takes an integer scalar action and a scalar reward. Calling it
        updates `self.estimated_rewards` and `self.observation_counts`
        at the index of that action; it has no outputs.
    """
    action = T.iscalar()
    reward = T.scalar()

    previous_estimate = self.estimated_rewards[action]
    count = self.observation_counts[action] + 1.
    # Incremental mean: new = old + (reward - old) / count.
    estimate = previous_estimate + (reward - previous_estimate) / count

    updates = OrderedDict()
    updates[self.estimated_rewards] = T.set_subtensor(
        self.estimated_rewards[action], estimate)
    updates[self.observation_counts] = T.set_subtensor(
        self.observation_counts[action], count)
    return function([action, reward], updates=updates)
def initialize(self):
    """
    Create the initial parameter values of this layer.

    Every input weight matrix and the bias have `4 * nout + Nm` columns,
    where `Nm` is the number of recurrent connections. Each recurrent
    matrix is assembled from four `init_U.ortho` blocks of width `nout`
    followed by one `init_U.rand` block of width `Nm`.

    Returns
    -------
    params : OrderedDict
        Maps 'W_<parent>__<name>', 'U_<recurrent>__<name>' and
        'b_<name>' to their initial values.
    """
    params = OrderedDict()
    N = self.nout
    Nm = len(self.recurrent)
    width = 4 * N + Nm

    for parname, parout in self.parent.items():
        params['W_' + parname + '__' + self.name] = \
            self.init_W.get((parout, width))

    for recname, recout in self.recurrent.items():
        M = recout
        # `range` (not the Python-2-only `xrange`) keeps this portable.
        # The initializers are called in the same order as before:
        # four orthogonal blocks, then the random block.
        blocks = [self.init_U.ortho((M, N)) for _ in range(4)]
        blocks.append(self.init_U.rand((M, Nm)))
        params['U_' + recname + '__' + self.name] = \
            np.concatenate(blocks, axis=-1)

    params['b_' + self.name] = self.init_b.get(width)
    return params
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Return the monitoring channels contributed by this cost.

    This default implementation only validates `data` and contributes
    no channels.

    Parameters
    ----------
    model : Model
        The model to use to compute the monitoring channels.
    data : batch (a member of self.get_data_specs()[0])
        Symbolic expressions for the monitoring data.
    kwargs : dict
        Used so that custom algorithms can use extra variables for
        monitoring.

    Returns
    -------
    rval : dict
        Maps channel names to expressions for channel values. Always
        an empty OrderedDict here.
    """
    data_space = self.get_data_specs(model)[0]
    data_space.validate(data)
    channels = OrderedDict()
    return channels
def __call__(self, inputs):
    """
    Map a batch of inputs to the upward state of the DBM's last layer.

    The visible layer is set to `inputs` and clamped, one call to
    `mcmc_steps` with `num_steps=1` is made, and the resulting state of
    the last layer is passed through its `upward_state`.

    Parameters
    ----------
    inputs : batch
        Batch belonging to the DBM's input space.

    Returns
    -------
    rval : object
        `upward_state` of the last layer's chain state.
    """
    dbm = self.dbm
    rng = self.theano_rng

    batch_size = dbm.get_input_space().batch_size(inputs)
    top_layer = dbm.get_all_layers()[-1]

    chains = dbm.make_layer_to_symbolic_state(batch_size, rng)
    # The examples are used to initialize the visible layer's chains.
    chains[dbm.visible_layer] = inputs
    clamped = OrderedDict([(dbm.visible_layer, True)])

    chains = dbm.mcmc_steps(chains, rng,
                            layer_to_clamp=clamped,
                            num_steps=1)
    return top_layer.upward_state(chains[top_layer])
def _get_positive_phase(self, model, X, Y=None):
    """
    Return the sampling-based positive phase and an empty update dict.

    Parameters
    ----------
    model : object
        Forwarded to `_get_sampling_pos`.
    X : object
        Forwarded to `_get_sampling_pos`.
    Y : object, optional
        Forwarded to `_get_sampling_pos`.

    Returns
    -------
    positive_phase : object
        Result of `self._get_sampling_pos(model, X, Y)`.
    updates : OrderedDict
        Always empty.
    """
    positive_phase = self._get_sampling_pos(model, X, Y)
    updates = OrderedDict()
    return positive_phase, updates
def get_monitoring_channels(self, data):
    """
    Get monitoring channels for this model.

    This base implementation validates `data` against the monitoring
    data specs and contributes no channels.

    Parameters
    ----------
    data : tensor_like, or (possibly nested) tuple of tensor_likes
        Data on which the monitoring quantities will be calculated
        (e.g., a validation set). See
        `self.get_monitoring_data_specs()`.

    Returns
    -------
    channels : OrderedDict
        A dictionary with strings as keys, mapping channel names to
        symbolic values that depend on the variables in `data`.

    Notes
    -----
    Overriding implementations can use any channel names, as long as
    they do not collide with names made by the training Cost, etc.
    Anything worth monitoring during training can be added here; it is
    usually best to control which channels get added through a config
    option of the model.
    """
    # Unpacking into two names also checks that the specs are a pair.
    monitoring_space, _source = self.get_monitoring_data_specs()
    monitoring_space.validate(data)
    channels = OrderedDict()
    return channels
def initialize(self):
    """
    Create the initial parameter values of this stacked layer.

    The first layer gets one `(parent_out, nout)` weight matrix per
    parent and, when `self.use_bias` is set, one bias. Every following
    layer transition `l -> l + 1` gets two `(nout, nout)` matrices
    (`W` and `C`) and two biases (`b` from `init_b`, `b_C` from
    `init_b_C`).

    Returns
    -------
    params : OrderedDict
        Maps parameter names to their initial values, in creation order.
    """
    params = OrderedDict()
    for parname, parout in self.parent.items():
        W_name = 'W_' + parname + '__' + self.name + '_h1'
        params[W_name] = self.init_W.get((parout, self.nout))
    if self.use_bias:
        params['b_' + self.name + '_h1'] = self.init_b.get(self.nout)

    square = (self.nout, self.nout)
    # `range` (not the Python-2-only `xrange`) keeps this portable.
    for l in range(1, self.num_layers):
        suffix = '_h%d__h%d' % (l, l + 1)
        # Initializers are called in the original order: W, C, b, b_C.
        params[self.name + '_W' + suffix] = self.init_W.get(square)
        params[self.name + '_C' + suffix] = self.init_W.get(square)
        params[self.name + '_b' + suffix] = self.init_b.get(self.nout)
        params[self.name + '_b_C' + suffix] = self.init_b_C.get(self.nout)
    return params
def unzip(zipped):
    """
    Pull the current values out of a mapping of shared variables.

    Parameters
    ----------
    zipped : mapping
        Maps names to objects exposing `get_value()`.

    Returns
    -------
    new_params : OrderedDict
        Maps each name to the result of `get_value()`, in the iteration
        order of `zipped`.
    """
    # `items()` exists on both Python 2 and 3 mappings, unlike
    # `iteritems()`.
    return OrderedDict((name, shared.get_value())
                       for name, shared in zipped.items())
def __init__(self, valid=None, invalid=None, valid_equivalent=None):
    '''
    Check if variables can be expressed without using variables in
    invalid.

    Parameters
    ----------
    valid : iterable, optional
        Nodes that are valid to have in the graph computing outputs.
    invalid : iterable, optional
        Nodes that are NOT valid to have in the graph computing outputs.
    valid_equivalent : dict, optional
        Maps some invalid variables to valid ones that can be used
        instead. A copy is stored; its keys are also recorded as
        invalid and its values as valid.
    '''
    equivalents = (OrderedDict() if valid_equivalent is None
                   else valid_equivalent)

    self.valid = set([] if valid is None else valid)
    self.invalid = set([] if invalid is None else invalid)
    # Keep a private copy so later changes by the caller are not seen.
    self.valid_equivalent = equivalents.copy()

    self.valid.update(equivalents.values())
    self.invalid.update(equivalents.keys())
def get_monitoring_channels(self, model, X, Y=None, **kwargs):
    """
    Collect the monitoring channels of every cost in the sum.

    Each cost contributes its own channels. In addition, the value of
    each cost that is not None is added as a channel named
    'term_<index>', with '_<value.name>' appended when the value has a
    non-None `name`.

    Parameters
    ----------
    model : object
        Forwarded to every cost.
    X : object
        Forwarded to every cost.
    Y : object, optional
        Targets. Passed to every cost's `get_monitoring_channels`, but
        replaced by None when evaluating a cost that is not supervised.
    kwargs : dict
        Forwarded to every cost.

    Returns
    -------
    rval : OrderedDict
        Maps channel names to channel values.

    Raises
    ------
    ValueError
        If `Y` is None while `self.supervised` is true.
    TypeError
        Re-raised from a cost's `get_monitoring_channels` after the
        offending cost type has been printed.
    """
    if Y is None and self.supervised:
        raise ValueError("no targets provided while some of the " +
                         "costs in the sum are supervised costs")

    rval = OrderedDict()
    for i, cost in enumerate(self.costs):
        try:
            rval.update(cost.get_monitoring_channels(
                model, X, Y, **kwargs))
        except TypeError:
            # A single parenthesised argument prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print('SumOfCosts.get_monitoring_channels encountered '
                  'TypeError while calling ' +
                  str(type(cost)) + '.get_monitoring_channels')
            raise

        # Costs that are not supervised must not receive the targets.
        Y_to_pass = Y if cost.supervised else None
        value = cost(model, X, Y_to_pass, **kwargs)
        if value is None:
            continue

        value_name = getattr(value, 'name', None)
        suffix = '' if value_name is None else '_' + value_name
        rval['term_' + str(i) + suffix] = value
    return rval
def __init__(self, model):
    """
    Set up running means of the parameters of `model`.

    For every parameter a shared variable initialised to the
    parameter's current value is stored in `self.param_to_mean`.
    `self.avg` is a compiled function with no inputs that applies the
    update `mean <- mean - (mean - param) / t` to each of them and then
    increments the shared counter `t`, which starts at 1.

    Parameters
    ----------
    model : object
        Must provide `get_params()` returning shared variables.
    """
    step = sharedX(1.)
    self.param_to_mean = OrderedDict()
    updates = OrderedDict()

    for param in model.get_params():
        running_mean = sharedX(param.get_value())
        assert type(running_mean) == type(param)
        self.param_to_mean[param] = running_mean
        updates[running_mean] = (
            running_mean - (running_mean - param) / step)

    updates[step] = step + 1.
    self.avg = function([], updates=updates)
def two_step_backprop(mlp):
    """
    Split backpropagation through `mlp` into two compiled functions.

    Parameters
    ----------
    mlp : SimpleMLP
        Must provide `fprop`, `w_out` and `W_hid`.

    Returns
    -------
    f1 : theano function
        Takes a minibatch of examples and a minibatch of targets.
        Returns the gradient of the loss on `mlp.w_out` and, as the
        auxiliary value, the gradient of the loss on the hidden
        activations.
    f2 : theano function
        Takes a minibatch of examples and the auxiliary value returned
        by `f1`. Returns the gradient of the loss on `mlp.W_hid`; that
        gradient is requested through `known_grads` rather than from
        the loss.
    """
    # Run fprop
    examples = T.matrix()
    targets = T.vector()
    hidden, prediction = mlp.fprop(examples)
    cost = loss(prediction, targets)

    grad_w_out, grad_hidden = T.grad(cost, [mlp.w_out, hidden])
    first_step = function([examples, targets], [grad_w_out, grad_hidden])

    # Continue the backward pass from the supplied hidden gradient
    # instead of from a cost.
    seeded = OrderedDict([(hidden, grad_hidden)])
    grad_W_hid = T.grad(None, mlp.W_hid, known_grads=seeded)
    second_step = function([examples, grad_hidden], grad_W_hid)

    return first_step, second_step
def get_monitoring_channels(self, data):
    """
    Collect monitoring channels from the posterior, conditional and
    prior.

    Parameters
    ----------
    data : batch
        Validated against the space of
        `self.get_monitoring_data_specs()`.

    Returns
    -------
    rval : OrderedDict
        Channels merged with `safe_update`, in the order posterior,
        conditional, prior.

    Notes
    -----
    Monitors quantities related to the approximate posterior parameters
    phi and the conditional and prior parameters theta.
    """
    space, source = self.get_monitoring_data_specs()
    space.validate(data)

    channels = OrderedDict()
    X = data

    # One noise sample per example: shape (1, batch size, nhid).
    noise = self.sample_from_epsilon(shape=(1, X.shape[0], self.nhid))
    phi = self.encode_phi(X)
    latent = self.sample_from_q_z_given_x(epsilon=noise, phi=phi)
    # Merge the first two noise axes into a single leading axis.
    latent = latent.reshape((noise.shape[0] * noise.shape[1],
                             noise.shape[2]))
    theta = self.decode_theta(latent)

    from_posterior = \
        self.posterior.monitoring_channels_from_conditional_params(phi)
    safe_update(channels, from_posterior)

    from_conditional = \
        self.conditional.monitoring_channels_from_conditional_params(theta)
    safe_update(channels, from_conditional)

    from_prior = self.prior.monitoring_channels_from_prior_params()
    safe_update(channels, from_prior)

    return channels
def get_fixed_var_descr(self, model, X, Y):
    """
    Build the FixedVarDescr holding the drop masks used by this cost.

    A shared variable `drop_mask` (plus `drop_mask_Y` when
    `self.supervised` is true) is created and registered in
    `rval.fixed_vars`. A function taking `[X, Y]` that refreshes those
    masks from `self.mask_gen` is compiled and stored as the single
    entry of `rval.on_load_batch`. If the model's inference procedure
    has a `V_dropout` attribute, the same function also resamples its
    `V_dropout` and `H_dropout` variables.

    Parameters
    ----------
    model : object
        Provides `batch_size`, `get_input_space()`,
        `inference_procedure` and `hidden_layers`.
    X : object
        Symbolic input batch.
    Y : object
        Symbolic target batch; must not be None.

    Returns
    -------
    rval : FixedVarDescr
    """
    assert Y is not None
    batch_size = model.batch_size
    drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
    drop_mask_X.name = 'drop_mask'
    X_space = model.get_input_space()
    updates = OrderedDict()
    rval = FixedVarDescr()
    inputs=[X, Y]
    if not self.supervised:
        update_X = self.mask_gen(X, X_space = X_space)
    else:
        drop_mask_Y = sharedX(np.ones(batch_size,))
        drop_mask_Y.name = 'drop_mask_Y'
        update_X, update_Y = self.mask_gen(X, Y, X_space)
        updates[drop_mask_Y] = update_Y
        rval.fixed_vars['drop_mask_Y'] = drop_mask_Y
    if self.mask_gen.sync_channels:
        # The generated mask has one axis fewer than the stored mask
        # (asserted below), so it is given a broadcastable trailing
        # axis and expanded to the shape of X by adding zeros.
        # NOTE(review): presumably the missing axis is the channel
        # axis -- confirm against mask_gen.
        n = update_X.ndim
        assert n == drop_mask_X.ndim - 1
        update_X.name = 'raw_update_X'
        zeros_like_X = T.zeros_like(X)
        zeros_like_X.name = 'zeros_like_X'
        update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
        update_X.name = 'update_X'
    updates[drop_mask_X] = update_X
    rval.fixed_vars['drop_mask'] = drop_mask_X
    if hasattr(model.inference_procedure, 'V_dropout'):
        include_prob = model.inference_procedure.include_prob
        include_prob_V = model.inference_procedure.include_prob_V
        include_prob_Y = model.inference_procedure.include_prob_Y
        # The seed is a constant, so the stream is seeded identically
        # every time this method runs.
        theano_rng = MRG_RandomStreams(2012+11+20)
        # Each mask below is a single-trial binomial draw divided by
        # its inclusion probability.
        for elem in flatten([model.inference_procedure.V_dropout]):
            updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
        # When the last hidden layer's type name contains "Softmax",
        # its mask is drawn with include_prob_Y and excluded from the
        # generic hidden-layer loop.
        if "Softmax" in str(type(model.hidden_layers[-1])):
            hid = model.inference_procedure.H_dropout[:-1]
            y = model.inference_procedure.H_dropout[-1]
            updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
        else:
            hid = model.inference_procedure.H_dropout
        for elem in flatten(hid):
            updates[elem] = theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob
    rval.on_load_batch = [utils.function(inputs, updates=updates)]
    return rval
def init_tparams(params):
    """
    Wrap every initial parameter value in a Theano shared variable.

    Parameters
    ----------
    params : mapping
        Maps parameter names to their initial values.

    Returns
    -------
    tparams : OrderedDict
        Maps each name to `theano.shared(castX(value), name=name)`, in
        the iteration order of `params`.
    """
    tparams = OrderedDict()
    # `items()` exists on both Python 2 and 3 mappings, unlike
    # `iteritems()`. The loop value is used directly instead of looking
    # the key up a second time.
    for name, value in params.items():
        tparams[name] = theano.shared(castX(value), name=name)
    return tparams
def _get_standard_neg(self, model, layer_to_chains):
    """
    Compute the negative-phase gradients from the chain states.

    The mean energy of the chain states is negated and differentiated
    with respect to the model parameters, treating the samples as
    constants and ignoring disconnected inputs.

    Parameters
    ----------
    model : object
        Provides `get_params`, `energy`, `visible_layer` and
        `hidden_layers`.
    layer_to_chains : dict
        Maps each layer to the state of its chains.

    Returns
    -------
    neg_phase_grads : OrderedDict
        Maps each parameter to its negative-phase gradient.

    Notes
    -----
    Samples whose `name` is None are named 'sample_<i>' as a side
    effect. A warning about the variance of this estimator is issued on
    every call.
    """
    params = list(model.get_params())

    warnings.warn("""TODO: reduce variance of negative phase by integrating out the even-numbered layers. The Rao-Blackwellize method can do this for you when expected gradient = gradient of expectation, but doing this in general is trickier.""")

    visible_state = layer_to_chains[model.visible_layer]
    hidden_states = [layer_to_chains[layer]
                     for layer in model.hidden_layers]
    expected_energy_p = model.energy(visible_state, hidden_states).mean()

    samples = flatten(layer_to_chains.values())
    for index, sample in enumerate(samples):
        if sample.name is None:
            sample.name = 'sample_' + str(index)

    grads = T.grad(-expected_energy_p, params,
                   consider_constant=samples,
                   disconnected_inputs='ignore')
    return OrderedDict(safe_zip(params, grads))
def get_lr_scalers(self):
    """
    Return the learning-rate scalers.

    Returns
    -------
    lr_scalers : OrderedDict
        Always empty in this implementation.
    """
    lr_scalers = OrderedDict()
    return lr_scalers
def _get_positive_phase(self, model, X, Y=None):
    """
    Return the variational positive phase and an empty update dict.

    Parameters
    ----------
    model : object
        Forwarded to `_get_variational_pos`.
    X : object
        Forwarded to `_get_variational_pos`.
    Y : object, optional
        Forwarded to `_get_variational_pos`.

    Returns
    -------
    positive_phase : object
        Result of `self._get_variational_pos(model, X, Y)`.
    updates : OrderedDict
        Always empty.
    """
    positive_phase = self._get_variational_pos(model, X, Y)
    updates = OrderedDict()
    return positive_phase, updates
def on_monitor(self, model, dataset, algorithm):
    """
    Make sure Polyak-averaged model gets monitored.
    Save the model if necessary.

    When the call count reaches `self.start`, a `_PolyakWorker` is
    created and appended to `algorithm.update_callbacks`. On later
    calls, if `self.save_path` is set and the count is a multiple of
    `self.save_freq`, the parameters are temporarily replaced by their
    running means, the model is saved, and the original values are put
    back.

    Parameters
    ----------
    model : a Model instance
    dataset : Dataset
        Not used here.
    algorithm : object
        Provides `update_callbacks` and `monitoring_dataset`.
    """
    if self._count == self.start:
        self._worker = _PolyakWorker(model)
        algorithm.update_callbacks.append(self._worker)
        # HACK: models without `add_polyak_channels` are skipped.
        try:
            model.add_polyak_channels(self._worker.param_to_mean,
                                      algorithm.monitoring_dataset)
        except AttributeError:
            pass
    elif self.save_path is not None and self._count > self.start and \
            self._count % self.save_freq == 0:
        saved_params = OrderedDict()
        try:
            for param in model.get_params():
                saved_params[param] = param.get_value()
                param.set_value(
                    self._worker.param_to_mean[param].get_value())
            serial.save(self.save_path, model)
        finally:
            # Restore the live weights even when swapping or saving
            # fails, so training never continues from the averages.
            for param, value in saved_params.items():
                param.set_value(value)
    self._count += 1
def test_pickle_unpickle_without_reoptimization():
    """
    A compiled function unpickled with re-optimization disabled must
    return the same value as the original function.
    """
    compile_mode = theano.config.mode
    if compile_mode in ("DEBUG_MODE", "DebugMode"):
        compile_mode = "FAST_RUN"

    x1 = T.fmatrix('x1')
    x2 = T.fmatrix('x2')
    x3 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    x4 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    y = T.sum(T.sum(T.sum(x1**2 + x2) + x3) + x4)

    updates = OrderedDict([(x3, x3 + 1), (x4, x4 + 1)])
    f = theano.function([x1, x2], y, updates=updates, mode=compile_mode)

    # now pickle the compiled theano fn
    pickled = cPickle.dumps(f, -1)

    in1 = numpy.ones((10, 10), dtype=floatX)
    in2 = numpy.ones((10, 10), dtype=floatX)

    # test unpickle without optimization; the flag is restored in
    # `finally` so other tests see the previous value
    previous = theano.config.reoptimize_unpickled_function
    try:
        # the default is True
        theano.config.reoptimize_unpickled_function = False
        f_ = cPickle.loads(pickled)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        theano.config.reoptimize_unpickled_function = previous
def get_monitoring_channels(self, data):
    """
    Collect the monitoring channels of every layer.

    Each layer contributes the channels of `get_monitoring_channels()`
    and of `get_monitoring_channels_from_state`, which is called with
    the layer's `test_fprop` output. The last layer additionally
    receives the targets. All keys are prefixed with
    '<layer_name>_'.

    Parameters
    ----------
    data : tuple
        A flat `(X, Y)` pair of features and targets.

    Returns
    -------
    rval : OrderedDict
        Maps prefixed channel names to channel values.

    Raises
    ------
    TypeError
        If a layer's `get_monitoring_channels_from_state` does not
        return an OrderedDict.
    """
    features, targets = data
    channels = OrderedDict()
    activations = features

    for layer in self.layers:
        layer_channels = layer.get_monitoring_channels()
        for key in layer_channels:
            channels[layer.layer_name + '_' + key] = layer_channels[key]

        activations = layer.test_fprop(activations)
        if layer is self.layers[-1]:
            state_args = (activations, targets)
        else:
            state_args = (activations,)

        state_channels = layer.get_monitoring_channels_from_state(
            *state_args)
        if not isinstance(state_channels, OrderedDict):
            raise TypeError(str((type(state_channels), layer.layer_name)))
        for key in state_channels:
            channels[layer.layer_name + '_' + key] = state_channels[key]

    return channels
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Collect the monitoring channels of every cost in the sum.

    `data` is validated and nested with the mapping returned by
    `get_composite_specs_and_mapping`, so that cost `i` receives
    element `i` of the nested data. Each cost contributes its own
    channels; in addition, every cost expression that is not None is
    added as 'term_<index>', with '_<value.name>' appended when the
    expression has a non-None `name`.

    Parameters
    ----------
    model : object
        Forwarded to every cost.
    data : batch
        Flat data matching `self.get_data_specs(model)`.
    kwargs : dict
        Forwarded to every cost.

    Returns
    -------
    rval : OrderedDict
        Maps channel names to channel values.

    Raises
    ------
    TypeError
        Re-raised from a cost's `get_monitoring_channels` after being
        logged.
    """
    data_space = self.get_data_specs(model)[0]
    data_space.validate(data)

    channels = OrderedDict()
    _specs, mapping = self.get_composite_specs_and_mapping(model)
    per_cost_data = mapping.nest(data)

    for index, cost in enumerate(self.costs):
        cost_data = per_cost_data[index]
        try:
            cost_channels = cost.get_monitoring_channels(
                model, cost_data, **kwargs)
            channels.update(cost_channels)
        except TypeError:
            logger.error('SumOfCosts.get_monitoring_channels encountered '
                         'TypeError while calling {0}'
                         '.get_monitoring_channels'.format(type(cost)))
            raise

        value = cost.expr(model, cost_data, **kwargs)
        if value is None:
            continue

        suffix = ''
        if hasattr(value, 'name') and value.name is not None:
            suffix = '_' + value.name
        channels['term_' + str(index) + suffix] = value

    return channels
def get_params(self):
    """
    This returns the theano shared variables that will be trained by the
    :class:`Optimizer`. These parameters are used in the gradient.

    This includes all of the parameters in every model in the Prototype,
    without duplication. Entries of `self.models` that are not
    :class:`Model` instances contribute nothing.

    Returns
    -------
    dict(str: SharedVariable)
        Ordered dictionary of {string_name: theano shared variables} to
        be trained with an :class:`Optimizer`. Each key is the model's
        `_classname`, the model's index and the parameter's own name,
        joined by underscores.
    """
    params = OrderedDict()
    model_index = 0
    for model in self.models:
        if isinstance(model, Model):
            model_params = model.get_params()
            # append the parameters only if they aren't already in the list!
            for name, param in model_params.items():
                # A parameter shared between models keeps the name
                # given by the first model that exposes it.
                if param not in list(params.values()):
                    name = model._classname + '_%d_' % model_index + name
                    params[name] = param
        # NOTE(review): as laid out here the index counts every entry of
        # self.models, including non-Model ones -- confirm intended.
        model_index += 1
    return params
def orderings(self):
    """
    Return dict d s.t. d[node] is a list of nodes that must be
    evaluated before node itself can be evaluated.

    This is used primarily by the destroy_handler feature to ensure
    that all clients of any destroyed inputs have already computed
    their outputs.

    :note: This only calls the orderings() fct on all features. It does
           not take care of computing dependencies by itself.

    :raises TypeError: if a feature returns something other than an
        OrderedDict, or prerequisites that are neither a list nor an
        OrderedSet.
    """
    assert isinstance(self._features, list)

    merged = OrderedDict()
    for feature in self._features:
        if not hasattr(feature, 'orderings'):
            continue

        feature_ords = feature.orderings(self)
        if not isinstance(feature_ords, OrderedDict):
            raise TypeError("Non-deterministic return value from " +
                            str(feature.orderings) +
                            ". Nondeterministic object is " +
                            str(feature_ords))

        for node, prereqs in feature_ords.items():
            if not isinstance(prereqs, (list, OrderedSet)):
                raise TypeError(
                    "prereqs must be a type with a "
                    "deterministic iteration order, or toposort "
                    " will be non-deterministic.")
            merged.setdefault(node, []).extend(prereqs)

    # eliminate duplicate prereqs while keeping first-seen order
    return OrderedDict((node, list(OrderedSet(prereqs)))
                       for node, prereqs in merged.items())
def rms_prop(param_grad_dict, learning_rate, momentum=.9,
             averaging_coeff=.95, stabilizer=.0001):
    """
    Build RMSProp-with-momentum updates.

    For every parameter, running averages of the gradient and of the
    squared gradient are kept. The gradient is divided by
    `sqrt(avg_sqr - avg**2 + stabilizer)` and the result drives a
    momentum step.

    Parameters
    ----------
    param_grad_dict : dict
        Maps shared parameters to their gradients.
    learning_rate : float or symbolic scalar
        Scale of the normalised gradient step.
    momentum : float
        Coefficient applied to the previous increment.
    averaging_coeff : float
        Decay of the two running averages.
    stabilizer : float
        Constant added under the square root.

    Returns
    -------
    updates : OrderedDict
        Updates for the running averages, the increment and the
        parameter itself, for every parameter.
    """
    updates = OrderedDict()
    keep = averaging_coeff
    blend = 1 - averaging_coeff

    for param in param_grad_dict.keys():
        grad = param_grad_dict[param]

        # Per-parameter state, all starting at zero.
        step = sharedX(param.get_value() * 0.)
        mean_grad = sharedX(np.zeros_like(param.get_value()))
        mean_sqr_grad = sharedX(np.zeros_like(param.get_value()))

        next_mean_grad = keep * mean_grad + blend * grad
        next_mean_sqr_grad = keep * mean_sqr_grad + blend * grad**2

        scaled_grad = grad / T.sqrt(
            next_mean_sqr_grad - next_mean_grad**2 + stabilizer)
        next_step = momentum * step - learning_rate * scaled_grad

        updates[mean_grad] = next_mean_grad
        updates[mean_sqr_grad] = next_mean_sqr_grad
        updates[step] = next_step
        updates[param] = param + next_step

    return updates
def get_updates(self, grads):
    """
    Build the parameter updates for this learning rule.

    Running averages of the gradient (decay `self.b1`) and of the
    squared gradient (decay `self.b2`) are kept per parameter. The step
    is `m_t / (sqrt(v_t) + eps)`, scaled by a learning rate that is
    corrected with `sqrt(1 - b2**t) / (1 - b1**t)` and by the
    per-parameter scaler looked up under `str(p)`.

    Parameters
    ----------
    grads : dict
        Maps shared parameters to their gradients.

    Returns
    -------
    updates : OrderedDict
        Updates for both running averages and the parameter, for every
        parameter, followed by the update of the shared step counter.
    """
    updates = OrderedDict()

    counter = sharedX(0., 'counter')
    step = counter + 1.
    b1_power = self.b1**step
    b2_power = self.b2**step
    corrected_lr = self.lr * T.sqrt(1. - b2_power) / (1 - b1_power)

    for param, grad in grads.items():
        scaler = self.lr_scalers.get(str(param), 1.)
        first_moment = sharedX(param.get_value() * 0.)
        second_moment = sharedX(param.get_value() * 0.)

        next_first = self.b1 * first_moment + (1 - self.b1) * grad
        next_second = self.b2 * second_moment + (1 - self.b2) * grad**2
        direction = next_first / (T.sqrt(next_second) + self.eps)

        updates[first_moment] = next_first
        updates[second_moment] = next_second
        updates[param] = param - scaler * corrected_lr * direction

    updates[counter] = step
    return updates
def get_updates(self, gradients):
    """
    Provides the symbolic (theano) description of the updates needed to
    perform this learning rule. See Notes for side-effects.

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their
        gradients.

    Returns
    -------
    updates : OrderedDict
        A dictionary mapping from the old model parameters, to their
        new values after a single iteration of the learning rule.

    Raises
    ------
    ValueError
        If a parameter has no name.

    Notes
    -----
    This method has the side effect of storing the moving average of
    the square gradient in `self.mean_square_grads`. This is necessary
    in order for the monitoring channels to be able to track the value
    of these moving averages. Therefore, this method should only get
    called once for each instance of RMSProp.
    """
    log.debug('Setting up RMSProp for optimizer...')
    updates = OrderedDict()

    for param in gradients:
        # E[g^2] from the previous step, starting at zero.
        running_sq = sharedX(param.get_value() * 0.)

        if param.name is None:
            raise ValueError("Model parameters must be named.")
        running_sq.name = 'mean_square_grad_' + param.name

        if param.name in self.mean_square_grads:
            log.warning("Calling get_updates more than once on the "
                        "gradients of `%s` may make monitored values "
                        "incorrect." % param.name)
        # Store variable in self.mean_square_grads for monitoring.
        self.mean_square_grads[param.name] = running_sq

        grad = gradients[param]
        next_running_sq = (self.decay * running_sq +
                           (1 - self.decay) * T.sqr(grad))

        # Step size: scaled learning rate over the RMS of the gradient,
        # with the RMS bounded below by epsilon.
        step_size = self.lr_scalers.get(param, 1.) * self.learning_rate
        rms = T.maximum(T.sqrt(next_running_sq), self.epsilon)
        delta = -step_size * grad / rms

        updates[running_sq] = next_running_sq
        updates[param] = param + delta

    return updates
def __init__(self, model):
    """
    Makes a monitor for `model`. Assumes the model has not been
    trained at all yet.

    Parameters
    ----------
    model : pylearn2.models.model.Model instance
        The model to monitor.
    """
    self.training_succeeded = False
    self.model = model
    self.channels = OrderedDict()

    # Progress counters all start at zero.
    self._num_batches_seen = 0
    self._examples_seen = 0
    self._epochs_seen = 0

    # Per-dataset settings; each attribute gets its own empty list.
    for list_attr in ('_datasets', '_iteration_mode',
                      '_batch_size', '_num_batches'):
        setattr(self, list_attr, [])
    self._dirty = True
    self._rng_seed = []

    self.names_to_del = ['theano_function_mode']
    self.t0 = time.time()
    self.theano_function_mode = None

    # Initialize self._nested_data_specs, self._data_specs_mapping,
    # and self._flat_data_specs
    self._build_data_specs()
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  target=None):
    """
    Return monitoring channels describing this layer's bias.

    Parameters
    ----------
    state_below : object, optional
        Not used by this layer.
    state : object, optional
        Not used by this layer.
    target : object, optional
        Not used by this layer.

    Returns
    -------
    rval : OrderedDict
        Holds 'bias_min', 'bias_mean' and 'bias_max', computed from
        `self.b`, in that order.
    """
    bias = self.b
    channels = OrderedDict()
    channels['bias_min'] = bias.min()
    channels['bias_mean'] = bias.mean()
    channels['bias_max'] = bias.max()
    return channels