def refresh_droot_impact(self):
    """
    Makes sure self.droot, self.impact, and self.root_destroyer are up to
    date, and returns them. (see docstrings for these properties above)
    """
    if self.stale_droot:
        droot = OrderedDict()   # destroyed view + nonview variables -> foundation
        impact = OrderedDict()  # destroyed nonview variable -> it + all views of it
        root_destroyer = OrderedDict()  # root -> destroyer apply

        for app in self.destroyers:
            for output_idx, input_idx_list in app.op.destroy_map.items():
                if len(input_idx_list) != 1:
                    raise NotImplementedError()
                input_idx = input_idx_list[0]
                input = app.inputs[input_idx]
                input_root = getroot(input, self.view_i)
                if input_root in droot:
                    raise InconsistencyError(
                        "Multiple destroyers of %s" % input_root)
                droot[input_root] = input_root
                root_destroyer[input_root] = app
                input_impact = get_impact(input_root, self.view_o)
                for v in input_impact:
                    assert v not in droot
                    droot[v] = input_root
                impact[input_root] = input_impact
                impact[input_root].add(input_root)

        self.droot, self.impact, self.root_destroyer = \
            droot, impact, root_destroyer
        self.stale_droot = False
    return self.droot, self.impact, self.root_destroyer

def get_monitoring_channels(self, data):
    rval = OrderedDict()

    g_ch = self.generator.get_monitoring_channels(data)
    d_ch = self.discriminator.get_monitoring_channels((data, None))
    samples, _, conditional_data, _ = self.generator.sample_and_noise(100)
    d_samp_ch = self.discriminator.get_monitoring_channels(
        ((samples, conditional_data), None))

    i_ch = OrderedDict()
    if self.inferer is not None:
        batch_size = self.inference_monitoring_batch_size
        sample, noise, conditional_data, _ = self.generator.sample_and_noise(
            batch_size)
        i_ch.update(self.inferer.get_monitoring_channels(
            ((sample, conditional_data), noise)))

    if self.monitor_generator:
        for key in g_ch:
            rval['gen_' + key] = g_ch[key]
    if self.monitor_discriminator:
        # d_ch was computed on the data, d_samp_ch on generated samples
        for key in d_ch:
            rval['dis_on_data_' + key] = d_ch[key]
        for key in d_samp_ch:
            rval['dis_on_samp_' + key] = d_samp_ch[key]
    if self.monitor_inference:
        for key in i_ch:
            rval['inf_' + key] = i_ch[key]
    return rval

def on_prune(self, fgraph, app, reason):
    """Remove an Apply instance from the set which must be computed."""
    if app not in self.debug_all_apps:
        raise ProtocolError("prune without import")
    self.debug_all_apps.remove(app)

    # UPDATE self.clients
    for i, input in enumerate(OrderedSet(app.inputs)):
        del self.clients[input][app]

    if getattr(app.op, 'destroy_map', OrderedDict()):
        self.destroyers.remove(app)

    # Note: leaving empty client dictionaries in the struct.
    # Why? It's a pain to remove them. I think they aren't doing any harm,
    # they will be deleted by on_detach().

    # UPDATE self.view_i, self.view_o
    for o_idx, i_idx_list in iteritems(
            getattr(app.op, 'view_map', OrderedDict())):
        if len(i_idx_list) > 1:
            # destroying this output invalidates multiple inputs
            raise NotImplementedError()
        o = app.outputs[o_idx]
        i = app.inputs[i_idx_list[0]]

        del self.view_i[o]

        self.view_o[i].remove(o)
        if not self.view_o[i]:
            del self.view_o[i]

    self.stale_droot = True

def __init__(self, do_imports_on_attach=True):
    self.fgraph = None
    self.do_imports_on_attach = do_imports_on_attach

    """
    Maps every variable in the graph to its "foundation" (deepest
    ancestor in view chain).
    TODO: change name to var_to_vroot.
    """
    self.droot = OrderedDict()

    """
    Maps a variable to all variables that are indirect or direct views of it
    (including itself), essentially the inverse of droot.
    TODO: do all variables appear in this dict, or only those that are
          foundations?
    TODO: do only destroyed variables go in here? one old docstring said so.
    TODO: rename to x_to_views after reverse engineering what x is
    """
    self.impact = OrderedDict()

    """
    If a var is destroyed, then this dict will map droot[var] to the apply
    node that destroyed var.
    TODO: rename to vroot_to_destroyer
    """
    self.root_destroyer = OrderedDict()

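# Illustrative sketch of the three maps above (added commentary, not from the
# original source): for a hypothetical view chain a -> b -> c, where b is a
# view of a, c is a view of b, and an in-place Apply node `destroyer_app`
# destroys c, refresh_droot_impact would build:
#
#     droot          == {a: a, b: a, c: a}    # each variable -> its foundation
#     impact         == {a: {a, b, c}}        # foundation -> itself + all its views
#     root_destroyer == {a: destroyer_app}    # foundation -> the Apply destroying it
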
def on_import(self, fgraph, app, reason):
    """
    Add an Apply instance to the set which must be computed.
    """
    if app in self.debug_all_apps:
        raise ProtocolError("double import")
    self.debug_all_apps.add(app)
    # print 'DH IMPORT', app, id(app), id(self), len(self.debug_all_apps)

    # If it's a destructive op, add it to our watch list
    if getattr(app.op, 'destroy_map', {}):
        self.destroyers.add(app)

    # add this symbol to the forward and backward maps
    for o_idx, i_idx_list in iteritems(getattr(app.op, 'view_map', {})):
        if len(i_idx_list) > 1:
            raise NotImplementedError(
                'destroying this output invalidates multiple inputs',
                (app.op))
        o = app.outputs[o_idx]
        i = app.inputs[i_idx_list[0]]
        self.view_i[o] = i
        self.view_o.setdefault(i, OrderedSet()).add(o)

    # update self.clients
    for i, input in enumerate(app.inputs):
        self.clients.setdefault(input, OrderedDict()).setdefault(app, 0)
        self.clients[input][app] += 1

    for i, output in enumerate(app.outputs):
        self.clients.setdefault(output, OrderedDict())

    self.stale_droot = True

def on_attach(self, fgraph):
    """
    When attaching to a new fgraph, check that

    1) This DestroyHandler wasn't already attached to some fgraph
       (its data structures are only set up to serve one).
    2) The FunctionGraph doesn't already have a DestroyHandler.
       This would result in it validating everything twice, causing
       compilation to be slower.

    Give the FunctionGraph instance:

    1) A new method "destroyers(var)"
       TODO: what does this do exactly?
    2) A new attribute, "destroy_handler"
       TODO: WRITEME: what does this do besides the checks?
    """
    # Do the checking #
    already_there = False
    if self.fgraph is fgraph:
        already_there = True
    if self.fgraph is not None:
        raise Exception("A DestroyHandler instance can only serve one"
                        " FunctionGraph. (Matthew 6:24)")
    for attr in ('destroyers', 'destroy_handler'):
        if hasattr(fgraph, attr):
            already_there = True

    if already_there:
        # FunctionGraph.attach_feature catches AlreadyThere
        # and cancels the attachment
        raise toolbox.AlreadyThere(
            "DestroyHandler feature is already present"
            " or in conflict with another plugin.")

    # Annotate the FunctionGraph #
    self.unpickle(fgraph)
    fgraph.destroy_handler = self

    self.fgraph = fgraph
    self.destroyers = OrderedSet()  # set of Apply instances with non-null destroy_map
    self.view_i = OrderedDict()  # variable -> variable used in calculation
    self.view_o = OrderedDict()  # variable -> set of variables that use this one as a direct input
    # clients: how many times does an apply use a given variable
    self.clients = OrderedDict()  # variable -> apply -> ninputs
    self.stale_droot = True

    self.debug_all_apps = OrderedSet()
    if self.do_imports_on_attach:
        toolbox.Bookkeeper.on_attach(self, fgraph)

def test_constrain_filter_max_norm():
    """
    Test that ConstrainFilterMaxNorm clips filter norms to the limit
    as expected.
    """
    limit = 1.
    ext = ConstrainFilterMaxNorm(limit)

    W = np.zeros((2, 4))
    # Column 0 tests the case where a column has zero norm
    # Column 1 tests the case where a column is smaller than the limit
    W[0, 1] = .5
    # Column 2 tests the case where a column is on the limit
    W[0, 2] = 1.
    # Column 3 tests the case where a column is too big
    W[0, 3] = 2.
    W = sharedX(W / 2.)

    model = ModelWithW(W)
    model.extensions.append(ext)
    updates = OrderedDict()
    updates[W] = W * 2.
    model.modify_updates(updates)

    f = function([], updates=updates)
    f()

    W = W.get_value()
    assert W.shape == (2, 4)
    assert np.abs(W[1, :]).max() == 0
    assert W[0, 0] == 0.
    assert W[0, 1] == 0.5
    assert W[0, 2] == 1.
    assert W[0, 3] == 1., W[0, 3]

def test_pickle_unpickle_without_reoptimization():
    mode = theano.config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = T.fmatrix('x1')
    x2 = T.fmatrix('x2')
    x3 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    x4 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    y = T.sum(T.sum(T.sum(x1 ** 2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = theano.function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled theano fn
    string_pkl = cPickle.dumps(f, -1)

    # compute f value
    in1 = numpy.ones((10, 10), dtype=floatX)
    in2 = numpy.ones((10, 10), dtype=floatX)

    # test unpickle without optimization
    default = theano.config.reoptimize_unpickled_function
    try:
        # the default is True
        theano.config.reoptimize_unpickled_function = False
        f_ = cPickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        theano.config.reoptimize_unpickled_function = default

def make_layer_to_symbolic_state(self, num_examples, rng=None):
    """
    .. todo::

        Explain the difference with `make_layer_to_state`

    Makes and returns a dictionary mapping layers to states.

    By states, we mean here a real assignment, not a mean field state.
    For example, for a layer containing binary random variables, the state
    will be a shared variable containing values in {0, 1}, not [0, 1].

    The visible layer will be included.

    Uses a dictionary so it is easy to unambiguously index a layer without
    needing to remember rules like vis layer = 0, hiddens start at 1, etc.

    Parameters
    ----------
    num_examples : int
        WRITEME
    rng : WRITEME
    """

    # Make a list of all layers
    layers = [self.visible_layer] + self.hidden_layers

    assert rng is not None

    states = [layer.make_symbolic_state(num_examples, rng)
              for layer in layers]

    zipped = safe_zip(layers, states)

    rval = OrderedDict(zipped)

    return rval

def __init__(self, valid=None, invalid=None, valid_equivalent=None):
    '''
    Check if variables can be expressed without using variables in invalid.

    valid_equivalent provides a dictionary mapping some invalid variables
    to valid ones that can be used instead.
    '''
    if valid is None:
        valid = []
    if invalid is None:
        invalid = []
    if valid_equivalent is None:
        valid_equivalent = OrderedDict()

    # Nodes that are valid to have in the graph computing outputs
    self.valid = set(valid)

    # Nodes that are NOT valid to have in the graph computing outputs
    self.invalid = set(invalid)

    # Mapping from invalid variables to equivalent valid ones.
    self.valid_equivalent = valid_equivalent.copy()
    self.valid.update(valid_equivalent.values())
    self.invalid.update(valid_equivalent.keys())

def get_updates(self, params, loss):
    grads = self.get_gradients(params, loss)
    self.updates = OrderedDict()

    if isinstance(self.learning_rate, LearningRateDecay):
        lr = self.learning_rate.learning_rate
        self.updates.update(self.learning_rate.get_updates())
    else:
        lr = self.learning_rate

    beta1 = self.beta1
    beta2 = self.beta2
    eps = self.eps
    one = T.constant(utils.floatX(1.))

    beta1_t = theano.shared(utils.floatX(beta1), name='beta1_t')
    beta2_t = theano.shared(utils.floatX(beta2), name='beta2_t')

    for param, grad in zip(params, grads):
        momentum = theano.shared(param.get_value() * 0.,
                                 broadcastable=param.broadcastable,
                                 name='momentum')
        velocity = theano.shared(param.get_value() * 0.,
                                 broadcastable=param.broadcastable,
                                 name='velocity')

        m_t = beta1 * momentum + (one - beta1) * grad
        v_t = beta2 * velocity + (one - beta2) * grad ** 2
        m_hat = m_t / (1. - beta1_t)
        v_hat = v_t / (1. - beta2_t)
        step = lr * m_hat / (T.sqrt(v_hat) + eps)

        self.updates[momentum] = m_t
        self.updates[velocity] = v_t
        self.updates[param] = param - step

    self.updates[beta1_t] = beta1_t * beta1
    self.updates[beta2_t] = beta2_t * beta2
    return self.updates

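# Note on the bias correction above (added commentary, not in the original):
# beta1_t and beta2_t start at beta1 and beta2 and are multiplied by beta1 and
# beta2 on every update, so after t calls they hold beta1**t and beta2**t.
# m_hat = m_t / (1. - beta1**t) and v_hat = v_t / (1. - beta2**t) are then the
# standard Adam bias-corrected moment estimates.
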
def get_monitoring_channels(self, model, data, **kwargs):
    rval = OrderedDict()

    m = data.shape[0]

    g = model.generator
    d = model.discriminator

    y_hat = d.fprop(data)
    rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32')

    samples = g.sample(m)
    y_hat = d.fprop(samples)
    rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32')

    # y = T.alloc(0., m, 1)
    cost = d.cost_from_X((samples, y_hat))
    sample_grad = T.grad(-cost, samples)
    rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum())

    _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
    if model.monitor_inference and i_obj != 0:
        rval['objective_i'] = i_obj
    if model.monitor_discriminator:
        rval['objective_d'] = d_obj
    if model.monitor_generator:
        rval['objective_g'] = g_obj

    rval['now_train_generator'] = self.now_train_generator
    return rval

def orderings(self):
    """
    Return a dict d s.t. d[node] is a list of nodes that must be evaluated
    before node itself can be evaluated.

    This is used primarily by the destroy_handler feature to ensure that
    all clients of any destroyed inputs have already computed their outputs.

    Notes
    -----
    This only calls the orderings() function on all features. It does not
    take care of computing dependencies by itself.
    """
    ords = OrderedDict()
    assert isinstance(self._features, list)
    for feature in self._features:
        if hasattr(feature, 'orderings'):
            orderings = feature.orderings(self)
            if not isinstance(orderings, OrderedDict):
                raise TypeError("Non-deterministic return value from " +
                                str(feature.orderings) +
                                ". Nondeterministic object is " +
                                str(orderings))
            for node, prereqs in iteritems(orderings):
                if not isinstance(prereqs, (list, OrderedSet)):
                    raise TypeError(
                        "prereqs must be a type with a "
                        "deterministic iteration order, or toposort "
                        "will be non-deterministic.")
                ords.setdefault(node, []).extend(prereqs)
    # eliminate duplicate prereqs
    for (node, prereqs) in iteritems(ords):
        ords[node] = list(OrderedSet(prereqs))
    return ords

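# Hypothetical sketch of a feature supplying orderings() in the shape the
# method above accepts: an OrderedDict mapping a node to a deterministic list
# of nodes that must be evaluated first. The class and variable names here
# are placeholders, not part of the original code base.
class _ExampleOrderingFeature(object):
    def orderings(self, fgraph):
        ords = OrderedDict()
        # e.g. force some node_b to be scheduled only after node_a:
        # ords[node_b] = [node_a]
        return ords
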
def forced_replace(out, x, y):
    """
    :param out: Theano Variable
    :param x: Theano Variable
    :param y: Theano Variable

    This function checks all internal values of the graph that computes the
    variable ``out`` for occurrences of values identical with ``x``. If such
    occurrences are encountered then they are replaced with variable ``y``.

    For example:
        out := sigmoid(wu)*(1-sigmoid(wu))
        x := sigmoid(wu)
        forced_replace(out, x, y) := y*(1-y)
    """
    if out is None:
        return None

    # ``visited`` is a set of nodes that are already known and don't need to
    # be checked again, speeding up the traversal of multiply-connected
    # graphs.
    visited = set()

    def local_traverse(graph, x):
        if graph in visited:
            return []
        visited.add(graph)
        if equal_computations([graph], [x]):
            return [graph]
        elif not graph.owner:
            return []
        else:
            rval = []
            for inp in graph.owner.inputs:
                rval += local_traverse(inp, x)
            return rval

    to_replace = local_traverse(out, x)
    return clone(out, replace=OrderedDict((v, y) for v in to_replace))

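# A hypothetical usage sketch of the behaviour described in the docstring
# above (assumes Theano is importable; the variable names are illustrative):
#
#     import theano.tensor as T
#     w, u, y = T.vector('w'), T.vector('u'), T.vector('y')
#     s = T.nnet.sigmoid(T.dot(w, u))
#     out = s * (1 - s)
#     new_out = forced_replace(out, s, y)  # new_out now computes y * (1 - y)
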
def test_hash_from_dict():
    dicts = [{}, {0: 0}, {0: 1}, {1: 0}, {1: 1},
             {0: (0,)}, {0: [1]},
             {0: (0, 1)}, {0: [1, 0]}]
    for elem in dicts[:]:
        dicts.append(OrderedDict(elem))
    hashs = []
    for idx, d in enumerate(dicts):
        h = hash_from_dict(d)
        assert h not in hashs
        hashs.append(h)

    # Lists are not hashable, so they are transformed into tuples.
    assert hash_from_dict({0: (0,)}) == hash_from_dict({0: [0]})

def get_monitoring_channels(self, data):
    if data is None:
        m = 100
        conditional_data = self.condition_distribution.sample(m)
    else:
        _, conditional_data = data
        m = conditional_data.shape[0]

    noise = self.get_noise((m, self.noise_dim))
    rval = OrderedDict()

    sampled_data = (noise, conditional_data)
    try:
        rval.update(self.mlp.get_monitoring_channels((sampled_data, None)))
    except Exception:
        warnings.warn("something went wrong with generator.mlp's "
                      "monitoring channels")

    if self.monitor_ll:
        rval['ll'] = T.cast(self.ll(data, self.ll_n_samples, self.ll_sigma),
                            theano.config.floatX).mean()
        rval['nll'] = -rval['ll']

    return rval

def get_monitoring_channels(self, model, data, **kwargs):
    rval = OrderedDict()

    space, sources = self.get_data_specs(model)
    X_data, X_condition = data
    m = X_data.shape[space.get_batch_axis()]

    G, D = model.generator, model.discriminator

    # Compute false negatives w/ empirical samples
    y_hat = D.fprop((X_data, X_condition))
    rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32')

    # Compute false positives w/ generated samples
    G_conditional_data = self.condition_distribution.sample(m)
    samples = G.sample(G_conditional_data)
    y_hat = D.fprop((samples, G_conditional_data))
    rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32')

    # y = T.alloc(0., m, 1)
    cost = D.cost_from_X(((samples, G_conditional_data), y_hat))
    sample_grad = T.grad(-cost, samples)
    rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum())

    _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
    if model.monitor_inference and i_obj != 0:
        rval['objective_i'] = i_obj
    if model.monitor_discriminator:
        rval['objective_d'] = d_obj
    if model.monitor_generator:
        rval['objective_g'] = g_obj

    rval['now_train_generator'] = self.now_train_generator
    return rval

def get_updates(self):
    self.updates = OrderedDict()
    epoch = theano.shared(1, name='epoch_t')
    self.updates[epoch] = epoch + 1
    self.updates[self.learning_rate] = T.cast(
        self.initial_rate * self.decay_rate ** (epoch / self.step),
        theano.config.floatX)
    return self.updates

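# Worked example of the schedule above, with hypothetical values that are not
# part of the original code: for initial_rate=0.1, decay_rate=0.5 and step=10,
# the update computed at epoch 20 sets the learning rate to
# 0.1 * 0.5 ** (20 / 10) = 0.025 (assuming epoch / step divides exactly).
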
def get_layer_monitoring_channels(self):
    rval = OrderedDict()
    for param in self.get_params():
        rval[param.name + "_min"] = param.min()
        rval[param.name + "_max"] = param.max()
        rval[param.name + "_mean"] = param.mean()
    return rval

def get_monitoring_channels(self, data):
    rval = OrderedDict()
    try:
        rval.update(self.mlp.get_monitoring_channels(data))
    except Exception:
        warnings.warn("something went wrong with compressor.mlp's "
                      "monitoring channels")
    return rval

def get_monitoring_channels(self, data):
    """
    .. todo::

        WRITEME
    """
    space, source = self.get_monitoring_data_specs()
    space.validate(data)
    X = data
    history = self.mf(X, return_history=True)
    q = history[-1]

    rval = OrderedDict()

    ch = self.visible_layer.get_monitoring_channels()
    for key in ch:
        rval['vis_' + key] = ch[key]

    for state, layer in safe_zip(q, self.hidden_layers):
        ch = layer.get_monitoring_channels()
        for key in ch:
            rval[layer.layer_name + '_' + key] = ch[key]
        ch = layer.get_monitoring_channels_from_state(state)
        for key in ch:
            rval['mf_' + layer.layer_name + '_' + key] = ch[key]

    if len(history) > 1:
        prev_q = history[-2]

        flat_q = flatten(q)
        flat_prev_q = flatten(prev_q)

        mx = None
        for new, old in safe_zip(flat_q, flat_prev_q):
            cur_mx = abs(new - old).max()
            if new is old:
                print(new, 'is', old)
                assert False
            if mx is None:
                mx = cur_mx
            else:
                mx = T.maximum(mx, cur_mx)

        rval['max_var_param_diff'] = mx

        for layer, new, old in safe_zip(self.hidden_layers, q, prev_q):
            sum_diff = 0.
            for sub_new, sub_old in safe_zip(flatten(new), flatten(old)):
                sum_diff += abs(sub_new - sub_old).sum()
            denom = self.batch_size * \
                layer.get_total_state_space().get_total_dimension()
            denom = np.cast[config.floatX](denom)
            rval['mean_' + layer.layer_name + '_var_param_diff'] = \
                sum_diff / denom

    return rval

def orderings(self, function_graph):
    """
    Called by toposort. It should return a dictionary of
    {node: predecessors} where predecessors is a list of nodes that
    should be computed before the key node.

    If you raise an exception in this function, the state of the graph
    might be broken for all intents and purposes.
    """
    return OrderedDict()

def on_change_input(self, fgraph, app, i, old_r, new_r, reason):
    """
    app.inputs[i] changed from old_r to new_r.
    """
    if app == 'output':
        # app == 'output' is a special key that means FunctionGraph is
        # redefining which nodes are being considered 'outputs' of the
        # graph.
        pass
    else:
        if app not in self.debug_all_apps:
            raise ProtocolError("change without import")

        # UPDATE self.clients
        self.clients[old_r][app] -= 1
        if self.clients[old_r][app] == 0:
            del self.clients[old_r][app]

        self.clients.setdefault(new_r, OrderedDict()).setdefault(app, 0)
        self.clients[new_r][app] += 1

        # UPDATE self.view_i, self.view_o
        for o_idx, i_idx_list in iteritems(
                getattr(app.op, 'view_map', OrderedDict())):
            if len(i_idx_list) > 1:
                # destroying this output invalidates multiple inputs
                raise NotImplementedError()
            i_idx = i_idx_list[0]
            output = app.outputs[o_idx]
            if i_idx == i:
                if app.inputs[i_idx] is not new_r:
                    raise ProtocolError("wrong new_r on change")

                self.view_i[output] = new_r

                self.view_o[old_r].remove(output)
                if not self.view_o[old_r]:
                    del self.view_o[old_r]

                self.view_o.setdefault(new_r, OrderedSet()).add(output)

    self.stale_droot = True

def test_known_grads():
    # Tests that the grad method with no known_grads
    # matches what happens if you put its own known_grads
    # in for each variable

    full_range = theano.tensor.arange(10)
    x = theano.tensor.scalar('x')
    t = theano.tensor.iscalar('t')
    ft = full_range[t]
    ft.name = 'ft'
    coeffs = theano.tensor.vector('c')
    ct = coeffs[t]
    ct.name = 'ct'
    p = x ** ft
    p.name = 'p'
    y = ct * p
    y.name = 'y'
    cost = theano.tensor.sqr(y)
    cost.name = 'cost'

    layers = [[cost], [y], [ct, p], [ct, x, ft], [coeffs, t, full_range, x]]

    inputs = [coeffs, t, x]

    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(10), rng.randint(10), rng.randn()]
    values = [np.cast[ipt.dtype](value)
              for ipt, value in zip(inputs, values)]

    true_grads = theano.tensor.grad(cost, inputs,
                                    disconnected_inputs='ignore')
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    for layer in layers:
        first = theano.tensor.grad(cost, layer, disconnected_inputs='ignore')
        known = OrderedDict(izip(layer, first))
        full = theano.tensor.grad(cost=None,
                                  known_grads=known,
                                  wrt=inputs,
                                  disconnected_inputs='ignore')
        full = theano.function(inputs, full)
        full = full(*values)
        assert len(true_grads) == len(full)
        for a, b, var in zip(true_grads, full, inputs):
            if not np.allclose(a, b):
                print('Failure')
                print(a)
                print(b)
                print(var)
                print(layer)
                for v in known:
                    print(v, ':', theano.function(inputs, known[v])(*values))
                assert False

def __init__(self, *axis):
    # Sort them to make sure we merge all possible cases.
    items = sorted(axis)
    self.axis = OrderedDict(items)
    for axis, broad in iteritems(self.axis):
        if not isinstance(axis, (numpy.integer, integer_types)):
            raise TypeError("Rebroadcast needs integer axes. "
                            "Got {}".format(axis))
        if not isinstance(broad, (numpy.bool_, bool)):
            raise TypeError("Rebroadcast needs bool for new broadcast "
                            "pattern. Got {}".format(broad))

def __init__(self, *axis):
    # Sort them to make sure we merge all possible cases.
    items = sorted(axis)
    self.axis = OrderedDict(items)
    for axis, broad in iteritems(self.axis):
        assert isinstance(axis, (numpy.integer, int)), (
            "Rebroadcast needs integer axes. Got ", axis)
        assert isinstance(broad, bool), (
            "Rebroadcast needs bool for new broadcast pattern. Got ", broad)

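# Hypothetical usage sketch of the constructor above: *axis is a sequence of
# (axis, new_broadcastable) pairs, e.g. (assuming theano.tensor is importable
# and x has length 1 along axis 0 at run time):
#
#     import theano.tensor as T
#     x = T.matrix('x')
#     y = T.Rebroadcast((0, True), (1, False))(x)
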
def run(replay, log=None):
    if not replay:
        log = StringIO()
    else:
        log = StringIO(log)
    record = Record(replay=replay, file_object=log)

    disturb_mem.disturb_mem()

    mode = RecordMode(record=record)

    b = sharedX(np.zeros((2,)), name='b')
    channels = OrderedDict()

    disturb_mem.disturb_mem()

    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min

    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
            ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_' + str(i))
        updates.append((s, val))

    for var in theano.gof.graph.ancestors(update for _, update in updates):
        if var.name is not None and var.name != 'b':
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None

    for key in channels:
        updates.append((s, channels[key]))
    f = theano.function([], mode=mode, updates=updates,
                        on_unused_input='ignore', name='f')
    for output in f.maker.fgraph.outputs:
        mode.record.handle_line(var_descriptor(output) + '\n')
    disturb_mem.disturb_mem()
    f()
    mode.record.f.flush()

    if not replay:
        return log.getvalue()

def make_layer_to_state(self, num_examples, rng=None):
    """
    Makes and returns a dictionary mapping layers to states.

    By states, we mean here a real assignment, not a mean field state.
    For example, for a layer containing binary random variables, the state
    will be a shared variable containing values in {0, 1}, not [0, 1].

    The visible layer will be included.

    Uses a dictionary so it is easy to unambiguously index a layer without
    needing to remember rules like vis layer = 0, hiddens start at 1, etc.

    Parameters
    ----------
    num_examples : int
        WRITEME
    rng : WRITEME
    """

    # Make a list of all layers
    layers = [self.visible_layer] + self.hidden_layers

    if rng is None:
        rng = self.rng

    states = [layer.make_state(num_examples, rng) for layer in layers]

    zipped = safe_zip(layers, states)

    def recurse_check(layer, state):
        if isinstance(state, (list, tuple)):
            for elem in state:
                recurse_check(layer, elem)
        else:
            val = state.get_value()
            m = val.shape[0]
            if m != num_examples:
                raise ValueError(layer.layer_name + " gave state with " +
                                 str(m) + " examples in some component. "
                                 "We requested " + str(num_examples))

    for layer, state in zipped:
        recurse_check(layer, state)

    rval = OrderedDict(zipped)

    return rval

def gradient_descent(self, loss):
    """Momentum GD with gradient clipping."""
    grad = T.grad(loss, self.params)
    self.momentum_velocity_ = [0.] * len(grad)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
    updates = OrderedDict()
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_den = T.maximum(5.0, grad_norm)
    for n, (param, grad) in enumerate(zip(self.params, grad)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (5.0 / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = self.momentum * velocity - self.learning_rate * grad
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates

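# Worked illustration of the clipping rule above (hypothetical numbers, not
# from the original code): if the global gradient norm is 20, then
# scaling_den = max(5.0, 20) = 20 and every gradient is scaled by 5.0 / 20,
# so the rescaled global norm is 5. A norm already <= 5 leaves the gradients
# unchanged (scale factor 5.0 / 5.0), and a NaN or Inf norm replaces each
# gradient with 0.1 * param.
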
def get_layer_monitoring_channels(self, state_below=None,
                                  state=None, targets=None):
    rval = OrderedDict()

    if state is None:
        state = self.fprop(state_below)
    vars_and_prefixes = [(state, '')]

    for var, prefix in vars_and_prefixes:
        # print "average output: ", var.ndim, type(var)
        # if not hasattr(var, 'ndim') or var.ndim != 4:
        #     print "expected 4D tensor, got "
        #     print var
        #     print type(var)
        #     if isinstance(var, tuple):
        #         print "tuple length: ", len(var)
        #     assert False
        v_max = var.max(axis=1)
        v_min = var.min(axis=1)
        v_mean = var.mean(axis=1)
        v_range = v_max - v_min

        # max_x.mean_u is "the mean over *u*nits of the max over
        # e*x*amples". The x and u are included in the name because
        # otherwise it's hard to remember which axis is which when
        # reading the monitor. I use inner.outer rather than
        # outer_of_inner or something like that because I want mean_x.*
        # to appear next to each other in the alphabetical list, as
        # these are commonly plotted together.
        for key, val in [('max_x.max_u', v_max.max()),
                         ('max_x.mean_u', v_max.mean()),
                         ('max_x.min_u', v_max.min()),
                         ('min_x.max_u', v_min.max()),
                         ('min_x.mean_u', v_min.mean()),
                         ('min_x.min_u', v_min.min()),
                         ('range_x.max_u', v_range.max()),
                         ('range_x.mean_u', v_range.mean()),
                         ('range_x.min_u', v_range.min()),
                         ('mean_x.max_u', v_mean.max()),
                         ('mean_x.mean_u', v_mean.mean()),
                         ('mean_x.min_u', v_mean.min())]:
            rval[prefix + key] = val

    return rval