def __init__(self, inputs, outputs = None, updates = None):
    """
    Standin for a theano function with the given inputs, outputs, updates.
    Here in the __init__ method you give the same expression as usual.
    However, instead of passing __call__ the input variables directly, you
    pass it batches, where each batch is a list containing the inputs for
    that batch. It returns the average value of the function, averaged
    across batches, taking batch size into account. The average of all
    updates is also applied.
    One extra change: if any of the inputs is a shared variable, then this
    can assign to that variable, while theano.function would refuse to.
    Those shared variables will be left with the value of the last batch
    when __call__ returns.
    """
    batch_size = T.cast(inputs[0].shape[0], 'float32')
    total_examples = T.scalar()
    transformed_updates = OrderedDict()
    self.has_updates = updates is not None
    if self.has_updates:
        self._clear = function([], updates = [(var, 0. * var)
                                              for var in updates])
        for var in updates:
            update = updates[var]
            transformed_updates[var] = var + \
                (batch_size / total_examples) * update
    self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
    true_inputs = self._true_inputs(inputs)
    self._shared = self._shared_inputs(inputs)
    if outputs is not None:
        if not isinstance(outputs, list):
            outputs = [outputs]
        outputs = [output * (batch_size / total_examples)
                   for output in outputs]
    self._func = function(true_inputs + [total_examples],
                          outputs=outputs,
                          updates=transformed_updates)
def __init__(self, inputs, outputs=None, updates=None):
    batch_size = T.cast(inputs[0].shape[0], 'float32')
    total_examples = T.scalar()
    transformed_updates = OrderedDict()
    self.has_updates = updates is not None
    if self.has_updates:
        self._clear = function([], updates=[(var, 0. * var)
                                            for var in updates])
        for var in updates:
            update = updates[var]
            transformed_updates[var] = var + \
                (batch_size / total_examples) * update
    self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
    true_inputs = self._true_inputs(inputs)
    self._shared = self._shared_inputs(inputs)
    if outputs is not None:
        if not isinstance(outputs, list):
            outputs = [outputs]
        outputs = [
            output * (batch_size / total_examples) for output in outputs
        ]
    self._func = function(true_inputs + [total_examples],
                          outputs=outputs,
                          updates=transformed_updates)
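A brief usage sketch of the accumulator defined above. The class name `Accumulator` is taken from its use later in the BatchGradientDescent code, and the exact calling convention of `__call__` is an assumption inferred from the docstring; the toy data is hypothetical. The point is only to illustrate the batch-size-weighted averaging the docstring describes.

import numpy as np
import theano.tensor as T

X = T.matrix('X')
mse = T.sqr(X).mean()

# Accumulator is assumed to be the class whose __init__ appears above,
# as used elsewhere via Accumulator(inputs, obj).
acc = Accumulator([X], outputs=mse)

# __call__ is assumed to take a list of batches, each batch being the list
# of inputs for that batch; the result is the average of `mse` weighted by
# batch size (5 and 7 examples here).
batches = [[np.random.randn(5, 3).astype('float32')],
           [np.random.randn(7, 3).astype('float32')]]
avg_mse = acc(batches)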
def get_fixed_var_descr(self, model, X, Y=None):
    rval = FixedVarDescr()
    rval.fixed_vars = {'sup_aux_var': sup_counter}
    rval.on_load_batch = [function([X, Y],
                                   updates=[(sup_counter, sup_counter + 1)])]
    return rval
def test_vector_to_conv_c01b_invertible():
    """
    Tests that the format_as methods between Conv2DSpace
    and VectorSpace are invertible for the ('c', 0, 1, 'b')
    axis format.
    """
    rng = np.random.RandomState([2013, 5, 1])
    batch_size = 3
    rows = 4
    cols = 5
    channels = 2

    conv = Conv2DSpace([rows, cols], channels=channels,
                       axes=('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    X = conv.make_batch_theano()
    Y = conv.format_as(X, vec)
    Z = vec.format_as(Y, conv)

    A = vec.make_batch_theano()
    B = vec.format_as(A, conv)
    C = conv.format_as(B, vec)

    f = function([X, A], [Z, C])

    X = rng.randn(*(conv.get_origin_batch(batch_size).shape)).astype(X.dtype)
    A = rng.randn(*(vec.get_origin_batch(batch_size).shape)).astype(A.dtype)

    Z, C = f(X, A)

    np.testing.assert_allclose(Z, X)
    np.testing.assert_allclose(C, A)
def test_vector_to_conv_c01b_invertible():
    """
    Tests that the format_as methods between Conv2DSpace
    and VectorSpace are invertible for the ('c', 0, 1, 'b')
    axis format.
    """
    rng = np.random.RandomState([2013, 5, 1])
    batch_size = 3
    rows = 4
    cols = 5
    channels = 2

    conv = Conv2DSpace([rows, cols], channels = channels,
                       axes = ('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    X = conv.make_batch_theano()
    Y = conv.format_as(X, vec)
    Z = vec.format_as(Y, conv)

    A = vec.make_batch_theano()
    B = vec.format_as(A, conv)
    C = conv.format_as(B, vec)

    f = function([X, A], [Z, C])

    X = rng.randn(*(conv.get_origin_batch(batch_size).shape)).astype(X.dtype)
    A = rng.randn(*(vec.get_origin_batch(batch_size).shape)).astype(A.dtype)

    Z, C = f(X, A)

    np.testing.assert_allclose(Z, X)
    np.testing.assert_allclose(C, A)
def get_fixed_var_descr(self, model, X, Y):
    """
    .. todo::

        WRITEME
    """
    assert Y is not None
    batch_size = model.batch_size

    drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
    drop_mask_X.name = 'drop_mask'

    X_space = model.get_input_space()

    updates = OrderedDict()
    rval = FixedVarDescr()
    inputs = [X, Y]

    if not self.supervised:
        update_X = self.mask_gen(X, X_space = X_space)
    else:
        drop_mask_Y = sharedX(np.ones(batch_size,))
        drop_mask_Y.name = 'drop_mask_Y'
        update_X, update_Y = self.mask_gen(X, Y, X_space)
        updates[drop_mask_Y] = update_Y
        rval.fixed_vars['drop_mask_Y'] = drop_mask_Y
    if self.mask_gen.sync_channels:
        n = update_X.ndim
        assert n == drop_mask_X.ndim - 1
        update_X.name = 'raw_update_X'
        zeros_like_X = T.zeros_like(X)
        zeros_like_X.name = 'zeros_like_X'
        update_X = zeros_like_X + update_X.dimshuffle(0, 1, 2, 'x')
        update_X.name = 'update_X'
    updates[drop_mask_X] = update_X
    rval.fixed_vars['drop_mask'] = drop_mask_X

    if hasattr(model.inference_procedure, 'V_dropout'):
        include_prob = model.inference_procedure.include_prob
        include_prob_V = model.inference_procedure.include_prob_V
        include_prob_Y = model.inference_procedure.include_prob_Y

        theano_rng = MRG_RandomStreams(2012 + 11 + 20)
        for elem in flatten([model.inference_procedure.V_dropout]):
            updates[elem] = theano_rng.binomial(p=include_prob_V,
                                                size=elem.shape,
                                                dtype=elem.dtype,
                                                n=1) / include_prob_V
        if "Softmax" in str(type(model.hidden_layers[-1])):
            hid = model.inference_procedure.H_dropout[:-1]
            y = model.inference_procedure.H_dropout[-1]
            updates[y] = theano_rng.binomial(p=include_prob_Y,
                                             size=y.shape,
                                             dtype=y.dtype,
                                             n=1) / include_prob_Y
        else:
            hid = model.inference_procedure.H_dropout
        for elem in flatten(hid):
            updates[elem] = theano_rng.binomial(p=include_prob,
                                                size=elem.shape,
                                                dtype=elem.dtype,
                                                n=1) / include_prob

    rval.on_load_batch = [utils.function(inputs, updates=updates)]

    return rval
def __init__(self, dataset, batch_size, num_batches, topo, targets, rng):
    if rng is None:
        rng = np.random.RandomState([2013, 4, 22])
    if isinstance(rng, list):
        rng = np.random.RandomState(rng)
    self.__dict__.update(locals())
    del self.self

    theano_rng = MRG_RandomStreams(rng.randint(2 ** 16))

    if batch_size is None:
        raise ValueError("must specify batch size, there is infinite data.")

    samples = dataset.s3c.random_design_matrix(batch_size,
                                               theano_rng = theano_rng,
                                               return_all = targets)
    assert samples is not None
    if targets:
        assert len(samples) == 3
        assert not any(sample is None for sample in samples)
    else:
        assert isinstance(samples, Variable)

    warnings.warn("This is recompiled every time we make a new iterator, just compile it once per iteration mode. Keep in mind the rng is part of the mode though-- the monitor wants to see the same stuff every time.")
    self.f = function([], samples)

    if num_batches is None:
        raise ValueError("must specify a number of batches, there is infinite 'data'")

    self.num_examples = num_batches * batch_size
def get_fixed_var_descr(self, model, X, Y, **kwargs):
    rval = FixedVarDescr()
    rval.fixed_vars = {'unsup_aux_var': unsup_counter}
    Y = T.matrix()
    theano_func = function([X, Y],
                           updates=[(unsup_counter, unsup_counter + 1)])
    rval.on_load_batch = [theano_func]
    return rval
def __init__(self, inputs, outputs=None, updates=None):
    """
    Standin for a theano function with the given inputs, outputs, updates.

    Here in the __init__ method you give the same expression as usual.
    However, instead of passing __call__ the input variables directly, you
    pass it batches, where each batch is a list containing the inputs for
    that batch. It returns the average value of the function, averaged
    across batches, taking batch size into account. The average of all
    updates is also applied.

    One extra change: if any of the inputs is a shared variable, then this
    can assign to that variable, while theano.function would refuse to.
    Those shared variables will be left with the value of the last batch
    when __call__ returns.

    Parameters
    ----------
    inputs : WRITEME
    outputs : WRITEME
    updates : WRITEME
    """
    batch_size = T.cast(inputs[0].shape[0], 'float32')
    total_examples = T.scalar()
    transformed_updates = OrderedDict()
    self.has_updates = updates is not None
    if self.has_updates:
        self._clear = function([], updates=[(var, 0. * var)
                                            for var in updates])
        for var in updates:
            update = updates[var]
            transformed_updates[var] = var + \
                (batch_size / total_examples) * update
    self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
    true_inputs = self._true_inputs(inputs)
    self._shared = self._shared_inputs(inputs)
    if outputs is not None:
        if not isinstance(outputs, list):
            outputs = [outputs]
        outputs = [
            output * (batch_size / total_examples) for output in outputs
        ]
    self._func = function(true_inputs + [total_examples],
                          outputs=outputs,
                          updates=transformed_updates)
def __init__(self, inputs, outputs = None, updates = None):
    batch_size = T.cast(inputs[0].shape[0], 'float32')
    total_examples = T.scalar()
    transformed_updates = OrderedDict()
    self.has_updates = updates is not None
    if self.has_updates:
        self._clear = function([], updates = [(var, 0. * var)
                                              for var in updates])
        for var in updates:
            update = updates[var]
            transformed_updates[var] = var + \
                (batch_size / total_examples) * update
    self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
    true_inputs = self._true_inputs(inputs)
    self._shared = self._shared_inputs(inputs)
    if outputs is not None:
        if not isinstance(outputs, list):
            outputs = [outputs]
        outputs = [output * (batch_size / total_examples)
                   for output in outputs]
    self._func = function(true_inputs + [total_examples],
                          outputs=outputs,
                          updates=transformed_updates)
def enforce_constraints(self):
    """
    Enforces all constraints encoded by self.modify_updates.
    """
    params = self.get_params()
    updates = OrderedDict(izip_no_length_check(params, params))
    self.modify_updates(updates)
    f = function([], updates=updates)
    f()
def enforce_constraints(self):
    """
    Enforces all constraints encoded by self.censor_updates.
    """
    params = self.get_params()
    updates = OrderedDict(izip_no_length_check(params, params))
    self.censor_updates(updates)
    f = function([], updates=updates)
    f()
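For context, a minimal sketch of the kind of hook `enforce_constraints` exercises: an override of `modify_updates` (or `censor_updates` in the older naming) that rescales any parameter whose proposed value exceeds a norm limit. The `max_norm` attribute and the specific constraint are illustrative assumptions, not part of the code shown above.

import theano.tensor as T

def modify_updates(self, updates):
    # Hypothetical constraint: clip the L2 norm of every updated parameter
    # to self.max_norm (an attribute assumed to exist on this model).
    for param, new_value in updates.items():
        norm = T.sqrt(T.sqr(new_value).sum())
        desired = T.clip(norm, 0., self.max_norm)
        updates[param] = new_value * desired / (1e-7 + norm)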
def get_fixed_var_descr(self, model, X, Y, **kwargs):
    rval = FixedVarDescr()
    rval.fixed_vars = {'unsup_aux_var': unsup_counter}
    Y = T.matrix()
    theano_func = function([X, Y],
                           updates=[(unsup_counter, unsup_counter + 1)])
    rval.on_load_batch = [theano_func]
    return rval
def get_fixed_var_descr(self, model, data):
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)
    rval = FixedVarDescr()
    rval.fixed_vars = {'sup_aux_var': sup_counter}
    theano_func = function([], updates=[(sup_counter, sup_counter + 1)])

    def on_load(data):
        theano_func()

    rval.on_load_batch = [on_load]
    return rval
def get_obj_func(model):
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    y = T.argmax(Y, axis=1)
    drop_mask = mask_gen(X, X_space=model.get_input_space())

    if isinstance(model, MLP_Wrapper):
        Q = model.mf_missing(X, drop_mask)
    else:
        Q = model.inference_procedure.do_inpainting(X,
                Y = T.zeros_like(Y),
                drop_mask = drop_mask,
                drop_mask_Y = T.ones_like(T.cast(y, 'float32')))
    Y_hat = Q[-1]
    y_hat = T.argmax(Y_hat, axis=1)

    obj = T.neq(y, y_hat).mean()

    return function([X, Y], obj)
def get_weights_topo(self):
    if not isinstance(self.input_space, Conv2DSpace):
        raise NotImplementedError()
    W ,= self.transformer.get_params()
    W = W.T
    W = W.reshape((self.dim, self.input_space.shape[0],
                   self.input_space.shape[1], self.input_space.nchannels))
    W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))
    return function([], W)()
def get_fixed_var_descr(self, model, data, **kwargs):
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)
    rval = FixedVarDescr()
    rval.fixed_vars = {'unsup_aux_var': unsup_counter}

    # The input to function should be a flat, non-redundant tuple
    mapping = DataSpecsMapping(data_specs)
    data_tuple = mapping.flatten(data, return_tuple=True)
    theano_func = function([], updates=[(unsup_counter, unsup_counter + 1)])

    def on_load(batch, mapping=mapping, theano_func=theano_func):
        return theano_func()

    rval.on_load_batch = [on_load]
    return rval
def get_weights_topo(self):
    if not isinstance(self.input_space, Conv2DSpace):
        raise NotImplementedError()
    W, = self.transformer.get_params()
    W = W.T
    W = W.reshape(
        (self.dim, self.input_space.shape[0],
         self.input_space.shape[1], self.input_space.num_channels))
    W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))
    return function([], W)()
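A small illustrative sketch of consuming the result of `get_weights_topo` above. Plain matplotlib is used, and the `model` variable, grid size, and choice of channel are assumptions made only for the illustration.

import matplotlib.pyplot as plt

W = model.get_weights_topo()   # axes ('b', 0, 1, 'c') per the conversion above
n = min(16, W.shape[0])
for i in range(n):
    plt.subplot(4, 4, i + 1)
    plt.imshow(W[i, :, :, 0], cmap='gray')   # first channel of filter i
    plt.axis('off')
plt.show()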
def get_obj_func(model):
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    y = T.argmax(Y, axis=1)
    drop_mask = mask_gen(X, X_space=model.get_input_space())

    if isinstance(model, MLP_Wrapper):
        Q = model.mf_missing(X, drop_mask)
    else:
        Q = model.inference_procedure.do_inpainting(
            X, Y=T.zeros_like(Y),
            drop_mask=drop_mask,
            drop_mask_Y=T.ones_like(T.cast(y, 'float32')))
    Y_hat = Q[-1]
    y_hat = T.argmax(Y_hat, axis=1)

    obj = T.neq(y, y_hat).mean()

    return function([X, Y], obj)
def get_fixed_var_descr(self, model, data, **kwargs):
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)
    rval = FixedVarDescr()
    rval.fixed_vars = {'unsup_aux_var': unsup_counter}

    # The input to function should be a flat, non-redundant tuple
    mapping = DataSpecsMapping(data_specs)
    data_tuple = mapping.flatten(data, return_tuple=True)
    theano_func = function([],
                           updates=[(unsup_counter, unsup_counter + 1)])

    def on_load(batch, mapping=mapping, theano_func=theano_func):
        return theano_func()

    rval.on_load_batch = [on_load]
    return rval
def get_fixed_var_descr(self, model, data):
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)
    rval = FixedVarDescr()
    rval.fixed_vars = {'sup_aux_var': sup_counter}
    rval.data_specs = data_specs

    # data has to be flattened into a tuple before being passed
    # to `function`.
    mapping = DataSpecsMapping(data_specs)
    flat_data = mapping.flatten(data, return_tuple=True)
    theano_func = function(flat_data,
                           updates=[(sup_counter, sup_counter + 1)])

    # the on_load_batch function will take numerical data formatted
    # as rval.data_specs, so we have to flatten it inside the
    # returned function too.
    # Using default argument binds the variables used in the lambda
    # function to the value they have when the lambda is defined.
    on_load = (lambda batch, mapping=mapping, theano_func=theano_func:
               theano_func(*mapping.flatten(batch, return_tuple=True)))
    rval.on_load_batch = [on_load]
    return rval
def get_fixed_var_descr(self, model, data, **kwargs):
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)
    rval = FixedVarDescr()
    rval.fixed_vars = {'unsup_aux_var': unsup_counter}
    rval.data_specs = data_specs

    # The input to function should be a flat, non-redundant tuple
    mapping = DataSpecsMapping(data_specs)
    data_tuple = mapping.flatten(data, return_tuple=True)
    theano_func = function(data_tuple,
                           updates=[(unsup_counter, unsup_counter + 1)])

    # the on_load_batch function will take numerical data formatted
    # as rval.data_specs, so we have to flatten it inside the
    # returned function too.
    # Using default argument binds the variables used in the lambda
    # function to the value they have when the lambda is defined.
    on_load = (lambda batch, mapping=mapping, theano_func=theano_func:
               theano_func(*mapping.flatten(batch, return_tuple=True)))
    rval.on_load_batch = [on_load]
    return rval
def setup(self, model, dataset, algorithm):
    self.origin = model.get_param_vector()

    cost = algorithm.cost

    # Boilerplate needed to evaluate the cost function on symbolic inputs
    # =======================================
    data_specs = cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # Build a flat tuple of Theano Variables, one for each space.
    # We want that so that if the same space/source is specified
    # more than once in data_specs, only one Theano Variable
    # is generated for it, and the corresponding value is passed
    # only once to the compiled Theano function.
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=self.batch_size)
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    # Methods of `cost` need args to be passed in a format compatible
    # with data_specs
    nested_args = mapping.nest(theano_args)
    fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
    self.on_load_batch = fixed_var_descr.on_load_batch

    cost_value = cost.expr(model, nested_args,
                           ** fixed_var_descr.fixed_vars)
    # End of boilerplate
    # ======================

    print "Compiling cost function..."
    cost_fn = function(theano_args, cost_value)
    self.cost_fn = cost_fn
def get_fixed_var_descr(self, model, data, **kwargs):
    data_specs = self.get_data_specs(model)
    data_specs[0].validate(data)
    rval = FixedVarDescr()
    rval.fixed_vars = {'unsup_aux_var': unsup_counter}
    rval.data_specs = data_specs

    # The input to function should be a flat, non-redundant tuple
    mapping = DataSpecsMapping(data_specs)
    data_tuple = mapping.flatten(data, return_tuple=True)
    theano_func = function(data_tuple,
                           updates=[(unsup_counter, unsup_counter + 1)])

    # the on_load_batch function will take numerical data formatted
    # as rval.data_specs, so we have to flatten it inside the
    # returned function too.
    # Using default argument binds the variables used in the lambda
    # function to the value they have when the lambda is defined.
    on_load = (lambda batch, mapping=mapping, theano_func=theano_func:
               theano_func(*mapping.flatten(batch, return_tuple=True)))
    rval.on_load_batch = [on_load]
    return rval
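To make the FixedVarDescr pattern above concrete, here is a minimal sketch of how a training loop might drive the `on_load_batch` callbacks before processing each batch. The loop itself, the `descr`, `cost`, and `nested_args` variables, and the batch size are illustrative assumptions; the iterator keyword arguments follow the ones used elsewhere in this code.

# Hypothetical driver (variable names assumed for illustration).
descr = cost.get_fixed_var_descr(model, nested_args)

for batch in dataset.iterator(mode='sequential', batch_size=100,
                              data_specs=descr.data_specs,
                              return_tuple=True):
    # Each callback sees the batch formatted as descr.data_specs and may
    # update shared variables (counters, dropout masks, ...) before the
    # main training function runs on the same batch.
    for callback in descr.on_load_batch:
        callback(batch)
    # ... call the compiled training/cost function on `batch` here ...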
def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is needed so that if new channels are added, Theano's optimizations make sure (to the extent that they can) that the new channels and old channels don't have any redundant calculations. It is also needed to regenerate Theano functions after pickling and unpickling, since Theano functions should not be pickled. """ self._dirty = False init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function(inputs=[], updates=updates, mode=self.theano_function_mode, name = 'Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() #Get the appropriate kind of theano variable to represent the data the model #acts on X = self.model.get_input_space().make_theano_batch(name = "monitoring_X") if config.compute_test_value != 'off': m = self.model.get_test_batch_size() test_value = self.model.get_input_space().get_origin_batch(m) X.tag.test_value = np.cast[X.type.dtype](test_value) if self.require_label: Y = self.model.get_output_space().make_theano_batch(name = "monitoring_Y") log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling monitor including channel '+key+'\n') log.info('\t%s' % key) it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \ for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size)] num_examples = [np.cast[config.floatX](float(i.num_examples)) for i in it] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for channel in self.channels.values(): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] n = num_examples[index] u = updates[index] if isinstance(channel.graph_input, (list, tuple)): g[channel.graph_input[0]] = X g[channel.graph_input[1]] = Y else: g[channel.graph_input] = X if n == 0: raise ValueError("Iterating over 0 examples results in divide by 0") if self.topo: batch_index = d.get_topo_batch_axis() else: batch_index = 0 val = channel.val * T.cast(X.shape[batch_index], config.floatX) / n u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' \ + key.name + ' has dtype ' + key.dtype + \ ' but is driven by an expression with type ' + \ up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key '+var_descriptor(elem)+'\n') mode.record.handle_line('g val '+var_descriptor(g[elem])+'\n') for elem in u: mode.record.handle_line('u key '+var_descriptor(elem)+'\n') mode.record.handle_line('u val '+var_descriptor(u[elem])+'\n') function_name = 'Monitor.accum[%d]' % idx if self.require_label: if mode 
is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just monitor the model # parameters, or some shared variable updated by the training algorithm, so we # need to ignore the unused input error self.accum.append(function([X, Y], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) else: if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling unsupervised accum\n') self.accum.append(function([X], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output '+var_descriptor(elem)+'\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del([name for name in final_names if name not in init_names])
def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is called any time we need to evaluate the channels and the channel definitions have changed since last we called it, or if the theano functions are unavailable for any other reason (first time they are needed after construction or deserialization, etc.) All channels are compiled as part of the same theano function so that the theano optimizations can eliminate subexpressions that are shared between multiple channels. """ self._dirty = False # Recompute the data specs, since the channels may have changed. self._build_data_specs() init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function( inputs=[], updates=updates, mode=self.theano_function_mode, name='Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() # Get the appropriate kind of theano variable to represent the data # the model acts on batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]] theano_args = self._flat_data_specs[0].make_theano_batch(batch_names) # Get a symbolic expression of the batch size # We do it here, rather than for each channel, because channels with an # empty data_specs do not use data, and are unable to extract the batch # size. The case where the whole data specs is empty is not supported. batch_size = self._flat_data_specs[0].batch_size(theano_args) # Also get a nested representation, for joint iteration # with each of channel.graph_input nested_theano_args = self._data_specs_mapping.nest(theano_args) if not isinstance(nested_theano_args, tuple): nested_theano_args = (nested_theano_args, ) assert len(nested_theano_args) == (len(self.channels) + 1) log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling monitor including ' + 'channel ' + key + '\n') log.info('\t%s' % key) it = [ d.iterator(mode=i, num_batches=n, batch_size=b, data_specs=self._flat_data_specs, return_tuple=True) for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size) ] self.num_examples = [ np.cast[config.floatX](float(i.num_examples)) for i in it ] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for i, channel in enumerate(self.channels.values()): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] cur_num_examples = self.num_examples[index] u = updates[index] # Flatten channel.graph_input and the appropriate part of # nested_theano_args, to iterate jointly over them. 
c_mapping = DataSpecsMapping(channel.data_specs) channel_inputs = c_mapping.flatten(channel.graph_input, return_tuple=True) inputs = c_mapping.flatten(nested_theano_args[i + 1], return_tuple=True) for (channel_X, X) in safe_izip(channel_inputs, inputs): assert channel_X not in g or g[channel_X] is X assert channel_X.type == X.type, (channel_X.type, X.type) g[channel_X] = X if batch_size == 0: # No channel does need any data, so there is not need to # average results, and we will call the accum functions only # once. # TODO: better handling of channels not needing data when # some other channels need data. assert len(self._flat_data_specs[1]) == 0 val = channel.val else: if n == 0: raise ValueError("Iterating over 0 examples results in " + "divide by 0") val = (channel.val * T.cast(batch_size, config.floatX) / cur_num_examples) u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' + key.name + ' has dtype ' + key.dtype + ' but is driven by an expression ' + 'with type ' + up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key ' + var_descriptor(elem) + '\n') mode.record.handle_line('g val ' + var_descriptor(g[elem]) + '\n') for elem in u: mode.record.handle_line('u key ' + var_descriptor(elem) + '\n') mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n') function_name = 'Monitor.accum[%d]' % idx if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just # monitor the model parameters, or some shared variable updated # by the training algorithm, so we need to ignore the unused # input error self.accum.append( function(theano_args, givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del( [name for name in final_names if name not in init_names])
def redo_theano(self): """ Recompiles Theano functions used by this monitor. This is needed so that if new channels are added, Theano's optimizations make sure (to the extent that they can) that the new channels and old channels don't have any redundant calculations. It is also needed to regenerate Theano functions after pickling and unpickling, since Theano functions should not be pickled. """ self._dirty = False init_names = dir(self) self.prereqs = OrderedDict() for channel in self.channels.values(): if channel.prereqs is not None: dataset = channel.dataset if dataset not in self.prereqs: self.prereqs[dataset] = [] prereqs = self.prereqs[dataset] for prereq in channel.prereqs: if prereq not in prereqs: prereqs.append(prereq) updates = OrderedDict() for channel in self.channels.values(): updates[channel.val_shared] = np.cast[config.floatX](0.0) with log_timing(log, "compiling begin_record_entry"): self.begin_record_entry = function( inputs=[], updates=updates, mode=self.theano_function_mode, name='Monitor.begin_record_entry') updates = OrderedDict() givens = OrderedDict() # Get the appropriate kind of theano variable to represent the data the model # acts on X = self.model.get_input_space().make_theano_batch(name="monitoring_X") if config.compute_test_value != 'off': m = self.model.get_test_batch_size() test_value = self.model.get_input_space().get_origin_batch(m) X.tag.test_value = np.cast[X.type.dtype](test_value) if self.require_label: Y = self.model.get_output_space().make_theano_batch( name="monitoring_Y") log.info('Monitored channels: ') for key in sorted(self.channels.keys()): mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): mode.record.handle_line( 'compiling monitor including channel ' + key + '\n') log.info('\t%s' % key) it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \ for d, i, n, b in safe_izip(self._datasets, self._iteration_mode, self._num_batches, self._batch_size)] self.num_examples = [ np.cast[config.floatX](float(i.num_examples)) for i in it ] givens = [OrderedDict() for d in self._datasets] updates = [OrderedDict() for d in self._datasets] for channel in self.channels.values(): index = self._datasets.index(channel.dataset) d = self._datasets[index] g = givens[index] cur_num_examples = self.num_examples[index] u = updates[index] if isinstance(channel.graph_input, (list, tuple)): channel_X, channel_Y = channel.graph_input assert channel_X not in g or g[channel_X] is X assert channel_Y not in g or g[channel_Y] is Y g[channel_X] = X g[channel_Y] = Y else: channel_X = channel.graph_input assert channel_X not in g or g[channel_X] is X g[channel_X] = X if n == 0: raise ValueError( "Iterating over 0 examples results in divide by 0") if self.topo: batch_index = d.get_topo_batch_axis() else: batch_index = 0 val = channel.val * T.cast(X.shape[batch_index], config.floatX) / cur_num_examples u[channel.val_shared] = channel.val_shared + val with log_timing(log, "Compiling accum"): # Check type of update expressions for up in updates: for key in up: if key.dtype != up[key].dtype: raise TypeError('Monitoring channel shared variable ' \ + key.name + ' has dtype ' + key.dtype + \ ' but is driven by an expression with type ' + \ up[key].dtype) self.accum = [] for idx, packed in enumerate(safe_izip(givens, updates)): g, u = packed mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for elem in g: mode.record.handle_line('g key ' + var_descriptor(elem) + '\n') mode.record.handle_line('g val ' + 
var_descriptor(g[elem]) + '\n') for elem in u: mode.record.handle_line('u key ' + var_descriptor(elem) + '\n') mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n') function_name = 'Monitor.accum[%d]' % idx if self.require_label: if mode is not None and hasattr(mode, 'record'): mode.record.handle_line('compiling supervised accum\n') # Some channels may not depend on the data, ie, they might just monitor the model # parameters, or some shared variable updated by the training algorithm, so we # need to ignore the unused input error self.accum.append( function([X, Y], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) else: if mode is not None and hasattr(mode, 'record'): mode.record.handle_line( 'compiling unsupervised accum\n') self.accum.append( function([X], givens=g, updates=u, mode=self.theano_function_mode, name=function_name)) for a in self.accum: if mode is not None and hasattr(mode, 'record'): for elem in a.maker.fgraph.outputs: mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n') log.info("graph size: %d" % len(a.maker.fgraph.toposort())) final_names = dir(self) self.register_names_to_del( [name for name in final_names if name not in init_names])
def __init__(self, objective, params, inputs=None, param_constrainers=None, max_iter=-1, lr_scalers=None, verbose=0, tol=None, init_alpha=None, min_init_alpha=1e-3, reset_alpha=True, conjugate=False, reset_conjugate=True, gradients=None, gradient_updates=None, line_search_mode=None, accumulate=False, theano_function_mode=None): """ objective: a theano expression to be minimized should be a function of params and, if provided, inputs params: A list of theano shared variables. These are the optimization variables inputs: (Optional) A list of theano variables to serve as inputs to the graph. param_constrainers: (Optional) A list of callables to be called on all updates dictionaries to be applied to params. This is how you implement constrained optimization. reset_alpha: If True, reverts to using init_alpha after each call. If False, the final set of alphas is used at the start of the next call to minimize. conjugate: If True, tries to pick conjugate gradient directions. For the directions to be truly conjugate, you must use line_search_mode = 'exhaustive' and the objective function must be quadratic. Using line_search_mode = 'exhaustive' on a non-quadratic objective function implements nonlinear conjugate gradient descent. reset_conjugate: has no effect unless conjugate == True if reset_conjugate == True, reverts to direction of steepest descent for the first step in each call to minimize. otherwise, tries to make the new search direction conjugate to the last one (even though the objective function might be totally different on each call to minimize) gradients: if None, compute the gradients of obj using T.grad otherwise, a dictionary mapping from params to expressions for their gradients (this allows you to use approximate gradients computed with something other than T.grad) gradient_updates: a dictionary of shared variable updates to run each time the gradient is computed Calling the ``minimize'' method with values for for ``inputs'' will update ``params'' to minimize ``objective''. """ self.__dict__.update(locals()) del self.self if line_search_mode is None: if init_alpha is None: init_alpha = (.001, .005, .01, .05, .1) else: assert line_search_mode == 'exhaustive' if init_alpha is None: init_alpha = (.5, 1.) self.init_alpha = tuple([float(elem) for elem in init_alpha]) if inputs is None: inputs = [] if param_constrainers is None: param_constrainers = [] obj = objective self.verbose = verbose param_to_grad_sym = OrderedDict() param_to_grad_shared = OrderedDict() updates = OrderedDict() if self.gradient_updates is not None: updates.update(self.gradient_updates) self.params = [param for param in params] for param in params: if self.gradients is not None and param in self.gradients: g = self.gradients[param] else: g = grad(objective, param) param_to_grad_sym[param] = g if param.name is not None: param_name = param.name else: param_name = 'anon_param' grad_name = 'BatchGradientDescent.grad_' + param_name grad_shared = sharedX(param.get_value() * 0., name=grad_name) param_to_grad_shared[param] = grad_shared updates[grad_shared] = g self.param_to_grad_shared = param_to_grad_shared if self.verbose: print 'batch gradient class compiling gradient function' t1 = time.time() if self.accumulate: self._compute_grad = Accumulator(inputs, updates=updates) else: self._compute_grad = function( inputs, updates=updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad') if self.verbose: t2 = time.time() print 'done. 
Took ', t2 - t1 if self.verbose: print 'batch gradient class compiling objective function' if self.accumulate: self.obj = Accumulator(inputs, obj) else: self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj') if self.verbose: print 'done' self.param_to_cache = OrderedDict() alpha = T.scalar(name='alpha') alpha.tag.test_value = np.cast[alpha.dtype](.01) cache_updates = OrderedDict() goto_updates = OrderedDict() for param in params: if param.name is None: param_name = 'anon_param' else: param_name = param.name cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name) cache_updates[self.param_to_cache[param]] = param cached = self.param_to_cache[param] g = self.param_to_grad_shared[param] if lr_scalers is not None and param in lr_scalers: scaled_alpha = alpha * lr_scalers[param] else: scaled_alpha = alpha mul = scaled_alpha * g diff = cached - mul goto_updates[param] = diff self._cache_values = function( [], updates=cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values') assert isinstance(param_constrainers, (list, tuple)) for param_constrainer in param_constrainers: param_constrainer(goto_updates) self._goto_alpha = function([alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha') norm = T.sqrt( sum([ T.sqr(elem).sum() for elem in self.param_to_grad_shared.values() ])) norm.name = 'BatchGradientDescent.norm' normalize_grad_updates = OrderedDict() for grad_shared in self.param_to_grad_shared.values(): normalize_grad_updates[grad_shared] = grad_shared / norm # useful for monitoring self.ave_grad_size = sharedX(0.) self.new_weight = sharedX(1.) normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + ( 1. - self.new_weight) * self.ave_grad_size self._normalize_grad = function( [], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad') if self.conjugate: grad_shared = self.param_to_grad_shared.values() grad_to_old_grad = OrderedDict() for elem in grad_shared: grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_' + elem.name) self._store_old_grad = function( [norm], updates=OrderedDict([(grad_to_old_grad[g], g * norm) for g in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad') grad_ordered = list(grad_to_old_grad.keys()) old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered] def dot_product(x, y): return sum([(x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y)]) beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \ (1e-7+dot_product(old_grad_ordered, old_grad_ordered)) assert beta_pr.ndim == 0 beta = T.maximum(beta_pr, 0.) """ beta_pr is the Polak-Ribiere formula for beta. According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste" but max(0, beta_pr) is "a popular choice... which provides direction reset automatically." 
(ie, it is meant to revert to steepest descent when you have traveled far enough that the objective function is behaving non-quadratically enough that the conjugate gradient formulas aren't working anymore) http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method """ assert grad not in grad_to_old_grad make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered] mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for v, u in make_conjugate_updates: mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \ + var_descriptor(v) + '\n') mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \ + var_descriptor(u) + '\n') self._make_conjugate = function( [], updates=make_conjugate_updates, mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate') if mode is not None and hasattr(mode, 'record'): for output in self._make_conjugate.maker.fgraph.outputs: mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \ + var_descriptor(output) + '\n') if tol is None: if objective.dtype == "float32": self.tol = 1e-6 else: self.tol = 3e-7 else: self.tol = tol self.ave_step_size = sharedX(0.) self.ave_grad_mult = sharedX(0.)
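As a usage illustration of the constructor documented above: a minimal sketch that minimizes a small quadratic with BatchGradientDescent. The data values are made up; the import paths appear elsewhere in this code, and the call pattern (construct, then call `minimize` with values for `inputs`) follows the docstring rather than a verified API reference.

import numpy as np
import theano.tensor as T
from pylearn2.utils import sharedX
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

w = sharedX(np.zeros(3), name='w')
x = T.vector('x')                       # an "input" to the graph
objective = T.sqr(w - x).sum()          # quadratic in w, so CG directions are exact

bgd = BatchGradientDescent(objective, params=[w], inputs=[x],
                           max_iter=10, conjugate=True,
                           line_search_mode='exhaustive')
bgd.minimize(np.array([1., 2., 3.], dtype=w.dtype))
print w.get_value()                     # should end up close to [1, 2, 3]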
def __init__(self, objective, params, inputs = None, param_constrainers = None, max_iter = -1, lr_scalers = None, verbose = 0, tol = None, init_alpha = None, min_init_alpha = 1e-3, reset_alpha = True, conjugate = False, reset_conjugate = True, gradients = None, gradient_updates = None, line_search_mode = None, accumulate = False, theano_function_mode=None): """ objective: a theano expression to be minimized should be a function of params and, if provided, inputs params: A list of theano shared variables. These are the optimization variables inputs: (Optional) A list of theano variables to serve as inputs to the graph. param_constrainers: (Optional) A list of callables to be called on all updates dictionaries to be applied to params. This is how you implement constrained optimization. reset_alpha: If True, reverts to using init_alpha after each call. If False, the final set of alphas is used at the start of the next call to minimize. conjugate: If True, tries to pick conjugate gradient directions. For the directions to be truly conjugate, you must use line_search_mode = 'exhaustive' and the objective function must be quadratic. Using line_search_mode = 'exhaustive' on a non-quadratic objective function implements nonlinear conjugate gradient descent. reset_conjugate: has no effect unless conjugate == True if reset_conjugate == True, reverts to direction of steepest descent for the first step in each call to minimize. otherwise, tries to make the new search direction conjugate to the last one (even though the objective function might be totally different on each call to minimize) gradients: if None, compute the gradients of obj using T.grad otherwise, a dictionary mapping from params to expressions for their gradients (this allows you to use approximate gradients computed with something other than T.grad) gradient_updates: a dictionary of shared variable updates to run each time the gradient is computed Calling the ``minimize'' method with values for for ``inputs'' will update ``params'' to minimize ``objective''. """ self.__dict__.update(locals()) del self.self if line_search_mode is None: if init_alpha is None: init_alpha = (.001, .005, .01, .05, .1) else: assert line_search_mode == 'exhaustive' if init_alpha is None: init_alpha = (.5, 1.) self.init_alpha = tuple([float(elem) for elem in init_alpha]) if inputs is None: inputs = [] if param_constrainers is None: param_constrainers = [] obj = objective self.verbose = verbose param_to_grad_sym = OrderedDict() param_to_grad_shared = OrderedDict() updates = OrderedDict() if self.gradient_updates is not None: updates.update(self.gradient_updates) self.params = [ param for param in params ] for param in params: if self.gradients is not None and param in self.gradients: g = self.gradients[param] else: g = grad(objective, param) param_to_grad_sym[param] = g if param.name is not None: param_name = param.name else: param_name = 'anon_param' grad_name = 'BatchGradientDescent.grad_' + param_name grad_shared = sharedX( param.get_value() * 0., name=grad_name) param_to_grad_shared[param] = grad_shared updates[grad_shared] = g self.param_to_grad_shared = param_to_grad_shared if self.verbose: print 'batch gradient class compiling gradient function' t1 = time.time() if self.accumulate: self._compute_grad = Accumulator(inputs, updates = updates) else: self._compute_grad = function(inputs, updates = updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad') if self.verbose: t2 = time.time() print 'done. 
Took ',t2-t1 if self.verbose: print 'batch gradient class compiling objective function' if self.accumulate: self.obj = Accumulator(inputs, obj) else: self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj') if self.verbose: print 'done' self.param_to_cache = OrderedDict() alpha = T.scalar(name = 'alpha') alpha.tag.test_value = np.cast[alpha.dtype](.01) cache_updates = OrderedDict() goto_updates = OrderedDict() for param in params: if param.name is None: param_name = 'anon_param' else: param_name = param.name cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name) cache_updates[self.param_to_cache[param]] = param cached = self.param_to_cache[param] g = self.param_to_grad_shared[param] if lr_scalers is not None and param in lr_scalers: scaled_alpha = alpha * lr_scalers[param] else: scaled_alpha = alpha mul = scaled_alpha * g diff = cached - mul goto_updates[param] = diff self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values') assert isinstance(param_constrainers, (list, tuple)) for param_constrainer in param_constrainers: param_constrainer(goto_updates) self._goto_alpha = function([alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha') norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()])) norm.name = 'BatchGradientDescent.norm' normalize_grad_updates = OrderedDict() for grad_shared in self.param_to_grad_shared.values(): normalize_grad_updates[grad_shared] = grad_shared / norm # useful for monitoring self.ave_grad_size = sharedX(0.) self.new_weight = sharedX(1.) normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad') if self.conjugate: grad_shared = self.param_to_grad_shared.values() grad_to_old_grad = OrderedDict() for elem in grad_shared: grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name) self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm) for g in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad') grad_ordered = list(grad_to_old_grad.keys()) old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered] def dot_product(x, y): return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ]) beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \ (1e-7+dot_product(old_grad_ordered, old_grad_ordered)) assert beta_pr.ndim == 0 beta = T.maximum(beta_pr, 0.) """ beta_pr is the Polak-Ribiere formula for beta. According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste" but max(0, beta_pr) is "a popular choice... which provides direction reset automatically." 
(ie, it is meant to revert to steepest descent when you have traveled far enough that the objective function is behaving non-quadratically enough that the conjugate gradient formulas aren't working anymore) http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method """ assert grad not in grad_to_old_grad make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered] mode = self.theano_function_mode if mode is not None and hasattr(mode, 'record'): for v, u in make_conjugate_updates: mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \ + var_descriptor(v) + '\n') mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \ + var_descriptor(u) + '\n') self._make_conjugate = function([], updates=make_conjugate_updates, mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate') if mode is not None and hasattr(mode, 'record'): for output in self._make_conjugate.maker.fgraph.outputs: mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \ + var_descriptor(output) + '\n') if tol is None: if objective.dtype == "float32": self.tol = 1e-6 else: self.tol = 3e-7 else: self.tol = tol self.ave_step_size = sharedX(0.) self.ave_grad_mult = sharedX(0.)
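For reference, the quantity computed with `dot_product` above is the Polak-Ribiere coefficient, clamped at zero, with a small constant in the denominator for numerical safety:

\beta_{\mathrm{PR}} = \frac{g_k^{\top} g_k - g_k^{\top} g_{k-1}}{g_{k-1}^{\top} g_{k-1} + 10^{-7}},
\qquad
\beta = \max(0,\ \beta_{\mathrm{PR}})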
def setup_impl(self, model, dataset, algorithm): cost = algorithm.cost root = model.get_param_vector() dim = root.size rng = self.rng points = rng.randn(self.num_points, self.num_basis_vectors) points = points.astype(root.dtype) points *= self.scale if self.include_root: points[0, :] = 0. if not hasattr(self, 'cost_fn'): # Cargo cult all the Pascal bullshit needed to evaluate the f*****g cost function now # ======================================= data_specs = cost.get_data_specs(model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) # End cargo culting # ====================== print "Compiling cost function..." cost_fn = function(theano_args, cost_value) self.cost_fn = cost_fn else: cost_fn = self.cost_fn cost_values = np.zeros(self.num_points) data = list(dataset.get_batch_design(self.batch_size, include_labels=True)) from pylearn2.utils.one_hot import one_hot data[1] = one_hot(data[1]) if self.method == 'gaussian': basis = rng.normal(dim, self.num_basis_vectors).astype(root.dtype) elif self.method == 'element': basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype) for i in xrange(self.num_basis_vectors): basis[rng.randint(dim), i] = 1. elif self.method == 'gradient': if not hasattr(self, 'grad_fn'): self.grad_fn = function(theano_args, grad(cost_value, model.get_params())) grad_fn = self.grad_fn basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype) for i in xrange(self.num_basis_vectors): ipt = list(dataset.get_batch_design(1, include_labels=True)) label = ipt[1] assert label.size == 1 label = label[0] one_hot = np.zeros((1, 10,),dtype='float32') one_hot[0, label] = 1 ipt[1] = one_hot g = grad_fn(*ipt) basis[:,i] = np.concatenate([e.reshape(e.size) for e in g], axis=0) else: assert False basis /= np.sqrt(np.square(basis).sum(axis=0)) # Orthogonalize basis for i in xrange(self.num_basis_vectors): v = basis[:,i ].copy() for j in xrange(i - 1): u = basis[:, j].copy() v -= np.dot(u, v) * u norm = np.sqrt(np.square(v).sum()) assert norm > 1e-4 v /= norm basis[:,i] = v for i in xrange(self.num_points): print "Evaluating cost at point ", i point = points[i, :] full_point = root + np.dot(basis, point) model.set_param_vector(full_point) cost_values[i] = cost_fn(*data) print cost_values[i] from pylearn2.utils import sharedX import theano.tensor as T print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!" 
if not hasattr(self, 'fit_quad'): points = sharedX(points) #from theano import config #config.compute_test_value = 'raise' cost_values = sharedX(cost_values) A = sharedX(np.zeros((self.num_basis_vectors, self.num_basis_vectors))) if self.psd: mat = T.dot(A.T, A) else: mat = A b = sharedX(np.zeros(self.num_basis_vectors)) c = sharedX(0.) half_quad = T.dot(points, mat) quad = (points * half_quad).sum(axis=1) lin = T.dot(points, b) pred = quad + lin + c from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent mse = T.square(pred - cost_values).mean() mae = abs(pred - cost_values).mean() obj = locals()[self.fitting_cost] fit_quad = BatchGradientDescent(obj, params = [A, b, c], max_iter = self.num_basis_vectors ** 2, verbose = 3, tol = None, init_alpha = None, min_init_alpha = 1e-7, reset_alpha = False, conjugate = True, reset_conjugate = False, line_search_mode = 'exhaustive') self.fit_quad = fit_quad self.A = A self.b = b self.c = c self.points = points self.cost_values = cost_values else: self.A.set_value(.001 * np.identity(self.A.get_value().shape[0], dtype=self.A.dtype)) self.b.set_value(self.b.get_value() * 0.) self.c.set_value(self.c.get_value() * 0.) self.points.set_value(points) self.cost_values.set_value(cost_values.astype(self.cost_values.dtype)) self.fit_quad.minimize() print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!" if self.use_solver: if self.psd: Av = self.A.get_value() mat_v = np.dot(Av.T, Av) else: mat_v = self.A.get_value() bv = self.b.get_value() # minimize for x^T A x + b^T x + c # -> solve 2 A x + b = 0 # Ax = - b / 2 print "********** mat_v", mat_v.min(), mat_v.max() x, ignored_residuals, ignored_rank, ignored_singular_values = np.linalg.lstsq(mat_v, - 0.5 * bv) print "********** soln: ", x.min(), x.mean(), x.max() print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max() assert x.ndim == 1, x.shape prod = np.dot(basis, x) norm = np.sqrt(np.square(prod).sum()) print "*************** Moving params by ",norm vector = root + prod model.set_param_vector(vector) else: # use minimizer if not hasattr(self, 'fit_params'): self.vector = sharedX(points.get_value().mean(axis=0)) vector = self.vector obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector) def constrain(d): assert vector in d n = d[vector] norm = T.sqrt(T.square(n).sum()) desired_norm = T.clip(norm, 0., self.max_jump_norm) d[vector] = n * desired_norm / norm self.fit_params = BatchGradientDescent(obj, params=[vector], max_iter = self.num_basis_vectors, verbose = 3, tol=None, param_constrainers = [constrain], init_alpha = None, min_init_alpha = 1e-3, reset_alpha=False, conjugate=True, reset_conjugate=False, line_search_mode='exhaustive') else: self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype)) self.fit_params.minimize() model.set_param_vector(root + np.dot(basis , self.vector.get_value()))
def get_fixed_var_descr(self, model, X, Y=None):
    rval = FixedVarDescr()
    rval.fixed_vars = {'sup_aux_var': sup_counter}
    rval.on_load_batch = [function([X, Y],
                                   updates=[(sup_counter, sup_counter + 1)])]
    return rval
from galatea.maxout import GCN_C01B2

layer = GCN_C01B2(layer_name='unused')

from pylearn2.space import Conv2DSpace
space = Conv2DSpace(shape=[32, 32], num_channels=3, axes = ('c', 0, 1, 'b'))
layer.set_input_space(space)

from pylearn2.utils import function
X = space.make_batch_theano()
gcn = function([X], layer.fprop(X))

from pylearn2.space import VectorSpace
vector_space = VectorSpace(32*32*3)
flatten = function([X], space.format_as(X, vector_space))

mean = np.zeros((32*32*3,), dtype='float32')
cov = np.zeros((32*32*3, 32*32*3), dtype='float32')

dataset.X = dataset.X.astype('float32')

r_ofs = 8
c_ofs = 8
def get_obj_func(model):
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    obj = cost(model, X, Y)
    return function([X, Y], obj)
def get_fixed_var_descr(self, model, data):
    """
    .. todo::

        WRITEME
    """
    X, Y = data
    assert Y is not None
    batch_size = model.batch_size

    drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
    drop_mask_X.name = "drop_mask"

    X_space = model.get_input_space()

    updates = OrderedDict()
    rval = FixedVarDescr()
    inputs = [X, Y]

    if not self.supervised:
        update_X = self.mask_gen(X, X_space=X_space)
    else:
        drop_mask_Y = sharedX(np.ones(batch_size))
        drop_mask_Y.name = "drop_mask_Y"
        update_X, update_Y = self.mask_gen(X, Y, X_space)
        updates[drop_mask_Y] = update_Y
        rval.fixed_vars["drop_mask_Y"] = drop_mask_Y
    if self.mask_gen.sync_channels:
        n = update_X.ndim
        assert n == drop_mask_X.ndim - 1
        update_X.name = "raw_update_X"
        zeros_like_X = T.zeros_like(X)
        zeros_like_X.name = "zeros_like_X"
        update_X = zeros_like_X + update_X.dimshuffle(0, 1, 2, "x")
        update_X.name = "update_X"
    updates[drop_mask_X] = update_X
    rval.fixed_vars["drop_mask"] = drop_mask_X

    if hasattr(model.inference_procedure, "V_dropout"):
        include_prob = model.inference_procedure.include_prob
        include_prob_V = model.inference_procedure.include_prob_V
        include_prob_Y = model.inference_procedure.include_prob_Y

        theano_rng = make_theano_rng(None, 2012 + 10 + 20,
                                     which_method="binomial")
        for elem in flatten([model.inference_procedure.V_dropout]):
            updates[elem] = (
                theano_rng.binomial(p=include_prob_V, size=elem.shape,
                                    dtype=elem.dtype, n=1)
                / include_prob_V
            )
        if "Softmax" in str(type(model.hidden_layers[-1])):
            hid = model.inference_procedure.H_dropout[:-1]
            y = model.inference_procedure.H_dropout[-1]
            updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape,
                                             dtype=y.dtype,
                                             n=1) / include_prob_Y
        else:
            hid = model.inference_procedure.H_dropout
        for elem in flatten(hid):
            updates[elem] = (
                theano_rng.binomial(p=include_prob, size=elem.shape,
                                    dtype=elem.dtype, n=1)
                / include_prob
            )

    rval.on_load_batch = [utils.function(inputs, updates=updates)]

    return rval
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):

    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)

    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        logger.info('batch gradient class compiling gradient function')
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.info('done. Took {0}'.format(t2 - t1))

    if self.verbose:
        logger.info('batch gradient class compiling objective function')
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        logger.info('done')

    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function(
        [alpha],
        updates=goto_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._goto_alpha')

    norm = T.sqrt(sum([T.sqr(elem).sum() for elem in
                       self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = \
        self.new_weight * norm + (1. - self.new_weight) * self.ave_grad_size

    self._normalize_grad = \
        function([], norm, updates=normalize_grad_updates,
                 mode=self.theano_function_mode,
                 name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = \
                sharedX(elem.get_value(), 'old_' + elem.name)

        self._store_old_grad = \
            function([norm],
                     updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                          for g_ in grad_to_old_grad]),
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        def dot_product(x, y):
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        beta_pr = (dot_product(grad_ordered, grad_ordered) -
                   dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7 + dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        # beta_pr is the Polak-Ribiere formula for beta.
        # According to wikipedia, the beta to use for NCG is "a matter of
        # heuristics or taste" but max(0, beta_pr) is "a popular choice...
        # which provides direction reset automatically." (ie, it is meant
        # to revert to steepest descent when you have traveled far enough
        # that the objective function is behaving non-quadratically enough
        # that the conjugate gradient formulas aren't working anymore)
        # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        assert grad not in grad_to_old_grad

        make_conjugate_updates = \
            [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate var ' +
                    var_descriptor(v) + '\n')
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate update ' +
                    var_descriptor(u) + '\n')

        self._make_conjugate = \
            function([], updates=make_conjugate_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate output ' +
                    var_descriptor(output) + '\n')

    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
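A minimal NumPy sketch of the Polak-Ribiere rule used in the conjugate branch above: beta_pr compares the current gradient with the previous one, and max(0, beta_pr) falls back to steepest descent when the conjugate direction stops being useful. The vectors g_new and g_old are illustrative stand-ins for the stored gradient values.

# Illustrative sketch of the Polak-Ribiere beta and the conjugate update.
import numpy as np

rng = np.random.RandomState(0)
g_new = rng.randn(10)                     # current gradient
g_old = rng.randn(10)                     # previous gradient, as stored earlier

beta_pr = (np.dot(g_new, g_new) - np.dot(g_new, g_old)) / \
    (1e-7 + np.dot(g_old, g_old))
beta = max(beta_pr, 0.)                   # reset to steepest descent if negative

direction = g_new + beta * g_old          # mirrors the make_conjugate update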
def run(self):
    mm = self.monitor
    updates = OrderedDict()
    for channel in mm.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    mm.begin_record_entry = function(inputs=[], updates=updates,
                                     mode=mm.theano_function_mode,
                                     name='Monitor.begin_record_entry')

    updates = OrderedDict()
    givens = OrderedDict()
    theano_args = mm._flat_data_specs[0].make_theano_batch(
        ['monitoring_%s' % s for s in mm._flat_data_specs[1]])

    # Get a symbolic expression of the batch size
    # We do it here, rather than for each channel, because channels with an
    # empty data_specs do not use data, and are unable to extract the batch
    # size. The case where the whole data specs is empty is not supported.
    batch_size = mm._flat_data_specs[0].batch_size(theano_args)

    nested_theano_args = mm._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args,)
    assert len(nested_theano_args) == (len(mm.channels) + 1)

    for key in sorted(mm.channels.keys()):
        mode = mm.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling monitor including channel ' +
                                    key + '\n')
        #log.info('\t%s' % key)

    it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                     data_specs=mm._flat_data_specs,
                     return_tuple=True)
          for d, i, n, b in safe_izip(mm._datasets, mm._iteration_mode,
                                      mm._num_batches, mm._batch_size)]
    mm.num_examples = [np.cast[config.floatX](float(i.num_examples))
                       for i in it]

    givens = [OrderedDict() for d in mm._datasets]
    updates = [OrderedDict() for d in mm._datasets]

    #for i, channel in enumerate(mm.channels.values()):
    for i, dw_name in enumerate(mm.channels.keys()):
        if dw_name in self.p_channel:
            channel = mm.channels[dw_name]
            index = mm._datasets.index(channel.dataset)
            d = mm._datasets[index]
            g = givens[index]
            cur_num_examples = mm.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                #print channel_X.type, X.type
                assert channel_X.type == X.type
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(mm._flat_data_specs[1]) == 0
                val = channel.val
            else:
                if n == 0:
                    raise ValueError("Iterating over 0 examples results in "
                                     "divide by 0")
                val = (channel.val * T.cast(batch_size, config.floatX)
                       / cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

    mm.accum = []
    for idx, packed in enumerate(safe_izip(givens, updates)):
        g, u = packed
        mode = mm.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for elem in g:
                mode.record.handle_line('g key ' + var_descriptor(elem) + '\n')
                mode.record.handle_line('g val ' + var_descriptor(g[elem]) + '\n')
            for elem in u:
                mode.record.handle_line('u key ' + var_descriptor(elem) + '\n')
                mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n')
        function_name = 'Monitor.accum[%d]' % idx
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling supervised accum\n')
        # Some channels may not depend on the data, ie, they might just
        # monitor the model parameters, or some shared variable updated by
        # the training algorithm, so we need to ignore the unused input error
        mm.accum.append(function(theano_args, givens=g, updates=u,
                                 mode=mm.theano_function_mode,
                                 name=function_name))
    for a in mm.accum:
        if mode is not None and hasattr(mode, 'record'):
            for elem in a.maker.fgraph.outputs:
                mode.record.handle_line('accum output ' +
                                        var_descriptor(elem) + '\n')
        #log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    datasets = mm._datasets

    # Set all channels' val_shared to 0
    mm.begin_record_entry()
    for d, i, b, n, a, sd, ne in safe_izip(datasets, mm._iteration_mode,
                                           mm._batch_size, mm._num_batches,
                                           mm.accum, mm._rng_seed,
                                           mm.num_examples):
        myiterator = d.iterator(mode=i, batch_size=b, num_batches=n,
                                data_specs=mm._flat_data_specs,
                                return_tuple=True, rng=sd)

        # If mm._flat_data_specs is empty, no channel needs data,
        # so we do not need to call the iterator in order to average
        # the monitored values across different batches, we only
        # have to call them once.
        if len(mm._flat_data_specs[1]) == 0:
            X = ()
            mm.run_prereqs(X, d)
            a(*X)
        else:
            actual_ne = 0
            for X in myiterator:
                # X is a flat (not nested) tuple
                mm.run_prereqs(X, d)
                a(*X)
                actual_ne += mm._flat_data_specs[0].np_batch_size(X)
            # end for X
            if actual_ne != ne:
                raise RuntimeError("At compile time, your iterator said it "
                                   "had " + str(ne) + " examples total, but "
                                   "at runtime it gave us " +
                                   str(actual_ne) + ".")
    # end for d

    t = time.time() - mm.t0

    if self.p_save is not None:
        b = open(self.p_save, 'a')
        b.write("\tEpochs seen: %d\n" % mm._epochs_seen)
        b.write("\tTime elapsed: %s\n" % str(datetime.timedelta(seconds=t)))

    print("Monitoring step:")
    print("\tEpochs seen: %d" % mm._epochs_seen)
    print("\tBatches seen: %d" % mm._num_batches_seen)
    print("\tTime elapsed: %s" % str(datetime.timedelta(seconds=t)))
    #print("\tExamples seen: %d" % mm._examples_seen)
    #print mm.channels
    for channel_name in self.p_channel:
        if channel_name in mm.channels:
            channel = mm.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(mm._num_batches_seen)
            channel.example_record.append(mm._examples_seen)
            channel.epoch_record.append(mm._epochs_seen)
            val = channel.val_shared.get_value()
            # naive hack: ...
            #channel.val_shared.set_value(0)
            channel.val_record.append(val)
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val
            print("\t%s: %s" % (channel_name, val_str))
            if self.p_save is not None:
                b.write("\t%s: %s\n" % (channel_name, val_str))

    # clean up
    if self.p_save is not None:
        b.close()

    mm._epochs_seen += 1
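The accum functions above add val * batch_size / num_examples to each channel's shared value, so one full pass over the dataset leaves the shared value holding the dataset-wide mean even when the last batch is smaller. A small self-contained NumPy check of that weighting, with illustrative batch sizes:

# Illustrative check of the batch-weighted running average.
import numpy as np

rng = np.random.RandomState(0)
batches = [rng.randn(b) for b in (32, 32, 11)]     # uneven final batch
num_examples = float(sum(len(b) for b in batches))

val_shared = 0.0
for batch in batches:
    batch_mean = batch.mean()                      # channel value on this batch
    val_shared += batch_mean * len(batch) / num_examples

full_mean = np.concatenate(batches).mean()
assert np.allclose(val_shared, full_mean)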
def redo_theano(self):
    """
    Recompiles Theano functions used by this monitor.

    This is called any time we need to evaluate the channels and the
    channel definitions have changed since last we called it, or if the
    theano functions are unavailable for any other reason (first time they
    are needed after construction or deserialization, etc.)

    All channels are compiled as part of the same theano function so that
    the theano optimizations can eliminate subexpressions that are shared
    between multiple channels.
    """
    self._dirty = False

    # Recompute the data specs, since the channels may have changed.
    self._build_data_specs()

    init_names = dir(self)
    self.prereqs = OrderedDict()
    for channel in self.channels.values():
        if channel.prereqs is not None:
            dataset = channel.dataset
            if dataset not in self.prereqs:
                self.prereqs[dataset] = []
            prereqs = self.prereqs[dataset]
            for prereq in channel.prereqs:
                if prereq not in prereqs:
                    prereqs.append(prereq)

    updates = OrderedDict()
    for channel in self.channels.values():
        updates[channel.val_shared] = np.cast[config.floatX](0.0)
    with log_timing(log, "compiling begin_record_entry"):
        self.begin_record_entry = function(
            inputs=[],
            updates=updates,
            mode=self.theano_function_mode,
            name='Monitor.begin_record_entry'
        )
    updates = OrderedDict()
    givens = OrderedDict()
    # Get the appropriate kind of theano variable to represent the data
    # the model acts on
    batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
    theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

    # Get a symbolic expression of the batch size
    # We do it here, rather than for each channel, because channels with an
    # empty data_specs do not use data, and are unable to extract the batch
    # size. The case where the whole data specs is empty is not supported.
    batch_size = self._flat_data_specs[0].batch_size(theano_args)

    # Also get a nested representation, for joint iteration
    # with each of channel.graph_input
    nested_theano_args = self._data_specs_mapping.nest(theano_args)
    if not isinstance(nested_theano_args, tuple):
        nested_theano_args = (nested_theano_args,)
    assert len(nested_theano_args) == (len(self.channels) + 1)

    log.info('Monitored channels: ')
    for key in sorted(self.channels.keys()):
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('compiling monitor including ' +
                                    'channel ' + key + '\n')
        log.info('\t%s' % key)

    it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                     data_specs=self._flat_data_specs,
                     return_tuple=True)
          for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                      self._num_batches, self._batch_size)]
    self.num_examples = [float(i.num_examples) for i in it]

    givens = [OrderedDict() for d in self._datasets]
    updates = [OrderedDict() for d in self._datasets]
    for i, channel in enumerate(self.channels.values()):
        index = self._datasets.index(channel.dataset)
        d = self._datasets[index]
        g = givens[index]
        inv_cur_num_examples = as_floatX(1. / self.num_examples[index])
        u = updates[index]

        # Flatten channel.graph_input and the appropriate part of
        # nested_theano_args, to iterate jointly over them.
        c_mapping = DataSpecsMapping(channel.data_specs)
        channel_inputs = c_mapping.flatten(channel.graph_input,
                                           return_tuple=True)
        inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                   return_tuple=True)

        for (channel_X, X) in safe_izip(channel_inputs, inputs):
            assert channel_X not in g or g[channel_X] is X
            assert channel_X.type == X.type, (channel_X.type, X.type)
            g[channel_X] = X

        if batch_size == 0:
            # No channel does need any data, so there is not need to
            # average results, and we will call the accum functions only
            # once.
            # TODO: better handling of channels not needing data when
            # some other channels need data.
            assert len(self._flat_data_specs[1]) == 0
            val = channel.val
        else:
            if n == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            val = (channel.val * T.cast(batch_size, config.floatX) *
                   inv_cur_num_examples)
        u[channel.val_shared] = channel.val_shared + val

    with log_timing(log, "Compiling accum"):
        # Check type of update expressions
        for up in updates:
            for key in up:
                if key.dtype != up[key].dtype:
                    raise TypeError('Monitoring channel shared variable ' +
                                    key.name + ' has dtype ' + key.dtype +
                                    ' but is driven by an expression ' +
                                    'with type ' + up[key].dtype)

        self.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('g val ' +
                                            var_descriptor(g[elem]) + '\n')
                for elem in u:
                    mode.record.handle_line('u key ' +
                                            var_descriptor(elem) + '\n')
                    mode.record.handle_line('u val ' +
                                            var_descriptor(u[elem]) + '\n')
            function_name = 'Monitor.accum[%d]' % idx
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just
            # monitor the model parameters, or some shared variable updated
            # by the training algorithm, so we need to ignore the unused
            # input error
            self.accum.append(function(theano_args, givens=g, updates=u,
                                       mode=self.theano_function_mode,
                                       name=function_name))
        for a in self.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output ' +
                                            var_descriptor(elem) + '\n')
            log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

    final_names = dir(self)
    self.register_names_to_del([name for name in final_names
                                if name not in init_names])
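A minimal self-contained Theano sketch of the reset-then-accumulate pattern that begin_record_entry and the accum functions rely on: a shared value is zeroed once per monitoring step, then each batch adds its contribution via an update. All names below are illustrative, not taken from the Monitor code.

# Illustrative sketch of accumulating into a shared variable with updates.
import numpy as np
import theano
import theano.tensor as T

val_shared = theano.shared(np.cast[theano.config.floatX](0.0))
x = T.vector('x')

reset = theano.function([], updates=[(val_shared, 0. * val_shared)])
accum = theano.function([x], updates=[(val_shared, val_shared + x.mean())])

reset()
accum(np.ones(4, dtype=theano.config.floatX))       # adds 1.0
accum(3 * np.ones(4, dtype=theano.config.floatX))   # adds 3.0
total = val_shared.get_value()                       # 4.0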