def get_fixed_var_descr(self, model, X, Y):
    """
    .. todo::

        WRITEME
    """
    assert Y is not None
    batch_size = model.batch_size
    drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
    drop_mask_X.name = 'drop_mask'
    X_space = model.get_input_space()
    updates = OrderedDict()
    rval = FixedVarDescr()
    inputs = [X, Y]

    if not self.supervised:
        update_X = self.mask_gen(X, X_space=X_space)
    else:
        drop_mask_Y = sharedX(np.ones(batch_size,))
        drop_mask_Y.name = 'drop_mask_Y'
        update_X, update_Y = self.mask_gen(X, Y, X_space)
        updates[drop_mask_Y] = update_Y
        rval.fixed_vars['drop_mask_Y'] = drop_mask_Y
    if self.mask_gen.sync_channels:
        n = update_X.ndim
        assert n == drop_mask_X.ndim - 1
        update_X.name = 'raw_update_X'
        zeros_like_X = T.zeros_like(X)
        zeros_like_X.name = 'zeros_like_X'
        update_X = zeros_like_X + update_X.dimshuffle(0, 1, 2, 'x')
        update_X.name = 'update_X'
    updates[drop_mask_X] = update_X
    rval.fixed_vars['drop_mask'] = drop_mask_X

    if hasattr(model.inference_procedure, 'V_dropout'):
        include_prob = model.inference_procedure.include_prob
        include_prob_V = model.inference_procedure.include_prob_V
        include_prob_Y = model.inference_procedure.include_prob_Y

        theano_rng = MRG_RandomStreams(2012 + 11 + 20)
        for elem in flatten([model.inference_procedure.V_dropout]):
            updates[elem] = theano_rng.binomial(p=include_prob_V,
                                                size=elem.shape,
                                                dtype=elem.dtype,
                                                n=1) / include_prob_V
        if "Softmax" in str(type(model.hidden_layers[-1])):
            hid = model.inference_procedure.H_dropout[:-1]
            y = model.inference_procedure.H_dropout[-1]
            updates[y] = theano_rng.binomial(p=include_prob_Y,
                                             size=y.shape,
                                             dtype=y.dtype,
                                             n=1) / include_prob_Y
        else:
            hid = model.inference_procedure.H_dropout
        for elem in flatten(hid):
            updates[elem] = theano_rng.binomial(p=include_prob,
                                                size=elem.shape,
                                                dtype=elem.dtype,
                                                n=1) / include_prob

    rval.on_load_batch = [utils.function(inputs, updates=updates)]

    return rval
def get_gradients(self, model, data, **kwargs):
    self.get_data_specs(model)[0].validate(data)
    obj, scratch = self.base_cost(model, data, return_locals=True, **kwargs)
    if self.supervised:
        assert isinstance(data, (list, tuple))
        assert len(data) == 2
        (X, Y) = data
    else:
        X, = data

    interm_grads = OrderedDict()

    H_hat = scratch['H_hat']
    terms = scratch['terms']
    hidden_layers = scratch['hidden_layers']

    grads = OrderedDict()

    assert len(H_hat) == len(terms)
    assert len(terms) == len(hidden_layers)
    num_layers = len(hidden_layers)
    for i in xrange(num_layers):
        state = H_hat[i]
        layer = model.hidden_layers[i]
        term = terms[i]

        if term == 0.:
            continue
        else:
            print 'term is ', term

        if i == 0:
            state_below = X
            layer_below = model.visible_layer
        else:
            layer_below = model.hidden_layers[i - 1]
            state_below = H_hat[i - 1]
        state_below = layer_below.upward_state(state_below)

        components = flatten(state)

        real_grads = T.grad(term, components)

        fake_state = layer.linear_feed_forward_approximation(state_below)

        fake_components = flatten(fake_state)
        real_grads = OrderedDict(safe_zip(fake_components, real_grads))

        params = list(layer.get_params())
        fake_grads = T.grad(cost=None, consider_constant=flatten(state_below),
                            wrt=params, known_grads=real_grads)

        for param, grad in safe_zip(params, fake_grads):
            if param in grads:
                grads[param] = grads[param] + grad
            else:
                grads[param] = grad

    return grads, OrderedDict()
def get_gradients(self, model, X, Y=None, **kwargs):
    obj, scratch = self.base_cost(model, X, Y, return_locals=True, **kwargs)

    interm_grads = OrderedDict()

    H_hat = scratch['H_hat']
    terms = scratch['terms']
    hidden_layers = scratch['hidden_layers']

    grads = OrderedDict()

    assert len(H_hat) == len(terms)
    assert len(terms) == len(hidden_layers)
    num_layers = len(hidden_layers)
    for i in xrange(num_layers):
        state = H_hat[i]
        layer = model.hidden_layers[i]
        term = terms[i]

        if term == 0.:
            continue
        else:
            print 'term is ', term

        if i == 0:
            state_below = X
            layer_below = model.visible_layer
        else:
            layer_below = model.hidden_layers[i - 1]
            state_below = H_hat[i - 1]
        state_below = layer_below.upward_state(state_below)

        components = flatten(state)

        real_grads = T.grad(term, components)

        fake_state = layer.linear_feed_forward_approximation(state_below)

        fake_components = flatten(fake_state)
        real_grads = OrderedDict(safe_zip(fake_components, real_grads))

        params = list(layer.get_params())
        fake_grads = T.grad(cost=None, consider_constant=flatten(state_below),
                            wrt=params, known_grads=real_grads)

        for param, grad in safe_zip(params, fake_grads):
            if param in grads:
                grads[param] = grads[param] + grad
            else:
                grads[param] = grad

    return grads, OrderedDict()
def nan_check(i, node, fn):
    inputs = fn.inputs
    # TODO: figure out why individual inputs are themselves lists sometimes
    for x in flatten(inputs):
        do_check_on(x, node, fn, True)
    fn()
    outputs = fn.outputs
    for j, x in enumerate(flatten(outputs)):
        do_check_on(x, node, fn, False)
def get_monitoring_channels(self, data):
    """
    .. todo::

        WRITEME
    """
    space, source = self.get_monitoring_data_specs()
    space.validate(data)
    X = data
    history = self.mf(X, return_history=True)
    q = history[-1]

    rval = OrderedDict()

    ch = self.visible_layer.get_monitoring_channels()
    for key in ch:
        rval['vis_' + key] = ch[key]

    for state, layer in safe_zip(q, self.hidden_layers):
        ch = layer.get_monitoring_channels()
        for key in ch:
            rval[layer.layer_name + '_' + key] = ch[key]
        ch = layer.get_monitoring_channels_from_state(state)
        for key in ch:
            rval['mf_' + layer.layer_name + '_' + key] = ch[key]

    if len(history) > 1:
        prev_q = history[-2]

        flat_q = flatten(q)
        flat_prev_q = flatten(prev_q)

        mx = None
        for new, old in safe_zip(flat_q, flat_prev_q):
            cur_mx = abs(new - old).max()
            if new is old:
                logger.error('{0} is {1}'.format(new, old))
                assert False
            if mx is None:
                mx = cur_mx
            else:
                mx = T.maximum(mx, cur_mx)
        rval['max_var_param_diff'] = mx

        for layer, new, old in safe_zip(self.hidden_layers, q, prev_q):
            sum_diff = 0.
            for sub_new, sub_old in safe_zip(flatten(new), flatten(old)):
                sum_diff += abs(sub_new - sub_old).sum()
            denom = self.batch_size * \
                layer.get_total_state_space().get_total_dimension()
            denom = np.cast[config.floatX](denom)
            rval['mean_' + layer.layer_name + '_var_param_diff'] = \
                sum_diff / denom

    return rval
def _get_standard_neg(self, model, layer_to_chains):
    params = list(model.get_params())

    warnings.warn("""TODO: reduce variance of negative phase by
                  integrating out the even-numbered layers. The
                  Rao-Blackwellize method can do this for you when
                  expected gradient = gradient of expectation, but
                  doing this in general is trickier.""")
    # layer_to_chains = model.rao_blackwellize(layer_to_chains)

    expected_energy_p = model.energy(
        layer_to_chains[model.visible_layer],
        [layer_to_chains[layer] for layer in model.hidden_layers]
    ).mean()

    samples = flatten(layer_to_chains.values())
    for i, sample in enumerate(samples):
        if sample.name is None:
            sample.name = 'sample_' + str(i)

    neg_phase_grads = OrderedDict(
        safe_zip(params, T.grad(-expected_energy_p, params,
                                consider_constant=samples,
                                disconnected_inputs='ignore'))
    )
    return neg_phase_grads
def _get_standard_neg(self, model, layer_to_chains):
    """
    .. todo::

        WRITEME
    """
    params = list(model.get_params())

    warnings.warn("""TODO: reduce variance of negative phase by
                  integrating out the even-numbered layers. The
                  Rao-Blackwellize method can do this for you when
                  expected gradient = gradient of expectation, but
                  doing this in general is trickier.""")
    # layer_to_chains = model.rao_blackwellize(layer_to_chains)

    expected_energy_p = model.energy(
        layer_to_chains[model.visible_layer],
        [layer_to_chains[layer] for layer in model.hidden_layers]).mean()

    samples = flatten(layer_to_chains.values())
    for i, sample in enumerate(samples):
        if sample.name is None:
            sample.name = 'sample_' + str(i)

    neg_phase_grads = OrderedDict(
        safe_zip(
            params,
            T.grad(-expected_energy_p, params,
                   consider_constant=samples,
                   disconnected_inputs='ignore')))

    return neg_phase_grads
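# The negative phase above relies on the identity
# d/dtheta log Z = -E_{p(v,h)}[ d/dtheta E(v,h) ], which is why taking
# T.grad of -expected_energy with the chain samples held constant
# (consider_constant) gives an unbiased estimate of the log-partition
# gradient. Below is a small, self-contained NumPy check of that identity
# on a hypothetical toy model (not part of pylearn2): an RBM-style energy
# E(v, h) = -v^T W h over 2 visible and 2 hidden binary units, small enough
# to enumerate exactly.
import itertools
import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(2, 2)

states = [(np.array(v, dtype=float), np.array(h, dtype=float))
          for v in itertools.product([0, 1], repeat=2)
          for h in itertools.product([0, 1], repeat=2)]

def toy_energy(v, h, W):
    return -v.dot(W).dot(h)

def toy_log_Z(W):
    return np.log(sum(np.exp(-toy_energy(v, h, W)) for v, h in states))

# Exact gradient of log Z via central finite differences
eps = 1e-6
num_grad = np.zeros_like(W)
for i in range(2):
    for j in range(2):
        Wp = W.copy(); Wp[i, j] += eps
        Wm = W.copy(); Wm[i, j] -= eps
        num_grad[i, j] = (toy_log_Z(Wp) - toy_log_Z(Wm)) / (2 * eps)

# -E_p[dE/dW]: since dE/dW = -v h^T, this is E_p[v h^T]
p = np.array([np.exp(-toy_energy(v, h, W)) for v, h in states])
p /= p.sum()
analytic = sum(pi * np.outer(v, h) for pi, (v, h) in zip(p, states))

assert np.allclose(num_grad, analytic, atol=1e-5)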
def nan_check(i, node, fn):
    """
    Runs `fn` while checking its inputs and outputs for NaNs / Infs

    Parameters
    ----------
    i : currently ignored (TODO: determine why it is here or remove)
    node : theano.gof.Apply
        The Apply node currently being executed
    fn : callable
        The thunk to execute for this Apply node
    """
    inputs = fn.inputs
    # TODO: figure out why individual inputs are themselves lists sometimes
    for x in flatten(inputs):
        do_check_on(x, node, fn, True)
    fn()
    outputs = fn.outputs
    for j, x in enumerate(flatten(outputs)):
        do_check_on(x, node, fn, False)
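# For context, a hedged sketch of the standard Theano hook that uses the same
# (i, node, fn) callback signature: theano.compile.MonitorMode runs the thunk
# itself and then calls post_func, so a MonitorMode callback only inspects
# fn.outputs rather than calling fn() the way nan_check does above. This is
# the pattern from the Theano debugging docs, shown here as an illustration
# rather than as the exact wiring pylearn2 uses for nan_check.
import numpy as np
import theano
import theano.tensor as T

def detect_nan(i, node, fn):
    # Each entry of fn.outputs is a one-element container holding the
    # computed value for that output.
    for output in fn.outputs:
        if isinstance(output[0], np.ndarray) and np.isnan(output[0]).any():
            print('NaN detected in an output of %s' % str(node))
            break

x = T.dscalar('x')
f = theano.function([x], T.log(x),
                    mode=theano.compile.MonitorMode(post_func=detect_nan))
f(-1.)  # log of a negative number yields NaN and triggers the hook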
def _get_sampling_pos(self, model, X, Y):
    """
    .. todo::

        WRITEME
    """
    layer_to_clamp = OrderedDict([(model.visible_layer, True)])
    layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])
    if self.supervised:
        # note: if the Y layer changes to something without linear energy,
        # we'll need to make the expected energy clamp Y in the
        # positive phase
        assert isinstance(model.hidden_layers[-1], dbm.Softmax)
        layer_to_clamp[model.hidden_layers[-1]] = True
        layer_to_pos_samples[model.hidden_layers[-1]] = Y
        hid = model.hidden_layers[:-1]
    else:
        assert Y is None
        hid = model.hidden_layers

    for layer in hid:
        mf_state = layer.init_mf_state()

        def recurse_zeros(x):
            if isinstance(x, tuple):
                return tuple([recurse_zeros(e) for e in x])
            return x.zeros_like()

        layer_to_pos_samples[layer] = recurse_zeros(mf_state)

    layer_to_pos_samples = model.mcmc_steps(
        layer_to_state=layer_to_pos_samples,
        layer_to_clamp=layer_to_clamp,
        num_steps=self.num_gibbs_steps,
        theano_rng=self.theano_rng)

    q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

    pos_samples = flatten(q)

    # The gradients of the expected energy under q are easy, we can just
    # do that in theano
    expected_energy_q = model.energy(X, q).mean()
    params = list(model.get_params())
    gradients = OrderedDict(
        safe_zip(
            params,
            T.grad(expected_energy_q, params,
                   consider_constant=pos_samples,
                   disconnected_inputs='ignore')))
    return gradients
def _get_sampling_pos(self, model, X, Y):
    """
    .. todo::

        WRITEME
    """
    layer_to_clamp = OrderedDict([(model.visible_layer, True)])
    layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])
    if self.supervised:
        # note: if the Y layer changes to something without linear energy,
        # we'll need to make the expected energy clamp Y in the
        # positive phase
        assert isinstance(model.hidden_layers[-1], Softmax)
        layer_to_clamp[model.hidden_layers[-1]] = True
        layer_to_pos_samples[model.hidden_layers[-1]] = Y
        hid = model.hidden_layers[:-1]
    else:
        assert Y is None
        hid = model.hidden_layers

    for layer in hid:
        mf_state = layer.init_mf_state()

        def recurse_zeros(x):
            if isinstance(x, tuple):
                return tuple([recurse_zeros(e) for e in x])
            return x.zeros_like()

        layer_to_pos_samples[layer] = recurse_zeros(mf_state)

    layer_to_pos_samples = model.mcmc_steps(
        layer_to_state=layer_to_pos_samples,
        layer_to_clamp=layer_to_clamp,
        num_steps=self.num_gibbs_steps,
        theano_rng=self.theano_rng,
    )
    q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

    pos_samples = flatten(q)

    # The gradients of the expected energy under q are easy, we can just
    # do that in theano
    expected_energy_q = model.energy(X, q).mean()
    params = list(model.get_params())
    gradients = OrderedDict(
        safe_zip(
            params,
            T.grad(expected_energy_q, params,
                   consider_constant=pos_samples,
                   disconnected_inputs="ignore")
        )
    )
    return gradients
def _get_variational_pos(self, model, X, Y):
    """
    .. todo::

        WRITEME
    """
    if self.supervised:
        assert Y is not None
        # note: if the Y layer changes to something without linear energy,
        # we'll need to make the expected energy clamp Y in the positive
        # phase
        assert isinstance(model.hidden_layers[-1], dbm.Softmax)

    q = model.mf(X, Y)

    """
    Use the non-negativity of the KL divergence to construct a lower
    bound on the log likelihood. We can drop all terms that are
    constant with respect to the model parameters:

    log P(v) = L(v, q) + KL(q || P(h|v))
    L(v, q) = log P(v) - KL(q || P(h|v))
    L(v, q) = log P(v) - sum_h q(h) log q(h) + sum_h q(h) log P(h | v)
    L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
    L(v, q) = log P(v) + sum_h q(h) log P(h, v)
              - sum_h q(h) log P(v) + const
    L(v, q) = sum_h q(h) log P(h, v) + const
    L(v, q) = sum_h q(h) -E(h, v) - log Z + const

    so the cost we want to minimize is
    expected_energy + log Z + const

    Note: for the RBM, this bound is exact, since the KL divergence
    goes to 0.
    """

    variational_params = flatten(q)

    # The gradients of the expected energy under q are easy, we can just
    # do that in theano
    expected_energy_q = model.expected_energy(X, q).mean()
    params = list(model.get_params())
    gradients = OrderedDict(
        safe_zip(
            params,
            T.grad(expected_energy_q, params,
                   consider_constant=variational_params,
                   disconnected_inputs='ignore')))
    return gradients
def _get_variational_pos(self, model, X, Y):
    """
    .. todo::

        WRITEME
    """
    if self.supervised:
        assert Y is not None
        # note: if the Y layer changes to something without linear energy,
        # we'll need to make the expected energy clamp Y in the positive
        # phase
        assert isinstance(model.hidden_layers[-1], Softmax)

    q = model.mf(X, Y)

    """
    Use the non-negativity of the KL divergence to construct a lower
    bound on the log likelihood. We can drop all terms that are
    constant with respect to the model parameters:

    log P(v) = L(v, q) + KL(q || P(h|v))
    L(v, q) = log P(v) - KL(q || P(h|v))
    L(v, q) = log P(v) - sum_h q(h) log q(h) + sum_h q(h) log P(h | v)
    L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
    L(v, q) = log P(v) + sum_h q(h) log P(h, v)
              - sum_h q(h) log P(v) + const
    L(v, q) = sum_h q(h) log P(h, v) + const
    L(v, q) = sum_h q(h) -E(h, v) - log Z + const

    so the cost we want to minimize is
    expected_energy + log Z + const

    Note: for the RBM, this bound is exact, since the KL divergence
    goes to 0.
    """

    variational_params = flatten(q)

    # The gradients of the expected energy under q are easy, we can just
    # do that in theano
    expected_energy_q = model.expected_energy(X, q).mean()
    params = list(model.get_params())
    gradients = OrderedDict(
        safe_zip(params,
                 T.grad(expected_energy_q, params,
                        consider_constant=variational_params,
                        disconnected_inputs='ignore'))
    )
    return gradients
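# A tiny NumPy check of the bound derived in the docstring above, using a
# hypothetical toy model (not part of pylearn2): for any distribution q(h),
# log P(v) >= E_q[-E(v, h)] - log Z + H(q), with equality when q is the true
# posterior (the exact single-layer / RBM case mentioned in the note).
import itertools
import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(3, 2)                           # 3 visible, 2 hidden binary units
v = np.array([1., 0., 1.])
hs = [np.array(h, dtype=float) for h in itertools.product([0, 1], repeat=2)]

def toy_E(v, h):
    return -v.dot(W).dot(h)

all_v = [np.array(b, dtype=float) for b in itertools.product([0, 1], repeat=3)]
log_Z = np.log(sum(np.exp(-toy_E(vv, h)) for vv in all_v for h in hs))
log_P_v = np.log(sum(np.exp(-toy_E(v, h)) for h in hs)) - log_Z

q = rng.uniform(size=len(hs))
q /= q.sum()                                  # an arbitrary q(h)
expected_energy = sum(qi * toy_E(v, h) for qi, h in zip(q, hs))
entropy = -sum(qi * np.log(qi) for qi in q)
bound = -expected_energy - log_Z + entropy

assert bound <= log_P_v + 1e-12               # L(v, q) never exceeds log P(v)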
def _get_toronto_neg(self, model, layer_to_chains):
    """
    .. todo::

        WRITEME
    """
    # Ruslan Salakhutdinov's undocumented negative phase from
    # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
    # IG copied it here without fully understanding it, so it
    # only applies to exactly the same model structure as
    # in that code.

    assert isinstance(model.visible_layer, dbm.BinaryVector)
    assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
    assert model.hidden_layers[0].pool_size == 1
    assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
    assert model.hidden_layers[1].pool_size == 1
    assert isinstance(model.hidden_layers[2], dbm.Softmax)
    assert len(model.hidden_layers) == 3

    params = list(model.get_params())

    V_samples = layer_to_chains[model.visible_layer]
    H1_samples, H2_samples, Y_samples = [
        layer_to_chains[layer] for layer in model.hidden_layers
    ]

    H1_mf = model.hidden_layers[0].mf_update(
        state_below=model.visible_layer.upward_state(V_samples),
        state_above=model.hidden_layers[1].downward_state(H2_samples),
        layer_above=model.hidden_layers[1])
    Y_mf = model.hidden_layers[2].mf_update(
        state_below=model.hidden_layers[1].upward_state(H2_samples))
    H2_mf = model.hidden_layers[1].mf_update(
        state_below=model.hidden_layers[0].upward_state(H1_mf),
        state_above=model.hidden_layers[2].downward_state(Y_mf),
        layer_above=model.hidden_layers[2])

    expected_energy_p = model.energy(V_samples,
                                     [H1_mf, H2_mf, Y_samples]).mean()

    constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

    neg_phase_grads = OrderedDict(
        safe_zip(
            params,
            T.grad(-expected_energy_p, params,
                   consider_constant=constants)))
    return neg_phase_grads
def _get_toronto_neg(self, model, layer_to_chains):
    """
    .. todo::

        WRITEME
    """
    # Ruslan Salakhutdinov's undocumented negative phase from
    # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
    # IG copied it here without fully understanding it, so it
    # only applies to exactly the same model structure as
    # in that code.

    assert isinstance(model.visible_layer, BinaryVector)
    assert isinstance(model.hidden_layers[0], BinaryVectorMaxPool)
    assert model.hidden_layers[0].pool_size == 1
    assert isinstance(model.hidden_layers[1], BinaryVectorMaxPool)
    assert model.hidden_layers[1].pool_size == 1
    assert isinstance(model.hidden_layers[2], Softmax)
    assert len(model.hidden_layers) == 3

    params = list(model.get_params())

    V_samples = layer_to_chains[model.visible_layer]
    H1_samples, H2_samples, Y_samples = [layer_to_chains[layer]
                                         for layer in model.hidden_layers]

    H1_mf = model.hidden_layers[0].mf_update(
        state_below=model.visible_layer.upward_state(V_samples),
        state_above=model.hidden_layers[1].downward_state(H2_samples),
        layer_above=model.hidden_layers[1])
    Y_mf = model.hidden_layers[2].mf_update(
        state_below=model.hidden_layers[1].upward_state(H2_samples))
    H2_mf = model.hidden_layers[1].mf_update(
        state_below=model.hidden_layers[0].upward_state(H1_mf),
        state_above=model.hidden_layers[2].downward_state(Y_mf),
        layer_above=model.hidden_layers[2])

    expected_energy_p = model.energy(
        V_samples, [H1_mf, H2_mf, Y_samples]
    ).mean()

    constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

    neg_phase_grads = OrderedDict(
        safe_zip(params,
                 T.grad(-expected_energy_p, params,
                        consider_constant=constants)))
    return neg_phase_grads
keep = keep_func(filter_me.X, filter_me.y)
keep = keep.astype('bool')
filter_me.X = filter_me.X[keep, :]
filter_me.y = filter_me.y[keep, :]

dropout = hasattr(model.inference_procedure, 'V_dropout')
if dropout:
    include_prob = model.inference_procedure.include_prob
    theano_rng = MRG_RandomStreams(2012 + 11 + 20)
    updates = {}
    for elem in flatten([model.inference_procedure.V_dropout,
                         model.inference_procedure.H_dropout]):
        updates[elem] = theano_rng.binomial(p=include_prob,
                                            size=elem.shape,
                                            dtype=elem.dtype,
                                            n=1) / include_prob
    do_dropout = function([], updates=updates)

while True:
    if dropout:
        do_dropout()

    if cost.supervised:
        X, Y = dataset.get_batch_design(m, include_labels=True)
    else:
        X = dataset.get_batch_design(m)
    if topo:
        X = dataset.get_topological_view(X)
def setup(self, model, dataset):
    """
    Allows the training algorithm to do some preliminary configuration
    *before* we actually start training the model. The dataset is
    provided in case other derived training algorithms need to modify
    the model based on the dataset.

    Parameters
    ----------
    model : a Python object representing the model to train, loosely
        implementing the interface of models.model.Model.
    dataset : a pylearn2.datasets.dataset.Dataset object used to draw
        training data
    """
    self.model = model

    if self.set_batch_size:
        model.set_batch_size(self.batch_size)

    if self.batch_size is None:
        self.batch_size = model.force_batch_size

    model.cost = self.cost
    model.mask_gen = self.mask_gen

    self.monitor = Monitor.get_monitor(model)
    self.monitor.set_theano_function_mode(self.theano_function_mode)
    prereq = self.get_setup_batch_object()

    # We want to use big batches. We need to make several theano calls on
    # each batch. To avoid paying the GPU latency every time, we use a
    # shared variable, but the shared variable needs to stay allocated
    # during the time that the monitor is working, and we don't want the
    # monitor to increase the memory overhead. So we make the monitor
    # work off of the same shared variable.
    space = model.get_input_space()
    X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
    self.space = space
    rng = np.random.RandomState([2012, 7, 20])
    test_mask = space.get_origin_batch(model.batch_size)
    test_mask = rng.randint(0, 2, test_mask.shape)
    if hasattr(self.mask_gen, 'sync_channels') and \
            self.mask_gen.sync_channels:
        if test_mask.ndim != 4:
            raise NotImplementedError()
        test_mask = test_mask[:, :, :, 0]
        assert test_mask.ndim == 3
    drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
    self.drop_mask = drop_mask
    assert drop_mask.ndim == test_mask.ndim

    Y = None
    drop_mask_Y = None
    if self.cost.supervised:
        Y = sharedX(
            model.get_output_space().get_origin_batch(model.batch_size),
            'BGD_Y')
        self.Y = Y
        test_mask_Y = rng.randint(0, 2, (model.batch_size,))
        drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                              name='drop_mask_Y')
        self.drop_mask_Y = drop_mask_Y
        dmx, dmy = self.mask_gen(X, Y)
        updates = OrderedDict([(drop_mask, dmx),
                               (drop_mask_Y, dmy)])
    else:
        updates = OrderedDict([(drop_mask, self.mask_gen(X))])

    obj = self.cost(model, X, Y, drop_mask=drop_mask,
                    drop_mask_Y=drop_mask_Y)
    gradients, gradient_updates = self.cost.get_gradients(
        model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

    if hasattr(model.inference_procedure, 'V_dropout'):
        include_prob = model.inference_procedure.include_prob
        theano_rng = MRG_RandomStreams(2012 + 11 + 20)
        for elem in flatten([model.inference_procedure.V_dropout,
                             model.inference_procedure.H_dropout]):
            updates[elem] = theano_rng.binomial(p=include_prob,
                                                size=elem.shape,
                                                dtype=elem.dtype,
                                                n=1) / include_prob
    self.update_mask = function([], updates=updates)

    if self.monitoring_dataset is not None:
        if not any([dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()]):
            Y = None
        assert X.name is not None
        channels = model.get_monitoring_channels(X, Y)
        if not isinstance(channels, dict):
            raise TypeError("model.get_monitoring_channels must return a "
                            "dictionary, but it returned " + str(channels))
        assert X.name is not None
        wtf = self.cost.get_monitoring_channels(model, X=X, Y=Y,
                                                drop_mask=drop_mask,
                                                drop_mask_Y=drop_mask_Y)
        for key in wtf:
            channels[key] = wtf[key]

        for dataset_name in self.monitoring_dataset:
            if dataset_name == '':
                prefix = ''
            else:
                prefix = dataset_name + '_'
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            self.monitor.add_dataset(dataset=monitoring_dataset,
                                     mode="sequential",
                                     batch_size=self.batch_size,
                                     num_batches=self.monitoring_batches)

            # We only need to put the prereq in once to make sure it gets
            # run. Adding it more times shouldn't hurt, but be careful:
            # each time you say "self.setup_batch" you get a new object
            # with a different id, and if you install n of those the
            # prereq will run n times. It won't cause any wrong results,
            # just a big slowdown.
            warnings.warn("This is weird-- ipt=(X,Y)=tell the monitor to "
                          "replace X, Y with the givens dict, but you "
                          "don't actually want them to be replaced.")
            ipt = X
            if Y is not None:
                ipt = [X, Y]

            self.monitor.add_channel(prefix + 'objective', ipt=ipt,
                                     val=obj, dataset=monitoring_dataset,
                                     prereqs=[prereq])

            for name in channels:
                J = channels[name]
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = []

                prereqs = list(prereqs)
                prereqs.append(prereq)

                if Y is not None:
                    ipt = (X, Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=prefix + name,
                                         ipt=ipt,
                                         val=J,
                                         dataset=monitoring_dataset,
                                         prereqs=prereqs)

    self.accumulate = self.combine_batches > 1
    if self.accumulate:
        self.inputs = [elem for elem in [X, Y, drop_mask, drop_mask_Y]
                       if elem is not None]
    else:
        self.inputs = None

    self.optimizer = BatchGradientDescent(
        objective=obj,
        inputs=self.inputs,
        verbose=1,
        gradients=gradients,
        gradient_updates=gradient_updates,
        params=model.get_params(),
        lr_scalers=model.get_lr_scalers(),
        param_constrainers=[model.censor_updates],
        max_iter=self.max_iter,
        tol=3e-7,
        init_alpha=self.init_alpha,
        reset_alpha=self.reset_alpha,
        conjugate=self.conjugate,
        reset_conjugate=self.reset_conjugate,
        min_init_alpha=self.min_init_alpha,
        line_search_mode=self.line_search_mode,
        accumulate=self.accumulate,
        theano_function_mode=self.theano_function_mode)
    self.X = X

    if self.monitoring_dataset is not None:
        self.monitor.add_channel(
            name='ave_step_size',
            ipt=ipt,
            val=self.optimizer.ave_step_size,
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_size',
            ipt=ipt,
            val=self.optimizer.ave_grad_size,
            dataset=self.monitoring_dataset.values()[0])
        self.monitor.add_channel(
            name='ave_grad_mult',
            ipt=ipt,
            val=self.optimizer.ave_grad_mult,
            dataset=self.monitoring_dataset.values()[0])

    self.first = True
    self.bSetup = True
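# A minimal sketch (names hypothetical, not pylearn2 API) of the shared-variable
# batching pattern described in the comment above the 'BGD_X' allocation: the
# batch is copied once into a shared variable, and every compiled function
# reads it from there, so repeated Theano calls on the same batch do not pay a
# fresh host-to-device transfer each time.
import numpy as np
import theano
import theano.tensor as T
from theano import config

X_shared = theano.shared(np.zeros((128, 784), dtype=config.floatX),
                         name='X_shared')
toy_cost = T.sqr(X_shared).sum()          # stand-in for the real objective
f_cost = theano.function([], toy_cost)    # no inputs: reads the shared batch

batch = np.random.randn(128, 784).astype(config.floatX)
X_shared.set_value(batch)                 # one transfer per batch
for _ in range(3):                        # many calls, no further transfers
    f_cost()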
keep_func = function([gX, gY], keep)

keep = keep_func(filter_me.X, filter_me.y)
keep = keep.astype('bool')
filter_me.X = filter_me.X[keep, :]
filter_me.y = filter_me.y[keep, :]

dropout = hasattr(model.inference_procedure, 'V_dropout')
if dropout:
    include_prob = model.inference_procedure.include_prob
    theano_rng = MRG_RandomStreams(2012 + 11 + 20)
    updates = {}
    for elem in flatten([model.inference_procedure.V_dropout,
                         model.inference_procedure.H_dropout]):
        updates[elem] = theano_rng.binomial(p=include_prob,
                                            size=elem.shape,
                                            dtype=elem.dtype,
                                            n=1) / include_prob
    do_dropout = function([], updates=updates)

while True:
    if dropout:
        do_dropout()

    if cost.supervised:
        X, Y = dataset.get_batch_design(m, include_labels=True)
    else:
        X = dataset.get_batch_design(m)
    if topo:
def get_gradients(self, model, X, Y=None):
    """
    PCD approximation to the gradient.
    Keep in mind this is a cost, so we use the negative log likelihood.
    """

    layer_to_clamp = OrderedDict([(model.visible_layer, True)])
    layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])

    if self.supervised:
        assert Y is not None
        # note: if the Y layer changes to something without linear energy,
        # we'll need to make the expected energy clamp Y in the positive
        # phase
        assert isinstance(model.hidden_layers[-1], dbm.Softmax)
        layer_to_clamp[model.hidden_layers[-1]] = True
        layer_to_pos_samples[model.hidden_layers[-1]] = Y
        hid = model.hidden_layers[:-1]
    else:
        assert Y is None
        hid = model.hidden_layers

    for layer in hid:
        mf_state = layer.init_mf_state()

        def recurse_zeros(x):
            if isinstance(x, tuple):
                return tuple([recurse_zeros(e) for e in x])
            return x.zeros_like()

        layer_to_pos_samples[layer] = recurse_zeros(mf_state)

    layer_to_pos_samples = model.mcmc_steps(
        layer_to_state=layer_to_pos_samples,
        layer_to_clamp=layer_to_clamp,
        num_steps=self.num_gibbs_steps,
        theano_rng=self.theano_rng)

    q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

    pos_samples = flatten(q)

    # The gradients of the expected energy under q are easy, we can just
    # do that in theano
    expected_energy_q = model.energy(X, q).mean()
    params = list(model.get_params())
    gradients = OrderedDict(
        safe_zip(params, T.grad(expected_energy_q, params,
                                consider_constant=pos_samples,
                                disconnected_inputs='ignore')))

    """
    d/d theta log Z = (d/d theta Z) / Z
                    = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
                    = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
                    = - sum_h sum_v P(v,h) d/d theta E(v,h)
    """

    layer_to_chains = model.make_layer_to_state(self.num_chains)

    def recurse_check(l):
        if isinstance(l, (list, tuple)):
            for elem in l:
                recurse_check(elem)
        else:
            assert l.get_value().shape[0] == self.num_chains

    recurse_check(layer_to_chains.values())

    model.layer_to_chains = layer_to_chains

    # Note that we replace layer_to_chains with a dict mapping to the new
    # state of the chains
    updates, layer_to_chains = model.get_sampling_updates(
        layer_to_chains, self.theano_rng,
        num_steps=self.num_gibbs_steps,
        return_layer_to_updated=True)

    if self.toronto_neg:
        # Ruslan Salakhutdinov's undocumented negative phase from
        # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
        # IG copied it here without fully understanding it, so it
        # only applies to exactly the same model structure as
        # in that code.
        assert isinstance(model.visible_layer, dbm.BinaryVector)
        assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
        assert model.hidden_layers[0].pool_size == 1
        assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
        assert model.hidden_layers[1].pool_size == 1
        assert isinstance(model.hidden_layers[2], dbm.Softmax)
        assert len(model.hidden_layers) == 3

        V_samples = layer_to_chains[model.visible_layer]
        H1_samples, H2_samples, Y_samples = [layer_to_chains[layer]
                                             for layer in
                                             model.hidden_layers]

        H1_mf = model.hidden_layers[0].mf_update(
            state_below=model.visible_layer.upward_state(V_samples),
            state_above=model.hidden_layers[1].downward_state(H2_samples),
            layer_above=model.hidden_layers[1])
        Y_mf = model.hidden_layers[2].mf_update(
            state_below=model.hidden_layers[1].upward_state(H2_samples))
        H2_mf = model.hidden_layers[1].mf_update(
            state_below=model.hidden_layers[0].upward_state(H1_mf),
            state_above=model.hidden_layers[2].downward_state(Y_mf),
            layer_above=model.hidden_layers[2])

        expected_energy_p = model.energy(
            V_samples, [H1_mf, H2_mf, Y_samples]).mean()

        constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

        neg_phase_grads = OrderedDict(
            safe_zip(params, T.grad(-expected_energy_p, params,
                                    consider_constant=constants)))
    else:
        warnings.warn("""TODO: reduce variance of negative phase by
                      integrating out the even-numbered layers. The
                      Rao-Blackwellize method can do this for you when
                      expected gradient = gradient of expectation, but
                      doing this in general is trickier.""")
        # layer_to_chains = model.rao_blackwellize(layer_to_chains)
        expected_energy_p = model.energy(
            layer_to_chains[model.visible_layer],
            [layer_to_chains[layer] for layer in model.hidden_layers]
        ).mean()

        samples = flatten(layer_to_chains.values())
        for i, sample in enumerate(samples):
            if sample.name is None:
                sample.name = 'sample_' + str(i)

        neg_phase_grads = OrderedDict(
            safe_zip(params, T.grad(-expected_energy_p, params,
                                    consider_constant=samples,
                                    disconnected_inputs='ignore')))

    for param in list(gradients.keys()):
        # print param.name, ': '
        # print theano.printing.min_informative_str(neg_phase_grads[param])
        gradients[param] = neg_phase_grads[param] + gradients[param]

    return gradients, updates
def get_gradients(self, model, X, Y=None):
    """
    PCD approximation to the gradient of the bound.
    Keep in mind this is a cost, so we are upper bounding the negative
    log likelihood.
    """

    if self.supervised:
        assert Y is not None
        # note: if the Y layer changes to something without linear energy,
        # we'll need to make the expected energy clamp Y in the positive
        # phase
        assert isinstance(model.hidden_layers[-1], dbm.Softmax)

    q = model.mf(X, Y)

    """
    Use the non-negativity of the KL divergence to construct a lower
    bound on the log likelihood. We can drop all terms that are
    constant with respect to the model parameters:

    log P(v) = L(v, q) + KL(q || P(h|v))
    L(v, q) = log P(v) - KL(q || P(h|v))
    L(v, q) = log P(v) - sum_h q(h) log q(h) + sum_h q(h) log P(h | v)
    L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
    L(v, q) = log P(v) + sum_h q(h) log P(h, v)
              - sum_h q(h) log P(v) + const
    L(v, q) = sum_h q(h) log P(h, v) + const
    L(v, q) = sum_h q(h) -E(h, v) - log Z + const

    so the cost we want to minimize is
    expected_energy + log Z + const

    Note: for the RBM, this bound is exact, since the KL divergence
    goes to 0.
    """

    variational_params = flatten(q)

    # The gradients of the expected energy under q are easy, we can just
    # do that in theano
    expected_energy_q = model.expected_energy(X, q).mean()
    params = list(model.get_params())
    gradients = OrderedDict(
        safe_zip(params, T.grad(expected_energy_q, params,
                                consider_constant=variational_params,
                                disconnected_inputs='ignore')))

    """
    d/d theta log Z = (d/d theta Z) / Z
                    = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
                    = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
                    = - sum_h sum_v P(v,h) d/d theta E(v,h)
    """

    layer_to_chains = model.make_layer_to_state(self.num_chains)

    def recurse_check(l):
        if isinstance(l, (list, tuple)):
            for elem in l:
                recurse_check(elem)
        else:
            assert l.get_value().shape[0] == self.num_chains

    recurse_check(layer_to_chains.values())

    model.layer_to_chains = layer_to_chains

    # Note that we replace layer_to_chains with a dict mapping to the new
    # state of the chains
    updates, layer_to_chains = model.get_sampling_updates(
        layer_to_chains, self.theano_rng,
        num_steps=self.num_gibbs_steps,
        return_layer_to_updated=True)

    if self.toronto_neg:
        # Ruslan Salakhutdinov's undocumented negative phase from
        # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
        # IG copied it here without fully understanding it, so it
        # only applies to exactly the same model structure as
        # in that code.
        assert isinstance(model.visible_layer, dbm.BinaryVector)
        assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
        assert model.hidden_layers[0].pool_size == 1
        assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
        assert model.hidden_layers[1].pool_size == 1
        assert isinstance(model.hidden_layers[2], dbm.Softmax)
        assert len(model.hidden_layers) == 3

        V_samples = layer_to_chains[model.visible_layer]
        H1_samples, H2_samples, Y_samples = [layer_to_chains[layer]
                                             for layer in
                                             model.hidden_layers]

        H1_mf = model.hidden_layers[0].mf_update(
            state_below=model.visible_layer.upward_state(V_samples),
            state_above=model.hidden_layers[1].downward_state(H2_samples),
            layer_above=model.hidden_layers[1])
        Y_mf = model.hidden_layers[2].mf_update(
            state_below=model.hidden_layers[1].upward_state(H2_samples))
        H2_mf = model.hidden_layers[1].mf_update(
            state_below=model.hidden_layers[0].upward_state(H1_mf),
            state_above=model.hidden_layers[2].downward_state(Y_mf),
            layer_above=model.hidden_layers[2])

        expected_energy_p = model.energy(
            V_samples, [H1_mf, H2_mf, Y_samples]).mean()

        constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

        neg_phase_grads = OrderedDict(
            safe_zip(params, T.grad(-expected_energy_p, params,
                                    consider_constant=constants)))
    else:
        warnings.warn("""TODO: reduce variance of negative phase by
                      integrating out the even-numbered layers. The
                      Rao-Blackwellize method can do this for you when
                      expected gradient = gradient of expectation, but
                      doing this in general is trickier.""")
        # layer_to_chains = model.rao_blackwellize(layer_to_chains)
        expected_energy_p = model.energy(
            layer_to_chains[model.visible_layer],
            [layer_to_chains[layer] for layer in model.hidden_layers]
        ).mean()

        samples = flatten(layer_to_chains.values())
        for i, sample in enumerate(samples):
            if sample.name is None:
                sample.name = 'sample_' + str(i)

        neg_phase_grads = OrderedDict(
            safe_zip(params, T.grad(-expected_energy_p, params,
                                    consider_constant=samples,
                                    disconnected_inputs='ignore')))

    for param in list(gradients.keys()):
        gradients[param] = neg_phase_grads[param] + gradients[param]

    return gradients, updates
def expr(self, model, data, drop_mask=None, drop_mask_Y=None,
         return_locals=False, include_toronto=True, **kwargs):
    """
    .. todo::

        WRITEME
    """
    if self.supervised:
        X, Y = data
    else:
        X = data
        Y = None

    if not self.supervised:
        assert drop_mask_Y is None
        # ignore Y if some other cost is supervised and has made it get
        # passed in (can this still happen after the (space, source)
        # interface change?)
        Y = None
    if self.supervised:
        assert Y is not None
        if drop_mask is not None:
            assert drop_mask_Y is not None

    if not hasattr(model, 'cost'):
        model.cost = self
    if not hasattr(model, 'mask_gen'):
        model.mask_gen = self.mask_gen

    dbm = model

    X_space = model.get_input_space()

    if drop_mask is None:
        if self.supervised:
            drop_mask, drop_mask_Y = self.mask_gen(X, Y, X_space=X_space)
        else:
            drop_mask = self.mask_gen(X, X_space=X_space)

    if drop_mask_Y is not None:
        assert drop_mask_Y.ndim == 1

    if drop_mask.ndim < X.ndim:
        if self.mask_gen is not None:
            assert self.mask_gen.sync_channels
        if X.ndim != 4:
            raise NotImplementedError()
        drop_mask = drop_mask.dimshuffle(0, 1, 2, 'x')

    if not hasattr(self, 'noise'):
        self.noise = False

    history = dbm.do_inpainting(X, Y=Y, drop_mask=drop_mask,
                                drop_mask_Y=drop_mask_Y,
                                return_history=True, noise=self.noise,
                                niter=self.niter,
                                block_grad=self.block_grad)
    final_state = history[-1]

    new_drop_mask = None
    new_drop_mask_Y = None
    new_history = [None for state in history]

    if not hasattr(self, 'both_directions'):
        self.both_directions = False
    if self.both_directions:
        new_drop_mask = 1. - drop_mask
        if self.supervised:
            new_drop_mask_Y = 1. - drop_mask_Y
        new_history = dbm.do_inpainting(X, Y=Y, drop_mask=new_drop_mask,
                                        drop_mask_Y=new_drop_mask_Y,
                                        return_history=True,
                                        noise=self.noise,
                                        niter=self.niter,
                                        block_grad=self.block_grad)

    new_final_state = new_history[-1]

    total_cost, sublocals = self.cost_from_states(
        final_state, new_final_state, dbm, X, Y, drop_mask, drop_mask_Y,
        new_drop_mask, new_drop_mask_Y, return_locals=True)
    l1_act_cost = sublocals['l1_act_cost']
    inpaint_cost = sublocals['inpaint_cost']
    reweighted_act_cost = sublocals['reweighted_act_cost']

    if not hasattr(self, 'robustness'):
        self.robustness = None
    if self.robustness is not None:
        inpainting_H_hat = history[-1]['H_hat']
        mf_H_hat = dbm.mf(X, Y=Y)
        if self.supervised:
            inpainting_H_hat = inpainting_H_hat[:-1]
            mf_H_hat = mf_H_hat[:-1]
        for ihh, mhh in safe_izip(flatten(inpainting_H_hat),
                                  flatten(mf_H_hat)):
            total_cost += self.robustness * T.sqr(mhh - ihh).sum()

    if not hasattr(self, 'toronto_act_targets'):
        self.toronto_act_targets = None
    toronto_act_cost = None
    if self.toronto_act_targets is not None and include_toronto:
        toronto_act_cost = 0.
        H_hat = history[-1]['H_hat']
        for s, c, t in zip(H_hat, self.toronto_act_coeffs,
                           self.toronto_act_targets):
            if c == 0.:
                continue
            s, _ = s
            m = s.mean(axis=0)
            toronto_act_cost += c * T.sqr(m - t).mean()
        total_cost += toronto_act_cost

    if return_locals:
        return locals()

    total_cost.name = 'total_inpaint_cost'

    return total_cost
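# A quick NumPy analogue of the sync_channels broadcast performed above via
# dimshuffle(0, 1, 2, 'x'): a (batch, rows, cols) mask gains a length-1
# channel axis so the same pixel is dropped in every channel of a
# (batch, rows, cols, channels) input. Array names here are hypothetical.
import numpy as np

X_batch = np.random.randn(2, 3, 3, 4)               # b01c-format batch
mask = np.random.randint(0, 2, (2, 3, 3)).astype(X_batch.dtype)
masked = X_batch * (1 - mask[:, :, :, np.newaxis])  # same mask for all channels
assert masked.shape == X_batch.shape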
def get_gradients(self, model, data, **kwargs):
    """
    .. todo::

        WRITEME
    """
    self.get_data_specs(model)[0].validate(data)
    obj, scratch = self.base_cost.expr(model, data, return_locals=True,
                                       **kwargs)
    if self.supervised:
        assert isinstance(data, (list, tuple))
        assert len(data) == 2
        (X, Y) = data
    else:
        X = data

    H_hat = scratch['H_hat']
    terms = scratch['terms']
    hidden_layers = scratch['hidden_layers']

    grads = OrderedDict()

    assert len(H_hat) == len(terms)
    assert len(terms) == len(hidden_layers)
    num_layers = len(hidden_layers)
    for i in xrange(num_layers):
        state = H_hat[i]
        layer = model.hidden_layers[i]
        term = terms[i]

        if term == 0.:
            continue
        else:
            print 'term is ', term

        if i == 0:
            state_below = X
            layer_below = model.visible_layer
        else:
            layer_below = model.hidden_layers[i - 1]
            state_below = H_hat[i - 1]
        state_below = layer_below.upward_state(state_below)

        components = flatten(state)

        real_grads = T.grad(term, components)

        fake_state = layer.linear_feed_forward_approximation(state_below)

        fake_components = flatten(fake_state)
        real_grads = OrderedDict(safe_zip(fake_components, real_grads))

        params = list(layer.get_params())
        fake_grads = pylearn2.utils.grad(
            cost=None,
            consider_constant=flatten(state_below),
            wrt=params,
            known_grads=real_grads)

        for param, grad in safe_zip(params, fake_grads):
            if param in grads:
                grads[param] = grads[param] + grad
            else:
                grads[param] = grad

    return grads, OrderedDict()
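# A minimal sketch (hypothetical variables, plain Theano) of the known_grads
# mechanism the gradient trick above relies on: T.grad(cost=None,
# known_grads={y: g}) backpropagates the supplied gradient g through the
# subgraph that produced y instead of differentiating a scalar cost. Here the
# result equals the gradient of sum(3 * dot(W, x)) with respect to W.
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.ones((2, 2)), name='W')
x = T.dvector('x')
y = T.dot(W, x)

g_y = T.ones_like(y) * 3.           # pretend this came from another cost term
g_W, = T.grad(cost=None, wrt=[W], known_grads={y: g_y})

f = theano.function([x], g_W)
print(f(np.array([1., 2.])))        # rows are 3 * x: [[3., 6.], [3., 6.]]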
def __call__(self, model, X, Y=None, drop_mask=None, drop_mask_Y=None,
             return_locals=False, include_toronto=True, **kwargs):
    """
    .. todo::

        WRITEME
    """
    if not self.supervised:
        assert drop_mask_Y is None
        # ignore Y if some other cost is supervised and has made it get
        # passed in
        Y = None
    if self.supervised:
        assert Y is not None
        if drop_mask is not None:
            assert drop_mask_Y is not None

    if not hasattr(model, 'cost'):
        model.cost = self
    if not hasattr(model, 'mask_gen'):
        model.mask_gen = self.mask_gen

    dbm = model

    X_space = model.get_input_space()

    if drop_mask is None:
        if self.supervised:
            drop_mask, drop_mask_Y = self.mask_gen(X, Y, X_space=X_space)
        else:
            drop_mask = self.mask_gen(X, X_space=X_space)

    if drop_mask_Y is not None:
        assert drop_mask_Y.ndim == 1

    if drop_mask.ndim < X.ndim:
        if self.mask_gen is not None:
            assert self.mask_gen.sync_channels
        if X.ndim != 4:
            raise NotImplementedError()
        drop_mask = drop_mask.dimshuffle(0, 1, 2, 'x')

    if not hasattr(self, 'noise'):
        self.noise = False

    history = dbm.do_inpainting(X, Y=Y, drop_mask=drop_mask,
                                drop_mask_Y=drop_mask_Y,
                                return_history=True, noise=self.noise,
                                niter=self.niter,
                                block_grad=self.block_grad)
    final_state = history[-1]

    new_drop_mask = None
    new_drop_mask_Y = None
    new_history = [None for state in history]

    if not hasattr(self, 'both_directions'):
        self.both_directions = False
    if self.both_directions:
        new_drop_mask = 1. - drop_mask
        if self.supervised:
            new_drop_mask_Y = 1. - drop_mask_Y
        new_history = dbm.do_inpainting(X, Y=Y, drop_mask=new_drop_mask,
                                        drop_mask_Y=new_drop_mask_Y,
                                        return_history=True,
                                        noise=self.noise,
                                        niter=self.niter,
                                        block_grad=self.block_grad)

    new_final_state = new_history[-1]

    total_cost, sublocals = self.cost_from_states(
        final_state, new_final_state, dbm, X, Y, drop_mask, drop_mask_Y,
        new_drop_mask, new_drop_mask_Y, return_locals=True)
    l1_act_cost = sublocals['l1_act_cost']
    inpaint_cost = sublocals['inpaint_cost']
    reweighted_act_cost = sublocals['reweighted_act_cost']

    if not hasattr(self, 'robustness'):
        self.robustness = None
    if self.robustness is not None:
        inpainting_H_hat = history[-1]['H_hat']
        mf_H_hat = dbm.mf(X, Y=Y)
        if self.supervised:
            inpainting_H_hat = inpainting_H_hat[:-1]
            mf_H_hat = mf_H_hat[:-1]
        for ihh, mhh in safe_izip(flatten(inpainting_H_hat),
                                  flatten(mf_H_hat)):
            total_cost += self.robustness * T.sqr(mhh - ihh).sum()

    if not hasattr(self, 'toronto_act_targets'):
        self.toronto_act_targets = None
    toronto_act_cost = None
    if self.toronto_act_targets is not None and include_toronto:
        toronto_act_cost = 0.
        H_hat = history[-1]['H_hat']
        for s, c, t in zip(H_hat, self.toronto_act_coeffs,
                           self.toronto_act_targets):
            if c == 0.:
                continue
            s, _ = s
            m = s.mean(axis=0)
            toronto_act_cost += c * T.sqr(m - t).mean()
        total_cost += toronto_act_cost

    if return_locals:
        return locals()

    total_cost.name = 'total_inpaint_cost'

    return total_cost
def get_fixed_var_descr(self, model, data):
    """
    .. todo::

        WRITEME
    """
    X, Y = data
    assert Y is not None

    batch_size = model.batch_size

    drop_mask_X = sharedX(
        model.get_input_space().get_origin_batch(batch_size))
    drop_mask_X.name = "drop_mask"

    X_space = model.get_input_space()

    updates = OrderedDict()
    rval = FixedVarDescr()
    inputs = [X, Y]

    if not self.supervised:
        update_X = self.mask_gen(X, X_space=X_space)
    else:
        drop_mask_Y = sharedX(np.ones(batch_size))
        drop_mask_Y.name = "drop_mask_Y"
        update_X, update_Y = self.mask_gen(X, Y, X_space)
        updates[drop_mask_Y] = update_Y
        rval.fixed_vars["drop_mask_Y"] = drop_mask_Y
    if self.mask_gen.sync_channels:
        n = update_X.ndim
        assert n == drop_mask_X.ndim - 1
        update_X.name = "raw_update_X"
        zeros_like_X = T.zeros_like(X)
        zeros_like_X.name = "zeros_like_X"
        update_X = zeros_like_X + update_X.dimshuffle(0, 1, 2, "x")
        update_X.name = "update_X"
    updates[drop_mask_X] = update_X
    rval.fixed_vars["drop_mask"] = drop_mask_X

    if hasattr(model.inference_procedure, "V_dropout"):
        include_prob = model.inference_procedure.include_prob
        include_prob_V = model.inference_procedure.include_prob_V
        include_prob_Y = model.inference_procedure.include_prob_Y

        theano_rng = make_theano_rng(None, 2012 + 10 + 20,
                                     which_method="binomial")
        for elem in flatten([model.inference_procedure.V_dropout]):
            updates[elem] = (
                theano_rng.binomial(p=include_prob_V, size=elem.shape,
                                    dtype=elem.dtype, n=1)
                / include_prob_V
            )
        if "Softmax" in str(type(model.hidden_layers[-1])):
            hid = model.inference_procedure.H_dropout[:-1]
            y = model.inference_procedure.H_dropout[-1]
            updates[y] = (
                theano_rng.binomial(p=include_prob_Y, size=y.shape,
                                    dtype=y.dtype, n=1)
                / include_prob_Y
            )
        else:
            hid = model.inference_procedure.H_dropout
        for elem in flatten(hid):
            updates[elem] = (
                theano_rng.binomial(p=include_prob, size=elem.shape,
                                    dtype=elem.dtype, n=1)
                / include_prob
            )

    rval.on_load_batch = [utils.function(inputs, updates=updates)]

    return rval
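# A small NumPy sketch of the rescaled ("inverted") dropout trick used in the
# updates above: dividing the binary mask by include_prob keeps the masked
# unit's expected value equal to its original value, so the mean-field
# statistics stay unbiased on average. Values here are hypothetical.
import numpy as np

rng = np.random.RandomState(0)
include_prob = 0.5
h = np.full(10000, 0.8)                            # some unit activations
mask = rng.binomial(n=1, p=include_prob, size=h.shape) / include_prob
print(h.mean(), (mask * h).mean())                 # both close to 0.8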