def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build RMSProp-with-momentum update expressions for every parameter.

    Parameters
    ----------
    learning_rate : float or symbolic scalar
        Step size multiplied into the normalized gradient.
    grads : dict
        Maps each parameter (a shared variable) to its gradient expression.
    lr_scalers : dict, optional
        Accepted for interface compatibility; this rule does not read it.

    Returns
    -------
    updates : OrderedDict
        New values for the running squared-gradient average, the momentum
        accumulator and the parameter itself, for each key of `grads`.
    """
    updates = OrderedDict()
    for param, grad in grads.items():
        # Per-parameter state, both initialised to zeros of the param's shape.
        running_sq = sharedX(np.zeros_like(param.get_value()))
        velocity = sharedX(np.zeros_like(param.get_value()))
        if param.name is not None:
            running_sq.name = 'avg_grad_sqr_' + param.name

        # Exponential moving average of the squared gradient.
        new_running_sq = (self.averaging_coeff * running_sq
                          + (1 - self.averaging_coeff) * T.sqr(grad))

        # RMS of the gradient, floored at the stabilizer to avoid dividing
        # by tiny values.
        rms = T.maximum(T.sqrt(new_running_sq), self.stabilizer)
        scaled_grad = grad / (rms)

        new_velocity = (self.momentum * velocity
                        - learning_rate * scaled_grad)

        updates[running_sq] = new_running_sq
        updates[velocity] = new_velocity
        updates[param] = param + new_velocity
    return updates
def __init__(self, nvis, nclasses):
    """
    Set up a zero-initialised logistic regression layer.

    Parameters
    ----------
    nvis : int
        Number of input units, i.e. the dimension of the space in which
        the datapoints lie.
    nclasses : int
        Number of output units, i.e. the dimension of the space in which
        the labels lie.
    """
    super(LogisticRegressionLayer, self).__init__()

    assert nvis >= 0, "Number of visible units must be non-negative"
    assert nclasses >= 0, "Number of classes must be non-negative"
    self.nvis, self.nclasses = nvis, nclasses

    # Weight matrix of shape (nvis, nclasses), all zeros.
    weight_init = numpy.zeros((nvis, nclasses))
    self.W = sharedX(weight_init, name='W', borrow=True)

    # Bias vector with one zero entry per class.
    bias_init = numpy.zeros((nclasses,))
    self.b = sharedX(bias_init, name='b', borrow=True)

    # Parameters of the model.
    self._params = [self.W, self.b]
def bench(f, m, n):
    """
    Time a compiled Theano function that applies `f` to an (m, n) matrix.

    Parameters
    ----------
    f : callable
        Maps a symbolic matrix to a symbolic matrix (e.g. a softmax variant).
    m : int
        Number of rows of the random input.
    n : int
        Number of columns of the random input.

    Returns
    -------
    rval : float
        Mean wall-clock time in seconds of five calls, measured after five
        warm-up calls.
    """
    rng = np.random.RandomState([2012, 9, 11])
    X = sharedX(rng.randn(m, n))
    Y = sharedX(X.get_value())
    func = theano.function([], updates={Y: f(X)})

    compiled_ops = [node.op for node in func.maker.fgraph.toposort()]

    def graph_contains(op_class):
        return any(isinstance(op, op_class) for op in compiled_ops)

    # Make sure the optimizations haven't made us benchmark something
    # different from what we intend.
    if f is my_softmax:
        assert not graph_contains(theano.tensor.nnet.Softmax)
    if f is softmax_op:
        assert graph_contains(theano.tensor.nnet.Softmax)
    if f is softmax_with_bias:
        assert graph_contains(theano.tensor.nnet.SoftmaxWithBias)

    # Warm up.
    for _ in xrange(5):
        func()

    # Actual timing.
    durations = []
    for _ in xrange(5):
        started = time.time()
        func()
        finished = time.time()
        durations.append(finished - started)

    return np.asarray(durations).mean()
def __init__(self, dict_size, dim, context_length, k, irange = 0.1, seed = 22):
    """
    Create the embeddings, context weights, biases and spaces of the model.

    Parameters
    ----------
    dict_size : int
        Number of labels; sizes both embedding tables, the bias vector and
        `max_labels` of the input and output spaces.
    dim : int
        Width of each embedding row and first axis of `C`.
    context_length : int
        Second axis of `C` and dimension of the input index space.
    k : object
        Not read by this constructor.
    irange : float
        Embeddings are drawn uniformly from [-irange, irange]; `C` and `b`
        are drawn from a normal with standard deviation sqrt(irange).
    seed : int
        Seed of the numpy RandomState used for all draws.
    """
    super(vLBLSoft, self).__init__()

    rng = np.random.RandomState(seed)
    self.rng = rng
    self.context_length = context_length
    self.dim = dim
    self.dict_size = dict_size

    floatX = theano.config.floatX
    normal_std = math.sqrt(irange)

    # Draw order (C, W_context, W_target, b) determines the values obtained
    # from the seeded generator, so it must not be changed.
    C_values = np.asarray(
        rng.normal(0, normal_std, size=(dim, context_length)),
        dtype=floatX)
    self.C = theano.shared(value=C_values, name='C', borrow=True)

    embedding_shape = (dict_size, dim)
    W_context = sharedX(rng.uniform(-irange, irange, embedding_shape),
                        name='W_context')
    W_target = sharedX(rng.uniform(-irange, irange, embedding_shape),
                       name='W_target')

    self.projector_context = MatrixMul(W_context)
    self.projector_target = MatrixMul(W_target)

    self.W_context = W_context
    # NOTE(review): the attribute ends up bound to W_context, not W_target
    # (an earlier `self.W_target = W_target` was immediately overwritten),
    # while projector_target still wraps W_target. Confirm whether this
    # tying is intentional.
    self.W_target = W_context

    b_values = np.asarray(
        rng.normal(0, normal_std, size=(dict_size,)),
        dtype=floatX)
    self.b = theano.shared(value=b_values, name='b', borrow=True)

    self.input_space = IndexSpace(dim = context_length, max_labels = dict_size)
    self.output_space = IndexSpace(dim = 1, max_labels = dict_size)

    # Column vector holding every label index 0 .. dict_size - 1.
    all_labels = np.arange(dict_size, dtype=np.int64).reshape(dict_size, 1)
    self.allY = T.as_tensor_variable(all_labels)
def createGradientFunctions(self):
    """
    Build the symbolic objective and compile the Theano functions stored on
    the instance.

    Sets `self.negKL`, `self.f`, `self.logLike`, `self.gradientfunction`,
    `self.lowerboundfunction` (compiled functions) and `self.optimizer`
    (a BatchGradientDescent over mu, logSigma and logLambd).
    Reads `self.dimTheta` and `self.m`.
    """
    # Create the symbolic inputs.
    X = T.dmatrices("X")
    mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f", "R")
    # mu and logSigma are rebound here to randomly initialised shared
    # variables; the symbolic columns created just above are discarded.
    # `R` is never used below.
    mu = sharedX( np.random.normal(10, 10, (self.dimTheta, 1)), name='mu')
    logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)), name='logSigma')
    # logLambd is first a shared variable, then immediately rebound to a
    # symbolic matrix broadcastable along both axes; only the symbolic
    # version is used from here on.
    logLambd = sharedX(np.matrix(np.random.uniform(0, 10)),name='logLambd')
    logLambd = T.patternbroadcast(T.dmatrix("logLambd"),[1,1])
    negKL = 0.5 * T.sum(1 + 2*logSigma - mu ** 2 - T.exp(logSigma) ** 2)
    # theta = mu + exp(logSigma) * v, with v supplied as an input.
    theta = mu+T.exp(logSigma)*v
    W=theta
    # First column of X is used as the target, the remaining columns as
    # the regressors.
    y=X[:,0]
    X_sim=X[:,1:]
    f = (T.dot(X_sim,W)+u).flatten()
    gradvariables = [mu, logSigma, logLambd]
    logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 * ((y-f)/(T.exp(logLambd)))**2)
    logp = (negKL + logLike)/self.m
    # The optimizer minimises, so it receives the negated objective.
    optimizer = -logp
    # NOTE(review): mu and logSigma are shared variables here but are passed
    # as explicit function inputs below -- confirm Theano accepts this.
    self.negKL = th.function([mu, logSigma], negKL, on_unused_input='ignore')
    self.f = th.function(gradvariables + [X,u,v], f, on_unused_input='ignore')
    self.logLike = th.function(gradvariables + [X, u, v], logLike,on_unused_input='ignore')
    derivatives = T.grad(logp,gradvariables)
    # The compiled gradient function also returns logp as its last output.
    derivatives.append(logp)
    self.gradientfunction = th.function(gradvariables + [X, u, v], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [X, u, v], logp, on_unused_input='ignore')
    self.optimizer = BatchGradientDescent(objective=optimizer, params=gradvariables,inputs = [X,u,v],conjugate=True,max_iter=1)
def redo_everything(self):
    """
    Recompile the learning function if needed and rebuild the negative
    chains. Weights and biases are not reset.

    TODO: figure out how to make the semantics of this cleaner / more in
    line with other models.
    """
    # Compile learn_func if necessary.
    if self.autonomous:
        self.redo_theano()

    # Make the negative chains (skipped when use_cd is set).
    if not self.use_cd:
        visible_chains = self.make_chains(self.bias_vis)
        self.V_chains = visible_chains
        self.V_chains.name = 'dbm_V_chains'

        self.H_chains = [self.make_chains(layer_bias)
                         for layer_bias in self.bias_hid]
        for layer_idx, chain in enumerate(self.H_chains):
            chain.name = 'dbm_H[%d]_chain' % layer_idx

        if self.num_classes > 0:
            # Every chain starts from a sample of the same class
            # distribution, softmax(bias_class).
            class_probs = np.zeros((self.negative_chains, self.num_classes)) \
                + T.nnet.softmax(self.bias_class)
            temp_theano_rng = RandomStreams(87)
            sample_from = Sampler(temp_theano_rng, 'multinomial')
            draw_labels = function([], sample_from(class_probs))
            self.Y_chains = sharedX(draw_labels(), 'Y_chains')
        else:
            self.Y_chains = None

    init_beta_given = (hasattr(self, 'init_beta')
                       and self.init_beta is not None)
    if init_beta_given:
        visible_shape = self.bias_vis.get_value().shape
        self.beta = sharedX(np.zeros(visible_shape) + self.init_beta,
                            name = 'beta')
def __init__(self, dataset, model, algorithm=None, save_path=None,
             save_freq=0, extensions=None, allow_overwrite=True):
    """
    Construct a Train instance.

    Parameters
    ----------
    dataset : `pylearn2.datasets.dataset.Dataset`
    model : `pylearn2.models.model.Model`
    algorithm : <Optional>
        `pylearn2.training_algorithms.training_algorithm.TrainingAlgorithm`
    save_path : <Optional> str
        Path to save (with pickle / joblib) the model. When omitted and
        `save_freq > 0`, the path is derived from the environment variables
        PYLEARN2_TRAIN_FILE_FULL_STEM and, if set, PYLEARN2_TRAIN_PHASE.
    save_freq : <Optional> int
        Frequency of saves, in epochs. A frequency of zero disables
        automatic saving altogether. A frequency of 1 saves every epoch.
        A frequency of 2 saves every other epoch, etc. (default=0, i.e.
        never save). Note: when automatic saving is enabled (e.g.
        save_freq > 0), the model is always saved after learning, even
        when the final epoch is not a multiple of `save_freq`.
    extensions : <Optional> iterable
        A collection of `TrainExtension` objects whose callbacks are
        triggered at various points in learning.
    allow_overwrite : <Optional> bool
        If `True`, will save the model to save_path even if there is
        already something there. Otherwise, will raise an error if the
        `save_path` is already occupied.
    """
    self.allow_overwrite = allow_overwrite
    self.first_save = True
    self.dataset = dataset
    self.model = model
    self.algorithm = algorithm
    if save_path is not None:
        if save_freq == 0:
            warnings.warn('save_path specified but save_freq is 0 '
                          '(never save). Is this intentional?')
        self.save_path = save_path
    else:
        if save_freq > 0:
            phase_variable = 'PYLEARN2_TRAIN_PHASE'
            if phase_variable in os.environ:
                # Environment values are strings; '%d' needs a number, so
                # convert first (formatting the raw string raises TypeError).
                phase = 'phase%d' % int(os.environ[phase_variable])
                tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                          phase, 'pkl']
            else:
                tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
            self.save_path = '.'.join(tokens)
    self.save_freq = save_freq

    # Record which dataset the model was trained on, when known.
    if hasattr(self.dataset, 'yaml_src'):
        self.model.dataset_yaml_src = self.dataset.yaml_src
    else:
        warnings.warn("dataset has no yaml src, model won't know what " +
                      "data it was trained on")

    self.extensions = extensions if extensions is not None else []
    self.training_seconds = sharedX(value=0,
                                    name='training_seconds_this_epoch')
    self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
def set_input_space(self, space): """ Note: this function will reset the parameters! """ self.input_space = space if not isinstance(space, Conv2DSpace): raise BadInputSpaceError(self.__class__.__name__ + ".set_input_space " "expected a Conv2DSpace, got " + str(space) + " of type " + str(type(space))) rng = self.get_mlp().rng if self.pad != (0,0): output_shape = \ [int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1 for i_sh, k_sh, k_st, k_pad in izip(self.input_space.shape, self.kernel_shape, self.kernel_stride, self.pad)] elif self.border_mode == 'valid': output_shape = [(self.input_space.shape[0] - self.kernel_shape[0]) / self.kernel_stride[0] + 1, (self.input_space.shape[1] - self.kernel_shape[1]) / self.kernel_stride[1] + 1] elif self.border_mode == 'full': output_shape = [(self.input_space.shape[0] + self.kernel_shape[0]) / self.kernel_stride[0] - 1, (self.input_space.shape[1] + self.kernel_shape[1]) / self.kernel_stride[1] - 1] print "In:", self.layer_name, self.input_space.shape, self.kernel_shape, self.kernel_stride, self.pad print "Out:", self.layer_name, output_shape self.detector_space = Conv2DSpace(shape=output_shape, num_channels=self.output_channels, axes=('b', 'c', 0, 1)) self.initialize_transformer(rng) W, = self.transformer.get_params() W.name = self.layer_name + '_W' assert self.tied_b if self.tied_b: self.b = sharedX(np.zeros((self.detector_space.num_channels)) + self.init_bias) else: self.b = sharedX(self.detector_space.get_origin() + self.init_bias) self.b.name = self.layer_name + '_b' logger.info('Input shape: {0}'.format(self.input_space.shape)) logger.info('Detector space: {0}'.format(self.detector_space.shape)) self.initialize_output_space()
def set_input_space(self, space):
    """
    Bind the layer to its input space and create the parameters
    U, Ui, V and Q.

    Parameters
    ----------
    space : Space
        Input space; anything else raises TypeError. Non-vector spaces are
        flagged through `self.needs_reformat`.
    """
    self.input_space = space

    if not isinstance(space, Space):
        raise TypeError("Expected Space, got " +
                        str(space) + " of type " + str(type(space)))

    n_in = space.get_total_dimension()
    self.input_dim = n_in
    self.needs_reformat = not isinstance(space, VectorSpace)

    self.desired_space = VectorSpace(n_in)
    if not self.needs_reformat:
        assert self.desired_space == self.input_space

    # Not used below; the lookup is kept so a missing mlp/rng still fails
    # at this point.
    rng = self.mlp.rng

    self._params = []

    def named(suffix):
        return self.layer_name + suffix

    # V: zeros (n_classes, input_dim); U, Ui: identity; Q: zeros square.
    self.V = sharedX(np.zeros((self.n_classes, n_in), dtype=np.float32),
                     named("_V"))
    self.U = sharedX(np.identity(n_in), named("_U"))
    self.Q = sharedX(np.zeros((n_in, n_in), dtype=np.float32),
                     named("_Q"))
    self.Ui = sharedX(np.identity(n_in, dtype=np.float32), named("_Ui"))

    self._params = [self.U, self.Ui, self.V, self.Q]
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build momentum updates in which each gradient is divided by a running
    estimate of its standard deviation.

    Parameters
    ----------
    learning_rate : float or symbolic scalar
        Step size multiplied into the normalized gradient.
    grads : dict
        Maps each parameter (a shared variable) to its gradient expression.
    lr_scalers : dict, optional
        Accepted for interface compatibility; this rule does not read it.

    Returns
    -------
    updates : OrderedDict
        New values for the running mean of the gradient, the running mean
        of its square, the increment and the parameter, per key of `grads`.
    """
    updates = OrderedDict()
    for param, grad in grads.items():
        # Per-parameter state, all starting at zero.
        step = sharedX(param.get_value() * 0.)
        mean_g = sharedX(np.zeros_like(param.get_value()))
        mean_g_sq = sharedX(np.zeros_like(param.get_value()))

        if param.name is not None:
            mean_g.name = 'avg_grad_' + param.name
            mean_g_sq.name = 'avg_grad_sqr_' + param.name

        # Exponential moving averages of g and g**2.
        new_mean_g = (self.averaging_coeff * mean_g
                      + (1 - self.averaging_coeff) * grad)
        new_mean_g_sq = (self.averaging_coeff * mean_g_sq
                         + (1 - self.averaging_coeff) * grad**2)

        # E[g^2] - E[g]^2 estimates the variance; the stabilizer keeps the
        # square root away from zero.
        spread = T.sqrt(new_mean_g_sq - new_mean_g**2 + self.stabilizer)
        scaled_grad = grad / spread

        new_step = self.momentum * step - learning_rate * scaled_grad

        updates[mean_g] = new_mean_g
        updates[mean_g_sq] = new_mean_g_sq
        updates[step] = new_step
        updates[param] = param + new_step
    return updates
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build RMSProp-with-momentum update expressions for every parameter.

    Parameters
    ----------
    learning_rate : float or symbolic scalar
        Step size multiplied into the normalized gradient.
    grads : dict
        Maps each parameter (a shared variable) to its gradient expression.
    lr_scalers : dict, optional
        Accepted for interface compatibility; this rule does not read it.

    Returns
    -------
    updates : OrderedDict
        New values for the running squared-gradient average, the momentum
        accumulator and the parameter itself, for each key of `grads`.
    """
    updates = OrderedDict()
    for param in grads:
        grad = grads[param]

        # Per-parameter state, both initialised to zeros.
        avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
        momentum = sharedX(np.zeros_like(param.get_value()))

        if param.name is not None:
            avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

        # Exponential moving average of the squared gradient.
        new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr \
            + (1 - self.averaging_coeff) * grad**2

        normalized_grad = grad / T.sqrt(new_avg_grad_sqr + self.stabilizer)

        # The previous step must be decayed by the momentum coefficient.
        # Using the bare coefficient here (self.momentum - ...) ignored the
        # accumulator and added a constant to every parameter at each step.
        new_momentum = self.momentum * momentum \
            - learning_rate * normalized_grad

        updates[avg_grad_sqr] = new_avg_grad_sqr
        updates[momentum] = new_momentum
        updates[param] = param + new_momentum
    return updates
def set_input_space(self, space):
    """
    Bind the layer to its input space and create W, b and X.

    Parameters
    ----------
    space : Space
        Input space. A VectorSpace is used directly; any other space is
        flagged for reformatting to a VectorSpace of its total dimension.
    """
    self.input_space = space

    is_vector = isinstance(space, VectorSpace)
    self.requires_reformat = not is_vector
    if is_vector:
        self.input_dim = space.dim
    else:
        self.input_dim = space.get_total_dimension()
        self.desired_space = VectorSpace(self.input_dim)

    # The output has `dim` units when fprop returns the code, and
    # `input_dim` units otherwise.
    out_dim = self.dim if self.fprop_code==True else self.input_dim
    self.output_space = VectorSpace(out_dim)

    rng = self.mlp.rng

    # W is drawn as (input_dim, dim) and stored transposed.
    W_init = rng.randn(self.input_dim, self.dim)
    self.W = sharedX(W_init.T, self.layer_name + '_W')
    self.transformer = MatrixMul(self.W)
    self.W, = self.transformer.get_params()

    self.b = sharedX(np.zeros((self.input_dim,)), self.layer_name + '_b')

    # We need both to pass input_dim valid
    X_init = .001 * rng.randn(self.batch_size, self.dim)
    self.X = sharedX(X_init, self.layer_name + '_X')

    self._params = [self.W, self.b, self.X]
    self.state_below = T.zeros((self.batch_size, self.input_dim))
def __init__(self, dim, dim_hid, dim_cond, clamp_sigmoid=False,
             unroll_scan=1):
    """
    Parameters
    ----------
    dim : int
        Number of observed binary variables
    dim_hid : int
        Number of latent binary variables
    dim_cond : int
        Number of conditioning variables
    clamp_sigmoid : bool, optional
        WRITEME. Defaults to `False`.
    unroll_scan : int, optional
        WRITEME. Defaults to 1.
    """
    super(CNADE, self).__init__(dim=dim, dim_hid=dim_hid,
                                clamp_sigmoid=clamp_sigmoid,
                                unroll_scan=unroll_scan)
    self.dim_cond = dim_cond

    # Conditioning weights for the visible biases: (dim_cond, dim).
    visible_cond_init = self._initialize_weights(self.dim_cond, self.dim)
    self.U_b = sharedX(visible_cond_init, 'U_b')

    # Conditioning weights for the hidden biases: (dim_cond, dim_hid).
    hidden_cond_init = self._initialize_weights(self.dim_cond,
                                                self.dim_hid)
    self.U_c = sharedX(hidden_cond_init, 'U_c')
def __init__(self, dim, dim_hid, clamp_sigmoid=False, unroll_scan=1):
    """
    Parameters
    ----------
    dim : int
        Number of observed binary variables
    dim_hid : int
        Number of latent binary variables
    clamp_sigmoid : bool, optional
        WRITEME. Defaults to `False`.
    unroll_scan : int, optional
        WRITEME. Defaults to 1.
    """
    super(NADEBase, self).__init__()

    self.dim = dim
    self.dim_hid = dim_hid
    self.clamp_sigmoid = clamp_sigmoid
    self.unroll_scan = unroll_scan

    self.input_space = VectorSpace(dim=self.dim)

    # Biases start at zero: b for the visible units, c for the hidden ones.
    self.b = sharedX(numpy.zeros(self.dim), 'b')
    self.c = sharedX(numpy.zeros(self.dim_hid), 'c')

    # Encoder weights (dim, dim_hid), then decoder weights (dim_hid, dim);
    # the order of the two initialisation calls is kept as-is.
    encoder_init = self._initialize_weights(self.dim, self.dim_hid)
    self.W = sharedX(encoder_init, 'W')
    decoder_init = self._initialize_weights(self.dim_hid, self.dim)
    self.V = sharedX(decoder_init, 'V')
def __init__(self, n_vis_units, n_hidden_units):
    """
    Create the weight matrix and the two bias vectors of the model.

    Parameters
    ----------
    n_vis_units : int
        Number of visible units; also the dimension of the input space.
    n_hidden_units : int
        Number of hidden units.
    """
    Model.__init__(self)

    # Weights are drawn uniformly from [0, 1) using the global numpy RNG.
    weight_init = np.random.uniform(size=(n_vis_units, n_hidden_units))
    self._W = sharedX(weight_init, 'W')

    # Both biases start at zero.
    hidden_bias_init = np.zeros(n_hidden_units)
    self._b = sharedX(hidden_bias_init, 'b')
    visible_bias_init = np.zeros(n_vis_units)
    self._b_reconstruction = sharedX(visible_bias_init, 'b_reconstruction')

    self.input_space = VectorSpace(dim=n_vis_units)
def get_fixed_var_descr(self, model, X, Y):
    """
    Build a FixedVarDescr whose shared drop masks are refreshed each time a
    batch is loaded.

    Parameters
    ----------
    model : object
        Must provide `batch_size`, `get_input_space()`, `hidden_layers`
        and `inference_procedure`.
    X : symbolic variable
        Input batch; passed to `self.mask_gen` and used as a function input.
    Y : symbolic variable
        Target batch; must not be None.

    Returns
    -------
    rval : FixedVarDescr
        `fixed_vars` holds 'drop_mask' (and 'drop_mask_Y' when
        `self.supervised`); `on_load_batch` holds one compiled function
        applying all mask updates.
    """
    assert Y is not None
    batch_size = model.batch_size
    # Shared storage for the input mask, shaped like an origin batch of the
    # model's input space.
    drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
    drop_mask_X.name = 'drop_mask'
    X_space = model.get_input_space()
    updates = OrderedDict()
    rval = FixedVarDescr()
    inputs=[X, Y]
    if not self.supervised:
        update_X = self.mask_gen(X, X_space = X_space)
    else:
        # Supervised case: the mask generator also yields a mask for Y.
        drop_mask_Y = sharedX(np.ones(batch_size,))
        drop_mask_Y.name = 'drop_mask_Y'
        update_X, update_Y = self.mask_gen(X, Y, X_space)
        updates[drop_mask_Y] = update_Y
        rval.fixed_vars['drop_mask_Y'] = drop_mask_Y
    if self.mask_gen.sync_channels:
        # The generated mask has one axis fewer than the stored mask; add a
        # broadcastable trailing axis and expand it to X's shape.
        n = update_X.ndim
        assert n == drop_mask_X.ndim - 1
        update_X.name = 'raw_update_X'
        zeros_like_X = T.zeros_like(X)
        zeros_like_X.name = 'zeros_like_X'
        update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
        update_X.name = 'update_X'
    updates[drop_mask_X] = update_X
    rval.fixed_vars['drop_mask'] = drop_mask_X
    if hasattr(model.inference_procedure, 'V_dropout'):
        # Resample every dropout mask of the inference procedure: a
        # Bernoulli draw divided by its inclusion probability.
        include_prob = model.inference_procedure.include_prob
        include_prob_V = model.inference_procedure.include_prob_V
        include_prob_Y = model.inference_procedure.include_prob_Y
        theano_rng = MRG_RandomStreams(2012+11+20)
        for elem in flatten([model.inference_procedure.V_dropout]):
            updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
        # When the last hidden layer's type name contains "Softmax", its
        # mask is split off and drawn with include_prob_Y instead.
        if "Softmax" in str(type(model.hidden_layers[-1])):
            hid = model.inference_procedure.H_dropout[:-1]
            y = model.inference_procedure.H_dropout[-1]
            updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
        else:
            hid = model.inference_procedure.H_dropout
        for elem in flatten(hid):
            updates[elem] = theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob
    rval.on_load_batch = [utils.function(inputs, updates=updates)]
    return rval
def __init__(self, scale_grads=1, target_scale=.1,
             discriminator_default_input_include_prob = 1.,
             discriminator_input_include_probs=None,
             discriminator_default_input_scale=1.,
             discriminator_input_scales=None,
             generator_default_input_include_prob = 1.,
             generator_default_input_scale=1.,
             inference_default_input_include_prob=None,
             inference_input_include_probs=None,
             inference_default_input_scale=1.,
             inference_input_scales=None,
             init_now_train_generator=True,
             ever_train_discriminator=True,
             ever_train_generator=True,
             ever_train_inference=True,
             no_drop_in_d_for_g=False,
             alternate_g = False,
             infer_layer=None,
             noise_both = 0.,
             g_eps = 0.,
             d_eps =0.):
    """
    Store every constructor argument as an attribute of the same name and
    create the shared on/off switches for the three training parts.
    """
    # Must remain the first statement: at this point locals() contains only
    # the constructor arguments (plus `self`, removed right after).
    self.__dict__.update(locals())
    del self.self

    # These allow you to dynamically switch off training parts.
    # If the corresponding ever_train_* is False, these have
    # no effect.
    self.now_train_generator = sharedX(init_now_train_generator)
    for switch_name in ('now_train_discriminator', 'now_train_inference'):
        setattr(self, switch_name,
                sharedX(numpy.array(1., dtype='float32')))
def __init__(self, nvis, nhid, num_S=0, init_W=None):
    """
    Create the parameters W, S and theta and the input/output spaces.

    Parameters
    ----------
    nvis : int
        Dimension of the input space; second axis of W.
    nhid : int
        Dimension of the output space; first axis of W, both axes of S and
        length of theta.
    num_S : int
        Must be 0 or 1. S is listed among the trainable parameters only
        when this is greater than 0.
    init_W : str, optional
        Path of a pickled object whose `W` shared variable supplies the
        initial weights. When falsy, W is drawn uniformly from
        [-1e-3, 1e-3].
    """
    super(CMModel, self).__init__()
    self.nvis = nvis
    self.nhid = nhid
    self.num_S = num_S
    assert num_S in {0, 1}, "Currently only num_S == 0 or num_S == 1 is supported!"

    if init_W:
        # NOTE: unpickling executes arbitrary code from the file; only load
        # checkpoints from trusted sources.
        # The context manager closes the file even if loading fails.
        with open(init_W, "rb") as pickled_model:
            model = pickle.load(pickled_model)
        W = model.W.get_value()
        self.W = sharedX(W)
    else:
        self.W = sharedX(np.random.uniform(-1e-3, 1e-3, (nhid, nvis)))

    # S is always created, even when it is not trained.
    self.S = sharedX(np.random.uniform(-1e-3, 1e-3, (nhid, nhid)))
    self.theta = sharedX(np.zeros(nhid))

    if self.num_S > 0:
        self._params = [self.W, self.S, self.theta]
    else:
        self._params = [self.W, self.theta]

    self.input_space = VectorSpace(dim=nvis)
    self.output_space = VectorSpace(dim=nhid)
def __init__(self, dataset, model, algorithm=None, save_path=None,
             save_freq=0, extensions=None, allow_overwrite=True):
    """
    Construct a Train instance.

    Parameters
    ----------
    dataset : object
        The dataset to train on. If it has a `yaml_src` attribute, that is
        copied onto the model as `dataset_yaml_src`.
    model : object
        The model to train.
    algorithm : object, optional
        The training algorithm; stored as given.
    save_path : str, optional
        Where to save the model; passed through `preprocess`. When omitted
        and `save_freq > 0`, the path is derived from the environment
        variables PYLEARN2_TRAIN_FILE_FULL_STEM and, if set,
        PYLEARN2_TRAIN_PHASE.
    save_freq : int, optional
        Save frequency; 0 is reported in the warning below as "never save".
    extensions : iterable, optional
        Stored as given; defaults to an empty list.
    allow_overwrite : bool, optional
        Stored as given.
    """
    self.allow_overwrite = allow_overwrite
    self.first_save = True
    self.dataset = dataset
    self.model = model
    self.algorithm = algorithm
    if save_path is not None:
        if save_freq == 0:
            warnings.warn('save_path specified but save_freq is 0 '
                          '(never save). Is this intentional?')
        self.save_path = preprocess(save_path)
    else:
        if save_freq > 0:
            phase_variable = 'PYLEARN2_TRAIN_PHASE'
            if phase_variable in os.environ:
                # Environment values are strings; '%d' needs a number, so
                # convert first (formatting the raw string raises TypeError).
                phase = 'phase%d' % int(os.environ[phase_variable])
                tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                          phase, 'pkl']
            else:
                tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
            self.save_path = '.'.join(tokens)
    self.save_freq = save_freq

    # Record which dataset the model was trained on, when known.
    if hasattr(self.dataset, 'yaml_src'):
        self.model.dataset_yaml_src = self.dataset.yaml_src
    else:
        warnings.warn("dataset has no yaml src, model won't know what " +
                      "data it was trained on")

    self.extensions = extensions if extensions is not None else []
    self.training_seconds = sharedX(value=0,
                                    name='training_seconds_this_epoch')
    self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
def run():
    """
    Compile a small Theano function under RecordMode, run it while
    recording to 'nondeterminism_6.txt', then run it again in replay mode
    against that record.

    `disturb_mem.disturb_mem()` is called between the steps; the statement
    order is part of the experiment and should not be rearranged.
    """
    disturb_mem.disturb_mem()
    b = sharedX(np.zeros((2,)))
    channels = OrderedDict()
    disturb_mem.disturb_mem()
    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min
    # One shared scalar per monitored expression, each paired with its update.
    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
            ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_'+str(i))
        updates.append((s, val))
    # Clear the name of every ancestor of the update expressions unless the
    # name starts with 's' and is exactly two characters long.
    for var in theano.gof.graph.ancestors(update for var, update in updates):
        if var.name is not None:
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None
    # `channels` is empty here, so this loop adds nothing.
    for key in channels:
        updates.append((s, channels[key]))
    file_path='nondeterminism_6.txt'
    # replay=0: first pass writes the record.
    mode = RecordMode(file_path=file_path, replay=0)
    f = theano.function([], mode=mode, updates=updates, on_unused_input='ignore', name='f')
    """
    print 'type(f): ',type(f)
    print 'elements of f:'
    for elem in dir(f):
        print '\t',elem
    print 'type(f.fn): ',type(f.fn)
    print 'elements of f.fn:'
    for elem in dir(f.fn):
        print '\t',elem
    """
    trials = 1
    for i in xrange(trials):
        disturb_mem.disturb_mem()
        f()
    # Finish writing the record before it is reopened for replay.
    mode.record.f.flush()
    mode.record.f.close()
    # replay=1: second pass is checked against the recorded file.
    mode.set_record(Record(file_path=file_path, replay=1))
    for i in xrange(trials):
        disturb_mem.disturb_mem()
        f()
def __init__(self, super_dbm, feature_niter, post_scale = 0.5,
             input_include_prob = .5, remove_y = False):
    """
    Copy the weights and biases of a three-hidden-layer DBM into shared
    variables owned by this object.

    Parameters
    ----------
    super_dbm : object
        Source model; must expose exactly three `hidden_layers` plus
        `get_input_space()` and `get_output_space()`.
    feature_niter : object
        Stored as an attribute; not otherwise read here.
    post_scale : float
        Every 2-D value (weight matrix) is divided by this before being
        stored as a trainable parameter.
    input_include_prob : float
        Stored as an attribute; not otherwise read here.
    remove_y : bool
        Stored as an attribute; not otherwise read here.
    """
    # Must remain the first statement: at this point locals() contains only
    # the constructor arguments (plus `self`, removed right after).
    self.__dict__.update(locals())
    del self.self

    self.input_space = super_dbm.get_input_space()
    self.output_space = super_dbm.get_output_space()
    self.theano_rng = MRG_RandomStreams(2013+1+27)

    h, g, y = super_dbm.hidden_layers
    # (attribute name, source value) pairs, in the order the parameters
    # are registered.
    named_values = [
        ('vishid', h.get_weights()),
        ('biashid', h.get_biases()),
        ('hidpen', g.get_weights()),
        ('penhid', g.get_weights().T),
        ('biaspen', g.get_biases()),
        ('penlab', y.get_weights()),
        ('labpen', y.get_weights().T),
        ('biaslab', y.get_biases()),
    ]

    self._params = []
    for name, val in named_values:
        # Matrices are rescaled; vectors (biases) are copied unchanged.
        scaled_val = val / post_scale if val.ndim == 2 else val
        param = sharedX(scaled_val)
        setattr(self, name, param)
        self._params.append(param)
        # An unscaled copy is kept under 'feature_<name>'; it is not added
        # to the parameter list.
        fixed = sharedX(val)
        setattr(self, 'feature_' + name, fixed)

    self.hidden_layers = super_dbm.hidden_layers
def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
             non_linearity='sigmoid', use_ground_truth=True):
    """
    Create the spaces and parameters of the model.

    Parameters
    ----------
    nvis : int
        Dimension of the input space.
    nhid : int
        Dimension of the hidden space.
    hidden_transition_model : object
        Stored as given.
    irange : float
        W and U are drawn uniformly from [-irange, irange].
    non_linearity : str
        Either 'sigmoid' or 'tanh'.
    use_ground_truth : bool
        Stored as given.
    """
    activations = {'sigmoid': T.nnet.sigmoid, 'tanh': T.tanh}

    self.nvis = nvis
    self.nhid = nhid
    self.hidden_transition_model = hidden_transition_model
    self.use_ground_truth = use_ground_truth
    self.alpha = sharedX(1)
    self.alpha_decrease_rate = 0.999

    assert non_linearity in activations
    self.non_linearity = activations[non_linearity]

    # Space initialization
    self.input_space = VectorSpace(dim=self.nvis)
    self.hidden_space = VectorSpace(dim=self.nhid)
    self.output_space = VectorSpace(dim=1)
    self.input_source = 'features'
    self.target_source = 'targets'

    # Features-to-hidden matrix and hidden biases. W is drawn before U so
    # the global numpy RNG is consumed in the same order.
    self.W = sharedX(
        numpy.random.uniform(low=-irange, high=irange,
                             size=(self.nvis, self.nhid)),
        name='W')
    self.b = sharedX(numpy.zeros(self.nhid), name='b')

    # Hidden-to-out matrix and output bias.
    self.U = sharedX(
        numpy.random.uniform(low=-irange, high=irange,
                             size=(self.nhid, 1)),
        name='U')
    self.c = sharedX(numpy.zeros(1), name='c')
def set_input_space(self, space):
    """
    Bind the layer to its input space and create b, W and the mask from
    `self.initializer`.

    Note: this resets parameters!

    Parameters
    ----------
    space : Space
        Input space. A VectorSpace is used directly; any other space is
        flagged for reformatting to a VectorSpace of its total dimension.
    """
    self.input_space = space

    is_vector = isinstance(space, VectorSpace)
    self.requires_reformat = not is_vector
    if is_vector:
        self.input_dim = space.dim
    else:
        self.input_dim = space.get_total_dimension()
        self.desired_space = VectorSpace(self.input_dim)

    # With copy_input set, the output carries `input_dim` extra units.
    out_dim = self.dim + self.copy_input * self.input_dim
    self.output_space = VectorSpace(out_dim)

    rng = self.mlp.rng
    shape = (self.input_dim, self.dim)

    # Biases are requested before weights so the rng is consumed in the
    # same order.
    bias_init = self.initializer.get_biases(rng, shape)
    self.b = sharedX(bias_init, name=self.layer_name + '_b')
    weight_init = self.initializer.get_weights(rng, shape)
    self.W = sharedX(weight_init, name=self.layer_name + '_W')
    mask_init = self.initializer.get_mask()
    self.mask = sharedX(mask_init, name=self.layer_name + '_mask')
def set_input_space(self, space):
    """
    Bind the layer to its input space and create its three inner layers.

    Parameters
    ----------
    space : Space
        Input space. A VectorSpace is used directly; any other space is
        flagged for reformatting to a VectorSpace of its total dimension.
    """
    self.input_space = space

    is_vector = isinstance(space, VectorSpace)
    self.requires_reformat = not is_vector
    if is_vector:
        self.input_dim = space.dim
    else:
        self.input_dim = space.get_total_dimension()
        self.desired_space = VectorSpace(self.input_dim)

    self.output_space = VectorSpace(self.dim)

    # Input/output sizes of inner layers 0, 1 and 2.
    self.input_dims = [self.input_dim, self.input_dim, self.hidden_dim]
    self.output_dims = [self.dim, self.hidden_dim, self.gater_dim]

    # Slots filled in by _init_inner_layer.
    self.W = [None] * 3
    self.b = [None] * 3
    for layer_idx in (0, 1, 2):
        self._init_inner_layer(layer_idx)

    self.stoch_grad = sharedX(0)
    self.kl_grad = sharedX(0)
    self.linear_grad = sharedX(0)
def profile_grad(f): print 'profiling gradient of ',f rng = np.random.RandomState([2012,7,19]) batch_size = 80 rows = 26 cols = 27 channels = 30 pool_rows = 2 pool_cols = 3 zv = rng.randn( batch_size, rows, cols, channels ).astype(config.floatX) #put the inputs + outputs in shared variables so we don't pay GPU transfer during test grad_shared = sharedX(zv) z_shared = sharedX(zv) p_th, h_th = f( z_shared, (pool_rows, pool_cols) ) func = function([],updates = { grad_shared : T.grad(p_th.sum() + h_th.sum(), z_shared)} ) print 'warming up' for i in xrange(10): func() trials = 10 results = [] for i in xrange(trials): t1 = time.time() for j in xrange(10): func() t2 = time.time() print t2 - t1 results.append(t2-t1) print 'final: ',sum(results)/float(trials)
def _init_inner_layer(self, idx):
    """
    Create the weight matrix and bias vector of inner layer `idx`.

    Exactly one of `self.irange[idx]`, `self.istdev[idx]` and
    `self.sparse_init[idx]` is expected to be set; it selects uniform,
    Gaussian or sparse Gaussian initialisation respectively. The results
    are stored in `self.W[idx]` and `self.b[idx]`.
    """
    rng = self.mlp.rng

    if self.irange[idx] is not None:
        assert self.istdev[idx] is None
        assert self.sparse_init[idx] is None
        bound = self.irange[idx]
        W = rng.uniform(-bound, bound,
                        (self.input_dims[idx], self.output_dims[idx]))
    elif self.istdev[idx] is not None:
        assert self.sparse_init[idx] is None
        W = rng.randn(self.input_dims[idx], self.output_dims[idx]) \
            * self.istdev[idx]
    else:
        assert self.sparse_init[idx] is not None
        n_in = self.input_dims[idx]
        n_out = self.output_dims[idx]
        W = np.zeros((n_in, n_out))
        for col in xrange(n_out):
            assert self.sparse_init[idx] <= n_in
            # Give this column sparse_init non-zero entries, redrawing the
            # row whenever the drawn slot is already occupied.
            for _ in xrange(self.sparse_init[idx]):
                row = rng.randint(0, n_in)
                while W[row, col] != 0:
                    row = rng.randint(0, n_in)
                W[row, col] = rng.randn()
        W *= self.sparse_stdev[idx]

    suffix = str(idx)
    W = sharedX(W)
    W.name = self.layer_name + '_W' + suffix

    bias_init = np.zeros((self.output_dims[idx],)) + self.init_bias[idx]
    b = sharedX(bias_init, name = self.layer_name + '_b' + suffix)

    self.W[idx] = W
    self.b[idx] = b
def profile(f): print 'profiling ',f rng = np.random.RandomState([2012,7,19]) batch_size = 128 rows = 30 cols = 30 channels = 16 pool_rows = 3 pool_cols = 3 zv = rng.randn(channels, rows, cols, batch_size).astype(config.floatX) #put the inputs + outputs in shared variables so we don't pay GPU transfer during test p_shared = sharedX(zv[:,0:rows:pool_rows,0:cols:pool_cols,:]) h_shared = sharedX(zv) z_shared = sharedX(zv) p_th, h_th = f( z_shared, (pool_rows, pool_cols) ) func = function([],updates = { p_shared : p_th, h_shared : h_th} ) print 'warming up' for i in xrange(10): func() trials = 10 results = [] for i in xrange(trials): t1 = time.time() for j in xrange(10): func() t2 = time.time() print t2 - t1 results.append(t2-t1) print 'final: ',sum(results)/float(trials)
def __init__(self,W1, b1,W2,b2, mf_iter):
    """
    Wrap the given weights and biases in shared variables.

    Parameters
    ----------
    W1, b1, W2, b2 : array-like
        Initial values stored as the shared variables of the same names.
    mf_iter : object
        Stored as given.
    """
    self.mf_iter = mf_iter

    # Weights first, then biases (same creation order as before).
    for attr_name, value in (('W1', W1), ('W2', W2), ('b1', b1), ('b2', b2)):
        setattr(self, attr_name, sharedX(value))

    # The dataset description is hard-coded to the MNIST training set.
    self.dataset_yaml_src = "!obj:pylearn2.datasets.mnist.MNIST { which_set : train }"
def set_input_space(self, space):
    """
    Bind the layer to its input space and create its weight matrix and,
    when configured, its weight mask.

    Note: this resets parameters!

    Parameters
    ----------
    space : Space
        Input space. A VectorSpace is used directly; any other space is
        flagged for reformatting to a VectorSpace of its total dimension.

    Raises
    ------
    ValueError
        If `detector_layer_dim` is not a multiple of `pool_size`, or if
        `mask_weights` does not have shape (input_dim, detector_layer_dim).
    """
    self.input_space = space

    is_vector = isinstance(space, VectorSpace)
    self.requires_reformat = not is_vector
    if is_vector:
        self.input_dim = space.dim
    else:
        self.input_dim = space.get_total_dimension()
        self.desired_space = VectorSpace(self.input_dim)

    remainder = self.detector_layer_dim % self.pool_size
    if remainder != 0:
        raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" %
                         (self.detector_layer_dim, self.pool_size,
                          self.detector_layer_dim % self.pool_size))

    self.h_space = VectorSpace(self.detector_layer_dim)
    self.pool_layer_dim = self.detector_layer_dim / self.pool_size
    self.output_space = VectorSpace(self.pool_layer_dim)

    rng = self.mlp.rng
    weight_shape = (self.input_dim, self.detector_layer_dim)

    if self.irange is not None:
        assert self.sparse_init is None
        # Uniform weights, each kept with probability include_prob. The
        # values are drawn before the keep-mask.
        dense = rng.uniform(-self.irange, self.irange, weight_shape)
        keep = rng.uniform(0., 1., weight_shape) < self.include_prob
        W = dense * keep
    else:
        assert self.sparse_init is not None
        W = np.zeros(weight_shape)

        def mask_rejects(row, col):
            # A slot is unusable when the weight mask zeroes it out.
            if self.mask_weights is None:
                return False
            return self.mask_weights[row, col] == 0.

        for col in xrange(self.detector_layer_dim):
            assert self.sparse_init <= self.input_dim
            # Give this column sparse_init non-zero entries, redrawing the
            # row while the slot is occupied or masked out.
            for _ in xrange(self.sparse_init):
                row = rng.randint(0, self.input_dim)
                while W[row, col] != 0 or mask_rejects(row, col):
                    row = rng.randint(0, self.input_dim)
                W[row, col] = rng.randn()
        W *= self.sparse_stdev

    W = sharedX(W)
    W.name = self.layer_name + '_W'

    self.transformer = MatrixMul(W)

    W ,= self.transformer.get_params()
    assert W.name is not None

    if self.mask_weights is not None:
        expected_shape = (self.input_dim, self.detector_layer_dim)
        if expected_shape != self.mask_weights.shape:
            raise ValueError("Expected mask with shape "+str(expected_shape)+" but got "+str(self.mask_weights.shape))
        self.mask = sharedX(self.mask_weights)
def __init__(self):
    """
    Create `num_chunks` pairs of random parameters and the model's spaces.

    Relies on the enclosing scope for `rng`, `num_features`, `chunk_width`
    and `num_chunks`.
    """
    # One (num_features, chunk_width) matrix per chunk.
    first_layer = []
    for _ in xrange(num_chunks):
        first_layer.append(sharedX(rng.randn(num_features, chunk_width)))
    self.W1 = first_layer

    disturb_mem.disturb_mem()

    # One length-chunk_width vector per chunk.
    second_layer = []
    for _ in xrange(num_chunks):
        second_layer.append(sharedX(rng.randn(chunk_width)))
    self.W2 = second_layer

    self._params = safe_union(self.W1, self.W2)
    self.input_space = VectorSpace(num_features)
    self.output_space = VectorSpace(1)
def initialize_parameters(self, nhid):
    """
    Create the zero-initialised prior parameters.

    Parameters
    ----------
    nhid : int
        Length of both parameter vectors; also stored as `self.nhid`.
    """
    self.nhid = nhid

    mu_init = numpy.zeros(self.nhid)
    self.prior_mu = sharedX(mu_init, name="prior_mu")

    # The attribute is `log_prior_sigma`; the variable's own name is
    # "prior_log_sigma".
    log_sigma_init = numpy.zeros(self.nhid)
    self.log_prior_sigma = sharedX(log_sigma_init, name="prior_log_sigma")

    self._params = [self.prior_mu, self.log_prior_sigma]
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None,
             set_batch_size=False, train_iteration_mode=None,
             batches_per_iter=None, theano_function_mode=None,
             monitoring_costs=None, seed=[2012, 10, 5]):
    """
    Parameters
    ----------
    learning_rate : float
        The learning rate to use. Train object callbacks can change it
        after each epoch; SGD update_callbacks can change it after each
        minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        The objective function to minimize. If None, SGD asks the model
        for its default cost via `get_default_cost`.
    batch_size : int, optional
        Size of each minibatch. If omitted, the model is asked for the
        batch size, so it must have been specified there. (Some models
        only work with one batch size.)
    monitoring_batches : int, optional
        Number of batches to draw from the iterator of each monitoring
        dataset when monitoring runs at the start of an epoch.
        Unnecessary when not monitoring, or when
        `monitor_iteration_mode` is 'sequential' and `batch_size` is
        given (the count is then derived from the dataset size).
        TODO: allow a different value per monitoring dataset; the
        Monitor itself already supports this.
    monitoring_dataset : Dataset or dict, optional
        If None, no monitoring is done. A single Dataset is monitored
        directly. A dict maps string dataset names to Datasets; every
        channel is computed for every dataset and is prefixed with the
        dataset name and an underscore.
    monitor_iteration_mode : str, optional
        Iteration mode used over all monitoring datasets. Defaults to
        'sequential'.
        TODO: allow a different mode per dataset.
    termination_criterion : TerminationCriterion, optional
        An instance of
        pylearn2.termination_criteria.TerminationCriterion deciding when
        to stop. If None, training runs until something external halts
        the process.
    update_callbacks : list, optional
        Callables taking this SGD instance as their only argument. All
        of them are called after each SGD step.
    learning_rule : training_algorithms.learning_rule.LearningRule
        Computes new parameter values from the old ones and the
        first-order gradients. If None, the plain SGD rule is used:
            param := param - learning_rate * d cost / d param
        Use this for more elaborate rules such as SGD with momentum.
    init_momentum : float, **DEPRECATED**
        Use `learning_rule` instead. If falsy, no momentum is used;
        otherwise a Momentum learning rule is created with this initial
        coefficient (and any `learning_rule` argument is ignored).
        Callbacks can change the coefficient over time. With a constant
        gradient, the step taken is scaled by 1/(1-momentum). See
        section 9 of Geoffrey Hinton's "A Practical Guide to Training
        Restricted Boltzmann Machines".
    set_batch_size : bool, optional
        Defaults to False. If True and `batch_size` conflicts with
        model.force_batch_size, model.set_batch_size(batch_size) is
        called to try to change model.force_batch_size.
    train_iteration_mode : str, optional
        Iteration mode over the training examples. Defaults to
        'shuffled_sequential'.
    batches_per_iter : int, optional
        Number of batches to draw from the training iterator.
        Unnecessary for the 'sequential' and 'shuffled_sequential'
        modes, which iterate over all examples when it is omitted.
    theano_function_mode : optional
        Any valid `mode` argument of theano.function, used to compile
        the updates function. pylearn2 ships extra wraplinker modes (see
        pylearn2.devtools) that can, e.g., check for NaNs at every step
        or record md5 digests of every computation to track down
        nondeterminism.
    monitoring_costs : list, optional
        Cost instances whose channels the Monitor should also report,
        even though they are not used for training.
    seed : optional
        Any valid argument to np.random.RandomState. Seeds the random
        number generator handed to the training dataset iterator (if
        any).
    """
    if isinstance(cost, (list, tuple, set)):
        raise TypeError("SGD no longer supports using collections of "
                        "Costs to represent a sum of Costs. Use "
                        "pylearn2.costs.cost.SumOfCosts instead.")

    if not init_momentum:
        self.learning_rule = learning_rule
    else:
        warnings.warn(
            "init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
        )
        # Translate the deprecated argument into the new interface.
        self.learning_rule = Momentum(init_momentum)

    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter

    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches "
                         "but not a monitoring dataset.")

    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)

    self.train_iteration_mode = (
        'shuffled_sequential' if train_iteration_mode is None
        else train_iteration_mode)
    self.first = True
    self.rng = make_np_rng(seed, which_method=["randn", "randint"])
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def _initialize_hidbias(self):
    """Create `hidbias`, a shared zero vector with `nhid` entries named 'hb'."""
    zero_bias = numpy.zeros(self.nhid)
    self.hidbias = sharedX(zero_bias, name='hb', borrow=True)
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batch_size=None, monitoring_batches=None,
             monitoring_dataset=None, monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             learning_rule=None, init_momentum=None,
             set_batch_size=False, train_iteration_mode=None,
             batches_per_iter=None, theano_function_mode=None,
             monitoring_costs=None, seed=[2012, 10, 5]):
    """
    Store the SGD configuration on the instance.

    `learning_rate` is wrapped in a shared variable. A truthy
    `init_momentum` (deprecated) overrides `learning_rule` with a
    `Momentum` rule and emits a warning. `train_iteration_mode` defaults
    to 'shuffled_sequential'. `seed` is passed to `make_np_rng`.

    Raises
    ------
    TypeError
        If `cost` is a list, tuple or set.
    ValueError
        If `monitoring_batch_size` or `monitoring_batches` is given
        without a `monitoring_dataset`.
    """
    if isinstance(cost, (list, tuple, set)):
        raise TypeError("SGD no longer supports using collections of "
                        "Costs to represent a sum of Costs. Use "
                        "pylearn2.costs.cost.SumOfCosts instead.")

    if not init_momentum:
        self.learning_rule = learning_rule
    else:
        warnings.warn(
            "init_momentum interface is deprecated and will "
            "become officially unsuported as of May 9, 2014. Please use the "
            "`learning_rule` parameter instead, providing an object of type "
            "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
        )
        # Translate the deprecated argument into the new interface.
        self.learning_rule = Momentum(init_momentum)

    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter

    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batch_size = monitoring_batch_size
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode

    # Monitoring options make no sense without something to monitor.
    no_dataset = monitoring_dataset is None
    if no_dataset and monitoring_batch_size is not None:
        raise ValueError("Specified a monitoring batch size "
                         "but not a monitoring dataset.")
    if no_dataset and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches "
                         "but not a monitoring dataset.")

    self.termination_criterion = termination_criterion
    self._register_update_callbacks(update_callbacks)

    self.train_iteration_mode = (
        'shuffled_sequential' if train_iteration_mode is None
        else train_iteration_mode)
    self.first = True
    self.rng = make_np_rng(seed, which_method=["randn", "randint"])
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def setup_detector_layer_c01b(layer, input_space, rng,
                              irange="not specified"):
    """
    .. todo::

        WRITEME properly

    Takes steps to set up an object for use as being some kind of
    convolutional layer. This function sets up only the detector layer.

    Does the following:

    * raises a RuntimeError if cuda is not available
    * sets layer.input_space to input_space
    * sets up addition of dummy channels for compatibility with
      cuda-convnet:

      - layer.dummy_channels: # of dummy channels that need to be added
        (You might want to check this and raise an Exception if it's
        not 0)
      - layer.dummy_space: The Conv2DSpace representing the input with
        dummy channels added

    * sets layer.detector_space to the space for the detector layer
    * sets layer.transformer to be a Conv2D instance
    * sets layer.b to the right value

    Parameters
    ----------
    layer : object
        Any python object that allows the modifications described above
        and has the following attributes:

        * pad: int describing amount of zero padding to add
        * kernel_shape: 2-element list describing spatial shape of the
          kernel (it is modified in place when fix_kernel_shape applies)
        * fix_kernel_shape: bool, if true, will shrink the kernel shape
          to make it feasible, as needed (useful for hyperparameter
          searchers)
        * detector_channels: The number of channels in the detector layer
        * init_bias: numeric constant added to a tensor of zeros to
          initialize the bias
        * tied_b: If true, biases are shared across all spatial locations

        Optional attributes: kernel_stride (default [1, 1]), partial_sum
        (default 1) and sparse_init.
    input_space : Conv2DSpace
        A Conv2DSpace to be used as input to the layer
    rng : WRITEME
        A numpy RandomState or equivalent
    irange : str
        Disabled. Passing anything but the default raises AssertionError;
        layer.irange is used instead.

    Raises
    ------
    AssertionError
        If `irange` is passed.
    TypeError
        If `input_space` is not a Conv2DSpace.
    ValueError
        If the layer has no `detector_channels`, has fewer than 16 of
        them, has a non-positive kernel size, or has a kernel too big for
        the padded input while `fix_kernel_shape` is false.
    """
    if irange != "not specified":
        raise AssertionError(
            "There was a bug in setup_detector_layer_c01b."
            "It uses layer.irange instead of the irange parameter to the "
            "function. The irange parameter is now disabled by this "
            "AssertionError, so that this error message can alert you that "
            "the bug affected your code and explain why the interface is "
            "changing. The irange parameter to the function and this "
            "error message may be removed after April 21, 2014."
        )

    # Use "self" to refer to layer from now on, so we can pretend we're
    # just running in the set_input_space method of the layer
    self = layer

    # Make sure cuda is available
    check_cuda(str(type(self)))

    # Validate input. The message reports the argument itself:
    # self.input_space has not been assigned yet at this point.
    if not isinstance(input_space, Conv2DSpace):
        raise TypeError("The input to a convolutional layer should be a "
                        "Conv2DSpace, but layer " + self.layer_name +
                        " got " + str(type(input_space)))

    if not hasattr(self, 'detector_channels'):
        raise ValueError("layer argument must have a 'detector_channels' "
                         "attribute specifying how many channels to put in "
                         "the convolution kernel stack.")

    # Store the input space
    self.input_space = input_space

    # Make sure number of channels is supported by cuda-convnet
    # (multiple of 4 or <= 3)
    # If not supported, pad the input with dummy channels
    ch = self.input_space.num_channels
    rem = ch % 4
    if ch > 3 and rem != 0:
        self.dummy_channels = 4 - rem
    else:
        self.dummy_channels = 0
    self.dummy_space = Conv2DSpace(
        shape=input_space.shape,
        channels=input_space.num_channels + self.dummy_channels,
        axes=('c', 0, 1, 'b')
    )

    if hasattr(self, 'kernel_stride'):
        kernel_stride = self.kernel_stride
    else:
        kernel_stride = [1, 1]

    output_shape = [
        int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1
        for i_sh, k_sh, k_st in zip(self.input_space.shape,
                                    self.kernel_shape,
                                    kernel_stride)
    ]

    # Validate (and optionally shrink) the kernel on both spatial axes.
    # An explicit loop guarantees the checks run; a bare map() call is
    # never evaluated where map is lazy.
    for idx in (0, 1):
        if self.kernel_shape[idx] < 1:
            raise ValueError("kernel must have strictly positive size on all "
                             "axes but has shape: " + str(self.kernel_shape))
        if output_shape[idx] <= 0:
            if self.fix_kernel_shape:
                self.kernel_shape[idx] = \
                    self.input_space.shape[idx] + 2 * self.pad
                assert self.kernel_shape[idx] != 0
                output_shape[idx] = 1
                warnings.warn("Had to change the kernel shape to make "
                              "network feasible")
            else:
                raise ValueError("kernel too big for input "
                                 "(even with zero padding)")

    if self.detector_channels < 16:
        raise ValueError("Cuda-convnet requires the detector layer to have "
                         "at least 16 channels.")

    self.detector_space = Conv2DSpace(shape=output_shape,
                                      num_channels=self.detector_channels,
                                      axes=('c', 0, 1, 'b'))

    if hasattr(self, 'partial_sum'):
        partial_sum = self.partial_sum
    else:
        partial_sum = 1

    if hasattr(self, 'sparse_init') and self.sparse_init is not None:
        self.transformer = \
            checked_call(make_sparse_random_conv2D,
                         OrderedDict([('num_nonzero', self.sparse_init),
                                      ('input_space', self.input_space),
                                      ('output_space', self.detector_space),
                                      ('kernel_shape', self.kernel_shape),
                                      ('pad', self.pad),
                                      ('partial_sum', partial_sum),
                                      ('kernel_stride', kernel_stride),
                                      ('rng', rng)]))
    else:
        self.transformer = make_random_conv2D(
            irange=self.irange,
            input_axes=self.input_space.axes,
            output_axes=self.detector_space.axes,
            input_channels=self.dummy_space.num_channels,
            output_channels=self.detector_space.num_channels,
            kernel_shape=self.kernel_shape,
            pad=self.pad,
            partial_sum=partial_sum,
            kernel_stride=kernel_stride,
            rng=rng
        )

    W, = self.transformer.get_params()
    W.name = self.layer_name + '_W'

    # Tied biases: one per channel. Untied: one per detector unit.
    if self.tied_b:
        self.b = sharedX(np.zeros(self.detector_space.num_channels) +
                         self.init_bias)
    else:
        self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
    self.b.name = self.layer_name + '_b'

    logger.info('Input shape: {0}'.format(self.input_space.shape))
    logger.info('Detector space: {0}'.format(self.detector_space.shape))
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    Compile the functions used for batch gradient descent on `objective`.

    Every argument is first stored as an attribute of the same name
    (via `locals()`); some attributes are then overwritten below.

    Parameters
    ----------
    objective : symbolic expression
        The quantity whose gradient with respect to each parameter is
        taken with `grad` and which is compiled into `self.obj`.
    params : iterable of shared variables
        The parameters to update; `get_value` is called on each.
    inputs : list, optional
        Inputs of the compiled functions. Defaults to an empty list.
    param_constrainers : list or tuple of callables, optional
        Each is called with the dictionary of parameter updates before
        `_goto_alpha` is compiled.
    max_iter : int
        Only stored on the instance here.
    lr_scalers : dict, optional
        Maps a parameter to a multiplier applied to the step size for
        that parameter.
    verbose : int or bool
        If truthy, progress messages are printed during compilation.
    tol : float, optional
        Stored as `self.tol`. If None, 1e-6 is used when
        `objective.dtype` is "float32" and 3e-7 otherwise.
    init_alpha : sequence of float, optional
        Stored as a tuple of floats in `self.init_alpha`. Defaults to
        (.001, .005, .01, .05, .1), or to (.5, 1.) when
        `line_search_mode` is 'exhaustive'.
    min_init_alpha, reset_alpha, reset_conjugate
        Only stored on the instance here.
    conjugate : bool
        If true, also compile `_store_old_grad` and `_make_conjugate`.
    gradients : dict, optional
        Maps a parameter to a gradient expression used instead of
        `grad(objective, param)`.
    gradient_updates : dict, optional
        Extra updates merged into the gradient-computing function.
    line_search_mode : str, optional
        Must be None or 'exhaustive'.
    accumulate : bool
        If true, `Accumulator` objects are built in place of the
        compiled gradient and objective functions.
    theano_function_mode : optional
        Passed as `mode` to every `function` call.
    """

    # Store every argument as an attribute; drop the "self" entry.
    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)

    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    # These defaults only affect the locals; self.inputs and
    # self.param_constrainers keep the values captured above.
    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    # For each parameter, allocate a shared variable of the same shape
    # that the compiled gradient function writes the gradient into.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        print 'batch gradient class compiling gradient function'
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        print 'done. Took ', t2 - t1

    if self.verbose:
        print 'batch gradient class compiling objective function'
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        print 'done'

    # _cache_values copies every parameter into its cache;
    # _goto_alpha sets every parameter to cache - alpha * gradient.
    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    # Constrainers are called with the update dictionary before it is
    # compiled.
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function(
        [alpha],
        updates=goto_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._goto_alpha')

    # L2 norm of all stored gradients taken together.
    norm = T.sqrt(sum([T.sqr(elem).sum() for elem in
                       self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
        1. - self.new_weight) * self.ave_grad_size

    # Divides the stored gradients by their norm and returns the norm.
    self._normalize_grad = function(
        [],
        norm,
        updates=normalize_grad_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                             'old_' + elem.name)

        # Stores gradient * norm, where norm is this function's input.
        self._store_old_grad = function(
            [norm],
            updates=OrderedDict([(grad_to_old_grad[g], g * norm)
                                 for g in grad_to_old_grad]),
            mode=self.theano_function_mode,
            name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered]

        def dot_product(x, y):
            # Sum of elementwise products over paired lists of tensors.
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        # The 1e-7 term keeps the denominator away from zero.
        beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        """

        beta_pr is the Polak-Ribiere formula for beta.
        According to wikipedia, the beta to use for NCG is "a matter of
        heuristics or taste" but max(0, beta_pr) is "a popular choice...
        which provides direction reset automatically." (ie, it is meant to
        revert to steepest descent when you have traveled far enough that
        the objective function is behaving non-quadratically enough that
        the conjugate gradient formulas aren't working anymore)

        http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
        """

        # NOTE(review): `grad` here is the name called as
        # grad(objective, param) above, not a gradient variable; this
        # check looks vacuous. Confirm the intent.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g])
                                  for g in grad_ordered]

        # If the mode has a `record` attribute, log each update pair.
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                        + var_descriptor(v) + '\n')
                mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                        + var_descriptor(u) + '\n')

        self._make_conjugate = function(
            [],
            updates=make_conjugate_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                        + var_descriptor(output) + '\n')

    # Default tolerance depends on the dtype of the objective.
    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
def __init__(self, rows, cols, channels):
    """
    Set up a `Conv2DSpace` input and a random projection vector `P`.

    Parameters
    ----------
    rows : int
        Number of rows of the input space.
    cols : int
        Number of columns of the input space.
    channels : int
        Number of channels of the input space.
    """
    total = rows * cols * channels

    self.input_space = Conv2DSpace((rows, cols), channels)
    self.dim = total

    # Fixed seed so that P is the same on every construction.
    generator = np.random.RandomState([2012, 9, 25])
    projection = generator.uniform(-1., 1., (total, ))
    self.P = sharedX(projection)
def set_input_space(self, space):
    """
    Use `space` as both input and output space and reset `D` to zeros.

    Parameters
    ----------
    space : Space
        The space of the layer's input; `D` gets one entry per
        dimension of this space.
    """
    self.input_space = space
    self.output_space = space

    n_entries = space.get_total_dimension()
    initial = np.zeros((n_entries, ))
    self.D = sharedX(initial, self.layer_name + '_D')

    self._params = [self.D]
def __init__(self, num_arms, mean_std=1.0, std_std=1.0):
    """
    Draw a random mean and standard deviation for each arm.

    Parameters
    ----------
    num_arms : int
        Number of arms; the length of `means` and `stds`.
    mean_std : float
        Scale applied to the standard normal draws used as means.
    std_std : float
        Scale applied to the standard normal draws whose absolute
        values are used as standard deviations.
    """
    self.rng = make_np_rng(None, [2013, 11, 12], which_method="randn")

    # The draw order (means, stds, theano seed) fixes the random stream.
    mean_draws = self.rng.randn(num_arms)
    self.means = sharedX(mean_draws * mean_std)

    std_draws = self.rng.randn(num_arms)
    self.stds = sharedX(np.abs(std_draws * std_std))

    stream_seed = self.rng.randint(2 ** 16)
    self.theano_rng = make_theano_rng(None, stream_seed,
                                      which_method="normal")
__email__ = "pylearn-dev@googlegroups" import warnings import numpy import theano.tensor as T from pylearn2.compat import OrderedDict from pylearn2.expr.basic import log_sum_exp from pylearn2.models.model import Model from pylearn2.models.vae.kl import find_integrator_for from pylearn2.space import VectorSpace from pylearn2.utils import wraps, sharedX, safe_update from pylearn2.utils.rng import make_np_rng default_seed = 2014 + 9 + 20 pi = sharedX(numpy.pi) class VAE(Model): """ Implementation of the variational autoencoder (VAE). Parameters ---------- nvis : int Number of dimensions in the input data prior : pylearn2.models.vae.prior.Prior Represents the prior distribution :math:`p_\\theta(\\mathbf{z})` conditional : pylearn2.models.vae.conditional.Conditional Represents the conditional distribution :math:`p_\\theta(\\mathbf{x} \\mid \\mathbf{z})`
__email__ = "dinhlaur@iro" import numpy as np import scipy import scipy.linalg import theano import theano.tensor as T from pylearn2.models.model import Model from pylearn2.models.mlp import Layer, Linear, MLP from pylearn2.space import VectorSpace, CompositeSpace from pylearn2.utils import sharedX, wraps, as_floatX from pylearn2.utils.rng import make_theano_rng from pylearn2.linear.matrixmul import MatrixMul from theano.compat.python2x import OrderedDict pi = sharedX(np.pi) T_inv = T.nlinalg.MatrixInverse() T_det = T.nlinalg.Det() class TriangularMLP(MLP): """ Triangular MLP, a MLP of bijective layers. (see pylearn2.models.mlp for arguments) """ def inv_fprop(self, state, return_all=False): """ Inversion of the MLP forward propagation. Parameters
def set_input_space(self, space): """ Note: this resets parameters! """ # set up detector space and initialize transformer setup_detector_layer_b01tc(layer=self, input_space=space, rng=self.mlp.rng, irange=self.irange) rng = self.mlp.rng detector_shape = self.detector_space.shape #def handle_pool_shape(idx): # if self.pool_shape[idx] < 1: # raise ValueError("bad pool shape: " + str(self.pool_shape)) # if self.pool_shape[idx] > detector_shape[idx]: # if self.fix_pool_shape: # assert detector_shape[idx] > 0 # self.pool_shape[idx] = detector_shape[idx] # else: # raise ValueError("Pool shape exceeds detector layer shape on axis %d" % idx) #map(handle_pool_shape, [0, 1, 2]) ### Check some precondition assert self.pool_shape[0] == self.pool_shape[1] assert self.pool_stride[0] == self.pool_stride[1] assert all( isinstance(elem, py_integer_types) for elem in self.pool_stride) for i in xrange(0, 2): assert self.pool_stride[i] <= self.pool_shape[i] assert all( isinstance(elem, py_integer_types) for elem in self.pool_stride) dummy_shape = [self.input_space.shape[0], self.input_space.shape[1]] # added to find out output space shape after temporal and spatial pooling "max_pool_c01b" dummy_output_shape = [ int(np.ceil((i_sh + 2. 
* self.pad - k_sh) / float(k_st))) + 1 for i_sh, k_sh, k_st in zip(dummy_shape, self.kernel_shape, self.kernel_stride) ] dummy_output_shape = [dummy_output_shape[0], dummy_output_shape[1]] #print dummy_output_shape dummy_detector_space = Conv2DSpace(shape=dummy_output_shape, num_channels=self.detector_channels, axes=('c', 0, 1, 'b')) # picked only 16 channels and 1 image in order to do a fast dummy maxpooling (16 because Alex's code needs at least 16 channels) dummy_detector = sharedX( dummy_detector_space.get_origin_batch(2)[0:16, :, :, :]) dummy_p = max_pool_c01b(c01b=dummy_detector, pool_shape=self.pool_shape, pool_stride=self.pool_stride) dummy_p = dummy_p.eval() # set space after temporal pooling with overlap if self.pool_temporal_stride[1] > self.pool_temporal_shape[1]: if self.fix_pool_stride: warnings.warn("Fixing the pool stride") ps = self.pool_temporal_shape[1] assert isinstance(ps, py_integer_types) self.pool_stride = [1, ps] else: raise ValueError("Stride too big.") # (0*1,'t') dummy_temp_image = [(dummy_p.shape[1] * dummy_p.shape[2]), self.detector_space.shape[2]] #overlapped temporal max pooling image_shape self.temp_pool_input_shape = dummy_temp_image dummy_temp_space = Conv2DSpace(shape=dummy_temp_image, num_channels=self.detector_channels, axes=('c', 0, 1, 'b')) temp_input = sharedX( dummy_temp_space.get_origin_batch(2)[0:16, :, :, :]) dummy_temp_p = temporal_max_pool_c01b( c01b=temp_input, pool_shape=self.pool_temporal_shape, pool_stride=self.pool_temporal_stride, image_shape=dummy_temp_image) dummy_temp_p = dummy_temp_p.eval() self.output_space = Conv3DSpace( shape=[dummy_p.shape[1], dummy_p.shape[2], dummy_temp_p.shape[2]], num_channels=self.num_channels, axes=('b', 0, 1, 't', 'c')) # Print spaces print "Input shape: ", self.input_space.shape print "Detector space: ", self.detector_space.shape print "Output space: ", self.output_space.shape
#stats = SufficientStatistics.from_observations( needed_stats = needed_stats, V = V, ** obs ) #em_functional = model.em_functional( stats = stats, H_hat = obs['H_hat'], S_hat = obs['S_hat'], var_s0_hat = obs['var_s0_hat'], var_s1_hat = obs['var_s1_hat']) trunc_kl = model.inference_procedure.truncated_KL(V, obs) if config.compute_test_value != 'off': assert not np.any(np.isnan(trunc_kl.tag.test_value)) assert len(trunc_kl.type.broadcastable) == 0 print 'compiling function...' from theano import function G = [ sharedX(np.zeros((batch_size, rbm.nhid), dtype='float32')) for rbm in model.dbm.rbms ] H = sharedX(np.zeros((batch_size, model.s3c.nhid), dtype='float32')) S = sharedX(np.zeros((batch_size, model.s3c.nhid), dtype='float32')) new_stats = SufficientStatistics.from_observations( needed_stats=needed_stats, V=V, H_hat=H, S_hat=S, var_s0_hat=obs['var_s0_hat'], var_s1_hat=obs['var_s1_hat']) obj = model.inference_procedure.truncated_KL( V, {
from pylearn2.datasets.binarizer import Binarizer
from pylearn2.datasets.mnist import MNIST

print 'Loading data...'
# Wrap the MNIST training set (one-hot labels) in a Binarizer.
raw = MNIST(which_set='train', one_hot=True)
train = Binarizer(raw)

print 'Compiling cost functions...'
# Set niter to 10 on every model before compiling.
for model in models:
    model.niter = 10
from galatea.dbm.inpaint.super_inpaint import SuperInpaint
from galatea.dbm.inpaint.super_inpaint import MaskGen
from pylearn2.utils import sharedX
# drop_prob is a shared variable initialized to 0.1.
mask_gen = MaskGen(drop_prob=sharedX(0.1), balance=0, sync_channels=0)
cost = SuperInpaint(both_directions=0, noise=0, supervised=1,
                    mask_gen=mask_gen)
from pylearn2.utils import function


def get_obj_func(model):
    """
    Compile a function computing `cost` for `model`.

    Symbolic batches are made from the model's input and output spaces;
    the returned function takes concrete values for both (X, Y) and
    returns the value of `cost(model, X, Y)`.
    """
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    obj = cost(model, X, Y)
    return function([X, Y], obj)
def __init__(self, num_arms, mean_std=1.0, std_std=1.0):
    """
    Draw a random mean and standard deviation for each arm.

    Parameters
    ----------
    num_arms : int
        Number of arms; the length of `means` and `stds`.
    mean_std : float
        Scale applied to the standard normal draws used as means.
    std_std : float
        Scale applied to the standard normal draws whose absolute
        values are used as standard deviations.
    """
    # Fixed seed; the order of the draws below fixes the random stream.
    self.rng = np.random.RandomState([2013, 11, 12])

    arm_means = self.rng.randn(num_arms) * mean_std
    self.means = sharedX(arm_means)

    arm_stds = np.abs(self.rng.randn(num_arms) * std_std)
    self.stds = sharedX(arm_stds)

    theano_seed = self.rng.randint(2**16)
    self.theano_rng = MRG_RandomStreams(theano_seed)
def test_softmax_mf_sample_consistent():
    """
    Check that Softmax sampling agrees with its mean field update.

    A Softmax layer holds a single random variable (with n_classes
    possible values), so the mean field assumption imposes no
    restriction and mf_update gives the true expected value of y given
    v. Samples drawn with the layer's sample method should therefore
    converge to that value.
    """
    rng = np.random.RandomState([2012, 11, 1, 1154])
    theano_rng = MRG_RandomStreams(2012 + 11 + 1 + 1154)
    num_samples = 1000
    tol = .042

    # Build a DBM with one binary visible layer and one softmax layer,
    # both with random sizes and random biases.
    num_vis = rng.randint(1, 11)
    n_classes = rng.randint(1, 11)

    v = BinaryVector(num_vis)
    vis_biases = rng.uniform(-1., 1., (num_vis,))
    v.set_biases(vis_biases.astype(config.floatX))

    y = Softmax(n_classes=n_classes, layer_name='y', irange=1.)
    class_biases = rng.uniform(-1., 1., (n_classes,))
    y.set_biases(class_biases.astype(config.floatX))

    dbm = DBM(visible_layer=v, hidden_layers=[y], batch_size=1, niter=50)

    # Randomly pick a v to condition on
    # (Random numbers are generated via dbm.rng)
    states = dbm.make_layer_to_state(1)
    v_state = states[v]
    y_state = states[y]

    # Infer P(y | v) using mean field
    mean_field = y.mf_update(state_below=v.upward_state(v_state))
    expected_y = mean_field[0, :].eval()

    # Broadcast the single example to a batch of num_samples rows by
    # adding a column of zeros.
    zero_column = sharedX(np.zeros((num_samples,))).dimshuffle(0, 'x')
    v_state = v_state[0, :] + zero_column
    y_state = y_state[0, :] + zero_column

    sample_expr = y.sample(state_below=v.upward_state(v_state),
                           theano_rng=theano_rng)
    y_samples = function([], sample_expr)()

    check_multinomial_samples(y_samples, (num_samples, n_classes),
                              expected_y, tol)
def make_local_rfs(dataset, nhid, rf_shape, stride, irange=.05,
                   draw_patches=False, rng=None):
    """
    Initializes a weight matrix with local receptive fields

    Parameters
    ----------
    dataset : pylearn2.datasets.dataset.Dataset
        Dataset defining the topology of the space (needed to convert 2D
        patches into subsets of pixels in a 1D filter vector)
    nhid : int
        Number of hidden units to make filters for
    rf_shape : list or tuple (2 elements)
        Gives topological shape of a receptive field
    stride : list or tuple (2 elements)
        Gives offset between receptive fields
    irange : float
        If draw_patches is False, weights are initialized in
        U(-irange,irange)
    draw_patches : bool
        If True, weights are drawn from random examples
    rng : optional
        Passed to `make_np_rng` together with a fixed default seed.

    Returns
    -------
    rval : MatrixMul
        A `MatrixMul` wrapping a shared variable that holds the weights:
        the transposed design-matrix view of the `nhid` filters.

    Raises
    ------
    ValueError
        If `nhid` is not a multiple of the number of receptive field
        positions.
    NotImplementedError
        If `stride` is None.
    """
    s = dataset.view_shape()
    height, width, channels = s
    W_img = np.zeros((nhid, height, width, channels))

    last_row = height - rf_shape[0]
    last_col = width - rf_shape[1]

    rng = make_np_rng(rng, [2012, 7, 18], which_method='uniform')

    if stride is None:
        # Random-shaped patches without a stride exist in s3c but were
        # never generalized for this function.
        raise NotImplementedError()

    # Receptive fields are laid out on a regular grid. The counts below
    # are used as loop bounds, so they must stay integers: use "//".
    assert last_row % stride[0] == 0
    num_row_steps = last_row // stride[0] + 1

    assert last_col % stride[1] == 0
    num_col_steps = last_col // stride[1] + 1

    total_rfs = num_row_steps * num_col_steps

    if nhid % total_rfs != 0:
        raise ValueError('nhid modulo total_rfs should be 0, but we get '
                         '%d modulo %d = %d' % (nhid, total_rfs,
                                                nhid % total_rfs))

    filters_per_rf = nhid // total_rfs

    idx = 0
    for r in range(num_row_steps):
        rc = r * stride[0]
        for c in range(num_col_steps):
            cc = c * stride[1]

            for _ in range(filters_per_rf):
                if draw_patches:
                    img = dataset.get_batch_topo(1)[0]
                    local_rf = img[rc:rc + rf_shape[0],
                                   cc:cc + rf_shape[1],
                                   :]
                else:
                    local_rf = rng.uniform(
                        -irange, irange,
                        (rf_shape[0], rf_shape[1], channels))

                # Everything outside the receptive field stays zero.
                W_img[idx, rc:rc + rf_shape[0],
                      cc:cc + rf_shape[1], :] = local_rf
                idx += 1
    assert idx == nhid

    W = dataset.view_converter.topo_view_to_design_mat(W_img).T

    rval = MatrixMul(W=sharedX(W))

    return rval
def redo_everything(self):
    """
    Re-create the `beta` and `mu` shared variables from their initial
    values, then rebuild the theano graph via `redo_theano`.
    """
    # Both parameters are vectors with one entry per visible unit.
    beta_init = self.init_beta * np.ones((self.nvis, ))
    self.beta = sharedX(beta_init, 'beta')

    mu_init = self.init_mu * np.ones((self.nvis, ))
    self.mu = sharedX(mu_init, 'mu')

    self.redo_theano()
def __init__(self, dim):
    """
    Store `dim` and create the shared vector `P` of length `dim`, drawn
    from U(-1, 1) with a fixed seed so the values are reproducible.
    """
    self.dim = dim

    generator = np.random.RandomState([2012, 9, 25])
    initial_P = generator.uniform(-1., 1., (dim, ))
    self.P = sharedX(initial_P)
def __init__(self, n_vis, n_hid, layer_name, rng=None, return_indices=None,
             param_init_range=0.02, forget_gate_init_bias=0.05,
             input_gate_init_bias=0., output_gate_init_bias=0.,
             dropout_prob=0.0):
    """
    Store the hyper-parameters and allocate every shared parameter of the
    layer.

    Parameters
    ----------
    n_vis : int
        Number of input units (rows of the input-to-hidden matrices).
    n_hid : int
        Number of hidden units.
    layer_name : str
        Prefix used when naming the shared variables.
    rng : numpy.random.RandomState, optional
        Random generator used for weight initialisation. An unseeded
        `RandomState` is created when omitted.
    return_indices : optional
        Stored on the instance unchanged.
    param_init_range : float
        Weights are drawn from U(-param_init_range, param_init_range).
    forget_gate_init_bias, input_gate_init_bias, output_gate_init_bias : float
        Constant initial value of the corresponding gate bias vector.
    dropout_prob : float
        Stored on the instance unchanged.
    """
    self.rng = np.random.RandomState() if rng is None else rng

    self.n_vis = n_vis
    self.n_hid = n_hid
    self.layer_name = layer_name

    self.param_init_range = param_init_range
    self.return_indices = return_indices
    self.forget_gate_init_bias = forget_gate_init_bias
    self.input_gate_init_bias = input_gate_init_bias
    self.output_gate_init_bias = output_gate_init_bias
    self.dropout_prob = dropout_prob

    # Draw the two random matrices exactly once; every gate starts from
    # the same values as the main weights.
    bound = self.param_init_range
    init_Wxh = self.rng.uniform(-bound, bound, (self.n_vis, self.n_hid))
    init_Whh = self.rng.uniform(-bound, bound, (self.n_hid, self.n_hid))

    prefix = self.layer_name
    bias_shape = (self.n_hid, )

    # input-to-hidden (rows, cols) = (n_visible, n_hidden)
    self.Wxh = theano.shared(value=init_Wxh, name=prefix + '_Wxh',
                             borrow=True)
    self.bxh = theano.shared(value=np.zeros(self.n_hid), name='bxh',
                             borrow=True)
    # hidden-to-hidden (rows, cols) = (n_hidden, n_hidden) for both
    # encoding and decoding ('tied weights')
    self.Whh = theano.shared(value=init_Whh, name=prefix + '_Whh',
                             borrow=True)

    # Output gate
    self.O_b = sharedX(np.zeros(bias_shape) + self.output_gate_init_bias,
                       name=(prefix + '_O_b'))
    self.O_x = sharedX(init_Wxh, name=(prefix + '_O_x'))
    self.O_h = sharedX(init_Whh, name=(prefix + '_O_h'))
    self.O_c = sharedX(init_Whh.copy(), name=(prefix + '_O_c'))

    # Input gate
    self.I_b = sharedX(np.zeros(bias_shape) + self.input_gate_init_bias,
                       name=(prefix + '_I_b'))
    self.I_x = sharedX(init_Wxh.copy(), name=(prefix + '_I_x'))
    self.I_h = sharedX(init_Whh.copy(), name=(prefix + '_I_h'))
    self.I_c = sharedX(init_Whh.copy(), name=(prefix + '_I_c'))

    # Forget gate
    self.F_b = sharedX(np.zeros(bias_shape) + self.forget_gate_init_bias,
                       name=(prefix + '_F_b'))
    self.F_x = sharedX(init_Wxh.copy(), name=(prefix + '_F_x'))
    self.F_h = sharedX(init_Whh.copy(), name=(prefix + '_F_h'))
    self.F_c = sharedX(init_Whh.copy(), name=(prefix + '_F_c'))

    # Collect the parameters in the fixed order callers rely on.
    param_attrs = ('Wxh', 'bxh', 'Whh',
                   'O_b', 'O_x', 'O_h', 'O_c',
                   'I_b', 'I_x', 'I_h', 'I_c',
                   'F_b', 'F_x', 'F_h', 'F_c')
    self.params = [getattr(self, attr) for attr in param_attrs]
def __init__(self):
    """
    Create one zero-filled shared parameter per entry of the enclosing
    `shapes` sequence and declare a one-dimensional vector input space.
    """
    zero_params = []
    for shape in shapes:
        zero_params.append(sharedX(np.zeros(shape)))
    self._params = zero_params
    self.input_space = VectorSpace(1)
def test_batch_gradient_descent(): """ Verify that batch gradient descent works by checking that it minimizes a quadratic function f(x) = x^T A x + b^T x + c correctly for several sampled values of A, b, and c. The ground truth minimizer is x = np.linalg.solve(A,-b)""" n = 3 A = T.matrix(name='A') b = T.vector(name='b') c = T.scalar(name='c') x = sharedX(np.zeros((n, )), name='x') half = np.cast[config.floatX](0.5) obj = half * T.dot(T.dot(x, A), x) + T.dot(b, x) + c minimizer = BatchGradientDescent(objective=obj, params=[x], inputs=[A, b, c]) num_samples = 3 rng = np.random.RandomState([1, 2, 3]) for i in xrange(num_samples): A = np.cast[config.floatX](rng.randn(1.5 * n, n)) A = np.cast[config.floatX](np.dot(A.T, A)) A += np.cast[config.floatX](np.identity(n) * .02) b = np.cast[config.floatX](rng.randn(n)) c = np.cast[config.floatX](rng.randn()) x.set_value(np.cast[config.floatX](rng.randn(n))) analytical_x = np.linalg.solve(A, -b) actual_obj = minimizer.minimize(A, b, c) actual_x = x.get_value() #Check that the value returned by the minimize method #is the objective function value at the parameters #chosen by the minimize method cur_obj = minimizer.obj(A, b, c) assert np.allclose(actual_obj, cur_obj) x.set_value(analytical_x) analytical_obj = minimizer.obj(A, b, c) #make sure the objective function is accurate to first 4 digits condition1 = not np.allclose(analytical_obj, actual_obj) condition2 = np.abs(analytical_obj - actual_obj) >= 1e-4 * np.abs(analytical_obj) if (config.floatX == 'float64' and condition1) \ or (config.floatX == 'float32' and condition2): print 'objective function value came out wrong on sample ', i print 'analytical obj', analytical_obj print 'actual obj', actual_obj """ The following section of code was used to verify that numerical error can make the objective function look non-convex print 'Checking for numerically induced non-convex behavior' def f(x): return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c x.set_value(actual_x) 
minimizer._compute_grad(A,b,c) minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() x = actual_x.copy() prev = f(x) print prev step_size = 1e-4 x += step_size * d cur = f(x) print cur cur_sgn = np.sign(cur-prev) flip_cnt = 0 for i in xrange(10000): x += step_size * d prev = cur cur = f(x) print cur prev_sgn = cur_sgn cur_sgn = np.sign(cur-prev) if cur_sgn != prev_sgn: print 'flip' flip_cnt += 1 if flip_cnt > 1: print "Non-convex!" from matplotlib import pyplot as plt y = [] x = actual_x.copy() for j in xrange(10000): y.append(f(x)) x += step_size * d plt.plot(y) plt.show() assert False print 'None found' """ #print 'actual x',actual_x #print 'A:' #print A #print 'b:' #print b #print 'c:' #print c x.set_value(actual_x) minimizer._compute_grad(A, b, c) x_grad = minimizer.param_to_grad_shared[x] actual_grad = x_grad.get_value() correct_grad = 0.5 * np.dot(A, x.get_value()) + 0.5 * np.dot( A.T, x.get_value()) + b if not np.allclose(actual_grad, correct_grad): print 'gradient was wrong at convergence point' print 'actual grad: ' print actual_grad print 'correct grad: ' print correct_grad print 'max difference: ', np.abs(actual_grad - correct_grad).max() assert False minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \ + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d)) g = np.dot(A, actual_x) + b deriv = np.dot(g, d) print 'directional deriv at actual', deriv print 'optimal step_len', step_len optimal_x = actual_x - d * step_len g = np.dot(A, optimal_x) + b deriv = np.dot(g, d) print 'directional deriv at optimal: ', deriv x.set_value(optimal_x) print 'obj at optimal: ', minimizer.obj(A, b, c) print 'eigenvalue range:' val, vec = np.linalg.eig(A) print(val.min(), val.max()) print 'condition number: ', (val.max() / val.min()) assert False
def shared_dataset(data_x):
    """
    Load `data_x` into a shared variable.

    When the enclosing `conf` mapping has 'normalize' enabled (the default)
    the data goes through `sharedX`; otherwise it is wrapped with
    `theano.shared` after `theano._asarray`. The array is borrowed in both
    cases.
    """
    if not conf.get('normalize', True):
        raw = theano._asarray(data_x)
        return theano.shared(raw, borrow=True)
    return sharedX(data_x, borrow=True)
T.dot(X, self.W1) + T.dot(y, self.W2.T) + self.b1) y = T.nnet.softmax(T.dot(H, self.W2) + self.b2) return y def mfny_arg(self, X): H = T.nnet.sigmoid(T.dot(X, 2 * self.W1) + self.b1) y = T.nnet.softmax(T.dot(H, self.W2) + self.b2) for i in xrange(mf_iter - 1): H = T.nnet.sigmoid( T.dot(X, self.W1) + T.dot(y, self.W2.T) + self.b1) y = T.nnet.softmax(T.dot(H, self.W2) + self.b2) return T.dot(H, self.W2) + self.b2 X = sharedX(dataset.X) y = sharedX(dataset.y) idx = T.iscalar() idx.tag.test_value = 0 Xb = X[idx * batch_size:(idx + 1) * batch_size, :] yb = y[idx * batch_size:(idx + 1) * batch_size, :] mf1mod = cRBM(W1, b1, W2, b2) mfnmod = cRBM(W1, b1, W2, b2) ymf1_arg = mf1mod.mf1y_arg(Xb) ymfn_arg = mfnmod.mfny_arg(Xb)
# Build `num_beta` precision values spaced evenly in log10 between
# 10**min_exp and 10**max_exp (pos runs from 0 to 1 inclusive).
idxs = np.arange(num_beta)
pos = idxs / float(num_beta-1)
scaled_shifted = pos * (max_exp-min_exp) + min_exp
betas = 10 ** scaled_shifted

# kls holds one value per (trial, beta); ml_kls holds the KL divergence of
# the maximum likelihood model for each trial.
kls = np.zeros((trials,num_beta))
ml_kls = np.zeros((trials,))

for trial in xrange(trials):
    #generate the data
    # A different seed per trial gives each trial its own sample.
    data_distribution = MND( sigma = np.identity(dim) / true_beta,
                             mu = np.zeros((dim,)), seed = 17 * (trial+1) )
    true = DiagonalMND( nvis = dim, init_beta = true_beta, init_mu = 0.,
                        min_beta = .1, max_beta = 10.)
    # Compile and immediately evaluate the sampling expression, then keep
    # the resulting design matrix in a shared variable.
    X = sharedX(function([],data_distribution.random_design_matrix(m))())

    # Maximum likelihood estimates: per-dimension mean and inverse variance.
    Xv = X.get_value()
    mu = Xv.mean(axis=0)
    print 'maximum likelihood mu: ',mu
    diff = Xv - mu
    var = np.square(diff).mean(axis=0)
    mlbeta = 1./var
    print 'maximum likelihood beta: ',mlbeta
    ml_model = DiagonalMND( nvis = dim, init_mu = mu, init_beta = mlbeta,
                            min_beta = 0.0, max_beta = 1e6)
    # kl_divergence returns a symbolic expression; compile and evaluate it.
    ml_kl = kl_divergence( true, ml_model)
    ml_kl = function([],ml_kl)()
    assert ml_kl >= 0.0
    ml_kls[trial] = ml_kl
def __init__(self, W1, b1, W2, b2):
    """
    Wrap the two weight arrays and two bias arrays in shared variables and
    store them as `W1`, `W2`, `b1` and `b2`.
    """
    initial_values = (('W1', W1), ('W2', W2), ('b1', b1), ('b2', b2))
    for attr, value in initial_values:
        setattr(self, attr, sharedX(value))
def _initialize_visbias(self, nvis):
    """
    Create the visible bias `visbias` as a zero vector of length `nvis`,
    stored in a shared variable named 'vb'.
    """
    zero_bias = numpy.zeros(nvis)
    self.visbias = sharedX(zero_bias, name='vb', borrow=True)
def train_all(self, dataset, mu=None): """ Process kmeans algorithm on the input to localize clusters. Parameters ---------- dataset : WRITEME mu : WRITEME Returns ------- rval : bool WRITEME """ #TODO-- why does this sometimes return X and sometimes return nothing? X = dataset.get_design_matrix() n, m = X.shape k = self.k if milk is not None: #use the milk implementation of k-means if it's available cluster_ids, mu = milk.kmeans(X, k) else: #our own implementation # taking random inputs as initial clusters if user does not provide # them. if mu is not None: if not len(mu) == k: raise Exception( 'You gave %i clusters, but k=%i were expected' % (len(mu), k)) else: indices = numpy.random.randint(X.shape[0], size=k) mu = X[indices] try: dists = numpy.zeros((n, k)) except MemoryError: raise TypicalMemoryError("dying trying to allocate dists " "matrix for {0} examples and {1} " "means".format(n, k)) old_kills = {} iter = 0 mmd = prev_mmd = float('inf') while True: if self.verbose: logger.info('kmeans iter {0}'.format(iter)) #print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd) #if numpy.sum(numpy.isnan(mu)) > 0: if numpy.any(numpy.isnan(mu)): logger.info('nan found') return X #computing distances for i in xrange(k): dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1) if iter > 0: prev_mmd = mmd min_dists = dists.min(axis=1) #mean minimum distance: mmd = min_dists.mean() logger.info('cost: {0}'.format(mmd)) if iter > 0 and (iter >= self.max_iter or \ abs(mmd - prev_mmd) < self.convergence_th): #converged break #finding minimum distances min_dist_inds = dists.argmin(axis=1) #computing means i = 0 blacklist = [] new_kills = {} while i < k: b = min_dist_inds == i if not numpy.any(b): killed_on_prev_iter = True #initializes empty cluster to be the mean of the d data #points farthest from their corresponding means if i in old_kills: d = old_kills[i] - 1 if d == 0: d = 50 new_kills[i] = d else: d = 5 mu[i, :] = 0 for j in xrange(d): idx = numpy.argmax(min_dists) 
min_dists[idx] = 0 #chose point idx mu[i, :] += X[idx, :] blacklist.append(idx) mu[i, :] /= float(d) #cluster i was empty, reset it to d far out data points #recomputing distances for this cluster dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1) min_dists = dists.min(axis=1) for idx in blacklist: min_dists[idx] = 0 min_dist_inds = dists.argmin(axis=1) #done i += 1 else: mu[i, :] = numpy.mean(X[b, :], axis=0) if numpy.any(numpy.isnan(mu)): logger.info('nan found at {0}'.format(i)) return X i += 1 old_kills = new_kills iter += 1 self.mu = sharedX(mu) self._params = [self.mu] return True
def __init__(self, learning_rate, cost=None, batch_size=None,
             monitoring_batches=None, monitoring_dataset=None,
             monitor_iteration_mode='sequential',
             termination_criterion=None, update_callbacks=None,
             init_momentum=None, set_batch_size=False,
             train_iteration_mode=None, batches_per_iter=None,
             theano_function_mode=None, monitoring_costs=None,
             seed=[2012, 10, 5]):
    """
    Configure stochastic gradient descent.

    Parameters are updated by the formula:

        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc

    Parameters
    ----------
    learning_rate : float
        The learning rate to use, kept in a shared variable. Train object
        callbacks can change it after each epoch; SGD update_callbacks
        can change it after each minibatch.
    cost : pylearn2.costs.cost.Cost, optional
        The objective function to be minimized. If None, SGD will call
        the model's get_default_cost method to obtain it. Collections of
        costs are rejected; use pylearn2.costs.cost.SumOfCosts.
    batch_size : int, optional
        Stored on the instance unchanged.
    monitoring_batches : int, optional
        Number of monitoring batches; only valid together with
        `monitoring_dataset`.
    monitoring_dataset : optional
        Handed to `_set_monitoring_dataset`.
    monitor_iteration_mode : str
        Stored on the instance unchanged.
    termination_criterion : optional
        Stored on the instance unchanged.
    update_callbacks : optional
        Handed to `_register_update_callbacks`.
    init_momentum : float, optional
        If None, momentum is not used. Otherwise it must lie in [0, 1)
        and initializes the momentum coefficient, which callbacks can
        change over time just like the learning rate. If the gradient is
        the same on every step, the update taken by SGD is scaled by a
        factor of 1/(1-momentum). See section 9 of Geoffrey Hinton's
        "A Practical Guide to Training Restricted Boltzmann Machines".
    set_batch_size : bool
        If True, and batch_size conflicts with model.force_batch_size,
        model.set_batch_size(batch_size) is called in an attempt to
        change model.force_batch_size.
    train_iteration_mode : str, optional
        Defaults to 'shuffled_sequential' when None.
    batches_per_iter : int, optional
        Stored on the instance unchanged.
    theano_function_mode : optional
        The theano mode to compile the updates function with. pylearn2
        ships extra wraplinker modes (see pylearn2.devtools) that can,
        for example, check for NaNs at every step or record md5 digests
        of all computations to help isolate nondeterminism.
    monitoring_costs : optional
        Stored on the instance unchanged.
    seed : array_like
        Seed for the `np.random.RandomState` kept in `self.rng`.

    Raises
    ------
    TypeError
        If `cost` is a list, tuple or set.
    ValueError
        If `monitoring_batches` is given without `monitoring_dataset`.
    """
    collection_types = (list, tuple, set)
    if isinstance(cost, collection_types):
        raise TypeError("SGD no longer supports using collections of Costs to represent "
                        " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.cost = cost
    self.batch_size = batch_size
    self.set_batch_size = set_batch_size
    self.batches_per_iter = batches_per_iter

    # Monitoring configuration.
    self._set_monitoring_dataset(monitoring_dataset)
    self.monitoring_batches = monitoring_batches
    self.monitor_iteration_mode = monitor_iteration_mode
    if monitoring_dataset is None and monitoring_batches is not None:
        raise ValueError("Specified an amount of monitoring batches but not a monitoring dataset.")

    self.termination_criterion = termination_criterion

    # Momentum is optional; when present it lives in a shared variable.
    self.init_momentum = init_momentum
    if init_momentum is not None:
        assert init_momentum >= 0.
        assert init_momentum < 1.
        self.momentum = sharedX(init_momentum, 'momentum')
    else:
        self.momentum = None

    self._register_update_callbacks(update_callbacks)

    self.train_iteration_mode = ('shuffled_sequential'
                                 if train_iteration_mode is None
                                 else train_iteration_mode)
    self.first = True
    self.rng = np.random.RandomState(seed)
    self.theano_function_mode = theano_function_mode
    self.monitoring_costs = monitoring_costs
def setup(self, model, dataset):
    """
    Compile the theano function that performs one SGD step on `model`.

    The method validates the model parameters, reconciles the batch size
    with `model.force_batch_size`, builds the symbolic cost, registers the
    monitoring channels, assembles the parameter updates (with or without
    momentum), lets the model censor them, and finally compiles
    `self.sgd_update`.

    Parameters
    ----------
    model : object
        The model to train. Its parameters, input space, learning rate
        scalers and `censor_updates` hook are used here.
    dataset : object
        Only used to fetch example batches for theano test values when
        `config.compute_test_value == 'raise'`.

    Raises
    ------
    ValueError
        If a parameter contains Inf or NaN, if the batch size conflicts
        with `model.force_batch_size` and `set_batch_size` is False, if a
        learning rate scaler refers to a non-parameter, or if a debug
        value of an update contains Inf or NaN.
    """

    if self.cost is None:
        self.cost = model.get_default_cost()

    # Refuse to start from parameters that are already numerically broken.
    inf_params = [param for param in model.get_params()
                  if np.any(np.isinf(param.get_value()))]
    if len(inf_params) > 0:
        raise ValueError("These params are Inf: " + str(inf_params))
    # Build the offending list once rather than scanning the parameters
    # twice (once to detect, once to report).
    nan_params = [param for param in model.get_params()
                  if np.any(np.isnan(param.get_value()))]
    if len(nan_params) > 0:
        raise ValueError("These params are NaN: " + str(nan_params))
    self.model = model

    # Reconcile our batch size with one the model may insist on.
    batch_size = self.batch_size
    if hasattr(model, "force_batch_size"):
        if model.force_batch_size > 0:
            if batch_size is not None:
                if batch_size != model.force_batch_size:
                    if self.set_batch_size:
                        model.set_batch_size(batch_size)
                    else:
                        raise ValueError("batch_size argument to SGD conflicts with model's force_batch_size attribute")
            else:
                self.batch_size = model.force_batch_size
    model._test_batch_size = self.batch_size
    self.monitor = Monitor.get_monitor(model)
    self.monitor._sanity_check()

    X = model.get_input_space().make_theano_batch(
        name="%s[X]" % self.__class__.__name__)
    # Anything that is not a 2D design matrix is treated as topological.
    self.topo = not X.ndim == 2

    if config.compute_test_value == 'raise':
        if self.topo:
            X.tag.test_value = dataset.get_batch_topo(self.batch_size)
        else:
            X.tag.test_value = dataset.get_batch_design(self.batch_size)

    Y = T.matrix(name="%s[Y]" % self.__class__.__name__)

    fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
    self.on_load_batch = fixed_var_descr.on_load_batch

    if self.cost.supervised:
        if config.compute_test_value == 'raise':
            _, Y.tag.test_value = dataset.get_batch_design(
                self.batch_size, True)

        self.supervised = True
        cost_value = self.cost(model, X, Y, **fixed_var_descr.fixed_vars)

    else:
        self.supervised = False
        cost_value = self.cost(model, X, **fixed_var_descr.fixed_vars)
    if cost_value is not None and cost_value.name is None:
        if self.supervised:
            cost_value.name = 'objective(' + X.name + ', ' + Y.name + ')'
        else:
            cost_value.name = 'objective(' + X.name + ')'

    # Set up monitor to model the objective value, learning rate,
    # momentum (if applicable), and extra channels defined by
    # the cost
    learning_rate = self.learning_rate
    if self.monitoring_dataset is not None:
        self.monitor.setup(dataset=self.monitoring_dataset,
                           cost=self.cost,
                           batch_size=self.batch_size,
                           num_batches=self.monitoring_batches,
                           extra_costs=self.monitoring_costs,
                           mode=self.monitor_iteration_mode)
        if self.supervised:
            ipt = (X, Y)
        else:
            ipt = X
        # The learning rate and momentum channels are attached to the
        # first monitoring dataset.
        dataset_name = list(self.monitoring_dataset.keys())[0]
        monitoring_dataset = self.monitoring_dataset[dataset_name]
        #TODO: have Monitor support non-data-dependent channels
        self.monitor.add_channel(name='learning_rate', ipt=ipt,
                                 val=learning_rate,
                                 dataset=monitoring_dataset)
        # Explicit None test, matching the momentum check further down;
        # avoids relying on the truth value of a shared variable.
        if self.momentum is not None:
            self.monitor.add_channel(name='momentum', ipt=ipt,
                                     val=self.momentum,
                                     dataset=monitoring_dataset)

    params = list(model.get_params())
    assert len(params) > 0
    for i, param in enumerate(params):
        if param.name is None:
            param.name = 'sgd_params[%d]' % i

    if self.cost.supervised:
        grads, updates = self.cost.get_gradients(
            model, X, Y, **fixed_var_descr.fixed_vars)
    else:
        grads, updates = self.cost.get_gradients(
            model, X, **fixed_var_descr.fixed_vars)

    # The cost must return a gradient for exactly the model's parameters.
    for param in grads:
        assert param in params
    for param in params:
        assert param in grads

    for param in grads:
        if grads[param].name is None and cost_value is not None:
            grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                 {'costname': cost_value.name,
                                  'paramname': param.name})

    lr_scalers = model.get_lr_scalers()

    for key in lr_scalers:
        if key not in params:
            raise ValueError("Tried to scale the learning rate on " +\
                    str(key)+" which is not an optimization parameter.")

    log.info('Parameter and initial learning rate summary:')
    for param in params:
        param_name = param.name
        if param_name is None:
            param_name = 'anon_param'
        lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
        log.info('\t' + param_name + ': ' + str(lr))

    if self.momentum is None:
        # Plain SGD: param := param - lr * scaler * grad
        updates.update(dict(safe_zip(params, [param - learning_rate * \
            lr_scalers.get(param, 1.) * grads[param]
                                for param in params])))
    else:
        for param in params:
            # Velocity buffer with the parameter's shape, starting at zero.
            inc = sharedX(param.get_value() * 0.)
            if param.name is not None:
                inc.name = 'inc_' + param.name
            updated_inc = self.momentum * inc - learning_rate * \
                lr_scalers.get(param, 1.) * grads[param]
            updates[inc] = updated_inc
            updates[param] = param + updated_inc

    for param in params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'
    # Give the model a chance to constrain the proposed updates.
    model.censor_updates(updates)
    for param in params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        for update_val in get_debug_values(update):
            if np.any(np.isinf(update_val)):
                raise ValueError("debug value of %s contains infs" %
                                 update.name)
            if np.any(np.isnan(update_val)):
                raise ValueError("debug value of %s contains nans" %
                                 update.name)

    with log_timing(log, 'Compiling sgd_update'):
        if self.supervised:
            fn_inputs = [X, Y]
        else:
            fn_inputs = [X]
        self.sgd_update = function(fn_inputs, updates=updates,
                                   name='sgd_update',
                                   on_unused_input='ignore',
                                   mode=self.theano_function_mode)
    self.params = params