def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
    R = MRG_RandomStreams(234, use_cuda=False)
    u = R.binomial(size=size, p=mean)
    f = theano.function(var_input, u, mode=mode)
    out = f(*input)

    # Increase the number of steps if sizes implies only a few samples
    if numpy.prod(const_size) < 10:
        steps_ = steps * 100
    else:
        steps_ = steps
    basictest(f, steps_, const_size, prefix="mrg cpu", inputs=input,
              allow_01=True, target_avg=mean, mean_rtol=rtol)

    if mode != "FAST_COMPILE" and cuda_available:
        R = MRG_RandomStreams(234, use_cuda=True)
        u = R.binomial(size=size, p=mean, dtype="float32")
        # well, it's really that this test w GPU doesn't make sense otw
        assert u.dtype == "float32"
        f = theano.function(
            var_input,
            theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u),
                       borrow=True),
            mode=mode_with_gpu)
        gpu_out = numpy.asarray(f(*input))
        basictest(f, steps_, const_size, prefix="mrg gpu", inputs=input,
                  allow_01=True, target_avg=mean, mean_rtol=rtol)
        numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)

    RR = theano.tensor.shared_randomstreams.RandomStreams(234)
    uu = RR.binomial(size=size, p=mean)
    ff = theano.function(var_input, uu, mode=mode)
    # It's not our problem if numpy generates 0 or 1
    basictest(ff, steps_, const_size, prefix="numpy", allow_01=True,
              inputs=input, target_avg=mean, mean_rtol=rtol)
def get_fixed_var_descr(self, model, X, Y): """ .. todo:: WRITEME """ assert Y is not None batch_size = model.batch_size drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size)) drop_mask_X.name = 'drop_mask' X_space = model.get_input_space() updates = OrderedDict() rval = FixedVarDescr() inputs=[X, Y] if not self.supervised: update_X = self.mask_gen(X, X_space = X_space) else: drop_mask_Y = sharedX(np.ones(batch_size,)) drop_mask_Y.name = 'drop_mask_Y' update_X, update_Y = self.mask_gen(X, Y, X_space) updates[drop_mask_Y] = update_Y rval.fixed_vars['drop_mask_Y'] = drop_mask_Y if self.mask_gen.sync_channels: n = update_X.ndim assert n == drop_mask_X.ndim - 1 update_X.name = 'raw_update_X' zeros_like_X = T.zeros_like(X) zeros_like_X.name = 'zeros_like_X' update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x') update_X.name = 'update_X' updates[drop_mask_X] = update_X rval.fixed_vars['drop_mask'] = drop_mask_X if hasattr(model.inference_procedure, 'V_dropout'): include_prob = model.inference_procedure.include_prob include_prob_V = model.inference_procedure.include_prob_V include_prob_Y = model.inference_procedure.include_prob_Y theano_rng = MRG_RandomStreams(2012+11+20) for elem in flatten([model.inference_procedure.V_dropout]): updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V if "Softmax" in str(type(model.hidden_layers[-1])): hid = model.inference_procedure.H_dropout[:-1] y = model.inference_procedure.H_dropout[-1] updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y else: hid = model.inference_procedure.H_dropout for elem in flatten(hid): updates[elem] = theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob rval.on_load_batch = [utils.function(inputs, updates=updates)] return rval
def dropout(x, level, noise_shape=None, seed=None):
    '''Sets entries in `x` to zero at random, while scaling the entire tensor.

    # Arguments
        x: tensor
        level: fraction of the entries in the tensor that will be set to 0.
        noise_shape: shape for randomly generated keep/drop flags,
            must be broadcastable to the shape of `x`
        seed: random seed to ensure determinism.
    '''
    if level < 0. or level >= 1:
        raise Exception('Dropout level must be in interval [0, 1[.')
    if seed is None:
        seed = np.random.randint(1, 10e6)
    rng = RandomStreams(seed=seed)
    retain_prob = 1. - level

    if noise_shape is None:
        random_tensor = rng.binomial(x.shape, p=retain_prob, dtype=x.dtype)
    else:
        random_tensor = rng.binomial(noise_shape, p=retain_prob, dtype=x.dtype)
        random_tensor = T.patternbroadcast(random_tensor,
                                           [dim == 1 for dim in noise_shape])
    x *= random_tensor
    x /= retain_prob
    return x
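# A minimal usage sketch (not from the original source): assumes the dropout()
# above and its np/T/RandomStreams imports are in scope.  With
# noise_shape=(batch, 1, features) the same keep/drop mask is broadcast over
# the middle (time) axis, which is the usual way to share one dropout mask
# across timesteps.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')
y = dropout(x, level=0.5, noise_shape=(2, 1, 4), seed=42)
f = theano.function([x], y)

out = f(np.ones((2, 3, 4), dtype=theano.config.floatX))
# Each (sample, feature) column is either all zeros or all 2.0 (= 1 / 0.5),
# i.e. the mask is shared across the time axis.
print(out)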
def build_model(tparams, options): opt_ret = dict() trng = RandomStreams(1234) p = 0.5 retain_prob = 1. - p print('dropout: {0}'.format(p)) # description string: #words x #samples # text: text sentence # hypothesis: hypothesis sentence text_embedding = tensor.tensor3('text_embedding', dtype='float32') # text = tensor.matrix('text', dtype='int64') text_mask = tensor.matrix('text_mask', dtype='float32') hypothesis_embedding = tensor.tensor3('hypothesis_embedding', dtype='float32') # hypothesis = tensor.matrix('hypothesis', dtype='int64') hypothesis_mask = tensor.matrix('hypothesis_mask', dtype='float32') label = tensor.vector('label', dtype='int64') # encoder proj = get_layer(options['encoder'])[1](tparams, text_embedding, None, options, prefix='encoder', mask=text_mask) ctx = proj[0][-1] dec_ctx = ctx # dropout dec_ctx_dropped = dec_ctx dec_ctx_dropped *= trng.binomial(dec_ctx_dropped.shape, p=retain_prob, dtype=dec_ctx_dropped.dtype) dec_ctx_dropped /= retain_prob # decoder (hypothesis) proj_hypo = get_layer(options['decoder'])[1](tparams, hypothesis_embedding, dec_ctx, options, prefix='h_decode_t', mask=hypothesis_mask) proj_hypo_dropped = get_layer(options['decoder'])[1](tparams, hypothesis_embedding, dec_ctx_dropped, options, prefix='h_decode_t', mask=hypothesis_mask) hypo_ctx = proj_hypo[0][-1] hypo_ctx_dropped = proj_hypo_dropped[0][-1] # dropout hypo_ctx_dropped *= trng.binomial(hypo_ctx_dropped.shape, p=retain_prob, dtype=hypo_ctx_dropped.dtype) hypo_ctx_dropped /= retain_prob # cost (cross entropy) logit = get_layer('ff')[1](tparams, hypo_ctx, options, prefix='ff_logit', activ='tensor.nnet.sigmoid') logit_dropped = get_layer('ff')[1](tparams, hypo_ctx_dropped, options, prefix='ff_logit', activ='tensor.nnet.sigmoid') # flatten logit logit = logit.flatten() logit_dropped = logit_dropped.flatten() cost = binary_crossentropy(logit_dropped, label) cost = tensor.mean(cost) acc = tensor.mean(tensor.eq(tensor.round(logit), label)) return text_embedding, text_mask, hypothesis_embedding, hypothesis_mask, label, cost, acc
def get_sequence_dropout_mask(shape, p, stocdrop=False):
    srng = RandomStreams(seed=np.random.randint(1e6))
    if not stocdrop:
        return srng.binomial(size=shape, p=1.0 - p, dtype=floatX) / (1.0 - p)
    else:
        # FIXME assumes shape of dim (time steps, batch size, hidden size)
        col_mask = srng.binomial(size=(shape[0], shape[1], 1), p=1.0 - p,
                                 dtype=floatX)
        mask = T.tile(col_mask, (1, 1, shape[2]))
        return mask
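# Hypothetical usage sketch: assumes the helper above and its module globals
# (floatX = theano.config.floatX, np, T, RandomStreams) are in scope.  The
# whole (time, batch, hidden) mask is sampled once, so the same units stay
# dropped at every timestep when the mask is indexed as mask[t] inside scan.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')    # (time steps, batch size, hidden size)
mask = get_sequence_dropout_mask(x.shape, p=0.5)
f = theano.function([x], x * mask)
print(f(np.ones((3, 2, 4), dtype=theano.config.floatX)))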
def rbm_ais_gibbs_for_v(rbmA_params, rbmB_params, beta, v_sample, seed=23098):
    """
    Parameters
    ----------
    rbmA_params: list
        Parameters of the baserate model (usually infinite temperature).
        List should be of length 3 and contain numpy.ndarrays corresponding
        to model parameters (weights, visbias, hidbias).
    rbmB_params: list
        Similar to rbmA_params, but for model at temperature 1.
    beta: theano.shared
        Scalar, represents inverse temperature at which we wish to sample from.
    v_sample: theano.shared
        Matrix of shape (n_runs, nvis), state of current particles.
    seed: int
        Optional seed parameter for sampling from binomial units.
    """
    (weights_a, visbias_a, hidbias_a) = rbmA_params
    (weights_b, visbias_b, hidbias_b) = rbmB_params

    theano_rng = RandomStreams(seed)

    # equation 15 (Salakhutdinov & Murray 2008)
    ph_a = nnet.sigmoid((1 - beta) *
                        (tensor.dot(v_sample, weights_a) + hidbias_a))
    ha_sample = theano_rng.binomial(size=(v_sample.shape[0], len(hidbias_a)),
                                    n=1, p=ph_a, dtype=config.floatX)

    # equation 16 (Salakhutdinov & Murray 2008)
    ph_b = nnet.sigmoid(beta * (tensor.dot(v_sample, weights_b) + hidbias_b))
    hb_sample = theano_rng.binomial(size=(v_sample.shape[0], len(hidbias_b)),
                                    n=1, p=ph_b, dtype=config.floatX)

    # equation 17 (Salakhutdinov & Murray 2008)
    pv_act = (1 - beta) * (tensor.dot(ha_sample, weights_a.T) + visbias_a) + \
        beta * (tensor.dot(hb_sample, weights_b.T) + visbias_b)
    pv = nnet.sigmoid(pv_act)
    new_v_sample = theano_rng.binomial(size=(v_sample.shape[0], len(visbias_b)),
                                       n=1, p=pv, dtype=config.floatX)

    return new_v_sample
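# Rough usage sketch (not from the original source): wire the Gibbs step above
# into a compiled update on the shared particle state.  Assumes the function
# above and its theano imports (tensor, nnet, config, RandomStreams) are in
# scope; the tiny RBM sizes and the beta schedule here are made up.
import numpy as np
import theano
from theano import config, shared

nvis, nhid, n_runs = 6, 4, 10
rng = np.random.RandomState(0)
rbmA = [rng.randn(nvis, nhid).astype(config.floatX) * 0.01,
        np.zeros(nvis, dtype=config.floatX),
        np.zeros(nhid, dtype=config.floatX)]
rbmB = [rng.randn(nvis, nhid).astype(config.floatX) * 0.01,
        np.zeros(nvis, dtype=config.floatX),
        np.zeros(nhid, dtype=config.floatX)]

beta = shared(np.asarray(0.0, dtype=config.floatX), name='beta')
v_sample = shared(rng.binomial(1, 0.5, (n_runs, nvis)).astype(config.floatX))

new_v = rbm_ais_gibbs_for_v(rbmA, rbmB, beta, v_sample)
gibbs_step = theano.function([], [], updates=[(v_sample, new_v)])

for b in np.linspace(0.0, 1.0, 5):   # crude inverse-temperature schedule
    beta.set_value(np.asarray(b, dtype=config.floatX))
    gibbs_step()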
def compute_output(self, network, in_vw):
    p = network.find_hyperparameter(["dropout_probability",
                                     "probability",
                                     "p"],
                                    0)
    if p == 0:
        network.copy_variable(
            name="default",
            previous_variable=in_vw,
            tags={"output"},
        )
    else:
        rescale_factor = 1 / (1 - p)
        mask_shape = in_vw.shape
        if any(s is None for s in mask_shape):
            # NOTE: this uses symbolic shape - can be an issue with
            # theano.clone and random numbers
            # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
            warnings.warn("using symbolic shape for dropout mask, "
                          "which can be an issue with theano.clone")
            mask_shape = in_vw.variable.shape
        # TODO save this state so that we can seed the rng
        srng = MRG_RandomStreams()
        # p is the probability of dropping a unit, so each unit is kept with
        # probability 1 - p and survivors are rescaled by 1 / (1 - p)
        mask = rescale_factor * srng.binomial(mask_shape,
                                              p=1 - p,
                                              dtype=floatX)
        network.create_variable(
            "default",
            variable=in_vw.variable * mask,
            shape=in_vw.shape,
            tags={"output"},
        )
class SemMemModule(MergeLayer):
    # Semantic Memory Module (= Word Embedding Layer)
    # The Lasagne library provides MergeLayer, a basic layer class accepting
    # multiple inputs.  The Semantic Memory Module and its parameters are
    # shared with the Input Module and the Question Module, so it might not act
    # as an ordinary feed-forward layer and needs extra code to be trained.

    def __init__(self, incomings, voc_size, hid_state_size, W=Normal(), **kwargs):
        # Initialize parameters and create theano variables
        super(SemMemModule, self).__init__(incomings, **kwargs)
        self.hid_state_size = hid_state_size
        self.W = self.add_param(W, (voc_size, hid_state_size),
                                name='Word_Embedding', regularizable=False)
        self.rand_stream = RandomStreams(np.random.randint(1, 2147462579))

    def get_output_shape_for(self, input_shapes):
        # Define output shape for certain input shapes (helps debugging)
        return (None, None, self.hid_state_size)

    def get_output_for(self, inputs, **kwargs):
        # Core part that actually describes how the theano variables work to
        # produce output.
        # input is in shape of (batch, sentence, word)
        # word_dropout is the variable that determines the proportion of words
        # to be masked to 0-vectors
        input = inputs[0]
        word_dropout = inputs[1]

        # Apply the input tensor to the word embedding matrix and word_dropout,
        # then flatten it to shape (batch*sentence, word, hid_state) so it can
        # be fed into the GRU library.
        embedded = T.reshape(self.W[input],
                             (-1, input.shape[2], self.hid_state_size))
        drop_mask = self.rand_stream.binomial(
            (input.shape[0] * input.shape[1], input.shape[2]),
            p=1 - word_dropout,
            dtype=theano.config.floatX).dimshuffle((0, 1, 'x'))
        return embedded * drop_mask
class SimpleModel(Model):

    def __init__(self, nvis, num_hid, num_class):
        self.__dict__.update(locals())
        del self.self
        self.input_space = VectorSpace(nvis)
        self.output_space = VectorSpace(num_class)
        self.theano_rng = MRG_RandomStreams(2012 + 10 + 16)
        rng = np.random.RandomState([16, 10, 2012])
        self.W = sharedX(rng.uniform(-.05, .05, (nvis, num_hid)))
        self.hb = sharedX(np.zeros((num_hid,)) - 1.)
        self.V = sharedX(rng.uniform(-.05, .05, (num_hid, num_class)))
        self.cb = sharedX(np.zeros((num_class,)))
        self._params = [self.W, self.hb, self.V, self.cb]

    def get_weights(self):
        return self.W.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def emit(self, X):
        Z = T.dot(X, self.W) + self.hb
        exp_H = T.nnet.sigmoid(Z)
        H = self.theano_rng.binomial(p=exp_H, n=1, size=exp_H.shape,
                                     dtype=exp_H.dtype)
        Zc = T.dot(H, self.V) + self.cb
        return exp_H, H, Zc
class BitFlip:

    def __init__(self, nvis, prob):
        """ A conditional distribution that flips bits """
        self.__dict__.update(locals())
        del self.self
        self.s_rng = RandomStreams(17)

    def random_design_matrix(self, X):
        flip = self.s_rng.binomial(size=X.shape, n=1, p=self.prob,
                                   dtype=config.floatX)
        return X * (1 - flip) + (1 - X) * flip

    def is_symmetric(self):
        """ A property of conditional distributions P(Y|X).
        Return true if P(y|x) = P(x|y) for all x,y
        """
        return True
def dropout(X, p=0.):
    srng = RandomStreams()
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X
class SimpleBernoulliSampleLayer(lasagne.layers.Layer):
    """
    Simple sampling layer drawing samples from bernoulli distributions.

    Parameters
    ----------
    mean : :class:`Layer` instance
        Parameterizing the mean value of each bernoulli distribution
    seed : int
        seed to random stream

    Methods
    ----------
    seed : Helper function to change the random seed after init is called
    """

    def __init__(self, mean,
                 seed=lasagne.random.get_rng().randint(1, 2147462579),
                 **kwargs):
        super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs)
        self._srng = RandomStreams(seed)

    def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)):
        self._srng.seed(seed)

    def get_output_shape_for(self, input_shape):
        return input_shape

    def get_output_for(self, mu, **kwargs):
        return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype)
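# Possible usage sketch (layer sizes are hypothetical): sample binary codes
# from a sigmoid layer.  Assumes the layer class above plus lasagne/theano
# imports.
import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 8), input_var=x)
l_p = lasagne.layers.DenseLayer(l_in, num_units=4,
                                nonlinearity=lasagne.nonlinearities.sigmoid)
l_sample = SimpleBernoulliSampleLayer(l_p)

sample_fn = theano.function([x], lasagne.layers.get_output(l_sample))
print(sample_fn(np.random.rand(3, 8).astype(theano.config.floatX)))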
class AdditiveMaskedDiagonalMND:

    def __init__(self, init_beta, nvis, prob):
        """ A conditional distribution that adds gaussian noise with
            diagonal precision matrix beta to another variable that it
            conditions on
        """
        self.__dict__.update(locals())
        del self.self

        self.beta = sharedX(np.ones((nvis,)) * init_beta)
        assert self.beta.ndim == 1

        self.s_rng = RandomStreams(17)

    def random_design_matrix(self, X):
        """ X: a theano variable containing a design matrix of
            observations of the random vector to condition on."""
        Z = self.s_rng.normal(size=X.shape,
                              avg=0., std=1. / T.sqrt(self.beta),
                              dtype=config.floatX)
        mask = self.s_rng.binomial(size=X.shape, n=1, p=self.prob,
                                   dtype=config.floatX)
        return X + mask * Z

    def is_symmetric(self):
        """ A property of conditional distributions P(Y|X).
        Return true if P(y|x) = P(x|y) for all x,y
        """
        return True
class FullyConnectedLayer(object):
    """Used to create a fully connected layer of neurons."""

    def __init__(self, n_inputs, n_outputs, dropout_rate=0.0,
                 activation_fn=sigmoid):
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.dropout_rate = dropout_rate
        self.activation_fn = activation_fn
        self.is_convolutional = False
        # Initializing weights and biases to samples from normal Gaussian
        self.w = shared(rng.normal(0, 1.0 / (self.n_outputs * (1 - self.dropout_rate)),
                                   (self.n_outputs, self.n_inputs)).astype(config.floatX),
                        borrow=True)
        self.b = shared(rng.normal(0, 1.0, (self.n_outputs,)).astype(config.floatX),
                        borrow=True)
        self.params = [self.w, self.b]
        self.nrg = RandomStreams()

    def set_inpt(self, inpt, training):
        self.inpt = T.flatten(inpt, 2)
        if training:
            bern = self.nrg.binomial(size=T.shape(self.inpt),
                                     p=1.0 - self.dropout_rate,
                                     ndim=2).astype(config.floatX)
            # mask the flattened input that is used to compute the output below
            self.inpt = self.inpt * bern
        else:
            self.inpt = self.inpt * (1 - self.dropout_rate)
        self.output = T.dot(self.w, self.inpt.T).T + self.b.dimshuffle('x', 0)
        return self.activation_fn(self.output)
class Dropout(MaskedLayer):
    '''
        Hinton's dropout.
    '''
    def __init__(self, p):
        super(Dropout, self).__init__()
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train=False):
        X = self.get_input(train)
        if self.p > 0.:
            retain_prob = 1. - self.p
            if train:
                X *= self.srng.binomial(X.shape, p=retain_prob,
                                        dtype=theano.config.floatX)
            else:
                X *= retain_prob
        return X

    def calc_output_dims(self, lastdims):
        return lastdims

    def get_config(self):
        return {"name": self.__class__.__name__,
                "p": self.p}
class Dropout(object):
    def __init__(self, input, p, is_train_stage):
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))
        # p is the drop probability: keep each unit with probability 1 - p at
        # training time and rescale by 1 / (1 - p) (inverted dropout); at test
        # time pass the input through unchanged
        self.output = T.switch(
            is_train_stage,
            input * self.srng.binomial(input.shape,
                                       p=1 - self.p,
                                       dtype=theano.config.floatX) / (1 - self.p),
            input)
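# Small usage sketch (variable names made up): drive the train/test switch with
# a shared flag so the same compiled graph can run in both modes.  Assumes the
# Dropout class above and its np/T/RandomStreams imports are in scope.
import numpy as np
import theano
import theano.tensor as T

is_train = theano.shared(np.float32(1.0), name='is_train')
x = T.matrix('x')
drop = Dropout(x, p=0.5, is_train_stage=is_train)
f = theano.function([x], drop.output)

data = np.ones((2, 6), dtype=theano.config.floatX)
print(f(data))                        # training mode: survivors scaled by 2
is_train.set_value(np.float32(0.0))
print(f(data))                        # test mode: input passed through unchanged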
class SampleBernoulli(Layer):
    """
    Layer which samples a Bernoulli distribution whose statistics
    (mean, 'p') are given as inputs to the layer.

    :param mode: 'maximum_likelihood' for maximum likelihood sample,
        'random' for random sample,
        'mean_field' for mean-field approximation.
    """
    def __init__(self, mode='maximum_likelihood'):
        super(SampleBernoulli, self).__init__()
        self.mode = mode
        if self.mode == 'random':
            self.srng = RandomStreams(seed=np.random.randint(10e6))

    def get_output(self, train=False):
        p = self.get_input(train)
        if self.mode == 'maximum_likelihood':
            # draw maximum likelihood sample from Bernoulli distribution
            #    x* = argmax_x p(x) = 1 if p(x=1) >= 0.5, 0 otherwise
            return T.round(p, mode='half_away_from_zero')
        elif self.mode == 'random':
            # draw random sample from Bernoulli distribution
            #    x* = x ~ p(x) = 1 if p(x=1) > uniform(0, 1), 0 otherwise
            return self.srng.binomial(size=p.shape, n=1, p=p,
                                      dtype=theano.config.floatX)
        elif self.mode == 'mean_field':
            # draw mean-field approximation sample from Bernoulli distribution
            #    x* = E[p(x)] = E[Bern(x; p)] = p
            return p
        else:
            raise NotImplementedError('Unknown sample mode!')
class DropOp(Layer):
    """
    This layer randomly drops elements of the input by multiplying with a
    mask sampled from a binomial distribution
    """
    def __init__(self, rng=None, name=None, dropout=1.):
        super(DropOp, self).__init__(0, 0, None, name)
        self.dropout = dropout
        if dropout < 1.:
            self.trng = RandomStreams(rng.randint(1e5))

    def fprop(self, state_below, use_noise=True):
        print 'dropop use noise:', use_noise
        self.out = state_below
        if self.dropout < 1.:
            if use_noise:
                print 'training use noise'
                self.out = self.out * self.trng.binomial(self.out.shape,
                                                         n=1,
                                                         p=self.dropout,
                                                         dtype=self.out.dtype)
            else:
                print 'decoding not use noise'
                self.out = self.out * self.dropout
        return self.out
def compute_output(self, network, in_vw):
    deterministic = network.find_hyperparameter(["deterministic"])
    p = network.find_hyperparameter(["dropout_probability",
                                     "probability",
                                     "p"],
                                    0)
    if deterministic or p == 0:
        network.copy_vw(name="default", previous_vw=in_vw, tags={"output"})
    else:
        rescale_factor = 1 / (1 - p)
        mask_shape = in_vw.shape
        if any(s is None for s in mask_shape):
            # NOTE: this uses symbolic shape - can be an issue with
            # theano.clone and random numbers
            # https://groups.google.com/forum/#!topic/theano-users/P7Mv7Fg0kUs
            warnings.warn("using symbolic shape for dropout mask, "
                          "which can be an issue with theano.clone")
            mask_shape = in_vw.symbolic_shape()
        # FIXME generalize to other shape dimensions.
        # assume this is of the form bc01 (batch, channel, width, height)
        mask_shape = mask_shape[:2]
        # TODO save this state so that we can seed the rng
        srng = MRG_RandomStreams()
        # set bernoulli probability to be inverse of dropout probability
        # because 1 means to keep the unit
        bernoulli_prob = 1 - p
        mask = rescale_factor * srng.binomial(mask_shape,
                                              p=bernoulli_prob,
                                              dtype=fX)
        mask = mask.dimshuffle(0, 1, "x", "x")
        network.create_vw("default",
                          variable=in_vw.variable * mask,
                          shape=in_vw.shape,
                          tags={"output"})
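# Standalone sketch of the same idea outside the framework (shapes, seed and
# names are illustrative, not from the original code): sample one Bernoulli
# value per (batch, channel) and broadcast it over the spatial axes, so whole
# feature maps are dropped.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

p = 0.5
x = T.tensor4('x')                               # bc01: (batch, channel, h, w)
srng = MRG_RandomStreams(seed=42)
mask = srng.binomial((x.shape[0], x.shape[1]), p=1 - p,
                     dtype=theano.config.floatX)
mask = mask.dimshuffle(0, 1, 'x', 'x') / (1 - p)
f = theano.function([x], x * mask)

out = f(np.ones((2, 3, 4, 4), dtype=theano.config.floatX))
# each (sample, channel) slice is either all zeros or all 2.0
print(out[:, :, 0, 0])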
def output(self, x, a):
    p = self.p
    srng = RandomStreams()
    if p > 0:
        retain_prob = 1 - p
        x *= srng.binomial(x.shape, p=retain_prob,
                           dtype=theano.config.floatX)
        x /= retain_prob
    return x
class Dropout:

    def __init__(self, inp, p):
        # NOTE need to set p to 0 during testing
        self.srng = RandomStreams(seed=np.random.randint(1e6))
        self.p = p
        self.inp = inp
        self.out = self.inp * self.srng.binomial(size=self.inp.shape,
                                                 p=1.0 - self.p,
                                                 dtype=floatX) / (1.0 - self.p)
def dropout(rng, x, p=0.5):
    """ Zero-out random values in x with probability p using rng """
    if p > 0. and p < 1.:
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=1. - p, size=x.shape,
                             dtype=theano.config.floatX)
        return x * mask
    return x
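# Quick usage sketch (assumes the dropout() above and its theano/RandomStreams
# imports): since this variant does not rescale the survivors, the common
# pattern is to use the masked expression for training and the expectation
# x * (1 - p) at inference time.
import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(1234)
x = T.matrix('x')
p = 0.5

train_out = dropout(rng, x, p)     # masked activations for training
test_out = x * (1. - p)            # expected value for inference

f_train = theano.function([x], train_out)
f_test = theano.function([x], test_out)
data = np.ones((2, 5), dtype=theano.config.floatX)
print(f_train(data))
print(f_test(data))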
def __init__(self, x, p=0.5):
    use_noise = theano.shared(numpy_floatX(0.))
    trng = RandomStreams(415)
    self.output = T.switch(use_noise,
                           x * trng.binomial(x.shape, p=p, n=1, dtype=x.dtype),
                           x * p)
class WordDropoutLayer(Layer): """Dropout layer Sets values to zero with probability p. See notes for disabling dropout during testing. Parameters ---------- incoming : a :class:`Layer` instance or a tuple the layer feeding into this layer, or the expected input shape p : float or scalar tensor The probability of setting a value to zero rescale : bool If true the input is rescaled with input / (1-p) when deterministic is False. Notes ----- The dropout layer is a regularizer that randomly sets input values to zero; see [1]_, [2]_ for why this might improve generalization. During training you should set deterministic to false and during testing you should set deterministic to true. If rescale is true the input is scaled with input / (1-p) when deterministic is false, see references for further discussion. Note that this implementation scales the input at training time. References ---------- .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I., Salakhutdinov, R. R. (2012): Improving neural networks by preventing co-adaptation of feature detectors. arXiv preprint arXiv:1207.0580. .. [2] Srivastava Nitish, Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. R. (2014): Dropout: A Simple Way to Prevent Neural Networks from Overfitting. Journal of Machine Learning Research, 5(Jun)(2), 1929-1958. """ def __init__(self, incoming, p=0.5, **kwargs): super(WordDropoutLayer, self).__init__(incoming, **kwargs) self._srng = RandomStreams(get_rng().randint(1, 2147462579)) self.p = p def get_output_for(self, input, deterministic=False, **kwargs): """ Parameters ---------- input : tensor output from the previous layer deterministic : bool If true dropout and scaling is disabled, see notes """ if deterministic or self.p == 0: return input else: retain_prob = 1 - self.p # use nonsymbolic shape for dropout mask if possible input_shape = self.input_shape if any(s is None for s in input_shape): input_shape = input.shape return input * self._srng.binomial(input_shape,p=retain_prob,dtype='int32')
def dropout(x, level, seed=None):
    if level < 0. or level >= 1:
        raise Exception('Dropout level must be in interval [0, 1[.')
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    retain_prob = 1. - level
    x *= rng.binomial(x.shape, p=retain_prob, dtype=x.dtype)
    x /= retain_prob
    return x
def dropout_layer(state_before, use_noise, trng=None):
    if trng is None:
        trng = RandomStreams(1234)
    proj = tensor.switch(
        use_noise,
        state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                     dtype=state_before.dtype) * 2,
        state_before)
    return proj
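# Usage sketch in the same style (assumes the helper above and these imports):
# use_noise is a shared scalar toggled between training and decoding.
import numpy
import theano
from theano import tensor
from theano.tensor.shared_randomstreams import RandomStreams

use_noise = theano.shared(numpy.asarray(1., dtype=theano.config.floatX))
x = tensor.matrix('x')
y = dropout_layer(x, use_noise, trng=RandomStreams(1234))
f = theano.function([x], y)

data = numpy.ones((2, 4), dtype=theano.config.floatX)
# training: roughly half the units dropped, survivors doubled
print(f(data))
# validation/decoding: identity
use_noise.set_value(numpy.asarray(0., dtype=theano.config.floatX))
print(f(data))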
class Dropout:
    # NOTE p here is the probability that we drop (zero out) a unit

    def __init__(self, inp, p):
        # NOTE need to set p to 0 during testing
        self.srng = RandomStreams(seed=np.random.randint(1e6))
        self.p = p
        self.inp = inp
        self.out = self.inp * self.srng.binomial(size=self.inp.shape,
                                                 p=1.0 - self.p,
                                                 dtype=floatX) / (1.0 - self.p)
def get_cost(self, X, Y, **kwargs): # Dream theano_rng = MRG_RandomStreams(2012 + 12 + 18) exp_y = T.nnet.softmax(T.alloc(0., self.batch_size, self.n_classes) + self.gyb) dy = theano_rng.multinomial(pvals = exp_y, dtype='float32') dy = block_gradient(dy) exp_h2 = T.nnet.sigmoid(T.dot(dy, self.gh2w) + self.gh2b) dh2 = theano_rng.binomial(p = exp_h2, size = exp_h2.shape, dtype='float32') dh2 = block_gradient(dh2) exp_h1 = T.nnet.sigmoid(T.dot(dh2, self.gh1w) + self.gh1b) dh1 = theano_rng.binomial(p = exp_h1, size = exp_h1.shape, dtype='float32') dh1 = block_gradient(dh1) exp_v = T.nnet.sigmoid(T.dot(dh1, self.gvw) + self.gvb) dv = theano_rng.binomial(p = exp_v, size = exp_v.shape, dtype='float32') dv = block_gradient(dv) # Explanation of dream zh1, rh1 = self.infer_h1(dv) zh2 = T.dot(rh1, self.rh2w) + self.rh2b rh2 = T.nnet.sigmoid(zh2) zy = T.dot(rh2, self.ryw) + self.ryb # Probability of dream dream_prob = sigmoid_prob(zh1, dh1) + sigmoid_prob(zh2, dh2) + softmax_prob(zy, dy) # Explanation of reality zh1, rh1 = self.infer_h1(X) rh1 = block_gradient(rh1) zh2 = T.dot(rh1, self.rh2w) + self.rh2b rh2 = theano_rng.binomial(p = T.nnet.sigmoid(zh2), size = zh2.shape, dtype='float32') rh2 = block_gradient(rh2) # Probability of reality real_prob = softmax_prob(T.alloc(0., self.batch_size, self.n_classes) + self.gyb, Y) + \ sigmoid_prob(T.dot(Y, self.gh2w) + self.gh2b, rh2) + \ sigmoid_prob(T.dot(rh2, self.gh1w) + self.gh1b, rh1) + \ sigmoid_prob(T.dot(rh1, self.gvw) + self.gvb, X) return - dream_prob - real_prob + .0001 * ( T.sqr(self.gvw).sum() + T.sqr(self.gh1w).sum() + \ T.sqr(self.gh2w).sum() )
class DropoutLayer:

    def __init__(self, p):
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e8))

    def apply(self, x, training_time):
        if training_time:
            return x * self.srng.binomial(x.shape, p=1 - self.p,
                                          dtype=theano.config.floatX) / (1 - self.p)
        return x
def dropout(X, p=0.):
    """
    Regularize by randomly dropping out units with probability p to help
    prevent overfitting.
    """
    if p > 0:
        retain_prob = 1 - p
        srng = RandomStreams()
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X
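# Sanity-check sketch (not from the original source): with inverted dropout the
# survivors are divided by the retain probability, so the expected value of the
# output matches the input.  Assumes the dropout() above and its imports.
import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
f = theano.function([X], dropout(X, p=0.2))

data = np.ones((1000, 50), dtype=theano.config.floatX)
out = f(data)
print(out.mean())   # should be close to 1.0 (survivors are scaled by 1 / 0.8)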
def __init__(self, num_neurons, id=-1, distribution='binomial', verbose=2, options=None): if verbose >= 3: print("... Creating a " + distribution + "random layer of " + \ "output_shape " + str(num_neurons)) super(random_layer, self).__init__(id=id, type='random', verbose=verbose) rng = numpy.random srng = RandomStreams(rng.randint(1, 2147462468), use_cuda=None) if isinstance(num_neurons, int): num_neurons = (num_neurons, ) if distribution == 'binomial': if not 'p' in options.keys(): if verbose >= 3: print("... Needs input p, by default assuming 0.5") p = 0.5 else: p = options["p"] self.output = srng.binomial(n=1, p=p, size=num_neurons, dtype=theano.config.floatX) elif distribution == 'uniform': if not 'limits' in options.keys(): if verbose >= 3: print("... Needs limits, assuming default (0,1)") limits = (0, 1) else: limits = options['limits'] self.output = srng.uniform(size=num_neurons, low=limits[0], high=limits[1], dtype=theano.config.floatX) elif distribution == 'gaussian' or distribution == 'normal': if not 'mu' in options.keys(): if verbose >= 3: print("... Needs mu, assuming default 0") mu = 0 else: mu = options['mu'] if not 'sigma' in options.keys(): if verbose >= 3: print("... Needs sigma, assuming default 1") sigma = 1 else: sigma = options['sigma'] self.output = srng.normal(size=num_neurons, avg=mu, std=sigma, dtype=theano.config.floatX) self.output_shape = num_neurons self.num_neurons = num_neurons if verbose >= 3: print("... Random layer is created with output shape " + str(self.output_shape))
class VAE: def __init__(self, n_in, n_hidden, n_out, n_hidden_decoder=None, trans_func=rectify, batch_size=100): self.n_in = n_in self.n_hidden = n_hidden self.n_out = n_out self.l_in = InputLayer((batch_size, n_in)) self.batch_size = batch_size self.transf = trans_func self.srng = RandomStreams() l_in_encoder = lasagne.layers.InputLayer(shape=(batch_size, n_in)) l_in_decoder = lasagne.layers.InputLayer(shape=(batch_size, n_out)) l_prev_encoder = l_in_encoder l_prev_decoder = l_in_decoder for i in range(len(n_hidden)): l_tmp_encoder = lasagne.layers.DenseLayer(l_prev_encoder, num_units=n_hidden[i], W=lasagne.init.Uniform(), nonlinearity=self.transf) l_prev_encoder = l_tmp_encoder # cause you might want a decoder which is not the mirror of the encoder if n_hidden_decoder is None: n_hidden_decoder = n_hidden self.n_hidden_decoder = n_hidden_decoder for i in range(len(n_hidden_decoder)): l_tmp_decoder = lasagne.layers.DenseLayer( l_prev_decoder, num_units=n_hidden_decoder[-(i + 1)], W=lasagne.init.Uniform(), nonlinearity=self.transf) l_prev_decoder = l_tmp_decoder l_in = lasagne.layers.InputLayer(shape=(batch_size, n_in)) self.model = VAELayer(l_in, encoder=l_prev_encoder, decoder=l_prev_decoder, latent_size=n_out, x_distribution='bernoulli', qz_distribution='gaussian', pz_distribution='gaussian') self.x = T.matrix('x') def build_model(self, train_x, test_x, valid_x, update, update_args): self.train_x = train_x self.test_x = test_x self.validation_x = valid_x self.update = update self.update_args = update_args self.index = T.iscalar('index') self.batch_slice = slice(self.index * self.batch_size, (self.index + 1) * self.batch_size) x = self.srng.binomial(size=self.x.shape, n=1, p=self.x) log_pz, log_qz_given_x, log_px_given_z = self.model.get_log_distributions( self.x) loss_eval = (log_pz + log_px_given_z - log_qz_given_x).sum() loss_eval /= self.batch_size all_params = get_all_params(self.model) updates = self.update(-loss_eval, all_params, *self.update_args) train_model = theano.function( [self.index], loss_eval, updates=updates, givens={ self.x: self.train_x[self.batch_slice], }, ) test_model = theano.function( [self.index], loss_eval, givens={ self.x: self.test_x[self.batch_slice], }, ) validate_model = theano.function( [self.index], loss_eval, givens={ self.x: self.validation_x[self.batch_slice], }, ) return train_model, test_model, validate_model def draw_sample(self, z): return self.model.draw_sample(z) def get_output(self, dat): z, _, _ = self.model.get_z_mu_sigma(dat) return z def get_reconstruction(self, z): return self.model.decoder_output(z)
class VisibleLayer(object): def __init__(self, v_dim, h_dim, v_type, mrng=None, rng=None, name=''): self.name = name if name != '' else 'v_layer' self.v_dim = v_dim self.h_dim = h_dim self.v_type = v_type seed = np.random.randint(1, 2**30) self._rng = RandomStreams(seed) if rng is None else rng self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng self._build_params() def set_total_count(self, total_count): if not (self.v_type == InputType.poisson): raise ValueError( "The input type should be Poisson to set total count") self.total_count = total_count def _build_params(self): # W to connect with hidden layer self.params = [] if self.v_type == InputType.poisson: init_W = np.random.uniform(low=-1 / self.h_dim, high=1 / self.h_dim, size=(self.v_dim, self.h_dim)) self.W = init_weight(self.v_dim, self.h_dim, value=init_W, name=self.name + '-W') else: self.W = init_weight(self.v_dim, self.h_dim, name=self.name + '-W') self.b_v = init_bias(self.v_dim, name=self.name + '-b_v') # Ca binary, gaussian, and categorical self.params.extend([self.W, self.b_v]) # Truong hop gaussian co them sigma if self.v_type == InputType.gaussian: self.sigma_v = T.ones(shape=(self.v_dim, ), dtype=theano.config.floatX) self.sigma_v.name = self.name + "-sigma_v" # Result in a vector of (n, 1) def v_free_term(self, v): if self.v_type == InputType.poisson: return -T.sum(T.gammaln(1 + v), axis=1) else: return 0 # Result in a vector of (n, 1) def v_bias_term(self, v): # Note that for gaussian case, the v_bias should be negative if self.v_type == InputType.gaussian: return -T.sum((v - self.b_v)**2 / (2 * self.sigma_v**2), axis=1) else: return T.dot(v, self.b_v) # Result in a vector of (n, H) def v_weight_term(self, v): if self.v_type == InputType.gaussian: return T.dot((v / (self.sigma_v**2)), self.W) else: return T.dot(v, self.W) # Only support binary, gaussian and categorical def v_given_h(self, h): if self.v_type == InputType.binary: p_v_h = T.nnet.sigmoid(self.b_v + T.dot(h, self.W.T)) return p_v_h elif self.v_type == InputType.gaussian: mu_v = self.b_v + T.dot(h, self.W.T) return mu_v elif self.v_type == InputType.categorical: p_v_h = T.nnet.softmax(self.b_v + T.dot(h, self.W.T)) return p_v_h elif self.v_type == InputType.poisson: if not hasattr(self, 'total_count') or self.total_count is None: raise ValueError( 'Total count should be set for constrained Poisson') unconstrained_lmbd_v = T.exp(self.b_v + T.dot(h, self.W.T)) lmbd_v = unconstrained_lmbd_v * 1.0 / T.sum(unconstrained_lmbd_v, axis=1, keepdims=True) \ * self.total_count return lmbd_v # Only support binary, gaussian and categorical def sample_v_given_h(self, h0_sample): if self.v_type == InputType.binary: v1_mean = self.v_given_h(h0_sample) v1_sample = self._mrng.binomial(size=v1_mean.shape, n=1, p=v1_mean, dtype=theano.config.floatX) return [v1_mean, v1_sample] elif self.v_type == InputType.gaussian: mu_v1 = self.v_given_h(h0_sample) # Note that mu_v1 is returned v1_sample = self._mrng.normal(size=mu_v1.shape, avg=mu_v1, std=self.sigma_v, dtype=theano.config.floatX) return [mu_v1, v1_sample] # Note that there is constraint in the case of Multinomial elif self.v_type == InputType.categorical: prob_v1 = self.v_given_h(h0_sample) v1_sample = self._mrng.multinomial(pvals=prob_v1, n=1, dtype=theano.config.floatX) return [prob_v1, v1_sample] elif self.v_type == InputType.poisson: lmbd_v1 = self.v_given_h(h0_sample) # We have to use RandomStreams, not MRG_RandomStreams v1_sample = self._rng.poisson(size=lmbd_v1.shape, lam=lmbd_v1, dtype=theano.config.floatX) 
return [lmbd_v1, v1_sample] def l1_grad(self, l1): gW = l1_grad(self.W, l1) return [gW, 0] def l2_grad(self, l2): gW = l2_grad(self.W, l2) return [gW, 0] def nll_grad_formula(self, v0, vk, h0, hk): n_instances = v0.shape[0] gW = (T.dot(vk.T, hk) - T.dot(v0.T, h0)) / n_instances if self.v_type == InputType.gaussian: gb_v = T.mean((vk - v0) / (self.sigma_v**2), axis=0) grads = [gW, gb_v] else: gb_v = T.mean(vk - v0, axis=0) grads = [gW, gb_v] return grads def get_viewed_cost(self, v0, vk_stat): # Binary cross-entropy cost = 0 if self.v_type == InputType.binary: # Clip to avoid log(0) clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.float32(0.999999)) cost = -T.sum(v0 * T.log(clip_vk_stat) + (1 - v0) * T.log(1 - clip_vk_stat), axis=1) # Sum square error elif self.v_type == InputType.gaussian: cost = T.sum((v0 - vk_stat)**2, axis=1) # Categorical cross-entropy elif self.v_type == InputType.categorical: clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.float32(0.999999)) cost = -T.sum(v0 * T.log(clip_vk_stat), axis=1) elif self.v_type == InputType.poisson: clip_vk_stat = T.clip(vk_stat, np.float32(0.000001), np.inf) cost = -T.sum( -vk_stat + v0 * T.log(clip_vk_stat) - T.gammaln(1 + v0), axis=1) return cost def get_params(self): return self.params
def __init__(self, rng, layer_id, shape, X, mask, use_noise=1, p=0.5): """ Basic RNN with dropout Parameters ---------- :param rng: can be generated as numpy.random.seed(123) :type layer_id: str :param layer_id: id of this layer :type shape: tuple :param shape: (in_size, out_size) where in_size is the input dimension out_size is the hidden units' dimension :type X: a 3D or 2D variable, mostly a 3D one :param X: model inputs :type mask: theano variable :param mask: model inputs :type use_noise: theano variable :param use_noise: whether dropout is random :type p: float :param p: dropout ratio """ prefix = 'Basic' + layer_id self.in_size, self.hid_size = shape # weights for input self.W = init_weights(shape=(self.in_size, self.hid_size), name=prefix + '#W') # weights for hidden states self.U = init_weights(shape=(self.hid_size, self.hid_size), name=prefix + '#U') # bias self.b = init_bias(size=self.hid_size, name=prefix + '#b') self.X = X self.mask = mask nsteps = X.shape[0] if X.ndim == 3: n_samples = X.shape[1] else: n_samples = 1 assert mask is not None def _slice(x, n, dim): if x.ndim == 3: return x[:, :, n * dim:(n + 1) * dim] return x[:, n * dim:(n + 1) * dim] def _step(x_t, m_t, h_tm1): """ This function computes one step evolution in LSTM Parameters ---------- :type m_t: (n_samples, ) :param m_t: mask :type x_t: (n_samples, in_size) :param x_t: input at time t :type h_tm1: (n_samples, hid_size) :param h_tm1: hidden state at time (t - 1) """ # h_t with size (n_samples, hid_size) preact = T.dot(x_t, self.W) + T.dot(h_tm1, self.U) + self.b h_t = T.tanh(preact) # consider the mask h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_tm1 return h_t h, updates = theano.scan( fn=_step, sequences=[self.X, self.mask], outputs_info=[T.alloc(floatX(0.), n_samples, self.hid_size)]) # h here is of size (t, n_samples, hid_size) if p > 0: trng = RandomStreams(rng.randint(999999)) drop_mask = trng.binomial(size=h.shape, n=1, p=(1 - p), dtype=theano.config.floatX) self.activation = T.switch(T.eq(use_noise, 1), h * drop_mask, h * (1 - p)) else: self.activation = h self.params = [self.W, self.U, self.b]
class MixedRBM(Model): def __init__(self, v_dim=784, h_dim=500, v_types=[], v_indices=[], b_h=None, input_var=None, mrng=None, rng=None, name='', **kwargs): name = 'mixed_rbm' if name == '' else name super(MixedRBM, self).__init__(name, ) self.input = T.matrix('input') self.n_instances = self.input.shape[0] model_file = kwargs.get('model_file') if model_file is not None: self.load(model_file) self._load_params() else: self.v_dim = v_dim self.h_dim = h_dim self.v_types = v_types self.v_indices = v_indices seed = np.random.randint(1, 2**30) self._mrng = MRG_RandomStreams(seed) if mrng is None else mrng self._rng = RandomStreams(seed) if rng is None else rng self._rng = None if hasattr(self.v_indices[0], '__iter__'): self.v_ranges = self.v_indices else: self.v_ranges = [None] * len(self.v_indices) for i in xrange(len(self.v_indices)): self.v_ranges[i] = range(self.v_indices[i], self.v_indices[i + 1]) \ if i < len(self.v_indices) - 1 else range(self.v_indices[i], v_dim) self.v_layers = [] for i in xrange(len(self.v_ranges)): self.v_ranges[i] = np.asarray(self.v_ranges[i], dtype=np.int32) v_layer = VisibleLayer(v_dim=len(self.v_ranges[i]), h_dim=self.h_dim, v_type=self.v_types[i], name='v_layer({})'.format(i), mrng=self._mrng, rng=self._rng) if v_types[i] == InputType.poisson: total_count = T.sum(self.input[:, self.v_ranges[i]], axis=1, keepdims=True) v_layer.set_total_count(total_count) self.v_layers.append(v_layer) self._build_mask() self._build_params() def print_model_info(self): print "\nInfo of model {}".format(self.name) print "v_dims: {} | h_dim: {}".format(self.v_dim, self.h_dim) for i in xrange(len(self.v_types)): print "v_types: {} | v_ranges: {}".format(self.v_types[i], self.v_ranges[i]) def get_save(self): return [ self.name, self.v_dim, self.h_dim, self.v_indices, self.v_types, self._mrng, self._rng, self.big_mask, self.v_ranges, self.v_layers, self.b_h ] def set_load(self, saved_data): [ self.name, self.v_dim, self.h_dim, self.v_indices, self.v_types, self._mrng, self._rng, self.big_mask, self.v_ranges, self.v_layers, self.b_h ] = saved_data def _load_params(self): self.params = [self.b_h] for i in xrange(len(self.v_layers)): self.params.extend(self.v_layers[i].get_params()) def _build_params(self): self.b_h = init_bias(dim=self.h_dim, name=self.name + '-b_h') self.params = [self.b_h] for i in xrange(len(self.v_layers)): self.params.extend(self.v_layers[i].get_params()) def _build_mask(self): big_m = np.zeros((self.v_dim, self.v_dim), dtype=theano.config.floatX) k = 0 for i in xrange(len(self.v_ranges)): for j in xrange(len(self.v_ranges[i])): big_m[k, self.v_ranges[i][j]] = 1 k += 1 # self.big_mask = theano.shared(big_m, name='big_mask') # Sparse mask self.big_mask = sparse.shared(sp.csc_matrix(big_m), name='big_mask') def encode(self, v_data): h_code = self.h_given_v(self.input) fn = theano.function([self.input], h_code) return fn(v_data) def get_weight(self): Ws = [v_layer.W for v_layer in self.v_layers] return sparse.structured_dot(self.big_mask.T, T.concatenate(Ws, axis=0)) def _vs(self, v): # Mac loi ngo ngan o day ma mo mai khong ra # return [v[v_range] for v_range in self.v_ranges] return [v[:, v_range] for v_range in self.v_ranges] def score(self, data): free_fn = theano.function([self.input], self.free_energy(self.input)) return free_fn(data) # Energy from many v an 1 h def energy(self, vs, h): v_free = 0 v_bias = 0 v_weight = 0 for i in xrange(len(vs)): v_free += self.v_layers[i].v_free_term(vs[i]) v_bias += self.v_layers[i].v_bias_term(vs[i]) v_weight += 
self.v_layers[i].v_weight_term(vs[i]) return -(v_free + v_bias + v_weight * h + T.dot(h, self.b_h)) def free_energy(self, v): v_free = 0 v_bias = 0 v_weight = 0 vs = self._vs(v) for i in xrange(len(vs)): v_free += self.v_layers[i].v_free_term(vs[i]) v_bias += self.v_layers[i].v_bias_term(vs[i]) v_weight += self.v_layers[i].v_weight_term(vs[i]) h_term = T.sum(T.log(1 + T.exp(v_weight + self.b_h)), axis=1) return -(v_bias + v_free + h_term) def v_given_h(self, h): vs_stat = [] for i in xrange(len(self.v_layers)): vs_stat.append(self.v_layers[i].v_given_h(h)) return sparse.structured_dot(T.concatenate(vs_stat, axis=1), self.big_mask) def h_given_v(self, v): vs = self._vs(v) v_weight = 0 for i in xrange(len(vs)): v_weight += self.v_layers[i].v_weight_term(vs[i]) p_h_v = T.nnet.sigmoid(v_weight + self.b_h) return p_h_v # vs0_sample is list contain samples of each v_type def sample_h_given_v(self, v0_sample): h1_mean = self.h_given_v(v0_sample) h1_sample = self._mrng.binomial(size=h1_mean.shape, n=1, p=h1_mean, dtype=theano.config.floatX) return [h1_mean, h1_sample] # sample vs1 given h0_sample def sample_v_given_h(self, h0_sample): vs_stat = [] vs_sample = [] for i in xrange(len(self.v_layers)): v1_stat, v1_sample = self.v_layers[i].sample_v_given_h(h0_sample) vs_stat.append(v1_stat) vs_sample.append(v1_sample) v_stat = sparse.structured_dot(T.concatenate(vs_stat, axis=1), self.big_mask) v_sample = sparse.structured_dot(T.concatenate(vs_sample, axis=1), self.big_mask) return [v_stat, v_sample] # One step of gibbs sampling def gibbs_hvh(self, h0_sample): # Here we use v1_stat to show that it is sufficient statistics of v1 [v1_stat, v1_sample] = self.sample_v_given_h(h0_sample) [h1_mean, h1_sample] = self.sample_h_given_v(v1_sample) return [v1_stat, v1_sample, h1_mean, h1_sample] def gibbs_vhv(self, v0_sample): [h1_mean, h1_sample] = self.sample_h_given_v(v0_sample) [v1_stat, v1_sample] = self.sample_v_given_h(h1_sample) return [h1_mean, h1_sample, v1_stat, v1_sample] def run_CD_from_h(self, k, data_h): start_h = T.matrix("start_h") # [v_stats, v_samples, h_means, h_samples], updates \ outputs, updates \ = theano.scan(fn=self.gibbs_hvh, outputs_info=[None, None, None, start_h], n_steps=k, name="gibbs_hvh") CD_fn = theano.function([start_h], outputs=outputs[-1], updates=updates) return CD_fn(data_h) def run_CD_from_v(self, k, data_v): start_v = T.matrix("start_v") # [h_means, h_samples, v_stats, v_samples], updates \ outputs, updates \ = theano.scan(fn=self.gibbs_vhv, outputs_info=[None, None, None, start_v], n_steps=k, name="gibbs_vhv") CD_fn = theano.function([start_v], outputs=outputs[-1], updates=updates) return CD_fn(data_v) # Return visible variables def _gibbs_vhv_to_v_fn(self, steps, persis_v, is_sample=True, name=''): [h_means, h_samples, v_stats, v_samples], updates \ = theano.scan(self.gibbs_vhv, outputs_info=[None, None, None, persis_v], n_steps=steps, # init_gibbs dung de init name='gibbs_vhv') updates.update({persis_v: v_samples[-1]}) if is_sample: gibbs_fn = theano.function([], v_samples[-1], updates=updates, name=name) else: gibbs_fn = theano.function([], v_stats[-1], updates=updates, name=name) return gibbs_fn # Also return visible variables def _gibbs_hvh_to_v_fn(self, steps, persis_h, is_sample=True, name=''): [v_stats, v_samples, h_means, h_samples], updates \ = theano.scan(self.gibbs_hvh, outputs_info=[None, None, None, persis_h], n_steps=steps, # init_gibbs dung de init name='gibbs_vhv') updates.update({persis_h: h_samples[-1]}) if is_sample: gibbs_fn = theano.function([], 
v_samples[-1], updates=updates, name=name) else: gibbs_fn = theano.function([], v_stats[-1], updates=updates, name=name) return gibbs_fn def sample_given_input(self, input_x, init_gibbs=1000, betw_gibbs=100, loops=10, is_sample=False): print "Sample data from input using model {}".format(self.name) # Neu kich thuoc input la 1 thi phai chuyen no ve kich thuoc 2 if len(input_x.shape) == 1: persis_v = theano.shared( np.asarray(input_x.reshape(1, input_x.shape[0]), dtype=theano.config.floatX)) else: persis_v = theano.shared( np.asarray(input_x, dtype=theano.config.floatX)) if init_gibbs > 0: init_sampling_fn = self._gibbs_vhv_to_v_fn(init_gibbs, persis_v, is_sample=True, name='init_sampling_fn') else: init_sampling_fn = None sample_fn = self._gibbs_vhv_to_v_fn(betw_gibbs, persis_v, is_sample=is_sample, name='sample_fn') rvs_data = [] if init_sampling_fn is not None: init_sampling_fn() for idx in range(loops): print "Running sampling loop %d" % idx rv_data = sample_fn() rvs_data.append(rv_data) return np.asarray(rvs_data) # Sample randomly # We start from h and run gibbs chain until it reaches equilibrium def sample(self, init_gibbs=1000, betw_gibbs=100, n_samples=20, loops=10, is_sample=False): print "Sample random data using model {}".format(self.name) persis_h = theano.shared( np.zeros((n_samples, self.h_dim), dtype=theano.config.floatX)) if init_gibbs > 0: init_sampling_fn = self._gibbs_hvh_to_v_fn(init_gibbs, persis_h, is_sample=True, name='init_sampling_fn') else: init_sampling_fn = None sample_fn = self._gibbs_hvh_to_v_fn(betw_gibbs, persis_h, is_sample=is_sample, name='sample_fn') rvs_data = [] if init_sampling_fn is not None: init_sampling_fn() for idx in range(loops): print "Running sampling loop %d" % idx rv_data = sample_fn() rvs_data.append(rv_data) return np.asarray(rvs_data) def get_cost_udpates(self, lr, k, persis_h, l1, l2, stable_update, store_grad): # Run one sample step to get h h_mean, h_sample = self.sample_h_given_v(self.input) # Run normal CD start_h = persis_h if persis_h is not None else h_sample [v_stats, v_samples, h_means, h_samples], updates \ = theano.scan(fn=self.gibbs_hvh, outputs_info=[None, None, None, start_h], n_steps=k, name="gibbs_hvh") vk = v_samples[-1] v_stat_k = v_stats[-1] if persis_h is not None: updates[persis_h] = h_samples[-1] cost = self.get_viewed_cost(self.input, v_stat_k) cost = T.mean(cost) # For stable update, use mean value instead of random sampled value if stable_update: print "\nStable update is set to be True" updates = self.params_updates(self.input, v_stat_k, lr, l1, l2, updates, store_grad) else: print "\nStable update is set to be False" updates = self.params_updates(self.input, vk, lr, l1, l2, updates, store_grad) # return cost, updates return cost, updates def get_viewed_cost(self, v0, v_stat): cost = 0 vs0 = self._vs(v0) vs_stat = self._vs(v_stat) for i in xrange(len(self.v_layers)): type_cost = self.v_layers[i].get_viewed_cost(vs0[i], vs_stat[i]) cost += type_cost return cost def nll_grad_formula(self, v0, vk): h0 = self.h_given_v(v0) hk = self.h_given_v(vk) gb_h = T.mean(hk - h0, axis=0) grads = [gb_h] vs0 = self._vs(v0) vsk = self._vs(vk) for i in xrange(len(self.v_layers)): grads.extend(self.v_layers[i].nll_grad_formula( vs0[i], vsk[i], h0, hk)) return grads def l1_grad(self, l1): grads = [0] for i in xrange(len(self.v_layers)): grads.extend(self.v_layers[i].l1_grad(l1)) return grads def l2_grad(self, l2): grads = [0] for i in xrange(len(self.v_layers)): grads.extend(self.v_layers[i].l2_grad(l2)) return grads def 
params_updates(self, v0, vk, lr, l1, l2, updates, store_grad): if updates is None: updates = OrderedDict() if store_grad: self.stored_grads = OrderedDict() grads = [0 for _ in xrange(len(self.params))] o_grads = self.nll_grad_formula(v0, vk) grads = [grads[i] + o_grads[i] for i in xrange(len(self.params))] if store_grad: print "\nGradients over negative log-likelihood are stored in original_grads" o_shared_grads, updates = store_grads_in_update( self.params, o_grads, updates) self.stored_grads['original_grads'] = o_shared_grads if l1 is not None: print "Add L1 regularization ({}) to parameter updates".format(l1) l1_grads = self.l1_grad(l1) grads = [grads[i] + l1_grads[i] for i in xrange(len(self.params))] if store_grad: print "\nGradients over L1 regularization are stored in l1_grads" l1_shared_grads, updates = store_grads_in_update( self.params, l1_grads, updates) self.stored_grads['l1_grads'] = l1_shared_grads if l2 is not None: print "Add L2 regularization ({}) to parameter updates".format(l2) l2_grads = self.l2_grad(l2) grads = [grads[i] + l2_grads[i] for i in xrange(len(self.params))] if store_grad: print "\nGradients over L2 regularization are stored in l2_grads" l2_shared_grads, updates = store_grads_in_update( self.params, l2_grads, updates) self.stored_grads['l2_grads'] = l2_shared_grads if store_grad: print "\nGradients over total cost are stored in total_grads" t_shared_grads, updates = store_grads_in_update( self.params, grads, updates) self.stored_grads['total_grads'] = t_shared_grads grads = [grad.astype(theano.config.floatX) for grad in grads] if self.check_learning_algor(): params_updates = self.learning_algor(grads, self.params, lr, **self.learning_config) updates.update(params_updates) else: print "\nSimple SGD is used as training algorithm" for grad, param in zip(grads, self.params): updates[param] = param - grad * lr return updates def config_train(self, **kwargs): k = kwargs.get('CD_k') persis_h_data = kwargs.get('persis_h') l1 = kwargs.get('L1') l2 = kwargs.get('L2') if l1 is None: print "L1 should be set to enable sparse weight regularization" if l2 is None: print "L2 should be set to enable sparse weight regularization" stable_update = kwargs.get('stable_update') if stable_update is None: stable_update = False store_grad = kwargs.get('store_grad') if store_grad is None: store_grad = False self._build_train(k, persis_h_data, l1, l2, stable_update, store_grad) # persis_v_data is a numpy array def _build_train(self, k, persis_h_data, l1, l2, stable_update, store_grad): persis_h = theano.shared(persis_h_data, borrow=True) \ if persis_h_data is not None else None lr = T.scalar('lr') cost, updates = self.get_cost_udpates(lr, k, persis_h, l1, l2, stable_update, store_grad) print "\nBuild computation graph for training function of model {}".format( self.name) self.train_fn = theano.function([self.input, lr], cost, updates=updates) rv = self.v_given_h(self.h_given_v(self.input)) test_cost = self.get_viewed_cost(self.input, rv) test_cost = T.mean(test_cost) print "Build computation graph for validation function of model {}".format( self.name) self.valid_fn = theano.function([self.input], test_cost)
rng = np.random.RandomState(123)
theano_rng = RandomStreams(rng.randint(2 ** 30))

corruption_level = 0.1
training_epochs = 25
learning_rate = 0.1
batch_size = 128

W1 = init_weights(28 * 28, 900)
b1 = init_bias(900)
b1_prime = init_bias(28 * 28)
W1_prime = W1.transpose()
W2 = init_weights(900, 10)
b2 = init_bias(10)

# corrupt the input by zeroing out a random 10% of the pixels
tilde_x = theano_rng.binomial(size=x.shape, n=1, p=1 - corruption_level,
                              dtype=theano.config.floatX) * x
y1 = T.nnet.sigmoid(T.dot(tilde_x, W1) + b1)
z1 = T.nnet.sigmoid(T.dot(y1, W1_prime) + b1_prime)
cost1 = -T.mean(T.sum(x * T.log(z1) + (1 - x) * T.log(1 - z1), axis=1))

params1 = [W1, b1, b1_prime]
grads1 = T.grad(cost1, params1)
updates1 = [(param1, param1 - learning_rate * grad1)
            for param1, grad1 in zip(params1, grads1)]
train_da1 = theano.function(inputs=[x], outputs=cost1, updates=updates1,
                            allow_input_downcast=True)

p_y2 = T.nnet.softmax(T.dot(y1, W2) + b2)
y2 = T.argmax(p_y2, axis=1)
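# Possible driver loop for the denoising-autoencoder pretraining above.
# Hypothetical names: assumes x = T.matrix('x') was declared before the graph
# above and trX is an (N, 784) numpy array of pixel values in [0, 1].
import numpy as np

n_train = trX.shape[0]
for epoch in range(training_epochs):
    costs = []
    for start in range(0, n_train, batch_size):
        batch = trX[start:start + batch_size]
        costs.append(train_da1(batch))
    print(epoch, np.mean(costs))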
class MultiLayer(Layer): """ Implementing a standard feed forward MLP """ def __init__(self, rng, n_in, n_hids=[500, 500], activation='TT.tanh', scale=0.01, sparsity=-1, rank_n_approx=0, rank_n_activ='lambda x: x', weight_noise=False, dropout=1., init_fn='sample_weights_classic', bias_fn='init_bias', bias_scale=0., learn_bias=True, grad_scale=1., name=None): """ :type rng: numpy random generator :param rng: numpy random generator :type n_in: int :param n_in: number of inputs units :type n_hids: list of ints :param n_hids: Number of hidden units on each layer of the MLP :type activation: string/function or list of :param activation: Activation function for the embedding layers. If a list it needs to have a value for each layer. If not, the same activation will be applied to all layers :type scale: float or list of :param scale: depending on the initialization function, it can be the standard deviation of the Gaussian from which the weights are sampled or the largest singular value. If a single value it will be used for each layer, otherwise it has to have one value for each layer :type sparsity: int or list of :param sparsity: if a single value, it will be used for each layer, otherwise it has to be a list with as many values as layers. If negative, it means the weight matrix is dense. Otherwise it means this many randomly selected input units are connected to an output unit :type rank_n_approx: int :param rank_n_approx: It applies to the first layer only. If positive and larger than 0, the first weight matrix is factorized into two matrices. The first one goes from input to `rank_n_approx` hidden units, the second from `rank_n_approx` to the number of units on the second layer :type rank_n_activ: string or function :param rank_n_activ: Function that is applied on on the intermediary layer formed from factorizing the first weight matrix (Q: do we need this?) :type weight_noise: bool :param weight_noise: If true, the model is used with weight noise (and the right shared variable are constructed, to keep track of the noise) :type dropout: float :param dropout: the probability with which hidden units are dropped from the hidden layer. If set to 1, dropout is not used :type init_fn: string or function :param init_fn: function used to initialize the weights of the layer. We recommend using either `sample_weights_classic` or `sample_weights` defined in the utils :type bias_fn: string or function :param bias_fn: function used to initialize the biases. We recommend using `init_bias` defined in the utils :type bias_scale: float :param bias_scale: argument passed to `bias_fn`, depicting the scale of the initial bias :type learn_bias: bool :param learn_bias: flag, saying if we should learn the bias or keep it constant :type grad_scale: float or theano scalar :param grad_scale: factor with which the gradients with respect to the parameters of this layer are scaled. It is used for differentiating between the different parameters of a model. :type name: string :param name: name of the layer (used to name parameters). NB: in this library names are very important because certain parts of the code relies on name to disambiguate between variables, therefore each layer should have a unique name. 
""" assert rank_n_approx >= 0, "Please enter a valid rank_n_approx" self.rank_n_approx = rank_n_approx if isinstance(rank_n_activ, (str, unicode)): rank_n_activ = eval(rank_n_activ) self.rank_n_activ = rank_n_activ if type(n_hids) not in (list, tuple): n_hids = [n_hids] n_layers = len(n_hids) self.n_layers = n_layers if type(scale) not in (list, tuple): scale = [scale] * n_layers if type(sparsity) not in (list, tuple): sparsity = [sparsity] * n_layers for idx, sp in enumerate(sparsity): if sp < 0: sparsity[idx] = n_hids[idx] if type(activation) not in (list, tuple): activation = [activation] * n_layers if type(bias_scale) not in (list, tuple): bias_scale = [bias_scale] * n_layers if bias_fn not in (list, tuple): bias_fn = [bias_fn] * n_layers if init_fn not in (list, tuple): init_fn = [init_fn] * n_layers for dx in xrange(n_layers): if isinstance(bias_fn[dx], (str, unicode)): bias_fn[dx] = eval(bias_fn[dx]) if isinstance(init_fn[dx], (str, unicode)): init_fn[dx] = eval(init_fn[dx]) if isinstance(activation[dx], (str, unicode)): activation[dx] = eval(activation[dx]) super(MultiLayer, self).__init__(n_in, n_hids[-1], rng, name) self.trng = RandomStreams(self.rng.randint(int(1e6))) self.activation = activation self.scale = scale self.sparsity = sparsity self.bias_scale = bias_scale self.bias_fn = bias_fn self.init_fn = init_fn self._grad_scale = grad_scale self.weight_noise = weight_noise self.dropout = dropout self.n_hids = n_hids self.learn_bias = learn_bias self._init_params() def _init_params(self): """ Initialize the parameters of the layer, either by using sparse initialization or small isotropic noise. """ self.W_ems = [] self.b_ems = [] if self.rank_n_approx: W_em1 = self.init_fn[0](self.n_in, self.rank_n_approx, self.sparsity[0], self.scale[0], self.rng) W_em2 = self.init_fn[0](self.rank_n_approx, self.n_hids[0], self.sparsity[0], self.scale[0], self.rng) self.W_em1 = theano.shared(W_em1, name='W1_0_%s' % self.name) self.W_em2 = theano.shared(W_em2, name='W2_0_%s' % self.name) self.W_ems = [self.W_em1, self.W_em2] else: W_em = self.init_fn[0](self.n_in, self.n_hids[0], self.sparsity[0], self.scale[0], self.rng) self.W_em = theano.shared(W_em, name='W_0_%s' % self.name) self.W_ems = [self.W_em] self.b_em = theano.shared(self.bias_fn[0](self.n_hids[0], self.bias_scale[0], self.rng), name='b_0_%s' % self.name) self.b_ems = [self.b_em] for dx in xrange(1, self.n_layers): W_em = self.init_fn[dx](self.n_hids[dx - 1] / self.pieces[dx], self.n_hids[dx], self.sparsity[dx], self.scale[dx], self.rng) W_em = theano.shared(W_em, name='W_%d_%s' % (dx, self.name)) self.W_ems += [W_em] b_em = theano.shared(self.bias_fn[dx](self.n_hids[dx], self.bias_scale[dx], self.rng), name='b_%d_%s' % (dx, self.name)) self.b_ems += [b_em] self.params = [x for x in self.W_ems] if self.learn_bias and self.learn_bias != 'last': self.params = [x for x in self.W_ems] + [x for x in self.b_ems] elif self.learn_bias == 'last': self.params = [x for x in self.W_ems] + [x for x in self.b_ems][:-1] self.params_grad_scale = [self._grad_scale for x in self.params] if self.weight_noise: self.nW_ems = [ theano.shared(x.get_value() * 0, name='noise_' + x.name) for x in self.W_ems ] self.nb_ems = [ theano.shared(x.get_value() * 0, name='noise_' + x.name) for x in self.b_ems ] self.noise_params = [x for x in self.nW_ems ] + [x for x in self.nb_ems] self.noise_params_shape_fn = [ constant_shape(x.get_value().shape) for x in self.noise_params ] def fprop(self, state_below, use_noise=True, no_noise_bias=False, first_only=False): """ 
Constructs the computational graph of this layer. If the input is ints, we assume is an index, otherwise we assume is a set of floats. """ if self.weight_noise and use_noise and self.noise_params: W_ems = [(x + y) for x, y in zip(self.W_ems, self.nW_ems)] if not no_noise_bias: b_ems = [(x + y) for x, y in zip(self.b_ems, self.nb_ems)] else: b_ems = self.b_ems else: W_ems = self.W_ems b_ems = self.b_ems if self.rank_n_approx: if first_only: emb_val = self.rank_n_activ(utils.dot(state_below, W_ems[0])) self.out = emb_val return emb_val emb_val = TT.dot( self.rank_n_activ(utils.dot(state_below, W_ems[0])), W_ems[1]) if b_ems: emb_val += b_ems[0] st_pos = 1 else: emb_val = utils.dot(state_below, W_ems[0]) if b_ems: emb_val += b_ems[0] st_pos = 0 emb_val = self.activation[0](emb_val) if self.dropout < 1.: if use_noise: emb_val = emb_val * self.trng.binomial( emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype) else: emb_val = emb_val * self.dropout for dx in xrange(1, self.n_layers): emb_val = utils.dot(emb_val, W_ems[st_pos + dx]) if b_ems: emb_val = self.activation[dx](emb_val + b_ems[dx]) else: emb_val = self.activation[dx](emb_val) if self.dropout < 1.: if use_noise: emb_val = emb_val * self.trng.binomial(emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype) else: emb_val = emb_val * self.dropout self.out = emb_val return emb_val
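# A minimal, self-contained sketch (not part of the original class) of the
# train/test dropout pattern used in fprop above: at training time the
# activations are masked with a Bernoulli draw of keep-probability `p`, and at
# test time they are simply scaled by `p` instead. The names `p` and `x` are
# illustrative only.
import numpy
import theano
import theano.tensor as TT
from theano.tensor.shared_randomstreams import RandomStreams

def _dropout_demo(p=0.5, seed=1234):
    trng = RandomStreams(seed)
    x = TT.matrix('x')
    train_out = x * trng.binomial(x.shape, n=1, p=p, dtype=x.dtype)  # use_noise=True branch
    test_out = x * p                                                 # use_noise=False branch
    f_train = theano.function([x], train_out)
    f_test = theano.function([x], test_out)
    v = numpy.ones((2, 4), dtype=theano.config.floatX)
    return f_train(v), f_test(v)   # random 0/1 mask vs. deterministic scaling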
def lstm_decoder_layer(tparams_all, input_state, options, maxlen, dp, prefix="lstm_decoder_layer"): tparams_d = tparams_all[0] tparams_g = tparams_all[1] #rng = numpy.random.RandomState(4567) trng = RandomStreams(SEED) def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(x_, m_, h_, c_): preact = tensor.dot(x_, tparams_g[_p(prefix, 'W')]) + tparams_g[_p(prefix, 'b')] + \ tensor.dot(h_, tparams_g[_p(prefix, 'U')]) i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')])) f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')])) o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')])) c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')])) c = f * c_ + i * c h = o * tensor.tanh(c) s = tensor.nnet.softmax(tensor.dot(h, tparams_g['to_idx_emb'])) #x_t = tensor.dot((s / s.max(axis=1)[:,None]).astype('int32').astype(theano.config.floatX), tparams_d['Wemb']) x_t = tensor.dot(tensor.switch(s < s.max(axis=1)[:,None], 0.0, 1.0).astype(theano.config.floatX), tparams_d['Wemb']) x_out = s.argmax(axis=1) m = tensor.switch(tensor.eq(x_out, 10), 0.0, 1.0).astype(theano.config.floatX) * m_ #x_t = tensor.dot(h_, tparams[_p(prefix, 'W_x')]) + tparams[_p(prefix, 'b_x')] return x_out, x_t, m, h, c ############################################################################################## rval, updates = theano.scan(_step, outputs_info=[None, input_state, tensor.alloc(numpy_floatX(1.), input_state.shape[0]), tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n']), tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n'])], name=_p(prefix, '_layers'), n_steps=maxlen) #proj_0 = rval[1]#tensor.tanh(rval[0]) m22 = trng.binomial(size=(input_state.shape[0],), p=dp, n=1, dtype=theano.config.floatX) #return rval[0]*m2, rval[1]*m2[:,None], rval[2]*m2 if(tensor.gt(maxlen, 4) == 1): x2 = tensor.alloc(numpy.asarray(0, dtype='int32'), maxlen - 4, input_state.shape[0]) x2 = tensor.concatenate((tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(7, dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(10, dtype='int32'), input_state.shape[0])[None, :], x2), axis=0) m2 = tensor.alloc(numpy_floatX(0.), maxlen - 3, input_state.shape[0]) m2 = tensor.concatenate((tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], m2), axis=0) xt2 = tparams_d['Wemb'][x2] return rval[0]*m22+x2*(1-m22), rval[1]*m22[:,None]+xt2*(1-m22[:,None]), rval[2]*m22+m2*(1-m22) else: return rval[0]*m22, rval[1]*m22[:,None], rval[2]*m22
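# A small, self-contained sketch (with made-up shapes) of the theano.scan
# pattern used in lstm_decoder_layer above: `outputs_info` mixes `None`
# (per-step outputs that are not fed back into the next step) with tensors
# that serve as the initial recurrent state.
import numpy
import theano
import theano.tensor as tensor

def _scan_demo(n_steps=5, batch=3, dim=4):
    h0 = tensor.alloc(numpy.float32(0.), batch, dim)

    def step(h_):
        h = tensor.tanh(h_ + 1.)
        aux = h.sum(axis=1)        # extra per-step output, not fed back
        return aux, h              # order matches outputs_info below

    (aux_seq, h_seq), updates = theano.scan(
        step, outputs_info=[None, h0], n_steps=n_steps)
    f = theano.function([], [aux_seq, h_seq], updates=updates)
    return f()                     # aux_seq: (n_steps, batch), h_seq: (n_steps, batch, dim)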
def random_binomial(shape, n=1, p=0.5, dtype=K.floatx(), seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.binomial(shape, n=n, p=p, dtype=dtype)
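# Hypothetical usage of random_binomial above (assuming `K.floatx()` resolves
# to theano.config.floatX, as in the Keras Theano backend). The returned value
# is symbolic, so it is wrapped in a theano.function before drawing samples.
import theano

mask = random_binomial((3, 5), n=1, p=0.2, dtype=theano.config.floatX, seed=42)
sample_mask = theano.function([], mask)
print(sample_mask())   # a 3x5 array of 0/1 draws with P(1) = 0.2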
class UnitsDropOut(object): """ Adds Dropout to any unit type. """ def __init__(self, variables, dropout_h=0., dropout_v=0., **kwargs): try: variables['input'] except KeyError: raise KeyError( "Dictionary 'variables' needs an entry with key 'input'") rng = np.random.RandomState() self.t_rng = RandomStreams(rng.randint(2**30)) self.level_h_ = theano.shared(np.cast[fx](dropout_h)) self.level_v_ = theano.shared(np.cast[fx](dropout_v)) act_fun_h = self.act_fun_h self.act_fun_h = lambda x: self.dropout(act_fun_h(x), self.level_h_) self.input = self.dropout(variables['input'], self.level_v_) self.do_suspended = False self.callback_add(partial(self.dropout_suspend, True), Notifier.MAKE_FINISHED, forward=True) self.callback_add(partial(self.dropout_suspend, False), Notifier.TRAINING_START, forward=True) self.callback_add(partial(self.dropout_suspend, True), Notifier.TRAINING_STOP, forward=True) def dropout(self, x, level): """ This function keeps '1-level' entries of the inputs the same and zero-out randomly selected subset of size 'level' """ return self.t_rng.binomial(size=x.shape, p=1. - level, dtype=fx) * x def dropout_suspend(self, suspend=True): if suspend: if not self.do_suspended: self.level_v_tmp = self.level_v self.level_h_tmp = self.level_h self.W.set_value(self.W.get_value() / (1 / (1 - self.level_h) * (1 - self.level_v))) self.level_v = 0. self.level_h = 0. self.do_suspended = True else: LOGGER.warning("Dropout already suspended, nothing to do.") else: if self.do_suspended: self.level_v = self.level_v_tmp self.level_h = self.level_h_tmp self.W.set_value(self.W.get_value() * (1 / (1 - self.level_h) * (1 - self.level_v))) self.do_suspended = False else: LOGGER.warning("Dropout was not suspended, nothing to do.") @property def level_h(self): return self.level_h_.get_value() @level_h.setter def level_h(self, value): #assert not self.do_suspended, "Please unsuspend dropout to change its level." self.level_h_.set_value(value) @property def level_v(self): return self.level_v_.get_value() @level_v.setter def level_v(self, value): #assert not self.do_suspended, "Please unsuspend dropout to change its level." self.level_v_.set_value(value)
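# A minimal sketch (outside the class above) of the mask semantics of
# UnitsDropOut.dropout: entries survive with probability 1-level and no
# rescaling is applied at training time; the test-time compensation is done on
# the weights in dropout_suspend instead. `fx` is assumed to be
# theano.config.floatX.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

t_rng = RandomStreams(42)
level = 0.3
x = T.matrix('x')
masked = t_rng.binomial(size=x.shape, p=1. - level, dtype=theano.config.floatX) * x
f = theano.function([x], masked)
out = f(np.ones((4, 6), dtype=theano.config.floatX))
print(out.mean())   # close to 1 - level on average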
class Model(object): def __init__(self, config): self._params = [] # shared variables for learned parameters self._sticky_hidden_states = [ ] # shared variables which are reset before each epoch self._np_rng = np.random.RandomState(config.seed // 2 + 123) self._theano_rng = RandomStreams( config.seed // 2 + 321) # generates random numbers directly on GPU self._init_scale = config.init_scale self._is_training = tt.iscalar('is_training') self._lr = theano.shared(cast_floatX(config.learning_rate), 'lr') input_data = tt.imatrix('input_data') # (batch_size, num_steps) targets = tt.imatrix('targets') # (batch_size, num_steps) noise_x = tt.matrix('noise_x') # (batch_size, num_steps) # Embed input words and apply variational dropout (for each sample, the embedding of # a dropped word-type consists of all zeros at all occurrences of word-type in sample). embedding = self.make_param((config.vocab_size, config.hidden_size), 'uniform') inputs = embedding[ input_data.T] # (num_steps, batch_size, hidden_size) inputs = self.apply_dropout(inputs, tt.shape_padright(noise_x.T)) rhn_updates = [] for _ in range(config.num_layers): # y shape: (num_steps, batch_size, hidden_size) y, sticky_state_updates = self.RHNLayer( inputs, config.depth, config.batch_size, config.hidden_size, config.drop_i, config.drop_s, config.init_T_bias, config.init_other_bias, config.tied_noise) rhn_updates += sticky_state_updates inputs = y noise_o = self.get_dropout_noise( (config.batch_size, config.hidden_size), config.drop_o) outputs = self.apply_dropout( y, tt.shape_padleft(noise_o)) # (num_steps, batch_size, hidden_size) # logits softmax_w = embedding.T if config.tied_embeddings else self.make_param( (config.hidden_size, config.vocab_size), 'uniform') softmax_b = self.make_param((config.vocab_size, ), config.init_other_bias) logits = tt.dot( outputs, softmax_w) + softmax_b # (num_steps, batch_size, vocab_size) # probabilities and prediction loss flat_logits = logits.reshape( (config.batch_size * config.num_steps, config.vocab_size)) flat_probs = tt.nnet.softmax(flat_logits) flat_targets = targets.T.flatten() # (batch_size * num_steps,) xentropies = tt.nnet.categorical_crossentropy( flat_probs, flat_targets) # (batch_size * num_steps,) pred_loss = xentropies.sum() / config.batch_size # weight decay l2_loss = 0.5 * tt.sum(tt.stack([tt.sum(p**2) for p in self._params])) loss = pred_loss + config.weight_decay * l2_loss grads = theano.grad(loss, self._params) # gradient clipping global_grad_norm = tt.sqrt( tt.sum(tt.stack([tt.sum(g**2) for g in grads]))) clip_factor = ifelse( global_grad_norm < config.max_grad_norm, cast_floatX(1), tt.cast(config.max_grad_norm / global_grad_norm, floatX)) param_updates = [(p, p - self._lr * clip_factor * g) for p, g in zip(self._params, grads)] self.train = theano.function([input_data, targets, noise_x], loss, givens={self._is_training: np.int32(1)}, updates=rhn_updates + param_updates) self.evaluate = theano.function( [input_data, targets], loss, # Note that noise_x is unused in computation graph of this function since _is_training is false. 
givens={ self._is_training: np.int32(0), noise_x: tt.zeros((config.batch_size, config.num_steps)) }, updates=rhn_updates) self._num_params = np.sum( [param.get_value().size for param in self._params]) if config.load_model: self.load(config.load_model) @property def lr(self): return self._lr.get_value() @property def num_params(self): return self._num_params def make_param(self, shape, init_scheme): """Create Theano shared variables, which are used as trainable model parameters.""" if isinstance(init_scheme, numbers.Number): init_value = np.full(shape, init_scheme, floatX) elif init_scheme == 'uniform': init_value = self._np_rng.uniform(low=-self._init_scale, high=self._init_scale, size=shape).astype(floatX) else: raise AssertionError('unsupported init_scheme') p = theano.shared(init_value) self._params.append(p) return p def apply_dropout(self, x, noise): return ifelse(self._is_training, noise * x, x) def get_dropout_noise(self, shape, dropout_p): keep_p = 1 - dropout_p noise = cast_floatX(1. / keep_p) * self._theano_rng.binomial( size=shape, p=keep_p, n=1, dtype=floatX) return noise def assign_lr(self, lr): self._lr.set_value(cast_floatX(lr)) def reset_hidden_state(self): for sticky_hidden_state in self._sticky_hidden_states: sticky_hidden_state.set_value( np.zeros_like(sticky_hidden_state.get_value())) def save(self, save_path): with open(save_path, 'wb') as f: for p in self._params: cPickle.dump(p.get_value(), f, protocol=cPickle.HIGHEST_PROTOCOL) def load(self, load_path): with open(load_path, 'rb') as f: for p in self._params: p.set_value(cPickle.load(f)) def linear(self, x, in_size, out_size, bias, bias_init=None): assert bias == (bias_init is not None) w = self.make_param((in_size, out_size), 'uniform') y = tt.dot(x, w) if bias: b = self.make_param((out_size, ), bias_init) y += b return y def RHNLayer(self, inputs, depth, batch_size, hidden_size, drop_i, drop_s, init_T_bias, init_H_bias, tied_noise): """Variational Recurrent Highway Layer (Theano implementation). References: Zilly, J, Srivastava, R, Koutnik, J, Schmidhuber, J., "Recurrent Highway Networks", 2016 Args: inputs: Theano variable, shape (num_steps, batch_size, hidden_size). depth: int, the number of RHN inner layers i.e. the number of micro-timesteps per timestep. drop_i: float, probability of dropout over inputs. drop_s: float, probability of dropout over recurrent hidden state. init_T_bias: a valid bias_init argument for linear(), initialization of bias of transform gate T. init_H_bias: a valid bias_init argument for linear(), initialization of bias of non-linearity H. tied_noise: boolean, whether to use the same dropout masks when calculating H and when calculating T. Returns: y: Theano variable, recurrent hidden states at each timestep. Shape (num_steps, batch_size, hidden_size). sticky_state_updates: a list of (shared variable, new shared variable value). """ # We first compute the linear transformation of the inputs over all timesteps. # This is done outside of scan() in order to speed up computation. # The result is then fed into scan()'s step function, one timestep at a time. 
noise_i_for_H = self.get_dropout_noise((batch_size, hidden_size), drop_i) noise_i_for_T = self.get_dropout_noise( (batch_size, hidden_size), drop_i) if not tied_noise else noise_i_for_H i_for_H = self.apply_dropout(inputs, noise_i_for_H) i_for_T = self.apply_dropout(inputs, noise_i_for_T) i_for_H = self.linear(i_for_H, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_H_bias) i_for_T = self.linear(i_for_T, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_T_bias) # Dropout noise for recurrent hidden state. noise_s = self.get_dropout_noise((batch_size, hidden_size), drop_s) if not tied_noise: noise_s = tt.stack( noise_s, self.get_dropout_noise((batch_size, hidden_size), drop_s)) def step_fn(i_for_H_t, i_for_T_t, y_tm1, noise_s): """ Args: Elements of sequences given to scan(): i_for_H_t: linear trans. of inputs for calculating non-linearity H at timestep t. Shape (batch_size, hidden_size). i_for_T_t: linear trans. of inputs for calculating transform gate T at timestep t. Shape (batch_size, hidden_size). Result of previous step function invocation (equals the outputs_info given to scan() on first timestep): y_tm1: Shape (batch_size, hidden_size). Non-sequences given to scan() (these are the same at all timesteps): noise_s: (batch_size, hidden_size) or (2, batch_size, hidden_size), depending on value of tied_noise. """ tanh, sigm = tt.tanh, tt.nnet.sigmoid noise_s_for_H = noise_s if tied_noise else noise_s[0] noise_s_for_T = noise_s if tied_noise else noise_s[1] s_lm1 = y_tm1 for l in range(depth): s_lm1_for_H = self.apply_dropout(s_lm1, noise_s_for_H) s_lm1_for_T = self.apply_dropout(s_lm1, noise_s_for_T) if l == 0: # On the first micro-timestep of each timestep we already have bias # terms summed into i_for_H_t and into i_for_T_t. H = tanh(i_for_H_t + self.linear(s_lm1_for_H, in_size=hidden_size, out_size=hidden_size, bias=False)) T = sigm(i_for_T_t + self.linear(s_lm1_for_T, in_size=hidden_size, out_size=hidden_size, bias=False)) else: H = tanh( self.linear(s_lm1_for_H, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_H_bias)) T = sigm( self.linear(s_lm1_for_T, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_T_bias)) s_l = (H - s_lm1) * T + s_lm1 s_lm1 = s_l y_t = s_l return y_t # The recurrent hidden state of the RHN is sticky (the last hidden state of one batch is carried over to the next batch, # to be used as an initial hidden state). These states are kept in shared variables and are reset before every epoch. y_0 = theano.shared(np.zeros((batch_size, hidden_size), floatX)) self._sticky_hidden_states.append(y_0) y, _ = theano.scan(step_fn, sequences=[i_for_H, i_for_T], outputs_info=[y_0], non_sequences=[noise_s]) y_last = y[-1] sticky_state_updates = [(y_0, y_last)] return y, sticky_state_updates
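# A condensed sketch (not the Model class itself) of the variational-dropout
# helpers used by the RHN code above: a single inverted-dropout mask is drawn
# once and reused at every timestep, and ifelse switches the noise off at
# evaluation time. Shapes and names below are made up for illustration.
import numpy as np
import theano
import theano.tensor as tt
from theano.ifelse import ifelse
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
trng = MRG_RandomStreams(321)
is_training = tt.iscalar('is_training')

def dropout_noise(shape, drop_p):
    keep_p = 1. - drop_p
    # inverted dropout: surviving units are scaled by 1/keep_p at training time
    return np.asarray(1. / keep_p, dtype=floatX) * trng.binomial(
        size=shape, p=keep_p, n=1, dtype=floatX)

batch_size, hidden_size = 32, 100
x = tt.tensor3('x')                                 # (num_steps, batch, hidden)
noise = dropout_noise((batch_size, hidden_size), drop_p=0.25)
y = ifelse(is_training, x * tt.shape_padleft(noise), x)
f = theano.function([x, is_training], y)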
def make_output(self, output, collapse=True, sample_mean=None, gamma=None): self.output = output if collapse and self.depth > 1: self.output = self.make_consensus(self.output) if self.attrs['consensus'] == 'flat': self.attrs['n_out'] *= self.depth if self.attrs['batch_norm']: self.output = self.batch_norm(self.output, self.attrs['n_out'], sample_mean=sample_mean, gamma=gamma) if self.attrs['residual']: from NetworkHiddenLayer import concat_sources z, n_in = concat_sources(self.sources, unsparse=True, expect_source=False) assert n_in == self.attrs['n_out'] self.output += z if self.attrs['layer_drop'] > 0.0: # Stochastic Depth, http://arxiv.org/abs/1603.09382 from NetworkHiddenLayer import concat_sources z, n_in = concat_sources(self.sources, unsparse=True, expect_source=False) n_out = self.attrs['n_out'] if n_in != n_out: print("Layer drop with additional projection %i -> %i" % (n_in, n_out), file=log.v4) if n_in > 0: self.W_drop = self.add_param( self.create_forward_weights(n_in, n_out, name="W_drop_%s" % self.name)) z = T.dot(z, self.W_drop) else: z = 0 if self.train_flag: from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams rng = RandomStreams(self.rng.randint(1234) + 1) import theano.ifelse drop = rng.binomial(n=1, p=self.attrs['layer_drop'], size=(1, ), dtype='int8')[0] # drop = theano.printing.Print("drop")(drop) self.output = theano.ifelse.ifelse(drop, z, self.output) else: drop = self.attrs['layer_drop'] self.output = numpy.float32(drop) * z + numpy.float32( 1.0 - drop) * self.output if self.attrs['sparse']: self.output = T.argmax(self.output, axis=-1, keepdims=True) if self.attrs['sparse_filtering']: # https://dlacombejr.github.io/programming/2015/09/13/sparse-filtering-implemenation-in-theano.html fs = T.sqrt(self.output**2 + 1e-8) # numerical stability l2fs = T.sqrt(T.sum(fs**2, axis=1)) # l2 norm of row nfs = fs / l2fs.dimshuffle(0, 'x') # normalize rows l2fn = T.sqrt(T.sum(nfs**2, axis=0)) # l2 norm of column self.output = nfs / l2fn.dimshuffle('x', 0) # normalize columns self.output.name = "%s.output" % self.name self._output = output
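# A stand-alone sketch (simplified from make_output above) of the
# stochastic-depth gate: at training time a single Bernoulli draw decides
# whether the residual input or the layer's own output is passed on; at test
# time the two paths are blended deterministically with the drop probability.
# `output` and `residual` are assumed to be Theano tensors of the same shape.
import numpy
import theano.ifelse
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

def stochastic_depth(output, residual, layer_drop, train_flag, seed=1234):
    if train_flag:
        rng = RandomStreams(seed)
        drop = rng.binomial(n=1, p=layer_drop, size=(1,), dtype='int8')[0]
        return theano.ifelse.ifelse(drop, residual, output)
    # deterministic blend at test time
    return (numpy.float32(layer_drop) * residual
            + numpy.float32(1.0 - layer_drop) * output)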
class ssRBM(Model): """Spike & Slab Restricted Boltzmann Machine (RBM) """ def load_params(self, model_path): fp = open(model_path, 'r') model = pickle.load(fp) self.Wv.set_value(model.Wv.get_value()) self.Wh.set_value(model.Wh.get_value()) self.hbias.set_value(model.hbias.get_value()) self.mu.set_value(model.mu.get_value()) self.alpha.set_value(model.alpha.get_value()) self.beta.set_value(model.beta.get_value()) # sync random number generators self.rng.set_state(model.rng.get_state()) self.theano_rng.rstate = model.theano_rng.rstate for (self_rng_state, model_rng_state) in \ zip(self.theano_rng.state_updates, model.theano_rng.state_updates): self_rng_state[0].set_value(model_rng_state[0].get_value()) # reset timestamps self.batches_seen = model.batches_seen self.examples_seen = model.examples_seen fp.close() def __init__(self, input=None, Wv=None, hbias=None, numpy_rng = None, theano_rng = None, n_h=100, n_v=100, bw_h=10, init_from=None, neg_sample_steps=1, lr = 1e-3, lr_anneal_coeff=0, lr_timestamp=None, lr_mults = {}, iscales={}, clip_min={}, clip_max={}, l1 = {}, l2 = {}, sp_moving_avg=0.98, sp_type='KL', sp_weight={}, sp_targ={}, batch_size = 13, scalar_b = False, sparse_hmask = None, learn_h_weights = False, unit_norm_filters = True, compile=True, parametrize_sqrt_precision=True, debug=False, seed=1241234, my_save_path=None): """ :param n_h: number of h-hidden units :param n_v: number of visible units :param iscales: optional dictionary containing initialization scale for each parameter :param neg_sample_steps: number of sampling updates to perform in negative phase. :param l1: hyper-parameter controlling amount of L1 regularization :param l2: hyper-parameter controlling amount of L2 regularization :param batch_size: size of positive and negative phase minibatch :param compile: compile sampling and learning functions :param seed: seed used to initialize numpy and theano RNGs. 
""" super(ssRBM,self).__init__() for k in ['mu','alpha','beta', 'Wv', 'hbias']: assert k in iscales.keys() for k in ['h']: assert k in sp_weight.keys() for k in ['h']: assert k in sp_targ.keys() ### make sure all parameters are floatX ### for (k,v) in l1.iteritems(): l1[k] = npy_floatX(v) for (k,v) in l2.iteritems(): l2[k] = npy_floatX(v) for (k,v) in sp_weight.iteritems(): sp_weight[k] = npy_floatX(v) for (k,v) in sp_targ.iteritems(): sp_targ[k] = npy_floatX(v) for (k,v) in clip_min.iteritems(): clip_min[k] = npy_floatX(v) for (k,v) in clip_max.iteritems(): clip_max[k] = npy_floatX(v) # dump initialization parameters to object for (k,v) in locals().iteritems(): if k!='self': setattr(self,k,v) # allocate random number generators self.rng = numpy.random.RandomState(seed) if numpy_rng is None else numpy_rng self.theano_rng = RandomStreams(self.rng.randint(2**30)) if theano_rng is None else theano_rng ############### ALLOCATE PARAMETERS ################# self.n_s = self.n_h * bw_h # allocate bilinear-weight matrices self.Wh = sharedX(sparse_hmask.mask, name='Wh') if Wv is None: wv_val = self.rng.randn(n_v, self.n_s) * iscales['Wv'] self.Wv = sharedX(wv_val, name='Wv') else: self.Wv = Wv # allocate shared variables for bias parameters if hbias is None: self.hbias = sharedX(iscales['hbias'] * numpy.ones(n_h), name='hbias') else: self.hbias = hbias # mean (mu) and precision (alpha) parameters on s self.mu = sharedX(iscales['mu'] * numpy.ones(self.n_s), name='mu') self.alpha = sharedX(iscales['alpha'] * numpy.ones(self.n_s), name='alpha') self.alpha_prec = self.alpha**2 if parametrize_sqrt_precision else self.alpha # diagonal of precision matrix of visible units self.beta = sharedX(iscales['beta'] * numpy.ones(n_v), name='beta') self.beta_prec = self.beta**2 if parametrize_sqrt_precision else self.beta #### load layer 1 parameters from file #### if init_from: self.load_params(init_from) # allocate shared variable for persistent chain self.neg_v = sharedX(self.rng.rand(batch_size, n_v), name='neg_v') self.neg_ev = sharedX(self.rng.rand(batch_size, n_v), name='neg_ev') self.neg_s = sharedX(self.rng.rand(batch_size, self.n_s), name='neg_s') self.neg_h = sharedX(self.rng.rand(batch_size, n_h), name='neg_h') # moving average values for sparsity self.sp_pos_v = sharedX(self.rng.rand(1,self.n_v), name='sp_pos_v') self.sp_pos_h = sharedX(self.rng.rand(1,self.n_h), name='sp_pog_h') # learning rate - implemented as shared parameter for GPU self.lr_shrd = sharedX(lr, name='lr_shrd') self.lr_mults_it = {} self.lr_mults_shrd = {} for (k,v) in lr_mults.iteritems(): # make sure all learning rate multipliers are float64 self.lr_mults_it[k] = tools.HyperParamIterator(lr_timestamp, lr_mults[k]) self.lr_mults_shrd[k] = sharedX(self.lr_mults_it[k].value, name='lr_mults_shrd'+k) # allocate symbolic variable for input self.input = T.matrix('input') if input is None else input # configure input-space (new pylearn2 feature?) 
self.input_space = VectorSpace(n_v) # counters used by pylearn2 trainers self.batches_seen = 0 # incremented on every batch self.examples_seen = 0 # incremented on every training example self.force_batch_size = batch_size # force minibatch size self.error_record = [] ## ESTABLISH LIST OF LEARNT MODEL PARAMETERS ## self.params = [self.Wv, self.hbias, self.mu, self.alpha, self.beta] if self.learn_h_weights: self.params += [self.Wh] if compile: self.do_theano() def do_theano(self): """ Compiles all theano functions needed to use the model""" init_names = dir(self) ###### All fields you don't want to get pickled (e.g., theano functions) should be created below this line # SAMPLING: NEGATIVE PHASE neg_updates = self.neg_sampling_updates(n_steps=self.neg_sample_steps) self.sample_neg_func = function([], [], updates=neg_updates, name='sample_neg_func') pos_updates = {} # determing maximum likelihood cost main_cost = [self.ml_cost(), self.get_sparsity_cost(), self.get_reg_cost(self.l2, self.l1)] ## # COMPUTE GRADIENTS WRT. TO ALL COSTS ## learning_grads = utils_cost.compute_gradients(*main_cost) ## # BUILD UPDATES DICTIONARY ## learning_updates = utils_cost.get_updates( learning_grads, self.lr_shrd, multipliers = self.lr_mults_shrd) if self.learn_h_weights: learning_updates[self.Wh] *= self.sparse_hmask.mask learning_updates.update(pos_updates) # build theano function to train on a single minibatch self.batch_train_func = function([self.input], [], updates=learning_updates, name='train_rbm_func') # enforce constraints function constraint_updates = {} ## clip parameters to maximum values (if applicable) for (k,v) in self.clip_max.iteritems(): assert k in [param.name for param in self.params] param = getattr(self, k) constraint_updates[param] = T.clip(param, param, v) ## clip parameters to minimum values (if applicable) for (k,v) in self.clip_min.iteritems(): assert k in [param.name for param in self.params] param = getattr(self, k) constraint_updates[param] = T.clip(constraint_updates.get(param, param), v, param) ## Residual variance on beta is scalar valued if self.scalar_b: beta = constraint_updates.get(self.beta, self.beta) constraint_updates[self.beta] = T.mean(beta) * T.ones_like(beta) # constraint filters to have unit norm if self.unit_norm_filters: Wv = constraint_updates.get(self.Wv, self.Wv) constraint_updates[self.Wv] = Wv / T.sqrt(T.sum(Wv**2, axis=0)) self.enforce_constraints = theano.function([],[], updates=constraint_updates) ###### All fields you don't want to get pickled should be created above this line final_names = dir(self) self.register_names_to_del( [ name for name in (final_names) if name not in init_names ]) # Before we start learning, make sure constraints are enforced self.enforce_constraints() def learn(self, dataset, batch_size): x = dataset.get_batch_design(batch_size, include_labels=False) self.learn_mini_batch(x) # accounting... self.examples_seen += self.batch_size self.batches_seen += 1 # modify learning rate multipliers for (k, iter) in self.lr_mults_it.iteritems(): if iter.next(): print 'self.batches_seen = ', self.batches_seen self.lr_mults_shrd[k].set_value(iter.value) print 'lr_mults_shrd[%s] = %f' % (k,iter.value) self.enforce_constraints() # save to different path each epoch if self.my_save_path and self.batches_seen%1000==0: fname = self.my_save_path + '_e%i.pkl' % (self.batches_seen/1000) print 'Saving to %s ...' %fname, serial.save(fname, self) print 'done' def learn_mini_batch(self, x): # anneal learning rate self.lr_shrd.set_value(self.lr / (1. 
+ self.lr_anneal_coeff * self.batches_seen)) # perform negative phase sampling self.sample_neg_func() if self.debug and ( numpy.isnan(self.neg_h.get_value()).any() or numpy.isnan(self.neg_s.get_value()).any() or numpy.isnan(self.neg_v.get_value()).any()): import pdb; pdb.set_trace() # update parameters self.batch_train_func(x) def energy(self, h_sample, s_sample, v_sample): """ Computes energy for a given configuration of (g,h,v,x,y). :param h_sample: T.matrix of shape (batch_size, n_h) :param s_sample: T.matrix of shape (batch_size, bw_h * n_h) :param v_sample: T.matrix of shape (batch_size, n_v) """ energy = -T.sum(s_sample * T.dot(v_sample, self.Wv) * T.dot(h_sample, self.Wh), axis=1) energy += T.sum(0.5 * self.alpha_prec * s_sample**2, axis=1) energy += T.sum(0.5 * self.beta_prec * v_sample**2, axis=1) energy -= T.sum(self.alpha_prec * self.mu * s_sample * T.dot(h_sample, self.Wh), axis=1) energy += T.sum(0.5 * self.alpha_prec * self.mu**2 * T.dot(h_sample, self.Wh), axis=1) energy -= T.dot(h_sample, self.hbias) return energy def __call__(self, v, output_type='hs'): assert output_type in ['h', 'hs'] h_mean = self.h_given_v(v) s_mean = self.s_given_hv(h_mean, v_sample) output_prods = { 'h': h_mean, 'hs': T.dot(h_mean, self.Wh) * s_mean } return output_prods[output_type] ###################################### # MATH FOR CONDITIONAL DISTRIBUTIONS # ###################################### def h_given_v(self, v_sample): """ Compute mean activation of h given v. :param v_sample: T.matrix of shape (batch_size, n_v matrix) """ from_v = T.dot(v_sample, self.Wv) temp = 0.5 * 1./self.alpha_prec * from_v**2 temp += from_v * self.mu h_mean = T.dot(temp, self.Wh.T) + self.hbias return T.nnet.sigmoid(h_mean) def sample_h_given_v(self, v_sample): """ Generates sample from p(h|v) """ h_mean = self.h_given_v(v_sample) h_sample = self.theano_rng.binomial(size=(self.batch_size,self.n_h), n=1, p=h_mean, dtype=floatX) return h_sample def s_given_hv(self, h_sample, v_sample): from_h = T.dot(h_sample, self.Wh) from_v = T.dot(v_sample, self.Wv) s_mean = (1./self.alpha_prec * from_v + self.mu) * from_h return s_mean def sample_s_given_hv(self, h_sample, v_sample): s_mean = self.s_given_hv(h_sample, v_sample) s_sample = self.theano_rng.normal( size=(self.batch_size, self.n_s), avg = s_mean, std = T.sqrt(1./self.alpha_prec), dtype=floatX) return s_sample def v_given_hs(self, h_sample, s_sample): """ Computes the mean-activation of visible units, given all other variables. :param h_sample: T.matrix of shape (batch_size, n_h) :param s_sample: T.matrix of shape (batch_size, n_s) """ from_h = T.dot(h_sample, self.Wh) v_mean = 1./self.beta_prec * T.dot(s_sample * from_h, self.Wv.T) return v_mean def sample_v_given_hs(self, h_sample, s_sample): v_mean = self.v_given_hs(h_sample, s_sample) v_sample = self.theano_rng.normal( size=(self.batch_size, self.n_v), avg = v_mean, std = T.sqrt(1./self.beta_prec), dtype=floatX) return v_sample ################## # SAMPLING STUFF # ################## def neg_sampling(self, h_sample, s_sample, v_sample, n_steps=1): """ Gibbs step for negative phase, which alternates: p(h|b,g,v), p(s|b,g,h,v) and p(v|b,g,h,s) :param f_sample: T.matrix of shape (batch_size, n_f) :param g_sample: T.matrix of shape (batch_size, n_g) :param h_sample: T.matrix of shape (batch_size, n_h) :param s_sample: T.matrix of shape (batch_size, n_s) :param v_sample: T.matrix of shape (batch_size, n_v) :param n_steps: number of Gibbs updates to perform in negative phase. 
""" def gibbs_iteration(h1, s1, v1): h2 = self.sample_h_given_v(v1) s2 = self.sample_s_given_hv(h2, v1) v2 = self.sample_v_given_hs(h2, s2) return [h2, s2, v2] [new_h, new_s, new_v] , updates = theano.scan( gibbs_iteration, outputs_info = [h_sample, s_sample, v_sample], n_steps=n_steps) return [new_h[-1], new_s[-1], new_v[-1]] def neg_sampling_updates(self, n_steps=1): """ Implements the negative phase, generating samples from p(h,s,v). :param n_steps: scalar, number of Gibbs steps to perform. """ [new_h, new_s, new_v] = self.neg_sampling(self.neg_h, self.neg_s, self.neg_v, n_steps=n_steps) # we want to plot the expected value of the samples new_ev = self.v_given_hs(new_h, new_s) updates = {self.neg_h : new_h, self.neg_s : new_s, self.neg_v : new_v, self.neg_ev: new_ev} return updates def ml_cost(self): """ Variational approximation to the maximum likelihood positive phase. :param v: T.matrix of shape (batch_size, n_v), training examples :return: tuple (cost, gradient) """ pos_h = self.h_given_v(self.input) pos_s = self.s_given_hv(pos_h, self.input) pos_cost = T.sum(self.energy(pos_h, pos_s, self.input)) neg_cost = T.sum(self.energy(self.neg_h, self.neg_s, self.neg_v)) batch_cost = pos_cost - neg_cost cost = batch_cost / self.batch_size # build gradient of cost with respect to model parameters cte = [pos_h, pos_s, self.neg_h, self.neg_s, self.neg_v] return utils_cost.Cost(cost, self.params, cte) def get_sparsity_cost(self): # update mean activation using exponential moving average hack_h = self.h_given_v(self.sp_pos_v) # define loss based on value of sp_type if self.sp_type == 'KL': eps = 1./self.batch_size loss = lambda targ, val: - targ * T.log(eps + val) - (1.-targ) * T.log(1. - val + eps) elif self.sp_type.startswith('Lee07'): loss = lambda targ, val: abs(targ - val) else: raise NotImplementedError('Sparsity type %s is not implemented' % self.sp_type) cost = T.zeros((), dtype=floatX) params = [] if self.sp_weight['h']: cost += self.sp_weight['h'] * T.sum(loss(self.sp_targ['h'], hack_h.mean(axis=0))) params += [self.hbias] if self.sp_type in ['KL','Lee07'] and self.sp_weight['h']: params += [self.Wv, self.alpha, self.mu] return utils_cost.Cost(cost, params) ############################## # GENERIC OPTIMIZATION STUFF # ############################## def get_reg_cost(self, l2=None, l1=None): """ Builds the symbolic expression corresponding to first-order gradient descent of the cost function ``cost'', with some amount of regularization defined by the other parameters. 
:param l2: dict containing amount of L2 regularization for Wg, Wh and Wv :param l1: dict containing amount of L1 regularization for Wg, Wh and Wv """ cost = T.zeros((), dtype=floatX) params = [] for p in self.params: if l1.get(p.name, 0): cost += l1[p.name] * T.sum(abs(p)) params += [p] if l2.get(p.name, 0): cost += l2[p.name] * T.sum(p**2) params += [p] return utils_cost.Cost(cost, params) def monitor_matrix(self, w, name=None): if name is None: assert hasattr(w, 'name') name = name if name else w.name return {name + '.min': w.min(axis=[0,1]), name + '.max': w.max(axis=[0,1]), name + '.absmean': abs(w).mean(axis=[0,1])} def monitor_vector(self, b, name=None): if name is None: assert hasattr(b, 'name') name = name if name else b.name return {name + '.min': b.min(), name + '.max': b.max(), name + '.absmean': abs(b).mean()} def get_monitoring_channels(self, x): chans = {} chans.update(self.monitor_matrix(self.Wv)) chans.update(self.monitor_vector(self.hbias)) chans.update(self.monitor_vector(self.alpha)) chans.update(self.monitor_vector(self.mu)) chans.update(self.monitor_vector(self.beta)) chans.update(self.monitor_matrix(self.neg_h)) chans.update(self.monitor_matrix(self.neg_s)) chans.update(self.monitor_matrix(self.neg_v)) return chans
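# A simplified, self-contained sketch of the alternating Gibbs updates that the
# ssRBM sampling code above performs (binary hidden units via
# theano_rng.binomial, real-valued visible units via theano_rng.normal). This
# is a plain Gaussian-Bernoulli RBM step, not the spike-and-slab model itself;
# W, hbias, sigma and the shapes are made up for illustration.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
rng = np.random.RandomState(0)
theano_rng = MRG_RandomStreams(rng.randint(2 ** 30))
n_v, n_h, batch = 20, 10, 5
W = theano.shared(rng.randn(n_v, n_h).astype(floatX), name='W')
hbias = theano.shared(np.zeros(n_h, dtype=floatX), name='hbias')
sigma = 1.0

def gibbs_step(v):
    h_mean = T.nnet.sigmoid(T.dot(v, W) + hbias)
    h = theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean, dtype=floatX)
    v_mean = T.dot(h, W.T)
    return theano_rng.normal(size=v_mean.shape, avg=v_mean, std=sigma, dtype=floatX)

# persistent negative chain, updated in place on each call
neg_v = theano.shared(rng.rand(batch, n_v).astype(floatX), name='neg_v')
sample_step = theano.function([], [], updates={neg_v: gibbs_step(neg_v)})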
class RBMrv_T: #class var goes here, instance var goes in constructor def __init__(self, noOfVisibleUnits, noOfHiddenUnits, CD_n, aRate, bRate, omegaRate, sigmaRate, omega=None, b=None, a=None, z=None, rprop_e = 0.01, rprop_en =0.005, sparseTargetp=0.01): ''' constructor RBMrv_T(self, noOfVisibleUnits, noOfHiddenUnits, CD_n, aRate, bRate, omegaRate, sigmaRate, omega=None, b=None, a=None, z=None, rprop_e = 0.01, rprop_en =0.005, sparseTargetp=0.01): noOfVisibleUnits (int): must be perfect square noOfHiddenUnits (int): must be perfect square CD_n (int): no. of iterations in MCMC simulation during training, check if model means are used if CD_n = 1 aRate (float32): update rate of parameter \underline{a} during training bRate (float32): update rate of parameter \underline{b} during training omegaRate (float32): update rate of parameter \boldsymbol{\omega} during training sigmaRate (float32): update rate of parameter \underline{z} during training omega (numpy array of float32): \omega parameter matrix with noOfVisible unit rows x noOfHiddenUnits columns b (numpy array of float32): b parameter vector, size = noOfHiddenUnits a (numpy array of float32): b parameter vector, size = noOfVisibleUnits z (numpy array of float32): z parameter vector, size = noOfVisibleUnits rprop_e (float32): rprop_en (float32): sparseTargetp (float32): target mean hidden unit activation for training. between (0,1) ''' self.epsilon = 0.0000001 theano.config.exception_verbosity = 'high' #rprop parameters and variables, rprop not used self.T_rprop_e = theano.shared(value=np.float32(rprop_e), name='T_rprop_e', borrow = True, allow_downcast=True) self.T_rprop_en = theano.shared(value=np.float32(rprop_en), name='T_rprop_en', borrow = True, allow_downcast=True) self.T_posUpdate = theano.shared(value=np.float32(0.5*(1.0+rprop_e)), name='T_posUpdate', borrow = True, allow_downcast=True) self.T_negUpdate = theano.shared(value=np.float32(0.5*(1.0-rprop_en)), name='T_negUpdate', borrow = True, allow_downcast=True) #network geometry and training parameters self.miniBatchSize = 0 #will be set in self.trainMB(...) 
self.parameterLoaded = False self.parameterSaved = False self.sparseTargetp = sparseTargetp self.CD_n = CD_n self.nv = noOfVisibleUnits self.nh = noOfHiddenUnits self.dimV = int(math.sqrt(self.nv)) self.dimH = int(math.sqrt(self.nh)) self.aRate = np.float32(aRate) self.bRate = np.float32(bRate) self.omegaRate = np.float32(omegaRate) self.sigmaRate = np.float32(sigmaRate) #initialise v and h self.v = np.float32(np.random.uniform(0, 1.0, self.nv)) self.h = np.float32(np.random.binomial(1.0,0.5,self.nh)) self.logLikelihood = [] self.likelihood4plot = [] self.T_aRate = theano.shared(value=np.float32(aRate), name='T_aRate', borrow = True, allow_downcast=True) self.T_bRate = theano.shared(value=np.float32(bRate), name='T_bRate', borrow = True, allow_downcast=True) self.T_omgRate = theano.shared(value=np.float32(omegaRate), name='T_omgRate', borrow = True, allow_downcast = True) self.T_sigRate = theano.shared(value=np.float32(sigmaRate), name='T_sigRate', borrow = True, allow_downcast = True) self.loadedRates = [aRate, bRate, omegaRate, sigmaRate]#for load/saveparameters(), can load to see previous rates but differes from constructor declared rates self.T_rng = RandomStreams() #use_cuda parameter set if on GPU #succesive calls on this T_rng will keep returning new values, so for MCMC even with #same start v vector value called twice consecutively you'll have different outputs #this is normal as the same T_rng gets called, without reset, giving different outputs everytime. self.T_CD_n = theano.shared(value=CD_n, name='T_CD_n', borrow = True, allow_downcast=True) if omega is None: #careful! use "1.0" instead of "1" below else it all rounds to zeros!!! omega = np.float32(np.random.uniform((-1.0)*(1.0/(np.sqrt(self.nh+self.nv))),(1.0/(np.sqrt(self.nh+self.nv))),self.nv*self.nh).reshape((self.nv,self.nh))) self.omega = omega self.T_omega = theano.shared(value=omega,name='T_omega',borrow=True, allow_downcast=True) #rprop previous gradient self.Tomg_grad_prev = theano.shared(value=np.float32(np.abs(omega*omegaRate)+omegaRate), name='Tomg_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Tomg_rmsH = theano.shared(value=omega,name='Tomg_rmsH', borrow=True, allow_downcast=True) if b is None: b = np.float32(np.random.uniform((-1.0)*(1.0/(self.nv)),(1.0/(self.nv)),self.nh)) self.b = b self.T_b = theano.shared(value=b,name='T_b',borrow=True, allow_downcast=True) #rprop previous gradient self.Tb_grad_prev = theano.shared(value=np.float32(np.abs(bRate*b)+bRate), name='Tb_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Tb_rmsH = theano.shared(value = b, name = 'Tb_rmsH', borrow = True, allow_downcast = True) if a is None: a = np.float32(np.random.uniform((-1.0)*(1.0/(self.nh)),(1.0/(self.nh)),self.nv)) self.a = a self.T_a = theano.shared(value=a,name='T_a',borrow=True, allow_downcast=True) #rprop previous gradient self.Ta_grad_prev = theano.shared(value=np.float32(np.abs(aRate*a)+aRate), name='Ta_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Ta_rms = theano.shared(value=a, name='Ta_rms', borrow=True, allow_downcast=True) # for sigma parameter we train z instead with e^z = \sigma^2 if z is None: z = np.float32(np.random.normal(0.0,(1.0/(self.nh*self.nh)),self.nv))#np.asarray([0.0]*self.nv, dtype=theano.config.floatX) self.z = z self.T_z = theano.shared(value=z,name='T_z',borrow=True, allow_downcast=True) self.T_sigmaSqr = T.exp(self.T_z) #rprop previous gradient self.Tz_grad_prev = 
theano.shared(value=np.float32(np.float32(np.abs(z*sigmaRate)+sigmaRate)), name='Tz_grad_prev', borrow = True, allow_downcast=True) #RMSprop accumulated gradient RMS self.Tz_rmsH = theano.shared(value=z, name = 'Tz_rmsH', borrow=True, allow_downcast=True) self.T_logZk = theano.shared(value = np.float32(0.0), name = 'T_logZk', borrow=True, allow_downcast=True) #will print in ipython notebook: print("RBMrv constructed for " + str(len(self.v)) + " visible units and " + str(len(self.h)) + " hidden units.") #print(", with Energy function:") #display(Math(r'E(\vec{v},\vec{h}) = \sum_i \frac{(v_i-a_i)^2}{2\sigma_i^2} - \sum_i \sum_j \omega_{ij}h_j\frac{v_i}{\sigma_i^2} - \sum_j b_j h_j')) def genSamples(self, noOfsamples, separation): """ Generated samples from loaded parameters: genSamples(self, noOfsamples, separation) Args: separation (int): number of MCMC separation of samples noOFsamples (int): total number of samples returned Return: geneartedSamples (np array): if images, use "generatedSamples[#sample].reshape((noOfvisibleUnits,noOfvisibleUnits))" for ploting """ generatedSamples = [] initSample = T.vector("initSample", dtype=theano.config.floatX) [scan_resV, scan_resH, H_meanStub, V_meanStub] , scan_updates = theano.scan(self.vtovMBall, outputs_info=[initSample, None, None, None] , n_steps=separation*(noOfsamples+1)) genSampleFn = theano.function(inputs=[initSample], outputs =[scan_resV, scan_resH], allow_input_downcast = True, updates = scan_updates) [currentV, currentH] = genSampleFn(np.asarray([0.0]*self.nv, dtype=theano.config.floatX)) generatedSamples = currentV[separation:separation*(noOfsamples+1):separation] return generatedSamples def checkNaN(self): """ prints NaN tests works on parameters a, b, z, omega of current object """ print("NaN test on omega: " + str(np.isnan(np.sum(np.sum(np.asarray(self.T_omega.eval())))))) print("NaN test on a: " + str(np.isnan(np.dot(np.asarray(self.T_a.eval()),np.asarray(self.T_a.eval()))))) print("NaN test on b: " + str(np.isnan(np.dot(np.asarray(self.T_b.eval()),np.asarray(self.T_b.eval()))))) print("NaN test on z: " + str(np.isnan(np.dot(np.asarray(self.T_z.eval()),np.asarray(self.T_z.eval()))))) print("max z = " + str(np.max(np.asarray(self.T_z.eval()))) + ", min z =" + str(np.min(np.asarray(self.T_z.eval())))) def printParameters(self): """ prints parameters a, b, z \sigma^2, omega """ print("a = " + str(self.T_a.get_value())) print("b = " + str(self.T_b.get_value())) print("z = " + str(self.T_z.get_value())) print("sigma^2 = " + str([math.exp(zi) for zi in self.T_z.get_value()])) print("omega = " + str(self.T_omega.get_value())) def plotAllRF(self, noOfRFs = 25): """ plots \omega_{ij} elements. With i=0,1,... 
noOfRFs as a square image args: noOfRFs (int): have to be perfect square and up to number of hidden units """ inputIndex = noOfRFs + 1 fig, myAxis = plt.subplots(int(np.sqrt(noOfRFs)),int(np.sqrt(noOfRFs))) xpt, ypt = myAxis.shape fig.tight_layout() for xind in range(0,xpt): for yind in range(0, ypt): myAxis[xind][yind].imshow(self.T_omega.eval()[:,inputIndex].reshape((self.dimV,self.dimV)), cmap = cm.Greys_r, interpolation='nearest') inputIndex = inputIndex + 1 plt.show() #print("weights are between (" + str(np.min(np.min(self.T_omega.eval()))) + "," + str(np.max(np.max(self.T_omega.eval()))) + ")") def plotSD(self): """ plot \sigma standard deviation parameter as sqaure image """ SDparameter = np.exp((np.asarray(self.T_z.eval()))) fig=plt.figure() im=plt.imshow(SDparameter.reshape((self.dimV,self.dimV)), cmap = cm.Greys_r, interpolation='nearest') fig.colorbar(im) def plot_a(self): """ plot a parameter as an image """ SDparameter = np.asarray(self.T_a.eval()) fig=plt.figure() im=plt.imshow(SDparameter.reshape((self.dimV,self.dimV)), cmap = cm.Greys_r, interpolation='nearest') fig.colorbar(im) def plot_b(self): """ plot b parameter as an image """ SDparameter = np.asarray(self.T_b.eval()) fig = plt.figure() im = plt.imshow(SDparameter.reshape((self.dimH,self.dimH)), cmap = cm.Greys_r, interpolation='nearest') fig.colorbar(im) def saveParameters(self, fileName): """ saves all essential parameters so simulation can resume after calling loadParameters() file saved in npz format ars: fileName (string): in single quotes '...' and excluding extensions. """ np.savez(fileName, T_omega = self.T_omega.eval(), Tomg_rmsH = self.Tomg_rmsH.eval(), T_a = self.T_a.eval(), Ta_rms = self.Ta_rms.eval(), T_b = self.T_b.eval(), Tb_rmsH = self.Tb_rmsH.eval(), T_z = self.T_z.eval(), Tz_rmsH = self.Tz_rmsH.eval(), Ta_grad_prev = self.Ta_grad_prev.eval(), Tb_grad_prev = self.Ta_grad_prev.eval(), Tz_grad_prev = self.Tz_grad_prev.eval(), Tomg_grad_prev = self.Tomg_grad_prev.eval(), logLikelihood = self.logLikelihood, likelihood4plot = self.likelihood4plot, T_logZk = self.T_logZk.eval(), loadedRates = self.loadedRates, miniBatchSize = self.miniBatchSize, aRate = self.aRate, bRate = self.bRate, omegaRate = self.omegaRate, sigmaRate = self.sigmaRate, CD_n = self.CD_n, sparseTargetp = self.sparseTargetp) #print("parameters saved in: " + str(fileName) + ".npz") self.parameterSaved = True def loadParameters(self, fileName): """ loads npz file to restore all simulation parameters make sure the parameters you're loading fits the current object (e.g. same #visible/#hidden units) ars: fileName (string): in single quotes '...' and excluding extensions. """ loadedFile = np.load(fileName + '.npz') self.miniBatchSize = loadedFile['miniBatchSize'] self.aRate = np.float32(loadedFile['aRate']) #without explicit cast it turns into float64?! 
self.bRate = np.float32(loadedFile['bRate']) self.omegaRate = np.float32(loadedFile['omegaRate']) self.sigmaRate = np.float32(loadedFile['sigmaRate']) self.CD_n = loadedFile['CD_n'] self.sparseTargetp = loadedFile['sparseTargetp'] self.T_omega.set_value(loadedFile['T_omega']) self.Tomg_rmsH.set_value(loadedFile['Tomg_rmsH']) self.T_a.set_value(loadedFile['T_a']) self.Ta_rms.set_value(np.float32(loadedFile['Ta_rms'])) self.T_b.set_value(loadedFile['T_b']) self.Tb_rmsH.set_value(loadedFile['Tb_rmsH']) self.T_z.set_value(loadedFile['T_z']) self.Tz_rmsH.set_value(loadedFile['Tz_rmsH']) self.Ta_grad_prev.set_value(loadedFile['Ta_grad_prev']) self.Tb_grad_prev.set_value(loadedFile['Tb_grad_prev']) self.Tz_grad_prev.set_value(loadedFile['Tz_grad_prev']) self.Tomg_grad_prev.set_value(loadedFile['Tomg_grad_prev']) self.logLikelihood = loadedFile['logLikelihood'] self.likelihood4plot = loadedFile['likelihood4plot'] self.likelihood4plot = self.likelihood4plot.tolist() self.T_logZk.set_value(loadedFile['T_logZk']) self.loadedRates = loadedFile['loadedRates'] #print("after loading, omega = " + str(self.T_omega.eval())) self.parameterLoaded = True def energyFnMB(self, VM, HM): """ evaluates the energy functions of the RBM given row vector(s) of v and h args: VM (T.matrix): rows of visible layer values HM (T.matrix): rows of hidden layer values return: a row Theano vector, elements being E(v_row, h_row) """ T_bh = T.dot(HM, self.T_b) T_omghv = T.transpose(T.sum(T.mul(T.dot(T.mul(T.fill(VM, T.exp(-self.T_z)), VM), self.T_omega), HM), axis=1,acc_dtype=theano.config.floatX)) T_Vsqr = T.mul(VM-T.fill(VM, self.T_a),VM-T.fill(VM, self.T_a)) T_VsqrOmg = T.transpose(T.sum(T.mul(T.fill(T_Vsqr,np.float32(0.5)*T.exp(-self.T_z)),T_Vsqr),axis=1, acc_dtype=theano.config.floatX)) return -T_VsqrOmg + T_omghv + T_bh def vtohMB(self, VsampM): """ computes hidden unit outputs given visible unit outputs ("half" a MCMC iteration) computes in parallel given input rows of visible units args: VsampM (T.matrix): rows of visible unit outputs returns: a T.matrix, rows of hidden unit outputs """ Vomg = T.matrix(name="Vomg", dtype=theano.config.floatX) vtohMBres = T.matrix(name ="vtohMBres", dtype=theano.config.floatX) T_HP = T.matrix(name="T_HP", dtype=theano.config.floatX) Vomg = T.dot(T.mul(T.fill(VsampM, T.exp(-self.T_z)), VsampM), self.T_omega) T_Hp = T.nnet.ultra_fast_sigmoid(T.fill(Vomg, self.T_b) + Vomg) vtohMBres = self.T_rng.binomial(size = T_Hp.shape, p=T_Hp, dtype=theano.config.floatX) return vtohMBres def vtovMBall(self, VsampM): """ computes visible unit outputs given visible unit inputs (single MCMC iteration) multiple paralle MCMC iterations using rows of the input matrix args: VsampM (T.matrix): rows of this matrix are visible unit inputs return: ahtovMBres (T.matrix): rows of this matrix are visible unit outputs after a single MCMC iteration """ #v to h part aVomg = T.matrix(name="Vomg", dtype=theano.config.floatX) avtohMBres = T.matrix(name ="vtohMBres", dtype=theano.config.floatX) aT_HP = T.matrix(name="T_HP", dtype=theano.config.floatX) aVomg = T.dot(T.mul(T.fill(VsampM, T.exp(-self.T_z)), VsampM), self.T_omega) aT_Hp = T.nnet.ultra_fast_sigmoid(T.fill(aVomg, self.T_b) + aVomg) avtohMBres = self.T_rng.binomial(size = aT_Hp.shape, p=aT_Hp, dtype=theano.config.floatX) #h to v part: aT_omgH = T.matrix(name="T_omgH", dtype=theano.config.floatX) aT_means = T.matrix(name="T_means", dtype=theano.config.floatX) ahtovMBres = T.matrix(name="htovMBres", dtype=theano.config.floatX) aT_omgH = T.transpose(T.dot(self.T_omega, 
T.transpose(avtohMBres))) aT_means = T.fill(aT_omgH, self.T_a) + aT_omgH ahtovMBres = self.T_rng.normal(size=aT_means.shape, avg=aT_means, std=T.fill(aT_means,T.sqrt(T.exp(self.T_z))), dtype=theano.config.floatX) return [ahtovMBres, avtohMBres, aT_Hp, aT_means] def htovMB(self, HsampM): """ computes visible unit outputs given hidden unit inputs ("half" a MCMC iteration) computes in parallel given input rows of hidden units args: HsampM (T.matrix): rows of hidden unit inputs returns: a T.matrix, rows of visible unit outputs """ T_omgH = T.matrix(name="T_omgH", dtype=theano.config.floatX) T_means = T.matrix(name="T_means", dtype=theano.config.floatX) htovMBres = T.matrix(name="htovMBres", dtype=theano.config.floatX) T_omgH = T.transpose(T.dot(self.T_omega, T.transpose(HsampM))) T_means = T.fill(T_omgH, self.T_a) + T_omgH htovMBres = self.T_rng.normal(size=T_means.shape, avg=T_means, std=T.fill(T_means,T.sqrt(T.exp(self.T_z))), dtype=theano.config.floatX) return htovMBres def trainMB(self, V_egMin, noOfEpoch, noOfMiniBatchEx): """ trains the current RBM object, returns nothing with parameter updates being internal args: V_egMin (theano.shared 2D array): call eval() to supply as argument. rows of this are input examples. V_egMin[N:M] extracts M-N examples, each of size noOfVisible units noOfEpoch (int): total number of Epoch to simulate, each Epoch goes through V_egMin noOfMiniBatchEx (int): number of examples to be grouped into minibatches """ self.miniBatchSize = noOfMiniBatchEx print("size of input example is: " + str(V_egMin.shape)) V_egM = T.matrix(name="T_egM", dtype=theano.config.floatX) [V_CDmAcc, H_CDmAcc, H_CDmean, V_CDmean] , scan_updates = theano.scan(self.vtovMBall, outputs_info=[V_egM, None, None, None] , n_steps=self.CD_n) V_CDm = V_CDmAcc[-1] #these are matrixes H_CDm = H_CDmAcc[-1] #these are matrixes H_egM = self.vtohMB(V_egM) energyVector_eg = self.energyFnMB(V_egM, H_egM) energyVector_cd = self.energyFnMB(V_CDm, H_CDm) costFn = T.mean(energyVector_eg, dtype=theano.config.floatX, acc_dtype=theano.config.floatX) - T.mean(energyVector_cd, dtype=theano.config.floatX, acc_dtype=theano.config.floatX) Ta_grad, Tb_grad, Tz_grad, Tomg_grad = T.grad(cost=costFn, wrt=[self.T_a, self.T_b, self.T_z, self.T_omega], consider_constant=[V_egM, H_egM, V_CDm, H_CDm]) #regular gradient gradFromMB = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad], allow_input_downcast=True, updates = scan_updates + [(self.T_a, self.T_a + self.aRate*Ta_grad), (self.T_b, self.T_b + self.bRate*Tb_grad), (self.T_z, self.T_z + self.sigmaRate*Tz_grad), (self.T_omega, self.T_omega + self.omegaRate*Tomg_grad)], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #rprop: Code not used Ta_rpropMag = T.mul(T.abs_(self.Ta_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Ta_grad_prev)+T.sgn(Ta_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Ta_grad_prev)+T.sgn(Ta_grad))-np.float32(2.0)))) Ta_rprop = T.mul(T.sgn(Ta_grad),Ta_rpropMag.clip(np.float32(self.epsilon),50)) Tb_rpropMag = T.mul(T.abs_(self.Tb_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tb_grad_prev)+T.sgn(Tb_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tb_grad_prev)+T.sgn(Tb_grad))-np.float32(2.0)))) Tb_rprop = T.mul(T.sgn(Tb_grad),Tb_rpropMag.clip(np.float32(self.epsilon),50)) Tz_rpropMag = T.mul(T.abs_(self.Tz_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tz_grad_prev)+T.sgn(Tz_grad))) + T.mul(self.T_negUpdate, 
T.abs_(T.abs_(T.sgn(self.Tz_grad_prev)+T.sgn(Tz_grad))-np.float32(2.0))) ) Tz_rprop = T.mul(T.sgn(Tz_grad),Tz_rpropMag.clip(np.float32(self.epsilon),50)) Tomg_rpropMag = T.mul(T.abs_(self.Tomg_grad_prev), T.mul(self.T_posUpdate, T.abs_(T.sgn(self.Tomg_grad_prev)+T.sgn(Tomg_grad))) + T.mul(self.T_negUpdate, T.abs_(T.abs_(T.sgn(self.Tomg_grad_prev)+T.sgn(Tomg_grad))-np.float32(2.0)))) Tomg_rprop = T.mul(T.sgn(Tomg_grad),Tomg_rpropMag.clip(np.float32(self.epsilon),50)) gradFromMBrprop = theano.function(inputs=[V_egM], outputs=[Ta_rprop, Tb_rprop, Tz_rprop, Tomg_rprop], allow_input_downcast=True, updates = scan_updates + [(self.T_a, self.T_a + Ta_rprop), (self.T_b, self.T_b + Tb_rprop), (self.T_z, self.T_z + Tz_rprop), (self.T_omega, self.T_omega + Tomg_rprop), (self.Ta_grad_prev, Ta_rprop), (self.Tb_grad_prev, Tb_rprop), (self.Tz_grad_prev, Tz_rprop), (self.Tomg_grad_prev, Tomg_rprop)], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #RMSprop only: [a_grad, b_grad, z_grad, omg_grad] = gradFromMB(V_egMin[0:noOfMiniBatchEx]) #initial RMS correction if (not(self.parameterLoaded) and not(self.parameterSaved)): self.Ta_rms.set_value(np.float32(np.abs(a_grad))) # = theano.shared(value = np.float32(np.abs(a_grad)), name = 'Ta_rms', borrow=True, allow_downcast=True) Tb_rms = theano.shared(value = np.float32(np.abs(b_grad)), name = 'Tb_rms', borrow=True, allow_downcast=True) Tz_rms = theano.shared(value = np.float32(np.abs(z_grad)), name = 'Tz_rms', borrow=True, allow_downcast=True) Tomg_rms = theano.shared(value = np.float32(np.abs(omg_grad)), name = 'Tomg_rms', borrow=True, allow_downcast=True) gradFromMBRMSprop = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad], allow_input_downcast=True, updates = scan_updates + [(self.Ta_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Ta_rms,self.Ta_rms))+T.mul(np.float32(0.1),T.mul(Ta_grad,Ta_grad)))), (Tb_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tb_rms,Tb_rms))+T.mul(np.float32(0.1),T.mul(Tb_grad,Tb_grad)))), (Tz_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tz_rms,Tz_rms))+T.mul(np.float32(0.1),T.mul(Tz_grad,Tz_grad)))), (Tomg_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(Tomg_rms,Tomg_rms))+T.mul(np.float32(0.1),T.mul(Tomg_grad,Tomg_grad)))), (self.T_a, self.T_a + self.aRate*T.mul(Ta_grad,T.maximum(np.float32(self.epsilon),self.Ta_rms)**-1)), (self.T_b, self.T_b + self.bRate*T.mul(Tb_grad,T.maximum(np.float32(self.epsilon),Tb_rms)**-1)), (self.T_z, self.T_z + self.sigmaRate*T.mul(Tz_grad,T.maximum(np.float32(self.epsilon),Tz_rms)**-1)), (self.T_omega, self.T_omega + self.omegaRate*T.mul(Tomg_grad,T.maximum(np.float32(self.epsilon),Tomg_rms)**-1))], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #sparse hidden units optimization + RMSprop: #first calculate probability of hidden units firing given visible examples: aVomg = T.dot(T.mul(T.fill(V_egM, T.exp(-self.T_z)), V_egM), self.T_omega) aT_Hp = T.nnet.sigmoid(T.fill(aVomg, self.T_b) + aVomg)#T.nnet.ultra_fast_sigmoid() did not work for us aT_HpMean = T.mean(aT_Hp) # mean activation over minibatch and all Hk #cross entropy between mean hidden unit activation and target mean activation probability "self.sparseTargetp" sparseHcost = T.mul(np.float32(-self.sparseTargetp), T.log(aT_HpMean)) - T.mul((np.float32(1.0)-self.sparseTargetp), T.log(np.float32(1.0)-aT_HpMean)) Tb_gradH, Tz_gradH, Tomg_gradH = T.grad(cost=sparseHcost, wrt=[self.T_b, self.T_z, self.T_omega], consider_constant=[V_egM]) sparseGradFn = 
theano.function(inputs = [V_egM], outputs =[Tb_gradH, Tz_gradH, Tomg_gradH], allow_input_downcast=True, mode = 'FAST_RUN') [b_gradH, z_gradH, omg_gradH] = sparseGradFn(V_egMin[0:noOfMiniBatchEx]) #initial RMS correction if (not(self.parameterLoaded) and not(self.parameterSaved)): self.Tb_rmsH.set_value(np.float32(np.abs(b_grad - b_gradH))) self.Tz_rmsH.set_value(np.float32(np.abs(z_grad - z_gradH))) self.Tomg_rmsH.set_value(np.float32(np.abs(omg_grad - omg_gradH))) gradSparseH = theano.function(inputs=[V_egM], outputs=[Ta_grad, Tb_grad, Tz_grad, Tomg_grad, Tb_gradH, Tz_gradH, Tomg_gradH], allow_input_downcast=True, updates = scan_updates + [(self.Ta_rms, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Ta_rms,self.Ta_rms))+T.mul(np.float32(0.1),T.mul(Ta_grad,Ta_grad)))), (self.Tb_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tb_rmsH,self.Tb_rmsH))+T.mul(np.float32(0.1),T.mul(Tb_grad-Tb_gradH,Tb_grad-Tb_gradH)))), (self.Tz_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tz_rmsH,self.Tz_rmsH))+T.mul(np.float32(0.1),T.mul(Tz_grad-Tz_gradH,Tz_grad-Tz_gradH)))), (self.Tomg_rmsH, T.sqrt(T.mul(np.float32(0.9),T.mul(self.Tomg_rmsH,self.Tomg_rmsH))+T.mul(np.float32(0.1),T.mul(Tomg_grad-Tomg_gradH,Tomg_grad-Tomg_gradH)))), (self.T_a, self.T_a + self.aRate*T.mul(Ta_grad,T.maximum(np.float32(self.epsilon),self.Ta_rms)**-1)), (self.T_b, self.T_b + self.bRate*T.mul(Tb_grad-Tb_gradH,T.maximum(np.float32(self.epsilon),self.Tb_rmsH)**-1)), (self.T_z, self.T_z + self.sigmaRate*T.mul(Tz_grad-Tz_gradH,T.maximum(np.float32(self.epsilon),self.Tz_rmsH)**-1)), (self.T_omega, self.T_omega + self.omegaRate*T.mul(Tomg_grad-Tomg_gradH,T.maximum(np.float32(self.epsilon),self.Tomg_rmsH)**-1))], mode='FAST_RUN')#NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) #reconstruction errors: [V_egM_recon, H_egM_reconStub, H_meanStubC, V_meanStubC] = self.vtovMBall(V_egM) V_error = V_egM - V_egM_recon V_errorSqr = T.mul(V_error, V_error) reconError = theano.function(inputs = [V_egM], outputs = [T.mean(T.sum(V_errorSqr,axis=1, acc_dtype=theano.config.floatX), acc_dtype=theano.config.floatX)], allow_input_downcast=True, mode='FAST_RUN') print("***************************************************************************************************") print("training network with " + str(self.nv) + " real visible units and " + str(self.nh) + " binary hidden units") print("reconstruction error before training = " + str(np.array(reconError(V_egMin))[0])) noOfMiniBatches = np.int(len(V_egMin)/noOfMiniBatchEx) print("number of mini-batches = " + str(noOfMiniBatches) + ", with " + str(noOfMiniBatchEx) + " examples per mini-batch") print("number of Epochs = " + str(noOfEpoch)) print("***************************************************************************************************") #input images already randomised with consecutive images belonging to different class, use directly as minibatch. 
for j in xrange(noOfEpoch): pretime=time.time() for i in xrange(noOfMiniBatches): [a_upDate, b_upDate, z_upDate, omg_upDate, b_upDateH, z_upDateH, omg_upDateH] = gradSparseH(V_egMin[i*noOfMiniBatchEx:(i+1)*noOfMiniBatchEx]) myErr = reconError(V_egMin) self.likelihood4plot = self.likelihood4plot + [np.float32(myErr)] print("epoch " + str(j) + ": reconstruction error = " + str(myErr[0]) + ", time taken = " + str(time.time() - pretime)) print("\n***************************************************************************************************") print("reconstruction error after training for " + str(noOfEpoch) + " epochs = " + str(np.array(reconError(V_egMin))[0])) self.checkNaN() print("***************************************************************************************************") plt.figure() plt.plot(np.arange(0.0, len(self.likelihood4plot), 1), self.likelihood4plot) plt.show()
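# A minimal NumPy sketch of the RMSprop-style update built above: the running
# RMS of each gradient is refreshed as sqrt(0.9*rms^2 + 0.1*g^2), while the
# parameter step divides the raw gradient by max(epsilon, rms). Because Theano
# applies all updates of one function call simultaneously, the step uses the
# *previous* RMS estimate. Names (param, grad, rms, rate, eps) are
# illustrative, not taken from the class above.
import numpy as np

def rmsprop_step(param, grad, rms, rate=1e-3, eps=1e-6):
    param_new = param + rate * grad / np.maximum(eps, rms)   # uses the old rms
    rms_new = np.sqrt(0.9 * rms ** 2 + 0.1 * grad ** 2)
    return param_new, rms_new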
class SparseDropoutLayer(Layer): """Dropout layer Sets values to zero with probability p. See notes for disabling dropout during testing. Parameters ---------- incoming : a :class:`Layer` instance or a tuple the layer feeding into this layer, or the expected input shape p : float or scalar tensor The probability of setting a value to zero rescale : bool If true the input is rescaled with input / (1-p) when deterministic is False. Notes ----- The dropout layer is a regularizer that randomly sets input values to zero; see [1]_, [2]_ for why this might improve generalization. During training you should set deterministic to false and during testing you should set deterministic to true. If rescale is true the input is scaled with input / (1-p) when deterministic is false, see references for further discussion. Note that this implementation scales the input at training time. References ---------- .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I., Salakhutdinov, R. R. (2012): Improving neural networks by preventing co-adaptation of feature detectors. arXiv preprint arXiv:1207.0580. .. [2] Srivastava Nitish, Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. R. (2014): Dropout: A Simple Way to Prevent Neural Networks from Overfitting. Journal of Machine Learning Research, 5(Jun)(2), 1929-1958. """ def __init__(self, incoming, p=0.5, rescale=True, **kwargs): super(SparseDropoutLayer, self).__init__(incoming, **kwargs) self._srng = RandomStreams(get_rng().randint(1, 2147462579)) self.p = p self.rescale = rescale def get_output_for(self, input, deterministic=False, **kwargs): """ Parameters ---------- input : tensor output from the previous layer deterministic : bool If true dropout and scaling is disabled, see notes """ if deterministic or self.p == 0: return input else: retain_prob = 1 - self.p if self.rescale: input *= 1 / retain_prob # use nonsymbolic shape for dropout mask if possible input_shape = self.input_shape if any(s is None for s in input_shape): input_shape = input.shape return sp.row_scale( input, self._srng.binomial(input_shape[:1], p=retain_prob, dtype=theano.config.floatX))
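# A dense NumPy sketch of what SparseDropoutLayer does through sp.row_scale:
# the input is rescaled by 1/(1-p) and then every *row* is multiplied by a
# single keep/drop flag, so whole rows of the (sparse) matrix survive or
# vanish together. Illustrative only; it does not use Theano's sparse ops.
import numpy as np

def row_dropout(x, p=0.5, rng=np.random):
    retain_prob = 1.0 - p
    flags = rng.binomial(n=1, p=retain_prob, size=(x.shape[0], 1)).astype(x.dtype)
    return (x / retain_prob) * flags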
def discrete_grads(loss, network, LR, update_type, best_params, H, N, th): W_params = lasagne.layers.get_all_params( network, discrete=True) #Get all the weight parameters layers = lasagne.layers.get_all_layers(network) W_grads = [] for layer in layers: params = layer.get_params(discrete=True) if params: W_grads.append(theano.grad( loss, wrt=layer.W)) #Here layer.W = weight_tune(param) updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W_params, learning_rate=LR) for param, parambest in izip(W_params, best_params): L = 2 * H / pow(2, N) #state step length in Z_N a = random.random() #c is a random variable with binary value if a < 0.8: c = 1 else: c = 0 b = random.random() state_rand = T.round( b * pow(2, N) ) * L - H #state_rand is a random state in the discrete weight space Z_N delta_W1 = c * ( state_rand - parambest ) #parambest would transfer to state_rand with probability of a, or keep unmoved with probability of 1-a delta_W1_direction = T.cast(T.sgn(delta_W1), theano.config.floatX) dis1 = T.abs_(delta_W1) #the absolute distance k1 = delta_W1_direction * T.floor(dis1 / L) #the integer part v1 = delta_W1 - k1 * L #the decimal part Prob1 = T.abs_(v1 / L) #the transfer probability Prob1 = T.tanh( th * Prob1 ) #the nonlinear tanh() function accelerates the state transfer delta_W2 = updates[param] - param delta_W2_direction = T.cast(T.sgn(delta_W2), theano.config.floatX) dis2 = T.abs_(delta_W2) #the absolute distance k2 = delta_W2_direction * T.floor(dis2 / L) #the integer part v2 = delta_W2 - k2 * L #the decimal part Prob2 = T.abs_(v2 / L) #the transfer probability Prob2 = T.tanh( th * Prob2 ) #the nonlinear tanh() function accelerates the state transfer srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) Gate1 = T.cast( srng.binomial(n=1, p=Prob1, size=T.shape(Prob1)), theano.config.floatX ) # Gate1 is a binary variable with probability of Prob1 to be 1 Gate2 = T.cast( srng.binomial(n=1, p=Prob2, size=T.shape(Prob2)), theano.config.floatX ) # Gate2 is a binary variable with probability of Prob2 to be 1 delta_W1_new = (k1 + delta_W1_direction * Gate1) * L #delta_W1_new = k*L where k is an integer updates_param1 = T.clip(parambest + delta_W1_new, -H, H) updates_param1 = weight_tune( updates_param1, -H, H ) #fine tuning for guaranteeing each element strictly constrained in the discrete space delta_W2_new = (k2 + delta_W2_direction * Gate2) * L #delta_W2_new = k*L where k is an integer updates_param2 = T.clip(param + delta_W2_new, -H, H) updates_param2 = weight_tune( updates_param2, -H, H ) #fine tuning for guaranteeing each element strictly constrained in the discrete space # if update_type<100, the weight probabilistically tranfers from parambest to state_rand, which helps to search the global minimum # elst it would probabilistically transfer from param to a state nearest to updates[param] updates[param] = T.switch(T.lt(update_type, 100), updates_param1, updates_param2) return updates
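# A NumPy sketch of the stochastic rounding used in discrete_grads: a
# continuous step delta is split into k*L (whole grid steps of length L) plus
# a remainder v, and one extra grid step is taken with probability
# tanh(th * |v| / L). The helper names are illustrative, not from the code.
import numpy as np

def stochastic_round_to_grid(delta, L, th=1.0, rng=np.random):
    direction = np.sign(delta)
    dist = np.abs(delta)
    k = direction * np.floor(dist / L)        # integer part, in grid steps
    v = delta - k * L                         # fractional remainder
    prob = np.tanh(th * np.abs(v / L))        # transfer probability
    gate = rng.binomial(n=1, p=prob)          # 1 -> move one more grid step
    return (k + direction * gate) * L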
class DeepFishNet150: ''' this class represents a convolutional neural network with 3 convolutional layers and 2 fully connected layers connected with a final softmax layer. Predicts 2 classes (objects and non-objects) ''' def __init__(self, imgSize=None, crossvalidid=None, loadData=True, mode=None, modelToLoad=None, randomData=False, dropout_params=None, caffeModelName=None, total_epochs=10): ''' DeepLearningNode constructor. Initializes variables and the weights associated with the network ''' assert (mode != None) self.mode = mode self.srng = RandomStreams() self.imgSize = imgSize self.dataMatTrain = None self.labelMatTrain = None self.dataMatTest = None self.labelMatTest = None self.dropout_params = dropout_params self.total_epochs = total_epochs self.crossvalidid = crossvalidid self.totalTrainSamples = None self.totalTestSamples = None self.randomData = None self.caffeModelName = caffeModelName self.modelToLoad = modelToLoad # initialize your model self.initializeModel() if (mode == 'Train'): print 'call train your model' self.trainThisModel() # save your model for future use #self.saveThisModel() elif (mode == 'Test'): assert (modelToLoad != None) self.loadThisModel(modelToLoad) pass pass def moveToCaffeDir(self): ''' util function to move to cur dir to ''' os.chdir(expanduser('~') + '/Programs/caffe/') pass def matrifyMyData(self): ''' returns labels in one hot form [1, 0], [0, 1], [0, 1], ..... ''' nTrainLabel = [] # nTestLabel = [] self.uniqueClasses = np.unique(self.labelMatTrain) self.nClasses = self.uniqueClasses.shape[0] # print "nClasses ",self.nClasses, self.uniqueClasses dLabel = np.eye(self.nClasses) for i in range(self.totalTrainSamples): #print self.labelMatTrain[i][0], dLabel[self.labelMatTrain[i][0]] nTrainLabel.append(dLabel[self.labelMatTrain[i][0]]) nTrainLabel = np.array(nTrainLabel) self.labelMatTrain = nTrainLabel return def floatX(self, X): ''' float casting your input ''' return np.asarray(X, dtype=theano.config.floatX) def init_weights(self, shape, weightType=None, typeLayer=None, caffeLayerName=None): ''' return randomly initialized weights ''' return theano.shared(self.floatX(np.random.randn(*shape) * 0.01), borrow=True) def sigmoid(self, X): ''' # 1 apply non-linear activation function of the given input. ''' return 1.0 / (1.0 + T.exp(-X)) def rectify(self, X): ''' # 1 apply non-linear activation function of the given input. # 2 aparently ReLU is faster than signoid/tanh ''' return T.maximum(X, 0.) def softmax(self, X): ''' # get your final softmax classifier so that it returns probabilities ''' # make it mathematically easier e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x')) return e_x / e_x.sum(axis=1).dimshuffle(0, 'x') def dropout(self, X, p=0.): ''' # define dropout function with the given probability of retaining ''' if p > 0: retain_prob = 1 - p X *= self.srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) X /= retain_prob return X def RMSprop(self, costC, paramsC, lr=0.02, rho=0.9, epsilon=1e-6): ''' # your cost minimizing function. ''' grads = T.grad(cost=costC, wrt=paramsC) updates = [] ii = 0 # print len(params) # print len(grads) for p, g in zip(paramsC, grads): # ii += 1 # print ii, p.get_value() acc = theano.shared(p.get_value() * 0.) 
acc_new = rho * acc + (1 - rho) * g**2 gradient_scaling = T.sqrt(acc_new + epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - lr * g)) return updates def model(self, X, w1, w2, w3, w4, w5, w_output, p_drop_conv=0, p_drop_hidden=0): ''' # your main model 3 convolutional layers 2 fully connected layers 1 SoftMax layer ''' # first convolutional layer if (self.mode == "Train"): l1a = self.rectify(conv2d(X, w1, border_mode='full')) l1 = max_pool_2d(l1a, (2, 2), st=None, padding=(0, 0), mode='max') l1 = self.dropout(l1, p_drop_conv) elif (self.mode == "Test"): l1a = self.rectify( conv2d(X, w1 * (1 - p_drop_conv), border_mode='full')) l1 = max_pool_2d(l1a, (2, 2), st=None, padding=(0, 0), mode='max') # convOut1 = conv2d(X, w1) convOut1 = l1a # second convolutional layer if (self.mode == "Train"): l2a = self.rectify(conv2d(l1, w2)) l2 = max_pool_2d(l2a, (2, 2), st=None, padding=(0, 0), mode='max') l2 = self.dropout(l2, p_drop_conv) elif (self.mode == "Test"): l2a = self.rectify(conv2d(l1, w2 * (1 - p_drop_conv))) l2 = max_pool_2d(l2a, (2, 2), st=None, padding=(0, 0), mode='max') # # third convolutional layer if (self.mode == "Train"): l3a = self.rectify(conv2d(l2, w3)) l3 = max_pool_2d(l3a, (2, 2), st=None, padding=(0, 0), mode='max') l3 = self.dropout(l3, p_drop_conv) elif (self.mode == "Test"): l3a = self.rectify(conv2d(l2, w3 * (1 - p_drop_conv))) l3 = max_pool_2d(l3a, (2, 2), st=None, padding=(0, 0), mode='max') # # flatten the output l3 = T.flatten(l3, outdim=2) # 1st fully connected layer if (self.mode == "Train"): l4 = self.rectify(T.dot(l3, w4)) l4 = self.dropout(l4, p_drop_hidden) elif (self.mode == "Test"): l4 = self.rectify(T.dot(l3, w4 * (1 - p_drop_hidden))) # 2nd fully connected layer if (self.mode == "Train"): l5 = self.rectify(T.dot(l4, w5)) l5 = self.dropout(l5, p_drop_hidden) elif (self.mode == "Test"): l5 = self.rectify(T.dot(l4, w5 * (1 - p_drop_hidden))) # connected the above output to softmax layer pyx = self.softmax(T.dot(l5, w_output)) return l1, l2, l3, l4, l5, pyx, convOut1 def getL1Norm(self, params, scaleForm=0.0001): ''' get L1 normalization on the weights. performing regularization using L1 reduces the size of the weights and makes them sparse ''' tsum = 0 for eachParam in params: tsum += abs(eachParam).sum() return tsum * scaleForm pass def getL2Norm(self, params, scaleForm=0.0001): ''' get L2 normalization on the weights. performing regularization using L2 gives rise to uniqe solution for the weights ''' tsum = 0 for eachParam in params: tsum += T.sqrt(eachParam**2).sum() return tsum * scaleForm def getL2NormSquare(self, params, scaleForm=0.0001): ''' get L2 (square) normalization on the weights. 
less computationally expensive than pure L2 normalization ''' tsum = 0 for eachParam in params: tsum += abs(eachParam**2).sum() return tsum * scaleForm def initializeModel(self): ''' define your deep learning model ''' print 'defining model' X = T.ftensor4() Y = T.fmatrix() #initialize your weghts, kernels # format n kernels, n channels, kernel_w x kernel_h # 20 kernels on gray scale image with 5 x 5 sized kernel w1 = self.init_weights((20, 3, 5, 5), weightType='Xavier', caffeLayerName='conv1') # 50 20-channel 5 x 5 sized kernel w2 = self.init_weights((50, 20, 5, 5), weightType='Xavier', caffeLayerName='conv2') # 50 50-channel 4 y_x 4 sized kernel w3 = self.init_weights((50, 50, 4, 4), weightType='Xavier', caffeLayerName='conv3') # flatten the inputs and pass to fully connected layer w4 = self.init_weights((14450, 1000), weightType='Xavier') # flatten the inputs and pass to fully connected layer w5 = self.init_weights((1000, 500), weightType='Xavier') # flatten the inputs and pass to fully connected layer w_output = self.init_weights((500, 2), weightType='Xavier') # define your deep model if (self.dropout_params == None): # if there is no default dropout params mentioned, just set them manually self.dropout_params = {} self.dropout_params['conv'] = 0.1 self.dropout_params['fc'] = 0.2 print 'initializing with dropout_params: ', self.dropout_params[ 'conv'], self.dropout_params['fc'] noise_l1, noise_l2, noise_l3, noise_l4, noise_l5, noise_py_x, convOut1 = self.model( X, w1, w2, w3, w4, w5, w_output, p_drop_conv=self.dropout_params['conv'], p_drop_hidden=self.dropout_params['fc']) # get your label from the predicted probabilties y_x = T.argmax(noise_py_x, axis=1) # y_x = noise_py_x >= 0.5 self.learning_rate = 0.0001 self.params = [w1, w2, w3, w4, w5, w_output] L1_norm = self.getL1Norm(self.params) L2_norm = self.getL2Norm(self.params) # pd = np.array(self.params) # mean cross entropy with L2 regularization self.cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y)) self.paramUpdates = self.RMSprop(self.cost, self.params, lr=self.learning_rate) #self.paramUpdates = self.MomentumOptimizer(self.cost, self.params, lr = self.learning_rate) if (self.modelToLoad != None): self.loadThisModel(self.modelToLoad) # self.cost = T.mean((T.nnet.binary_crossentropy(noise_py_x, Y))) print 'compiling functions' print 'current learning rate: ', self.learning_rate start_compilation_time = time.clock() if (self.mode == "Train"): print 'compiling train function startin at ', strftime( "%Y-%m-%d %H:%M:%S") self.train = theano.function(inputs=[X, Y], outputs=self.cost, updates=self.paramUpdates, allow_input_downcast=True) print 'compiling predict function' self.predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True) print 'compiling predictProb function' self.predictProb = theano.function(inputs=[X], outputs=noise_py_x, allow_input_downcast=True) end_compilation_time = time.clock() self.getFirstLayerOutput = theano.function(inputs=[X], outputs=convOut1) print 'compiled the functions, ended at ', strftime( "%Y-%m-%d %H:%M:%S") print 'time takent compile the functions: ', end_compilation_time - start_compilation_time def trainThisModel(self): ''' iterate through the data train the classifier ''' print 'training the model' # self.total_epochs = 10 self.mini_batch_size = 32 still_looping = True for each_epoch in range(self.total_epochs): if (still_looping == False): print 'exiting each_epoch loop' break print '--' * 5 print 'epoch: ', each_epoch print '--' * 5 iterId = 0 costList = [] 
self.cost = 0 # trainFile = '/home/ganymede/Datasets/cross validation/'+str(self.crossvalidid)+'/train_lmdb_'+str(self.imgSize)+'/' # self.testMean = np.load('/home/jason/Desktop/Robotics/Dataset/lmdb generator/data_lmdb.npy') # print trainMean.shape # self.trainMean = np.reshape(trainMean, (1, 3, self.imgSize, self.imgSize)) # print trainMean.shape # print trainFile lmdb_env = lmdb.open(trainFile) lmdb_txn = lmdb_env.begin() lmdb_cursor = lmdb_txn.cursor() datum = caffe_pb2.Datum() dataList = [] labelList = [] for key, value in lmdb_cursor: datum.ParseFromString(value) label = datum.label data = caffe.io.datum_to_array(datum) dataList.append(data) labelList.append(label) if (len(dataList) == self.mini_batch_size and len(labelList) == self.mini_batch_size): self.dataMatTrain = np.array(dataList) self.labelMatTrain = np.array(labelList) self.totalTrainSamples = self.dataMatTrain.shape[0] # print self.dataMatTrain.shape, self.labelMatTrain.shape self.labelMatTrain = self.labelMatTrain.reshape( (self.mini_batch_size, 1)) # print self.dataMatTrain.shape, self.labelMatTrain.shape self.matrifyMyData() # print self.dataMatTrain.shape, self.labelMatTrain.shape trX, trY = self.dataMatTrain, self.labelMatTrain trX = self.getMeanNormalizedData(trX, 'Train') ccost = self.train(trX, trY) # print 'mean: ',np.mean(trX,axis=0) predVals = self.predict(trX) if (iterId % 100 == 0): print 'epoch: ', each_epoch, 'iter: ' + str( iterId) + ', cost: ', ccost costList.append(ccost) pd2 = np.argmax(trY, axis=1) print '--' pass pass iterId += 1 dataList = [] labelList = [] avg_cost = sum(costList) / float(len(costList)) self.saveThisModel('c' + str(self.crossvalidid) + "_" + str(each_epoch) + '_' + str(avg_cost) + '_TempModel_DeepLearningNode_' + str(self.imgSize) + '.npz') def saveThisModel(self, fileName=None): ''' get the variables from the classifier model save the model to disk ''' os.chdir(curDirName) params = [] for eachParam in self.params: params.append(eachParam.get_value()) print len(params) print params[0].shape # try saving with npz format if (fileName == None): fileName = 'MyLeNet_' + str( strftime("%Y-%m-%d %H:%M:%S")) + 'total_epochs_' + str( self.total_epochs) + '.npz' np.savez_compressed(fileName, params=params) print 'saving to ', os.getcwd() print 'saved to ', fileName self.fileName = fileName # self.moveToCaffeDir() pass def loadThisModel(self, modelToLoad): ''' take the path of the classifier load the classifier model assign the variables to initialized model ''' print 'loading this model' print modelToLoad print os.path.exists(modelToLoad) == True params = np.load(modelToLoad) allParams = params['params'] print type(allParams) print allParams.shape for eachParam in range(allParams.shape[0]): print allParams[eachParam].shape #self.params = None for eachParam in range(allParams.shape[0]): self.params[eachParam].set_value(allParams[eachParam]) print 'loaded the saved convnet classifier params' pass def writeFirstLayerToDisk(self, imgArray): ''' take image convolve the image with first layer filters write filters to disk ''' # print imgArray.shape convOut1 = self.getFirstLayerOutput(imgArray) # print convOut1.shape # cv2.imwrite(curDirName+"/vis/0.jpg", convOut1[0, 0,:,:].reshape(204, 204)) # cv2.imwrite(curDirName+"/vis/1.jpg", convOut1[0, 1,:,:].reshape(204, 204)) pass def predictThisImage(self, imgArray=None, meanSubtracted=True): ''' 1 take image get mean normalized image return class of the image ''' if (meanSubtracted == False): imgArray = self.getMeanNormalizedData(imgArray, 'Test') return 
self.predict(imgArray) def predictThisImageWithProbability(self, imgArray=None, meanSubtracted=True): ''' 1 take image get mean normalized image return probabilties of the image being an object, non object ''' if (meanSubtracted == False): imgArray = self.getMeanNormalizedData(imgArray, 'Test') return self.predictProb(imgArray) def getMeanNormalizedData(self, data, mode): ''' # 1 take image # 2 reshape it # 3 subtract mean image # 4 divide by 255 # 5 return image ''' self.testMean = np.load('/home/ganymede/Datasets/cross validation/' + str(self.crossvalidid) + '/test_lmdb_' + str(self.imgSize) + '.npy') self.testMean = np.reshape(self.testMean, (1, 3, self.imgSize, self.imgSize)) data = data.astype('float32') # print 'subtracting mean' if (mode == "Train"): data -= self.trainMean elif (mode == "Test"): data -= self.testMean data = data / float(255.0) return data
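# A NumPy sketch of the numerically stable softmax used in DeepFishNet150:
# subtracting the per-row maximum before exponentiating leaves the result
# unchanged but prevents overflow for large logits. Illustrative, not the
# class method itself.
import numpy as np

def stable_softmax(x):
    e_x = np.exp(x - x.max(axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

# stable_softmax(np.array([[1000.0, 1001.0]])) -> approx. [[0.269, 0.731]]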
class RVal(Elem, TensorWrapped, Masked): # random value def __init__(self, seed=None, **kw): super(RVal, self).__init__(**kw) if seed is None: seed = np.random.randint(0, 1e6) self.rng = RandomStreams(seed=seed) self.value = None def binomial(self, shape, n=1, p=0.5, ndim=None, dtype="int32"): if isinstance(shape, Elem): shape = shape.d self.value = self.rng.binomial(shape, n, p, ndim, dtype) return self def normal(self, shape, avg=0.0, std=1.0, ndim=None, dtype=None): if isinstance(shape, Elem): shape = shape.d self.value = self.rng.normal(shape, avg, std, ndim, dtype) return self def multinomial(self, shape, n=1, pvals=None, without_replacement=False, ndim=None, dtype="int32"): if isinstance(shape, Elem): shape = shape.d if without_replacement: self.value = self.rng.multinomial_wo_replacement( shape, n, pvals, ndim, dtype) else: self.value = self.rng.multinomial(shape, n, pvals, ndim, dtype) return self def gumbel(self, shape, eps=1e-10): if isinstance(shape, Elem): shape = shape.d x = self.rng.uniform(shape, 0.0, 1.0) self.value = -theano.tensor.log(-theano.tensor.log(x + eps) + eps) return self @property def d(self): return self.value @property def v(self): return self.value.eval() @property def allparams(self): return set() @property def allupdates(self): return {} @property def all_extra_outs(self): return {}
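# A NumPy sketch of RVal.gumbel: uniform noise u in (0, 1) is mapped through
# -log(-log(u)), with a small eps guarding both logarithms. Adding such noise
# to logits and taking the argmax samples from the corresponding softmax
# distribution (the Gumbel-max trick). Helper names are illustrative.
import numpy as np

def gumbel_noise(shape, eps=1e-10, rng=np.random):
    u = rng.uniform(0.0, 1.0, size=shape)
    return -np.log(-np.log(u + eps) + eps)

def gumbel_max_sample(logits, rng=np.random):
    return np.argmax(logits + gumbel_noise(np.shape(logits), rng=rng), axis=-1)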
class FourwayLstm(superclass.RNN): ''' Fields: ''' hidden_dimension = 200 input_dimension = 81 srng = None use_cross_entropy_loss = True ''' Initialization: ''' def __init__(self, optimizer_config_path, loss="cross-entropy"): n_layers = 4 self.input_lstm_layer = network_ops.fourdirectional_lstm_layer( 'input_layer_', self.input_dimension * 2 + 1, self.hidden_dimension) self.lstm_layers = [ network_ops.fourdirectional_lstm_layer('layer_' + str(l), self.hidden_dimension * 4, self.hidden_dimension) for l in range(n_layers - 1) ] self.output_convolution = network_ops.linear_layer_on_tensor( 'output_layer', self.hidden_dimension * 4, 1) self.layers = [self.input_lstm_layer ] + self.lstm_layers + [self.output_convolution] self.use_cross_entropy_loss = loss == "cross-entropy" super().__init__('sentence', optimizer_config_path) ''' Theano functions: ''' def __pairwise_features(self, V, Vs, sentence_length): thingy, _ = theano.scan( fn=lambda x, y: T.concatenate([y, T.zeros(1), x]), sequences=Vs, non_sequences=V) root_feature = T.concatenate( (T.ones(1), T.zeros(self.input_dimension))) root_features = T.concatenate((V, root_feature)) flat_version = thingy.flatten() with_root = T.concatenate((root_features, flat_version)) in_shape = T.reshape(with_root, newshape=(sentence_length + 1, self.input_dimension * 2 + 1)) return in_shape def theano_sentence_loss(self, Vs, gold): preds = self.theano_sentence_prediction(Vs) if self.use_cross_entropy_loss: losses = T.nnet.categorical_crossentropy(preds, gold) else: losses = T.pow(preds - gold, 2) return T.sum(losses) def dropout(self, tensor, dropout_prob=0.5, training=True): if not training: return tensor if self.srng is None: self.srng = RandomStreams(seed=12345) keep_prob = 1.0 - dropout_prob mask = self.srng.binomial(size=tensor.shape, p=keep_prob, dtype='floatX') return tensor * mask / keep_prob def theano_sentence_prediction(self, Vs): pairwise_vs, _ = theano.scan(fn=self.__pairwise_features, outputs_info=None, sequences=Vs, non_sequences=[Vs, Vs.shape[0]]) pairwise_vs = self.dropout(pairwise_vs, dropout_prob=0.2, training=self.input_lstm_layer.training) full_matrix = self.input_lstm_layer.function(pairwise_vs) for layer in self.lstm_layers: full_matrix = self.dropout(full_matrix, dropout_prob=0.5, training=self.input_lstm_layer.training) full_matrix = layer.function(full_matrix) full_matrix = self.dropout(full_matrix, dropout_prob=0.5, training=self.input_lstm_layer.training) final_matrix = self.output_convolution.function(full_matrix)[:, :, 0] if self.use_cross_entropy_loss: final_matrix = T.nnet.softmax(final_matrix) return final_matrix
def test_binomial(): #TODO: test size=None, ndim=X #TODO: test size=X, ndim!=X.ndim #TODO: test random seed in legal value(!=0 and other) #TODO: test sample_size not a multiple of guessed #streams #TODO: test size=Var, with shape that change from call to call #we test size in a tuple of int and a tensor.shape. #we test the param p with int. if mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']: sample_size = (10, 50) steps = 50 rtol = 0.02 else: sample_size = (500, 50) steps = int(1e3) rtol = 0.01 x = tensor.matrix() v = tensor.vector() for mean in [0.1, 0.5]: for size, var_input, input in [ (sample_size, [], []), (x.shape, [x], [numpy.zeros(sample_size, dtype=config.floatX)]) ]: #print '' #print 'ON CPU with size=(%s) and mean(%d):' % (str(size), mean) R = MRG_RandomStreams(234, use_cuda=False) # Note: we specify `nstreams` to avoid a warning. u = R.binomial(size=size, p=mean, nstreams=rng_mrg.guess_n_streams(size, warn=False)) f = theano.function(var_input, u, mode=mode) #theano.printing.debugprint(f) out = f(*input) #print 'random?[:10]\n', out[0, 0:10] #print 'random?[-1,-10:]\n', out[-1, -10:] basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input, allow_01=True, target_avg=mean, mean_rtol=rtol) if mode != 'FAST_COMPILE' and cuda_available: #print '' #print 'ON GPU with size=(%s) and mean(%d):' % (str(size), mean) R = MRG_RandomStreams(234, use_cuda=True) u = R.binomial(size=size, p=mean, dtype='float32', nstreams=rng_mrg.guess_n_streams(size, warn=False)) #well, it's really that this test w GPU doesn't make sense otw assert u.dtype == 'float32' f = theano.function( var_input, theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u), borrow=True), mode=mode_with_gpu) #theano.printing.debugprint(f) gpu_out = numpy.asarray(f(*input)) #print 'random?[:10]\n', gpu_out[0, 0:10] #print 'random?[-1,-10:]\n', gpu_out[-1, -10:] basictest(f, steps, sample_size, prefix='mrg gpu', inputs=input, allow_01=True, target_avg=mean, mean_rtol=rtol) numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6) #print '' #print 'ON CPU w NUMPY with size=(%s) and mean(%d):' % (str(size), # mean) RR = theano.tensor.shared_randomstreams.RandomStreams(234) uu = RR.binomial(size=size, p=mean) ff = theano.function(var_input, uu, mode=mode) # It's not our problem if numpy generates 0 or 1 basictest(ff, steps, sample_size, prefix='numpy', allow_01=True, inputs=input, target_avg=mean, mean_rtol=rtol)
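# basictest is not defined in this snippet; a minimal stand-in for the
# statistical check it presumably performs might draw several batches and
# require the overall sample mean to sit within mean_rtol of the target
# probability. This helper is an assumption, not the real basictest.
import numpy

def check_binomial_mean(f, steps, target_avg, mean_rtol, inputs=()):
    samples = [numpy.asarray(f(*inputs)) for _ in range(steps)]
    numpy.testing.assert_allclose(numpy.mean(samples), target_avg, rtol=mean_rtol)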
class SpatialDropoutLayer(Layer): """Spatial dropout layer Sets whole filter activations to zero with probability p. See notes for disabling dropout during testing. Parameters ---------- incoming : a :class:`Layer` instance or a tuple the layer feeding into this layer, or the expected input shape p : float or scalar tensor The probability of setting a value to zero rescale : bool If true the input is rescaled with input / (1-p) when deterministic is False. Notes ----- The spatial dropout layer is a regularizer that randomly sets whole the values of whole features to zero. This is an adaptation of normal dropout, which is generally useful in fully convolutional settings, such as [1]_. It is also called a feature dropout layer. During training you should set deterministic to false and during testing you should set deterministic to true. If rescale is true the input is scaled with input / (1-p) when deterministic is false, see references for further discussion. Note that this implementation scales the input at training time. References ---------- .. [1] Oliveira, G. Valada, A., Bollen, C., Bugard, W., Brox. T. (2016): Deep Learning for Human Part Discovery in Images. IEEE International Conference on Robotics and Automation (ICRA), IEEE, 2016. """ def __init__(self, incoming, p=0.5, rescale=True, **kwargs): super(SpatialDropoutLayer, self).__init__(incoming, **kwargs) self._srng = RandomStreams(get_rng().randint(1, 2147462579)) self.p = p self.rescale = rescale def get_output_for(self, input, deterministic=False, **kwargs): """ Parameters ---------- input : tensor output from the previous layer deterministic : bool If true dropout and scaling is disabled, see notes """ if deterministic or self.p == 0: return input else: # Using theano constant to prevent upcasting one = T.constant(1) retain_prob = one - self.p if self.rescale: input /= retain_prob mask = self._srng.binomial(input.shape[:2], p=retain_prob, dtype=input.dtype) axes = [0, 1] + (['x'] * (input.ndim - 2)) mask = mask.dimshuffle(*axes) return input * mask
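# A NumPy sketch of the spatial (feature-map) dropout above: one keep/drop
# flag is drawn per (example, channel) and broadcast over the remaining
# spatial axes, with the input rescaled by 1/(1-p). Illustrative only.
import numpy as np

def spatial_dropout(x, p=0.5, rng=np.random):
    retain_prob = 1.0 - p
    flag_shape = x.shape[:2] + (1,) * (x.ndim - 2)   # broadcast over H, W, ...
    mask = rng.binomial(n=1, p=retain_prob, size=flag_shape).astype(x.dtype)
    return (x / retain_prob) * mask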
class DropoutLayer(Layer): """Dropout layer Sets values to zero with probability p. See notes for disabling dropout during testing. Parameters ---------- incoming : a :class:`Layer` instance or a tuple the layer feeding into this layer, or the expected input shape p : float or scalar tensor The probability of setting a value to zero rescale : bool If ``True`` (the default), scale the input by ``1 / (1 - p)`` when dropout is enabled, to keep the expected output mean the same. shared_axes : tuple of int Axes to share the dropout mask over. By default, each value can be dropped individually. ``shared_axes=(0,)`` uses the same mask across the batch. ``shared_axes=(2, 3)`` uses the same mask across the spatial dimensions of 2D feature maps. Notes ----- The dropout layer is a regularizer that randomly sets input values to zero; see [1]_, [2]_ for why this might improve generalization. The behaviour of the layer depends on the ``deterministic`` keyword argument passed to :func:`lasagne.layers.get_output`. If ``True``, the layer behaves deterministically, and passes on the input unchanged. If ``False`` or not specified, dropout (and possibly scaling) is enabled. Usually, you would use ``deterministic=False`` at train time and ``deterministic=True`` at test time. See also -------- dropout_channels : Drops full channels of feature maps spatial_dropout : Alias for :func:`dropout_channels` dropout_locations : Drops full pixels or voxels of feature maps References ---------- .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I., Salakhutdinov, R. R. (2012): Improving neural networks by preventing co-adaptation of feature detectors. arXiv preprint arXiv:1207.0580. .. [2] Srivastava Nitish, Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. R. (2014): Dropout: A Simple Way to Prevent Neural Networks from Overfitting. Journal of Machine Learning Research, 5(Jun)(2), 1929-1958. """ def __init__(self, incoming, p=0.5, rescale=True, shared_axes=(), **kwargs): super(DropoutLayer, self).__init__(incoming, **kwargs) #TODO: use same random #self._srng = RandomStreams(get_rng().randint(1, 2147462579)) r = get_rng().randint(1, 2147462579) self._srng = RandomStreams(r) print(self, r) self.p = p self.rescale = rescale self.shared_axes = tuple(shared_axes) def get_output_for(self, input, deterministic=False, **kwargs): if deterministic or self.p == 0: return input else: # Using theano constant to prevent upcasting one = T.constant(1, dtype='int8') retain_prob = one - self.p if self.rescale: input /= retain_prob # use nonsymbolic shape for dropout mask if possible mask_shape = self.input_shape if any(s is None for s in mask_shape): mask_shape = input.shape # apply dropout, respecting shared axes if self.shared_axes: shared_axes = tuple(a if a >= 0 else a + input.ndim for a in self.shared_axes) mask_shape = tuple(1 if a in shared_axes else s for a, s in enumerate(mask_shape)) mask = self._srng.binomial(mask_shape, p=retain_prob, dtype=input.dtype) if self.shared_axes: bcast = tuple(bool(s == 1) for s in mask_shape) mask = T.patternbroadcast(mask, bcast) return input * mask def reinit(self): r = get_rng().randint(1, 2147462579) self._srng = RandomStreams(r) print(self, r)
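# A small sketch of the shared_axes logic in DropoutLayer: the mask shape is
# the input shape with every shared axis collapsed to 1, so a single Bernoulli
# draw is broadcast along those axes (e.g. one mask reused across the batch,
# or across the spatial dimensions of a feature map). Illustrative helper.
def dropout_mask_shape(input_shape, shared_axes=()):
    ndim = len(input_shape)
    shared = tuple(a if a >= 0 else a + ndim for a in shared_axes)
    return tuple(1 if a in shared else s for a, s in enumerate(input_shape))

# dropout_mask_shape((32, 16, 28, 28), shared_axes=(2, 3)) -> (32, 16, 1, 1)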
class GRU(Model): """ A standard GRU model with no output layer. See GRU_softmax or GRU_regression for implementations with an output layer. The output is simply the state of the last hidden layer. """ def __init__(self, input_size, hidden_sizes, activation='tanh', use_layer_normalization=False, drop_prob=0., use_zoneout=False, use_skip_connections=False, seed=1234): """ Parameters ---------- input_size : int Number of units each element Xi in the input sequence X has. hidden_sizes : int, list of int Number of hidden units each GRU should have. activation : str Activation function to apply on the "cell candidate" use_layer_normalization : bool Use LayerNormalization to normalize preactivations and stabilize hidden layer evolution drop_prob : float Dropout/Zoneout probability for recurrent networks. See: https://arxiv.org/pdf/1512.05287.pdf & https://arxiv.org/pdf/1606.01305.pdf use_zoneout : bool Use zoneout implementation instead of dropout (a different zoneout mask will be use at each timestep) use_skip_connections : bool Use skip connections from the input to all hidden layers in the network, and from all hidden layers to the output layer seed : int Random seed used for dropout normalization """ self.graph_updates = OrderedDict() self._gen = None self.input_size = input_size self.hidden_sizes = [hidden_sizes ] if type(hidden_sizes) is int else hidden_sizes self.activation = activation self.use_layer_normalization = use_layer_normalization self.drop_prob = drop_prob self.use_zoneout = use_zoneout self.use_skip_connections = use_skip_connections self.seed = seed self.srng = MRG_RandomStreams(self.seed) layer_class = LayerGRU if self.use_layer_normalization: layer_class = LayerGruNormalized self.layers = [] last_hidden_size = self.input_size for i, hidden_size in enumerate(self.hidden_sizes): self.layers.append( layer_class(last_hidden_size, hidden_size, activation=activation, name="GRU{}".format(i))) last_hidden_size = hidden_size + (input_size if self.use_skip_connections else 0) self.dropout_vectors = {} if self.drop_prob and not self.use_zoneout: p = 1 - self.drop_prob for layer in self.layers: self.dropout_vectors[layer.name] = self.srng.binomial( size=(layer.hidden_size, ), n=1, p=p, dtype=floatX) / p def initialize(self, weights_initializer=initer.UniformInitializer(1234)): for layer in self.layers: layer.initialize(weights_initializer) @property def updates(self): return self.graph_updates @property def hyperparameters(self): hyperparameters = { 'version': 2, 'input_size': self.input_size, 'hidden_sizes': self.hidden_sizes, 'activation': self.activation, 'use_layer_normalization': self.use_layer_normalization, 'drop_prob': self.drop_prob, 'use_zoneout': self.use_zoneout, 'use_skip_connections': self.use_skip_connections, 'seed': self.seed } return hyperparameters @property def parameters(self): parameters = [] for layer in self.layers: parameters += layer.parameters return parameters def get_init_states(self, batch_size): states_h = [] for i, hidden_size in enumerate(self.hidden_sizes): state_h = np.zeros((batch_size, hidden_size), dtype=floatX) states_h.append(state_h) return states_h def _fprop(self, Xi, *args): layers_h = [] input = Xi for i, layer in enumerate(self.layers): drop_states = None drop_value = None if self.drop_prob: if self.use_zoneout: drop_value = 1. drop_states = self.srng.binomial((layer.hidden_size, ), n=1, p=1 - self.drop_prob, dtype=floatX) else: drop_value = 0. 
drop_states = self.dropout_vectors[layer.name] last_h = args[i] h = layer.fprop(input, last_h, drop_states, drop_value) layers_h.append(h) if self.use_skip_connections: input = T.concatenate([h, Xi], axis=-1) else: input = h return tuple(layers_h) def get_output(self, X): outputs_info_h = [] for hidden_size in self.hidden_sizes: outputs_info_h.append(T.zeros((X.shape[0], hidden_size))) results, updates = theano.scan( fn=self._fprop, outputs_info=outputs_info_h, sequences=[ T.transpose(X, axes=(1, 0, 2)) ]) # We want to scan over sequence elements, not the examples. self.graph_updates = updates # Put back the examples so they are in the first dimension. self.h = T.transpose(results[0], axes=(1, 0, 2)) return self.h def save(self, path): savedir = smartutils.create_folder(pjoin(path, type(self).__name__)) smartutils.save_dict_to_json_file(pjoin(savedir, "hyperparams.json"), self.hyperparameters) params = {param.name: param.get_value() for param in self.parameters} assert len(self.parameters) == len( params) # Implies names are all unique. np.savez(pjoin(savedir, "params.npz"), **params) state = { "version": 1, "_srng_rstate": self.srng.rstate, "_srng_state_updates": [ state_update[0].get_value() for state_update in self.srng.state_updates ] } np.savez(pjoin(savedir, "state.npz"), **state) def load(self, path): loaddir = pjoin(path, type(self).__name__) parameters = np.load(pjoin(loaddir, "params.npz")) for param in self.parameters: param.set_value(parameters[param.name]) state = np.load(pjoin(loaddir, 'state.npz')) self.srng.rstate[:] = state['_srng_rstate'] for state_update, saved_state in zip(self.srng.state_updates, state["_srng_state_updates"]): state_update[0].set_value(saved_state) @classmethod def create(cls, path, **kwargs): loaddir = pjoin(path, cls.__name__) hyperparams = smartutils.load_dict_from_json_file( pjoin(loaddir, "hyperparams.json")) hyperparams.update(kwargs) if hyperparams['version'] < 2: hyperparams['drop_prob'] = hyperparams['dropout_prob'] del hyperparams['dropout_prob'] model = cls(**hyperparams) model.load(path) return model
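# LayerGRU.fprop is not shown here, so the way it consumes (drop_states,
# drop_value) is only guessed at below: with drop_value=0 and a precomputed
# rescaled mask this reduces to ordinary inverted dropout on the new hidden
# state; with drop_value=1 and a fresh 0/1 mask per timestep it becomes
# zoneout, where "dropped" units simply carry the previous state forward.
# A NumPy sketch of that assumed combination, not the actual layer code:
import numpy as np

def mix_recurrent_state(h_new, h_prev, mask, drop_value):
    # mask: keep-mask (rescaled for dropout, plain 0/1 for zoneout)
    return mask * h_new + (1.0 - mask) * drop_value * h_prev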
def test_undefined_grad(): srng = MRG_RandomStreams(seed=1234) # checking uniform distribution low = tensor.scalar() out = srng.uniform((), low=low) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, low) high = tensor.scalar() out = srng.uniform((), low=0, high=high) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, high) out = srng.uniform((), low=low, high=high) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, (low, high)) # checking binomial distribution prob = tensor.scalar() out = srng.binomial((), p=prob) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, prob) # checking multinomial distribution prob1 = tensor.scalar() prob2 = tensor.scalar() p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])] out = srng.multinomial(size=None, pvals=p, n=4)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, theano.tensor.sum(out), prob1) p = [theano.tensor.as_tensor_variable([prob1, prob2])] out = srng.multinomial(size=None, pvals=p, n=4)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, theano.tensor.sum(out), (prob1, prob2)) # checking choice p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])] out = srng.choice(a=None, size=1, p=p, replace=False)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0], (prob1, prob2)) p = [theano.tensor.as_tensor_variable([prob1, prob2])] out = srng.choice(a=None, size=1, p=p, replace=False)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0], (prob1, prob2)) p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])] out = srng.choice(a=None, size=1, p=p, replace=False)[0] assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0], prob1) # checking normal distribution avg = tensor.scalar() out = srng.normal((), avg=avg) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg) std = tensor.scalar() out = srng.normal((), avg=0, std=std) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std) out = srng.normal((), avg=avg, std=std) assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, (avg, std))
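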
def __init__(self, nam, W=0, maxlen=0, load=False, training=False): self.W = W # Create two LSTM units (parameters WUb), store them in a dictionary and initialize the parameters # Generate 2 LSTM units with Gaussian initialization # Type: Dictionary self.maxlen = maxlen newp = creatrnnx() self.model_name = nam # Give both LSTM units the same initial parameters (WUb) # Make the weights (WUb) of both LSTM units the same for i in newp.keys(): if i[0] == '1': newp['2' + i[1:]] = newp[i] # Create 5 symbolic tensor variables (y, mask11, mask21, emb11, emb21) # Here, config.floatX = 'float32' y = T.vector('y', dtype=config.floatX) mask11 = T.matrix('mask11', dtype=config.floatX) mask21 = T.matrix('mask21', dtype=config.floatX) emb11 = T.ftensor3('emb11') emb21 = T.ftensor3('emb21') # 3-D float-type tensor # Load an existing model (pre-trained weights) if needed if load == True: newp = pickle.load(open(nam, 'rb')) # Convert 'newp' to shared-tensor-type dictionary 'tnewp' # Shared tensor variable self.tnewp = init_tparams(newp) # Set tensor-type noise use_noise = theano.shared(numpy_floatX(0.)) # Set tensor-type random number generator # rng -> random number generator trng = RandomStreams(1234) # ??? rrng? # create a 3-D random tensor for "dropout"? rate = 0.5 rrng = trng.binomial(emb11.shape, p=1 - rate, n=1, dtype=emb11.dtype) # print "rrng:" # print "type of rrng:", type(rrng) # print rrng # Build the LSTM structure and parameters (the core step); proj holds the output produced for a mini-batch of input # Implement the LSTM module; # Here 'False' -> do NOT apply DROPOUT layers; # Since the input is in the format: (Max No. of words in batch, No. of Samples, 300) # Note: that the 1st term and 2nd term are exchanged! # Only the last timestep of the scan loop, i.e. getpl2(...)[-1], is needed; the earlier LSTM outputs are discarded # proj11[-1] -> (No. of samples[N], Hidden unit dimension[timesteps]) -> (N, 50) # proj11 takes the inputs as embedding matrix emb1 and gives the o/p of the LSTM_A proj11 = getpl2(emb11, '1lstm1', mask11, False, rrng, 50, self.tnewp)[-1] proj21 = getpl2(emb21, '2lstm1', mask21, False, rrng, 50, self.tnewp)[-1] # Define the cost function dif = (proj21 - proj11).norm(L=1, axis=1) s2 = T.exp(-dif) sim = T.clip(s2, 1e-7, 1.0 - 1e-7) # Similarity lr = tensor.scalar(name='lr') # learning rate ys = T.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7) cost = T.mean((sim - ys)**2) ns = emb11.shape[1] self.f2sim = theano.function([emb11, mask11, emb21, mask21], sim, allow_input_downcast=True) self.f_proj11 = theano.function([emb11, mask11], proj11, allow_input_downcast=True) # NOT used self.f_cost = theano.function([emb11, mask11, emb21, mask21, y], cost, allow_input_downcast=True) # NOT used # Prepare for backpropagation and gradient descent if training == True: # Compute the gradients of the cost w.r.t. the parameters and average them across the two LSTMs # gradi holds the gradients w.r.t. the cost; it is a list of gradients used to update the weights # We average out the gradients by appending to another list grads[] # So, we average out the gradients: wrt LSTM_A and wrt LSTM_B # i.e. gradient = (grad(wrt LSTM_A) + grad(wrt LSTM_B)) / 2.0 to maintain the symmetry between the two LSTMs # wrt: (variable or list of variables) – term[s] for which we want gradients gradi = tensor.grad( cost, wrt=self.tnewp.values()) # T.grad -> differential grads = [] l = len(gradi) for i in range(0, l / 2): gravg = (gradi[i] + gradi[i + l / 2]) / (4.0) #print i,i+9 grads.append(gravg) for i in range(0, len(self.tnewp.keys()) / 2): grads.append(grads[i]) # Here, the f_grad_shared and f_update are theano functions self.f_grad_shared, self.f_update = adadelta( lr, self.tnewp, grads, emb11, mask11, emb21, mask21, y, cost)
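# A NumPy sketch of the Manhattan-LSTM similarity defined above: the two
# sentence encodings are compared with an L1 distance, mapped through exp(-d)
# into (0, 1], and fit with a squared error against gold relatedness scores
# rescaled from [1, 5] down to [0, 1]. Illustrative only.
import numpy as np

def manhattan_similarity(h_a, h_b):
    dist = np.abs(h_a - h_b).sum(axis=1)             # L1 distance per pair
    return np.clip(np.exp(-dist), 1e-7, 1.0 - 1e-7)

def similarity_loss(h_a, h_b, y):
    ys = np.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7)  # gold scores in [1, 5]
    return np.mean((manhattan_similarity(h_a, h_b) - ys) ** 2)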
def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None, sparse=False, cost_scale=1.0, input_scale=1.0, L1=0.0, L2=0.0, L2_eye=None, varreg=0.0, output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0, with_bias=True, mask="unity", dropout=0.0, batch_drop=False, batch_norm=False, bn_use_sample=False, layer_drop=0.0, residual=False, carry=False, sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None, dtype='float32', **kwargs): """ :param list[NetworkBaseLayer.Layer] sources: list of source layers :param int n_out: output dim of W_in and dim of bias :param float L1: l1-param-norm regularization :param float L2: l2-param-norm regularization :param str mask: "unity" or "dropout" :type dropout: float """ super(Layer, self).__init__(**kwargs) self.index = index self.sources = sources; ":type: list[Layer]" self.num_sources = len(sources) self.D = max([s.D for s in sources if isinstance(s,Layer)] + [0]) if mask is None: mask = 'none' self.set_attr('mask', mask) self.set_attr('dropout', dropout) self.set_attr('sparse', sparse) self.set_attr('bn_use_sample', bn_use_sample) self.set_attr('sparse_filtering', sparse_filtering) if not trainable: self.set_attr('trainable', trainable) # only store if not default self.gradient_scale = 0.0 # just to be sure else: self.gradient_scale = gradient_scale if gradient_scale != 1.0: self.set_attr('gradient_scale', gradient_scale) self.set_attr('layer_drop', layer_drop) assert not carry, "not supported anymore" self.set_attr('residual', residual) self.set_attr('n_out', n_out) self.set_attr('L1', L1) self.set_attr('L2', L2) if L2_eye: self.set_attr('L2_eye', L2_eye) self.device = device # if device else str(theano.config.device) for s in self.sources: s.transfer_output(self.device) self.set_attr('varreg', varreg) if output_L2_reg: self.set_attr('output_L2_reg', output_L2_reg) if output_entropy_reg: self.set_attr('output_entropy_reg', output_entropy_reg) if output_entropy_exp_reg: self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg) self.set_attr('batch_norm', batch_norm) self.set_attr('input_scale', input_scale) if y_in is not None: self.y_in = {} for k in y_in: if not isinstance(y_in[k], T.Variable): continue self.y_in[k] = time_batch_make_flat(y_in[k]) # TODO: better not flatten here... self.y_in[k].n_out = getattr(y_in[k], "n_out", None) else: self.y_in = None self.constraints = T.constant(0) if target: self.set_attr('target', target) if target_index: self.set_attr('target_index', target_index) assert target_index in self.network.j self.index = index = self.network.j[target_index] if cost_scale != 1: self.set_attr("cost_scale", cost_scale) if with_bias: self.b = self.add_param(self.create_bias(n_out), 'b_%s'%self.name) else: self.set_attr('with_bias', False) self.b = numpy.float32(0) self.mass = T.constant(1., name = "mass_%s" % self.name, dtype='float32') self.masks = [None] * len(self.sources) assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask if mask == "dropout" or (mask == 'none' and dropout > 0): assert 0.0 < dropout < 1.0 # If we apply this mass during training then we don't need any mask or mass for testing. # The expected weight should be 1 in # E[x] = mass * (1-dropout) # so mass has to be 1 / (1 - dropout). 
self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32') from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams srng = RandomStreams(self.rng.randint(1234) + 1) if self.depth > 1: self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],self.depth)), theano.config.floatX) for s in self.sources] else: if batch_drop: self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=s.output.shape), theano.config.floatX) for s in self.sources] else: self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],)), theano.config.floatX) for s in self.sources]
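# A quick numerical check of the comment above: with mass = 1 / (1 - dropout),
# the expectation of mask * mass * x is x itself, so no extra scaling is
# needed at test time. Purely illustrative.
import numpy as np

dropout = 0.3
mass = 1.0 / (1.0 - dropout)
x = 2.5
mask = np.random.binomial(n=1, p=1.0 - dropout, size=1000000)
print(np.mean(mask * mass * x))   # close to 2.5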
def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(1, 10e6)
    rng = RandomStreams(seed=seed)
    return rng.binomial(shape, p=p, dtype=dtype)
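# A usage sketch for random_binomial, assuming the backend module it lives in
# already defines np, RandomStreams and _FLOATX: the returned expression is
# symbolic, so it is typically compiled into a Theano function before samples
# are drawn.
import theano

mask = random_binomial((3, 4), p=0.5, seed=42)
sample = theano.function([], mask)
print(sample())   # a 3x4 array of 0/1 draws with mean around 0.5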
class Generator(object): def __init__(self, args, embedding_layer, nclasses, encoder): self.args = args self.embedding_layer = embedding_layer self.nclasses = nclasses self.encoder = encoder def ready(self): encoder = self.encoder embedding_layer = self.embedding_layer args = self.args padding_id = embedding_layer.vocab_map["<padding>"] dropout = self.dropout = encoder.dropout # len*batch x = self.x = encoder.x z = self.z = encoder.z n_d = args.hidden_dimension n_e = embedding_layer.n_d activation = get_activation_by_name(args.activation) layers = self.layers = [] layer_type = args.layer.lower() for i in xrange(2): if layer_type == "rcnn": l = RCNN( n_in=n_e, # if i == 0 else n_d, n_out=n_d, activation=activation, order=args.order) elif layer_type == "lstm": l = LSTM( n_in=n_e, # if i == 0 else n_d, n_out=n_d, activation=activation) layers.append(l) # len * batch #masks = T.cast(T.neq(x, padding_id), theano.config.floatX) masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x")) # (len*batch)*n_e embs = embedding_layer.forward(x.ravel()) # len*batch*n_e embs = embs.reshape((x.shape[0], x.shape[1], n_e)) embs = apply_dropout(embs, dropout) flipped_embs = embs[::-1] # len*bacth*n_d h1 = layers[0].forward_all(embs) h2 = layers[1].forward_all(flipped_embs) h_final = T.concatenate([h1, h2[::-1]], axis=2) h_final = apply_dropout(h_final, dropout) size = n_d * 2 output_layer = self.output_layer = Layer(n_in=size, n_out=1, activation=sigmoid) # len*batch*1 probs = output_layer.forward(h_final) # len*batch probs2 = probs.reshape(x.shape) self.MRG_rng = MRG_RandomStreams() z_pred = self.z_pred = T.cast( self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8") # we are computing approximated gradient by sampling z; # so should mark sampled z not part of the gradient propagation path # self.z_pred = theano.gradient.disconnected_grad(z_pred) z2 = z.dimshuffle((0, 1, "x")) logpz = -T.nnet.binary_crossentropy(probs, z2) * masks logpz = self.logpz = logpz.reshape(x.shape) probs = self.probs = probs.reshape(x.shape) # batch zsum = T.sum(z, axis=0, dtype=theano.config.floatX) zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0, dtype=theano.config.floatX) loss_mat = encoder.loss_mat if args.aspect < 0: loss_vec = T.mean(loss_mat, axis=1) else: assert args.aspect < self.nclasses loss_vec = loss_mat[:, args.aspect] self.loss_vec = loss_vec coherent_factor = args.sparsity * args.coherent loss = self.loss = T.mean(loss_vec) sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \ T.mean(zdiff) * coherent_factor cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0)) self.obj = T.mean(cost_vec) params = self.params = [] for l in layers + [output_layer]: for p in l.params: params.append(p) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in params) say("total # parameters: {}\n".format(nparams)) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost = l2_cost + T.sum(p**2) l2_cost = l2_cost * args.l2_reg cost = self.cost = cost_logpz * 10 + l2_cost print "cost.dtype", cost.dtype self.cost_e = loss * 10 + encoder.l2_cost
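# A NumPy sketch of the generator's regularizer above: zsum counts how many
# words are selected per example and zdiff counts on/off transitions, so the
# combined penalty encourages short, contiguous rationales. Illustrative only.
import numpy as np

def rationale_penalty(z, sparsity, coherent):
    # z: (length, batch) binary selections, as sampled from the binomial above
    zsum = z.sum(axis=0)
    zdiff = np.abs(z[1:] - z[:-1]).sum(axis=0)
    return zsum * sparsity + zdiff * (sparsity * coherent)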
def apply_dropout(computation_graph, variables, drop_prob, rng=None, seed=None, custom_divisor=None): """Apply dropout to specified variables in a graph. Parameters ---------- computation_graph : instance of :class:`ComputationGraph` The computation graph. variables : list of :class:`~tensor.TensorVariable` Variables to be dropped out. drop_prob : float Probability of dropping out. If you want to apply the dropout with different probabilities for different layers, call it several times. rng : :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` Random number generator. seed : int Random seed to be used if `rng` was not specified. custom_divisor : float or None, optional Divide dropped variables by a given scalar value. If `None`, (default) dropped variables will be divided by `(1 - drop_prob)` which is equivalent to scaling by `(1 - drop_prob)` at test time as recommended in [DROPOUT]_. Returns ------- dropped_computation_graph : instance of :class:`ComputationGraph` A new computation graph with dropout applied to the specified variables. In order to train with, or monitor, the outputs of the original computation graph with dropout applies, use the variables contained in `dropped_computation_graph.outputs`. Notes ----- For more information, see [DROPOUT]_. .. [DROPOUT] Hinton et al. *Improving neural networks by preventing co-adaptation of feature detectors*, arXiv:1207.0580. Examples -------- >>> import numpy >>> from theano import tensor, function >>> from blocks.bricks import MLP, Identity >>> from blocks.filter import VariableFilter >>> from blocks.initialization import Constant >>> from blocks.roles import INPUT >>> linear = MLP([Identity(), Identity()], [2, 10, 2], ... weights_init=Constant(1), biases_init=Constant(2)) >>> x = tensor.matrix('x') >>> y = linear.apply(x) >>> cg = ComputationGraph(y) We are going to drop out all the input variables >>> inputs = VariableFilter(roles=[INPUT])(cg.variables) Here we apply dropout with default setting to our computation graph >>> cg_dropout = apply_dropout(cg, inputs, 0.5) Dropped out variables have role `DROPOUT` and are tagged with `replacement_of` tag. Let's filter these variables and check if they have the links to original ones. >>> dropped_out = VariableFilter(roles=[DROPOUT])(cg_dropout.variables) >>> inputs_referenced = [var.tag.replacement_of for var in dropped_out] >>> set(inputs) == set(inputs_referenced) True Compiling theano functions to forward propagate in original and dropped out graphs >>> fprop = function(cg.inputs, cg.outputs[0]) >>> fprop_dropout = function(cg_dropout.inputs, cg_dropout.outputs[0]) Initialize an MLP and apply these functions >>> linear.initialize() >>> fprop(numpy.ones((3, 2), ... dtype=theano.config.floatX)) # doctest:+ELLIPSIS array([[ 42., 42.], [ 42., 42.], [ 42., 42.]]... >>> fprop_dropout(numpy.ones((3, 2), ... dtype=theano.config.floatX)) # doctest:+ELLIPSIS array([[ 0., 0.], [ 0., 0.], [ 0., 0.]]... And after the second run answer is different >>> fprop_dropout(numpy.ones((3, 2), ... dtype=theano.config.floatX)) # doctest:+ELLIPSIS array([[ 0., 52.], [ 100., 0.], [ 0., 0.]]... 
""" if not rng and not seed: seed = 1 if not rng: rng = MRG_RandomStreams(seed) if custom_divisor is None: divisor = (1 - drop_prob) else: divisor = custom_divisor replacements = [ (var, var * rng.binomial(var.shape, p=1 - drop_prob, dtype=theano.config.floatX) / divisor) for var in variables ] for variable, replacement in replacements: add_role(replacement, DROPOUT) replacement.tag.replacement_of = variable return computation_graph.replace(replacements)
def fprop(self, state_below): """ :development: (1) what is the shape of state_below? Does it account for batches? - let's assume that it uses the (time, batch, data) approach in the original code, so need some changes (2) do _scan_updates do anything important? """ z0 = T.alloc(np.cast[theano.config.floatX](0), self.n_hid) c0 = T.alloc(np.cast[theano.config.floatX](0), self.n_hid) # z0 = T.alloc(np.cast[theano.config.floatX](0), state_below.shape[0], self.n_hid) # c0 = T.alloc(np.cast[theano.config.floatX](0), state_below.shape[0], self.n_hid) if state_below.shape[0] == 1: z0 = T.unbroadcast(z0, 0) c0 = T.unbroadcast(c0, 0) Wxh = self.Wxh Whh = self.Whh bxh = self.bxh state_below_input = T.dot(state_below, self.I_x) + self.I_b state_below_forget = T.dot(state_below, self.F_x) + self.F_b state_below_output = T.dot(state_below, self.O_x) + self.O_b state_below = T.dot(state_below, Wxh) + bxh # probability that a given connection is dropped is self.dropout_prob # the 'p' parameter to binomial determines the likelihood of returning a 1 # is the mask value is a 1, then the connection is not dropped # therefore 1 - dropout_prob gives the prob of droping a node (aka prob of 0) theano_rng = MRG_RandomStreams(max(self.rng.randint(2**15), 1)) mask = theano_rng.binomial(p=self.dropout_prob, size=state_below.shape, dtype=state_below.dtype) def fprop_step(state_below, state_below_input, state_below_forget, state_below_output, mask, state_before, cell_before, Whh): i_on = T.nnet.sigmoid(state_below_input + T.dot(state_before, self.I_h) + T.dot(cell_before, self.I_c)) f_on = T.nnet.sigmoid(state_below_forget + T.dot(state_before, self.F_h) + T.dot(cell_before, self.F_c)) c_t = state_below + T.dot(state_before, Whh) c_t = f_on * cell_before + i_on * T.tanh(c_t) o_on = T.nnet.sigmoid(state_below_output + T.dot(state_before, self.O_h) + T.dot(c_t, self.O_c)) z = o_on * T.tanh(c_t) # either carry the new values (z) or carry the old values (state_before) z = z * mask + (1 - mask) * state_before return z, c_t ((z, c), updates) = scan(fn=fprop_step, sequences=[ state_below, state_below_input, state_below_forget, state_below_output, mask ], outputs_info=[z0, c0], non_sequences=[Whh]) if self.return_indices is not None: if len(self.return_indices) > 1: return [z[i] for i in self.return_indices] else: return z[self.return_indices[0]] else: return z