def prediction(self, h, bias):
    srng = RandomStreams(seed=42)

    prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
        self.compute_parameters(h, bias)

    mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

    v = T.arange(0, mean_x.shape[0])
    m_x = mean_x[v, mode]
    m_y = mean_y[v, mode]
    s_x = std_x[v, mode]
    s_y = std_y[v, mode]
    r = rho[v, mode]
    # cov = r * (s_x * s_y)

    normal = srng.normal((h.shape[0], 2))
    x = normal[:, 0]
    y = normal[:, 1]

    # x_n = T.shape_padright(s_x * x + cov * y + m_x)
    # y_n = T.shape_padright(s_y * y + cov * x + m_y)

    x_n = T.shape_padright(m_x + s_x * x)
    y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1. - r ** 2)))

    uniform = srng.uniform((h.shape[0],))
    pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

    return T.concatenate([x_n, y_n, pin], axis=1)
def CTC_train(self):
    CTC_LOSSs = T.cast(T.mean(self.CTC_LOSS(), axis=0), "float32")
    train_data_d = []
    train_data_m = []
    train_data_m_s = []
    learning_rate = T.scalar()
    decay = T.scalar()
    seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    grad_rate = 0.8
    for data in self.train_data:
        data_d = rng.binomial((1,), p=grad_rate, dtype="float32")[0] * T.grad(CTC_LOSSs, data)
        train_data_d.append(data_d)
        data_m_s = theano.shared(np.zeros(data.get_value().shape).astype(np.float32))
        train_data_m_s.append(data_m_s)
        data_m = data_m_s * decay + (1 - decay) * data_d ** 2
        train_data_m.append(data_m)

    # self.grad_test = theano.function([self.X, self.Y], train_data_d[-4])
    # self.data_d_print = theano.function([self.X, self.Y], train_data_d[0][0])
    # upd = [(d, d - learning_rate * d_d) for d, d_d in zip(self.train_data, train_data_d)]
    upd = [(d, d - learning_rate * d_d / T.sqrt(d_m + 1e-4))
           for d, d_d, d_m in zip(self.train_data, train_data_d, train_data_m)]
    upd1 = [(d_m_s, decay * d_m_s + (1 - decay) * d_d ** 2)
            for d_m_s, d_d in zip(train_data_m_s, train_data_d)]
    upd += upd1
    # self.test = theano.function([self.X, self.Y], train_data_d[0])

    self.sgd_train = theano.function([self.X, self.Y, learning_rate, decay],
                                     [],
                                     updates=upd)
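# Note (added): the updates built above follow an RMSProp-style rule -- each
# parameter keeps a running average of squared gradients and the step is
# scaled by its inverse square root. A minimal NumPy sketch of that
# accumulator, with made-up values for the gradient, decay and learning rate:
import numpy as np

g = np.array([0.5, -2.0])            # hypothetical gradient
acc = np.zeros_like(g)               # running E[g^2], starts at zero
decay, lr = 0.9, 0.01
acc = decay * acc + (1 - decay) * g ** 2
step = lr * g / np.sqrt(acc + 1e-4)  # same form as the `upd` list above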
def _negative_sampling(self, num_negative_samples, target_indices):
    assert num_negative_samples > 0

    logging.debug('Stochastically sampling %d negative instances '
                  'out of %d classes (%.2f%%).',
                  num_negative_samples, self.num_entities,
                  100.0 * float(num_negative_samples) / self.num_entities)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(
        seed=np.random.randint(low=0, high=(1 << 30)))

    rng_sample_size = (self.batch_size, num_negative_samples,)

    logging.debug(
        'Using %s for random sample generation of %s tensors.',
        RandomStreams, rng_sample_size)

    logging.debug('For every batch %d random integers are sampled.',
                  np.prod(rng_sample_size))

    random_negative_indices = srng.choice(
        rng_sample_size,
        a=self.num_entities,
        p=self.clazz_distribution)

    if self.__DEBUG__:
        random_negative_indices = theano.printing.Print(
            'random_negative_indices')(random_negative_indices)

    return random_negative_indices
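# Minimal standalone sketch (added) of the srng.choice call used above: draw a
# fixed number of class indices per batch row according to a class
# distribution. The batch size of 4, the 10 classes and the uniform
# distribution are illustrative assumptions, not values from the code above.
import numpy as np
import theano
from theano.tensor.shared_randomstreams import RandomStreams

_srng = RandomStreams(seed=123)
_class_probs = np.ones(10) / 10.0                  # assumed class distribution
_neg = _srng.choice((4, 3), a=10, p=_class_probs)  # 3 negatives per instance
_draw = theano.function([], _neg)
# _draw() returns a (4, 3) int64 array with entries in [0, 10)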
def common_init(self, mr, vr, sr, di, ce, node_id):
    """
    Initialization function used by the base class and subclasses.
    """
    self.MEANRATE = mr
    self.VARRATE = vr
    self.STARVRATE = sr
    self.DIMS = di
    self.CENTS = ce
    self.ID = node_id

    srng = RandomStreams(seed=100)
    rv_u = srng.uniform((self.CENTS, self.DIMS))
    f = function([], rv_u)
    self.mean = 2 * f()
    # print self.mean

    var1 = T.dscalar('var1')
    var2 = T.dmatrix('var2')
    var3 = T.mul

    self.var = theanoScaMatMul(0.001, np.ones((self.CENTS, self.DIMS)))
    self.starv = np.ones((self.CENTS, 1))
    self.belief = np.zeros((1, self.CENTS))
    self.children = []
    self.last = np.zeros((1, self.DIMS))
    self.whitening = False
def test_binomial_vector(self):
    random = RandomStreams(utt.fetch_seed())
    n = tensor.lvector()
    prob = tensor.vector()
    out = random.binomial(n=n, p=prob)
    assert out.ndim == 1
    f = function([n, prob], out)

    n_val = [1, 2, 3]
    prob_val = numpy.asarray([.1, .2, .3], dtype=config.floatX)
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(n_val, prob_val)
    numpy_val0 = numpy_rng.binomial(n=n_val, p=prob_val)
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(n_val[:-1], prob_val[:-1])
    numpy_val1 = numpy_rng.binomial(n=n_val[:-1], p=prob_val[:-1])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([n, prob], random.binomial(n=n, p=prob, size=(3,)))
    val2 = g(n_val, prob_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy_rng.binomial(n=n_val, p=prob_val, size=(3,))
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, n_val[:-1], prob_val[:-1])
def test_uniform_vector(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.dvector()
    high = tensor.dvector()
    out = random.uniform(low=low, high=high)
    assert out.ndim == 1
    f = function([low, high], out)

    low_val = [.1, .2, .3]
    high_val = [1.1, 2.2, 3.3]
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(low_val, high_val)
    numpy_val0 = numpy_rng.uniform(low=low_val, high=high_val)
    print('THEANO', val0)
    print('NUMPY', numpy_val0)
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(low_val[:-1], high_val[:-1])
    numpy_val1 = numpy_rng.uniform(low=low_val[:-1], high=high_val[:-1])
    print('THEANO', val1)
    print('NUMPY', numpy_val1)
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([low, high], random.uniform(low=low, high=high, size=(3,)))
    val2 = g(low_val, high_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy_rng.uniform(low=low_val, high=high_val, size=(3,))
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
def test_random_integers_vector(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.lvector()
    high = tensor.lvector()
    out = random.random_integers(low=low, high=high)
    assert out.ndim == 1
    f = function([low, high], out)

    low_val = [100, 200, 300]
    high_val = [110, 220, 330]
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(low_val, high_val)
    numpy_val0 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(low_val[:-1], high_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val[:-1], high_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([low, high],
                 random.random_integers(low=low, high=high, size=(3,)))
    val2 = g(low_val, high_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
def __init__(self, n_visible, n_hidden, weights=None, hidden_bias=None,
             visible_bias=None, random_on_gpu=False, seed=69,
             activation=T.nnet.sigmoid):
    self.n_visible = n_visible
    self.n_hidden = n_hidden

    if random_on_gpu:
        self.t_rng = GPU_RandomStreams(seed)
    else:
        self.t_rng = RandomStreams(seed)

    if weights is None:
        weights = np.asarray(
            np.random.normal(
                scale=0.01,
                size=(self.n_visible, self.n_hidden)),
            dtype=theano.config.floatX)
    self.ts_weights = theano.shared(value=weights, name='W', borrow=True)

    if hidden_bias is None:
        hidden_bias = np.zeros(n_hidden, dtype=theano.config.floatX)
    self.ts_hidden_bias = theano.shared(value=hidden_bias, name='hb',
                                        borrow=True)

    if visible_bias is None:
        visible_bias = np.zeros(n_visible, dtype=theano.config.floatX)
    self.ts_visible_bias = theano.shared(value=visible_bias, name='vb',
                                         borrow=True)

    self.x = T.matrix(name='x')
    self.activation = activation
    self.params = [self.ts_weights, self.ts_hidden_bias, self.ts_visible_bias]
def test_default_dtype(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.dscalar()
    high = tensor.dscalar()

    # Should not silently downcast from low and high
    out0 = random.uniform(low=low, high=high, size=(42,))
    assert out0.dtype == 'float64'
    f0 = function([low, high], out0)
    val0 = f0(-2.1, 3.1)
    assert val0.dtype == 'float64'

    # Should downcast, since asked explicitly
    out1 = random.uniform(low=low, high=high, size=(42,), dtype='float32')
    assert out1.dtype == 'float32'
    f1 = function([low, high], out1)
    val1 = f1(-1.1, 1.1)
    assert val1.dtype == 'float32'

    # Should use floatX
    lowf = tensor.fscalar()
    highf = tensor.fscalar()
    outf = random.uniform(low=lowf, high=highf, size=(42,))
    assert outf.dtype == config.floatX
    ff = function([lowf, highf], outf)
    valf = ff(numpy.float32(-0.1), numpy.float32(0.3))
    assert valf.dtype == config.floatX
def test_multinomial_vector(self):
    random = RandomStreams(utt.fetch_seed())
    n = tensor.lvector()
    pvals = tensor.matrix()
    out = random.multinomial(n=n, pvals=pvals)
    assert out.ndim == 2
    f = function([n, pvals], out)

    n_val = [1, 2, 3]
    pvals_val = [[.1, .9], [.2, .8], [.3, .7]]
    pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(n_val, pvals_val)
    numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(n_val[:-1], pvals_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val[:-1], pvals_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([n, pvals], random.multinomial(n=n, pvals=pvals, size=(3,)))
    val2 = g(n_val, pvals_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, n_val[:-1], pvals_val[:-1])
def theano_sentence_prediction(self, Sentence, Chars, WordLengths):
    input_lstm_res_f = self.input_lstm_forward_layer.function(Sentence, Chars, WordLengths)
    input_lstm_res_b = self.input_lstm_backward_layer.function(Sentence, Chars, WordLengths)
    input_combined = T.concatenate((input_lstm_res_f, input_lstm_res_b), axis=1)

    # Make pairwise features. This is really just "tensor product with
    # concatenation instead of multiplication". Is there a command for that?
    full_matrix, _ = theano.scan(fn=self.__pairwise_features,
                                 outputs_info=None,
                                 sequences=input_combined,
                                 non_sequences=[input_combined, Sentence.shape[0]])

    if len(self.lstm_layers) > 0 and self.lstm_layers[0].training:
        srng = RandomStreams(seed=12345)
        full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0] + 1, self.hidden_dimension * 4), p=0.5),
                               full_matrix, 0)
    else:
        full_matrix = 0.5 * full_matrix

    full_matrix = self.transition_layer.function(full_matrix)

    for layer in self.lstm_layers:
        if layer.training:
            print("hah-train")
            full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0] + 1, self.hidden_dimension * 4), p=0.5),
                                   full_matrix, 0)
        else:
            print("heh-notrain")
            full_matrix = 0.5 * full_matrix

        full_matrix = layer.function(full_matrix)

    final_matrix = self.output_convolution.function(full_matrix)

    return T.nnet.softmax(final_matrix)
def _dropout_from_layer(self, layer):
    stream = RandomStreams(self.numpy_range.randint(999999))
    mask = stream.binomial(size=layer.shape, n=1, p=(1 - self._p),
                           dtype=theano.config.floatX)
    return layer * Tensor.cast(mask, theano.config.floatX)
def __init__(self, rng, train_input, test_input, n_in, n_out):
    # self.input = input.flatten(2)

    self.W = theano.shared(
        value=numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        ),
        name='W',
        borrow=True
    )

    self.b = theano.shared(
        value=numpy.zeros((n_out,), dtype=theano.config.floatX),
        name='b',
        borrow=True
    )

    p = 0.5
    tmp_output = T.nnet.relu(T.dot(train_input.flatten(2), self.W) + self.b)
    srng = RandomStreams(rng.randint(1234))
    mask = (srng.uniform(size=tmp_output.shape) < p) / p
    self.train_output = tmp_output * mask

    self.test_output = T.nnet.relu(T.dot(test_input.flatten(2), self.W) + self.b)

    self.params = [self.W, self.b]
def __init__(self, classifier, args, noise_dist):
    self.y = T.ivector("y")

    ## Cost function
    # Sum over minibatch instances of
    #   log(u(w|c) / (u(w|c) + k * p_n(w)))
    #   + sum over noise samples of log(u(x|c) / (u(x|c) + k * p_n(x)))

    # Generating noise samples
    srng = RandomStreams(seed=1234)
    noise_samples = srng.choice(size=(self.y.shape[0], args.num_noise_samples),
                                a=args.num_classes, p=noise_dist, dtype="int32")

    log_noise_dist = theano.shared(np.log(noise_dist.get_value()), borrow=True)
    # log_num_noise_samples = theano.shared(math.log(args.num_noise_samples)).astype(theano.config.floatX)
    log_num_noise_samples = theano.shared(np.log(args.num_noise_samples,
                                                 dtype=theano.config.floatX))

    # Data part of the cost function: log(u(w|c) / (u(w|c) + k * p_n(w)))
    data_scores = classifier.output[T.arange(self.y.shape[0]), self.y]
    data_denom = self.logadd(data_scores,
                             log_num_noise_samples + log_noise_dist[self.y])
    data_prob = data_scores - data_denom

    # Summation of the noise part of the cost function:
    #   sum over noise samples of log(u(x|c) / (u(x|c) + k * p_n(x)))
    # log(k) + log(p_n(x)) for all noise samples (shape: #instances x k)
    noise_mass = log_num_noise_samples + log_noise_dist[noise_samples]
    noise_scores = classifier.output[T.arange(noise_samples.shape[0]).reshape((-1, 1)), noise_samples]
    noise_denom = self.logadd(noise_scores, noise_mass)
    noise_prob_sum = T.sum(noise_mass - noise_denom, axis=1)

    self.cost = -T.mean(data_prob + noise_prob_sum)
    self.test = T.sum(data_scores)
def test_normal_vector(self):
    random = RandomStreams(utt.fetch_seed())
    avg = tensor.dvector()
    std = tensor.dvector()
    out = random.normal(avg=avg, std=std)
    assert out.ndim == 1
    f = function([avg, std], out)

    avg_val = [1, 2, 3]
    std_val = [.1, .2, .3]
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(avg_val, std_val)
    numpy_val0 = numpy_rng.normal(loc=avg_val, scale=std_val)
    assert numpy.allclose(val0, numpy_val0)

    # Arguments of size (2,)
    val1 = f(avg_val[:-1], std_val[:-1])
    numpy_val1 = numpy_rng.normal(loc=avg_val[:-1], scale=std_val[:-1])
    assert numpy.allclose(val1, numpy_val1)

    # Specifying the size explicitly
    g = function([avg, std], random.normal(avg=avg, std=std, size=(3,)))
    val2 = g(avg_val, std_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy_rng.normal(loc=avg_val, scale=std_val, size=(3,))
    assert numpy.allclose(val2, numpy_val2)
    self.assertRaises(ValueError, g, avg_val[:-1], std_val[:-1])
def __init__(self, neurons, dimensions, count=1, max_rate=(200, 300),
             intercept=(-1.0, 1.0), t_ref=0.002, t_rc=0.02, seed=None,
             type='lif', dt=0.001, encoders=None, name=None,
             address="localhost"):
    self.seed = seed
    self.neurons = neurons
    self.dimensions = dimensions
    self.count = count
    self.name = name
    self.address = address
    self.ticker_conn = None

    # create the neurons
    # TODO: handle different neuron types, which may have different
    #       parameters to pass in
    self.neuron = neuron.names[type]((count, self.neurons),
                                     t_rc=t_rc, t_ref=t_ref, dt=dt)

    # compute alpha and bias
    srng = RandomStreams(seed=seed)
    max_rates = srng.uniform([neurons], low=max_rate[0], high=max_rate[1])
    threshold = srng.uniform([neurons], low=intercept[0], high=intercept[1])
    alpha, self.bias = theano.function(
        [], self.neuron.make_alpha_bias(max_rates, threshold))()
    self.bias = self.bias.astype('float32')

    # compute encoders
    self.encoders = make_encoders(neurons, dimensions, srng, encoders=encoders)
    self.encoders = (self.encoders.T * alpha).T

    # make default origin
    self.origin = dict(X=origin.Origin(self))

    self.accumulator = {}
def theano_sentence_prediction(self, Vs):
    # Make pairwise features. This is really just "tensor product with
    # concatenation instead of multiplication". Is there a command for that?
    pairwise_vs, _ = theano.scan(fn=self.__pairwise_features,
                                 outputs_info=None,
                                 sequences=Vs,
                                 non_sequences=[Vs, Vs.shape[0]])

    if self.input_lstm_layer.training:
        srng = RandomStreams(seed=12345)

    full_matrix = self.input_lstm_layer.function(pairwise_vs)

    for layer in self.lstm_layers:
        if self.input_lstm_layer.training:
            print("hah-train")
            full_matrix = T.switch(srng.binomial(size=(Vs.shape[0], Vs.shape[0] + 1, self.hidden_dimension * 4), p=0.5),
                                   full_matrix, 0)
        else:
            print("heh-notrain")
            full_matrix = 0.5 * full_matrix

        full_matrix = layer.function(full_matrix)

    if self.input_lstm_layer.training:
        print("hah-train")
        full_matrix = T.switch(srng.binomial(size=(Vs.shape[0], Vs.shape[0] + 1, self.hidden_dimension * 4), p=0.5),
                               full_matrix, 0)
    else:
        print("heh-notrain")
        full_matrix = 0.5 * full_matrix

    final_matrix = self.output_convolution.function(full_matrix)

    return T.nnet.softmax(final_matrix)
def kmeans(train_set_x):
    if train_set_x is None:
        train_set_x = T.matrix('train_set_x')

    ########################
    # Normalize the inputs #
    ########################

    epsilon_norm = 10
    epsilon_zca = 0.015
    K = 500

    train_set_x = (train_set_x - T.mean(train_set_x, axis=0)) / T.sqrt(T.var(train_set_x, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    # A simple choice of whitening transform is the ZCA whitening transform;
    # epsilon_zca is a small constant.
    # For contrast-normalized data, setting epsilon_zca to 0.01 for 16-by-16
    # pixel patches, or to 0.1 for 8-by-8 pixel patches, is a good starting point.
    cov = T.dot(train_set_x, T.transpose(train_set_x)) / train_set_x.shape[1]
    U, S, V = linalg.svd(cov)
    tmp = T.dot(U, T.diag(1 / T.sqrt(S + epsilon_zca)))
    tmp = T.dot(tmp, T.transpose(U))
    whitened_x = T.dot(tmp, train_set_x)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimension_size = whitened_x.shape[0]
    num_samples = whitened_x.shape[1]
    srng = RandomStreams(seed=234)

    D = srng.normal(size=(dimension_size, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    # typically 10 iterations is enough
    num_iteration = 15

    # compute new centroids, D_new
    for i in xrange(num_iteration):
        dx = T.dot(D.T, whitened_x)
        arg_max_dx = T.argmax(dx, axis=0)
        s = dx[arg_max_dx, T.arange(num_samples)]

        S = T.zeros((K, num_samples))
        S = T.set_subtensor(S[arg_max_dx, T.arange(num_samples)], s)
        D = T.dot(whitened_x, T.transpose(S)) + D

        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
def build_model(tparams, options):
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    if options['use_target_as_input']:
        x = tensor.tensor3('x', dtype='float32')
    else:
        x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x dim
    ctx = tensor.matrix('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding
    if options['use_target_as_input']:
        emb = x
    else:
        emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                    n_samples,
                                                    options['dim_word']])

    # decoder
    if options.setdefault('feedforward', False):
        proj_h = tensor.dot(emb, tparams['Wff'])
        proj_h = (proj_h * mask[:, :, None]).sum(axis=0)
        proj_h = proj_h / mask.sum(axis=0)[:, None]
    elif options.setdefault('regress', False):
        proj_h = (emb * mask[:, :, None]).sum(axis=0)
        proj_h = tensor.dot(proj_h, tparams['Wff'])
        proj_h = proj_h / mask.sum(axis=0)[:, None]
    else:
        proj = get_layer('lstm')[1](tparams, emb, options,
                                    prefix='encoder', mask=mask)
        proj_h = proj[0]
        if options['use_mean']:
            proj_h = (proj_h * mask[:, :, None]).sum(axis=0)
            proj_h = proj_h / mask.sum(axis=0)[:, None]
        else:
            proj_h = proj_h[-1]

    if 'n_layers' in options:
        for lidx in xrange(1, options['n_layers']):
            proj_h = get_layer('ff')[1](tparams, proj_h, options,
                                        prefix='ff_out_%d' % lidx,
                                        activ='tanh')
    out = get_layer('ff')[1](tparams, proj_h, options,
                             prefix='ff_out', activ='linear')

    # cost
    if options['loss_type'] == 'cosine':
        out = out / tensor.sqrt((out ** 2).sum(1))[:, None]
        cost = 1. - (out * ctx).sum(1)
    elif options['loss_type'] == 'ranking':
        out = out / tensor.sqrt((out ** 2).sum(1))[:, None]
        rndidx = trng.permutation(n=ctx.shape[0])
        ctx_rnd = ctx[rndidx]
        cost = tensor.maximum(0., 1 - (out * ctx).sum(1) + (out * ctx_rnd).sum(1))
    else:
        raise Exception('Unknown loss function')

    return trng, use_noise, x, mask, ctx, cost
def __init__(self, X, mask, shape, is_train=1, p=0.5, state_pre=None):
    prefix = "GRU"
    self.in_size, self.hidden_size = shape

    self.W_xr = theano.shared(value=np.asarray(np.random.randn(self.in_size, self.hidden_size) * 0.1,
                                               dtype=theano.config.floatX),
                              name=prefix + '_W_xr')
    self.W_hr = theano.shared(value=np.asarray(np.random.randn(self.hidden_size, self.hidden_size) * 0.1,
                                               dtype=theano.config.floatX),
                              name=prefix + '_W_hr')
    self.b_r = theano.shared(value=np.asarray(np.zeros(self.hidden_size), dtype=theano.config.floatX),
                             name=prefix + '_b_r')

    self.W_xz = theano.shared(value=np.asarray(np.random.randn(self.in_size, self.hidden_size) * 0.1,
                                               dtype=theano.config.floatX),
                              name=prefix + '_W_xz')
    self.W_hz = theano.shared(value=np.asarray(np.random.randn(self.hidden_size, self.hidden_size) * 0.1,
                                               dtype=theano.config.floatX),
                              name=prefix + '_W_hz')
    self.b_z = theano.shared(value=np.asarray(np.zeros(self.hidden_size), dtype=theano.config.floatX),
                             name=prefix + '_b_z')

    self.W_xh = theano.shared(value=np.asarray(np.random.randn(self.in_size, self.hidden_size) * 0.1,
                                               dtype=theano.config.floatX),
                              name=prefix + '_W_xh')
    self.W_hh = theano.shared(value=np.asarray(np.random.randn(self.hidden_size, self.hidden_size) * 0.1,
                                               dtype=theano.config.floatX),
                              name=prefix + '_W_hh')
    self.b_h = theano.shared(value=np.asarray(np.zeros(self.hidden_size), dtype=theano.config.floatX),
                             name=prefix + '_b_h')

    self.X = X
    self.mask = mask

    batch_size = self.X.shape[1]
    if state_pre is None:
        state_pre = T.zeros((batch_size, self.hidden_size), dtype=theano.config.floatX)

    def _step(x, m, h_tm1):
        r = T.nnet.sigmoid(T.dot(x, self.W_xr) + T.dot(h_tm1, self.W_hr) + self.b_r)
        z = T.nnet.sigmoid(T.dot(x, self.W_xz) + T.dot(h_tm1, self.W_hz) + self.b_z)
        gh = T.tanh(T.dot(x, self.W_xh) + T.dot(r * h_tm1, self.W_hh) + self.b_h)
        h_t = z * h_tm1 + (T.ones_like(z) - z) * gh
        h_t = h_t * m[:, None]
        return h_t

    h, _ = theano.scan(fn=_step,
                       sequences=[self.X, self.mask],
                       outputs_info=state_pre)
    self.h = h

    if p > 0:
        trng = RandomStreams(12345)
        drop_mask = trng.binomial(n=1, p=1 - p, size=h.shape, dtype=theano.config.floatX)
        self.activation = T.switch(T.eq(is_train, 1), h * drop_mask, h * (1 - p))
    else:
        self.activation = T.switch(T.eq(is_train, 1), h, h)

    self.params = [self.W_xr, self.W_hr, self.b_r,
                   self.W_xz, self.W_hz, self.b_z,
                   self.W_xh, self.W_hh, self.b_h]
def __init__(self, input, filter_shape, corruption_level=0.1,
             shared_W=None, shared_b=None, image_shape=None,
             poolsize=(2, 2)):
    theano_rng = RandomStreams()

    fan_in = numpy.prod(filter_shape[1:])
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])

    center = theano.shared(value=1, name="center")
    scale = theano.shared(value=2, name="scale")

    if shared_W is not None and shared_b is not None:
        self.W = shared_W
        self.b = shared_b
    else:
        initial_W = numpy.asarray(
            numpy.random.uniform(
                low=-numpy.sqrt(6. / (fan_in + fan_out)),
                high=numpy.sqrt(6. / (fan_in + fan_out)),
                size=filter_shape),
            dtype=theano.config.floatX)
        initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.W = theano.shared(value=initial_W, name="W")
        self.b = theano.shared(value=initial_b, name="b")

    initial_b_prime = numpy.zeros((filter_shape[1],), dtype=theano.config.floatX)
    self.b_prime = theano.shared(value=initial_b_prime, name="b_prime")

    self.x = input

    self.tilde_x = theano_rng.binomial(self.x.shape, 1, 1 - corruption_level,
                                       dtype=theano.config.floatX) * self.x

    conv1_out = conv.conv2d(self.tilde_x, self.W,
                            filter_shape=filter_shape,
                            image_shape=image_shape,
                            border_mode='valid')

    self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x'))

    da_filter_shape = [filter_shape[1], filter_shape[0],
                       filter_shape[2], filter_shape[3]]
    initial_W_prime = numpy.asarray(
        numpy.random.uniform(
            low=-numpy.sqrt(6. / (fan_in + fan_out)),
            high=numpy.sqrt(6. / (fan_in + fan_out)),
            size=da_filter_shape),
        dtype=theano.config.floatX)
    self.W_prime = theano.shared(value=initial_W_prime, name="W_prime")

    conv2_out = conv.conv2d(self.y, self.W_prime,
                            filter_shape=da_filter_shape,
                            border_mode='full')

    self.z = (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x')) + center) / scale

    scaled_x = (self.x + center) / scale

    self.L = - T.sum(scaled_x * T.log(self.z) + (1 - scaled_x) * T.log(1 - self.z), axis=1)

    self.cost = T.mean(self.L)

    self.params = [self.W, self.b, self.b_prime]
def dropout(rng, x, p=0.5):
    """ Zero-out random values in x with probability p using rng """
    if p > 0. and p < 1.:
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=1. - p, size=x.shape,
                             dtype=theano.config.floatX)
        return x * mask
    return x
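# Hedged usage sketch (added) for the `dropout` helper above: the surviving
# activations are not rescaled at training time, so callers typically multiply
# by (1 - p) at test time, as several other snippets in this file do. Shapes
# and seed below are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(0)
x = T.matrix('x')
y_train = dropout(rng, x, p=0.5)   # random binary mask applied
y_test = (1. - 0.5) * x            # matching expectation at test time
f_train = theano.function([x], y_train)
f_test = theano.function([x], y_test)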
def dropout(random_state, X, keep_prob=0.5):
    if keep_prob > 0. and keep_prob < 1.:
        seed = random_state.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=keep_prob, size=X.shape,
                             dtype=theano.config.floatX)
        return X * mask
    return X
def __init__(self, input, rescale, recentre):
    srng = RandomStreams(seed=234)
    self.input = input
    dequantize_input = input + srng.uniform(size=input.shape,
                                            low=-0.5 / 255, high=0.5 / 255)
    self.output = rescale * (dequantize_input - recentre)
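# Standalone sketch (added) of the dequantisation idea used above: intensities
# quantised to a 1/255 grid get uniform noise of width 1/255 added so the model
# sees a continuous input. The rescale/recentre constants are assumptions.
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

_srng = RandomStreams(seed=234)
_x = T.matrix('x')                                  # values on a 1/255 grid
_x_deq = _x + _srng.uniform(size=_x.shape, low=-0.5 / 255, high=0.5 / 255)
_x_out = 2.0 * (_x_deq - 0.5)                       # assumed rescale=2, recentre=0.5
_dequantize = theano.function([_x], _x_out)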
def __init__(self, rng, x, n_in, n_h, p, training, rnn_batch_training=False):
    """ This is to initialise a standard RNN hidden unit

    :param rng: random state, fixed value for random state for reproducible objective results
    :param x: input data to current layer
    :param n_in: dimension of input data
    :param n_h: number of hidden units/blocks
    :param p: the probability of dropout
    :param training: a binary value to indicate training or testing (for dropout training)
    """
    self.input = x

    if p > 0.0:
        if training == 1:
            srng = RandomStreams(seed=123456)
            self.input = T.switch(srng.binomial(size=x.shape, p=p), x, 0)
        else:
            self.input = (1 - p) * x  # (1-p) *

    self.n_in = int(n_in)
    self.n_h = int(n_h)

    self.rnn_batch_training = rnn_batch_training

    # random initialisation
    Wx_value = np.asarray(rng.normal(0.0, 1.0 / np.sqrt(n_in), size=(n_in, n_h)), dtype=config.floatX)
    Wh_value = np.asarray(rng.normal(0.0, 1.0 / np.sqrt(n_h), size=(n_h, n_h)), dtype=config.floatX)

    # Input gate weights
    self.W_xi = theano.shared(value=Wx_value, name='W_xi')
    self.W_hi = theano.shared(value=Wh_value, name='W_hi')

    # bias
    self.b_i = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='b_i')

    # initial value of hidden and cell state
    if self.rnn_batch_training:
        self.h0 = theano.shared(value=np.zeros((1, n_h), dtype=config.floatX), name='h0')
        self.c0 = theano.shared(value=np.zeros((1, n_h), dtype=config.floatX), name='c0')

        self.h0 = T.repeat(self.h0, x.shape[1], 0)
        self.c0 = T.repeat(self.c0, x.shape[1], 0)
    else:
        self.h0 = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='h0')
        self.c0 = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='c0')

    self.Wix = T.dot(self.input, self.W_xi)

    [self.h, self.c], _ = theano.scan(self.recurrent_as_activation_function,
                                      sequences=[self.Wix],
                                      outputs_info=[self.h0, self.c0])

    self.output = self.h

    self.params = [self.W_xi, self.W_hi, self.b_i]

    self.L2_cost = (self.W_xi ** 2).sum() + (self.W_hi ** 2).sum()
def expr(self, model, data, **kwargs):
    """ Overwrites the Cost.expr so we can inject our theano.Op. """
    space, source = self.get_data_specs(model)
    space.validate(data)
    # really no point to using these random values. Could be zeros
    srng = RandomStreams(seed=234)
    return OverwriteOp(self.cost, model)(
        srng.uniform(low=0.0, high=1000.0, dtype=theano.config.floatX),
        data)
def test_symbolic_shape(self):
    random = RandomStreams(utt.fetch_seed())
    shape = tensor.lvector()
    f = function([shape], random.uniform(size=shape, ndim=2))

    assert f([2, 3]).shape == (2, 3)
    assert f([4, 8]).shape == (4, 8)
    self.assertRaises(ValueError, f, [4])
    self.assertRaises(ValueError, f, [4, 3, 4, 5])
def Kmeans(X_train=None, K=300, epsilon_whitening=0.015):

    if X_train is None:
        X_train = T.matrix("X_train")

    ########################
    # Normalize the inputs #
    ########################

    # A constant added to the variance to avoid division by zero
    epsilon_norm = 10

    # We subtract from each training sample (each column in X_train) its mean
    # and divide by the standard deviation
    X_train = (X_train - T.mean(X_train, axis=0)) / T.sqrt(T.var(X_train, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    sigma = T.dot(X_train, T.transpose(X_train)) / X_train.shape[1]
    U, s, V = linalg.svd(sigma, full_matrices=False)
    tmp = T.dot(U, T.diag(1 / T.sqrt(s + epsilon_whitening)))
    tmp = T.dot(tmp, T.transpose(U))
    X_Whitened = T.dot(tmp, X_train)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimensions = X_Whitened.shape[0]
    samples = X_Whitened.shape[1]
    srng = RandomStreams(seed=234)

    # We initialize the centroids by sampling them from a normal
    # distribution, and then normalizing them to unit length
    # D \in R^{n \times k}
    D = srng.normal(size=(dimensions, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    iterations = 30

    for i in xrange(iterations):

        # Initialize new point representations
        # for every pass of the algorithm
        S = T.zeros((K, samples))

        tmp = T.dot(D.T, X_Whitened)
        res = T.argmax(tmp, axis=0)
        max_values = tmp[res, T.arange(samples)]
        S = T.set_subtensor(S[res, T.arange(samples)], max_values)

        D = T.dot(X_Whitened, T.transpose(S)) + D
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
def __init__(self, n_v, n_h, inputs, vbias=None, hbias=None, initial_W=None,
             v_unit='BIN', unit_type='LOG'):
    '''
    v_unit: str, optional, default: 'BIN'
        This variable controls the output unit of our RBM.
        The possible values are ['BIN', 'LOG', 'GAUSS']

    unit_type: str, optional, default: 'LOG'
        This variable controls the activation function of the unit,
        'LIN' -> W.h + b
        'LOG' -> sig(W.h + b)
    '''
    self.type = unit_type
    if initial_W is None:
        initial_W = np.asarray(np.random.uniform(
            low=-4 * np.sqrt(6. / (n_v + n_h)),
            high=4 * np.sqrt(6. / (n_v + n_h)),
            size=(n_v, n_h)),
            dtype=theano.config.floatX)
    if hbias is None:
        hbias = theano.shared(value=np.zeros(n_h, dtype=theano.config.floatX),
                              name='hbias')
    if vbias is None:
        vbias = theano.shared(value=np.zeros(n_v, dtype=theano.config.floatX),
                              name='vbias')

    e1 = np.zeros((n_v, n_h), dtype=theano.config.floatX)
    e2 = np.zeros((n_h, n_v), dtype=theano.config.floatX)

    self.inputs = inputs
    self.shape = (n_v, n_h)

    self.W = theano.shared(value=initial_W, name='W')
    self.eps_up = theano.shared(value=e1, name='eps_u')
    self.eps_down = theano.shared(value=e2, name='eps_d')
    self.vbias = vbias
    self.hbias = hbias

    np_rng = np.random.RandomState()
    theano_rng = RandomStreams(np_rng.randint(2**30))
    self.v_type = v_unit
    if v_unit == 'LOG':
        theano_rng.v_unit = self.log_sample
    elif v_unit == 'GAUSS':
        theano_rng.v_unit = self.gauss_sample
    else:
        theano_rng.v_unit = theano_rng.binomial
    self.theano_rng = theano_rng

    self.params = [self.W, self.vbias, self.hbias]
    self.params_ft = [self.eps_up, self.eps_down]

    self.hid = theano.function([self.inputs], self.up(self.inputs))
def maxout(Z, stop_dropout, archi, dropout_rate, seed=5432):
    th.config.floatX = 'float32'
    Z_out = T.maximum(Z[:, :int(archi / 2)], Z[:, int(archi / 2):])
    prob = (1 - dropout_rate)
    srng = RandomStreams(seed=seed)

    return ifelse(T.lt(stop_dropout, 1.05),
                  Z_out * srng.binomial(size=T.shape(Z_out),
                                        p=prob).astype('float32'),
                  Z_out)
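# Usage sketch (added) for `maxout` above, assuming the snippet's own imports
# (`th`, `ifelse`, `T`, `RandomStreams`) are in scope. The batch of 3 rows and
# `archi` of 8 columns are made-up values; the result has shape (3, 4), and
# passing a `stop_dropout` value above 1.05 makes the ifelse skip the mask.
import numpy as np
import theano
import theano.tensor as T

Z = T.matrix('Z')
stop_dropout = T.scalar('stop_dropout')
out = maxout(Z, stop_dropout, archi=8, dropout_rate=0.2)
f = theano.function([Z, stop_dropout], out)
z_val = np.random.randn(3, 8).astype(theano.config.floatX)
train_out = f(z_val, 1.0)   # dropout mask applied
test_out = f(z_val, 2.0)    # dropout skipped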
import download_datasets.mnist

filename = r'./data/mnist.pkl.gz'
train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

if args.reinit:
    init_batch_size = min(64, size)
    init_batch = train_x[:size][-init_batch_size:].reshape(init_batch_size, 784)
else:
    init_batch = None

if args.model == 'BHN_MLPWN':
    model = MLPWeightNorm_BHN(lbda=lbda,
                              perdatapoint=perdatapoint,
                              srng=RandomStreams(seed=args.seed + 2000),
                              prior=prior,
                              coupling=coupling,
                              n_hiddens=n_hiddens,
                              n_units=n_units,
                              flow=args.flow,
                              noise_distribution=args.noise_distribution,
                              init_batch=init_batch)
elif args.model == 'BHN_MLPCD':
    model = MLPConcreteDropout_BHN(lbda=lbda,
                                   alpha=args.alpha,
                                   beta=args.beta,
                                   perdatapoint=perdatapoint,
                                   srng=RandomStreams(seed=args.seed + 2000),
                                   prior=prior,
def __init__(
    self,
    numpy_rng,
    n_visible,
    n_hidden,
    theano_rng=None,
    input=None,
    theta=None,
    bvis=None
):
    """
    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to generate weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden
    self.input = input

    # create a Theano random generator that gives symbolic random values
    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # initialize theta = (W, b) with 0s; W gets the shape (n_visible, n_hidden),
    # while b is a vector of n_out elements, making theta a vector of
    # n_visible*n_hidden + n_hidden elements
    if not theta:
        theta_values = numpy.asarray(
            numpy_rng.uniform(
                low=-4 * numpy.sqrt(6. / (n_hidden + n_visible + 1)),
                high=4 * numpy.sqrt(6. / (n_hidden + n_visible + 1)),
                size=(n_visible * n_hidden + n_hidden)
            ),
            dtype=theano.config.floatX
        )
        theta = theano.shared(
            value=theta_values,
            name='theta',
            borrow=True
        )
    self.theta = theta

    # W is represented by the first n_visible*n_hidden elements of theta
    W = self.theta[0:n_visible * n_hidden].reshape((n_visible, n_hidden))
    # b is the rest (last n_hidden elements)
    bhid = self.theta[n_visible * n_hidden:n_visible * n_hidden + n_hidden]

    if not bvis:
        bvis_values = numpy.asarray(
            numpy_rng.uniform(self.n_visible,),
            dtype=theano.config.floatX
        )
        bvis = theano.shared(
            value=bvis_values,
            borrow=True
        )

    self.W = W
    # b corresponds to the bias of the hidden
    self.b = bhid
    # b_prime corresponds to the bias of the visible
    self.b_prime = bvis
    # tied weights, therefore W_prime is W transpose
    self.W_prime = self.W.T
    self.theano_rng = theano_rng

    # if no input is given, generate a variable representing the input
    if input is None:
        self.x = T.matrix(name='input')
    else:
        self.x = input

    self.params = [self.theta, self.b_prime]
    self.train_cost_array = []
    self.valid_error_array = []
    self.epoch = 0
class SS_ReconsSRBM: def reset_rng(self): self.rng = N.random.RandomState([12.,9.,2.]) self.theano_rng = RandomStreams(self.rng.randint(2**30)) if self.initialized: self.redo_theano() # def __getstate__(self): d = copy.copy(self.__dict__) #remove everything set up by redo_theano for name in self.names_to_del: if name in d: del d[name] print "WARNING: not pickling random number generator!!!!" del d['theano_rng'] return d def __setstate__(self, d): self.__dict__.update(d) #self.redo_theano() # todo: make some way of not running this, so it's possible to just open something up and look at its weights fast without recompiling it def weights_format(self): return ['v','h'] def get_dimensionality(self): return 0 def important_error(self): return 2 def __init__(self, nvis, nhid, learning_rate, irange, init_c, mean_field_iters, q_damping_factor, s_default_damping_factor, tau, fancy_damp, no_damp_iters, persistent_chains, init_a, init_alpha, init_beta, gibbs_iters, enc_weight_decay, use_cd, instrumented = False): self.initialized = False self.fancy_damp = fancy_damp if fancy_damp: assert q_damping_factor == s_default_damping_factor self.s_default_damping_factor = s_default_damping_factor self.tau = tau assert type(tau) == type(1.) self.reset_rng() self.nhid = nhid self.nvis = nvis self.learning_rate = learning_rate self.ERROR_RECORD_MODE_MONITORING = 0 self.error_record_mode = self.ERROR_RECORD_MODE_MONITORING self.init_weight_mag = irange self.force_batch_size = 0 self.init_c = init_c self.init_a = init_a self.init_alpha = init_alpha self.persistent_chains = persistent_chains self.mean_field_iters = mean_field_iters self.no_damp_iters = no_damp_iters self.gibbs_iters = gibbs_iters self.q_damping_factor = q_damping_factor self.enc_weight_decay = N.cast[floatX](enc_weight_decay) self.names_to_del = [] self.use_cd = use_cd self.init_beta = init_beta self.instrumented = instrumented self.redo_everything() def set_error_record_mode(self, mode): self.error_record_mode = mode def set_size_from_dataset(self, dataset): self.nvis = dataset.get_output_dim() self.redo_everything() self.b.set_value( dataset.get_marginals(), borrow=False) # def get_input_dim(self): return self.nvis def get_output_dim(self): return self.nhid def redo_everything(self): self.initialized = True self.error_record = [] if self.instrumented: self.instrument_record = InstrumentRecord() # self.examples_seen = 0 self.batches_seen = 0 self.W = shared( N.cast[floatX](self.rng.uniform(-self.init_weight_mag, self.init_weight_mag, (self.nvis, self.nhid ) ) )) self.W.name = 'W' self.c = shared( N.cast[floatX](N.zeros(self.nhid) + self.init_c) ) self.c.name = 'c' self.b = shared( N.cast[floatX](N.zeros(self.nvis))) self.b.name = 'b' self.chains = shared ( N.cast[floatX]( N.zeros((self.persistent_chains,self.nvis))) ) self.chains.name = 'chains' self.a = shared(N.cast[floatX](N.zeros(self.nhid)+self.init_a)) self.a.name = 'a' self.alpha = shared(N.cast[floatX] (N.zeros(self.nhid)+self.init_alpha)) self.alpha.name = 'alpha' self.beta = shared(N.cast[floatX] (N.zeros(self.nvis)+self.init_beta)) self.beta.name = 'beta' self.params = [ self.W, self.a, self.b, self.c, self.alpha, self.beta ] self.clip = [ 0, 0, 0, 0, 1, 1 ] self.redo_theano() # def expected_energy(self, V, Q, Mu1): name = V.name #V = Print('V.'+V.name,attrs=['min','mean','max'])(V); V.name = name #Q = #Print('Q.'+V.name,attrs=['min','mean','max'])(Q) #Mu1 = #Print('Mu1.'+V.name,attrs=['min','mean','max'])(Mu1) ugly = Q*(1/self.gamma+T.sqr(Mu1)) - T.sqr(Q)*T.sqr(Mu1) #ugly = 
#Print('ugly',attrs=['shape'])(ugly) ugly.name = 'ugly' term_1 = 0.5 * T.dot(self.w, T.mean(ugly,axis=0)) term_1.name = 'term_1' #term_1 = #Print('term_1')(term_1) recons = T.dot(Q*Mu1,self.W.T) #recons = #Print('recons',attrs=['shape'])(recons) recons.name = 'recons' iterm = 0.5*self.nvis*T.mean(T.sqr(recons)*self.beta) #iterm = #Print('iterm',attrs=['shape'])(iterm) #iterm = #Print('iterm')(iterm) iterm.name = 'iterm' normalized_vis = self.beta * (V-self.b) main_term = - self.nvis * T.mean(normalized_vis*recons) #main_term = #Print('main_term',attrs=['shape'])(main_term) #main_term = #Print('main_term')(main_term) normalized_vis.name = 'normalized_vis' #normalized_vis = #Print('normalized_vis',attrs=['shape'])(normalized_vis) main_term.name = 'main_term' S = (1-Q)*(T.sqr(self.a)/T.sqr(self.alpha)+1./self.alpha) + Q*(T.sqr(Mu1)+1./self.gamma) #S = #Print('S',attrs=['shape'])(S) #S = #Print('S.'+V.name)(S) S.name = 'S' contain_s = 0.5 * T.mean(T.dot(S,self.alpha)) #contain_s = #Print('contain_s',attrs=['shape'])(contain_s) #contain_s = #Print('contain_s')(contain_s) contain_s.name = 'contain_s' vis_bias = - self.nvis * T.mean(normalized_vis) #vis_bias = #Print('vis_bias',attrs=['shape'])(vis_bias) #vis_bias = #Print('vis_bias')(vis_bias) vis_bias.name = 'vis_bias' contain_v = 0.5 * T.mean(T.dot(T.sqr(V),self.beta)) #contain_v = #Print('contain_v',attrs=['shape'])(contain_v) #contain_v = #Print('contain_v')(contain_v) contain_v.name = 'contain_v' hid_bias = -T.mean(T.dot(Q,self.c)) #hid_bias = #Print('hid_bias',attrs=['shape'])(hid_bias) #hid_bias = #Print('his_bias')(hid_bias) hid_bias.name = 'hid_bias' s_bias = -T.mean(T.dot(Q*Mu1+(1.-Q)*(self.a/self.alpha),self.a)) #s_bias = #Print('s_bias',attrs=['s_bias'])(s_bias) #s_bias = #Print('s_bias')(s_bias) s_bias.name = 's_boas' rval = term_1 + iterm + main_term + contain_s + vis_bias \ + contain_v + hid_bias + s_bias rval.name = 'rval' assert len(rval.type().broadcastable) == 0 return rval def redo_theano(self): init_names = dir(self) if 'theano_rng' not in dir(self): assert self.initialized print "WARNING: pickle did not contain theano_rng, starting from default one" self.reset_rng() return self.W_T = self.W.T self.w = T.sum(self.beta * T.sqr(self.W).T,axis=1) self.w.name = 'w' #self.alpha = #Print('alpha',attrs=['min','mean','max'])(self.alpha) #self.w = #Print('w',attrs=['min','mean','max'])(self.w) self.gamma = self.alpha + self.w #self.gamma = #Print('gamma',attrs=['min','mean','max'])(self.gamma) lr = T.scalar() X = T.matrix() X.name = 'X' pos_Q, pos_Mu1 = self.infer_Q_Mu1(X) pos_Q.name = 'pos_Q' pos_Mu1.name = 'pos_Mu1' self.H_exp_func = function([X],pos_Q) self.Mu1_func = function([X],pos_Mu1) self.hid_exp_func = function([X],pos_Q*pos_Mu1) if self.use_cd: samples = [ X ] else: samples = [ self.chains ] outside_pos_Q = shared(N.cast[floatX](N.zeros((1,1)))) outside_neg_Q = shared(N.cast[floatX](N.zeros((1,1)))) outside_pos_Mu1 = shared(N.cast[floatX](N.zeros((1,1)))) outside_neg_Mu1 = shared(N.cast[floatX](N.zeros((1,1)))) for i in xrange(self.gibbs_iters): if i == 0 and not self.use_cd: #if using SML, the first Q of gibbs sampling was already computed during the #previous call to learn_mini_batch samples.append(self.gibbs_step( Q = outside_neg_Q, Mu1 = outside_neg_Mu1) ) else: samples.append(self.gibbs_step( V = samples[-1])) # # #if using SML, this needs to be called on the first mini batch to make sure outside_neg_Q is initialized first_Q, first_Mu1 = self.infer_Q_Mu1(self.chains) self.set_up_sampler = function([],updates=[ 
(outside_neg_Q, first_Q), (outside_neg_Mu1, first_Mu1)]) self.first_mini_batch = True final_sample = samples[-1] final_sample.name = 'final_sample' neg_Q, neg_Mu1 = self.infer_Q_Mu1(final_sample) neg_Q.name = 'neg_Q' neg_Mu1.name = 'neg_Mu1' sampling_updates = [ (outside_pos_Q, pos_Q), (outside_neg_Q, neg_Q), (outside_pos_Mu1, pos_Mu1), (outside_neg_Mu1, neg_Mu1) ] if not self.use_cd: sampling_updates.append((self.chains,final_sample)) self.run_sampling = function([X], updates = sampling_updates, name = 'run_sampling') obj = self.expected_energy(X,outside_pos_Q, outside_pos_Mu1) \ - self.expected_energy(self.chains,outside_neg_Q, outside_neg_Mu1) \ + self.enc_weight_decay * T.mean(T.sqr(self.W)) grads = [ T.grad(obj,param) for param in self.params ] learning_updates = [] for i in xrange(len(self.params)): update = self.params[i] - lr * grads[i] #update = #Print(self.params[i].name+' preclip',attrs=['min','mean','max'])(update) if self.clip[i]: update = T.clip(update,.1,1000) # learning_updates.append((self.params[i],update)) # self.learn_from_samples = function([X, lr], updates = learning_updates , name='learn_from_samples') self.recons_func = function([X], self.gibbs_step_exp(X) , name = 'recons_func') self.sample = function([X], self.gibbs_step(X), name = 'sample_func') if self.instrumented: self.make_instruments() # final_names = dir(self) self.names_to_del = [ name for name in final_names if name not in init_names ] def learn(self, dataset, batch_size): self.learn_mini_batch(dataset.get_batch_design(batch_size)) def error_func(self, x): return N.square( x - self.recons_func(x)).mean() def record_monitoring_error(self, dataset, batch_size, batches): print 'running on monitoring set' assert self.error_record_mode == self.ERROR_RECORD_MODE_MONITORING w = self.W.get_value(borrow=True) #alpha = self.alpha.get_value(borrow=True) beta = self.beta.get_value(borrow=True) #print "alpha summary: "+str( (alpha.min(),alpha.mean(),alpha.max())) print "beta summary: "+str( (beta.min(), beta.mean(), beta.max())) if N.any(N.isnan(w)): raise Exception("Nan") print 'weights summary: '+str( (w.min(),w.mean(),w.max())) errors = [] if self.instrumented: self.clear_instruments() for i in xrange(batches): x = dataset.get_batch_design(batch_size) error = self.error_func(x) errors.append( error ) if self.instrumented: self.update_instruments(x) # # self.error_record.append( (self.examples_seen, self.batches_seen, N.asarray(errors).mean() ) ) if self.instrumented: self.instrument_record.begin_report(examples_seen = self.examples_seen, batches_seen = self.batches_seen) self.make_instrument_report() self.instrument_record.end_report() self.clear_instruments() # print 'monitoring set done' # def recons_from_Q_Mu1(self,Q,Mu1): return self.b + T.dot(Q*Mu1, self.W.T) # def recons_err_from_Q_Mu1(self,Q,Mu1,V): return T.mean(T.sqr(V-self.recons_from_Q_Mu1(Q,Mu1))) def binary_entropy(self,Q): mod_Q = 1e-6 + (1.-2e-6)*Q return -(mod_Q * T.log(Q) + (1.-mod_Q)*T.log(1.-mod_Q)) def make_instruments(self): assert not self.use_cd #currently just supports PCD recons_outputs = [] ave_act_outputs = [] cond_ent_outputs = [] neg_chains_recons_outputs = [] neg_chains_ave_act_outputs = [] neg_chains_cond_ent_outputs = [] self.instrument_X = T.matrix() for max_iters in xrange(1,self.mean_field_iters+1): pos_Q, pos_Mu1 = self.infer_Q_Mu1(self.instrument_X, max_iters = max_iters) neg_Q, neg_Mu1 = self.infer_Q_Mu1(self.chains, max_iters = max_iters) recons_outputs.append(self.recons_err_from_Q_Mu1(pos_Q,pos_Mu1,self.instrument_X)) 
neg_chains_recons_outputs.append(self.recons_err_from_Q_Mu1(neg_Q,neg_Mu1,self.chains)) ave_act_outputs.append(T.mean(pos_Q, axis=0)) neg_chains_ave_act_outputs.append(T.mean(neg_Q, axis=0)) cond_ent_outputs.append(T.mean(self.binary_entropy(pos_Q),axis=0)) neg_chains_cond_ent_outputs.append(T.mean(self.binary_entropy(neg_Q),axis=0)) # self.neg_chains_recons_after_mean_field = function([],neg_chains_recons_outputs) self.neg_chains_ave_act_after_mean_field = function([],neg_chains_ave_act_outputs) self.neg_chains_cond_ent_after_mean_field = function([],neg_chains_cond_ent_outputs) self.recons_after_mean_field_func = function([self.instrument_X],recons_outputs) self.ave_act_after_mean_field_func = function([self.instrument_X],ave_act_outputs) self.cond_ent_after_mean_field_func = function([self.instrument_X],cond_ent_outputs) neg_chain_norms = T.sqrt(T.sum(T.sqr(self.chains),axis=1)) self.neg_chain_norms_summary = function([], [neg_chain_norms.min(),neg_chain_norms.mean(),neg_chain_norms.max()]) weight_norms = T.sqrt(T.sum(T.sqr(self.W),axis=0)) self.weight_norms_summary = function([], [weight_norms.min(),weight_norms.mean(),weight_norms.max()]) self.hid_bias_summary = function([],[self.c.min(),self.c.mean(),self.c.max()]) self.vis_bias_summary = function([],[self.b.min(),self.b.mean(),self.b.max()]) self.beta_func = function([],self.beta) # def clear_instruments(self): self.cond_ent_after_mean_field = [[] for i in xrange(self.mean_field_iters)] self.recons_after_mean_field = [[] for i in xrange(self.mean_field_iters)] self.ave_act_after_mean_field = [[] for i in xrange(self.mean_field_iters)] # def update_instruments(self, X): ce = self.cond_ent_after_mean_field_func(X) re = self.recons_after_mean_field_func(X) aa = self.ave_act_after_mean_field_func(X) for fr, to in [ (ce,self.cond_ent_after_mean_field), (re, self.recons_after_mean_field), (aa, self.ave_act_after_mean_field) ]: assert len(to) == self.mean_field_iters assert len(fr) == self.mean_field_iters for fr_elem, to_elem in zip(fr,to): to_elem.append(fr_elem) # # # def make_instrument_report(self): r = self.instrument_record neg_chains_recons = self.neg_chains_recons_after_mean_field() neg_chains_ave_act = self.neg_chains_ave_act_after_mean_field() neg_chains_cond_ent = self.neg_chains_cond_ent_after_mean_field() for i in xrange(1,self.mean_field_iters+1): re = N.asarray(self.recons_after_mean_field[i-1]).mean() r.report(('recons_err_after_mean_field',i),re) r.report(('neg_recons_err_after_mean_field',i),neg_chains_recons[i-1]) aa_mat = N.asarray(self.ave_act_after_mean_field[i-1]) assert len(aa_mat.shape) == 2 assert aa_mat.shape[1] == self.nhid aa_vec = aa_mat.mean(axis=0) aa_min = aa_vec.min() aa_mean = aa_vec.mean() aa_max = aa_vec.max() naa_vec = neg_chains_ave_act[i-1] naa_min = naa_vec.min() naa_mean = naa_vec.mean() naa_max = naa_vec.max() r.report(('ave_act_after_mean_field_min',i),aa_min) r.report(('ave_act_after_mean_field_mean',i),aa_mean) r.report(('ave_act_after_mean_field_max',i),aa_max) r.report(('neg_ave_act_after_mean_field_min',i),naa_min) r.report(('neg_ave_act_after_mean_field_mean',i),naa_mean) r.report(('neg_ave_act_after_mean_field_max',i),naa_max) ce_mat = N.asarray(self.cond_ent_after_mean_field[i-1]) assert len(ce_mat.shape) == 2 assert ce_mat.shape[1] == self.nhid ce_vec = ce_mat.mean(axis=0) ce_min, ce_mean, ce_max = ce_vec.min(), ce_vec.mean(), ce_vec.max() nce_vec = neg_chains_cond_ent[i-1] nce_min, nce_mean, nce_max = nce_vec.min(), nce_vec.mean(), nce_vec.max() 
r.report(('cond_ent_after_mean_field_min',i),ce_min) r.report(('cond_ent_after_mean_field_mean',i),ce_mean) r.report(('cond_ent_after_mean_field_max',i),ce_max) r.report(('neg_cond_ent_after_mean_field_min',i),nce_min) r.report(('neg_cond_ent_after_mean_field_mean',i),nce_mean) r.report(('neg_cond_ent_after_mean_field_max',i),nce_max) # neg_chain_norms_min, neg_chain_norms_mean, neg_chain_norms_max = self.neg_chain_norms_summary() r.report('neg_chain_norms_min', neg_chain_norms_min) r.report('neg_chain_norms_mean', neg_chain_norms_mean) r.report('neg_chain_norms_max', neg_chain_norms_max) weight_norms_min, weight_norms_mean, weight_norms_max = self.weight_norms_summary() r.report('weight_norms_min', weight_norms_min) r.report('weight_norms_mean', weight_norms_mean) r.report('weight_norms_max', weight_norms_max) hid_bias_min, hid_bias_mean, hid_bias_max = self.hid_bias_summary() r.report('hid_bias_min', hid_bias_min) r.report('hid_bias_mean', hid_bias_mean) r.report('hid_bias_max', hid_bias_max) vis_bias_min, vis_bias_mean, vis_bias_max = self.vis_bias_summary() r.report('vis_bias_min', vis_bias_min) r.report('vis_bias_mean', vis_bias_mean) r.report('vis_bias_max', vis_bias_max) r.report('beta',self.beta_func()) # def reconstruct(self, x, use_noise): assert x.shape[0] == 1 print 'x summary: '+str((x.min(),x.mean(),x.max())) #this method is mostly a hack to make the formatting work the same as denoising autoencoder self.truth_shared = shared(x.copy()) if use_noise: self.vis_shared = shared(x.copy() + 0.15 * N.cast[floatX](self.rng.randn(*x.shape))) else: self.vis_shared = shared(x.copy()) self.reconstruction = self.recons_func(self.vis_shared.get_value()) print 'recons summary: '+str((self.reconstruction.min(),self.reconstruction.mean(),self.reconstruction.max())) def gibbs_step_exp(self, V = None, Q = None, Mu1 = None): if V is not None: assert Q is None assert Mu1 is None base_name = V.name if base_name is None: base_name = 'anon' Q, Mu1 = self.infer_Q_Mu1(V) else: assert Q is not None assert Mu1 is not None Q_name = Q.name if Q_name is None: Q_name = 'anon' base_name = 'from_Q_'+Q_name # H, S = self.sample_hid(Q, Mu1) H.name = base_name + '->hid_sample' sample = self.b + T.dot(H*S,self.W_T) sample.name = base_name + '->sample_expectation' return sample def gibbs_step(self, V = None, Q = None, Mu1 = None): if V is not None: assert Q is None base_name = V.name if base_name is None: base_name = 'anon' # else: assert Q is not None Q_name = Q.name if Q_name is None: Q_name = 'anon' # base_name = 'from_Q_'+Q_name # m = self.gibbs_step_exp(V, Q, Mu1) assert m.dtype == floatX std = T.sqrt(1./self.beta) #std = #Print('vis_std',attrs=['min','mean','max'])(std) sample = self.theano_rng.normal(size = m.shape, avg = m, std = std, dtype = m.dtype) sample.name = base_name + '->sample' return sample def sample_hid(self, Q, Mu1): H = self.theano_rng.binomial(size = Q.shape, n = 1, p = Q, dtype = Q.dtype) std = T.sqrt(1./self.gamma) #std = #Print('hid_std',attrs=['min','mean','max'])(std) S = self.theano_rng.normal(size = Mu1.shape, avg = Mu1, std = std, dtype = Mu1.dtype) return H, S def infer_Q_Mu1(self, V, max_iters = 0): if max_iters > 0: iters = min(max_iters, self.mean_field_iters) else: iters = self.mean_field_iters # base_name = V.name if base_name is None: base_name = 'anon' first_Q, first_Mu1 = self.init_mean_field_step(V) Q = [ first_Q ] Mu1 = [ first_Mu1 ] no_damp = 0 for i in xrange(iters - 1): damp = i + 1 < self.mean_field_iters - self.no_damp_iters no_damp += (damp == False) new_Q, new_Mu1 
= self.damped_mean_field_step(V,Q[-1],Mu1[-1],damp) Q.append ( new_Q ) Mu1.append( new_Mu1) # if max_iters == 0: assert no_damp == self.no_damp_iters else: assert self.no_damp_iters is not None assert self.mean_field_iters is not None assert max_iters is not None assert no_damp == max(0, self.no_damp_iters - (self.mean_field_iters - max_iters)) # for i in xrange(len(Q)): Q[i].name = base_name + '->Q ('+str(i)+')' assert len(Q[-1].type().broadcastable) == 2 assert len(Mu1[-1].type().broadcastable) == 2 return Q[-1], Mu1[-1] def Q_from_A(self, A): assert len(A.type().broadcastable) == 2 return T.nnet.sigmoid(0.5*(T.sqr(A)/self.gamma-T.sqr(self.a)/self.alpha)+self.c-0.5*T.log(self.gamma/self.alpha)) def mean_field_step(self, V, P, Mu): assert len(V.type().broadcastable) == 2 iterm = T.dot(T.dot(P*Mu,self.W.T*self.beta),self.W) normalized_V = self.beta * (V-self.b) main_term = T.dot(normalized_V, self.W) A = self.w * P*Mu - iterm + main_term + self.a Mu1 = A / self.gamma Q = self.Q_from_A( A) assert len(Q.type().broadcastable) == 2 return Q, Mu1 # def mean_field_fancy_step(self, V, P, Mu): iterm = T.dot(T.dot(P*Mu,self.W.T*self.beta),self.W) normalized_V = self.beta * (V-self.b) main_term = T.dot(normalized_V, self.W) iA = self.w * P*Mu - iterm full_A = iA + main_term+self.a Mu1 = full_A / self.gamma Q = self.Q_from_A( full_A) iMu = iA / self.gamma #if this is negative, we are ammplifying so we use default damping #if this is positive, we are flipping, use max(0,lambda(tau)) discriminant = T.sgn(Mu-iMu) * Mu/(1e-10+abs(Mu-iMu)) Lambda = self.tau * discriminant - T.sgn(Mu-iMu) * iMu/(1e-10+abs(Mu-iMu)) mask = discriminant <= 0 fancy_damp = mask*self.s_default_damping_factor + (1.-mask)*T.maximum(0.,Lambda) return Q, Mu1, fancy_damp def init_mean_field_step(self, V, damp = True): #return self.damped_mean_field_step(V, T.nnet.sigmoid(self.c-0.5*T.log(self.gamma/self.alpha)), self.a/self.alpha, damp) return self.damped_mean_field_step(V, T.zeros_like(T.dot(V,self.W)), T.zeros_like(T.dot(V,self.W)), damp) def damped_mean_field_step(self, V, P, Mu, damp): if self.fancy_damp: Q, Mu1, fancy_damp = self.mean_field_fancy_step(V,P,Mu) else: Q, Mu1 = self.mean_field_step(V,P,Mu) # if damp: r_Q = self.q_damping_factor * P + (1.0 - self.q_damping_factor) * Q if self.fancy_damp: r_Mu = fancy_damp * Mu + (1.0-fancy_damp) * Mu1 else: r_Mu = self.s_default_damping_factor * Mu + (1.0-self.s_default_damping_factor) * Mu1 # else: r_Q = Q r_Mu = Mu1 # assert len(r_Q.type().broadcastable) == 2 return r_Q, r_Mu # def debug_dump(self, x): print "making debug dump" print 'x: '+str((x.min(),x.mean(),x.max())) W = self.W.get_value() print 'W: '+str((W.min(),W.mean(),W.max())) w = function([],self.w)() print 'w: '+str((w.min(),w.mean(),w.max())) alpha = self.alpha.get_value() print 'alpha: '+str((alpha.min(),alpha.mean(),alpha.max())) beta = self.beta.get_value() print 'beta: '+str((beta.min(),beta.mean(),beta.max())) prior_Q = function([],T.nnet.sigmoid(self.c-0.5*T.log(self.gamma/self.alpha)))() print 'prior_Q: '+str((prior_Q.min(),prior_Q.mean(),prior_Q.max())) prior_Mu = function([],self.a/self.alpha)() print 'prior_Mu: '+str((prior_Mu.min(),prior_Mu.mean(),prior_Mu.max())) var = T.matrix() var.name = 'debug_x' for i in xrange(1,self.mean_field_iters+1): outputs = self.infer_Q_Mu1(var,max_iters=i) Q, Mu = function([var],outputs)(x) print 'after '+str(i)+' mean field steps:' print '\tQ: '+str((Q.min(),Q.mean(),Q.max())) print '\tMu: '+str((Mu.min(),Mu.mean(),Mu.max())) # assert False def learn_mini_batch(self, x): #t1 
= time.time() if self.first_mini_batch: self.first_mini_batch = False if not self.use_cd: self.set_up_sampler() # # #Mu1 = self.Mu1_func(x) #if Mu1.max() > 500.: # self.debug_dump(x) #print '\nrun_sampling\n' self.run_sampling(x) #print '\nlearn_from_samples\n' self.learn_from_samples(x,self.learning_rate) #pos_Q, neg_Q = self.run_sampling(x) #self.learn_from_samples(x, pos_Q, neg_Q, self.learning_rate) #t2 = time.time() #print 'batch took '+str(t2-t1)+' sec' self.examples_seen += x.shape[0] self.batches_seen += 1
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[1000, 1000, 1000]): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) self.layer_sizes = hidden_layers_sizes assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.x_mask = T.matrix('x_mask') # For partial information. # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) self.params.extend(rbm_layer.params) # And build the upside-down network. This shares parameters with the # forward network. Except the weights are transposed and stuff. # The "isolated" layers let you run only the upside-down part of the # network, for generation. The non-isolated layers are connected to # the forward, compressing part of the network, and are used for # training. 
reverse_input = self.sigmoid_layers[-1].output self.isolated_reverse_input = theano.shared( numpy.zeros([10, hidden_layers_sizes[-1]])) isolated_input = self.isolated_reverse_input self.reverse_layers = [None] * self.n_layers self.isolated_reverse = [None] * self.n_layers for i in reversed(xrange(self.n_layers)): if i == 0: out_size = n_ins else: out_size = hidden_layers_sizes[i-1] reverse_sigmoid = HiddenLayer(rng=numpy_rng, input=reverse_input, n_in=hidden_layers_sizes[i], n_out=out_size, W=self.sigmoid_layers[i].W.T, b=self.rbm_layers[i].vbias, activation=T.nnet.sigmoid ) isolated_sigmoid = HiddenLayer(rng=numpy_rng, input=isolated_input, n_in=hidden_layers_sizes[i], n_out=out_size, W=self.sigmoid_layers[i].W.T, b=self.rbm_layers[i].vbias, activation=T.nnet.sigmoid ) reverse_input = reverse_sigmoid.output isolated_input = isolated_sigmoid.output self.reverse_layers[i] = reverse_sigmoid self.isolated_reverse[i] = isolated_sigmoid # The fine-tune cost is the reconstruction error of the entire net. self.finetune_cost = ((self.x - self.reverse_layers[0].output)**2).sum() # The cost for training the generative net - in this case, self.x is # completely disconnected, and we feed a pattern into the reverse net. self.generative_cost = ((self.x - self.isolated_reverse[0].output)**2).sum() # The l1 cost is for generating constrained samples of the input. (Aka # harmonizing a melody.) Given a melody in self.x and a mask # self.x_mask of which parts of self.x actually matter, it computes the # error between the generated sample and the melody. self.l1_cost = (((self.x - self.isolated_reverse[0].output) * self.x_mask)**2).sum()
def test_rbm(learning_rate=0.1, training_epochs=15, dataset='../../data/mnist.pkl.gz', batch_size=20, n_chains=20, n_samples=10, output_folder='rbm_plots', n_hidden=500): """ Demonstrate how to train an RBM and afterwards sample from it using Theano. This is demonstrated on MNIST. :param learning_rate: learning rate used for training the RBM :param training_epochs: number of epochs used for training :param dataset: path to the pickled dataset :param batch_size: size of a batch used to train the RBM :param n_chains: number of parallel Gibbs chains to be used for sampling :param n_samples: number of samples to plot for each chain """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2**30)) # initialize storage for the persistent chain (state = hidden # layer of chain) persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden), dtype=theano.config.floatX), borrow=True) # construct the RBM class rbm = RBM(input=x, n_visible=28 * 28, n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng) # get the cost and the gradient corresponding to one step of CD-1 #cost, updates = rbm.get_cost_updates(lr=learning_rate, # persistent=persistent_chain, k=1) cost, updates = rbm.get_cost_updates(lr=learning_rate, persistent=None, k=1) ################################# # Training the RBM # ################################# if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) # it is ok for a theano function to have no output # the purpose of train_rbm is solely to update the RBM parameters train_rbm = theano.function( [index], cost, updates=updates, givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]}, name='train_rbm') plotting_time = 0. 
start_time = time.clock() # go through training epochs for epoch in xrange(training_epochs): # go through the training set mean_cost = [] for batch_index in xrange(n_train_batches): mean_cost += [train_rbm(batch_index)] print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost) # Plot filters after each training epoch plotting_start = time.clock() # Construct image from the weight matrix image = PIL.Image.fromarray( tile_raster_images(X=rbm.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_at_epoch_%i.png' % epoch) plotting_stop = time.clock() plotting_time += (plotting_stop - plotting_start) end_time = time.clock() pretraining_time = (end_time - start_time) - plotting_time print('Training took %f minutes' % (pretraining_time / 60.)) ################################# # Sampling from the RBM # ################################# # find out the number of test samples number_of_test_samples = test_set_x.get_value(borrow=True).shape[0] # pick random test examples, with which to initialize the persistent chain test_idx = rng.randint(number_of_test_samples - n_chains) persistent_vis_chain = theano.shared( numpy.asarray(test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains], dtype=theano.config.floatX)) plot_every = 1000 # define one step of Gibbs sampling (mf = mean-field) define a # function that does `plot_every` steps before returning the # sample for plotting [presig_hids, hid_mfs, hid_samples, presig_vis, vis_mfs, vis_samples], updates = \ theano.scan(rbm.gibbs_vhv, outputs_info=[None, None, None, None, None, persistent_vis_chain], n_steps=plot_every) # add to updates the shared variable that takes care of our persistent # chain :. updates.update({persistent_vis_chain: vis_samples[-1]}) # construct the function that implements our persistent chain. # we generate the "mean field" activations for plotting and the actual # samples for reinitializing the state of our persistent chain sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]], updates=updates, name='sample_fn') # create a space to store the image for plotting ( we need to leave # room for the tile_spacing as well) image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1), dtype='uint8') for idx in xrange(n_samples): # generate `plot_every` intermediate samples that we discard, # because successive samples in the chain are too correlated vis_mf, vis_sample = sample_fn() print ' ... plotting sample ', idx image_data[29 * idx:29 * idx + 28, :] = tile_raster_images( X=vis_mf, img_shape=(28, 28), tile_shape=(1, n_chains), tile_spacing=(1, 1)) # construct image image = PIL.Image.fromarray(image_data) image.save('samples.png') os.chdir('../')
class AELADLSTMS(object): def __init__(self, wordlist, argv, aspect_num=0): parser = argparse.ArgumentParser() parser.add_argument('--name', type=str, default='lstm') parser.add_argument('--rseed', type=int, default=int(1000 * time.time()) % 19491001) parser.add_argument('--dim_word', type=int, default=300) parser.add_argument('--dim_hidden', type=int, default=300) parser.add_argument('--dim_aspect', type=int, default=300) parser.add_argument('--grained', type=int, default=3, choices=[3]) parser.add_argument('--regular', type=float, default=0.001) parser.add_argument('--word_vector', type=str, default='data/glove.840B.300d.txt') args, _ = parser.parse_known_args(argv) self.name = args.name self.srng = RandomStreams(seed=args.rseed) self.dim_word, self.dim_hidden = args.dim_word, args.dim_hidden self.dim_aspect = args.dim_aspect self.grained = args.grained self.regular = args.regular self.num = len(wordlist) + 1 self.init_param() self.load_word_vector(args.word_vector, wordlist) self.init_function() def init_param(self): def shared_matrix(dim, name, u=0, b=0): matrix = self.srng.uniform(dim, low=-u, high=u, dtype=theano.config.floatX) + b f = theano.function([], matrix) return theano.shared(f(), name=name) u = lambda x: 1 / np.sqrt(x) dimc, dimh, dima = self.dim_word, self.dim_hidden, self.dim_aspect dim_lstm_para = dimh + dimc self.Vw = shared_matrix((self.num, dimc), 'Vw', 0.01) self.Wi = shared_matrix((dimh, dim_lstm_para), 'Wi', u(dimh)) self.Wo = shared_matrix((dimh, dim_lstm_para), 'Wo', u(dimh)) self.Wf = shared_matrix((dimh, dim_lstm_para), 'Wf', u(dimh)) self.bi = shared_matrix((dimh,), 'bi', 0.) self.bo = shared_matrix((dimh,), 'bo', 0.) self.bf = shared_matrix((dimh,), 'bf', 0.) self.Wc = shared_matrix((dimh, dim_lstm_para), 'Wc', u(dimh)) self.bc = shared_matrix((dimh,), 'bc', 0.) self.Ws = shared_matrix((dimh + dimh, self.grained), 'Ws', u(dimh)) self.bs = shared_matrix((self.grained,), 'bs', 0.) 
self.h0, self.c0 = np.zeros(dimh, dtype=theano.config.floatX), np.zeros(dimc, dtype=theano.config.floatX) self.params = [self.Wi, self.Wo, self.Wf, self.Wc, self.bi, self.bo, self.bf, self.bc, self.Ws, self.bs] self.Wp_L = shared_matrix((dimh, dimh), 'Wp', u(dimh)) self.Wx_L = shared_matrix((dimh, dimh), 'Wx', u(dimh)) self.Wp_R = shared_matrix((dimh, dimh), 'Wp', u(dimh)) self.Wx_R = shared_matrix((dimh, dimh), 'Wx', u(dimh)) self.params.extend([self.Wp_L, self.Wx_L, self.Wp_R, self.Wx_R]) self.alpha_h_W_L = shared_matrix((dimh, dimh + dimh), 'alpha_h_W_L', u(dimh * 2)) self.alpha_h_W_R = shared_matrix((dimh, dimh + dimh), 'alpha_h_W_R', u(dimh * 2)) self.params.extend([self.alpha_h_W_L, self.alpha_h_W_R]) self.a_for_left = theano.shared(1.0, name='a_for_left') self.a_for_middle = theano.shared(1.0, name='a_for_middle') self.b_for_left = theano.shared(0.0, name='b_for_left') self.a_back_right = theano.shared(1.0, name='a_back_right') self.b_back_right = theano.shared(0.0, name='b_back_right') self.params.extend([self.a_for_left, self.a_for_middle, self.b_for_left]) self.params.extend([self.a_back_right, self.b_back_right]) def init_function(self): self.seq_loc = T.lvector() self.seq_idx = T.lvector() self.target = T.lvector() self.target_content_index = T.lscalar() self.seq_len = T.lscalar() self.solution = T.matrix() self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0) self.all_tar_vector = T.take(self.Vw, self.target, axis=0) self.tar_vector = T.mean(self.all_tar_vector, axis=0) self.target_vector_dim = self.tar_vector.dimshuffle('x', 0) self.seq_matrix = T.concatenate([self.seq_matrix[0:self.target_content_index], self.target_vector_dim, self.seq_matrix[self.target_content_index + 1:]], axis=0) h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc, dtype=theano.config.floatX) def rnn(X, aspect): def encode_forward(x_t, h_fore, c_fore): v = T.concatenate([h_fore, x_t]) f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf) i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi) o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo) c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc) h_next = o_t * T.tanh(c_next) return h_next, c_next def encode_backward(x_t, h_fore, c_fore): v = T.concatenate([h_fore, x_t]) f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf) i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi) o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo) c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc) h_next = o_t * T.tanh(c_next) return h_next, c_next loc_for = T.zeros_like(self.seq_loc) + self.target_content_index al_for = self.a_for_left * T.exp( -self.b_for_left * T.abs_( self.seq_loc[0:self.target_content_index] - loc_for[0:self.target_content_index])) am_for = self.a_for_middle * [1] a_for = T.concatenate([al_for, am_for]) locate_for = T.zeros_like(self.seq_matrix[0:self.target_content_index + 1], dtype=T.config.floatX) + T.reshape(a_for, [-1, 1]) loc_back = T.zeros_like(self.seq_loc) + self.target_content_index ar_back = self.a_back_right * T.exp( -self.b_back_right * T.abs_( self.seq_loc[self.target_content_index + 1:] - loc_back[self.target_content_index + 1:])) ar_back = ar_back[::-1] a_back = T.concatenate([am_for, ar_back]) locate_back = T.zeros_like(self.seq_matrix[self.target_content_index:], dtype=T.config.floatX) + T.reshape( a_back, [-1, 1]) scan_result_forward, _forward = theano.scan(fn=encode_forward, sequences=locate_for * X[0:self.target_content_index + 1], outputs_info=[h, c]) scan_result_backward, _backward = 
theano.scan(fn=encode_backward, sequences=locate_back * X[self.target_content_index:][::-1], outputs_info=[h, c]) embedding_l = scan_result_forward[0] embedding_r = scan_result_backward[0] h_target_for = embedding_l[-1] h_target_back = embedding_r[-1] attention_h_target_l = embedding_l cont_l = T.concatenate([h_target_for, h_target_back]) yuyi_l = T.transpose(cont_l) alpha_h_l = T.dot(T.dot(attention_h_target_l, self.alpha_h_W_L), yuyi_l) alpha_tmp_l = T.nnet.softmax(alpha_h_l) r_l = T.dot(alpha_tmp_l, embedding_l) h_star_L = T.tanh(T.dot(r_l, self.Wp_L)) attention_h_target_r = embedding_r cont_r = T.concatenate([h_target_for, h_target_back]) yuyi_r = T.transpose(cont_r) alpha_h_r = T.dot(T.dot(attention_h_target_r, self.alpha_h_W_R), yuyi_r) alpha_tmp_r = T.nnet.softmax(alpha_h_r) r_r = T.dot(alpha_tmp_r, embedding_r) h_star_R = T.tanh(T.dot(r_r, self.Wp_R)) embedding = T.concatenate([h_star_L, h_star_R], axis=1) return embedding embedding = rnn(self.seq_matrix, self.tar_vector) embedding_for_train = embedding * self.srng.binomial(embedding.shape, p=0.5, n=1, dtype=embedding.dtype) embedding_for_test = embedding * 0.5 self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs) self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs) self.l2 = sum([T.sum(param ** 2) for param in self.params]) - T.sum(self.Vw ** 2) self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2) self.loss_l2 = 0.5 * self.l2 * self.regular self.loss = self.loss_sen + self.loss_l2 grads = T.grad(self.loss, self.params) self.updates = collections.OrderedDict() self.grad = {} for param, grad in zip(self.params, grads): g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \ dtype=theano.config.floatX)) self.grad[param] = g self.updates[g] = g + grad self.func_train = theano.function( inputs=[self.seq_idx, self.target, self.solution, self.target_content_index, self.seq_loc, self.seq_len, theano.In(h, value=self.h0), theano.In(c, value=self.c0)], outputs=[self.loss, self.loss_sen, self.loss_l2], updates=self.updates, on_unused_input='warn') self.func_test = theano.function( inputs=[self.seq_idx, self.target, self.target_content_index, self.seq_loc, self.seq_len, theano.In(h, value=self.h0), theano.In(c, value=self.c0)], outputs=self.pred_for_test, on_unused_input='warn') def load_word_vector(self, fname, wordlist): loader = WordLoader() dic = loader.load_word_vector(fname, wordlist, self.dim_word) not_found = 0 Vw = self.Vw.get_value() for word, index in wordlist.items(): try: Vw[index] = dic[word] except: not_found += 1 print 'not_found:', not_found self.Vw.set_value(Vw)
class BaseModel(object): def init_start(self, config): self._params = {} self._is_training = tt.iscalar('is_training') self._np_rng = np.random.RandomState(config.seed // 2 + 123) if config.device == 'cpu': from theano.tensor.shared_randomstreams import RandomStreams # works on cpu else: from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams # works on gpu self._theano_rng = RandomStreams(config.seed // 2 + 321) self._init_scale = config.init_scale self._pre_epoch_hooks = [] def register_pre_epoch_hook(self, func, for_train=False, for_eval=False): assert for_train or for_eval self._pre_epoch_hooks.append((func, for_train, for_eval)) def invoke_pre_epoch_hooks(self, is_train=False, is_eval=False): assert is_train or is_eval for func, for_train, for_eval in self._pre_epoch_hooks: if (for_train and is_train) or (for_eval and is_eval): func() ################### Constructing shared vars ################### def get_param_init(self, shape, init_scheme, init_scale=None): if isinstance(init_scheme, numbers.Number): value = np.full(shape, float(init_scheme)) elif init_scheme == 'identity': assert len(shape) == 2 and shape[0] == shape[1] value = np.eye(shape[0]) elif init_scheme == 'uniform': scale = init_scale or self._init_scale value = self._np_rng.uniform(low=-scale, high=scale, size=shape) elif init_scheme == 'gaussian': scale = init_scale or self._init_scale value = self._np_rng.normal(loc=0., scale=scale, size=shape) elif init_scheme == 'glorot_uniform': assert len(shape) == 2 s = np.sqrt(6.0 / (shape[0] + shape[1])) value = self._np_rng.uniform(low=-s, high=s, size=shape) elif init_scheme == 'orthogonal': assert len(shape) == 2 u, _, v = np.linalg.svd(self._np_rng.normal(0.0, 1.0, shape), full_matrices=False) #assert u.shape == shape value = u if u.shape == shape else v scale = init_scale or 1.1 value *= scale else: raise AssertionError('unrecognized init scheme') return value def make_param_from_value(self, name, value): if name in self._params: param = self._params[name]; if value.shape != param.get_value().shape: raise AssertionError('parameter {} re-use attempt with mis-matching shapes: ' 'existing shape {}, requested shape {}'.format( name, param.get_value().shape, value.shape)) return param param = get_shared_floatX(value, name) self._params[name] = param return param def make_param(self, name, shape, init_scheme, init_scale=None): value = self.get_param_init(shape, init_scheme, init_scale) return self.make_param_from_value(name, value) def make_concat_param(self, name, shapes, init_schemes, axis): if len(shapes) != len(init_schemes): raise AssertionError('number of shapes and number of init schemes are incompatible') if len(set([shape[:axis] + shape[axis+1:] for shape in shapes])) != 1: raise AssertionError('all shapes should be identical on all axes except given axis') val = np.concatenate([self.get_param_init(shape, init_scheme) for shape, init_scheme in zip(shapes, init_schemes)], axis=axis) w = self.make_param_from_value(name, val) return w ################### I/O ################### def save(self, filename): logging.getLogger().info('Saving model weights to {}'.format(filename)) verify_dir_exists(filename) param_dict = {name : param.get_value() for name, param in self._params.iteritems()} with open(filename, 'wb') as f: cPickle.dump(param_dict, f, protocol=cPickle.HIGHEST_PROTOCOL) def load_if_exists(self, filename, allow_mismatch=False): if not os.path.isfile(filename): return False logger = logging.getLogger() logger.info('Loading model weights found in 
{}'.format(filename)) with open(filename, 'rb') as f: param_dict = cPickle.load(f) param_names, loaded_param_names = set(self._params.keys()), set(param_dict.keys()) if param_names != loaded_param_names: msg = ('Parameter names loaded from {} do not match model\'s parameter names.\n' 'param names only found in model: {}\n' 'param names only found in loaded model: {}').format( filename, param_names.difference(loaded_param_names), loaded_param_names.difference(param_names)) if allow_mismatch: logger.info(msg) param_dict = {param_name: param_dict[param_name] for param_name \ in param_names.intersection(loaded_param_names)} else: raise AssertionError(msg) for name, value in param_dict.iteritems(): self._params[name].set_value(value) return True ################### Dropout ################### def get_dropout_noise(self, shape, dropout_p): if dropout_p == 0: return 1 keep_p = 1 - dropout_p return cast_floatX_np(1. / keep_p) * self._theano_rng.binomial( size=shape, p=keep_p, n=1, dtype=floatX) def apply_dropout_noise(self, x, noise): return ifelse(self._is_training, noise * x, x) def dropout(self, x, dropout_p): return self.apply_dropout_noise(x, self.get_dropout_noise(x.shape, dropout_p)) ################### Misc ################### def get_param_sizes(self): param_sizes = {name: param.get_value().size for name, param in self._params.iteritems()} return sum(param_sizes.values()), param_sizes ################### Simple layers ################### def linear(self, name, x, input_dim, output_dim, with_bias=True, w_init='uniform', bias_init=0): # x (..., input_dim) n = namer(name) W = self.make_param(n('W'), (input_dim, output_dim), w_init) y = tt.dot(x, W) # (..., output_dim) if with_bias: b = self.make_param(n('b'), (output_dim,), bias_init) y += b return y def ff(self, name, x, dims, activation, dropout_ps, **kwargs): assert len(dims) >= 2 if dropout_ps: if isinstance(dropout_ps, numbers.Number): dropout_ps = [dropout_ps] * (len(dims) - 1) else: assert len(dropout_ps) == len(dims) - 1 n = namer(name) h = x if activation == 'relu': f = tt.nnet.relu elif activation == 'sigmoid': f = tt.nnet.sigmoid elif activation == 'tanh': f = tt.tanh else: raise AssertionError('unrecognized activation function') for i, (input_dim, output_dim) in enumerate(zip(dims[:-1], dims[1:])): if dropout_ps: h = self.dropout(h, dropout_ps[i]) h = f(self.linear(n('l%d' % (i+1)), h, input_dim, output_dim, **kwargs)) return h ################### LSTM ################### def stacked_bi_lstm(self, name, x, x_mask, num_layers, input_dim, hidden_dim, drop_x, drop_h, **kwargs): n = namer(name) h = x for l in range(1, num_layers+1): h = self.bi_lstm(n('l%d' % l), h, x_mask, input_dim if l == 1 else 2*hidden_dim, hidden_dim, drop_x, drop_h, **kwargs) return h # (timesteps, batch_size, 2*hidden_dim) def bi_lstm(self, name, x, x_mask, input_dim, hidden_dim, drop_x, drop_h, **kwargs): n = namer(name) fwd_h = self.lstm(n('fwd'), x, x_mask, input_dim, hidden_dim, drop_x, drop_h, backward=False, **kwargs) bck_h = self.lstm(n('bck'), x, x_mask, input_dim, hidden_dim, drop_x, drop_h, backward=True, **kwargs) bi_h = tt.concatenate([fwd_h, bck_h], axis=2) # (timesteps, batch_size, 2*hidden_dim) return bi_h def lstm(self, name, x, x_mask, input_dim, hidden_dim, drop_x, drop_h, backward=False, couple_i_and_f=False, learn_initial_state=False, tie_x_dropout=True, sep_x_dropout=False, sep_h_dropout=False, w_init='uniform', u_init='orthogonal', forget_bias_init=1, other_bias_init=0): """Customizable uni-directional LSTM layer. 
Handles masks, can learn initial state, input and forget gate can be coupled, with recurrent dropout, no peephole connections. Args: x: Theano tensor, shape (timesteps, batch_size, input_dim) x_mask: Theano tensor, shape (timesteps, batch_size) input_dim: int, dimension of input vectors hidden_dim: int, dimension of hidden state drop_x: float, dropout rate to apply to inputs drop_h: float, dropout rate to apply to hidden state backward: boolean, whether to recur over timesteps in reveresed order couple_i_and_f: boolean, whether to have input gate = 1 - forget gate learn_initial_state: boolean, whether to have initial cell state and initial hidden state as learnt parameters tie_x_dropout: boolean, whether to have the same dropout masks across timesteps for input sep_x_dropout: boolean, if True dropout is applied over weights of lin. trans. of input; otherwise it is applied over input activations sep_h_dropout: boolean, if True dropout is applied over weights of lin. trans. of hidden state; otherwise it is applied over hidden state activations w_init: string, initialization scheme for weights of lin. trans. of input u_init: string, initialization scheme for weights of lin. trans. of hidden state forget_bias_init: string, initialization scheme for forget gate's bias other_bias_init: string, initialization scheme for other biases Note: Proper variational dropout (Gal 2015) is: tie_x_dropout=True, sep_x_dropout=True, sep_h_dropout=True A faster alternative is: tie_x_dropout=True, sep_x_dropout=False, sep_h_dropout=False Returns: h: Theano variable, recurrent hidden states at each timestep, shape (timesteps, batch_size, hidden_dim) """ n = namer(name) timesteps, batch_size = x.shape[0], x.shape[1] num_non_lin = 3 if couple_i_and_f else 4 num_gates = num_non_lin - 1 W = self.make_concat_param(n('W'), # (input_dim, [3|4]*hidden_dim) num_non_lin*[(input_dim, hidden_dim)], num_non_lin*[w_init], axis=1) b = self.make_concat_param(n('b'), # ([3|4]*hidden_dim,) num_non_lin*[(hidden_dim,)], [forget_bias_init] + num_gates*[other_bias_init], axis=0) U = self.make_concat_param(n('U'), # (hidden_dim, [3|4]*hidden_dim) num_non_lin*[(hidden_dim, hidden_dim)], num_non_lin*[u_init], axis=1) if not sep_x_dropout: if tie_x_dropout: x = self.apply_dropout_noise(x, self.get_dropout_noise((batch_size, input_dim), drop_x)) else: x = self.dropout(x, drop_x) lin_x = tt.dot(x, W) + b # (timesteps, batch_size, [3|4]*hidden_dim) else: if tie_x_dropout: x_for_f = self.apply_dropout_noise( x, self.get_dropout_noise((batch_size, input_dim), drop_x)) x_for_o = self.apply_dropout_noise( x, self.get_dropout_noise((batch_size, input_dim), drop_x)) if num_gates == 3: x_for_i = self.apply_dropout_noise( x, self.get_dropout_noise((batch_size, input_dim), drop_x)) x_for_g = self.apply_dropout_noise( x, self.get_dropout_noise((batch_size, input_dim), drop_x)) else: x_for_f = self.dropout(x, drop_x) x_for_o = self.dropout(x, drop_x) if num_gates == 3: x_for_i = self.dropout(x, drop_x) x_for_g = self.dropout(x, drop_x) lin_x_tensors = [tt.dot(x_for_f, W[:,:hidden_dim]), tt.dot(x_for_o, W[:,hidden_dim:2*hidden_dim])] if num_gates == 3: lin_x_tensors.append(tt.dot(x_for_i, W[:,2*hidden_dim:3*hidden_dim])) lin_x_tensors.append(tt.dot(x_for_g, W[:,num_gates*hidden_dim:])) lin_x = tt.concatenate(lin_x_tensors, axis=2) + b # (timesteps, batch_size, [3|4]*hidden_dim) def step_fn(lin_x_t, x_mask_t, h_tm1, c_tm1, h_noise, U): # lin_x_t (batch_size, [3|4]*hidden_dim) # x_mask_t (batch_size, 1) # h_tm1 (batch_size, hidden_dim) # c_tm1 
(batch_size, hidden_dim) # h_noise (batch_size, [1|3|4]*hidden_dim) # 1 if not sep_h_dropout, otherwise: 3 or 4 depending on num_non_lin # U (hidden_dim, [3|4]*hidden_dim) if not sep_h_dropout: h_tm1 = self.apply_dropout_noise(h_tm1, h_noise) lin_h_tm1 = tt.dot(h_tm1, U) # (batch_size, [3|4]*hidden_dim) else: h_tm1_for_f = self.apply_dropout_noise(h_tm1, h_noise[:,:hidden_dim]) h_tm1_for_o = self.apply_dropout_noise(h_tm1, h_noise[:,hidden_dim:2*hidden_dim]) if num_gates == 3: h_tm1_for_i = self.apply_dropout_noise(h_tm1, h_noise[:,2*hidden_dim:3*hidden_dim]) h_tm1_for_g = self.apply_dropout_noise(h_tm1, h_noise[:,num_gates*hidden_dim:]) lin_h_tm1_tensors = [tt.dot(h_tm1_for_f, U[:,:hidden_dim]), tt.dot(h_tm1_for_o, U[:,hidden_dim:2*hidden_dim])] if num_gates == 3: lin_h_tm1_tensors.append(tt.dot(h_tm1_for_i, U[:,2*hidden_dim:3*hidden_dim])) lin_h_tm1_tensors.append(tt.dot(h_tm1_for_g, U[:,num_gates*hidden_dim:])) lin_h_tm1 = tt.concatenate(lin_h_tm1_tensors, axis=1) # (batch_size, [3|4]*hidden_dim) lin = lin_x_t + lin_h_tm1 # (batch_size, [3|4]*hidden_dim) gates = tt.nnet.sigmoid(lin[:, :num_gates*hidden_dim]) # (batch_size, [3|4]*hidden_dim) f_gate = gates[:, :hidden_dim] # (batch_size, hidden_dim) o_gate = gates[:, hidden_dim:2*hidden_dim] # (batch_size, hidden_dim) i_gate = gates[:, 2*hidden_dim:] if num_gates == 3 else 1 - f_gate # (batch_size, hidden_dim) g = tt.tanh(lin[:, num_gates*hidden_dim:]) # (batch_size, hidden_dim) c_t = f_gate * c_tm1 + i_gate * g h_t = o_gate * tt.tanh(c_t) h_t = tt.switch(x_mask_t, h_t, h_tm1) c_t = tt.switch(x_mask_t, c_t, c_tm1) return h_t, c_t # end of step_fn if learn_initial_state: h0 = self.make_param(n('h0'), (hidden_dim,), 0) c0 = self.make_param(n('c0'), (hidden_dim,), 0) batch_h0 = tt.extra_ops.repeat(h0[None,:], batch_size, axis=0) batch_c0 = tt.extra_ops.repeat(c0[None,:], batch_size, axis=0) else: batch_h0 = batch_c0 = tt.zeros((batch_size, hidden_dim)) x_mask = tt.shape_padright(x_mask) # (timesteps, batch_size, 1) original_x_mask = x_mask if backward: lin_x = lin_x[::-1] x_mask = x_mask[::-1] h_noise = self.get_dropout_noise( (batch_size, hidden_dim if not sep_h_dropout else num_non_lin*hidden_dim), drop_h) results, _ = theano.scan(step_fn, sequences = [lin_x, x_mask], outputs_info = [batch_h0, batch_c0], non_sequences = [h_noise, U], name = n('scan')) h = results[0] # (timesteps, batch_size, hidden_dim) if backward: h = h[::-1] h *= original_x_mask return h
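# --- Illustrative sketch (added; not part of the model code above) ---
# get_dropout_noise/apply_dropout_noise above draw one scaled binomial mask per
# sequence and reuse it at every timestep when tie_x_dropout=True. A minimal
# standalone version of that "tied" (variational) dropout idea, assuming only
# Theano itself, could look like this:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=123)
keep_p = 0.8
x = T.tensor3('x')  # (timesteps, batch_size, input_dim)
# one mask of shape (batch_size, input_dim); scaling by 1/keep_p keeps the
# expected activation unchanged (inverted dropout)
mask = srng.binomial(size=(x.shape[1], x.shape[2]), p=keep_p, n=1,
                     dtype=theano.config.floatX) / keep_p
x_dropped = x * mask  # the mask broadcasts over the leading time axis
tied_dropout = theano.function([x], x_dropped)
print tied_dropout(numpy.ones((4, 2, 3), dtype=theano.config.floatX))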
def __init__(self, L1, ratio): self.random_stream = RandomStreams(seed=1) self.L1 = L1 self.one_ratio = ratio
import theano.tensor as T from theano import function from theano.tensor.shared_randomstreams import RandomStreams import numpy from theano.printing import pydotprint random = RandomStreams(seed=42) a = random.normal((1, 3)) b = T.dmatrix('a') f1 = a * b g1 = function([b], f1) pydotprint(g1, outfile="s9.png", var_with_name_simple=True) print "Invocation 1:", g1(numpy.ones((1, 3))) print "Invocation 2:", g1(numpy.ones((1, 3))) print "Invocation 3:", g1(numpy.ones((1, 3))) # Invocation 1: [[ 1.25614218 -0.53793023 -0.10434045]] # Invocation 2: [[ 0.66992188 -0.70813926 0.99601177]] # Invocation 3: [[ 0.0724739 -0.66508406 0.93707751]]
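# Added note (a sketch reusing `random`, `g1` and `numpy` from the snippet above):
# each call draws fresh values because the generator state lives in hidden shared
# variables that g1 updates as a side effect. Re-seeding the RandomStreams object
# resets that state, so the sequence of draws should repeat from the beginning.
random.seed(42)
print "After reseeding:", g1(numpy.ones((1, 3)))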
def train_dA( learning_rate, training_epochs, window_size, corruption_level, n_hidden, train_set, output_folder, train_algo="sgd"): """ This dA is tested on ICHI_Data :type learning_rate: float :param learning_rate: learning rate used for training the Denoising AutoEncoder :type training_epochs: int :param training_epochs: number of epochs used for training :type window_size: int :param window_size: size of window used for training :type corruption_level: float :param corruption_level: corruption_level used for training the Denoising AutoEncoder :type n_hidden: int :param n_hidden: count of nodes in hidden layer :type output_folder: string :param output_folder: folder for cost and error graphics with results """ start_time = time.clock() rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) x = T.vector('x') da = dA( numpy_rng=rng, theano_rng=theano_rng, input=x, n_visible=window_size, n_hidden=n_hidden ) if train_algo == "sgd": updated_da = train_da_sgd( learning_rate=learning_rate, window_size=window_size, training_epochs=training_epochs, corruption_level=corruption_level, train_set=train_set, da=da ) base_folder = "da_sgd" else: updated_da = train_da_cg( da=da, train_set=train_set, window_size=window_size, corruption_level=corruption_level, training_epochs=training_epochs ) base_folder = "da_cg" visualize_da(train_cost=updated_da.train_cost_array, window_size=window_size, learning_rate=learning_rate, corruption_level=corruption_level, n_hidden=n_hidden, output_folder=output_folder, base_folder=base_folder) end_time = time.clock() training_time = (end_time - start_time) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' with corruption %f ran for %.2fm' % (corruption_level, training_time / 60.))
import numpy import numpy.random import theano import theano.tensor as T from theano.tensor.shared_randomstreams import RandomStreams numpy_rng = numpy.random.RandomState(1) theano_rng = RandomStreams(1) from scipy import misc from hyperParams import * def loadFrames(): frames_train = numpy.load(data_path + 'train/' + 'frames.npy')[:numframes_train, :] frames_train = numpy.reshape(frames_train, (numseqs_train, seq_dim)).astype('float32') frames_test = numpy.load(data_path + 'test/' + 'frames.npy')[:numframes_test, :] frames_test = numpy.reshape(frames_test, (numseqs_test, seq_dim)).astype('float32') return (frames_train, frames_test) def loadOpticalFlow(): ofx_train = numpy.load(data_path + 'train/' + 'ofx.npy')[:numframes_train, :]
def __init__(self, D, M, Q, Domain_number, m, pre_params, Hiddenlayerdim1, Hiddenlayerdim2): self.Xlabel = T.matrix('Xlabel') self.X = T.matrix('X') N = self.X.shape[0] self.Weight = T.matrix('Weight') ker = kernel(Q) mmd = MMD(M, Domain_number) mu_value = np.random.randn(M, D) Sigma_b_value = np.zeros((M, M)) + np.log(0.01) Z_value = m[:M] self.test = Z_value ls_value = np.zeros(Domain_number) + np.log(0.1) self.mu = theano.shared(value=mu_value, name='mu', borrow=True) self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True) self.Z = theano.shared(value=Z_value, name='Z', borrow=True) self.ls = theano.shared(value=ls_value, name='ls', borrow=True) self.params = [self.mu, self.Sigma_b, self.Z, self.ls] self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=Hiddenlayerdim1, activation=T.nnet.relu, number='_x') self.hiddenLayer_hidden = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=Hiddenlayerdim1, n_out=Hiddenlayerdim2, activation=T.nnet.relu, number='_h') self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output, n_in=Hiddenlayerdim2, n_out=Q, activation=T.nnet.relu, number='_m') self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output, n_in=Hiddenlayerdim2, n_out=Q, activation=T.nnet.relu, number='_S') self.loc_params = [] self.loc_params.extend(self.hiddenLayer_x.params) self.loc_params.extend(self.hiddenLayer_hidden.params) self.loc_params.extend(self.hiddenLayer_m.params) self.loc_params.extend(self.hiddenLayer_S.params) self.local_params = {} for i in self.loc_params: self.local_params[str(i)] = i self.params.extend(ker.params) self.params.extend(mmd.params) self.hyp_params = {} for i in [self.mu, self.Sigma_b, self.ls]: self.hyp_params[str(i)] = i self.Z_params = {} for i in [self.Z]: self.Z_params[str(i)] = i self.global_params = {} for i in self.params: self.global_params[str(i)] = i self.params.extend(self.hiddenLayer_x.params) self.params.extend(self.hiddenLayer_hidden.params) self.params.extend(self.hiddenLayer_m.params) self.params.extend(self.hiddenLayer_S.params) self.wrt = {} for i in self.params: self.wrt[str(i)] = i for i, j in pre_params.items(): self.wrt[i].set_value(j) m = self.hiddenLayer_m.output S_0 = self.hiddenLayer_S.output S_1 = T.exp(S_0) S = T.sqrt(S_1) from theano.tensor.shared_randomstreams import RandomStreams srng = RandomStreams(seed=234) eps_NQ = srng.normal((N, Q)) eps_M = srng.normal((M, D)) # different random draws are needed for the mean and for the variance, so they are named separately beta = T.exp(self.ls) # u is not diagonal, so we need to build a triangular matrix, e.g. via a Cholesky-style construction Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b)))) # scale transformation mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma Xtilda = m + S * eps_NQ self.U = mu_scaled + Sigma_scaled.dot(eps_M) Kmm = ker.RBF(self.Z) Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight) KmmInv = sT.matrix_inverse(Kmm) Kmn = ker.RBF(self.Z, Xtilda) Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight) Knn = ker.RBF(Xtilda) Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight) Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn)) Kinterval = T.dot(KmmInv, Kmn) mean_U = T.dot(Kinterval.T, self.U) betaI = T.diag(T.dot(self.Xlabel, beta)) Covariance = betaI self.LL = (self.log_mvn(self.X, mean_U, Covariance) - 0.5 * T.sum(T.dot(betaI, Ktilda))) self.KL_X = -self.KLD_X(m, S) self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
import pickle import sys import time import numpy import theano import theano.sparse import theano.tensor as T from sklearn.metrics import mean_squared_error from sklearn.metrics import roc_auc_score from theano.tensor.shared_randomstreams import RandomStreams import dl_utils as ut import sampling_based_gaussian_binary_rbm_sparse as gbrbm srng = RandomStreams(seed=234) rng = numpy.random rng.seed(1234) batch_size = 1000 # batch size lr = 0.0006 # learning rate lambda1 = 0.0001 # .01 hidden0 = 300 # regularisation rate hidden1 = 300 # hidden layer 1 hidden2 = 100 # hidden layer 2 acti_type = 'tanh' # activation type epoch = 100 # epochs number advertiser = '2997' if len(sys.argv) > 1: advertiser = sys.argv[1] train_file = '../data/train.fm.txt' # training file test_file = '../data/test.fm.txt' # test file
def __init__(self, numargs, embed_size, pred_vocab_size, arg_vocab_size, initial_pred_rep=None, initial_arg_rep=None, margin=5, lr=0.01, activation=T.nnet.sigmoid): numpy_rng = numpy.random.RandomState(12345) theano_rng = RandomStreams(54321) self.lr = lr #margin = 5 # Initializing predicate representations if initial_pred_rep is not None: num_preds, pred_dim = initial_pred_rep.shape assert pred_vocab_size == num_preds, "Initial predicate representation is not the same size as pred_vocab_size" assert embed_size == pred_dim, "Initial predicate representation does not have the same dimensionality as embed_size" else: initial_pred_rep_range = 4 * numpy.sqrt( 6. / (pred_vocab_size + embed_size)) initial_pred_rep = numpy.asarray( numpy_rng.uniform(low=-initial_pred_rep_range, high=initial_pred_rep_range, size=(pred_vocab_size, embed_size))) self.pred_rep = theano.shared(value=initial_pred_rep, name='P') # Initializing argument representations if initial_arg_rep is not None: arg_rep_len, arg_dim = initial_arg_rep.shape assert arg_vocab_size == arg_rep_len, "Initial argument representation is not the same size as arg_vocab_size" assert embed_size == arg_dim, "Initial argument representation does not have the same dimensionality as embed_size" else: initial_arg_rep_range = 4 * numpy.sqrt( 6. / (arg_vocab_size + embed_size)) initial_arg_rep = numpy.asarray( numpy_rng.uniform(low=-initial_arg_rep_range, high=initial_arg_rep_range, size=(arg_vocab_size, embed_size))) self.arg_rep = theano.shared(value=initial_arg_rep, name='A') # Initialize scorer scorer_dim = embed_size * (numargs + 1) # Predicate is +1 initial_scorer_range = 4 * numpy.sqrt(6. / scorer_dim) initial_scorer = numpy.asarray( numpy_rng.uniform(low=-initial_scorer_range, high=initial_scorer_range, size=scorer_dim)) self.scorer = theano.shared(value=initial_scorer, name='s') # Initialize indicator indicator_dim = embed_size * (numargs + 1) # Predicate is +1 initial_indicator_range = 4 * numpy.sqrt(6. 
/ (indicator_dim + numargs)) initial_indicator = numpy.asarray( numpy_rng.uniform(low=-initial_indicator_range, high=initial_indicator_range, size=(indicator_dim, numargs))) self.indicator = theano.shared(value=initial_indicator, name='I') # Define symbolic pred-arg self.pred_ind = T.iscalar('p') self.arg_inds = T.iscalars(numargs) pred = self.pred_rep[self.pred_ind].reshape((1, embed_size)) args = self.arg_rep[self.arg_inds].reshape((1, embed_size * numargs)) pred_arg = activation(T.concatenate([pred, args], axis=1)) # Define symbolic rand pred-arg for training scorer rand_pred_ind = theano_rng.random_integers(low=0, high=pred_vocab_size - 1) rand_arg_inds = theano_rng.random_integers([1, numargs], low=0, high=arg_vocab_size - 1) rand_pred = self.pred_rep[rand_pred_ind].reshape((1, embed_size)) rand_args = self.arg_rep[rand_arg_inds].reshape( (1, embed_size * numargs)) rand_pred_arg = activation( T.concatenate([rand_pred, rand_args], axis=1)) # Define symbolic pred_rand-arg for training indicator pred_rand_arg = activation(T.concatenate([pred, rand_args], axis=1)) # Define scores and loss self.corr_score = T.sum(T.dot(pred_arg, self.scorer)) rand_score = T.sum(T.dot(rand_pred_arg, self.scorer)) self.margin_loss = T.maximum(0, margin - self.corr_score + rand_score) # Define indicator values and loss orig_ind_labels = T.constant(numpy.zeros(numargs)) self.indicator_pred = T.nnet.sigmoid(T.dot(pred_arg, self.indicator)) rand_ind_labels = T.constant(numpy.ones(numargs)) rand_indicator_pred = T.nnet.sigmoid( T.dot(pred_rand_arg, self.indicator)) self.indicator_loss = T.mean( (self.indicator_pred - orig_ind_labels)**2) + T.mean( (rand_indicator_pred - rand_ind_labels)**2) # Define params and inputs self.score_params = [self.pred_rep, self.arg_rep, self.scorer] self.indicator_params = [self.pred_rep, self.arg_rep, self.indicator] self.score_ind_inputs = [self.pred_ind] + list(self.arg_inds)
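# --- Standalone sketch (added; not a method of the class above) ---
# rand_pred_ind / rand_arg_inds above are built with RandomStreams.random_integers,
# which yields fresh integer draws on every call of a compiled function that uses
# them. A minimal self-contained illustration with made-up vocabulary sizes:
import theano
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(54321)
rand_pred = srng.random_integers(low=0, high=99)           # single random index
rand_args = srng.random_integers((1, 5), low=0, high=999)  # a (1, 5) block of indices
draw = theano.function([], [rand_pred, rand_args])
print draw()  # different indices on every call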
def __init__(self, numpy_rng, theano_rng=None, cfg=None, dnn_shared=None, shared_layers=[]): self.layers = [] self.dropout_layers = [] self.params = [] self.delta_params = [] self.cfg = cfg self.n_ins = cfg.n_ins self.n_outs = cfg.n_outs self.hidden_layers_sizes = cfg.hidden_layers_sizes self.hidden_layers_number = len(self.hidden_layers_sizes) self.activation = cfg.activation self.do_maxout = cfg.do_maxout self.pool_size = cfg.pool_size self.input_dropout_factor = cfg.input_dropout_factor self.dropout_factor = cfg.dropout_factor self.max_col_norm = cfg.max_col_norm self.l1_reg = cfg.l1_reg self.l2_reg = cfg.l2_reg if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') self.y = T.ivector('y') for i in xrange(self.hidden_layers_number): # construct the hidden layer if i == 0: input_size = self.n_ins layer_input = self.x if self.input_dropout_factor > 0.0: dropout_layer_input = _dropout_from_layer( theano_rng, self.x, self.input_dropout_factor) else: dropout_layer_input = self.x else: input_size = self.hidden_layers_sizes[i - 1] layer_input = ( 1 - self.dropout_factor[i - 1]) * self.layers[-1].output dropout_layer_input = self.dropout_layers[-1].dropout_output W = None b = None if (i in shared_layers): W = dnn_shared.layers[i].W b = dnn_shared.layers[i].b if self.do_maxout == False: dropout_layer = DropoutHiddenLayer( rng=numpy_rng, input=dropout_layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], W=W, b=b, activation=self.activation, dropout_factor=self.dropout_factor[i]) hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], activation=self.activation, W=dropout_layer.W, b=dropout_layer.b) else: dropout_layer = DropoutHiddenLayer( rng=numpy_rng, input=dropout_layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i] * self.pool_size, W=W, b=b, activation=(lambda x: 1.0 * x), dropout_factor=self.dropout_factor[i], do_maxout=True, pool_size=self.pool_size) hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i] * self.pool_size, activation=(lambda x: 1.0 * x), W=dropout_layer.W, b=dropout_layer.b, do_maxout=True, pool_size=self.pool_size) # add the layer to our list of layers self.layers.append(hidden_layer) self.dropout_layers.append(dropout_layer) self.params.extend(dropout_layer.params) self.delta_params.extend(dropout_layer.delta_params) # We now need to add a logistic layer on top of the MLP self.dropout_logLayer = LogisticRegression( input=self.dropout_layers[-1].dropout_output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs) self.logLayer = LogisticRegression( input=(1 - self.dropout_factor[-1]) * self.layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs, W=self.dropout_logLayer.W, b=self.dropout_logLayer.b) self.dropout_layers.append(self.dropout_logLayer) self.layers.append(self.logLayer) self.params.extend(self.dropout_logLayer.params) self.delta_params.extend(self.dropout_logLayer.delta_params) # compute the cost self.finetune_cost = self.dropout_logLayer.negative_log_likelihood( self.y) self.errors = self.logLayer.errors(self.y) if self.l1_reg is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W self.finetune_cost += self.l1_reg * (abs(W).sum()) if self.l2_reg is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W self.finetune_cost += self.l2_reg * T.sqr(W).sum()
def __init__( self, numpy_rng, theano_rng=None, n_ins=23, hidden_layers_size=[128,32], corruption_levels=[0.0,0.0], v_h_learning_rates=[0.1,0.1], h_v_learning_rates=[0.1,0.1] ): #self.hidden_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_size) self.v_h_learning_rates = v_h_learning_rates self.h_v_learning_rates = h_v_learning_rates self.corruption_levels = corruption_levels assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) self.x = T.matrix('x') # The SdA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders # We will first construct the SdA as a deep multilayer perceptron, # and when constructing each sigmoidal layer we also construct a # denoising autoencoder that shares weights with that layer # During pretraining we will train these autoencoders (which will # lead to chainging the weights of the MLP as well) # During finetunining we will finish training the SdA by doing # stochastich gradient descent on the MLP for i in range(self.n_layers): if i == 0: input_size = n_ins else: input_size = hidden_layers_size[i - 1] if i == 0: layer_input = self.x else: layer_input = self.dA_layers[-1].output ''' hidden_layer = HiddenLayer(rng=numpy_rng, input = layer_input, n_in = input_size, n_out=hidden_layers_size[i], activation=T.nnet.sigmoid) self.hidden_layers.append(hidden_layer) ''' if i == 0: dA_layer = dA(numpy_rng=numpy_rng, theano_rng = theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_size[i],#W=hidden_layer.W,bhid=hidden_layer.b, v_h_active = T.nnet.sigmoid) else: dA_layer = dA(numpy_rng=numpy_rng, theano_rng = theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_size[i],#W=hidden_layer.W,bhid=hidden_layer.b, v_h_active = T.nnet.sigmoid, h_v_active = T.nnet.sigmoid ) self.dA_layers.append(dA_layer) self.params.extend(dA_layer.params) #reconstructed_input = self.x #reconstructed_input = self.dA_layers[0].x #for i in range(self.n_layers): # reconstructed_input = self.dA_layers[i].get_hidden_values(reconstructed_input) #reconstructed_input = [self.x] #for i in range(self.n_layers): # temp = self.dA_layers[i].get_hidden_values(reconstructed_input[-1]) # reconstructed_input.append(temp) reconstructed_input = [] reconstructed_input.append(self.dA_layers[-1].output) for i in range(self.n_layers - 1, -1, -1): temp = self.dA_layers[i].get_reconstructed_input(reconstructed_input[-1]) reconstructed_input.append(temp) self.finetune_cost = self.dA_layers[0].get_error(reconstructed_input[-1])
def adam(params, cost=None, gradients=None, learningrate=0.0002, beta1=0.9, beta2=0.999, epsilon=1e-8, eta=0., gamma=0.55, iterstart=0): """ Computes the updates for ADAM. :type params: list :param params: Network parameters. :type cost: theano.tensor.var.TensorVariable :param cost: Cost variable (scalar). Optional if the gradient is provided. :type gradients: list :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided. :type learningrate: theano.tensor.var.TensorVariable or float :param learningrate: Learning rate of SGD. Can be a float (static) or a dynamic theano variable. :type beta1: float :param beta1: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980 :type beta2: float :param beta2: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980 :type epsilon: float :param epsilon: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980 :type eta: float :param eta: Eta for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf :type gamma: float :param gamma: Gamma for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf :type iterstart: int or float :param iterstart: Adam anneals the learning rate with iterations. This parameter specifies the initial value of the iteration count, such that the learning rate is scaled appropriately (or the model might jump out of the potential minimum where it's at). :return: List of updates """ # Validate input assert not (cost is None and gradients is None), "Update function adam requires either a cost scalar or a list of " \ "gradients." # Compute gradients if requested if gradients is None and cost is not None: pdC = T.grad(cost, wrt=params) # Kill gradients if cost is nan dC = [ th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC ] else: dC = gradients updates = [] # Gradient noising if not (eta == 0): # RNG srng = RandomStreams() # Iteration counter itercount = th.shared(np.asarray(iterstart, dtype=th.config.floatX)) # Add noise dC = [ dparam + srng.normal(size=dparam.shape, std=T.sqrt(eta / (1 + itercount)**gamma), dtype='floatX') for dparam in dC ] # Update itercount updates.append((itercount, itercount + 1)) # Implementation as in reference paper, nothing spectacular here... tm1 = th.shared(np.asarray(iterstart, dtype=th.config.floatX)) t = tm1 + 1 at = learningrate * T.sqrt(1 - beta2**t) / (1 - beta1**t) for param, dparam in zip(params, dC): paramshape = param.get_value().shape mtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX)) vtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX)) mt = beta1 * mtm1 + (1 - beta1) * dparam vt = beta2 * vtm1 + (1 - beta2) * dparam**2 u = at * mt / (T.sqrt(vt) + epsilon) updates.append((mtm1, mt)) updates.append((vtm1, vt)) updates.append((param, param - u)) updates.append((tm1, t)) return updates
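# --- Minimal usage sketch for the adam() updater above (illustrative only; it
# assumes adam() is defined in a module that imports numpy as np, theano as th
# and theano.tensor as T, as its body requires) ---
# Fit a single shared vector to a fixed target; gradients are passed explicitly,
# and the default eta=0 keeps the gradient noise switched off.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
target = T.vector('target')
cost = T.sum((w - target) ** 2)
updates = adam([w], gradients=T.grad(cost, [w]), learningrate=0.05)
train = theano.function([target], cost, updates=updates)
for _ in range(200):
    train(np.asarray([1., 2., 3.], dtype=theano.config.floatX))
print w.get_value()  # should approach [1, 2, 3]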
def __init__(self, train, test, alpha_lambda, n_user, n_item, n_in, n_hidden): """ Build the model parameters. :param train: training sequences after mask padding :param test: test sequences after mask padding :param n_user: actual number of users :param n_item: actual number of items; init() adds one extra item as a padding symbol :param n_in: dimension of the RNN input vectors :param n_hidden: dimension of the RNN hidden vectors :return: """ # Taken from the dAE part of the official Theano site. rng = np.random.RandomState(123) self.thea_rng = RandomStreams(rng.randint(2**30)) # its random functions can run on the GPU. # train/test after padding with masks tra_buys_masks, tra_masks, tra_buys_neg_masks = train tes_buys_masks, tes_masks, tes_buys_neg_masks = test self.tra_buys_masks = theano.shared(borrow=True, value=np.asarray(tra_buys_masks, dtype='int32')) self.tes_buys_masks = theano.shared(borrow=True, value=np.asarray(tes_buys_masks, dtype='int32')) self.tra_masks = theano.shared(borrow=True, value=np.asarray(tra_masks, dtype='int32')) self.tes_masks = theano.shared(borrow=True, value=np.asarray(tes_masks, dtype='int32')) self.tra_buys_neg_masks = theano.shared(borrow=True, value=np.asarray( tra_buys_neg_masks, dtype='int32')) self.tes_buys_neg_masks = theano.shared(borrow=True, value=np.asarray( tes_buys_neg_masks, dtype='int32')) # make the hyper-parameters shared self.alpha_lambda = theano.shared(borrow=True, value=np.asarray( alpha_lambda, dtype=theano.config.floatX)) # initialization: define local variables first, then turn them into instance variables via self. rang = 0.5 lt = uniform(-rang, rang, (n_item + 1, n_in)) # one extra row (the padding symbol), used to pad user purchase sequences / items that do not actually exist ui = uniform(-rang, rang, (4, n_hidden, n_hidden)) wh = uniform(-rang, rang, (4, n_hidden, n_hidden)) c0 = np.zeros((n_hidden, ), dtype=theano.config.floatX) h0 = np.zeros((n_hidden, ), dtype=theano.config.floatX) bi = np.zeros((4, n_hidden), dtype=theano.config.floatX) # create the parameters. self.lt = theano.shared(borrow=True, value=lt.astype(theano.config.floatX)) self.ui = theano.shared(borrow=True, value=ui.astype(theano.config.floatX)) self.wh = theano.shared(borrow=True, value=wh.astype(theano.config.floatX)) self.c0 = theano.shared(borrow=True, value=c0) self.h0 = theano.shared(borrow=True, value=h0) self.bi = theano.shared(borrow=True, value=bi) # store the trained user and item representations, used to compute scores of all users for all items: users * items trained_items = uniform(-rang, rang, (n_item + 1, n_hidden)) trained_users = uniform(-rang, rang, (n_user, n_hidden)) self.trained_items = theano.shared(borrow=True, value=trained_items.astype( theano.config.floatX)) self.trained_users = theano.shared(borrow=True, value=trained_users.astype( theano.config.floatX))
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # The DBN is an MLP, for which all weights of intermediate layers are shared with a # different RBM. We will first construct the DBN as a deep multilayer perceptron, and # when constructing each sigmoidal layer we also construct an RBM that shares weights # with that layer. During pretraining we will train these RBMs (which will lead # to chainging the weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of the layer below or # the input size if we are on the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the hidden layer below or the # input of the DBN if you are on the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are going to only declare that # the parameters of the sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not of the DBN. self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression(\ input = self.sigmoid_layers[-1].output,\ n_in = hidden_layers_sizes[-1], n_out = n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
def __init__(self, rate): self.p = numpy.array(1 - rate).astype(theano.config.floatX) self.rng = RandomStreams(numpy.random.randint(1234))
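# --- Standalone sketch (added; not a method of the class above) ---
# With a keep probability p and a RandomStreams object like self.rng, a dropout
# mask is typically drawn and applied as below. The 1/p rescaling (inverted
# dropout) is an assumption here, not something the class above specifies.
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

p = numpy.array(1 - 0.3).astype(theano.config.floatX)  # keep probability for rate=0.3
rng = RandomStreams(numpy.random.randint(1234))
h = T.matrix('h')
mask = rng.binomial(size=h.shape, p=p, n=1, dtype=theano.config.floatX)
h_train = h * mask / p  # training-time pass: drop units, rescale the survivors
h_test = h              # test-time pass needs no mask under this rescaling
apply_dropout = theano.function([h], h_train)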
def __init__(self, input=None, n_visible=784, n_hidden=500, \ W=None, hbias=None, vbias=None, numpy_rng=None, theano_rng=None): """ RBM constructor. Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing CD updates. :param input: None for standalone RBMs or symbolic variable if RBM is part of a larger graph. :param n_visible: number of visible units :param n_hidden: number of hidden units :param W: None for standalone RBMs or symbolic variable pointing to a shared weight matrix in case RBM is part of a DBN network; in a DBN, the weights are shared between RBMs and layers of a MLP :param hbias: None for standalone RBMs or symbolic variable pointing to a shared hidden units bias vector in case RBM is part of a different network :param vbias: None for standalone RBMs or a symbolic variable pointing to a shared visible units bias """ self.n_visible = n_visible self.n_hidden = n_hidden if numpy_rng is None: # create a random number generator numpy_rng = numpy.random.RandomState(1234) if theano_rng is None: theano_rng = RandomStreams(numpy_rng.randint(2**30)) if W is None: # W is initialized with `initial_W` which is uniformly # sampled from -4*sqrt(6./(n_visible+n_hidden)) and # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is # converted using asarray to dtype theano.config.floatX so # that the code is runnable on GPU initial_W = numpy.asarray(numpy_rng.uniform( low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), size=(n_visible, n_hidden)), dtype=theano.config.floatX) # theano shared variables for weights and biases W = theano.shared(value=initial_W, name='W', borrow=True) if hbias is None: # create shared variable for hidden units bias hbias = theano.shared(value=numpy.zeros( n_hidden, dtype=theano.config.floatX), name='hbias', borrow=True) if vbias is None: # create shared variable for visible units bias vbias = theano.shared(value=numpy.zeros( n_visible, dtype=theano.config.floatX), name='vbias', borrow=True) # initialize input layer for standalone RBM or layer0 of DBN self.input = input if not input: self.input = T.matrix('input') self.W = W self.hbias = hbias self.vbias = vbias self.theano_rng = theano_rng # **** WARNING: It is not a good idea to put things in this list # other than shared variables created in this function. self.params = [self.W, self.hbias, self.vbias]
def __init__(self, nin, nout, nhidden, ngauss, nvar): self.nin = nin self.nout = nout self.nhidden = nhidden self.ngauss = ngauss self.nu = nu = (self.nout * (self.nout + 1)) // 2 # In/out: x = T.dmatrix("x") theta = T.dmatrix("theta") # Parameters: W = theano.shared(np.random.rand(nin, nhidden), name="W") b = theano.shared(np.random.rand(nhidden), name="b") y = T.dot(x, W) + b W_alpha = theano.shared(1e-8 * np.random.rand(nhidden, ngauss), name="W_alpha") b_alpha = theano.shared(np.zeros(ngauss), name="b_alpha") alpha = T.nnet.softmax(T.dot(y, W_alpha) + b_alpha) W_mk = theano.shared(1e-8 * np.random.rand(nhidden, ngauss * nout), name="W_mk") b_mk = theano.shared(np.zeros((ngauss * nout)), name="b_mk") W_u = theano.shared(1e-8 * np.random.rand(nhidden, ngauss * nu), name="W_u") b_u = theano.shared(np.zeros((ngauss * nu)), name="b_u") # Compute the Gaussian cost using a reduce: Uvals = T.dot(y, W_u) + b_u mkvals = T.dot(y, W_mk) + b_mk def apply_gaussian(Uv, mk, a, th, current): for i in range(ngauss): arg = T.exp(Uv[i * nu:i * nu + nout]) current += T.sum(arg) U = T.diag(arg) U = T.set_subtensor(U[np.triu_indices(nout, 1)], Uv[i * nu + nout:(i + 1) * nu]) r = th - mk[i * nout:(i + 1) * nout] r2 = T.dot(r, T.dot(U.T, T.dot(U, r))) current += T.log(a[i]) - 0.5 * r2 return current outputs_info = T.as_tensor_variable(np.asarray(0.0, float)) lnprob, _ = theano.reduce(apply_gaussian, [Uvals, mkvals, alpha, theta], outputs_info) cost = -lnprob self.params = [W, b, W_alpha, b_alpha, W_mk, b_mk, W_u, b_u] self.grads = T.grad(cost, self.params) updates = get_adam_updates(cost, self.params) self.update_step = theano.function([x, theta], outputs=cost, updates=updates) self.cost_func = theano.function([x, theta], outputs=cost) # Stochastic objective: ntot = np.sum([np.prod(np.shape(p.get_value())) for p in self.params]) rng = RandomStreams() u = rng.normal((nvar, ntot)) phi_m = theano.shared(np.zeros(ntot), name="phi_m") phi_s = theano.shared(np.zeros(ntot), name="phi_s") phi = (phi_m + T.exp(0.5 * phi_s))[None, :] * u print(theano.function([], outputs=phi)().shape)
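# --- Illustrative sketch (added): the usual reparameterization of a diagonal Gaussian ---
# The stochastic objective above draws u ~ N(0, I) with RandomStreams and builds
# samples of the ntot variational parameters from phi_m (mean) and phi_s
# (log-variance). The standard "mean + std * noise" form of that trick is shown
# below; note that the snippet above instead multiplies the whole sum
# (phi_m + exp(0.5 * phi_s)) by u.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

ntot, nvar = 4, 3
rng = RandomStreams(seed=0)
u = rng.normal((nvar, ntot))                             # nvar draws of N(0, I) noise
phi_m = theano.shared(np.zeros(ntot), name="phi_m")      # variational mean
phi_s = theano.shared(np.zeros(ntot), name="phi_s")      # variational log-variance
phi = phi_m[None, :] + T.exp(0.5 * phi_s)[None, :] * u   # (nvar, ntot) samples
print(theano.function([], phi)().shape)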
def reset_rng(self): self.rng = N.random.RandomState([12.,9.,2.]) self.theano_rng = RandomStreams(self.rng.randint(2**30)) if self.initialized: self.redo_theano()
def __init__(self, numpy_rng, theano_rng=None, n_ins=110,
             hidden_layers_sizes=[30, 5], n_outs=2,
             corruption_levels=[0.1, 0.2]):
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in xrange(self.n_layers):
        # construct the sigmoidal layer

        # the size of the input is either the number of hidden units of
        # the layer below, or the input size if we are on the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # Construct a denoising autoencoder that shares weights with this layer
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=hidden_layers_sizes[i],
                      W=sigmoid_layer.W,
                      bhid=sigmoid_layer.b)
        self.dA_layers.append(dA_layer)

    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1], n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # compute the cost for the second phase of training
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)
    self.y_pred = self.logLayer.y_pred
def __init__(self, numpy_rng, theano_rng=None, input=None,
             n_visible=784, n_hidden=500,
             W=None, bhid=None, bvis=None):
    """
    Initialize the dA class by specifying the number of visible units (the
    dimension d of the input), the number of hidden units (the dimension
    d' of the latent or hidden space) and the corruption level. The
    constructor also receives symbolic variables for the input, weights
    and bias. Such symbolic variables are useful when, for example, the
    input is the result of some computations, or when weights are shared
    between the dA and an MLP layer. When dealing with SdAs this always
    happens: the dA on layer 2 gets as input the output of the dA on
    layer 1, and the weights of the dA are used in the second stage of
    training to construct an MLP.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: random number generator used to generate weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type input: theano.tensor.TensorType
    :param input: a symbolic description of the input or None for
                  standalone dA

    :type n_visible: int
    :param n_visible: number of visible units

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type W: theano.tensor.TensorType
    :param W: Theano variable pointing to a set of weights that should be
              shared between the dA and another architecture; if dA should
              be standalone set this to None

    :type bhid: theano.tensor.TensorType
    :param bhid: Theano variable pointing to a set of bias values (for
                 hidden units) that should be shared between dA and
                 another architecture; if dA should be standalone set
                 this to None

    :type bvis: theano.tensor.TensorType
    :param bvis: Theano variable pointing to a set of bias values (for
                 visible units) that should be shared between dA and
                 another architecture; if dA should be standalone set
                 this to None
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden

    # create a Theano random generator that gives symbolic random values
    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # note : W' was written as `W_prime` and b' as `b_prime`
    if not W:
        # W is initialized with `initial_W`, which is uniformly sampled
        # from -4*sqrt(6./(n_visible+n_hidden)) and
        # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
        # converted using asarray to dtype theano.config.floatX so that
        # the code is runnable on GPU
        initial_W = numpy.asarray(numpy_rng.uniform(
            low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
            high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
            size=(n_visible, n_hidden)), dtype=theano.config.floatX)
        W = theano.shared(value=initial_W, name='W', borrow=True)

    if not bvis:
        bvis = theano.shared(value=numpy.zeros(n_visible,
                                               dtype=theano.config.floatX),
                             borrow=True)

    if not bhid:
        bhid = theano.shared(value=numpy.zeros(n_hidden,
                                               dtype=theano.config.floatX),
                             name='b', borrow=True)

    self.W = W
    # b corresponds to the bias of the hidden units
    self.b = bhid
    # b_prime corresponds to the bias of the visible units
    self.b_prime = bvis
    # tied weights, therefore W_prime is W transpose
    self.W_prime = self.W.T
    self.theano_rng = theano_rng
    # if no input is given, generate a variable representing the input
    if input is None:
        # we use a matrix because we expect a minibatch of several
        # examples, each example being a row
        self.x = T.dmatrix(name='input')
    else:
        self.x = input

    self.params = [self.W, self.b, self.b_prime]
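The corruption method of this class is not shown here; below is a minimal, self-contained sketch of the masking pattern such a dA typically applies with its theano_rng, keeping each input entry with probability 1 - corruption_level and zeroing it otherwise. Names and sizes are illustrative.

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Illustrative sketch of input corruption with a shared random stream.
numpy_rng = numpy.random.RandomState(123)
theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

x = T.matrix('x')
corruption_level = 0.3
mask = theano_rng.binomial(size=x.shape, n=1, p=1 - corruption_level,
                           dtype=theano.config.floatX)
corrupted_x = mask * x      # each entry kept with prob. 1 - corruption_level

corrupt = theano.function([x], corrupted_x)
print(corrupt(numpy.ones((2, 5), dtype=theano.config.floatX)))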
extra_in_scp_file = arguments['extra_in_scp_file'] lstm_param_file = arguments['lstm_param_file'] lstm_cfg_file = arguments['lstm_cfg_file'] layer_index = int(arguments['layer_index']) # network structure cfg = cPickle.load(open(lstm_cfg_file, 'r')) cfg.init_activation() kaldiread = KaldiReadIn(in_scp_file) extra_kaldiread = KaldiReadIn(extra_in_scp_file) kaldiwrite = KaldiWriteOut(out_ark_file) log('> ... setting up the Phase ATTEND LSTM layers') rng = numpy.random.RandomState(89677) theano_rng = RandomStreams(rng.randint(2**30)) lstm = PhaseATTEND_LSTM(numpy_rng=rng, theano_rng=theano_rng, cfg=cfg) _file2nnet(layers=lstm.layers, set_layer_num=len(lstm.layers), filename=lstm_param_file) out_function = lstm.build_extract_feat_function() while True: uttid, in_matrix = kaldiread.read_next_utt() extra_uttid, extra_in_matrix = extra_kaldiread.read_next_utt() if uttid == '': break print 'in_matrix:' + str(in_matrix.shape) final_matrix = numpy.zeros((in_matrix.shape[0], cfg.n_outs), dtype=theano.config.floatX)
def test_dA(learning_rate=0.1, training_epochs=15,
            dataset='mnist.pkl.gz',
            batch_size=20, output_folder='dA_plots'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
                               givens={x: train_set_x[index * batch_size:
                                                      (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))
    image = PIL.Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
                               givens={x: train_set_x[index * batch_size:
                                                      (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))

    image = PIL.Image.fromarray(tile_raster_images(
        X=da.W.get_value(borrow=True).T,
        img_shape=(28, 28), tile_shape=(10, 10),
        tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')

    os.chdir('../')
class SampledMeanBinaryCrossEntropy(DefaultDataSpecsMixin, Cost):
    """
    .. todo::

        WRITEME properly

    CE cost that goes with sparse autoencoder with L1 regularization on
    activations

    For theory:
    Y. Dauphin, X. Glorot, Y. Bengio. ICML2011
    Large-Scale Learning of Embeddings with Reconstruction Sampling

    Parameters
    ----------
    L1 : WRITEME
    ratio : WRITEME
    """
    def __init__(self, L1, ratio):
        self.random_stream = RandomStreams(seed=1)
        self.L1 = L1
        self.one_ratio = ratio

    def expr(self, model, data, **kwargs):
        """
        .. todo::

            WRITEME
        """
        self.get_data_specs(model)[0].validate(data)
        X = data
        # X is theano sparse
        X_dense = theano.sparse.dense_from_sparse(X)
        noise = self.random_stream.binomial(size=X_dense.shape, n=1,
                                            prob=self.one_ratio, ndim=None)

        # a random pattern that indicates to reconstruct all the 1s and
        # some of the 0s in X
        P = noise + X_dense
        P = theano.tensor.switch(P > 0, 1, 0)
        P = tensor.cast(P, theano.config.floatX)

        # L1 penalty on activations
        reg_units = theano.tensor.abs_(model.encode(X)).sum(axis=1).mean()

        # penalty on weights, optional
        # params = model.get_params()
        # W = params[2]

        # there is a numerical problem when using
        # tensor.log(1 - model.reconstruct(X, P))
        # Pascal fixed it.
        before_activation = model.reconstruct_without_dec_acti(X, P)

        # numerically stable binary cross-entropy computed from the
        # pre-activation (softplus form)
        cost = (
            X_dense * tensor.log(1 + tensor.exp(-before_activation)) +
            (1 - X_dense) * tensor.log(1 + tensor.exp(before_activation)))

        cost = (cost * P).sum(axis=1).mean()

        cost = cost + self.L1 * reg_units

        return cost
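A small, self-contained sketch of the reconstruction-sampling mask P built in expr: every active (1) entry of the input is reconstructed, plus a random one_ratio fraction of the inactive (0) entries. The model-dependent parts are omitted; only the mask construction is shown, with names chosen for illustration.

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Illustrative sketch of the reconstruction-sampling mask.
random_stream = RandomStreams(seed=1)
one_ratio = 0.05

X_dense = T.matrix('X_dense')                  # dense 0/1 input
noise = random_stream.binomial(size=X_dense.shape, n=1, p=one_ratio)
P = T.switch(noise + X_dense > 0, 1, 0)        # all 1s, plus a sampled subset of 0s
P = T.cast(P, theano.config.floatX)

mask_fn = theano.function([X_dense], P)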
def __init__(self, dropout_rate): Layer.__init__(self) self.dropout_rate = dropout_rate numpy_rng = np.random.RandomState(123) self.theano_rng = RandomStreams(numpy_rng.randint(2**30))
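This constructor only stores the rate and the theano_rng. Below is a hedged sketch of how such a layer commonly applies its mask at training time (inverted dropout, so no rescaling is needed at test time); the forward-pass code is illustrative, not taken from the original class.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Illustrative sketch: sample a 0/1 mask with keep probability
# (1 - dropout_rate) and rescale so the expected activation is unchanged.
numpy_rng = np.random.RandomState(123)
theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

dropout_rate = 0.5
x = T.matrix('x')
mask = theano_rng.binomial(size=x.shape, n=1, p=1 - dropout_rate,
                           dtype=theano.config.floatX)
y_train = x * mask / (1 - dropout_rate)   # inverted dropout at training time
y_test = x                                # identity at test time

f_train = theano.function([x], y_train)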
class LstmBasic(object):
    def __init__(self, train, test, alpha_lambda, n_user, n_item, n_in, n_hidden):
        """
        Build the model parameters.
        :param train: training data, already padded with masks
        :param test: test data, already padded with masks
        :param n_user: actual number of users
        :param n_item: actual number of items; __init__ adds one extra item used as a padding symbol
        :param n_in: dimensionality of the RNN input vectors
        :param n_hidden: dimensionality of the RNN hidden vectors
        :return:
        """
        # Taken from the dAE part of the Theano tutorials.
        rng = np.random.RandomState(123)
        self.thea_rng = RandomStreams(rng.randint(2**30))   # its random functions can run on the GPU
        # train/test already padded with masks
        tra_buys_masks, tra_masks, tra_buys_neg_masks = train
        tes_buys_masks, tes_masks, tes_buys_neg_masks = test
        self.tra_buys_masks = theano.shared(borrow=True, value=np.asarray(tra_buys_masks, dtype='int32'))
        self.tes_buys_masks = theano.shared(borrow=True, value=np.asarray(tes_buys_masks, dtype='int32'))
        self.tra_masks = theano.shared(borrow=True, value=np.asarray(tra_masks, dtype='int32'))
        self.tes_masks = theano.shared(borrow=True, value=np.asarray(tes_masks, dtype='int32'))
        self.tra_buys_neg_masks = theano.shared(borrow=True, value=np.asarray(tra_buys_neg_masks, dtype='int32'))
        self.tes_buys_neg_masks = theano.shared(borrow=True, value=np.asarray(tes_buys_neg_masks, dtype='int32'))
        # make the hyper-parameters shared
        self.alpha_lambda = theano.shared(borrow=True, value=np.asarray(alpha_lambda, dtype=theano.config.floatX))
        # Initialization: define local variables first, then promote them to instance attributes.
        rang = 0.5
        lt = uniform(-rang, rang, (n_item + 1, n_in))   # one extra row (the padding symbol) used to pad
                                                        # purchase sequences / items that do not exist
        ui = uniform(-rang, rang, (4, n_hidden, n_hidden))
        wh = uniform(-rang, rang, (4, n_hidden, n_hidden))
        c0 = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        h0 = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        bi = np.zeros((4, n_hidden), dtype=theano.config.floatX)
        # Create the shared parameters.
        self.lt = theano.shared(borrow=True, value=lt.astype(theano.config.floatX))
        self.ui = theano.shared(borrow=True, value=ui.astype(theano.config.floatX))
        self.wh = theano.shared(borrow=True, value=wh.astype(theano.config.floatX))
        self.c0 = theano.shared(borrow=True, value=c0)
        self.h0 = theano.shared(borrow=True, value=h0)
        self.bi = theano.shared(borrow=True, value=bi)
        # Store the trained user/item representations, used to compute every user's
        # score for every item: users * items.
        trained_items = uniform(-rang, rang, (n_item + 1, n_hidden))
        trained_users = uniform(-rang, rang, (n_user, n_hidden))
        self.trained_items = theano.shared(borrow=True, value=trained_items.astype(theano.config.floatX))
        self.trained_users = theano.shared(borrow=True, value=trained_users.astype(theano.config.floatX))
        # Built-in predict function. Do not build it here; build it in the subclass,
        # otherwise the subclass cannot override it.
        # self.__theano_predict__(n_in, n_hidden)

    def update_neg_masks(self, tra_buys_neg_masks, tes_buys_neg_masks):
        # the negative samples are refreshed every epoch
        self.tra_buys_neg_masks.set_value(np.asarray(tra_buys_neg_masks, dtype='int32'), borrow=True)
        self.tes_buys_neg_masks.set_value(np.asarray(tes_buys_neg_masks, dtype='int32'), borrow=True)

    def update_trained_items(self):
        # update the final item representations
        lt = self.lt.get_value(borrow=True)     # self.lt is shared, so use get_value()
        self.trained_items.set_value(np.asarray(lt, dtype=theano.config.floatX), borrow=True)   # update

    def update_trained_users(self, all_hus):
        # computed externally and passed in for a direct update
        self.trained_users.set_value(np.asarray(all_hus, dtype=theano.config.floatX), borrow=True)  # update

    def compute_sub_all_scores(self, start_end):    # could just as well take plain (non-symbolic) arguments
        # compute users * items, i.e. each user's score for every item (the padding item is dropped)
        sub_all_scores = T.dot(self.trained_users[start_end], self.trained_items[:-1].T)
        return sub_all_scores.eval()                # shape=(sub_n_user, n_item)

    def compute_sub_auc_preference(self, start_end):
        # items.shape=(n_item+1, 20): because of the mask format a padding symbol is needed.
        # Note the matrix-style (fancy) indexing.
        tes_items = self.trained_items[self.tes_buys_masks[start_end]]  # shape=(sub_n_user, len(tes_mask[0]), n_hidden)
        tes_items_neg = self.trained_items[self.tes_buys_neg_masks[start_end]]
        users = self.trained_users[start_end]
        shp0, shp2 = users.shape                    # shape=(sub_n_user, n_hidden)
        # Uses broadcasting: (n_user, 1, n_hidden) * (n_user, len, n_hidden) = (n_user, len, n_hidden),
        # and np.sum((n_user, len, n_hidden), axis=2) = (n_user, len),
        # which gives each user's preference for the positive/negative test samples.
        all_upqs = T.sum(users.reshape((shp0, 1, shp2)) * (tes_items - tes_items_neg), axis=2)
        all_upqs *= self.tes_masks[start_end]       # keep only positions that are valid in the original test items
        return all_upqs.eval() > 0                  # mark values > 0 as True, i.e. 1

    def get_corrupted_input_whole(self, inp, corruption_prob):
        # 2D matrix: randomly set whole feature vectors to zero. Matrix.shape=(n, m)
        # Denoising scheme 0: randomly zero out entire image/text feature vectors.
        # E.g. if a sequence's image features are (num, 1024), the 0/1 mask is (num, 1);
        # apply T.Rebroadcast, then multiply.
        # if corruption_prob < 0. or corruption_prob >= 1.:
        #     raise Exception('Drop prob must be in interval [0, 1)')
        retain_prob = 1. - corruption_prob
        randoms = self.thea_rng.binomial(
            size=(inp.shape[0], 1),                 # shape=(num, 1)
            n=1,
            p=retain_prob,                          # p is the probability of drawing a 1
            dtype=theano.config.floatX)
        randoms = T.Rebroadcast((1, True))(randoms)
        return inp * randoms                        # shape=(num, 1024)

    def get_corrupted_input_whole_minibatch(self, inp, corruption_prob):
        # Empirically, corrupting the data before feeding the sequence gives better,
        # more stable results.
        # 3D tensor version.
        retain_prob = 1. - corruption_prob
        randoms = self.thea_rng.binomial(
            size=(inp.shape[0], inp.shape[1], 1),   # shape=(seq_length, batch_size, 1)
            n=1,
            p=retain_prob,                          # p is the probability of drawing a 1
            dtype=theano.config.floatX)
        randoms = T.Rebroadcast((2, True))(randoms)
        return inp * randoms                        # shape=(seq_length, batch_size, 1024)

    def dropout(self, inp, drop_prob):
        # Vector: randomly set some positions to zero. Vector.shape=(n, )
        # E.g. a 20-dimensional vector has 20 positions, i.e. 20 neurons.
        # Dropout is applied at training time; at test time the layer stays fully connected.
        # if drop_prob < 0. or drop_prob >= 1.:
        #     raise Exception('Drop prob must be in interval [0, 1)')
        retain_prob = 1. - drop_prob                # 0.5 works fine
        randoms = self.thea_rng.binomial(
            size=inp.shape,                         # a 0/1 vector with the same shape as inp
            n=1,                                    # one trial per neuron
            p=retain_prob)                          # each neuron is kept (*1) with prob. retain_prob, dropped with prob. drop_prob
        inp *= randoms                              # mask out some neurons by resetting them to 0
        inp /= retain_prob                          # rescale so the expected sum of inp is unchanged by dropout
        return inp                                  # inp is modified in place, so 'self.dropout(x, 0.5)' modifies x directly

    def __theano_predict__(self, n_in, n_hidden):
        """
        At test time, run the training sequences once more to obtain the hidden layers.
        All users' representations are computed from the full data in one pass.
        """
        ui, wh = self.ui, self.wh

        tra_mask = T.imatrix()
        actual_batch_size = tra_mask.shape[0]
        seq_length = T.max(T.sum(tra_mask, axis=1))     # longest sequence in the mini-batch is used as seq_length

        c0 = T.alloc(self.c0, actual_batch_size, n_hidden)      # shape=(n, 20)
        h0 = T.alloc(self.h0, actual_batch_size, n_hidden)      # shape=(n, 20)
        bi = T.alloc(self.bi, actual_batch_size, 4, n_hidden)   # shape=(n, 4, 20), original dims at the back
        bi = bi.dimshuffle(1, 2, 0)                              # shape=(4, 20, n)

        # The hidden layer is a single LSTM unit; the same unified format is used throughout.
        pidxs = T.imatrix()
        ps = self.trained_items[pidxs]      # shape=(actual_batch_size, seq_length, n_hidden)
        ps = ps.dimshuffle(1, 0, 2)         # shape=(seq_length, batch_size, n_hidden)=(157, n, 20)

        def recurrence(p_t, c_t_pre1, h_t_pre1):
            # features and hidden states are both shaped (batch_size, n_hidden)=(n, 20)
            gates = T.dot(ui, p_t.T) + T.dot(wh, h_t_pre1.T) + bi       # shape=(4, 20, n)
            i, f, g, o = sigmoid(gates[0]).T, sigmoid(gates[1]).T, tanh(gates[2]).T, sigmoid(gates[3]).T
            c_t = f * c_t_pre1 + i * g      # shape=(n, 20)
            h_t = o * tanh(c_t)             # shape=(n, 20)
            return [c_t, h_t]

        [c, h], _ = theano.scan(            # h.shape=(157, n, 20)
            fn=recurrence,
            sequences=ps,
            outputs_info=[c0, h0],
            n_steps=seq_length)

        # batch_hts.shape=(n, 20): the representation ht of every user in this batch.
        # T.sum() must be used; otherwise it cannot be built into the Theano graph
        # and a "length not known" error is raised.
        hs = h.dimshuffle(1, 0, 2)          # shape=(batch_size, seq_length, n_hidden)
        hts = hs[                           # shape=(n, n_hidden)
            T.arange(actual_batch_size),    # rows: fancy indexing a[[1,2,3],[2,5,6]] needs row and column indices
            T.sum(tra_mask, axis=1) - 1]    # columns: the mask must be 'int32'

        # feed the data via givens
        start_end = T.ivector()
        self.seq_predict = theano.function(
            inputs=[start_end],
            outputs=hts,
            givens={
                pidxs: self.tra_buys_masks[start_end],  # type is TensorType(int32, matrix)
                tra_mask: self.tra_masks[start_end]
            })

    def predict(self, idxs):
        return self.seq_predict(idxs)
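A self-contained sketch of the 3D corruption trick used in get_corrupted_input_whole_minibatch: one Bernoulli draw per (time step, sequence) position either keeps or zeroes an entire feature vector, and T.Rebroadcast lets the (.., .., 1) mask broadcast against the (.., .., n_in) input. Sizes here are illustrative.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

# Illustrative sketch of whole-feature corruption on a 3D minibatch.
thea_rng = RandomStreams(np.random.RandomState(123).randint(2 ** 30))

inp = T.tensor3('inp')                      # (seq_length, batch_size, n_in)
corruption_prob = 0.2
randoms = thea_rng.binomial(size=(inp.shape[0], inp.shape[1], 1), n=1,
                            p=1. - corruption_prob,
                            dtype=theano.config.floatX)
randoms = T.Rebroadcast((2, True))(randoms)  # make axis 2 broadcastable
corrupted = inp * randoms                    # whole feature vectors kept or zeroed

f = theano.function([inp], corrupted)
print(f(np.ones((3, 2, 4), dtype=theano.config.floatX)).shape)  # -> (3, 2, 4)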