def ada_updates(params, grads, rho=0.95, eps=1e-6):
    '''Build Adadelta update rules for a list of shared parameters.

    reference: http://www.cnblogs.com/neopenx/p/4768388.html
    (Zeiler, "ADADELTA: An Adaptive Learning Rate Method", arXiv:1212.5701)

    params: theano shared variables to be optimized
    grads : symbolic gradients of the cost w.r.t. each entry of `params`
    rho   : decay rate of the running averages
    eps   : smoothing constant inside both RMS terms

    Returns four update lists, intended to be applied in this order:
    gradient cache -> squared-gradient average -> squared-delta average
    -> parameter step.
    '''
    def _zeros_like(p):
        # shared accumulator with the same shape/dtype as parameter p
        return theano.shared(p.get_value() * th_floatX(0.))

    # running_gr     : cached copy of the most recent gradients
    # running_dp_sqr : running average of squared parameter deltas
    # running_gr_sqr : running average of squared gradients
    running_gr = [_zeros_like(p) for p in params]
    running_dp_sqr = [_zeros_like(p) for p in params]
    running_gr_sqr = [_zeros_like(p) for p in params]

    # cache the freshly computed gradients
    gr_updates = list(zip(running_gr, grads))

    # exponential moving average of squared gradients
    gr_sqr_updates = [
        (acc, rho * acc + (1 - rho) * g ** 2)
        for acc, g in zip(running_gr_sqr, running_gr)
    ]

    # deltas via the RMS ratio; the squared-delta average used here is
    # still the one from the previous step, because the new deltas have
    # not been computed yet
    deltas = [
        -g * (d_acc + eps) ** 0.5 / (g_acc + eps) ** 0.5
        for g, d_acc, g_acc in zip(running_gr, running_dp_sqr,
                                   running_gr_sqr)
    ]

    # exponential moving average of squared deltas
    dp_sqr_updates = [
        (acc, rho * acc + (1 - rho) * d ** 2)
        for acc, d in zip(running_dp_sqr, deltas)
    ]

    # apply the computed step to every parameter
    param_updates = [(p, p + d) for p, d in zip(params, deltas)]

    return gr_updates, gr_sqr_updates, dp_sqr_updates, param_updates
def __init__(self, corpus, n_emb, n_hidden, batch_size, conv_size, pooling,
             rng=None, th_rng=None, load_from=None, gensim_w2v=None):
    '''Build the CNN classification model and its symbolic training graph.

    corpus: data-set wrapper; provides `dic` (vocabulary), `maxlen` and
        `n_type` (number of target classes) — assumed from usage below,
        confirm against the corpus class
    n_emb: word-embedding dimensionality
    n_hidden: output conv stack size (number of convolution filters)
    batch_size: number of samples per mini-batch
    conv_size: filter height size (number of words each filter covers)
    pooling: 'mean' or 'max' — pooling strategy over the conv outputs
    load_from: optional source of pretrained weights, forwarded to layers
    gensim_w2v: optional gensim word2vec model for embedding init
    '''
    self.corpus = corpus
    self.n_emb = n_emb
    self.n_hidden = n_hidden
    self.batch_size = batch_size
    self.conv_size = conv_size
    self.pooling = pooling
    assert pooling in ('mean', 'max')

    # fixed seeds -> reproducible initialization / dropout noise
    if rng is None:
        rng = np.random.RandomState(1226)
    if th_rng is None:
        th_rng = RandomStreams(1226)

    # x/mask: (batch size, nsteps)
    x = T.matrix('x', dtype='int32')
    mask = T.matrix('mask', dtype=theano.config.floatX)
    y = T.vector('y', dtype='int32')
    batch_idx_seq = T.vector('index', dtype='int32')
    # use_noise acts as the train(1)/test(0) dropout switch — confirm
    # against DropOutLayer
    use_noise = theano.shared(th_floatX(0.))
    self.x, self.mask, self.y, self.batch_idx_seq, self.use_noise = x, mask, y, batch_idx_seq, use_noise

    # No need for transpose of x/mask in CNN
    n_samples, n_steps = x.shape
    # transpose mask-matrix to be consistent with pooling-layer-inputs
    trans_mask = mask.T
    # truncate mask-matrix to be consistent with conv-outputs
    trunc_mask = trans_mask[(conv_size - 1):]

    # list of model layers, chained in order of construction
    model_layers = []
    model_layers.append(
        EmbLayer(x,
                 load_from=load_from,
                 rand_init_params=(rng, (corpus.dic.size, n_emb)),
                 gensim_w2v=gensim_w2v,
                 dic=corpus.dic))
    # emb-out: (batch size, n_words/steps, emb_dim)
    # conv-in: (batch size, 1(input stack size), n_words/steps, emb_dim)
    # conv-out: (batch size, n_hidden(output stack size), output feature map height, 1(output feature map width))
    # pooling-in: (output feature map height, batch size, output stack size)
    conv_in = model_layers[-1].outputs[:, None, :, :]
    model_layers.append(
        ConvLayer(conv_in,
                  image_shape=(batch_size, 1, corpus.maxlen, n_emb),
                  load_from=load_from,
                  rand_init_params=(rng, (n_hidden, 1, conv_size, n_emb))))
    pooling_in = T.transpose(model_layers[-1].outputs.flatten(3),
                             axes=(2, 0, 1))
    if pooling == 'mean':
        model_layers.append(MeanPoolingLayer(pooling_in, trunc_mask))
    else:
        model_layers.append(MaxPoolingLayer(pooling_in, trunc_mask))
    model_layers.append(
        DropOutLayer(model_layers[-1].outputs, use_noise, th_rng))
    # final softmax layer mapping pooled features to class probabilities
    model_layers.append(
        HiddenLayer(model_layers[-1].outputs,
                    activation=T.nnet.softmax,
                    load_from=load_from,
                    rand_init_params=(rng, (n_hidden, corpus.n_type))))
    self.model_layers = model_layers

    # collect the trainable parameters of every layer
    model_params = []
    for layer in model_layers:
        model_params += layer.params

    self.pred_prob = model_layers[-1].outputs
    self.pred = T.argmax(self.pred_prob, axis=1)
    # small offset to avoid log(0) in the cross-entropy cost
    off = 1e-8
    self.cost = -T.mean(
        T.log(self.pred_prob[T.arange(n_samples), y] + off))

    # attributes with `func` suffix is compiled function
    self.predict_func = theano.function(inputs=[x, mask],
                                        outputs=self.pred)
    self.predict_prob_func = theano.function(inputs=[x, mask],
                                             outputs=self.pred_prob)

    # Adadelta update lists, to be compiled/applied by the training loop
    grads = T.grad(self.cost, model_params)
    self.gr_updates, self.gr_sqr_updates, self.dp_sqr_updates, self.param_updates = ada_updates(
        model_params, grads)
def __init__(self, corpus, n_emb, n_hidden, pooling,
             rng=None, th_rng=None, load_from=None, gensim_w2v=None):
    '''Build the RNN classification model and its symbolic training graph.

    corpus: data-set wrapper; provides `dic` (vocabulary) and `n_type`
        (number of target classes) — assumed from usage below, confirm
        against the corpus class
    n_emb: word-embedding dimensionality
    n_hidden: size of the recurrent hidden state
    pooling: 'mean' or 'max' — pooling strategy over the RNN outputs
    load_from: optional source of pretrained weights, forwarded to layers
    gensim_w2v: optional gensim word2vec model for embedding init
    '''
    self.corpus = corpus
    self.n_emb = n_emb
    self.n_hidden = n_hidden
    self.pooling = pooling
    assert pooling in ('mean', 'max')

    # fixed seeds -> reproducible initialization / dropout noise
    if rng is None:
        rng = np.random.RandomState(1226)
    if th_rng is None:
        th_rng = RandomStreams(1226)

    # x/mask: (batch size, nsteps)
    x = T.matrix('x', dtype='int32')
    mask = T.matrix('mask', dtype=theano.config.floatX)
    y = T.vector('y', dtype='int32')
    batch_idx_seq = T.vector('index', dtype='int32')
    # use_noise acts as the train(1)/test(0) dropout switch — confirm
    # against DropOutLayer
    use_noise = theano.shared(th_floatX(0.))
    self.x, self.mask, self.y, self.batch_idx_seq, self.use_noise = x, mask, y, batch_idx_seq, use_noise

    # TRANSPOSE THE AXIS! RNN scans over the leading (time) axis
    trans_x, trans_mask = x.T, mask.T
    # truncate the useless data (trailing all-padding steps)
    trunc_x, trunc_mask = RNNModel.trunc_inputs_mask(trans_x, trans_mask)
    n_steps, n_samples = trunc_x.shape

    # list of model layers, chained in order of construction
    model_layers = []
    model_layers.append(
        EmbLayer(trunc_x,
                 load_from=load_from,
                 rand_init_params=(rng, (corpus.dic.size, n_emb)),
                 gensim_w2v=gensim_w2v,
                 dic=corpus.dic))
    model_layers.append(
        RNNLayer(model_layers[-1].outputs,
                 trunc_mask,
                 load_from=load_from,
                 rand_init_params=(rng, (n_emb, n_hidden))))
    if pooling == 'mean':
        model_layers.append(
            MeanPoolingLayer(model_layers[-1].outputs, trunc_mask))
    else:
        model_layers.append(
            MaxPoolingLayer(model_layers[-1].outputs, trunc_mask))
    model_layers.append(
        DropOutLayer(model_layers[-1].outputs, use_noise, th_rng))
    # final softmax layer mapping pooled features to class probabilities
    model_layers.append(
        HiddenLayer(model_layers[-1].outputs,
                    activation=T.nnet.softmax,
                    load_from=load_from,
                    rand_init_params=(rng, (n_hidden, corpus.n_type))))
    self.model_layers = model_layers

    # collect the trainable parameters of every layer
    model_params = []
    for layer in model_layers:
        model_params += layer.params

    self.pred_prob = model_layers[-1].outputs
    self.pred = T.argmax(self.pred_prob, axis=1)
    # small offset to avoid log(0) in the cross-entropy cost
    off = 1e-8
    self.cost = -T.mean(
        T.log(self.pred_prob[T.arange(n_samples), y] + off))

    # attributes with `func` suffix is compiled function
    self.predict_func = theano.function(inputs=[x, mask],
                                        outputs=self.pred)
    self.predict_prob_func = theano.function(inputs=[x, mask],
                                             outputs=self.pred_prob)

    # Adadelta update lists, to be compiled/applied by the training loop
    grads = T.grad(self.cost, model_params)
    self.gr_updates, self.gr_sqr_updates, self.dp_sqr_updates, self.param_updates = ada_updates(
        model_params, grads)