def rmsprop(lr, tparams, grads, x, mask, y, cost):
    """ A variant of SGD that scales the step size by a running average of the
    recent gradient magnitudes.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of the cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    zipped_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_grad" % k)
        for k, p in tparams.iteritems()
    ]
    running_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_rgrad" % k)
        for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_rgrad2" % k)
        for k, p in tparams.iteritems()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name="rmsprop_f_grad_shared")

    updir = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_updir" % k)
        for k, p in tparams.iteritems()
    ]
    updir_new = [
        (ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
        for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)
    ]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function(
        [lr], [], updates=updir_new + param_up,
        on_unused_input="ignore", name="rmsprop_f_update"
    )

    return f_grad_shared, f_update
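# Note: the optimizers here all share a two-function interface.
# f_grad_shared(x, mask, y) evaluates the cost and stores the current gradients in
# shared variables; f_update(lr) then applies one parameter update from them.
# For the rmsprop above the update is, per parameter,
#
#     rg  <- 0.95 * rg  + 0.05 * g           (running mean of the gradient)
#     rg2 <- 0.95 * rg2 + 0.05 * g**2        (running mean of the squared gradient)
#     ud  <- 0.9 * ud - 1e-4 * g / sqrt(rg2 - rg**2 + 1e-4)
#     p   <- p + ud
#
# The lr argument is not actually used inside the update (the step size is the
# hard-coded 1e-4), which is why f_update is compiled with on_unused_input="ignore".
# A minimal, hypothetical training loop (minibatches, x_b, mask_b, y_b are
# placeholder names, not defined here):
#
#     f_grad_shared, f_update = rmsprop(lr, tparams, grads, x, mask, y, cost)
#     for x_b, mask_b, y_b in minibatches:
#         batch_cost = f_grad_shared(x_b, mask_b, y_b)
#         f_update(1e-4)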
def adadelta(lr, tparams, grads, x, mask, y, cost):
    """ An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of the cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning Rate
       Method*, arXiv:1212.5701.
    """
    zipped_grads = [theano.shared(p.get_value() * utils.numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * utils.numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * utils.numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
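# Note: the Adadelta update implemented above is, per parameter,
#
#     E[g^2]  <- 0.95 * E[g^2]  + 0.05 * g^2
#     dx       = -sqrt(E[dx^2] + 1e-6) / sqrt(E[g^2] + 1e-6) * g
#     E[dx^2] <- 0.95 * E[dx^2] + 0.05 * dx^2
#     p       <- p + dx
#
# As in Zeiler's paper, the effective step size comes from the ratio of the two
# running averages, so the lr argument of f_update is unused here as well.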
def encoder(tparams, state_below, mask, seq_output=False, prefix='lstm_encoder'): """ state_below: size of n_steps * n_samples * n_x """ n_steps = state_below.shape[0] n_samples = state_below.shape[1] n_h = tparams[_p(prefix, 'U')].shape[0] def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ tparams[_p(prefix, 'b')] def _step(m_, x_, h_, c_, U): preact = tensor.dot(h_, U) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, n_h)) f = tensor.nnet.sigmoid(_slice(preact, 1, n_h)) o = tensor.nnet.sigmoid(_slice(preact, 2, n_h)) c = tensor.tanh(_slice(preact, 3, n_h)) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c seqs = [mask, state_below_] rval, updates = theano.scan(_step, sequences=seqs, outputs_info=[ tensor.alloc(numpy_floatX(0.), n_samples, n_h), tensor.alloc(numpy_floatX(0.), n_samples, n_h) ], non_sequences=[tparams[_p(prefix, 'U')]], name=_p(prefix, '_layers'), n_steps=n_steps, strict=True) h_rval = rval[0] if seq_output: return h_rval else: # size of n_samples * n_h return h_rval[-1]
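# Note: W and U above hold the four LSTM gates concatenated along the last axis
# (shapes n_x x 4*n_h and n_h x 4*n_h), so _slice(preact, k, n_h) picks out gate k.
# Each scan step computes
#
#     preact = x_t.W + b + h_{t-1}.U
#     i = sigmoid(preact[:, 0:n_h])          f = sigmoid(preact[:, n_h:2*n_h])
#     o = sigmoid(preact[:, 2*n_h:3*n_h])    c_tilde = tanh(preact[:, 3*n_h:4*n_h])
#     c_t = f * c_{t-1} + i * c_tilde
#     h_t = o * tanh(c_t)
#
# and the two mask lines keep h_t and c_t frozen at their previous values wherever
# mask == 0, so padded positions of shorter sentences do not affect the state.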
def encoder(tparams, state_below, mask, seq_output=False, prefix='lstm_encoder'): """ state_below: size of n_steps * n_samples * n_x """ n_steps = state_below.shape[0] n_samples = state_below.shape[1] n_h = tparams[_p(prefix,'U')].shape[0] def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n*dim:(n+1)*dim] return _x[:, n*dim:(n+1)*dim] state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ tparams[_p(prefix, 'b')] def _step(m_, x_, h_, c_, U): preact = tensor.dot(h_, U) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, n_h)) f = tensor.nnet.sigmoid(_slice(preact, 1, n_h)) o = tensor.nnet.sigmoid(_slice(preact, 2, n_h)) c = tensor.tanh(_slice(preact, 3, n_h)) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c seqs = [mask, state_below_] rval, updates = theano.scan(_step, sequences=seqs, outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples,n_h), tensor.alloc(numpy_floatX(0.), n_samples,n_h)], non_sequences = [tparams[_p(prefix, 'U')]], name=_p(prefix, '_layers'), n_steps=n_steps, strict=True) h_rval = rval[0] if seq_output: return h_rval else: # size of n_samples * n_h return h_rval[-1]
def decoder_layer(tparams, state_below, prefix='decoder_lstm'): """ state_below: size of n_steps * n_samples * n_x """ nsteps = state_below.shape[0] n_h = tparams[_p(prefix, 'U')].shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(x_, h_, c_, U): preact = tensor.dot(h_, U) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, n_h)) f = tensor.nnet.sigmoid(_slice(preact, 1, n_h)) o = tensor.nnet.sigmoid(_slice(preact, 2, n_h)) c = tensor.tanh(_slice(preact, 3, n_h)) c = f * c_ + i * c h = o * tensor.tanh(c) return h, c state_below_ = tensor.dot(state_below, tparams[_p( prefix, 'W')]) + tparams[_p(prefix, 'b')] seqs = [state_below_] rval, updates = theano.scan(_step, sequences=seqs, outputs_info=[ tensor.alloc(numpy_floatX(0.), n_samples, n_h), tensor.alloc(numpy_floatX(0.), n_samples, n_h) ], non_sequences=[tparams[_p(prefix, 'U')]], name=_p(prefix, '_layers'), n_steps=nsteps, strict=True) h_rval = rval[0] return h_rval
def rmsprop(lr, tparams, grads, iin, out, updates): """ A variant of SGD that scales the step size by running average of the recent step norms. Notes ----- For more information, see [Hint2014]_. .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*, lecture 6a, http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf """ zipped_grads = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_grad' % k) for k, p in tparams.items() ] running_grads = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_rgrad' % k) for k, p in tparams.items() ] running_grads2 = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_rgrad2' % k) for k, p in tparams.items() ] zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2)) for rg2, g in zip(running_grads2, grads)] f_grad_shared = theano.function(iin, out, updates=zgup + rgup + rg2up + updates, name='rmsprop_f_grad_shared') updir = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_updir' % k) for k, p in tparams.items() ] updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg**2 + 1e-4)) for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)] param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)] f_update = theano.function([lr], [], updates=updir_new + param_up, on_unused_input='ignore', name='rmsprop_f_update') return f_grad_shared, f_update
def perform(self, x): x1, x2 = x[0], x[1] nsteps = x1.shape[0] n_samples = x1.shape[1] # if x1.ndim == 3: # n_samples = x1.shape[1] # else: # n_samples = 1 # def _slice(x_t, idx, ndim): if x_t.ndim == 3: return x_t[:, :, idx * ndim: (idx + 1) * ndim] return x_t[:, idx * ndim:(idx + 1) * ndim] def _step(x_t, h_tm1, c_tm1, W, U, b): # z = sigmoid( W * x(t) + U * h(t-1) + b) # zi = W * x(t) + U * h(t-1) + b zi = T.dot(x_t, W) + T.dot(h_tm1, U) + b # zi = T.dot(h_tm1, self.Uh) # zi += x_t # W = [Wi, Wf, Wo, Wc], U = [Ui, Uf, Uo, Uc], b = [bi, bf, bo, bc] i = T.nnet.sigmoid(_slice(zi, 0, self.n_output)) f = T.nnet.sigmoid(_slice(zi, 1, self.n_output)) o = T.nnet.sigmoid(_slice(zi, 2, self.n_output)) c = T.tanh(_slice(zi, 3, self.n_output)) c = f * c_tm1 + i * c; h = o * T.tanh(c) # output at each time # s = softmax(w * h_t + b) return h, c # h0 and c0 are initialized randomly h0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output); c0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output) h0 = theano.tensor.unbroadcast(h0, 1); c0 = theano.tensor.unbroadcast(c0, 1) [h, c], _ = theano.scan(_step, sequences=[x1], outputs_info=[h0, c0], non_sequences=[self.Wh, self.Uh, self.bh], name='blstm_layers', n_steps=nsteps) [h_reverse, c_reverse], _ = theano.scan(_step, sequences=[x2], outputs_info=[h0, c0], non_sequences=[self.Wh_reverse, self.Uh_reverse, self.bh_reverse], name='blstm_layers_reverse', n_steps=nsteps, go_backwards=True) self.input = x self.output = [h, h_reverse]
def adadelta(lr, tparams, grads, iin, out, updates): """ An adaptive learning rate optimizer Notes ----- For more information, see [ADADELTA]_. .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning Rate Method*, arXiv:1212.5701. """ zipped_grads = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_grad' % k) for k, p in tparams.items() ] running_up2 = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_rup2' % k) for k, p in tparams.items() ] running_grads2 = [ theano.shared(p.get_value() * utils.numpy_floatX(0.), name='%s_rgrad2' % k) for k, p in tparams.items() ] zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2)) for rg2, g in zip(running_grads2, grads)] f_grad_shared = theano.function(iin, out, updates=zgup + rg2up + updates, name='adadelta_f_grad_shared') updir = [ -tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2) ] ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2)) for ru2, ud in zip(running_up2, updir)] param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] f_update = theano.function([lr], [], updates=ru2up + param_up, on_unused_input='ignore', name='adadelta_f_update') return f_grad_shared, f_update
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 assert mask is not None def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(m_, x_, h_, c_): preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) dim_proj = options['dim_proj'] rval, updates = theano.scan(_step, sequences=[mask, state_below], outputs_info=[ tensor.alloc(numpy_floatX(0.), n_samples, dim_proj), tensor.alloc(numpy_floatX(0.), n_samples, dim_proj) ], name=_p(prefix, '_layers'), n_steps=nsteps) # outputs_info include h_ and c_ # return only hidden states, so return rval[0] return rval[0]
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8): """ default: lr=0.0002 """ grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g ** 2) for g in grads])) if tensor.ge(norm, 5): grads = [g * 5 / norm for g in grads] gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k) for k, p in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.0)) i_t = i + 1.0 fix1 = 1.0 - b1 ** (i_t) fix2 = 1.0 - b2 ** (i_t) lr_t = lr * (tensor.sqrt(fix2) / fix1) for p, g in zip(tparams.values(), gshared): m = theano.shared(p.get_value() * 0.0) v = theano.shared(p.get_value() * 0.0) m_t = (b1 * g) + ((1.0 - b1) * m) v_t = (b2 * tensor.sqr(g)) + ((1.0 - b2) * v) g_t = m_t / (tensor.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) f_update = theano.function([lr], [], updates=updates) return f_grad_shared, f_update
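# Note: b1 and b2 in this Adam are one minus the usual beta1/beta2 of the Adam
# paper, i.e. the moment estimates decay as
#
#     m <- 0.1 * g     + 0.9 * m       (b1 = 0.1   -> beta1 = 0.9)
#     v <- 0.001 * g^2 + 0.999 * v     (b2 = 0.001 -> beta2 = 0.999)
#     p <- p - lr * sqrt(1 - b2^t) / (1 - b1^t) * m / (sqrt(v) + e)
#
# The gradients are first rescaled by 5 / norm whenever their global norm exceeds 5
# (gradient clipping), and the same f_grad_shared / f_update split as above is used.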
def build_model(tparams, options): trng = RandomStreams(options['SEED']) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # size of n_samples * n_z z = tensor.matrix('z', dtype=config.floatX) # size of n_samples * n_y y = tensor.matrix('y', dtype=config.floatX) z = dropout(z, trng, use_noise) h = tensor.tanh(tensor.dot(z, tparams['Wy1']) + tparams['by1']) h = dropout(h, trng, use_noise) # size of n_samples * n_y pred = tensor.nnet.sigmoid(tensor.dot(h, tparams['Wy2']) + tparams['by2']) f_pred = theano.function([z], pred, name='f_pred') cost = (-y * tensor.log(pred + 1e-6) - (1. - y) * tensor.log(1. - pred + 1e-6)).sum() / z.shape[0] return use_noise, z, y, cost, f_pred
def build_model(tparams,options): trng = RandomStreams(options['SEED']) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # input sentences, size of n_steps * n_samples x = tensor.matrix('x', dtype='int64') # the corresponding masks padding zeros mask = tensor.matrix('mask', dtype=config.floatX) # size of n_samples * n_z z = tensor.matrix('z', dtype=config.floatX) y = tensor.matrix('y', dtype=config.floatX) z = dropout(z, trng, use_noise) y = dropout(y, trng, use_noise) n_steps = x.shape[0] # the sentence length in this mini-batch n_samples = x.shape[1] # the number of sentences in this mini-batch n_x = tparams['Wemb'].shape[1] # the dimension of the word embedding # size of n_steps,n_samples,n_x emb = tparams['Wemb'][x.flatten()].reshape([n_steps,n_samples,n_x]) emb = dropout(emb, trng, use_noise) # 1 * n_samples * n_x z0 =tensor.dot(z,tparams['C0']).dimshuffle('x',0,1) # n_steps * n_samples * n_x emb_input = tensor.concatenate((z0,emb[:n_steps-1])) # n_steps * n_samples mask0 =mask[0].dimshuffle('x',0) mask_input = tensor.concatenate((mask0,mask[:n_steps-1])) # decoding the sentence vector z back into the original sentence h_decoder = encoder_layer(tparams, emb_input, mask_input,y, seq_output=True) h_decoder = dropout(h_decoder, trng, use_noise) shape = h_decoder.shape h_decoder = h_decoder.reshape((shape[0]*shape[1], shape[2])) Vhid = tensor.dot(tparams['Vhid'],tparams['Wemb'].T) pred_x = tensor.dot(h_decoder, Vhid) + tparams['bhid'] pred = tensor.nnet.softmax(pred_x) x_vec = x.reshape((shape[0]*shape[1],)) index = tensor.arange(shape[0]*shape[1]) pred_word = pred[index, x_vec] mask_word = mask.reshape((shape[0]*shape[1],)) index_list = theano.tensor.eq(mask_word, 1.).nonzero()[0] pred_word = pred_word[index_list] # the cross-entropy loss cost = -tensor.log(pred_word + 1e-6).sum() / n_samples return use_noise, x, mask, y, z, cost
def decoder_layer(tparams, state_below, prefix='decoder_vanilla'): """ state_below: size of n_steps * n_samples * n_x """ n_steps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 n_h = tparams[_p(prefix, 'U')].shape[0] def _step_slice(x_, h_, U): preact = tensor.dot(h_, U) preact += x_ h = tensor.tanh(preact) return h state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ tparams[_p(prefix, 'b')] rval, updates = theano.scan( _step_slice, sequences=[state_below_], outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, n_h)], non_sequences=[tparams[_p(prefix, 'U')]], name=_p(prefix, '_layers'), n_steps=n_steps) return rval
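# Note: decoder_vanilla is a plain (non-gated) recurrent layer; each scan step above
# computes
#
#     h_t = tanh(x_t.W + b + h_{t-1}.U)
#
# starting from h_0 = 0, and the full hidden-state sequence
# (n_steps * n_samples * n_h) is returned.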
def rmsprop(self, lr, tparams, grads, inp_list, cost, params): clip = params["grad_clip"] decay_rate = tensor.constant(params["decay_rate"], dtype=theano.config.floatX) smooth_eps = tensor.constant(params["smooth_eps"], dtype=theano.config.floatX) zipped_grads = [theano.shared(np.zeros_like(p.get_value()), name="%s_grad" % k) for k, p in tparams.iteritems()] running_grads2 = [ theano.shared(np.zeros_like(p.get_value()), name="%s_rgrad2" % k) for k, p in tparams.iteritems() ] zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] if clip > 0.0: rg2up = [ ( rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2), 0.0, np.inf), ) for rg2, g in zip(running_grads2, grads) ] else: rg2up = [ (rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2), 0.0, np.inf)) for rg2, g in zip(running_grads2, grads) ] f_grad_shared = theano.function(inp_list, cost, updates=zgup + rg2up, name="rmsprop_f_grad_shared") updir = [theano.shared(p.get_value() * numpy_floatX(0.0), name="%s_updir" % k) for k, p in tparams.iteritems()] updir_new = [ (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps)) for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2) ] param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)] f_update = theano.function( [lr], [], updates=updir_new + param_up, on_unused_input="ignore", name="rmsprop_f_update" ) return f_grad_shared, f_update, zipped_grads, running_grads2, updir
def pred_error(f_pred, prepare_data, data, iterator, fname='', verbose=False): """ Just compute the error f_pred: Theano fct computing the prediction prepare_data: usual prepare_data for that dataset. """ valid_err = 0 if verbose: f = open('../data/trec/TREC_10.label') lines = f.readlines() f.close() f_out = open(fname + 'trec_out_dscnn.txt', 'w') cat_ind = ['ABBR', 'ENTY', 'DESC', 'HUM', 'LOC', 'NUM'] cnt = 0 for b, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], numpy.array(data[1])[valid_index]) preds = f_pred(x, mask) targets = numpy.array(data[1])[valid_index] if verbose: for i in range(len(preds)): p = preds[i] if p != targets[i]: f_out.write('*') f_out.write(cat_ind[p] + ' ') f_out.write(lines[cnt]) cnt += 1 valid_err += (preds == targets).sum() valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) return valid_err * 100
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 assert mask is not None def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n+1) * dim] return _x[:, n * dim:(n+1) * dim] def _step(m_, x_, h_, c_): preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) dim_proj = options['dim_proj'] rval, updates = theano.scan(_step, sequences=[mask, state_below], outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, dim_proj), tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)], name=_p(prefix, '_layers'), n_steps=nsteps) return rval[0][-1]
def decoder_layer(tparams, state_below, z, mask, prefix='decoder_lstm'): """ state_below: size of n_steps * n_samples * n_x z: size of n_samples * n_z """ nsteps = state_below.shape[0] n_h = tparams[_p(prefix, 'U')].shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 state_belowx0 = tensor.dot(z, tparams[_p(prefix, 'C0')]) + \ tparams[_p(prefix, 'b0')] h0 = tensor.tanh(state_belowx0) def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] state_below_ = tensor.dot(state_below, tparams[_p( prefix, 'W')]) + tparams[_p(prefix, 'b')] # tensor.dot(z, tparams[_p(prefix, 'C')]) def _step(m_, x_, h_, c_, U): preact = tensor.dot(h_, U) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, n_h)) f = tensor.nnet.sigmoid(_slice(preact, 1, n_h)) o = tensor.nnet.sigmoid(_slice(preact, 2, n_h)) c = tensor.tanh(_slice(preact, 3, n_h)) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c seqs = [mask[:nsteps - 1], state_below_[:nsteps - 1]] rval, updates = theano.scan( _step, sequences=seqs, outputs_info=[h0, tensor.alloc(numpy_floatX(0.), n_samples, n_h)], non_sequences=[tparams[_p(prefix, 'U')]], name=_p(prefix, '_layers'), n_steps=nsteps - 1, strict=True) h0x = h0.dimshuffle('x', 0, 1) h_rval = rval[0] return tensor.concatenate((h0x, h_rval))
def lstm_layer(tparams, x, mask, prefix): n_steps = x.shape[0] n_samples = x.shape[1] n_h = tparams[_p(prefix, 'U_i')].shape[0] x_i = tensor.dot(x, tparams[_p(prefix, 'W_i')]) + tparams[_p( prefix, 'b_i')] x_f = tensor.dot(x, tparams[_p(prefix, 'W_f')]) + tparams[_p( prefix, 'b_f')] x_o = tensor.dot(x, tparams[_p(prefix, 'W_o')]) + tparams[_p( prefix, 'b_o')] x_c = tensor.dot(x, tparams[_p(prefix, 'W_c')]) + tparams[_p( prefix, 'b_c')] def _step(m_, xt_i, xt_f, xt_o, xt_c, h_, c_, U_i, U_f, U_o, U_c): i = tensor.nnet.sigmoid(tensor.dot(h_, U_i) + xt_i) f = tensor.nnet.sigmoid(tensor.dot(h_, U_f) + xt_f) o = tensor.nnet.sigmoid(tensor.dot(h_, U_o) + xt_o) c = tensor.tanh(tensor.dot(h_, U_c) + xt_c) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c seqs = [mask, x_i, x_f, x_o, x_c] non_seqs = [ tparams[_p(prefix, 'U_i')], tparams[_p(prefix, 'U_f')], tparams[_p(prefix, 'U_o')], tparams[_p(prefix, 'U_c')] ] rval, updates = theano.scan(_step, sequences=seqs, outputs_info=[ tensor.alloc(numpy_floatX(0.), n_samples, n_h), tensor.alloc(numpy_floatX(0.), n_samples, n_h) ], non_sequences=non_seqs, name=_p(prefix, '_layers'), n_steps=n_steps, strict=True) # hseq, cseq return rval
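# Note: this lstm_layer implements the same recurrence as the encoder above, but
# with separate per-gate weights W_i/W_f/W_o/W_c and U_i/U_f/U_o/U_c instead of one
# concatenated W and U, and it returns both outputs of the scan: rval[0] is the
# hidden-state sequence h and rval[1] the cell-state sequence c, each of size
# n_steps * n_samples * n_h.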
def perform(self, x): nsteps = x.shape[0] # if x.ndim == 3: # n_samples = x.shape[1] # else: # n_samples = 1 # n_samples = x.shape[1] def _slice(x_t, idx, ndim): if x_t.ndim == 3: return x_t[:, :, idx * ndim: (idx + 1) * ndim] return x_t[:, idx * ndim:(idx + 1) * ndim] def _step(x_t, h_tm1, c_tm1): # z = sigmoid( W * x(t) + U * h(t-1) + b) # zi = W * x(t) + U * h(t-1) + b zi = T.dot(x_t, self.Wh) + T.dot(h_tm1, self.Uh) + self.bh # zi = T.dot(h_tm1, self.Uh) # zi += x_t # W = [Wi, Wf, Wo, Wc], U = [Ui, Uf, Uo, Uc], b = [bi, bf, bo, bc] i = T.nnet.sigmoid(_slice(zi, 0, self.n_output)) f = T.nnet.sigmoid(_slice(zi, 1, self.n_output)) o = T.nnet.sigmoid(_slice(zi, 2, self.n_output)) c = T.tanh(_slice(zi, 3, self.n_output)) c = f * c_tm1 + i * c; h = o * T.tanh(c) # output at each time # s = softmax(w * h_t + b) return [h, c] # h0 and c0 are initialized randomly h0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output); c0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output) h0 = theano.tensor.unbroadcast(h0, 1); c0 = theano.tensor.unbroadcast(c0, 1) [h, c], _ = theano.scan(fn=_step, sequences=x, outputs_info=[h0, c0], n_steps=nsteps) self.input = x self.output = h
def Santa(tparams, cost, inps, lr, eidx, nframes, max_epoch, rho=0.95, anne_rate=0.5, e=1e-8, clip_norm=5): """ The implementation of Santa algorithm. tparams: theano shared variables, params that we need to optimize cost: cost function, the cross-entropy loss in our case inps: input theano variables lr: learning rate, in our case, we choose it to be 1.*1e-3, or 2.*1e-4 eidx: the current epochs we are running, used to decide when to change from exploration to refinement nframes: how many time-steps we have in the training dataset. max_epoch: the maximum of epochs we run rho, anne_rate, e, clip_norm: hyper-parameters we used in all the algorithms. """ trng = RandomStreams(123) grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g*clip_norm/norm for g in grads] gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.)) i_t = i + 1. for p, g in zip(tparams.values(), gshared): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) alpha = theano.shared(np.ones(p.get_value().shape)*.5) alpha_t = alpha + (m**2 - lr/(i_t ** anne_rate)) * tensor.lt(eidx, 0.15*max_epoch) v_t = rho * v + (1.-rho) * (g ** 2) pcder = tensor.sqrt(tensor.sqrt(v_t)+e) eps = trng.normal(p.get_value().shape, avg = 0.0, std = 1.0, dtype=theano.config.floatX) m_t = -lr*g/pcder + (1. - alpha_t) * m + (tensor.sqrt(2*lr*v_t/(i_t ** anne_rate)/nframes) *eps) * tensor.lt(eidx, 0.15*max_epoch) p_t = p + (m_t/ pcder) updates.append((alpha, alpha_t)) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) f_update = theano.function([lr,eidx,nframes,max_epoch], [], updates=updates) return f_grad_shared, f_update
def encoder(tparams, state_below, mask, seq_output=False, prefix='gru_encoder'): """ state_below: size of n_steps * n_samples * n_x """ n_steps = state_below.shape[0] n_samples = state_below.shape[1] n_h = tparams[_p(prefix,'Ux')].shape[1] def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n*dim:(n+1)*dim] return _x[:, n*dim:(n+1)*dim] state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ tparams[_p(prefix, 'b')] state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ tparams[_p(prefix, 'bx')] def _step(m_, x_, xx_, h_, U, Ux): preact = tensor.dot(h_, U) preact += x_ r = tensor.nnet.sigmoid(_slice(preact, 0, n_h)) u = tensor.nnet.sigmoid(_slice(preact, 1, n_h)) preactx = tensor.dot(h_, Ux) preactx = preactx * r preactx = preactx + xx_ h = tensor.tanh(preactx) h = u * h_ + (1. - u) * h h = m_[:,None] * h + (1. - m_)[:,None] * h_ return h seqs = [mask, state_below_, state_belowx] rval, updates = theano.scan(_step, sequences=seqs, outputs_info = [tensor.alloc(numpy_floatX(0.), n_samples, n_h)], non_sequences = [tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]], name=_p(prefix, '_layers'), n_steps=n_steps, strict=True) if seq_output: return rval else: # size of n_samples * n_h return rval[-1]
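# Note: the GRU step above computes, with W/U holding the reset and update gates
# concatenated (2*n_h wide) and Wx/Ux the candidate transformation,
#
#     r = sigmoid((x_t.W + b + h_{t-1}.U)[:, 0:n_h])
#     u = sigmoid((x_t.W + b + h_{t-1}.U)[:, n_h:2*n_h])
#     h_tilde = tanh(x_t.Wx + bx + r * (h_{t-1}.Ux))
#     h_t = u * h_{t-1} + (1 - u) * h_tilde
#
# with the mask again holding h_t at h_{t-1} for padded time steps.  The reset gate
# multiplies (h_{t-1}.Ux), i.e. it is applied after the recurrent matrix product.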
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8, clip_norm=5):
    """ default: lr=0.0002
    This is the implementation of the Adam algorithm
    Reference: http://arxiv.org/pdf/1412.6980v8.pdf
    """
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g * clip_norm / (norm + e) for g in grads]

    zero = numpy.float32(0)
    gshared = [theano.shared(p.get_value() * zero, name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    _s = tensor.scalar('s', dtype='float32')

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * zero)
        v = theano.shared(p.get_value() * zero)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        # when the switch s is 0, the gp_beta / gp_alpha / r parameters are frozen
        if tensor.eq(_s, 0.) and (p.name == 'gp_beta' or p.name == 'gp_alpha'
                                  or p.name == 'r'):
            p_t = p - (_s * lr_t * g_t)
        # elif tensor.eq(_s, 1.) and (p.name is not 'gp_beta' and
        #                             p.name is not 'gp_alpha' and p.name is not 'r'):
        #     p_t = p - ((1 - _s) * lr_t * g_t)
        if p.name == 'e_beta' or p.name == 'd_beta':
            p_t = p_t * (p_t > 0)
        elif p.name == 'gp_beta' or p.name == 'gp_alpha':
            m_t = m_t.astype('float32')
            v_t = v_t.astype('float32')
            p_t = p_t.astype('float32')
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr, _s], [], updates=updates)

    return f_grad_shared, f_update
def fully_layer(params, input, results, nCategories=101, nout=512, weights_path=None): trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) ninput = tensor.prod(input.shape[1:]) denselayer1 = tensor.dot(input, params['fc1_w']) + params['fc1_b'] denselayer1 = relu(denselayer1) denselayer2 = tensor.dot(denselayer1, params['fc2_w']) + params['fc2_b'] denselayer2 = relu(denselayer2) results['fc1'] = denselayer1 results['fc2'] = denselayer2 return params, results
def build_model(tparams, options): trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # input sentences, size of n_steps * n_samples x = tensor.matrix('x', dtype='int64') # the corresponding masks padding zeros mask = tensor.matrix('mask', dtype=config.floatX) # size of n_z * n_samples z = tensor.matrix('z', dtype=config.floatX) z = dropout(z, trng, use_noise) n_steps = x.shape[0] # the sentence length in this mini-batch n_samples = x.shape[1] # the number of sentences in this mini-batch n_x = tparams['Wemb'].shape[1] # the dimension of the word embedding emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x]) emb = dropout(emb, trng, use_noise) # decoding the sentence vector z back into the original sentence h_decoder = decoder_layer(tparams, emb, z, mask=mask) h_decoder = dropout(h_decoder, trng, use_noise) shape = h_decoder.shape h_decoder = h_decoder.reshape((shape[0] * shape[1], shape[2])) Vhid = tensor.dot(tparams['Vhid'], tparams['Wemb'].T) pred_x = tensor.dot(h_decoder, Vhid) + tparams['bhid'] pred = tensor.nnet.softmax(pred_x) x_vec = x.reshape((shape[0] * shape[1], )) index = tensor.arange(shape[0] * shape[1]) pred_word = pred[index, x_vec] mask_word = mask.reshape((shape[0] * shape[1], )) index_list = theano.tensor.eq(mask_word, 1.).nonzero()[0] pred_word = pred_word[index_list] # the cross-entropy loss cost = -tensor.log(pred_word + 1e-6).sum() / n_samples f_pred_prob = theano.function([x, mask, z], pred_word, name='f_pred_prob') return use_noise, x, mask, z, f_pred_prob, cost
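# Note: the reconstruction cost above is a masked, word-level cross-entropy.  The
# decoder states are projected onto the vocabulary through
# Vhid = tparams['Vhid'] . Wemb^T, so the output weights are derived from the word
# embedding matrix; a softmax is taken at every position, the probability of the
# word actually observed at x[t, i] is gathered, positions with mask == 0 (padding)
# are discarded, and
#
#     cost = - sum over the kept positions of log(p_word + 1e-6) / n_samples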
def decoder_layer(tparams, state_below, prefix='decoder_gru'): """ state_below: size of n_steps * n_samples * n_x """ nsteps = state_below.shape[0] n_h = tparams[_p(prefix, 'Ux')].shape[1] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] state_below_ = tensor.dot(state_below, tparams[_p( prefix, 'W')]) + tparams[_p(prefix, 'b')] state_belowx = tensor.dot(state_below, tparams[_p( prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] def _step_slice(x_, xx_, h_, U, Ux): preact = tensor.dot(h_, U) preact += x_ r = tensor.nnet.sigmoid(_slice(preact, 0, n_h)) u = tensor.nnet.sigmoid(_slice(preact, 1, n_h)) preactx = tensor.dot(h_, Ux) preactx = preactx * r preactx = preactx + xx_ h = tensor.tanh(preactx) h = u * h_ + (1. - u) * h return h seqs = [state_below_, state_belowx] _step = _step_slice rval, updates = theano.scan( _step, sequences=seqs, outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, n_h)], non_sequences=[tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]], name=_p(prefix, '_layers'), n_steps=nsteps, strict=True) return rval
def pSGLD_test(tparams, cost, inps, lr, rho=0.99, epsilon=1e-6, eta=0.01, anne_rate=0.55, clip_norm=5): """ default: lr=0.001 """ trng = RandomStreams(123) grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g * clip_norm / norm for g in grads] gshared = [ theano.shared(p.get_value() * 0., name='%s_grad' % k) for k, p in tparams.iteritems() ] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.)) i_t = i + 1. for p, g in zip(tparams.values(), gshared): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - rho) * g**2 updates.append((acc, acc_new)) G = tensor.sqrt(acc_new + epsilon) eps = trng.normal(p.get_value().shape, avg=0.0, std=1.0, dtype=theano.config.floatX) updated_p = p - lr * g / G + tensor.sqrt( lr / G) * eta / (1 + i_t)**anne_rate * eps updates.append((p, updated_p)) updates.append((i, i_t)) f_update = theano.function([lr], [], updates=updates) return f_grad_shared, f_update
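# Note: pSGLD_test is preconditioned stochastic gradient Langevin dynamics with an
# RMSProp-style preconditioner.  Per parameter, at step t,
#
#     V <- rho * V + (1 - rho) * g^2
#     G  = sqrt(V + epsilon)
#     p <- p - lr * g / G + sqrt(lr / G) * eta / (1 + t)^anne_rate * N(0, 1)
#
# i.e. a preconditioned gradient step plus injected Gaussian noise whose scale is
# annealed over the iterations by the (1 + t)^anne_rate factor.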
def pred_error(f_pred, prepare_data, data, iterator, verbose=False): """ compute the prediction error. """ valid_err = 0 for _, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], np.array(data[1])[valid_index], maxlen=None) preds = f_pred(x, mask) targets = np.array(data[1])[valid_index] valid_err += (preds == targets).sum() valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) return valid_err
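# Note: pred_error returns the misclassification rate
#
#     err = 1 - (number of correct predictions) / (number of examples)
#
# over the whole split; the verbose pred_error variants here return the same
# quantity multiplied by 100, i.e. a percentage.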
def pred_error(f_pred, prepare_data, data, iterator, fname='', verbose=False): """ Just compute the error f_pred: Theano fct computing the prediction prepare_data: usual prepare_data for that dataset. """ valid_err = 0 preds_all = [] targets_all = [] if verbose: true_labels = [] f_label = open("paper2_labels_without_175_repeat_with_3200.txt", "r") for line_label in f_label: true_labels.append(int(line_label.strip())) f_label.close() f_out = open(fname + 'hdf_out_dscnn.txt', 'w') cat_ind = ['1', '2'] cnt = 0 for b, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], numpy.array(data[1])[valid_index]) preds = f_pred(x, mask) for pred_item in preds: preds_all.append(pred_item) targets = numpy.array(data[1])[valid_index] for target_item in targets: targets_all.append(target_item) if verbose: for i in range(len(preds)): p = preds[i] if p != targets[i]: f_out.write('*') else: f_out.write(' ') f_out.write(str(preds[i]) + ' ') f_out.write(str(targets[i]) + ' ') f_out.write(cat_ind[p] + ' ') f_out.write(str(true_labels[cnt]) + '\n') cnt += 1 equals = 0 for i in range(len(preds)): if preds[i] == targets[i]: equals += 1 valid_err += equals valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) print 'len(preds_all):', len(preds_all) print 'len(targets_all):', len(targets_all) return valid_err * 100, preds_all, targets_all
def rmsprop(self, lr, tparams, grads, inp_list, cost, params): clip = params['grad_clip'] decay_rate = tensor.constant(params['decay_rate'], dtype=theano.config.floatX) smooth_eps = tensor.constant(params['smooth_eps'], dtype=theano.config.floatX) zipped_grads = [ theano.shared(np.zeros_like(p.get_value()), name='%s_grad' % k) for k, p in tparams.iteritems() ] running_grads2 = [ theano.shared(np.zeros_like(p.get_value()), name='%s_rgrad2' % k) for k, p in tparams.iteritems() ] zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] if clip > 0.0: rg2up = [(rg2, tensor.clip( decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip)**2), 0.0, np.inf)) for rg2, g in zip(running_grads2, grads)] else: rg2up = [(rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g**2), 0.0, np.inf)) for rg2, g in zip(running_grads2, grads)] f_grad_shared = theano.function(inp_list, cost, updates=zgup + rg2up, name='rmsprop_f_grad_shared') updir = [ theano.shared(p.get_value() * numpy_floatX(0.), name='%s_updir' % k) for k, p in tparams.iteritems() ] updir_new = [ (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps)) for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2) ] param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)] f_update = theano.function([lr], [], updates=updir_new + param_up, on_unused_input='ignore', name='rmsprop_f_update') return f_grad_shared, f_update, zipped_grads, running_grads2, updir
def build_model(self, tparams): # sents -> word_indices * #batch_size sents = tensor.matrix('sents', dtype="int64") # mask -> n_word_indices * #batch_size mask = tensor.matrix('mask', dtype=config.floatX) # imgs -> #4098 * #batch_size imgs = tensor.matrix('imgs', dtype=config.floatX) # gt_sents -> word_indices * #batch_size gt_sents = tensor.matrix('gt_sents', dtype="int64") # Used for dropout. use_noise = theano.shared(numpy_floatX(1.)) with open("testTagData.pkl", "rb") as f: sents_tag, mask_tag, imgs_tag, gt_sents_tag = pickle.load(f) sents.tag.test_value = sents_tag mask.tag.test_value = mask_tag imgs.tag.test_value = imgs_tag gt_sents.tag.test_value = gt_sents_tag n_timesteps = sents.shape[0] n_samples = sents.shape[1] # Image encoding # Xe -> #batch_size * #image_encoding_size x_e = (tensor.dot(imgs.T, tparams['We']) + tparams['be']) # sentences (i.e. captions) encoding # Xs -> #no_of_words * #batch_size * #word_encoding_size x_s = tparams['Ws'][sents.flatten()].reshape([n_timesteps, n_samples, self.word_img_embed_hidden_dim]) # Xes has the image vector as the first timestep # Xes -> #no_timesteps (no_of_words + 1 (for image)) * #batch_size * #word_image_encoding_size x_es = tensor.zeros([n_timesteps + 1, n_samples, self.word_img_embed_hidden_dim], dtype=config.floatX) x_es = tensor.set_subtensor(x_es[1:], x_s) x_es = tensor.set_subtensor(x_es[0], x_e) mask_es = tensor.ones([mask.shape[0] + 1, mask.shape[1]], dtype=config.floatX) mask_es = tensor.set_subtensor(mask_es[1:], mask) # pred_softmax -> #batch_size * #no_of_words * #vocab_size pred_softmax = self._lstm_build_model(tparams, x_es, mask_es, use_noise) cost = negative_log_likelihood(pred_softmax, gt_sents) # pred_prob = lstm_output.max(axis=2) # pred = lstm_output.argmax(axis=2) return sents, mask, imgs, gt_sents, use_noise, cost
def build_model(tparams, options): trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # input sentence: n_steps * n_samples x = tensor.matrix('x', dtype='int32') # label: (n_samples,) y = tensor.vector('y', dtype='int32') layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape( (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1])) layer0_input = dropout(layer0_input, trng, use_noise) layer1_inputs = [] for i in xrange(len(options['filter_hs'])): filter_shape = options['filter_shapes'][i] pool_size = options['pool_sizes'][i] conv_layer = encoder(tparams, layer0_input, filter_shape=filter_shape, pool_size=pool_size, prefix=_p('cnn_encoder', i)) layer1_input = conv_layer layer1_inputs.append(layer1_input) layer1_input = tensor.concatenate(layer1_inputs, 1) layer1_input = dropout(layer1_input, trng, use_noise) # this is the label prediction you made pred = tensor.nnet.softmax( tensor.dot(layer1_input, tparams['Wy']) + tparams['by']) f_pred_prob = theano.function([x], pred, name='f_pred_prob') f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred') # get the expression of how we calculate the cost function # i.e. corss-entropy loss index = tensor.arange(x.shape[0]) cost = -tensor.log(pred[index, y] + 1e-6).mean() return use_noise, x, y, f_pred_prob, f_pred, cost
def build_model(tparams, options): trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # n_samples * n_chars x = tensor.matrix('x', dtype='int32') y = tensor.matrix('y', dtype='int32') # (ncons*n_samples) * n_chars cy = tensor.matrix('cy', dtype='int32') # n_samples * n_h tmp_x = tensor.tanh(tensor.dot(x, tparams['W1']) + tparams['b1']) tmp_y = tensor.tanh(tensor.dot(y, tparams['W1']) + tparams['b1']) # (ncons*n_samples) * n_h tmp_cy = tensor.tanh(tensor.dot(cy, tparams['W1']) + tparams['b1']) # n_samples * n_h feats_x = tensor.tanh(tensor.dot(tmp_x, tparams['W2']) + tparams['b2']) feats_y = tensor.tanh(tensor.dot(tmp_y, tparams['W2']) + tparams['b2']) # (ncons*n_samples) * n_h feats_cy = tensor.tanh(tensor.dot(tmp_cy, tparams['W2']) + tparams['b2']) feats_x = dropout(feats_x, trng, use_noise) feats_y = dropout(feats_y, trng, use_noise) feats_cy = dropout(feats_cy, trng, use_noise) feats_x = l2norm(feats_x) feats_y = l2norm(feats_y) feats_cy = l2norm(feats_cy) # Tile by number of contrast terms # (ncon*n_samples) * n_h feats_x = tensor.tile(feats_x, (options['ncon'], 1)) feats_y = tensor.tile(feats_y, (options['ncon'], 1)) cost = tensor.log(1 + tensor.sum( tensor.exp(-options['gamma'] * ((feats_x * feats_y).sum(axis=1) - (feats_x * feats_cy).sum(axis=1))))) return use_noise, [x, y, cy], cost
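# Note: the objective above is a smooth ranking loss.  With s(a, b) the inner
# product of the L2-normalised features, each matched pair (x_i, y_i) is tiled
# against the ncon mismatched candidates cy and
#
#     cost = log(1 + sum_{i,j} exp(-gamma * (s(x_i, y_i) - s(x_i, cy_ij))))
#
# which pushes every matched pair to score higher than its contrastive pairs by a
# margin controlled by gamma.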
def build_model(tparams, options): """first model - blind single answer qa without masking the response at ? :tparams: TODO :options: TODO :returns: TODO """ trng = RandomStreams(SEED) # Used for dropout use_noise = theano.shared(utils.numpy_floatX(0.)) x = tensor.matrix('x', dtype='int64') mask = tensor.matrix('mask', dtype=config.floatX) y = tensor.vector('y', dtype='int64') n_timesteps = x.shape[0] n_samples = x.shape[1] emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_proj']]) proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix=options['encoder'], mask=mask) if options['use_dropout']: proj = utils.dropout_layer(proj, use_noise, trng) pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') off = 1e-8 if pred.dtype == 'float16': off = 1e-6 cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean() return use_noise, x, mask, y, f_pred_prob, f_pred, cost
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentence: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    mask = tensor.matrix('mask', dtype=config.floatX)
    # label: (n_samples,)
    y = tensor.vector('y', dtype='int32')

    n_steps = x.shape[0]    # the length of the longest sentence in this minibatch
    n_samples = x.shape[1]  # how many samples we have in this minibatch
    n_x = tparams['Wemb'].shape[1]  # the dimension of the word embedding

    emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x])
    emb = dropout(emb, trng, use_noise)

    # encoding of the sentence, size of n_samples * n_h
    h_encoder = encoder(tparams, emb, mask=mask, prefix='lstm_encoder')
    h_encoder_rev = encoder(tparams, emb[::-1], mask=mask[::-1],
                            prefix='lstm_encoder_rev')

    # size of n_samples * (2*n_h)
    z = tensor.concatenate((h_encoder, h_encoder_rev), axis=1)
    z = dropout(z, trng, use_noise)

    # the label prediction, size of n_samples * n_y
    pred = tensor.nnet.softmax(tensor.dot(z, tparams['Wy']) + tparams['by'])

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    # the cost function, i.e. the cross-entropy loss
    index = tensor.arange(n_samples)
    cost = -tensor.log(pred[index, y] + 1e-6).mean()

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8): """ default: lr=0.0002 This is the implementation of the Adam algorithm Reference: http://arxiv.org/pdf/1412.6980v8.pdf """ grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, 5): grads = [g * 5 / norm for g in grads] gshared = [ theano.shared(p.get_value() * 0., name='%s_grad' % k) for k, p in tparams.iteritems() ] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.)) i_t = i + 1. fix1 = 1. - b1**(i_t) fix2 = 1. - b2**(i_t) lr_t = lr * (tensor.sqrt(fix2) / fix1) for p, g in zip(tparams.values(), gshared): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) m_t = (b1 * g) + ((1. - b1) * m) v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) g_t = m_t / (tensor.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) f_update = theano.function([lr], [], updates=updates) return f_grad_shared, f_update
def build_model(tparams, options): trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # x: n_steps * n_samples x = tensor.matrix('x', dtype='int64') y = tensor.matrix('y', dtype='int64') n_steps = x.shape[0] n_samples = x.shape[1] n_x = tparams['Wemb'].shape[1] emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x]) emb = dropout(emb, trng, use_noise) h_decoder = decoder_layer(tparams, emb, prefix='decoder_h1') h_decoder = dropout(h_decoder, trng, use_noise) h_decoder = decoder_layer(tparams, h_decoder, prefix='decoder_h2') h_decoder = dropout(h_decoder, trng, use_noise) # n_steps * n_samples * n_h shape = h_decoder.shape h_decoder = h_decoder.reshape((shape[0] * shape[1], shape[2])) pred = tensor.dot(h_decoder, tparams['Vhid']) + tparams['bhid'] pred = tensor.nnet.softmax(pred) y_vec = y.reshape((shape[0] * shape[1], )) index = tensor.arange(shape[0] * shape[1]) y_pred = pred[index, y_vec] f_pred_prob = theano.function([x, y], y_pred, name='f_pred_prob') cost = -tensor.log(y_pred + 1e-6).sum() / n_steps / n_samples return use_noise, x, y, f_pred_prob, cost
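# Note: this build_model stacks two decoder_layer blocks into a two-layer recurrent
# language model.  The cost is the average per-token cross-entropy
#
#     cost = - sum_{t,i} log(p(y[t, i]) + 1e-6) / (n_steps * n_samples)
#
# computed over every position; no mask is used here, so all positions (including
# any padding) contribute to the loss.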
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8, clip_norm=5): """ default: lr=0.0002 This is the implementation of the Adam algorithm Reference: http://arxiv.org/pdf/1412.6980v8.pdf """ grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g*clip_norm/norm for g in grads] gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.)) i_t = i + 1. fix1 = 1. - b1**(i_t) fix2 = 1. - b2**(i_t) lr_t = lr * (tensor.sqrt(fix2) / fix1) for p, g in zip(tparams.values(), gshared): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) m_t = (b1 * g) + ((1. - b1) * m) v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) g_t = m_t / (tensor.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) f_update = theano.function([lr], [], updates=updates) return f_grad_shared, f_update
def build_model(tparams,options): trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) # input sentence: n_steps * n_samples x = tensor.matrix('x', dtype='int32') # label: (n_samples,) y = tensor.vector('y',dtype='int32') layer0_input = tparams['Wemb'][tensor.cast(x.flatten(),dtype='int32')].reshape((x.shape[0],1,x.shape[1],tparams['Wemb'].shape[1])) layer0_input = dropout(layer0_input, trng, use_noise) layer1_inputs = [] for i in xrange(len(options['filter_hs'])): filter_shape = options['filter_shapes'][i] pool_size = options['pool_sizes'][i] conv_layer = encoder(tparams, layer0_input,filter_shape=filter_shape, pool_size=pool_size,prefix=_p('cnn_encoder',i)) layer1_input = conv_layer layer1_inputs.append(layer1_input) layer1_input = tensor.concatenate(layer1_inputs,1) layer1_input = dropout(layer1_input, trng, use_noise) # this is the label prediction you made pred = tensor.nnet.softmax(tensor.dot(layer1_input, tparams['Wy']) + tparams['by']) f_pred_prob = theano.function([x], pred, name='f_pred_prob') f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred') # get the expression of how we calculate the cost function # i.e. corss-entropy loss index = tensor.arange(x.shape[0]) cost = -tensor.log(pred[index, y] + 1e-6).mean() return use_noise, x, y, f_pred_prob, f_pred, cost
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8, clip_norm=5): grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g * clip_norm / norm for g in grads] gshared = [ theano.shared(p.get_value() * 0., name='%s_grad' % k) for k, p in tparams.iteritems() ] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] i = theano.shared(numpy_floatX(0.)) i_t = i + 1. fix1 = 1. - b1**(i_t) fix2 = 1. - b2**(i_t) lr_t = lr * (tensor.sqrt(fix2) / fix1) for p, g in zip(tparams.values(), gshared): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) m_t = (b1 * g) + ((1. - b1) * m) v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) g_t = m_t / (tensor.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) f_update = theano.function([lr], [], updates=updates) return f_grad_shared, f_update
def SGMGHMC(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, clip_norm=1): """ Additional parameters """ mom_tparams = OrderedDict() xi_tparams = OrderedDict() for k, p0 in tparams.iteritems(): mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom' % k) xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi' % k) #a = theano.shared(numpy_floatX(1.)) m = theano.shared(numpy_floatX(1.)) c = theano.shared(numpy_floatX(5.)) sigma_p = theano.shared(numpy_floatX(1.)) sigma_xi = theano.shared(numpy_floatX(1.)) gamma_xi = theano.shared(numpy_floatX(0.001)) logger = logging.getLogger('eval_ptb_sgmgnht') logger.setLevel(logging.INFO) fh = logging.FileHandler('eval_ptb_sgmgnht.log') logger.info('a = 1, m {} c {} s_p{} s_xi{} g_xi{}'.format( m.get_value(), c.get_value(), sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value())) p = tensor.vector('p', dtype=theano.config.floatX) """ default: lr=0.001 """ trng = RandomStreams(123) grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g * clip_norm / norm for g in grads] gshared = [ theano.shared(p0.get_value() * 0., name='%s_grad' % k) for k, p0 in tparams.iteritems() ] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] for p, mom, xi, g in zip(tparams.values(), mom_tparams.values(), xi_tparams.values(), gshared): g_f = mom / m K_f = -g_f + 2 / c * (c * g_f + tensor.log(1 + tensor.exp(-c * g_f))) psi_f_1 = (1 - tensor.exp(-c * g_f)) / (1 + tensor.exp(-c * g_f)) f1_f_1 = 1 / m * psi_f_1 psi_grad_f_1 = 2 * c * tensor.exp( -c * g_f) / (1 + tensor.exp(-c * g_f))**2 f3_f_1 = 1 / m**2 * (psi_f_1**2 - psi_grad_f_1) psi_f = (tensor.exp(c * g_f) - 1) / (tensor.exp(c * g_f) + 1) f1_f = 1 / m * psi_f psi_grad_f = 2 * c * tensor.exp(c * g_f) / (tensor.exp(c * g_f) + 1)**2 f3_f = 1 / m**2 * (psi_f**2 - psi_grad_f) temp_f1 = tensor.switch(tensor.ge(g_f, 0), f1_f_1, f1_f) temp_f3 = tensor.switch(tensor.ge(g_f, 0), f3_f_1, f3_f) noise_p = trng.normal(p.get_value().shape, avg=0.0, std=1., dtype=theano.config.floatX) noise_xi = trng.normal(p.get_value().shape, avg=0.0, std=1., dtype=theano.config.floatX) # generata gamma(a,2): N(0,1)^2 = gamma(1/2,2) noise_temp = tensor.zeros(p.get_value().shape) for aa in xrange(2): this_noise = trng.normal(p.get_value().shape, avg=0.0, std=1., dtype=theano.config.floatX) noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2) randmg = (noise_temp * m / 2) * tensor.sgn( trng.normal(p.get_value().shape, avg=0.0, std=1., dtype=theano.config.floatX)) updated_p = p + temp_f1 * lr updated_mom = (mom - temp_f1 * xi * lr - g * lr * ntrain + tensor.sqrt(2 * sigma_p * lr) * noise_p) * ( 1 - tensor.eq(tensor.mod(iterations, 50), 0) ) + randmg * tensor.eq(tensor.mod(iterations, 50), 0) #updated_mom = mom - temp_f1* xi *lr - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p temp_xi = trng.normal(p.get_value().shape, avg=sigma_p, std=tensor.sqrt(sigma_xi / 2), dtype=theano.config.floatX) updated_xi = (xi + temp_f3 * sigma_xi * lr - (xi - sigma_p) * gamma_xi * lr + tensor.sqrt(2 * sigma_xi * gamma_xi * lr) * noise_xi) * ( 1 - tensor.eq(tensor.mod(iterations, 100), 50) ) + temp_xi * tensor.eq(tensor.mod(iterations, 100), 50) updates.append((p, updated_p)) updates.append((mom, updated_mom)) updates.append((xi, updated_xi)) f_update = theano.function([lr, ntrain, iterations], [p, mom, xi], updates=updates) #f_params = 
#            theano.function([], [a, m, c, mom.shape])
    return f_grad_shared, f_update