def __init__(self, args, params=None, attention=False, bidir=False,
             subset_grad=True, pyramid=False):
    self.rnn_dim = args.rnn_dim
    self.rlayers = args.rlayers
    self.attention = attention
    lr = T.scalar(dtype=floatX)
    pdrop = T.scalar(dtype=floatX)
    max_norm = T.scalar(dtype=floatX)

    # initialize input tensors
    src_sent = T.imatrix("src_sent")
    rev_src_sent = T.imatrix("rev_src_sent")
    src_mask = T.bmatrix("src_mask")
    tgt_sent = T.imatrix("tgt_sent")
    tgt_mask = T.bmatrix("tgt_mask")
    space_mask = T.bmatrix("space_mask")

    # build up model
    # https://groups.google.com/forum/#!topic/torch7/-NBrFw8Q6_s
    # NOTE can't use one-hot here because huge matrix multiply
    self.L_enc = theano.shared(
        uniform_init(args.src_vocab_size, args.rnn_dim, scale=0.1),
        "L_enc", borrow=True)
    self.L_dec = theano.shared(
        uniform_init(args.tgt_vocab_size, args.rnn_dim, scale=0.1),
        "L_dec", borrow=True)
    enc_input = src_sent if not args.reverse else rev_src_sent
    if bidir:
        print("Using bidirectional encoder")
        self.encoder = BiRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T,
                                    space_mask.T, self.L_enc, pdrop, args)
    elif pyramid:
        print("Using pyramid encoder")
        self.encoder = BiPyrRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T,
                                       self.L_enc, pdrop, args)
    else:
        self.encoder = RNNEncoder(enc_input.T, src_mask.T, space_mask.T,
                                  self.L_enc, pdrop, args)
    if attention:
        self.decoder = RNNDecoderAttention(self.encoder, tgt_sent.T,
                                           tgt_mask.T, self.L_dec, pdrop, args)
        hs = self.decoder.hs
    else:
        self.decoder = RNNDecoder(self.encoder.out, tgt_sent.T, tgt_mask.T,
                                  self.L_dec, pdrop, args)

    # cost, parameters, grads, updates
    self.cost = self.decoder.cost
    self.params = self.encoder.params + self.decoder.params + \
        [self.L_enc, self.L_dec]
    if subset_grad:
        # take gradients w.r.t. only the embedding rows indexed by the batch
        self.grad_params = self.encoder.params + self.decoder.params + \
            [self.encoder.subset, self.decoder.subset]
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(
            self.cost, self.grad_params, lr, max_norm=max_norm)
        # instead of updating L_enc and L_dec only want to update the
        # embeddings indexed, so use inc_subtensor/set_subtensor
        # http://deeplearning.net/software/theano/tutorial/faq_tutorial.html
        self.updates[-2] = (self.L_enc,
                            T.set_subtensor(self.updates[-2][0], self.updates[-2][1]))
        self.updates[-1] = (self.L_dec,
                            T.set_subtensor(self.updates[-1][0], self.updates[-1][1]))
    else:
        self.grad_params = self.params
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(
            self.cost, self.grad_params, lr, max_norm=max_norm)
    self.nparams = np.sum([np.prod(p.shape.eval()) for p in self.params])

    # functions
    self.train = theano.function(
        inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask,
                space_mask, pdrop, lr, max_norm],
        outputs=[self.cost, self.grad_norm, self.param_norm],
        updates=self.updates,
        on_unused_input="warn",
        allow_input_downcast=True)
    self.test = theano.function(
        inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask,
                space_mask, theano.In(pdrop, value=0.0)],
        outputs=self.cost,
        updates=None,
        on_unused_input="warn")
    outputs = self.encoder.out
    if attention:
        outputs = self.encoder.out + [hs]
    self.encode = theano.function(
        inputs=[src_sent, rev_src_sent, src_mask, space_mask,
                theano.In(pdrop, value=0.0)],
        outputs=outputs,
        on_unused_input="warn",
        updates=None)

    # function for decoding step by step
    i_t = T.ivector()
    x_t = self.L_dec[i_t, :]
    h_ps = list()  # previous
    for k in xrange(args.rlayers):
        h_ps.append(T.matrix())
    h_ts = list()
    dmask = T.ones_like(h_ps[0]).astype(floatX)
    if attention and args.rlayers == 1:
        # keep align so it can be returned below when attention is used
        h_t, align = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0], hs)
    else:
        h_t = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0])
    h_ts.append(h_t)
    # NOTE no more dropout nodes here
    for k in xrange(1, args.rlayers):
        if attention and args.rlayers == k + 1:
            h_t, align = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k], hs)
        else:
            h_t = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k])
        h_ts.append(h_t)
    E_t = T.dot(h_t, self.decoder.olayer.W) + self.decoder.olayer.b
    # numerically stable softmax over the target vocabulary
    E_t = T.exp(E_t - T.max(E_t, axis=1, keepdims=True))
    p_t = E_t / E_t.sum(axis=1, keepdims=True)
    inputs = [i_t] + h_ps
    outputs = [p_t] + h_ts
    if attention:
        inputs = inputs + [hs]
        outputs = outputs + [align]
    self.decode_step = theano.function(inputs=inputs, outputs=outputs,
                                       updates=None)
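
# A minimal greedy-decoding sketch driving decode_step above (a usage sketch,
# not part of the original class). Assumptions: no attention, floatX is
# float32, `bos_id`/`eos_id` are hypothetical start/end token ids, and `h0s`
# holds one (1, rnn_dim) state matrix per decoder layer (e.g. derived from
# the outputs of self.encode).
def greedy_decode(model, h0s, bos_id, eos_id, max_len=100):
    h_ps = h0s  # one (1, rnn_dim) matrix per decoder layer
    tok = bos_id
    out = []
    for _ in xrange(max_len):
        rets = model.decode_step(np.array([tok], dtype=np.int32), *h_ps)
        p_t, h_ps = rets[0], rets[1:]
        tok = int(np.argmax(p_t[0]))  # greedily pick the most likely token
        if tok == eos_id:
            break
        out.append(tok)
    return out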
def __init__(self, args):
    self.args = args
    x = T.imatrix('x')
    y = T.imatrix('y')
    mask = T.ones_like(x).astype(floatX)
    # FIXME TODO resume from last state of previous sequence instead of
    # resetting the first hidden state to 0s
    self.unit = args.unit
    if args.unit == 'gru':
        init_states = [T.matrix(dtype=floatX) for k in xrange(args.rlayers)]
    elif args.unit == 'lstm':
        # LSTM carries a (hidden, cell) pair per layer
        init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                       for k in xrange(args.rlayers)]
    else:
        assert False
    lr = T.scalar(dtype=floatX)
    pdrop = T.scalar(dtype=floatX)
    rlayers = list()
    inp = theano.tensor.extra_ops.to_one_hot(
        x.flatten(), args.vocab_size).astype(floatX).reshape(
            (x.shape[0], x.shape[1], args.vocab_size))
    seqmask = get_sequence_dropout_mask(
        (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
        stocdrop=args.stocdrop)
    # exclude last prediction
    inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask,
                                   args.vocab_size, init_states[0], args,
                                   suffix='0')
    rlayers.append(inplayer)
    # stack the remaining recurrent layers with dropout between them
    for k in xrange(1, args.rlayers):
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
            stocdrop=args.stocdrop)
        rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                     mask, seqmask, args.rnn_dim,
                                     init_states[k], args, suffix='%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out,
                                        args.rnn_dim, args.vocab_size)
    self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
    super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
    shapes = [p.shape.eval() for p in self.params]
    sizes = [np.prod(s) for s in shapes]
    self.nparams = np.sum(sizes)
    self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(
        self.cost, self.params, lr, max_norm=args.max_norm)

    # functions
    if args.unit == 'lstm':
        init_states = flatten(init_states)
        final_states = list()
        for r in rlayers:
            final_states.append(r.out[-1])
            final_states.append(r.cell[-1])
    else:
        final_states = [r.out[-1] for r in rlayers]
    self.train = theano.function(
        inputs=[x, y, pdrop, lr] + init_states,
        outputs=[self.cost, self.grad_norm, self.param_norm] + final_states,
        updates=self.updates,
        on_unused_input='warn')
    self.test = theano.function(
        # at test time should pass in pdrop=0
        inputs=[x, y, pdrop] + init_states,
        outputs=[self.cost] + final_states,
        updates=None,
        on_unused_input='warn')

    # function for sampling
    i_t = T.ivector()
    x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
    h_ps = list()  # previous
    for k in xrange(args.rlayers):
        if args.unit == 'gru':
            h_ps.append(T.vector())
            dmask = T.ones_like(h_ps[0]).astype(floatX)
        else:
            h_ps.append((T.vector(), T.vector()))
            dmask = T.ones_like(h_ps[0][0]).astype(floatX)
    h_ts = list()
    if args.unit == 'lstm':
        h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
    else:
        h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
    h_ts.append(h_t)
    for k in xrange(1, args.rlayers):
        if args.unit == 'lstm':
            h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
        else:
            h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
        h_ts.append(h_t)
    if args.unit == 'lstm':
        h_t = h_t[0]
    E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
    # numerically stable softmax over the vocabulary
    E_t = T.exp(E_t - T.max(E_t))
    p_t = E_t / E_t.sum()
    if args.unit == 'lstm':
        h_ps = flatten(h_ps)
        h_ts = flatten(h_ts)
    self.decode_step = theano.function(
        inputs=[i_t] + h_ps,
        outputs=[p_t] + h_ts,
        updates=None,
        on_unused_input='warn')
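
# A minimal sampling-loop sketch for the GRU case of decode_step above (a
# usage sketch, not part of the original class). Assumptions: floatX is
# float32, `seed_id` is a hypothetical start token id, and states start at
# zeros (matching the zero initial hidden state noted in the FIXME above).
# For LSTM the states would instead be the flattened (h, c) vectors.
def sample_tokens(model, args, seed_id, n_steps=100):
    h_ps = [np.zeros(args.rnn_dim, dtype=np.float32)
            for _ in xrange(args.rlayers)]
    tok = seed_id
    toks = []
    for _ in xrange(n_steps):
        rets = model.decode_step(np.array([tok], dtype=np.int32), *h_ps)
        p_t, h_ps = rets[0], rets[1:]
        p = p_t.astype(np.float64)
        p /= p.sum()  # renormalize in float64 so np.random.choice accepts it
        tok = int(np.random.choice(args.vocab_size, p=p))
        toks.append(tok)
    return toks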