def __init__(self, rep, y, mask, L_dec, pdrop, args):
    """Stacked GRU decoder conditioned on encoder representation `rep`.

    `rep` supplies one initial hidden state per decoder layer; targets `y`
    are embedded via the lookup table `L_dec`, and the network is trained
    to predict y[1:] from y[:-1] with sequence dropout rate `pdrop`.
    """
    self.h0s = rep
    init_states = self.h0s
    recurrent_layers = []
    # embed every target token, then restore (time, batch, emb) layout
    self.subset = L_dec[y.flatten()]
    embedded = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
    drop_mask = get_sequence_dropout_mask(
        (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
    # exclude last prediction
    bottom = GRULayer(embedded[:-1].astype(floatX), mask[:-1],
                      drop_mask[:-1], args.rnn_dim, init_states[0], args,
                      suffix='dec0')
    recurrent_layers.append(bottom)
    for k in xrange(1, args.rlayers):
        drop_mask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
        stacked = GRULayer(Dropout(recurrent_layers[-1].out, pdrop).out,
                           mask[:-1], drop_mask[:-1], args.rnn_dim,
                           init_states[k], args, suffix='dec%d' % k)
        recurrent_layers.append(stacked)
    output_layer = SequenceLogisticRegression(
        Dropout(recurrent_layers[-1].out, pdrop).out, args.rnn_dim,
        args.tgt_vocab_size)
    # shifted targets: score y[1:] given inputs y[:-1]
    xent = seq_cat_crossent(output_layer.out, y[1:], mask[1:],
                            normalize=False)
    super(RNNDecoder, self).__init__(recurrent_layers, output_layer, xent)
def __init__(self, rep, y, mask, L_dec, pdrop, args):
    # Stacked GRU decoder: `rep` holds the initial hidden state for each
    # decoder layer (one entry per layer); `y` are the target token ids.
    self.h0s = rep
    outputs_info = self.h0s
    rlayers = list()
    # look up embeddings for all target tokens, reshape back to
    # (time, batch, embedding_dim)
    self.subset = L_dec[y.flatten()]
    inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
    seqmask = get_sequence_dropout_mask(
        (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
    # exclude last prediction
    inplayer = GRULayer(inp[:-1].astype(floatX), mask[:-1], seqmask[:-1],
                        args.rnn_dim, outputs_info[0], args, suffix='dec0')
    rlayers.append(inplayer)
    # stack the remaining GRU layers, each fed dropout of the layer below
    for k in xrange(1, args.rlayers):
        seqmask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
        rlayer = GRULayer(Dropout(rlayers[-1].out, pdrop).out, mask[:-1],
                          seqmask[:-1], args.rnn_dim, outputs_info[k], args,
                          suffix='dec%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(
        Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
        args.tgt_vocab_size)
    # targets are shifted by one: predict y[1:] from y[:-1]
    cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
    super(RNNDecoder, self).__init__(rlayers, olayer, cost)
def __init__(self, encoder, y, mask, L_dec, pdrop, args): self.hs = encoder.hs # NOTE just use this so only last layer uses attention def layer_init(attention): if not attention: return GRULayer else: return lambda *largs, **kwargs: GRULayerAttention( self.hs, *largs, **kwargs) # initial states outputs_info = [ T.zeros_like(self.hs[0]) for k in xrange(len(encoder.routs)) ] rlayers = list() self.subset = L_dec[y.flatten()] inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1])) attention = args.rlayers == 1 # exclude last prediction seqmask = get_sequence_dropout_mask( (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop) inplayer = layer_init(attention)(inp[:-1].astype(floatX), mask[:-1], seqmask[:-1], args.rnn_dim, outputs_info[0], args, suffix='dec0') rlayers.append(inplayer) for k in xrange(1, args.rlayers): attention = (args.rlayers == k + 1) seqmask = get_sequence_dropout_mask( (y.shape[0], y.shape[1], args.rnn_dim), pdrop) rlayer = layer_init(attention)(Dropout(rlayers[-1].out, pdrop).out, mask[:-1], seqmask[:-1], args.rnn_dim, outputs_info[k], args, suffix='dec%d' % k) rlayers.append(rlayer) olayer = SequenceLogisticRegression( Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.tgt_vocab_size) cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False) super(RNNDecoderAttention, self).__init__(rlayers, olayer, cost)
def __init__(self, encoder, y, mask, L_dec, pdrop, args):
    """Stacked GRU decoder whose topmost layer attends over encoder states."""
    self.hs = encoder.hs

    # NOTE just use this so only last layer uses attention
    def pick_layer(use_attention):
        if use_attention:
            # close over self.hs so the attention layer sees encoder states
            return lambda *a, **kw: GRULayerAttention(self.hs, *a, **kw)
        return GRULayer

    # zero initial state for every encoder recurrent output
    init_states = [T.zeros_like(self.hs[0])
                   for _ in xrange(len(encoder.routs))]
    layers = []
    # embed target tokens, restore (time, batch, emb) layout
    self.subset = L_dec[y.flatten()]
    embedded = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
    # exclude last prediction
    drop_mask = get_sequence_dropout_mask(
        (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
    bottom = pick_layer(args.rlayers == 1)(
        embedded[:-1].astype(floatX), mask[:-1], drop_mask[:-1],
        args.rnn_dim, init_states[0], args, suffix='dec0')
    layers.append(bottom)
    for k in xrange(1, args.rlayers):
        drop_mask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
        # attention only when this is the final stacked layer
        upper = pick_layer(args.rlayers == k + 1)(
            Dropout(layers[-1].out, pdrop).out, mask[:-1], drop_mask[:-1],
            args.rnn_dim, init_states[k], args, suffix='dec%d' % k)
        layers.append(upper)
    output_layer = SequenceLogisticRegression(
        Dropout(layers[-1].out, pdrop).out, args.rnn_dim,
        args.tgt_vocab_size)
    # shifted targets: predict y[1:] from y[:-1]
    cost = seq_cat_crossent(output_layer.out, y[1:], mask[1:],
                            normalize=False)
    super(RNNDecoderAttention, self).__init__(layers, output_layer, cost)
def __init__(self, args):
    """RNN language model over one-hot inputs with GRU or LSTM units.

    Builds the stacked recurrent graph, the cost, the optimizer updates,
    and compiles `self.train`, `self.test`, and a single-step
    `self.decode_step` function for sampling.

    Raises:
        AssertionError: if ``args.unit`` is neither 'gru' nor 'lstm'.
    """
    self.args = args
    x = T.imatrix('x')
    y = T.imatrix('y')
    mask = T.ones_like(x).astype(floatX)
    # FIXME TODO resume from last state of previous sequence instead of
    # resetting the first hidden state to 0s
    self.unit = args.unit
    if args.unit == 'gru':
        init_states = [T.matrix(dtype=floatX)
                       for k in xrange(args.rlayers)]
    elif args.unit == 'lstm':
        # LSTM carries (hidden, cell) per layer
        init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                       for k in xrange(args.rlayers)]
    else:
        # explicit raise (not `assert`) so the check survives `python -O`
        raise AssertionError('unsupported unit: %s' % args.unit)
    lr = T.scalar(dtype=floatX)
    pdrop = T.scalar(dtype=floatX)
    rlayers = list()
    # one-hot encode token ids to (time, batch, vocab)
    inp = theano.tensor.extra_ops.to_one_hot(
        x.flatten(), args.vocab_size).astype(floatX).reshape(
            (x.shape[0], x.shape[1], args.vocab_size))
    seqmask = get_sequence_dropout_mask(
        (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
        stocdrop=args.stocdrop)
    # exclude last prediction
    inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask,
                                   args.vocab_size, init_states[0], args,
                                   suffix='0')
    rlayers.append(inplayer)
    for k in xrange(1, args.rlayers):
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
            stocdrop=args.stocdrop)
        rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                     mask, seqmask, args.rnn_dim,
                                     init_states[k], args,
                                     suffix='%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(
        Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
        args.vocab_size)
    self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
    super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
    # parameter count (for logging)
    shapes = [p.shape.eval() for p in self.params]
    sizes = [np.prod(s) for s in shapes]
    self.nparams = np.sum(sizes)
    self.updates, self.grad_norm, self.param_norm = get_opt_fn(
        args.optimizer)(self.cost, self.params, lr,
                        max_norm=args.max_norm)
    # functions
    if args.unit == 'lstm':
        # flatten (h, c) pairs so theano.function gets a flat input list
        init_states = flatten(init_states)
        final_states = list()
        for r in rlayers:
            final_states.append(r.out[-1])
            final_states.append(r.cell[-1])
    else:
        final_states = [r.out[-1] for r in rlayers]
    self.train = theano.function(
        inputs=[x, y, pdrop, lr] + init_states,
        outputs=[self.cost, self.grad_norm, self.param_norm] +
        final_states,
        updates=self.updates,
        on_unused_input='warn')
    self.test = theano.function(
        # at test time should pass in pdrop=0
        inputs=[x, y, pdrop] + init_states,
        outputs=[self.cost] + final_states,
        updates=None,
        on_unused_input='warn')
    # function for sampling: one token id in, next-token distribution out
    i_t = T.ivector()
    x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
    h_ps = list()  # previous hidden (and cell) states, one per layer
    for k in xrange(args.rlayers):
        if args.unit == 'gru':
            h_ps.append(T.vector())
            dmask = T.ones_like(h_ps[0]).astype(floatX)
        else:
            h_ps.append((T.vector(), T.vector()))
            dmask = T.ones_like(h_ps[0][0]).astype(floatX)
    h_ts = list()
    # dmask of ones: no dropout at sampling time
    if args.unit == 'lstm':
        h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
    else:
        h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
    h_ts.append(h_t)
    for k in xrange(1, args.rlayers):
        if args.unit == 'lstm':
            # feed the hidden part (h_t[0]) of the (h, c) pair upward
            h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
        else:
            h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
        h_ts.append(h_t)
    if args.unit == 'lstm':
        h_t = h_t[0]
    # softmax over the output layer, shifted by max for stability
    E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
    E_t = T.exp(E_t - T.max(E_t))
    p_t = E_t / E_t.sum()
    if args.unit == 'lstm':
        h_ps = flatten(h_ps)
        h_ts = flatten(h_ts)
    self.decode_step = theano.function(inputs=[i_t] + h_ps,
                                       outputs=[p_t] + h_ts,
                                       updates=None,
                                       on_unused_input='warn')
def __init__(self, args):
    # RNN language model over one-hot inputs with GRU or LSTM units.
    # Builds the recurrent graph, cost, optimizer updates, and compiles
    # train/test functions plus a single-step decode function for sampling.
    self.args = args
    x = T.imatrix('x')
    y = T.imatrix('y')
    mask = T.ones_like(x).astype(floatX)
    # FIXME TODO resume from last state of previous sequence instead of
    # resetting the first hidden state to 0s
    self.unit = args.unit
    if args.unit == 'gru':
        init_states = [T.matrix(dtype=floatX)
                       for k in xrange(args.rlayers)]
    elif args.unit == 'lstm':
        # LSTM carries a (hidden, cell) pair per layer
        init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                       for k in xrange(args.rlayers)]
    else:
        # NOTE(review): bare assert is stripped under `python -O`,
        # leaving init_states unbound — consider an explicit raise
        assert(False)
    lr = T.scalar(dtype=floatX)
    pdrop = T.scalar(dtype=floatX)
    rlayers = list()
    # one-hot encode token ids to (time, batch, vocab)
    inp = theano.tensor.extra_ops.to_one_hot(
        x.flatten(),
        args.vocab_size).astype(floatX).reshape(
            (x.shape[0], x.shape[1], args.vocab_size))
    seqmask = get_sequence_dropout_mask(
        (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
        stocdrop=args.stocdrop)
    # exclude last prediction
    inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask,
                                   args.vocab_size, init_states[0], args,
                                   suffix='0')
    rlayers.append(inplayer)
    # stack remaining layers, each fed dropout of the layer below
    for k in xrange(1, args.rlayers):
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
            stocdrop=args.stocdrop)
        rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                     mask, seqmask, args.rnn_dim,
                                     init_states[k], args,
                                     suffix='%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(
        Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
        args.vocab_size)
    self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
    super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
    # total parameter count (for logging)
    shapes = [p.shape.eval() for p in self.params]
    sizes = [np.prod(s) for s in shapes]
    self.nparams = np.sum(sizes)
    self.updates, self.grad_norm, self.param_norm = get_opt_fn(
        args.optimizer)(self.cost, self.params, lr,
                        max_norm=args.max_norm)
    # functions
    if args.unit == 'lstm':
        # flatten (h, c) pairs so theano.function gets a flat input list
        init_states = flatten(init_states)
        final_states = list()
        for r in rlayers:
            final_states.append(r.out[-1])
            final_states.append(r.cell[-1])
    else:
        final_states = [r.out[-1] for r in rlayers]
    self.train = theano.function(
        inputs=[x, y, pdrop, lr] + init_states,
        outputs=[self.cost, self.grad_norm, self.param_norm] +
        final_states,
        updates = self.updates,
        on_unused_input='warn'
    )
    self.test = theano.function(
        # at test time should pass in pdrop=0
        inputs=[x, y, pdrop] + init_states,
        outputs=[self.cost] + final_states,
        updates = None,
        on_unused_input='warn'
    )
    # function for sampling: one token id in, next-token distribution out
    i_t = T.ivector()
    x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
    h_ps = list()  # previous hidden (and cell) states, one per layer
    for k in xrange(args.rlayers):
        if args.unit == 'gru':
            h_ps.append(T.vector())
            dmask = T.ones_like(h_ps[0]).astype(floatX)
        else:
            h_ps.append((T.vector(), T.vector()))
            dmask = T.ones_like(h_ps[0][0]).astype(floatX)
    h_ts = list()
    # dmask of ones: no dropout at sampling time
    if args.unit == 'lstm':
        h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
    else:
        h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
    h_ts.append(h_t)
    for k in xrange(1, args.rlayers):
        if args.unit == 'lstm':
            # feed the hidden part (h_t[0]) of the (h, c) pair upward
            h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
        else:
            h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
        h_ts.append(h_t)
    if args.unit == 'lstm':
        h_t = h_t[0]
    # numerically-stable softmax over the output layer
    E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
    E_t = T.exp(E_t - T.max(E_t))
    p_t = E_t / E_t.sum()
    if args.unit == 'lstm':
        h_ps = flatten(h_ps)
        h_ts = flatten(h_ts)
    self.decode_step = theano.function(
        inputs=[i_t] + h_ps,
        outputs=[p_t] + h_ts,
        updates=None,
        on_unused_input='warn'
    )