Example #1
    def __init__(self, rep, y, mask, L_dec, pdrop, args):
        self.h0s = rep
        outputs_info = self.h0s
        rlayers = list()
        self.subset = L_dec[y.flatten()]
        inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
        seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
        # exclude last prediction
        inplayer = GRULayer(
            inp[:-1].astype(floatX), mask[:-1], seqmask[:-1], args.rnn_dim, outputs_info[0], args, suffix="dec0"
        )
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], args.rnn_dim), pdrop)
            rlayer = GRULayer(
                Dropout(rlayers[-1].out, pdrop).out,
                mask[:-1],
                seqmask[:-1],
                args.rnn_dim,
                outputs_info[k],
                args,
                suffix="dec%d" % k,
            )
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.tgt_vocab_size)
        cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
        super(RNNDecoder, self).__init__(rlayers, olayer, cost)
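
The helper functions used above (get_sequence_dropout_mask, Dropout, GRULayer, SequenceLogisticRegression, seq_cat_crossent) are defined elsewhere in the project and are not shown on this page. Purely as a minimal sketch, assuming get_sequence_dropout_mask returns an inverted-dropout Bernoulli keep-mask of the requested (time, batch, dim) shape, it might look as follows; the real helper (which in Examples #5 and #6 also takes a stocdrop flag) may use different scaling or RNG conventions.

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
srng = MRG_RandomStreams(seed=1234)

def get_sequence_dropout_mask(shape, pdrop):
    # Bernoulli(1 - pdrop) keep-mask, rescaled by the keep probability
    # (inverted dropout) so masked activations keep their expected value;
    # pdrop may be a symbolic scalar, as in the examples above.
    keep = 1.0 - pdrop
    return srng.binomial(shape, p=keep, dtype=floatX) / keep
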
Example #2
    def __init__(self, rep, y, mask, L_dec, pdrop, args):
        self.h0s = rep
        outputs_info = self.h0s
        rlayers = list()
        self.subset = L_dec[y.flatten()]
        inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
        seqmask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
        # exclude last prediction
        inplayer = GRULayer(inp[:-1].astype(floatX),
                            mask[:-1],
                            seqmask[:-1],
                            args.rnn_dim,
                            outputs_info[0],
                            args,
                            suffix='dec0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask(
                (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
            rlayer = GRULayer(Dropout(rlayers[-1].out, pdrop).out,
                              mask[:-1],
                              seqmask[:-1],
                              args.rnn_dim,
                              outputs_info[k],
                              args,
                              suffix='dec%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(
            Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
            args.tgt_vocab_size)
        cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
        super(RNNDecoder, self).__init__(rlayers, olayer, cost)
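
A hypothetical Dropout wrapper consistent with the Dropout(x, pdrop).out call pattern used throughout these examples is sketched below; it only illustrates the assumed interface, not the project's actual class.

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
srng = MRG_RandomStreams(seed=1234)

class Dropout(object):
    def __init__(self, inp, pdrop):
        # Inverted dropout on a symbolic input; because pdrop is itself a
        # symbolic scalar, passing pdrop=0 at test time makes .out == inp.
        keep = 1.0 - pdrop
        mask = srng.binomial(inp.shape, p=keep, dtype=floatX)
        self.out = inp * mask / keep
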
Example #3
    def __init__(self, encoder, y, mask, L_dec, pdrop, args):
        self.hs = encoder.hs

        # NOTE: this helper ensures that only the last layer uses attention
        def layer_init(attention):
            if not attention:
                return GRULayer
            else:
                return lambda *largs, **kwargs: GRULayerAttention(
                    self.hs, *largs, **kwargs)

        # initial states
        outputs_info = [
            T.zeros_like(self.hs[0]) for k in xrange(len(encoder.routs))
        ]
        rlayers = list()
        self.subset = L_dec[y.flatten()]
        inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
        attention = args.rlayers == 1
        # exclude last prediction
        seqmask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
        inplayer = layer_init(attention)(inp[:-1].astype(floatX),
                                         mask[:-1],
                                         seqmask[:-1],
                                         args.rnn_dim,
                                         outputs_info[0],
                                         args,
                                         suffix='dec0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            attention = (args.rlayers == k + 1)
            seqmask = get_sequence_dropout_mask(
                (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
            rlayer = layer_init(attention)(Dropout(rlayers[-1].out, pdrop).out,
                                           mask[:-1],
                                           seqmask[:-1],
                                           args.rnn_dim,
                                           outputs_info[k],
                                           args,
                                           suffix='dec%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(
            Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
            args.tgt_vocab_size)
        cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
        super(RNNDecoderAttention, self).__init__(rlayers, olayer, cost)
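
In Examples #3 and #4, attention is enabled only for the topmost layer: attention is true exactly when k + 1 == args.rlayers, so only that GRU attends over the encoder states self.hs. GRULayerAttention itself is not shown on this page; purely for illustration, a generic content-based attention step over encoder states might look like the sketch below (the names attend and W_att are assumptions, not taken from the project).

import theano.tensor as T

def attend(h_t, hs, W_att):
    # hs: (src_time, batch, dim) encoder states; h_t: (batch, dim) decoder
    # state; W_att: (dim, dim) bilinear attention weights.
    scores = T.sum(T.dot(hs, W_att) * h_t, axis=-1)   # (src_time, batch)
    alpha = T.nnet.softmax(scores.T).T                # normalize over src_time
    context = T.sum(hs * alpha[:, :, None], axis=0)   # (batch, dim) summary
    return context, alpha
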
Example #4
    def __init__(self, encoder, y, mask, L_dec, pdrop, args):
        self.hs = encoder.hs
        # NOTE: this helper ensures that only the last layer uses attention
        def layer_init(attention):
            if not attention:
                return GRULayer
            else:
                return lambda *largs, **kwargs: GRULayerAttention(self.hs, *largs, **kwargs)

        # initial states
        outputs_info = [T.zeros_like(self.hs[0]) for k in xrange(len(encoder.routs))]
        rlayers = list()
        self.subset = L_dec[y.flatten()]
        inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
        attention = args.rlayers == 1
        # exclude last prediction
        seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
        inplayer = layer_init(attention)(
            inp[:-1].astype(floatX), mask[:-1], seqmask[:-1], args.rnn_dim, outputs_info[0], args, suffix="dec0"
        )
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            attention = args.rlayers == k + 1
            seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], args.rnn_dim), pdrop)
            rlayer = layer_init(attention)(
                Dropout(rlayers[-1].out, pdrop).out,
                mask[:-1],
                seqmask[:-1],
                args.rnn_dim,
                outputs_info[k],
                args,
                suffix="dec%d" % k,
            )
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.tgt_vocab_size)
        cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
        super(RNNDecoderAttention, self).__init__(rlayers, olayer, cost)
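
Every example builds its cost with seq_cat_crossent. Assuming olayer.out holds per-step softmax probabilities of shape (time, batch, vocab) while y and mask have shape (time, batch), a minimal sketch of such a loss follows; the project's actual reduction (sum versus average) is not visible here and may differ.

import theano.tensor as T

def seq_cat_crossent(probs, y, mask, normalize=False):
    # probs: (time, batch, vocab) softmax outputs; y, mask: (time, batch).
    probs_flat = probs.reshape((-1, probs.shape[-1]))
    # negative log-probability of each gold token, with padding masked out
    nll = -T.log(probs_flat[T.arange(probs_flat.shape[0]), y.flatten()])
    nll = nll.reshape((y.shape[0], y.shape[1])) * mask
    if normalize:
        return nll.sum() / mask.sum()  # mean loss per unmasked token
    return nll.sum() / y.shape[1]      # summed loss, averaged over the batch
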
Example #5
    def __init__(self, args):
        self.args = args
        x = T.imatrix('x')
        y = T.imatrix('y')
        mask = T.ones_like(x).astype(floatX)
        # FIXME TODO resume from the last state of the previous sequence
        # instead of resetting the first hidden state to 0s
        self.unit = args.unit
        if args.unit == 'gru':
            init_states = [
                T.matrix(dtype=floatX) for k in xrange(args.rlayers)
            ]
        elif args.unit == 'lstm':
            init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                           for k in xrange(args.rlayers)]
        else:
            raise ValueError('unrecognized unit: %s' % args.unit)
        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)

        rlayers = list()
        inp = theano.tensor.extra_ops.to_one_hot(
            x.flatten(), args.vocab_size).astype(floatX).reshape(
                (x.shape[0], x.shape[1], args.vocab_size))
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim),
            pdrop,
            stocdrop=args.stocdrop)
        # x and y are separate inputs here, so the full sequence is fed in
        inplayer = UnitInit[args.unit](inp.astype(floatX),
                                       mask,
                                       seqmask,
                                       args.vocab_size,
                                       init_states[0],
                                       args,
                                       suffix='0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask(
                (inp.shape[0], inp.shape[1], args.rnn_dim),
                pdrop,
                stocdrop=args.stocdrop)
            rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                         mask,
                                         seqmask,
                                         args.rnn_dim,
                                         init_states[k],
                                         args,
                                         suffix='%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(
            Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.vocab_size)
        self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
        super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
        shapes = [p.shape.eval() for p in self.params]
        sizes = [np.prod(s) for s in shapes]
        self.nparams = np.sum(sizes)
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(
            args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

        # functions

        if args.unit == 'lstm':
            init_states = flatten(init_states)
            final_states = list()
            for r in rlayers:
                final_states.append(r.out[-1])
                final_states.append(r.cell[-1])
        else:
            final_states = [r.out[-1] for r in rlayers]

        self.train = theano.function(
            inputs=[x, y, pdrop, lr] + init_states,
            outputs=[self.cost, self.grad_norm, self.param_norm] +
            final_states,
            updates=self.updates,
            on_unused_input='warn')

        self.test = theano.function(
            # at test time should pass in pdrop=0
            inputs=[x, y, pdrop] + init_states,
            outputs=[self.cost] + final_states,
            updates=None,
            on_unused_input='warn')

        # function for sampling

        i_t = T.ivector()
        x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
        h_ps = list()  # previous
        for k in xrange(args.rlayers):
            if args.unit == 'gru':
                h_ps.append(T.vector())
                dmask = T.ones_like(h_ps[0]).astype(floatX)
            else:
                h_ps.append((T.vector(), T.vector()))
                dmask = T.ones_like(h_ps[0][0]).astype(floatX)
        h_ts = list()
        if args.unit == 'lstm':
            h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
        else:
            h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        for k in xrange(1, args.rlayers):
            if args.unit == 'lstm':
                h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
            else:
                h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        if args.unit == 'lstm':
            h_t = h_t[0]
        E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
        E_t = T.exp(E_t - T.max(E_t))
        p_t = E_t / E_t.sum()
        if args.unit == 'lstm':
            h_ps = flatten(h_ps)
            h_ts = flatten(h_ts)
        self.decode_step = theano.function(inputs=[i_t] + h_ps,
                                           outputs=[p_t] + h_ts,
                                           updates=None,
                                           on_unused_input='warn')
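
decode_step maps one token index plus the per-layer previous hidden states to the next-token distribution p_t plus the new states. A hypothetical sampling loop around it for the GRU variant might look like this; sample_gru is an invented name, and floatX is assumed to be float32.

import numpy as np

def sample_gru(model, args, start_idx, length, rng=np.random):
    # Roll the language model forward one token at a time, feeding the
    # returned hidden states back in and sampling from p_t at each step.
    states = [np.zeros(args.rnn_dim, dtype='float32')
              for _ in range(args.rlayers)]
    idx, tokens = start_idx, []
    for _ in range(length):
        ret = model.decode_step(np.array([idx], dtype='int32'), *states)
        p_t, states = ret[0], list(ret[1:])
        idx = int(rng.choice(len(p_t), p=p_t / p_t.sum()))
        tokens.append(idx)
    return tokens
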
Example #6
    def __init__(self, args):
        self.args = args
        x = T.imatrix('x')
        y = T.imatrix('y')
        mask = T.ones_like(x).astype(floatX)
        # FIXME TODO resume from the last state of the previous sequence
        # instead of resetting the first hidden state to 0s
        self.unit = args.unit
        if args.unit == 'gru':
            init_states = [T.matrix(dtype=floatX) for k in xrange(args.rlayers)]
        elif args.unit == 'lstm':
            init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX)) for k in xrange(args.rlayers)]
        else:
            raise ValueError('unrecognized unit: %s' % args.unit)
        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)

        rlayers = list()
        inp = theano.tensor.extra_ops.to_one_hot(x.flatten(), args.vocab_size).astype(floatX).reshape((x.shape[0], x.shape[1], args.vocab_size))
        seqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop, stocdrop=args.stocdrop)
        # x and y are separate inputs here, so the full sequence is fed in
        inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask, args.vocab_size, init_states[0], args, suffix='0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop, stocdrop=args.stocdrop)
            rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out, mask, seqmask, args.rnn_dim, init_states[k], args, suffix='%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.vocab_size)
        self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
        super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
        shapes = [p.shape.eval() for p in self.params]
        sizes = [np.prod(s) for s in shapes]
        self.nparams = np.sum(sizes)
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

        # functions

        if args.unit == 'lstm':
            init_states = flatten(init_states)
            final_states = list()
            for r in rlayers:
                final_states.append(r.out[-1])
                final_states.append(r.cell[-1])
        else:
            final_states = [r.out[-1] for r in rlayers]

        self.train = theano.function(
            inputs=[x, y, pdrop, lr] + init_states,
            outputs=[self.cost, self.grad_norm, self.param_norm] + final_states,
            updates=self.updates,
            on_unused_input='warn'
        )

        self.test = theano.function(
            # at test time should pass in pdrop=0
            inputs=[x, y, pdrop] + init_states,
            outputs=[self.cost] + final_states,
            updates=None,
            on_unused_input='warn'
        )

        # function for sampling

        i_t = T.ivector()
        x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
        h_ps = list()  # previous
        for k in xrange(args.rlayers):
            if args.unit == 'gru':
                h_ps.append(T.vector())
                dmask = T.ones_like(h_ps[0]).astype(floatX)
            else:
                h_ps.append((T.vector(), T.vector()))
                dmask = T.ones_like(h_ps[0][0]).astype(floatX)
        h_ts = list()
        if args.unit == 'lstm':
            h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
        else:
            h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        for k in xrange(1, args.rlayers):
            if args.unit == 'lstm':
                h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
            else:
                h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        if args.unit == 'lstm':
            h_t = h_t[0]
        E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
        E_t = T.exp(E_t - T.max(E_t))
        p_t = E_t / E_t.sum()
        if args.unit == 'lstm':
            h_ps = flatten(h_ps)
            h_ts = flatten(h_ts)
        self.decode_step = theano.function(
            inputs=[i_t] + h_ps,
            outputs=[p_t] + h_ts,
            updates=None,
            on_unused_input='warn'
        )
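
The sampling code reads self.olayer.W and self.olayer.b directly, which pins down the interface SequenceLogisticRegression must expose: a per-timestep softmax layer with .out, .W, and .b. A minimal sketch under that assumption follows; the initialization scheme is a guess.

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX

class SequenceLogisticRegression(object):
    def __init__(self, inp, n_in, n_out):
        # inp: (time, batch, n_in) activations from the top recurrent layer.
        self.W = theano.shared(
            (0.01 * np.random.randn(n_in, n_out)).astype(floatX), name='W')
        self.b = theano.shared(np.zeros(n_out, dtype=floatX), name='b')
        energies = T.dot(inp, self.W) + self.b       # (time, batch, n_out)
        # numerically stable softmax over the vocabulary axis
        e = T.exp(energies - energies.max(axis=-1, keepdims=True))
        self.out = e / e.sum(axis=-1, keepdims=True)
        self.params = [self.W, self.b]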