Example #1
 def __init__(self, rep, y, mask, L_dec, pdrop, args):
     self.h0s = rep
     outputs_info = self.h0s
     rlayers = list()
     self.subset = L_dec[y.flatten()]
     inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
     seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
     # exclude last prediction
     inplayer = GRULayer(
         inp[:-1].astype(floatX), mask[:-1], seqmask[:-1], args.rnn_dim, outputs_info[0], args, suffix="dec0"
     )
     rlayers.append(inplayer)
     for k in xrange(1, args.rlayers):
         seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], args.rnn_dim), pdrop)
         rlayer = GRULayer(
             Dropout(rlayers[-1].out, pdrop).out,
             mask[:-1],
             seqmask[:-1],
             args.rnn_dim,
             outputs_info[k],
             args,
             suffix="dec%d" % k,
         )
         rlayers.append(rlayer)
     olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.tgt_vocab_size)
     cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
     super(RNNDecoder, self).__init__(rlayers, olayer, cost)
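
The helper get_sequence_dropout_mask is not shown on this page. A minimal sketch of what it plausibly does, assuming inverted dropout (the mask is scaled by 1/(1 - pdrop) at train time, so passing pdrop=0 at test time yields all ones):

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
srng = MRG_RandomStreams(seed=1234)

def get_sequence_dropout_mask(shape, pdrop):
    # keep each unit with probability 1 - pdrop and rescale so the
    # expected activation is unchanged (inverted dropout)
    keep = 1.0 - pdrop
    return srng.binomial(shape, p=keep, dtype=floatX) / keep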
Example #2
 def __init__(self, rep, y, mask, L_dec, pdrop, args):
     self.h0s = rep
     outputs_info = self.h0s
     rlayers = list()
     self.subset = L_dec[y.flatten()]
     inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
     seqmask = get_sequence_dropout_mask(
         (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
     # exclude last prediction
     inplayer = GRULayer(inp[:-1].astype(floatX),
                         mask[:-1],
                         seqmask[:-1],
                         args.rnn_dim,
                         outputs_info[0],
                         args,
                         suffix='dec0')
     rlayers.append(inplayer)
     for k in xrange(1, args.rlayers):
         seqmask = get_sequence_dropout_mask(
             (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
         rlayer = GRULayer(Dropout(rlayers[-1].out, pdrop).out,
                           mask[:-1],
                           seqmask[:-1],
                           args.rnn_dim,
                           outputs_info[k],
                           args,
                           suffix='dec%d' % k)
         rlayers.append(rlayer)
     olayer = SequenceLogisticRegression(
         Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
         args.tgt_vocab_size)
     cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
     super(RNNDecoder, self).__init__(rlayers, olayer, cost)
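
seq_cat_crossent is likewise external to these snippets. One plausible reading, assuming olayer.out holds per-timestep softmax probabilities of shape (seq_len, batch, vocab) and mask is a 0/1 float matrix (the 1e-8 epsilon and the normalize behaviour are assumptions):

import theano.tensor as T

def seq_cat_crossent(probs, targets, mask, normalize=False):
    # probs: (seq_len, batch, vocab); targets, mask: (seq_len, batch)
    probs_flat = probs.reshape((-1, probs.shape[2]))
    nll = -T.log(probs_flat[T.arange(probs_flat.shape[0]),
                            targets.flatten()] + 1e-8)
    nll = nll.reshape((targets.shape[0], targets.shape[1])) * mask
    cost = nll.sum()
    if normalize:
        cost = cost / T.maximum(mask.sum(), 1.0)
    return cost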
Example #3
 def __init__(self, x, xr, mask, space_mask, L_enc, pdrop, args):
     # NOTE shape[1] is batch size since shape[0] is seq length
     outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
     flayers = list()
     blayers = list()
     fsubset = L_enc[x.flatten()]
     bsubset = L_enc[xr.flatten()]
     finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
     binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
     fseqmask = get_sequence_dropout_mask(
         (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
     bseqmask = get_sequence_dropout_mask(
         (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
     finplayer = GRULayer(finp.astype(floatX),
                          mask,
                          fseqmask,
                          args.rnn_dim,
                          outputs_info,
                          args,
                          suffix='fenc0')
     binplayer = GRULayer(binp.astype(floatX),
                          mask,
                          bseqmask,
                          args.rnn_dim,
                          outputs_info,
                          args,
                          suffix='benc0',
                          backwards=True)
     flayers.append(finplayer)
     blayers.append(binplayer)
     self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
     self.routs.append(finplayer.out + binplayer.out)
     for k in xrange(1, args.rlayers):
         inp = self.routs[-1]
         fseqmask = get_sequence_dropout_mask(
             (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
         bseqmask = get_sequence_dropout_mask(
             (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
         flayer = GRULayer(Dropout(inp, pdrop).out,
                           mask,
                           fseqmask,
                           args.rnn_dim,
                           outputs_info,
                           args,
                           suffix='fenc%d' % k)
         blayer = GRULayer(Dropout(inp, pdrop).out,
                           mask,
                           bseqmask,
                           args.rnn_dim,
                           outputs_info,
                           args,
                           suffix='benc%d' % k,
                           backwards=True)
         self.routs.append(flayer.out + blayer.out)
         flayers.append(flayer)
         blayers.append(blayer)
     self.hs = self.routs[-1]  # for attention
     olayer = LayerWrapper(self.routs)
     rlayers = flayers + blayers  # NOTE this list holds both directions, so don't assume len(rlayers) == number of stacked layers
     super(BiRNNEncoder, self).__init__(rlayers, olayer)
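
Dropout is used as a wrapper object whose masked tensor is read off .out. A minimal sketch, again assuming inverted dropout so that pdrop=0 makes it an identity:

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
srng = MRG_RandomStreams(seed=4321)

class Dropout(object):
    def __init__(self, inp, pdrop):
        keep = 1.0 - pdrop
        mask = srng.binomial(inp.shape, p=keep, dtype=floatX)
        # rescale at train time so no rescaling is needed at test time
        self.out = inp * mask / keep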
Example #4
    def __init__(self,
                 x,
                 mask,
                 space_mask,
                 L_enc,
                 pdrop,
                 args,
                 suffix_prefix='enc',
                 backwards=False):
        # NOTE shape[1] is batch size since shape[0] is seq length
        outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
        rlayers = list()
        self.subset = L_enc[x.flatten()]
        inp = self.subset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        seqmask = get_sequence_dropout_mask(
            (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        inplayer = GRULayer(inp.astype(floatX),
                            mask,
                            seqmask,
                            args.rnn_dim,
                            outputs_info,
                            args,
                            suffix='%s0' % suffix_prefix,
                            backwards=backwards)
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            inp = rlayers[-1].out
            seqmask = get_sequence_dropout_mask(
                (x.shape[0], x.shape[1], args.rnn_dim), pdrop)
            rlayer = GRULayer(Dropout(inp, pdrop).out,
                              mask,
                              seqmask,
                              args.rnn_dim,
                              outputs_info,
                              args,
                              suffix='%s%d' % (suffix_prefix, k),
                              backwards=backwards)
            rlayers.append(rlayer)

        # extract the final outputs according to the mask; note we don't know
        # the seq length or the current batch size at graph construction time
        # NOTE this would be used for the decoder's initial hidden states in standard seq2seq but is currently unused
        lens = T.sum(mask, axis=0)
        # will extract A[lens[k] - 1, k, :] for k in [0, batch_size)
        self.routs = list()
        for rlayer in rlayers:
            rout = rlayer.out[lens - 1,
                              theano.tensor.arange(x.shape[1]), :].astype(
                                  floatX)
            self.routs.append(rout)
        self.hs = rlayers[-1].out  # for attention

        olayer = LayerWrapper(self.routs)
        super(RNNEncoder, self).__init__(rlayers, olayer)
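
The fancy indexing rlayer.out[lens - 1, T.arange(x.shape[1]), :] picks, for every batch column k, the hidden state at that sequence's last unmasked timestep. A small NumPy illustration of the same indexing (an analogy, not model code):

import numpy as np

seq_len, batch, dim = 4, 3, 2
out = np.arange(seq_len * batch * dim).reshape(seq_len, batch, dim)
mask = np.array([[1, 1, 1],
                 [1, 1, 0],
                 [1, 0, 0],
                 [0, 0, 0]])      # sequence lengths 3, 2, 1
lens = mask.sum(axis=0)           # [3, 2, 1]
final = out[lens - 1, np.arange(batch), :]   # out[lens[k] - 1, k, :], shape (batch, dim)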
Example #5
    def __init__(self, x, xr, mask, L_enc, pdrop, args):
        # NOTE shape[1] is batch size since shape[0] is seq length
        outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
        flayers = list()
        blayers = list()
        fsubset = L_enc[x.flatten()]
        bsubset = L_enc[xr.flatten()]
        finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        fseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        bseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        finplayer = GRULayer(finp.astype(floatX), mask, fseqmask, args.rnn_dim, outputs_info, args, suffix="fenc0")
        binplayer = GRULayer(
            binp.astype(floatX), mask, bseqmask, args.rnn_dim, outputs_info, args, suffix="benc0", backwards=True
        )
        flayers.append(finplayer)
        blayers.append(binplayer)
        self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
        self.routs.append(finplayer.out + binplayer.out)
        downs = []
        for k in xrange(1, args.rlayers):
            # pair up consecutive timesteps from the previous layer, halving the sequence length (pyramid downscaling)
            d = Downscale(self.routs[-1], args.rnn_dim, suffix="ds%d" % k)
            downs.append(d)
            inp = d.out
            twocols = mask.T.reshape([-1, 2])
            mask = T.or_(twocols[:, 0], twocols[:, 1]).reshape([mask.shape[1], -1]).T

            fseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
            bseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
            flayer = GRULayer(
                Dropout(inp, pdrop).out, mask, fseqmask, args.rnn_dim, outputs_info, args, suffix="fenc%d" % k
            )
            blayer = GRULayer(
                Dropout(inp, pdrop).out,
                mask,
                bseqmask,
                args.rnn_dim,
                outputs_info,
                args,
                suffix="benc%d" % k,
                backwards=True,
            )
            self.routs.append(flayer.out + blayer.out)
            flayers.append(flayer)
            blayers.append(blayer)
        self.hs = self.routs[-1]  # for attention
        olayer = LayerWrapper(self.routs)
        rlayers = flayers + blayers  # NOTE this list holds both directions, so don't assume len(rlayers) == number of stacked layers

        # undo the temporary hack
        super(BiPyrRNNEncoder, self).__init__(rlayers, olayer, downscales=downs)
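
The mask-halving trick inside the loop pairs consecutive timesteps and keeps a pooled step alive if either original step was alive, so the mask shrinks along with the pyramid. A NumPy illustration (not part of the model):

import numpy as np

mask = np.array([[1, 1],
                 [1, 1],
                 [1, 0],
                 [1, 0]])                       # (seq_len=4, batch=2), lengths 4 and 2
twocols = mask.T.reshape(-1, 2)                 # consecutive timestep pairs per batch row
halved = np.logical_or(twocols[:, 0], twocols[:, 1]).astype(int)
halved = halved.reshape(mask.shape[1], -1).T    # (seq_len/2, batch)
# halved == [[1, 1],
#            [1, 0]]   -> lengths 2 and 1 after one pooling step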
Example #6
    def __init__(self, encoder, y, mask, L_dec, pdrop, args):
        self.hs = encoder.hs

        # NOTE helper so that only the last decoder layer uses attention
        def layer_init(attention):
            if not attention:
                return GRULayer
            else:
                return lambda *largs, **kwargs: GRULayerAttention(
                    self.hs, *largs, **kwargs)

        # initial states
        outputs_info = [
            T.zeros_like(self.hs[0]) for k in xrange(len(encoder.routs))
        ]
        rlayers = list()
        self.subset = L_dec[y.flatten()]
        inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
        attention = args.rlayers == 1
        # exclude last prediction
        seqmask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
        inplayer = layer_init(attention)(inp[:-1].astype(floatX),
                                         mask[:-1],
                                         seqmask[:-1],
                                         args.rnn_dim,
                                         outputs_info[0],
                                         args,
                                         suffix='dec0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            attention = (args.rlayers == k + 1)
            seqmask = get_sequence_dropout_mask(
                (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
            rlayer = layer_init(attention)(Dropout(rlayers[-1].out, pdrop).out,
                                           mask[:-1],
                                           seqmask[:-1],
                                           args.rnn_dim,
                                           outputs_info[k],
                                           args,
                                           suffix='dec%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(
            Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
            args.tgt_vocab_size)
        cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
        super(RNNDecoderAttention, self).__init__(rlayers, olayer, cost)
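
A rough end-to-end wiring sketch for this attention decoder: it only needs an encoder exposing .hs and .routs, so any of the encoders on this page fits. The args namespace, the dimensions, and the embedding matrices below are assumptions, not code from this repository:

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX

class args(object):          # hypothetical hyperparameters; the real scripts build args with argparse
    rnn_dim = 256
    rlayers = 2
    tgt_vocab_size = 5000

emb_dim, src_vocab = 128, 4000
L_enc = theano.shared(np.random.randn(src_vocab, emb_dim).astype(floatX))
L_dec = theano.shared(np.random.randn(args.tgt_vocab_size, emb_dim).astype(floatX))

x, y = T.imatrix('x'), T.imatrix('y')   # (seq_len, batch) index matrices
xr = x[::-1]                            # reversed source for the backward GRUs
src_mask = T.matrix('src_mask')         # 0/1 float masks
tgt_mask = T.matrix('tgt_mask')
space_mask = T.matrix('space_mask')
pdrop = T.scalar('pdrop')

encoder = BiRNNEncoder(x, xr, src_mask, space_mask, L_enc, pdrop, args)
decoder = RNNDecoderAttention(encoder, y, tgt_mask, L_dec, pdrop, args)
cost = decoder.cost   # masked cross-entropy over the shifted targets y[1:]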
Example #7
    def __init__(self, x, mask, space_mask, L_enc, pdrop, args, suffix_prefix="enc", backwards=False):
        # NOTE shape[1] is batch size since shape[0] is seq length
        outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
        rlayers = list()
        self.subset = L_enc[x.flatten()]
        inp = self.subset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        seqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        inplayer = GRULayer(
            inp.astype(floatX),
            mask,
            seqmask,
            args.rnn_dim,
            outputs_info,
            args,
            suffix="%s0" % suffix_prefix,
            backwards=backwards,
        )
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            inp = rlayers[-1].out
            seqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], args.rnn_dim), pdrop)
            rlayer = GRULayer(
                Dropout(inp, pdrop).out,
                mask,
                seqmask,
                args.rnn_dim,
                outputs_info,
                args,
                suffix="%s%d" % (suffix_prefix, k),
                backwards=backwards,
            )
            rlayers.append(rlayer)

        # extract the final outputs according to the mask; note we don't know
        # the seq length or the current batch size at graph construction time
        # NOTE this would be used for the decoder's initial hidden states in standard seq2seq but is currently unused
        lens = T.sum(mask, axis=0)
        # will extract A[lens[k] - 1, k, :] for k in [0, batch_size)
        self.routs = list()
        for rlayer in rlayers:
            rout = rlayer.out[lens - 1, theano.tensor.arange(x.shape[1]), :].astype(floatX)
            self.routs.append(rout)
        self.hs = rlayers[-1].out  # for attention

        olayer = LayerWrapper(self.routs)
        super(RNNEncoder, self).__init__(rlayers, olayer)
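
For the plain (non-attention) RNNDecoder at the top of this page, the rep argument is a list of per-layer initial states, which is exactly what RNNEncoder exposes as .routs. Reusing the symbolic setup from the earlier sketch, a pairing might look like this (an assumption, not repository code):

encoder = RNNEncoder(x, src_mask, space_mask, L_enc, pdrop, args)
decoder = RNNDecoder(encoder.routs, y, tgt_mask, L_dec, pdrop, args)
cost = decoder.cost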
Example #8
 def __init__(self, x, xr, mask, space_mask, L_enc, pdrop, args):
     # NOTE shape[1] is batch size since shape[0] is seq length
     outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
     flayers = list()
     blayers = list()
     fsubset = L_enc[x.flatten()]
     bsubset = L_enc[xr.flatten()]
     finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
     binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
     fseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
     bseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
     finplayer = GRULayer(finp.astype(floatX), mask, fseqmask, args.rnn_dim, outputs_info, args, suffix="fenc0")
     binplayer = GRULayer(
         binp.astype(floatX), mask, bseqmask, args.rnn_dim, outputs_info, args, suffix="benc0", backwards=True
     )
     flayers.append(finplayer)
     blayers.append(binplayer)
     self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
     self.routs.append(finplayer.out + binplayer.out)
     for k in xrange(1, args.rlayers):
         inp = self.routs[-1]
         fseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
         bseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
         flayer = GRULayer(
             Dropout(inp, pdrop).out, mask, fseqmask, args.rnn_dim, outputs_info, args, suffix="fenc%d" % k
         )
         blayer = GRULayer(
             Dropout(inp, pdrop).out,
             mask,
             bseqmask,
             args.rnn_dim,
             outputs_info,
             args,
             suffix="benc%d" % k,
             backwards=True,
         )
         self.routs.append(flayer.out + blayer.out)
         flayers.append(flayer)
         blayers.append(blayer)
     self.hs = self.routs[-1]  # for attention
     olayer = LayerWrapper(self.routs)
     rlayers = flayers + blayers  # NOTE this list holds both directions, so don't assume len(rlayers) == number of stacked layers
     super(BiRNNEncoder, self).__init__(rlayers, olayer)
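
LayerWrapper is not shown either; from its usage it looks like a thin container whose .out hands the encoder's chosen representation to downstream code. A minimal guess at its shape:

class LayerWrapper(object):
    def __init__(self, outs):
        self.out = outs     # e.g. a list of per-layer outputs or final states
        self.params = []    # no trainable parameters of its own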
Example #9
    def __init__(self, encoder, y, mask, L_dec, pdrop, args):
        self.hs = encoder.hs
        # NOTE helper so that only the last decoder layer uses attention
        def layer_init(attention):
            if not attention:
                return GRULayer
            else:
                return lambda *largs, **kwargs: GRULayerAttention(self.hs, *largs, **kwargs)

        # initial states
        outputs_info = [T.zeros_like(self.hs[0]) for k in xrange(len(encoder.routs))]
        rlayers = list()
        self.subset = L_dec[y.flatten()]
        inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
        attention = args.rlayers == 1
        # exclude last prediction
        seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
        inplayer = layer_init(attention)(
            inp[:-1].astype(floatX), mask[:-1], seqmask[:-1], args.rnn_dim, outputs_info[0], args, suffix="dec0"
        )
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            attention = args.rlayers == k + 1
            seqmask = get_sequence_dropout_mask((y.shape[0], y.shape[1], args.rnn_dim), pdrop)
            rlayer = layer_init(attention)(
                Dropout(rlayers[-1].out, pdrop).out,
                mask[:-1],
                seqmask[:-1],
                args.rnn_dim,
                outputs_info[k],
                args,
                suffix="dec%d" % k,
            )
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.tgt_vocab_size)
        cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
        super(RNNDecoderAttention, self).__init__(rlayers, olayer, cost)
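
GRULayerAttention's internals are not included in these snippets. A heavily hedged sketch of the per-timestep attention it presumably computes over the encoder states self.hs, using simple dot-product scoring (the real layer's parametrization may differ):

import theano.tensor as T

def attend(h_dec, hs, src_mask=None):
    # h_dec: (batch, rnn_dim) current decoder state
    # hs:    (src_len, batch, rnn_dim) encoder states
    scores = T.sum(hs * h_dec.dimshuffle('x', 0, 1), axis=2)   # (src_len, batch)
    if src_mask is not None:
        scores = scores + (1.0 - src_mask) * -1e9               # ignore padded positions
    alpha = T.nnet.softmax(scores.T).T                          # normalize over src_len
    context = T.sum(hs * alpha.dimshuffle(0, 1, 'x'), axis=0)   # (batch, rnn_dim)
    return context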
Example #10
    def __init__(self, args):
        self.args = args
        x = T.imatrix('x')
        y = T.imatrix('y')
        mask = T.ones_like(x).astype(floatX)
        # FIXME TODO resume from last state of previous sequence instead of
        # resetting the first hidden state to 0s
        self.unit = args.unit
        if args.unit == 'gru':
            init_states = [
                T.matrix(dtype=floatX) for k in xrange(args.rlayers)
            ]
        elif args.unit == 'lstm':
            init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                           for k in xrange(args.rlayers)]
        else:
            assert False
        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)

        rlayers = list()
        inp = theano.tensor.extra_ops.to_one_hot(
            x.flatten(), args.vocab_size).astype(floatX).reshape(
                (x.shape[0], x.shape[1], args.vocab_size))
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim),
            pdrop,
            stocdrop=args.stocdrop)
        # the input layer consumes the one-hot input directly (dim = vocab_size)
        inplayer = UnitInit[args.unit](inp.astype(floatX),
                                       mask,
                                       seqmask,
                                       args.vocab_size,
                                       init_states[0],
                                       args,
                                       suffix='0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask(
                (inp.shape[0], inp.shape[1], args.rnn_dim),
                pdrop,
                stocdrop=args.stocdrop)
            rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                         mask,
                                         seqmask,
                                         args.rnn_dim,
                                         init_states[k],
                                         args,
                                         suffix='%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(
            Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.vocab_size)
        self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
        super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
        shapes = [p.shape.eval() for p in self.params]
        sizes = [np.prod(s) for s in shapes]
        self.nparams = np.sum(sizes)
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(
            args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

        # functions

        if args.unit == 'lstm':
            init_states = flatten(init_states)
            final_states = list()
            for r in rlayers:
                final_states.append(r.out[-1])
                final_states.append(r.cell[-1])
        else:
            final_states = [r.out[-1] for r in rlayers]

        self.train = theano.function(
            inputs=[x, y, pdrop, lr] + init_states,
            outputs=[self.cost, self.grad_norm, self.param_norm] +
            final_states,
            updates=self.updates,
            on_unused_input='warn')

        self.test = theano.function(
            # at test time should pass in pdrop=0
            inputs=[x, y, pdrop] + init_states,
            outputs=[self.cost] + final_states,
            updates=None,
            on_unused_input='warn')

        # function for sampling

        i_t = T.ivector()
        x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
        h_ps = list()  # previous
        for k in xrange(args.rlayers):
            if args.unit == 'gru':
                h_ps.append(T.vector())
                dmask = T.ones_like(h_ps[0]).astype(floatX)
            else:
                h_ps.append((T.vector(), T.vector()))
                dmask = T.ones_like(h_ps[0][0]).astype(floatX)
        h_ts = list()
        if args.unit == 'lstm':
            h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
        else:
            h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        for k in xrange(1, args.rlayers):
            if args.unit == 'lstm':
                h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
            else:
                h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        if args.unit == 'lstm':
            h_t = h_t[0]
        E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
        E_t = T.exp(E_t - T.max(E_t))
        p_t = E_t / E_t.sum()
        if args.unit == 'lstm':
            h_ps = flatten(h_ps)
            h_ts = flatten(h_ts)
        self.decode_step = theano.function(inputs=[i_t] + h_ps,
                                           outputs=[p_t] + h_ts,
                                           updates=None,
                                           on_unused_input='warn')
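
decode_step emits the next-token distribution together with the updated hidden states, so sampling is a small Python loop around it. A hypothetical driver for the GRU case (the state shapes and argument names are assumptions):

import numpy as np
import theano

floatX = theano.config.floatX

def sample(model, args, start_idx, eos_idx, max_len=100):
    h_prev = [np.zeros(args.rnn_dim, dtype=floatX) for _ in range(args.rlayers)]
    idx, out = start_idx, []
    for _ in range(max_len):
        ret = model.decode_step(np.array([idx], dtype='int32'), *h_prev)
        p_t, h_prev = ret[0], ret[1:]
        p = p_t.astype('float64')
        p /= p.sum()                      # guard against float32 rounding
        idx = int(np.random.choice(len(p), p=p))
        if idx == eos_idx:
            break
        out.append(idx)
    return out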
Example #11
    def __init__(self, args):
        self.args = args
        x = T.imatrix('x')
        y = T.imatrix('y')
        mask = T.ones_like(x).astype(floatX)
        # FIXME TODO resume from last state of previous sequence instead of
        # resetting the first hidden state to 0s
        self.unit = args.unit
        if args.unit == 'gru':
            init_states = [T.matrix(dtype=floatX) for k in xrange(args.rlayers)]
        elif args.unit == 'lstm':
            init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX)) for k in xrange(args.rlayers)]
        else:
            assert False
        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)

        rlayers = list()
        inp = theano.tensor.extra_ops.to_one_hot(x.flatten(), args.vocab_size).astype(floatX).reshape((x.shape[0], x.shape[1], args.vocab_size))
        seqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop, stocdrop=args.stocdrop)
        # the input layer consumes the one-hot input directly (dim = vocab_size)
        inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask, args.vocab_size, init_states[0], args, suffix='0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop, stocdrop=args.stocdrop)
            rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out, mask, seqmask, args.rnn_dim, init_states[k], args, suffix='%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
                args.vocab_size)
        self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
        super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
        shapes = [p.shape.eval() for p in self.params]
        sizes = [np.prod(s) for s in shapes]
        self.nparams = np.sum(sizes)
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

        # functions

        if args.unit == 'lstm':
            init_states = flatten(init_states)
            final_states = list()
            for r in rlayers:
                final_states.append(r.out[-1])
                final_states.append(r.cell[-1])
        else:
            final_states = [r.out[-1] for r in rlayers]

        self.train = theano.function(
            inputs=[x, y, pdrop, lr] + init_states,
            outputs=[self.cost, self.grad_norm, self.param_norm] + final_states,
            updates=self.updates,
            on_unused_input='warn'
        )

        self.test = theano.function(
            # at test time should pass in pdrop=0
            inputs=[x, y, pdrop] + init_states,
            outputs=[self.cost] + final_states,
            updates=None,
            on_unused_input='warn'
        )

        # function for sampling

        i_t = T.ivector()
        x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
        h_ps = list()  # previous
        for k in xrange(args.rlayers):
            if args.unit == 'gru':
                h_ps.append(T.vector())
                dmask = T.ones_like(h_ps[0]).astype(floatX)
            else:
                h_ps.append((T.vector(), T.vector()))
                dmask = T.ones_like(h_ps[0][0]).astype(floatX)
        h_ts = list()
        if args.unit == 'lstm':
            h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
        else:
            h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        for k in xrange(1, args.rlayers):
            if args.unit == 'lstm':
                h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
            else:
                h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        if args.unit == 'lstm':
            h_t = h_t[0]
        E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
        E_t = T.exp(E_t - T.max(E_t))
        p_t = E_t / E_t.sum()
        if args.unit == 'lstm':
            h_ps = flatten(h_ps)
            h_ts = flatten(h_ts)
        self.decode_step = theano.function(
            inputs=[i_t] + h_ps,
            outputs=[p_t] + h_ts,
            updates=None,
            on_unused_input='warn'
        )
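
The max-subtraction in E_t = T.exp(E_t - T.max(E_t)) leaves the resulting softmax unchanged but keeps exp() from overflowing. A NumPy illustration (not model code):

import numpy as np

logits = np.array([1000.0, 1001.0, 1002.0])
shifted = np.exp(logits - logits.max())    # [e**-2, e**-1, 1]; np.exp(logits) alone would overflow to inf
softmax = shifted / shifted.sum()          # ~[0.090, 0.245, 0.665]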
Example #12
    def __init__(self, x, xr, mask, L_enc, pdrop, args):
        # NOTE shape[1] is batch size since shape[0] is seq length
        outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
        flayers = list()
        blayers = list()
        fsubset = L_enc[x.flatten()]
        bsubset = L_enc[xr.flatten()]
        finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        fseqmask = get_sequence_dropout_mask(
            (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        bseqmask = get_sequence_dropout_mask(
            (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        finplayer = GRULayer(finp.astype(floatX),
                             mask,
                             fseqmask,
                             args.rnn_dim,
                             outputs_info,
                             args,
                             suffix='fenc0')
        binplayer = GRULayer(binp.astype(floatX),
                             mask,
                             bseqmask,
                             args.rnn_dim,
                             outputs_info,
                             args,
                             suffix='benc0',
                             backwards=True)
        flayers.append(finplayer)
        blayers.append(binplayer)
        self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
        self.routs.append(finplayer.out + binplayer.out)
        downs = []
        for k in xrange(1, args.rlayers):
            # pair up consecutive timesteps from the previous layer, halving the sequence length (pyramid downscaling)
            d = Downscale(self.routs[-1], args.rnn_dim, suffix='ds%d' % k)
            downs.append(d)
            inp = d.out
            twocols = mask.T.reshape([-1, 2])
            mask = T.or_(twocols[:, 0],
                         twocols[:, 1]).reshape([mask.shape[1], -1]).T

            fseqmask = get_sequence_dropout_mask(
                (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
            bseqmask = get_sequence_dropout_mask(
                (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
            flayer = GRULayer(Dropout(inp, pdrop).out,
                              mask,
                              fseqmask,
                              args.rnn_dim,
                              outputs_info,
                              args,
                              suffix='fenc%d' % k)
            blayer = GRULayer(Dropout(inp, pdrop).out,
                              mask,
                              bseqmask,
                              args.rnn_dim,
                              outputs_info,
                              args,
                              suffix='benc%d' % k,
                              backwards=True)
            self.routs.append(flayer.out + blayer.out)
            flayers.append(flayer)
            blayers.append(blayer)
        self.hs = self.routs[-1]  # for attention
        olayer = LayerWrapper(self.routs)
        rlayers = flayers + blayers  # NOTE this list holds both directions, so don't assume len(rlayers) == number of stacked layers

        # undo the temporary hack
        super(BiPyrRNNEncoder, self).__init__(rlayers,
                                              olayer,
                                              downscales=downs)
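
Because each extra pyramid layer halves the time axis with a reshape into pairs, the padded source length has to be divisible by 2 ** (args.rlayers - 1). A small padding helper one might use when batching (an assumption, not repository code):

def pad_length(seq_len, rlayers):
    # round seq_len up to a multiple of 2 ** (rlayers - 1)
    factor = 2 ** (rlayers - 1)
    return ((seq_len + factor - 1) // factor) * factor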