Example no. 1
1
def __init__(self,
             enc_nhids=1000,
             dec_nhids=1000,
             enc_embed=620,
             dec_embed=620,
             src_vocab_size=30000,
             trg_vocab_size=30000,
             **kwargs):
    self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size, prefix='src_lookup_table')
    self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size, prefix='trg_lookup_table')
    self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
    self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids*2, **kwargs)
    self.logistic = LogisticRegression(kwargs.get('n_out', dec_nhids), trg_vocab_size, prefix='logistic', drop_rate=kwargs['dropout'])
    self.params = self.src_lookup_table.params + self.trg_lookup_table.params + self.encoder.params + self.decoder.params \
        + self.logistic.params
    self.tparams = OrderedDict([(param.name, param) for param in self.params])
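The constructor wires up a standard attention-based encoder-decoder: source and target lookup tables, a bidirectional GRU encoder (hence c_hids=enc_nhids*2, the concatenation of forward and backward states), a Decoder over that context, and a LogisticRegression output layer, with every trainable parameter collected in params and indexed by name in tparams. A minimal instantiation sketch, assuming this __init__ belongs to the Translate class of Examples 3 and 4 and that Lookup_table, BiGRU, Decoder and LogisticRegression are importable from the project; note that dropout is mandatory here because of the kwargs['dropout'] access:

model = Translate(enc_nhids=1000,
                  dec_nhids=1000,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  dropout=0.5)  # required: kwargs['dropout'] raises KeyError without it
print(len(model.params))        # the same parameters are reachable by name via model.tparams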
Example no. 2
0
    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.lr_in = kwargs.get('n_out', dec_nhids)

        self.src_lookup_table = Lookup_table(enc_embed,
                                             src_vocab_size,
                                             prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed,
                                             trg_vocab_size,
                                             prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        # enc_nhids*2 corresponds to the last dimension of the encoded state
        self.decoder = Decoder(dec_embed,
                               dec_nhids,
                               c_hids=enc_nhids * 2,
                               **kwargs)
        # the output size of the decoder must equal lr_in when n_out is not defined
        self.logistic = LogisticRegression(self.lr_in,
                                           trg_vocab_size,
                                           prefix='logistic',
                                           **kwargs)
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param)
                                    for param in self.params])
        self.use_mv = kwargs.get('use_mv', 0)
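Functionally this version differs from Example 1 only in the lr_in indirection, the **kwargs pass-through to LogisticRegression, and the use_mv flag. kwargs.get('n_out', dec_nhids) lets a caller override the logistic layer's input size while defaulting to the decoder's hidden size; a plain-Python sketch of that default-lookup pattern:

kwargs = {'dropout': 0.5}                # no 'n_out' supplied
lr_in = kwargs.get('n_out', 1000)        # falls back to dec_nhids -> 1000

kwargs = {'dropout': 0.5, 'n_out': 512}  # caller overrides the projection size
lr_in = kwargs.get('n_out', 1000)        # -> 512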
Example no. 3
0
class Translate(object):

    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size, prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size, prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids*2, **kwargs)
        self.logistic = LogisticRegression(kwargs.get('n_out', dec_nhids), trg_vocab_size, prefix='logistic', drop_rate=kwargs['dropout'])
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + self.encoder.params + self.decoder.params  \
            + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])

    def apply(self, source, source_mask, target, target_mask, **kwargs):
        sbelow = self.src_lookup_table.apply(source)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)

        s_rep = self.encoder.apply(sbelow, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)

        cost_matrix = self.logistic.cost(hiddens, target, target_mask)
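        # average cross entropy per sentence: total batch cost divided by the number of sentences, target_mask.shape[1]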
        self.cost = cost_matrix.sum()/target_mask.shape[1]

    def _next_prob_state(self, y, state, c, c_x):
        next_state, merge_out = self.decoder.next_state_merge(y, state, c, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
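        # x: source word ids, shape (src_sent_len, batch_size)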
        sbelow = self.src_lookup_table.apply(x)
        ctx = self.encoder.apply(sbelow, mask=None)
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        init_state = self.decoder.init_state(ctx)

        outs = [init_state, ctx]
        f_init = theano.function([x], outs, name='f_init')

        y = T.vector('y_sampler', dtype='int64')
        y_emb = self.trg_lookup_table.index(y)
        init_state = T.matrix('init_state', dtype='float32')
        next_probs, next_state = self._next_prob_state(y_emb, init_state, ctx, c_x)

        inps = [y, ctx, init_state]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')

        return f_init, f_next

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value()) for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            value.set_value(params_value[key])
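build_sample compiles the two-function interface an external search loop needs: f_init encodes a source sentence and returns the initial decoder state together with the annotations ctx, and f_next maps (previous word, ctx, previous state) to (next-word probabilities, next state). A greedy-decoding sketch on top of it, given a constructed Translate instance model (see the sketch after Example 1); the end-of-sentence id 0 and the -1 start-symbol convention are assumptions about this codebase, not documented facts:

import numpy

f_init, f_next = model.build_sample()

x = numpy.array([[4], [17], [29999]], dtype='int64')  # one source sentence, shape (src_sent_len, 1)
state, ctx = f_init(x)

y = numpy.array([-1], dtype='int64')  # assumed start-symbol convention
trans = []
for _ in range(50):                   # hard cap on output length
    probs, state = f_next(y, ctx, state)
    w = int(probs[0].argmax())        # greedy: take the most probable next word
    if w == 0:                        # assumed end-of-sentence id
        break
    trans.append(w)
    y = numpy.array([w], dtype='int64')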
Example no. 4
0
class Translate(object):
    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.lr_in = kwargs.get('n_out', dec_nhids)

        self.src_lookup_table = Lookup_table(enc_embed,
                                             src_vocab_size,
                                             prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed,
                                             trg_vocab_size,
                                             prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        # enc_nhids*2 corresponds to the last dimension of the encoded state
        self.decoder = Decoder(dec_embed,
                               dec_nhids,
                               c_hids=enc_nhids * 2,
                               **kwargs)
        # the output size of the decoder must equal lr_in when n_out is not defined
        self.logistic = LogisticRegression(self.lr_in,
                                           trg_vocab_size,
                                           prefix='logistic',
                                           **kwargs)
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param)
                                    for param in self.params])
        self.use_mv = kwargs.get('use_mv', 0)

    def apply(self,
              source,
              source_mask,
              target,
              target_mask,
              v_part=None,
              v_true=None,
              **kwargs):
        # sbelow and tbelow are 3-D tensors; sbelow[i][j] (tbelow[i][j]) is the
        # embedding of the i-th word of the j-th sentence in the batch
        # source and source_mask: shape (src_sent_len, batch_size)
        # target and target_mask: shape (trg_sent_len, batch_size)
        # all are theano.tensor.var.TensorVariable (numpy.ndarray once evaluated)
        # e.g. (40, 28, 620) = (src_sent_len, batch_size, srcw_embsz)
        sbelow = self.src_lookup_table.apply(source)
        # shaped differently from source: (trg_sent_len-1, batch_size, trgw_embsz)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)

        # (src_sent_len, batch_size, src_nhids*2): bidirectional encode source sentence
        s_rep = self.encoder.apply(sbelow, source_mask)
        # drop the last word of each sentence, '</S>'; the padding words are also </S> (id 29999)
        # tbelow[:-1] -> shape(trg_sent_len-1, batch_size, trgw_embsz)
        # target_mask[:-1] -> shape(trg_sent_len-1, batch_size)
        # hiddens, s, a, ss, als = self.decoder.apply(tbelow[:-1], target_mask[:-1], s_rep, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)
        # hiddens from decoder: shape(trg_sent_len-1, batch_size, n_out)
        # (padding words all 0)
        self.mean_cost, self.mean_abs_log_norm = self.logistic.cost(
            hiddens, target, target_mask, v_part, v_true)

        # cost_matrix: shape(trg_sent_len-1, batch_size); trg_sent_len is per-batch
        # and may differ between batches
        # cost_matrix.sum(): sum of all elements in cost_matrix
        # target_mask.shape[1]: the number of sentences in the batch
        # so cost_matrix.sum()/target_mask.shape[1] is the average cross
        # entropy per sentence in the batch

    '''
    Shapes of the variables used by build_sample below:
    y_emb_im1:  (trgw_embsz,)
    t_stat_im1: (batch_size, trg_nhids)
    ctx:        (src_sent_len, batch_size, src_nhids*2)
    c_x:        (src_sent_len, batch_size, trg_nhids)
    '''

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        mask = T.alloc(numpy.float32(1.), sbelow.shape[0], sbelow.shape[1])
        # (src_sent_len, batch_size, src_nhids*2) batch_size == 1 for decoding
        ctx = self.encoder.apply(sbelow, mask)
        # self.decoder.Ws: (src_nhids*2, trg_nhids)
        # self.decoder.bs: (trg_nhids,)
        # c_x: (src_sent_len, batch_size, trg_nhids), values roughly in (-1, 1)
        # since ctx is an input of f_next, c_x need not be passed separately:
        # Theano recomputes it from ctx inside the compiled function
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        # init_state: (batch_size, trg_nhids)
        init_state = self.decoder.init_state(ctx)  # no mask: a single sentence is decoded
        f_init = theano.function([x], [init_state, ctx, c_x], name='f_init')

        #--------------------------------------------------------------
        y_im1 = T.vector('y_sampler', dtype='int64')
        y_emb_im1 = self.trg_lookup_table.index(y_im1)
        f_emb = theano.function([y_im1], y_emb_im1, name='f_emb')

        t_yemb_im1 = T.matrix('t_yemb_im1', dtype='float32')
        t_stat_im1 = T.matrix('t_stat_im1', dtype='float32')

        #--------------------------------------------------------------
        # get next state h1: h_i = rnn(y_{i-1}, s_{i-1})
        # y_emb_im1: embedding of one target word, shape(1, trgw_embsz)
        hi = self.decoder._step_forward(x_t=t_yemb_im1,
                                        x_m=None,
                                        h_tm1=t_stat_im1)
        f_nh = theano.function([t_yemb_im1, t_stat_im1], hi, name='f_nh')

        #--------------------------------------------------------------
        t_hi = T.matrix('t_hi', dtype='float32')
        t_ctx = T.tensor3('t_ctx', dtype='float32')
        t_c_x = T.tensor3('t_c_x', dtype='float32')
        # next attention: a_i = a(h_i, c); the source context c does not change across steps
        pi, ai = self.decoder.attention_layer.apply(source_ctx=t_ctx,
                                                    source_mask=None,
                                                    source_x=t_c_x,
                                                    cur_hidden=t_hi)
        f_na = theano.function([t_ctx, t_c_x, t_hi], [pi, ai], name='f_na')

        #--------------------------------------------------------------
        # next final state: s_i = f(h_i, a_i), where h_i already encodes y_{i-1} and s_{i-1}
        t_ai = T.matrix('t_ai', dtype='float32')
        ns = self.decoder.state_with_attend(h1=t_hi, attended=t_ai)
        f_ns = theano.function([t_hi, t_ai], ns, name='f_ns')

        #--------------------------------------------------------------
        # merge_out = g(y_{i-1}, s_i, a_i)
        t_si = T.matrix('t_si', dtype='float32')
        merge_out = self.decoder.merge_out(y_emb_im1=t_yemb_im1,
                                           s_i=t_si,
                                           a_i=t_ai)
        f_mo = theano.function([t_yemb_im1, t_ai, t_si],
                               merge_out,
                               name='f_mo')

        #--------------------------------------------------------------
        # get model score of the whole vocab: nonlinear(merge_out)
        t_mo = T.matrix('t_mo', dtype='float32')
        if self.use_mv:
            ptv = T.vector('ptv', dtype='int64')
            ptv_ins = [t_mo, ptv]
            ptv_ous = self.logistic.apply_score(t_mo, ptv, drop=True)
        else:
            ptv_ins = [t_mo]
            ptv_ous = self.logistic.apply_score(t_mo, drop=True)
        f_pws = theano.function(ptv_ins, ptv_ous, name='f_pws')

        #--------------------------------------------------------------
        # vocabulary manipulation: score only a subset of the target vocabulary
        # (indexing with T.ivector() makes this slice very slow on CPU; the cause is unclear)
        y = T.wscalar('y')
        # get part model score slice: nonlinear(merge_out)[part]
        f_one = theano.function([t_mo, y],
                                self.logistic.apply_score_one(t_mo, y),
                                name='f_one')

        #--------------------------------------------------------------
        # distribution over target vocab: softmax(energy)
        t_pws = T.matrix('t_pws', dtype='float32')
        f_ce = theano.function([t_pws], T.nnet.softmax(t_pws), name='f_ce')
        # next_w (y_emb_im1): (k-dead_k,), the last word id of each translation candidate in the beam
        # ctx:  (src_sent_len, live_k, src_nhids*2)
        # t_stat_im1:           shape(k-dead_k, trg_nhids)
        # probs:  shape(k-dead_k, trg_vocab_size)

        # f_next: one full decoding step, combining the pieces above
        next_probs, next_state = self.next_prob_state(y_emb_im1, t_stat_im1,
                                                      ctx, c_x)

        inps = [y_im1, ctx, t_stat_im1]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')

        return [
            f_init, f_nh, f_na, f_ns, f_mo, f_pws, f_one, f_ce, f_next, f_emb
        ]

    def next_prob_state(self, y_emb_im1, s_im1, ctx, c_x):
        next_state, merge_out = self.decoder.next_state_mout(
            y_emb_im1, s_im1, ctx, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value())
                                    for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):  # overwrite all weights from file
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            # value is a theano.tensor.sharedvar.TensorSharedVariable and
            # params_value[key] a numpy.ndarray; set the shared variable
            # to the stored array
            value.set_value(params_value[key])
        '''
        type(params_value['logistic_W0']) is numpy.ndarray, shape (512, 30000):
array([[-0.00096034, -0.0392303 , -0.07458289, ..., -0.00285031,
         0.03942127, -0.03161906],
       [-0.03706803, -0.06445373, -0.00836279, ..., -0.01915432,
        -0.00247126,  0.17407075],
       [-0.00102945,  0.03983303, -0.00801838, ..., -0.02834764,
         0.02834882, -0.07769781],
       ...,
       [ 0.01267207,  0.07802714, -0.02748013, ...,  0.0485581 ,
        -0.00657458,  0.07204553],
       [ 0.01089897,  0.06406539, -0.04804269, ..., -0.03247456,
         0.04343275, -0.14596273],
       [ 0.01474529,  0.02925147,  0.01569422, ...,  0.01673588,
        -0.02202134,  0.19972666]], dtype=float32)
        '''

    def load2numpy(self, filename):  # return the saved weights as numpy arrays, leaving the model untouched
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        return params_value
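savez, load and load2numpy form a small persistence layer over the named shared variables in self.tparams (the iteritems() calls make this Python 2 code). A round-trip sketch with a hypothetical filename, assuming a model constructed as in the earlier sketches:

model.savez('model.npz')                # writes one array per named parameter

fresh = Translate(dropout=0.5)          # hypothetical: same architecture, fresh weights
fresh.load('model.npz')                 # overwrites every shared variable in place

arrays = fresh.load2numpy('model.npz')  # reads the arrays without touching the model
print(arrays['logistic_W0'].shape)      # e.g. (512, 30000), as in the dump above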