class Translate(object):

    def __init__(self, enc_nhids=1000, dec_nhids=1000, enc_embed=620, dec_embed=620,
                 src_vocab_size=30000, trg_vocab_size=30000, **kwargs):
        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size, prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size, prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids * 2, **kwargs)
        self.logistic = LogisticRegression(kwargs.get('n_out', dec_nhids), trg_vocab_size,
                                           prefix='logistic', drop_rate=kwargs['dropout'])
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])

    def apply(self, source, source_mask, target, target_mask, **kwargs):
        sbelow = self.src_lookup_table.apply(source)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)
        s_rep = self.encoder.apply(sbelow, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)
        cost_matrix = self.logistic.cost(hiddens, target, target_mask)
        self.cost = cost_matrix.sum() / target_mask.shape[1]

    def _next_prob_state(self, y, state, c, c_x):
        next_state, merge_out = self.decoder.next_state_merge(y, state, c, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        ctx = self.encoder.apply(sbelow, mask=None)
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        init_state = self.decoder.init_state(ctx)
        outs = [init_state, ctx]
        f_init = theano.function([x], outs, name='f_init')

        y = T.vector('y_sampler', dtype='int64')
        y_emb = self.trg_lookup_table.index(y)
        init_state = T.matrix('init_state', dtype='float32')
        next_probs, next_state = self._next_prob_state(y_emb, init_state, ctx, c_x)
        inps = [y, ctx, init_state]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')
        return f_init, f_next

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value())
                                    for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            value.set_value(params_value[key])
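
# Usage sketch (not part of the original model code): greedy decoding with the
# f_init/f_next pair returned by Translate.build_sample() above. The helper name,
# src_ids, eos_id and the initial y value of -1 are illustrative assumptions only;
# the actual special-token conventions depend on the preprocessing pipeline.
def greedy_decode_sketch(model, src_ids, eos_id, max_len=80):
    f_init, f_next = model.build_sample()
    # source batch of one sentence: shape (src_sent_len, 1), dtype int64
    x = numpy.asarray(src_ids, dtype='int64').reshape(-1, 1)
    state, ctx = f_init(x)
    y = numpy.asarray([-1], dtype='int64')  # assumed marker for "no previous word"
    hyp = []
    for _ in range(max_len):
        probs, state = f_next(y, ctx, state)
        wid = int(probs[0].argmax())
        if wid == eos_id:
            break
        hyp.append(wid)
        y = numpy.asarray([wid], dtype='int64')
    return hyp
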
class Translate(object):

    def __init__(self, enc_nhids=1000, dec_nhids=1000, enc_embed=620, dec_embed=620,
                 src_vocab_size=30000, trg_vocab_size=30000, **kwargs):
        self.lr_in = kwargs.get('n_out', dec_nhids)

        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size, prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size, prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        # enc_nhids*2 corresponds to the last dimension of the encoded state
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids * 2, **kwargs)
        # the output size of the decoder must equal lr_in when no n_out is given
        self.logistic = LogisticRegression(self.lr_in, trg_vocab_size, prefix='logistic', **kwargs)
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])
        self.use_mv = kwargs.get('use_mv', 0)

    def apply(self, source, source_mask, target, target_mask, v_part=None, v_true=None, **kwargs):
        # sbelow and tbelow are 3-D tensors; sbelow[i][j] (tbelow[i][j]) is the embedding of
        # the i-th word of the j-th sentence in the batch
        # source and source_mask: shape (src_sent_len, batch_size)
        # target and target_mask: shape (trg_sent_len, batch_size)
        # all of them are theano.tensor.var.TensorVariable (holding numpy.ndarray values)

        # (src_sent_len, batch_size, srcw_embsz), e.g. (40, 28, 620)
        sbelow = self.src_lookup_table.apply(source)
        # shape differs from source: (trg_sent_len-1, batch_size, trgw_embsz)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)

        # (src_sent_len, batch_size, src_nhids*2): bidirectional encoding of the source sentence
        s_rep = self.encoder.apply(sbelow, source_mask)

        # remove the last word, '</S>', of each sentence in the batch; the padding words are also </S> (29999)
        # tbelow[:-1] -> shape (trg_sent_len-1, batch_size, trgw_embsz)
        # target_mask[:-1] -> shape (trg_sent_len-1, batch_size)
        # hiddens, s, a, ss, als = self.decoder.apply(tbelow[:-1], target_mask[:-1], s_rep, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)

        # hiddens from the decoder: shape (trg_sent_len-1, batch_size, n_out), padding positions all 0
        self.mean_cost, self.mean_abs_log_norm = self.logistic.cost(
            hiddens, target, target_mask, v_part, v_true)
        # cost_matrix: shape (trg_sent_len-1, batch_size); trg_sent_len is specific to this batch
        # and may differ between batches
        # cost_matrix.sum(): sum of all elements in cost_matrix
        # target_mask.shape[1]: the number of sentences in the batch
        # so cost_matrix.sum() / target_mask.shape[1] is the average cross entropy per sentence in the batch

    '''
    y_emb_im1:  (trgw_embsz,)
    t_stat_im1: (batch_size, trg_nhids)
    ctx:        (src_sent_len, batch_size, src_nhids*2)
    c_x:        (src_sent_len, batch_size, trg_nhids)
    '''
    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        mask = T.alloc(numpy.float32(1.), sbelow.shape[0], sbelow.shape[1])
        # (src_sent_len, batch_size, src_nhids*2); batch_size == 1 for decoding
        ctx = self.encoder.apply(sbelow, mask)
        # self.decoder.Ws: (src_nhids*2, trg_nhids)
        # self.decoder.bs: (trg_nhids,)
        # c_x: (src_sent_len, batch_size, trg_nhids), values in (-1, 1)
        # as long as ctx is passed in as a parameter, c_x is not strictly needed: it can be recomputed from ctx
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        # init_state: (batch_size, trg_nhids); no mask here, because there is no batch
        init_state = self.decoder.init_state(ctx)
        f_init = theano.function([x], [init_state, ctx, c_x], name='f_init')

        # --------------------------------------------------------------
        y_im1 = T.vector('y_sampler', dtype='int64')
        y_emb_im1 = self.trg_lookup_table.index(y_im1)
        f_emb = theano.function([y_im1], y_emb_im1, name='f_emb')

        # t_yemb_im1 = T.tensor3('t_yemb_im1', dtype='float32')
        t_yemb_im1 = T.matrix('t_yemb_im1', dtype='float32')
        t_stat_im1 = T.matrix('t_stat_im1', dtype='float32')

        # --------------------------------------------------------------
        # next hidden state: h_i = rnn(y_{i-1}, s_{i-1})
        # y_emb_im1: embedding of one target word, shape (1, trgw_embsz)
        hi = self.decoder._step_forward(x_t=t_yemb_im1, x_m=None, h_tm1=t_stat_im1)
        f_nh = theano.function([t_yemb_im1, t_stat_im1], hi, name='f_nh')

        # --------------------------------------------------------------
        t_hi = T.matrix('t_hi', dtype='float32')
        t_ctx = T.tensor3('t_ctx', dtype='float32')
        t_c_x = T.tensor3('t_c_x', dtype='float32')
        # next attention: a_i = a(h_i, c_i); c_i itself does not change
        pi, ai = self.decoder.attention_layer.apply(source_ctx=t_ctx, source_mask=None,
                                                    source_x=t_c_x, cur_hidden=t_hi)
        f_na = theano.function([t_ctx, t_c_x, t_hi], [pi, ai], name='f_na')

        # --------------------------------------------------------------
        # next final state: s_i = f(h_i (from y_{i-1} and s_{i-1}), y_{i-1}, c_i)
        t_ai = T.matrix('t_ai', dtype='float32')
        ns = self.decoder.state_with_attend(h1=t_hi, attended=t_ai)
        f_ns = theano.function([t_hi, t_ai], ns, name='f_ns')

        # --------------------------------------------------------------
        # merge_out = g(y_{i-1}, s_i, a_i)
        t_si = T.matrix('t_si', dtype='float32')
        merge_out = self.decoder.merge_out(y_emb_im1=t_yemb_im1, s_i=t_si, a_i=t_ai)
        f_mo = theano.function([t_yemb_im1, t_ai, t_si], merge_out, name='f_mo')

        # --------------------------------------------------------------
        # model score of the whole vocabulary: nonlinear(merge_out)
        t_mo = T.matrix('t_mo', dtype='float32')
        if self.use_mv:
            ptv = T.vector('ptv', dtype='int64')
            ptv_ins = [t_mo, ptv]
            ptv_ous = self.logistic.apply_score(t_mo, ptv, drop=True)
        else:
            ptv_ins = [t_mo]
            ptv_ous = self.logistic.apply_score(t_mo, drop=True)
        f_pws = theano.function(ptv_ins, ptv_ous, name='f_pws')

        # --------------------------------------------------------------
        # no need to use the whole vocabulary: vocabulary manipulation
        # with T.ivector() this slice is very slow on CPU, for reasons unknown
        y = T.wscalar('y')
        # partial model score slice: nonlinear(merge_out)[part]
        f_one = theano.function([t_mo, y], self.logistic.apply_score_one(t_mo, y), name='f_one')

        # --------------------------------------------------------------
        # distribution over the target vocabulary: softmax(energy)
        t_pws = T.matrix('t_pws', dtype='float32')
        # self.logistic.apply_softmax(t_pws)
        # self.logistic.softmax(t_pws)
        f_ce = theano.function([t_pws], T.nnet.softmax(t_pws), name='f_ce')

        # f_next:
        # next_w (y_emb_im1): (k-dead_k,), the last word id of each translation candidate in the beam
        # ctx: (src_sent_len, live_k, src_nhids*2)
        # t_stat_im1: shape (k-dead_k, trg_nhids)
        # probs: shape (k-dead_k, trg_vocab_size)
        next_probs, next_state = self.next_prob_state(y_emb_im1, t_stat_im1, ctx, c_x)
        inps = [y_im1, ctx, t_stat_im1]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')

        return [f_init, f_nh, f_na, f_ns, f_mo, f_pws, f_one, f_ce, f_next, f_emb]

    def next_prob_state(self, y_emb_im1, s_im1, ctx, c_x):
        next_state, merge_out = self.decoder.next_state_mout(y_emb_im1, s_im1, ctx, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value())
                                    for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):
        # overwrite all weights with the values stored in the file
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            # type(value): theano.tensor.sharedvar.TensorSharedVariable
            # params_value[key] is a numpy.ndarray
            # set the shared variable from the numpy array
            value.set_value(params_value[key])
        '''
        type(params_value['logistic_W0']): numpy.ndarray, shape (512, 30000)
        array([[-0.00096034, -0.0392303 , -0.07458289, ..., -0.00285031,  0.03942127, -0.03161906],
               [-0.03706803, -0.06445373, -0.00836279, ..., -0.01915432, -0.00247126,  0.17407075],
               [-0.00102945,  0.03983303, -0.00801838, ..., -0.02834764,  0.02834882, -0.07769781],
               ...,
               [ 0.01267207,  0.07802714, -0.02748013, ...,  0.0485581 , -0.00657458,  0.07204553],
               [ 0.01089897,  0.06406539, -0.04804269, ..., -0.03247456,  0.04343275, -0.14596273],
               [ 0.01474529,  0.02925147,  0.01569422, ...,  0.01673588, -0.02202134,  0.19972666]],
              dtype=float32)
        '''

    def load2numpy(self, filename):
        # read the weights from the file without modifying the model
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        return params_value
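
# Illustration only: how the piecewise functions returned by build_sample() above are
# expected to compose into a single decoding step, mirroring next_prob_state(). It
# assumes use_mv == 0 (so f_pws takes only merge_out); the names y_prev, s_prev, ctx
# and c_x are placeholders, with shapes as documented inside build_sample().
def step_sketch(funcs, y_prev, s_prev, ctx, c_x):
    f_init, f_nh, f_na, f_ns, f_mo, f_pws, f_one, f_ce, f_next, f_emb = funcs
    y_emb = f_emb(y_prev)        # embedding of the previous target word
    hi = f_nh(y_emb, s_prev)     # h_i = rnn(y_{i-1}, s_{i-1})
    pi, ai = f_na(ctx, c_x, hi)  # attention weights and attended source context
    s_i = f_ns(hi, ai)           # next decoder state s_i
    mo = f_mo(y_emb, ai, s_i)    # merge_out = g(y_{i-1}, s_i, a_i)
    scores = f_pws(mo)           # unnormalized scores over the target vocabulary
    probs = f_ce(scores)         # softmax over the target vocabulary
    return probs, s_i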