def __init__(self, config, name='', fls=None):
    self.config = config
    self.name = name
    self.creater = LayerFactory()
    self.fls = fls
    #print(self.fls)
    self.trng = RandomStreams(numpy.random.randint(int(10e6)))
def setup_LG(n, classsize=20, degree=5, p=1):
    """
    Construction of our basic network.
    """
    categories = ["Kids", "Normal", "Risk"]
    percentage = [0.15, 0.48, 0.37]  # stastica
    LG = LayerGraph(n, categories, percentage)
    LF = LayerFactory(LG)

    # create layers:
    household_layer = LF.layer_dividing_Graph("Households", 2, None, categories,
                                              fully_connected=True)
    school_layer = LF.layer_dividing_Graph("Schools", classsize, degree, ["Kids"])
    working_layer = LF.layer_dividing_Graph("Workplaces", 6, 3, ["Normal"])
    risk_layer = LF.create_layer("R_Workplaces", 6, int(percentage[1] * n / 3),
                                 [0, 0.7, 0.3], 3)
    social_layer = LF.layer_dividing_Graph("Social", 10, 3, categories)
    party_layer = LF.create_layer("parties", 20, int(n * percentage[0] / 6),
                                  [0.6, 0.4, 0], 6)
    basic_connect = LF.layer_dividing_Graph("basic", n, 1, categories)

    # add layers:
    LG.add_layer(household_layer, p)
    LG.add_layer(school_layer, p)
    LG.add_layer(working_layer, p)
    LG.add_layer(risk_layer, p)
    LG.add_layer(social_layer, p)
    LG.add_layer(party_layer, p)
    LG.add_layer(basic_connect, p)
    return LG
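# Hedged usage sketch (not part of the original file): build the multi-layer
# contact graph for a population of 1000 agents with 20-pupil school classes.
# The population size and the edge-activation probability p=1 are example
# values only; LayerGraph / LayerFactory are assumed to be imported elsewhere
# in this module.
#
#   LG = setup_LG(1000, classsize=20, degree=5, p=1)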
def build(self):
    '''
    Building the computational graph.
    '''
    # building forward NMT
    logging.info("Building forward NMT")
    self.fwd_nmt = RNNsearch(self.config, '')
    self.fwd_nmt.build()

    # building backward NMT
    logging.info("Building backward NMT")
    config = copy.deepcopy(self.config)
    config['index_unk_src'], config['index_unk_trg'] = \
        config['index_unk_trg'], config['index_unk_src']
    config['index_eos_src'], config['index_eos_trg'] = \
        config['index_eos_trg'], config['index_eos_src']
    config['num_vocab_src'], config['num_vocab_trg'] = \
        config['num_vocab_trg'], config['num_vocab_src']
    self.bwd_nmt = RNNsearch(config, 'inv_')
    self.bwd_nmt.build()

    # merging parameters and objectives
    self.creater = LayerFactory()
    self.creater.params = self.fwd_nmt.creater.params + self.bwd_nmt.creater.params
    self.creater.layers = self.fwd_nmt.creater.layers + self.bwd_nmt.creater.layers

    cost0 = self.fwd_nmt.cost_per_sample
    cost1 = self.bwd_nmt.cost_per_sample
    valid = tensor.vector('valid', dtype='float32')
    self.inputs = self.fwd_nmt.inputs + self.bwd_nmt.inputs + [valid]
    self.get_addition_grads(cost0, cost1, valid)
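# Illustration only (not part of the original file): the backward model above
# reuses the forward hyper-parameters but swaps every source/target vocabulary
# entry, so it translates in the opposite direction. `make_backward_config` is
# a hypothetical helper that captures the same swap in one place.
def make_backward_config(config):
    # copy the config and exchange the source/target vocabulary settings
    bwd = copy.deepcopy(config)
    for src_key, trg_key in [('index_unk_src', 'index_unk_trg'),
                             ('index_eos_src', 'index_eos_trg'),
                             ('num_vocab_src', 'num_vocab_trg')]:
        bwd[src_key], bwd[trg_key] = bwd[trg_key], bwd[src_key]
    return bwd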
class RNNsearch(model):
    '''
    The attention-based NMT model.
    '''

    def __init__(self, config, name='', fls=None):
        self.config = config
        self.name = name
        self.creater = LayerFactory()
        self.fls = fls
        #print(self.fls)
        self.trng = RandomStreams(numpy.random.randint(int(10e6)))

    def sampling_step(self, state, prev, context):
        '''
        Build the computational graph which samples the next word.

        :type state: theano variables
        :param state: the previous hidden state

        :type prev: theano variables
        :param prev: the last generated word

        :type context: theano variables
        :param context: the context vectors
        '''
        emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, emb)
        probs = tensor.nnet.softmax(energy)

        sample = self.trng.multinomial(pvals=probs, dtype='int64').argmax(axis=-1)

        newemb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, newemb)
        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
        Build the decoder graph for sampling.

        :type state_init: theano variables
        :param state_init: the initial state of the decoder

        :type c: theano variables
        :param c: the context vectors

        :type length: int
        :param length: the maximum sample length

        :type n_samples: int
        :param n_samples: the number of samples
        '''
        state = tensor.repeat(state_init, n_samples, axis=0)  # copy the state n times
        sample = tensor.zeros((n_samples,), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        y_idx = tensor.arange(samples.flatten().shape[0]) * self.config['num_vocab_trg'] \
                + samples.flatten()
        #probs = probs.flatten()[y_idx]
        #probs = probs.reshape(samples.shape)
        return samples, probs, updates

    def build(self, verbose=False):
        '''
        Build the computational graph.

        :type verbose: bool
        :param verbose: only set to True for visualization
        '''
        config = self.config

        # create layers
        logging.info('Initializing layers')
        self.emb_src = self.creater.createLookupTable(
            self.name + 'emb_src', config['num_vocab_src'],
            config['dim_emb_src'], offset=True)  # (input, output) --> [30000, 620]
        self.emb_trg = self.creater.createLookupTable(
            self.name + 'emb_trg', config['num_vocab_trg'],
            config['dim_emb_trg'], offset=True)  # (input, output) --> [30000, 620]
        self.encoderGRU = self.creater.createGRU(
            self.name + 'GRU_enc', config['dim_emb_src'],
            config['dim_rec_enc'], verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(
            self.name + 'GRU_enc_back', config['dim_emb_src'],
            config['dim_rec_enc'], verbose=verbose)
        self.decoderGRU = self.creater.createGRU_attention(
            self.name + 'GRU_dec', config['dim_emb_trg'],
            2 * config['dim_rec_enc'], config['dim_rec_dec'],
            config['num_vocab_trg'], verbose=verbose)
        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer', config['dim_rec_enc'],
            config['dim_rec_dec'], offset=True)

        if self.fls:
            # concatenate the weights of all feature functions into one shared vector
            #print("loaded feature")
            fl_weight = []
            for fl in self.fls:
                fl_weight.append(fl.feature_weight)
                #logging.info("sen weight")
                #print(fl.feature_weight)
            fl_weight = numpy.concatenate(fl_weight)
            self.feature_weight = theano.shared(fl_weight.astype('float32'),
                                                name="feature_weight")
            self.creater.params += [self.feature_weight]
            self.feature_weight_dim = self.feature_weight.dimshuffle('x', 0)  # shape (n,) -> (1, n)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix('x_mask', dtype='float32')  # size: (length, batchsize)
        self.y = tensor.matrix('y', dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix('y_mask', dtype='float32')  # size: (length, batchsize)

        if 'MRT' in config and config['MRT'] is True:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [self.x, self.xmask, self.y, self.ymask, self.MRTLoss]
        else:
            self.MRTLoss = None
            self.inputs = [self.x, self.xmask, self.y, self.ymask]

        if config['PR']:
            self.ans = tensor.scalar('ans', dtype='int64')
            self.features = tensor.matrix('features', dtype='float32')
            self.inputs += [self.features, self.ans]

        # create computational graph for training
        logging.info('Building computational graph')

        # ----encoder-----
        emb = self.emb_src.forward(self.x.flatten())  # size: (length, batch_size, dim_emb)
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]  # hidden states only
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate((context_forward, context_backward),
                                          axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(self.y.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb, self.y.shape[0], self.context,
            state_init=self.state_init,
            batch_size=self.y.shape[1],
            mask=self.ymask, cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)

        # compute costs and grads
        y_idx = tensor.arange(self.y.flatten().shape[0]) * self.config['num_vocab_trg'] \
                + self.y.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape((self.y.shape[0], self.y.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)

        if 'MRT' in config and config['MRT'] is True:
            # minimum risk training: renormalize sample costs and weight them by the loss
            self.cost_per_sample = self.cost.sum(axis=0)
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        elif config['PR'] and self.fls:
            # posterior regularization: match the model distribution p against
            # the feature-based distribution q via a KL term
            # calculate p
            self.cost_per_sample = self.cost.sum(axis=0)
            self.cost_per_sample *= config['alpha_PR']
            cost_min = self.cost_per_sample - self.cost_per_sample.min()
            probs = tensor.exp(-cost_min)
            log_probs = -cost_min - tensor.log(probs.sum())
            probs /= probs.sum()
            self.probs = log_probs
            # calculate q
            energy_q = self.features * self.feature_weight_dim
            energy_q = energy_q.sum(axis=1)
            self.energy_q = energy_q
            energy_q_min = energy_q - energy_q.max()
            probs_q = tensor.exp(energy_q_min)
            log_probs_q = energy_q_min - tensor.log(probs_q.sum())
            probs_q /= probs_q.sum()
            self.probs_q = log_probs_q
            # calculate KL divergence
            cost_KL = tensor.exp(log_probs_q) * (log_probs_q - log_probs)
            self.cost_KLs = cost_KL
            self.cost_KL = cost_KL.sum()
            self.cost_NMT = self.cost_per_sample[self.ans]
            self.cost = config['lambda_PR'] * self.cost_KL + \
                        config['lambda_MLE'] * self.cost_NMT
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample, self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample, self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(encode_backward_sample[0][::-1][0])
        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        self.samples, self.probs_sample, self.updates_sample = self.decode_sample(
            state_init_sample, context_sample, self.length_sample, self.n_samples)

        # parameters for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode', dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
        Encode a source sentence to context vectors.
        '''
        if not hasattr(self, "encoder"):
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.encoder(x, xmask)

    def get_trg_embedding(self, y):
        '''
        Get the embedding of a target sentence.
        '''
        if not hasattr(self, "get_trg_embeddinger"):
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode],
                outputs=[self.emb_trg.forward(self.y_decode)])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
        Get the initial decoder hidden state from the context vectors.
        '''
        if not hasattr(self, "get_initer"):
            # NOTE: `context_backward` is not defined in this scope;
            # get_context_and_init() below is the working path.
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(context_backward[0])])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
        Encode a source sentence to context vectors and get the initial
        decoder hidden state.
        '''
        if not hasattr(self, "get_context_and_initer"):
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.get_context_and_initer(x, xmask)

    def get_probs(self, c, state, emb):
        '''
        Get the probability of the next target word.
        '''
        if not hasattr(self, "get_probser"):
            self.get_probser = theano.function(
                inputs=[self.context_decode, self.state_decode, self.emb_decode],
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, emb):
        '''
        Get the next hidden state.
        '''
        if not hasattr(self, "get_nexter"):
            self.get_nexter = theano.function(
                inputs=[self.c_decode, self.state_decode, self.emb_decode],
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
        Get the negative log-likelihood of parallel sentences.
        '''
        if not hasattr(self, "get_coster"):
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
        Get sampling results.
        '''
        if not hasattr(self, "get_sampler"):
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
        Get the attention weights of parallel sentences.
        '''
        if not hasattr(self, "get_attentioner"):
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
        Get the hidden states needed for visualization.
        '''
        if not hasattr(self, "get_layerer"):
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=self.encode_forward +
                        self.encode_backward +
                        tuple(self.decode[0]) + tuple(self.decode[1:]))

        layers = self.get_layerer(x, xmask, y, ymask)
        enc_names = ['h', 'gate', 'reset', 'state', 'reseted', 'state_in',
                     'gate_in', 'reset_in']
        dec_names = ['h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate',
                     'reset_cin', 'reset_preactive', 'reset', 'state_cin',
                     'reseted', 'state_preactive', 'state']
        dec_names += ['outenergy', 'state_in', 'gate_in', 'reset_in',
                      'state_in_prev', 'readout', 'maxout',
                      'outenergy_1', 'outenergy_2']
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]
        result = {}
        for i in range(len(layers)):
            if value_name[i] != '':
                result[value_name[i]] = layers[i]
        return result
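# Hedged usage sketch for RNNsearch (not part of the original file). It
# assumes a `config` dict carrying the keys referenced in build() (e.g.
# 'num_vocab_src', 'dim_emb_src', 'dim_rec_enc', 'dim_rec_dec', 'PR',
# 'index_eos_src') and integer-indexed sentences; the token indices and
# lengths below are placeholders.
#
#   nmt = RNNsearch(config)
#   nmt.build()
#   x = numpy.asarray([3, 41, 7, config['index_eos_src']], dtype='int64')
#   context, state_init = nmt.get_context_and_init(x)      # encode one sentence
#   samples, probs = nmt.get_sample(x.reshape((-1, 1)), 50, 10)  # 10 samples, max length 50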
class RNNtsg(model):
    '''
    The attention-based NMT model for TSG.
    '''

    def __init__(self, config, name=''):
        self.config = config
        self.name = name
        self.creater = LayerFactory()
        self.trng = RandomStreams(numpy.random.randint(int(10e6)))

    def translate(self, x, T, beam_size=10, return_array=False):
        '''
        Decode with beam search.

        :type x: numpy array
        :param x: the indexed source sentence

        :param T: the TSG grammar object used to look up rules by left-hand side

        :type beam_size: int
        :param beam_size: beam size

        :returns: a numpy array, the indexed translation result
        '''
        # initialize variables
        result = [[]]
        loss = [0.]
        result_eos = []
        loss_eos = []
        beam = beam_size
        nonterms = [['S']]  # same length as result: the nonterminals left to expand for each hypothesis
        par_state_time = [[0]]  # (n_hyps, len(nonterms) for each hyp)

        # get encoder states
        c, state = self.get_context_and_init(x)
        emb_y = numpy.zeros((1, self.config['dim_emb_trg']), dtype='float32')
        state_hist = [[numpy.zeros((1, self.config['dim_rec_enc']), dtype='float32')]]  # (n_hyps, l)

        for l in range(x.shape[0] * 3):
            # for every hypothesis, collect the indices of rules whose
            # left-hand side matches the nonterminal to expand next
            cur_nonterm_idx = []
            #print result
            for i in range(len(nonterms)):
                if len(nonterms[i]) > 0:
                    potent_rules = T.rule_idx_with_root(nonterms[i][-1])  # potential rules with the given lhs as root
                    #print potent_rules + i * self.config['dim_emb_trg']
                    cur_nonterm_idx += [r + i * self.config['num_vocab_trg']
                                        for r in potent_rules]
                    nonterms[i].pop()

            # only take the first k results if we have k < beam_size potential nonterminals
            if len(cur_nonterm_idx) < beam_size:
                beam = len(cur_nonterm_idx)
            else:
                beam = beam_size

            # get word probabilities
            energy, ctx = self.get_probs(numpy.repeat(c, len(result), axis=1),
                                         state, emb_y)

            # multiply energy by the cur_nonterm_idx mask
            energy_mask = numpy.zeros((energy.shape[0] * energy.shape[1]),
                                      dtype='float32')
            energy_mask[cur_nonterm_idx] = 1.
            energy_mask = energy_mask.reshape((energy.shape[0], energy.shape[1]))
            energy = energy * energy_mask
            probs = tools.softmax(energy)
            losses = -numpy.log(probs)

            # prevent the translation from being too short
            if l < x.shape[0] / 2:
                losses[:, self.config['index_eos_trg']] = numpy.inf

            # prevent rules that do not have the required lhs
            #losses[:, not_cur_nonterm_idx] = numpy.inf

            for i in range(len(loss)):
                losses[i] += loss[i]

            # get the n-best partial translations
            best_index_flatten = numpy.argpartition(losses.flatten(), beam)[:beam]
            best_index = [(index // self.config['num_vocab_trg'],
                           index % self.config['num_vocab_trg'])
                          for index in best_index_flatten]

            # save the partial translations in the beam
            new_ctx = numpy.zeros((beam, 2 * self.config['dim_rec_enc']), dtype='float32')
            new_y = []
            new_state = numpy.zeros((beam, self.config['dim_rec_dec']), dtype='float32')
            new_result = []
            new_loss = []
            new_nonterms = []
            new_par_state_time = []
            new_state_hist = []
            new_par_state = numpy.zeros((beam, self.config['dim_rec_dec']), dtype='float32')
            #print best_index
            #print len(result), len(state_hist), len(par_state_time)
            for i in range(beam):
                index = best_index[i]
                new_result.append(result[index[0]] + [index[1]])
                new_loss.append(losses[index[0], index[1]])
                new_ctx[i] = ctx[index[0]]
                new_y.append(index[1])
                new_state[i] = state[index[0]]
                par_state_t = par_state_time[index[0]][-1]
                new_par_state[i] = state_hist[index[0]][par_state_t]
                r = T.get_rule_from_idx(index[1])
                if r:
                    add_nonterms = r.get_expand_tags()[::-1]
                else:
                    add_nonterms = []
                new_nonterms.append(nonterms[index[0]] + add_nonterms)
                # set the parent of the expanded tags to the current timestep;
                # do not include the last par_state_time[] entry for the current hyp
                new_par_state_time.append(par_state_time[index[0]][:-1] +
                                          [l + 1] * len(add_nonterms))
                new_state_hist.append(state_hist[index[0]] + [state[index[0]]])

            # get the next decoder hidden state
            new_emby = self.get_trg_embedding(numpy.asarray(new_y, dtype='int64'))[0]
            new_state = self.get_next(new_ctx, new_state, new_par_state, new_emby)

            # remove finished translations from the beam
            state = []
            emb_y = []
            result = []
            loss = []
            nonterms = []
            state_hist = []
            par_state_time = []
            for i in range(beam):
                if len(new_nonterms[i]) == 0:
                    # par_state_time and nonterms must have the same length for each hyp:
                    # par_state_time records the parent state timestep for every
                    # nonterminal that still needs to be expanded
                    assert len(new_par_state_time[i]) == 0
                    result_eos.append(new_result[i])
                    #print new_result[i]
                    loss_eos.append(new_loss[i])
                    beam -= 1
                else:
                    result.append(new_result[i])
                    loss.append(new_loss[i])
                    state.append(new_state[i])
                    emb_y.append(new_emby[i])
                    nonterms.append(new_nonterms[i])
                    state_hist.append(new_state_hist[i])
                    par_state_time.append(new_par_state_time[i])
            #print len(result), len(state_hist), len(par_state_time)
            if beam <= 0:
                break
            state = numpy.asarray(state, dtype='float32')
            emb_y = numpy.asarray(emb_y, dtype='float32')

        # only used in semi-supervised training
        if return_array:
            if len(result_eos) > 0:
                return result_eos
            else:
                return [result[-1][:1]]

        if len(result_eos) > 0:
            # return the best translation
            return result_eos[numpy.argmin(loss_eos)]
        elif beam_size > 100:
            # give up once the beam has already been grown past 100
            logging.warning('cannot find translation in beam size %d' % beam_size)
            return []
        else:
            # double the beam size on failure
            logging.info('cannot find translation in beam size %d, try %d'
                         % (beam_size, beam_size * 2))
            return self.translate(x, T, beam_size=beam_size * 2)

    def sampling_step(self, state, prev, context, par_state):
        '''
        Build the computational graph which samples the next word.

        :type state: theano variables
        :param state: the previous hidden state

        :type prev: theano variables
        :param prev: the last generated word

        :type context: theano variables
        :param context: the context vectors
        '''
        emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, emb)
        probs = tensor.nnet.softmax(energy)

        sample = self.trng.multinomial(pvals=probs, dtype='int64').argmax(axis=-1)

        newemb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, newemb, par_state)
        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
        Build the decoder graph for sampling.

        :type state_init: theano variables
        :param state_init: the initial state of the decoder

        :type c: theano variables
        :param c: the context vectors

        :type length: int
        :param length: the maximum sample length

        :type n_samples: int
        :param n_samples: the number of samples
        '''
        state = tensor.repeat(state_init, n_samples, axis=0)
        sample = tensor.zeros((n_samples,), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        y_idx = tensor.arange(samples.flatten().shape[0]) * self.config['num_vocab_trg'] \
                + samples.flatten()
        probs = probs.flatten()[y_idx]
        probs = probs.reshape(samples.shape)
        return samples, probs, updates

    def build(self, verbose=False):
        '''
        Build the computational graph.

        :type verbose: bool
        :param verbose: only set to True for visualization
        '''
        config = self.config

        # create layers
        logging.info('initializing layers...')
        self.emb_src = self.creater.createLookupTable(
            self.name + 'emb_src', config['num_vocab_src'],
            config['dim_emb_src'], offset=True)
        self.emb_trg = self.creater.createLookupTable(
            self.name + 'emb_trg', config['num_vocab_trg'],
            config['dim_emb_trg'], offset=True)
        self.encoderGRU = self.creater.createGRU(
            self.name + 'GRU_enc', config['dim_emb_src'],
            config['dim_rec_enc'], verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(
            self.name + 'GRU_enc_back', config['dim_emb_src'],
            config['dim_rec_enc'], verbose=verbose)
        self.decoderGRU = self.creater.createGRU_tsg(
            self.name + 'GRU_dec', config['dim_emb_trg'],
            2 * config['dim_rec_enc'], config['dim_rec_dec'],
            config['num_vocab_trg'], verbose=verbose)
        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer', config['dim_rec_enc'],
            config['dim_rec_dec'], offset=True)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix('x_mask', dtype='float32')  # size: (length, batchsize)
        self.y_idx = tensor.matrix('y_idx', dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix('y_mask', dtype='float32')  # size: (length, batchsize)
        #self.y_parent_idx = tensor.matrix('y_parent_idx', dtype='int64')  # size: (length, batchsize)
        self.y_parent_t = tensor.matrix('y_parent_t', dtype='int64')  # size: (length, batchsize)

        if 'MRT' in config and config['MRT'] is True:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [self.x, self.xmask, self.y_idx, self.y_parent_t,
                           self.ymask, self.MRTLoss]
        else:
            self.MRTLoss = None
            self.inputs = [self.x, self.xmask, self.y_idx, self.y_parent_t,
                           self.ymask]

        # create computational graph for training
        logging.info('building computational graph...')

        # ----encoder-----
        emb = self.emb_src.forward(self.x.flatten())  # size: (length, batch_size, dim_emb)
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate((context_forward, context_backward),
                                          axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(self.y_idx.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb, self.y_idx.shape[0], self.context, self.state_init,
            self.y_parent_t, batch_size=self.y_idx.shape[1],
            mask=self.ymask, cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)

        # compute costs and grads
        y_idx = tensor.arange(self.y_idx.flatten().shape[0]) * self.config['num_vocab_trg'] \
                + self.y_idx.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape((self.y_idx.shape[0], self.y_idx.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)

        if 'MRT' in config and config['MRT'] is True:
            self.cost_per_sample = self.cost.sum(axis=0)
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample, self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample, self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(encode_backward_sample[0][::-1][0])
        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        #self.samples, self.probs_sample, self.updates_sample = self.decode_sample(
        #    state_init_sample, context_sample, self.length_sample, self.n_samples)

        # parameters for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode', dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.par_state_decode = tensor.matrix('par_state_decode', dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
        Encode a source sentence to context vectors.
        '''
        if not hasattr(self, "encoder"):
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.encoder(x, xmask)

    def get_trg_embedding(self, y):
        '''
        Get the embedding of a target sentence.
        '''
        if not hasattr(self, "get_trg_embeddinger"):
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode],
                outputs=[self.emb_trg.forward(self.y_decode)])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
        Get the initial decoder hidden state from the context vectors.
        '''
        if not hasattr(self, "get_initer"):
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(context_backward[0])])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
        Encode a source sentence to context vectors and get the initial
        decoder hidden state.
        '''
        if not hasattr(self, "get_context_and_initer"):
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
        x = numpy.reshape(x, (x.shape[0], 1))
        xmask = numpy.ones(x.shape, dtype='float32')
        return self.get_context_and_initer(x, xmask)

    def get_probs(self, c, state, emb):
        '''
        Get the probability of the next target word.
        '''
        if not hasattr(self, "get_probser"):
            self.get_probser = theano.function(
                inputs=[self.context_decode, self.state_decode, self.emb_decode],
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, par_state, emb):
        '''
        Get the next hidden state.
        '''
        if not hasattr(self, "get_nexter"):
            self.get_nexter = theano.function(
                inputs=[self.c_decode, self.state_decode,
                        self.par_state_decode, self.emb_decode],
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.par_state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, par_state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
        Get the negative log-likelihood of parallel sentences.
        '''
        # NOTE: this class defines y_idx/y_parent_t rather than self.y,
        # so this function is not wired up in RNNtsg.
        if not hasattr(self, "get_coster"):
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
        Get sampling results.
        '''
        # NOTE: relies on the sampling graph, whose construction is
        # commented out in build() above.
        if not hasattr(self, "get_sampler"):
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
        Get the attention weights of parallel sentences.
        '''
        if not hasattr(self, "get_attentioner"):
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
        Get the hidden states needed for visualization.
        '''
        if not hasattr(self, "get_layerer"):
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y, self.ymask],
                outputs=self.encode_forward + self.encode_backward +
                        tuple(self.decode[0]) + tuple(self.decode[1:]))

        layers = self.get_layerer(x, xmask, y, ymask)
        enc_names = ['h', 'gate', 'reset', 'state', 'reseted', 'state_in',
                     'gate_in', 'reset_in']
        dec_names = ['h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate',
                     'reset_cin', 'reset_preactive', 'reset', 'state_cin',
                     'reseted', 'state_preactive', 'state']
        dec_names += ['outenergy', 'state_in', 'gate_in', 'reset_in',
                      'state_in_prev', 'readout', 'maxout',
                      'outenergy_1', 'outenergy_2']
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]
        result = {}
        for i in range(len(layers)):
            print(layers[i].shape)
            if value_name[i] != '':
                result[value_name[i]] = layers[i]
        return result
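# Hedged usage sketch for RNNtsg (not part of the original file). It assumes
# `config` carries the keys used above and that `T` is the TSG grammar object
# providing rule_idx_with_root() / get_rule_from_idx(); the source indices are
# placeholders. translate() returns a sequence of rule indices constrained by
# the grammar's left-hand sides.
#
#   tsg_model = RNNtsg(config)
#   tsg_model.build()
#   x = numpy.asarray([3, 41, 7, config['index_eos_src']], dtype='int64')
#   rule_sequence = tsg_model.translate(x, T, beam_size=10)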