def build_model1(self):
    """Build the symbolic graph of the attention encoder-decoder model.

    A bidirectional source LSTM encodes the source word indexes, a target
    LSTM (initialised with the last encoder state) reads the target word
    indexes, and a dot-product attention over the encoder states is
    concatenated with the decoder states before the softmax projection
    onto the target vocabulary.

    Reads ``n_src_vocab``, ``n_tgt_vocab``, ``src_embed_dim``,
    ``tgt_embed_dim``, ``src_lstm_op_dim`` and ``tgt_lstm_op_dim`` from
    ``self``.

    Returns:
        dict with the keys
        ``'cost'``    -- mean categorical cross-entropy plus regulariser,
        ``'src_ip'``  -- symbolic int vector of source word indexes,
        ``'tgt_ip'``  -- symbolic int vector of decoder input indexes,
        ``'tgt_op'``  -- symbolic int vector of expected output indexes,
        ``'params'``  -- list of parameters collected from all layers,
        ``'proj_op'`` -- output of the softmax projection layer.
    """
    # LookupTable to Embedding
    src_embedding_layer = EmbeddingLayer(
        input_dim=self.n_src_vocab,
        output_dim=self.src_embed_dim,
        name='src_embedding',
    )
    # The target table gets its own name so it is distinguishable from
    # the source table (e.g. when parameters are looked up by name).
    tgt_embedding_layer = EmbeddingLayer(
        input_dim=self.n_tgt_vocab,
        output_dim=self.tgt_embed_dim,
        name='tgt_embedding',
    )

    # LSTMs
    src_lstm_forward = LSTM(
        input_dim=self.src_embed_dim,
        output_dim=self.src_lstm_op_dim,
    )
    src_lstm_backward = LSTM(
        input_dim=self.src_embed_dim,
        output_dim=self.src_lstm_op_dim,
    )
    tgt_lstm = LSTM(
        input_dim=self.tgt_embed_dim,
        output_dim=self.tgt_lstm_op_dim,
    )
    sys.stderr.write(str(tgt_lstm.params) + "\n")  # TODO

    # From target LSTM to target word indexes
    # Input: target LSTM output dim + Attention from BiLSTM
    # The dimensions live on the instance, like every other size used here.
    proj_layer = FullyConnectedLayer(
        input_dim=self.tgt_lstm_op_dim + 2 * self.src_lstm_op_dim,
        output_dim=self.n_tgt_vocab,
        activation='softmax',
    )

    # The last entry of tgt_lstm.params is left out of the trainable list.
    # NOTE(review): presumably that entry is h_0, which is overwritten
    # with the encoder state below -- confirm against the LSTM class.
    params = (
        src_embedding_layer.params
        + tgt_embedding_layer.params
        + src_lstm_forward.params
        + src_lstm_backward.params
        + tgt_lstm.params[:-1]
        + proj_layer.params
    )

    # declare input variables
    src_ip = T.ivector()
    tgt_ip = T.ivector()
    tgt_op = T.ivector()

    # lookup table -> embedding
    src_embed_ip = src_embedding_layer.fprop(src_ip)
    tgt_embed_ip = tgt_embedding_layer.fprop(tgt_ip)

    # embedding -> source BiLSTM
    src_lstm_forward.fprop(src_embed_ip)
    src_lstm_backward.fprop(src_embed_ip[::-1, :])

    # Concatenate forward/backward.
    # (Flip backward again to get corresponding h for the same word)
    encoderh = T.concatenate(
        (src_lstm_forward.h, src_lstm_backward.h[::-1, :]),
        axis=1,
    )

    # End of source BiLSTM -> target LSTM
    tgt_lstm.h_0 = encoderh[-1]
    tgt_lstm.fprop(tgt_embed_ip)

    # Attention
    # Read http://arxiv.org/abs/1508.04025
    # Unnormalised dot-product scores of every decoder state against
    # every encoder state, then used to weight the encoder states.
    attention = tgt_lstm.h.dot(encoderh.transpose())
    attention = attention.dot(encoderh)

    # Order preference?
    decoderh = T.concatenate((attention, tgt_lstm.h), axis=1)

    # LSTM output -> target word
    proj_op = proj_layer.fprop(decoderh)

    # Cost + regularization
    cost = T.nnet.categorical_crossentropy(proj_op, tgt_op).mean()
    # Penalise the change in squared decoder hidden-state values between
    # consecutive time steps.
    # NOTE(review): `beta` is not defined in this function; presumably a
    # module-level regularisation weight -- confirm it is in scope.
    cost += beta * T.mean(
        (tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2
    )

    return {
        'cost': cost,
        'src_ip': src_ip,
        'tgt_ip': tgt_ip,
        'tgt_op': tgt_op,
        'params': params,
        'proj_op': proj_op,
    }
params += rnn.params[:-1] else: params += rnn.params params += tgt_lstm_h_to_vocab.params logging.info('Model parameters ...') logging.info('Src Embedding dim : %d ' % (src_embedding_layer.output_dim)) logging.info('Tgt Embedding dim : %d ' % (tgt_embedding_layer.output_dim)) logging.info('Encoder dim : %d ' % (src_lstm_2.output_dim)) logging.info('Batch size : %d ' % (batch_size)) logging.info('Decoder LSTM dim : %d ' % (tgt_lstm_2.output_dim)) logging.info('Depth : %s ' % ('3')) # Get embedding matrices src_emb_inp = src_embedding_layer.fprop(src_inp[:, ::-1]) tgt_emb_inp = tgt_embedding_layer.fprop(src_inp[:, :-1]) # Get encoder representation src_lstm_0.fprop(src_emb_inp) src_lstm_1.fprop(src_lstm_0.h) src_lstm_2.fprop(src_lstm_1.h) encoder_final_state = src_lstm_2.h.dimshuffle(1, 0, 2)[T.arange(src_inp.shape[0]), src_lens - 1, :] # Connect encoder and decoder tgt_lstm_0.h_0 = encoder_final_state # Decode sentence from input
params += decoder[0].params[:-1] if args.attention == 'mlp': params += attention_layer_1.params + attention_layer_2.params logging.info('Model parameters ...') logging.info('Src Embedding dim : %d ' % (src_emb_dim)) logging.info('Tgt Embedding dim : %d ' % (tgt_emb_dim)) logging.info('Encoder BiLSTM dim : %d ' % (encoder_forward[-1].output_dim)) logging.info('Batch size : %s ' % (batch_size)) logging.info('Decoder LSTM dim : %d ' % (decoder[-1].output_dim)) logging.info('Attention mechanism : %s ' % (args.attention)) logging.info('Depth : %s ' % (args.num_layers)) logging.info('Peek Encoder : %s ' % (str(peek_encoder))) # Get embedding matrices src_emb_inp = src_embedding_layer.fprop(src_inp) tgt_emb_inp = tgt_embedding_layer.fprop(tgt_inp) encoder_representation = None # Get BiLSTM representations encoder_forward[0].fprop(src_emb_inp) encoder_backward.fprop(src_emb_inp[:, ::-1]) # h is seqlen x batch x hdim encoder_representation = T.concatenate( (encoder_forward[0].h, encoder_backward.h[::-1, :, :]), axis=2).dimshuffle(1, 0, 2) for rnn in encoder_forward[1:]: rnn.fprop(encoder_representation) encoder_representation = rnn.h.dimshuffle(1, 0, 2) encoder_final_state = encoder_representation[T.arange(src_inp.shape[0]),