def build_rnn(x_sym, hid_init_sym, hid2_init_sym, seq_length, vocab_size, rnn_size):
    l_input = L.InputLayer(input_var=x_sym, shape=(None, seq_length))
    l_input_hid = L.InputLayer(input_var=hid_init_sym, shape=(None, rnn_size))
    l_input_hid2 = L.InputLayer(input_var=hid2_init_sym, shape=(None, rnn_size))
    l_input = L.EmbeddingLayer(l_input, input_size=vocab_size, output_size=rnn_size)
    l_rnn = L.LSTMLayer(l_input, num_units=rnn_size, hid_init=l_input_hid)  # , cell_init=l_init_cell)
    h = L.DropoutLayer(l_rnn, p=dropout_prob)
    l_rnn2 = L.LSTMLayer(h, num_units=rnn_size, hid_init=l_input_hid2)  # , cell_init=l_init_cell2)
    h = L.DropoutLayer(l_rnn2, p=dropout_prob)
    # Before the decoder layer, we need to reshape the sequence into the batch dimension,
    # so that timesteps are decoded independently.
    l_shp = L.ReshapeLayer(h, (-1, rnn_size))
    pred = NCELayer(l_shp, num_units=vocab_size, Z=Z)
    pred = L.ReshapeLayer(pred, (-1, seq_length, vocab_size))
    return l_rnn, l_rnn2, pred
def get_emb_layer_from_idx(self, idx_layer, avg):
    suf = '_avg' if avg else ''
    # if not avg:
    #     idx_layer = MaskLayer(idx_layer, self.config[self.name]['mask_rate'])
    self.emb = L.EmbeddingLayer(
        idx_layer, len(self.map), self.config[self.name]['emb_dim'],
        W=Normal(self.args.init_std) if not avg else Constant(),
        name='e%s%s' % (self.name, suf))
    # W=HeNormal('relu') if not avg else Constant(), name = 'e%s%s'%(self.name, suf))
    self.emb.params[self.emb.W].remove('regularizable')
    if self.config[self.name]['freeze']:
        self.emb.params[self.emb.W].remove('trainable')
    # load embeddings from an external file if available
    if self.name == 'word' and self.args.train and self.args.embw:
        if not avg or self.config['word']['freeze']:
            try:
                self.load_emb(self.args.embw)
            except:
                print 'Unable to read pre-trained embeddings, using random initialization instead'
    add_layer = self.additional_layer(idx_layer, self.emb, avg)
    # add noise to the embeddings as in the Plank tagger
    add_layer = L.GaussianNoiseLayer(add_layer, 0.1)
    return add_layer
def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers, dropout, batch_size):
    l_input = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
    l_embed = L.EmbeddingLayer(l_input, vocabulary_size, hidden_size, W=init.Uniform(1.0))
    l_lstms = []
    for i in range(num_layers):
        l_lstm = L.LSTMLayer(l_embed if i == 0 else l_lstms[-1], hidden_size,
                             ingate=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal()),
                             forgetgate=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(),
                                               b=init.Constant(1.0)),
                             cell=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal(),
                                         W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
                             outgate=L.Gate(W_in=init.GlorotUniform(), W_hid=init.Orthogonal()))
        l_lstms.append(l_lstm)
    l_drop = L.DropoutLayer(l_lstms[-1], dropout)
    l_out = L.DenseLayer(l_drop, num_units=vocabulary_size, num_leading_axes=2)
    l_out = L.ReshapeLayer(l_out, (l_out.output_shape[0] * l_out.output_shape[1],
                                   l_out.output_shape[2]))
    l_out = L.NonlinearityLayer(l_out, nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
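# --- Usage sketch (not from the original code): how a ptb_lstm network could be
# trained with standard Lasagne/Theano calls. The symbols, sizes and the choice of
# adam below are illustrative assumptions, not the original training setup.
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as L

input_var = T.imatrix('input')    # batch_size x seq_len word ids
target_var = T.imatrix('target')  # batch_size x seq_len next-word ids

net = ptb_lstm(input_var, vocabulary_size=10000, hidden_size=200,
               seq_len=20, num_layers=2, dropout=0.5, batch_size=32)

# the network emits (batch_size * seq_len) x vocabulary_size softmax probabilities
probs = L.get_output(net)
loss = lasagne.objectives.categorical_crossentropy(probs, target_var.flatten()).mean()

params = L.get_all_params(net, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var, target_var], loss, updates=updates)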
def build_network(self, vocab_size, input_var, mask_var, W_init): l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var) l_mask = L.InputLayer(shape=(None, None), input_var=mask_var) l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=EMBED_DIM, W=W_init) l_fwd_1 = L.LSTMLayer(l_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_1 = L.LSTMLayer(l_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_all_1 = L.concat([l_fwd_1, l_bkd_1], axis=2) l_fwd_2 = L.LSTMLayer(l_all_1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_2 = L.LSTMLayer(l_all_1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1) l_bkd_1_slice = L.SliceLayer(l_bkd_1, 0, 1) y_1 = L.ElemwiseSumLayer([l_fwd_1_slice, l_bkd_1_slice]) l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1) l_bkd_2_slice = L.SliceLayer(l_bkd_2, 0, 1) y_2 = L.ElemwiseSumLayer([l_fwd_2_slice, l_bkd_2_slice]) y = L.concat([y_1, y_2], axis=1) g = L.DenseLayer(y, num_units=EMBED_DIM, nonlinearity=lasagne.nonlinearities.tanh) l_out = L.DenseLayer(g, num_units=vocab_size, W=l_embed.W.T, nonlinearity=lasagne.nonlinearities.softmax) return l_out
def __init__( self, n_words, dim_emb, num_units, n_classes, w_emb=None, dropout=0.2, use_final=False, lr=0.001, pretrain=None, ): self.n_words = n_words self.dim_emb = dim_emb self.num_units = num_units self.n_classes = n_classes self.lr = lr if w_emb is None: w_emb = init.Normal() self.l_x = layers.InputLayer((None, None)) self.l_m = layers.InputLayer((None, None)) self.l_emb = layers.EmbeddingLayer(self.l_x, n_words, dim_emb, W=w_emb) self.l_ebd = self.l_emb if dropout: self.l_emb = layers.dropout(self.l_emb, dropout) if use_final: self.l_enc = layers.LSTMLayer(self.l_emb, num_units, mask_input=self.l_m, only_return_final=True, grad_clipping=10.0, gradient_steps=400) self.l_rnn = self.l_enc else: self.l_enc = layers.LSTMLayer(self.l_emb, num_units, mask_input=self.l_m, only_return_final=False, grad_clipping=10.0, gradient_steps=400) self.l_rnn = self.l_enc self.l_enc = MeanLayer(self.l_enc, self.l_m) if dropout: self.l_enc = layers.dropout(self.l_enc, dropout) self.l_y = layers.DenseLayer(self.l_enc, n_classes, nonlinearity=nonlinearities.softmax) if pretrain: self.load_pretrain(pretrain)
def model(self, query_input, batch_size, query_vocab_size, context_vocab_size, emb_dim_size):
    l_input = L.InputLayer(shape=(batch_size, ), input_var=query_input)
    l_embed = L.EmbeddingLayer(l_input, input_size=query_vocab_size, output_size=emb_dim_size)
    l_out = L.DenseLayer(l_embed, num_units=context_vocab_size,
                         nonlinearity=lasagne.nonlinearities.softmax)
    return l_embed, l_out
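# --- Usage sketch (assumption, not from the original code): compiling the two
# returned layers into Theano functions; `m` stands for whatever object defines
# this `model` method, and the vocabulary/embedding sizes are placeholders.
import theano
import theano.tensor as T
import lasagne.layers as L

query_input = T.ivector('query')  # a batch of query word ids
l_embed, l_out = m.model(query_input, batch_size=None, query_vocab_size=5000,
                         context_vocab_size=5000, emb_dim_size=128)

# deterministic=True switches off any stochastic behaviour at prediction time
embed_fn = theano.function([query_input], L.get_output(l_embed, deterministic=True))
predict_fn = theano.function([query_input], L.get_output(l_out, deterministic=True))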
def __init__(self, input, input_size, embedding_size):
    """ Allocate an Embedding Layer. """
    self.input = input
    self.output = layers.EmbeddingLayer(self.input, input_size, embedding_size,
                                        W=initialize_parameters()[0])
def build_cnn(): data_size = (None, 10, 100) # Batch size x Img Channels x Height x Width input_var = T.tensor3(name="input", dtype='int64') values = np.array(np.random.randint(0, 1, (5, 10, 100))) input_var.tag.test_value = values input_layer = L.InputLayer(data_size, input_var=input_var) W = create_char_embedding_matrix() embed_layer = L.EmbeddingLayer(input_layer, input_size=102, output_size=101, W=W) reshape = L.reshape(embed_layer, (-1, 100, 101)) dim_shuffle = L.dimshuffle(reshape, (0, 2, 1)) #conv_layer_1 = L.Conv2DLayer(embed_layer, 4, (1), 1, 0) #pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=1) print L.get_output(dim_shuffle).tag.test_value.shape conv_layer_1 = L.Conv1DLayer(dim_shuffle, 50, 2, 1) print L.get_output(conv_layer_1).tag.test_value.shape print "TEST" pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=99) print L.get_output(pool_layer_1).tag.test_value.shape reshape_conv_1 = L.reshape(pool_layer_1, (-1, 50)) conv_layer_2 = L.Conv1DLayer(dim_shuffle, 50, 3, 1) pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=98) reshape_conv_2 = L.reshape(pool_layer_2, (-1, 50)) merge_layer = L.ConcatLayer([reshape_conv_1, reshape_conv_2], 1) print L.get_output(merge_layer).tag.test_value.shape reshape_output = L.reshape(merge_layer, (-1, 10, 100)) print L.get_output(reshape_output).tag.test_value.shape x = T.tensor3(name="testname", dtype='int32') #x = T.imatrix() #output = L.get_output(conv_layer_1,x) #f = theano.function([x],output) word = unicode("Tat") word_index = np.array([]) #print word_index #x_test = np.array([word_index]).astype('int32') #print f(x_test) return reshape_output
def build_network(self): l_char1_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[0]) l_char2_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[1]) l_mask1_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[2]) l_mask2_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[3]) l_char_in = L.ConcatLayer([l_char1_in, l_char2_in], axis=1) # B x (ND+NQ) x L l_char_mask = L.ConcatLayer([l_mask1_in, l_mask2_in], axis=1) shp = (self.inps[0].shape[0], self.inps[0].shape[1] + self.inps[1].shape[1], self.inps[1].shape[2]) l_index_reshaped = L.ReshapeLayer(l_char_in, (shp[0] * shp[1], shp[2])) # BN x L l_mask_reshaped = L.ReshapeLayer(l_char_mask, (shp[0] * shp[1], shp[2])) # BN x L l_lookup = L.EmbeddingLayer(l_index_reshaped, self.num_chars, self.char_dim) # BN x L x D l_fgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10, gradient_steps=-1, precompute_input=True, only_return_final=True, mask_input=l_mask_reshaped) l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10, gradient_steps=-1, precompute_input=True, backwards=True, only_return_final=True, mask_input=l_mask_reshaped) # BN x 2D l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None) # BN x DE l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None) # BN x DE l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1) l_char_embed = L.ReshapeLayer(l_embed, (shp[0], shp[1], self.embed_dim / 2)) l_embed1 = L.SliceLayer(l_char_embed, slice(0, self.inps[0].shape[1]), axis=1) l_embed2 = L.SliceLayer(l_char_embed, slice(-self.inps[1].shape[1], None), axis=1) return l_embed1, l_embed2
def model(self, query_input, batch_size, query_vocab_size, context_vocab_size, emb_dim_size):
    l_input = L.InputLayer(shape=(batch_size, ), input_var=query_input)
    l_embed_continuous = L.EmbeddingLayer(l_input, input_size=query_vocab_size,
                                          output_size=emb_dim_size)
    l_values_discrete = L.EmbeddingLayer(l_input, input_size=query_vocab_size,
                                         output_size=emb_dim_size)
    l_probabilities_discrete = L.NonlinearityLayer(l_values_discrete,
                                                   nonlinearity=lasagne.nonlinearities.softmax)
    l_embed_discrete = StochasticLayer(l_probabilities_discrete, estimator='MF')
    l_merge = L.ElemwiseSumLayer([l_embed_continuous, l_embed_discrete])
    l_out = L.DenseLayer(l_merge, num_units=emb_dim_size,
                         nonlinearity=lasagne.nonlinearities.softmax)
    l_merge_2 = L.ElemwiseMergeLayer([l_out, l_embed_discrete], merge_function=T.mul)
    l_final_out = L.DenseLayer(l_merge_2, num_units=context_vocab_size)
    return l_values_discrete, l_final_out
def build_cnn(input): #data_size = (None,103,130) # Batch size x Img Channels x Height x Width #input_var = T.tensor3(name = "input",dtype='int64') input_var = input #values = np.array(np.random.randint(0,102,(1,9,50))) #input_var.tag.test_value = values #number sentences x words x characters input_layer = L.InputLayer((None,9,50), input_var=input) W = create_char_embedding_matrix() embed_layer = L.EmbeddingLayer(input_layer, input_size=103,output_size=101, W=W) #print "EMBED", L.get_output(embed_layer).tag.test_value.shape reshape_embed = L.reshape(embed_layer,(-1,50,101)) #print "reshap embed", L.get_output(reshape_embed).tag.test_value.shape conv_layer_1 = L.Conv1DLayer(reshape_embed, 55, 2) conv_layer_2 = L.Conv1DLayer(reshape_embed, 55, 3) #print "TEST" #print "Convolution Layer 1", L.get_output(conv_layer_1).tag.test_value.shape #print "Convolution Layer 2", L.get_output(conv_layer_2).tag.test_value.shape #flatten_conv_1 = L.flatten(conv_layer_1,3) #flatten_conv_2 = L.flatten(conv_layer_2,3) #reshape_max_1 = L.reshape(flatten_conv_1,(-1,49)) #reshape_max_2 = L.reshape(flatten_conv_2, (-1,48)) #print "OUTPUT Flatten1", L.get_output(flatten_conv_1).tag.test_value.shape #print "OUTPUT Flatten2", L.get_output(flatten_conv_2).tag.test_value.shape #print "OUTPUT reshape_max_1", L.get_output(reshape_max_1).tag.test_value.shape #print "OUTPUT reshape_max_2", L.get_output(reshape_max_2).tag.test_value.shape pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=54) pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=53) #print "OUTPUT POOL1", L.get_output(pool_layer_1).tag.test_value.shape #print "OUTPUT POOL2",L.get_output(pool_layer_2).tag.test_value.shape merge_layer = L.ConcatLayer([pool_layer_1, pool_layer_2], 1) flatten_merge = L.flatten(merge_layer, 2) reshape_merge = L.reshape(flatten_merge, (1,9,110)) print L.get_output(reshape_embed).shape #print L.get_output(reshape_merge).tag.test_value.shape return reshape_merge, char_index_lookup
def get_conv_input(self, sidx, tidx, avg=False): suf = '_avg' if avg else '' feat_embs = [ self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg) for name in self.args.source_feats ] # TODO: change the meaning if self.args.lex == 'mix': concat_emb = L.ElemwiseSumLayer(feat_embs) # (100, 15, 256) else: concat_emb = L.concat(feat_embs, axis=2) # (100, 15, 256+100) pos = np.array([0] * (self.args.window_size / 2) + [1] + [0] * (self.args.window_size / 2)).astype( theano.config.floatX) post = theano.shared(pos[np.newaxis, :, np.newaxis], borrow=True) # (1, 15, 1) posl = L.InputLayer( (None, self.args.window_size, 1), input_var=T.extra_ops.repeat(post, sidx.shape[0], axis=0)) # (100, 15, 1) conv_in = L.concat([concat_emb, posl], axis=2) # (100, 15, 256+1) if self.args.pos_emb: posint = L.flatten( L.ExpressionLayer(posl, lambda x: T.cast(x, 'int64'))) # (100, 15) pos_emb = L.EmbeddingLayer( posint, self.args.window_size, 8, name='epos' + suf, W=Normal(0.01) if not avg else Constant()) # (100, 15, 8) pos_emb.params[pos_emb.W].remove('regularizable') conv_in = L.concat([concat_emb, posl, pos_emb], axis=2) # (100, 15, 256+1+8) # # squeeze # if self.args.squeeze: # conv_in = L.DenseLayer(conv_in, num_units=self.args.squeeze, name='squeeze'+suf, num_leading_axes=2, # W=HeNormal('relu')) # (100, 15, 256) conv_in = L.dimshuffle(conv_in, (0, 2, 1)) # (100, 256+1, 15) return conv_in
def integrate_captions(input_var=T.imatrix()): ''' :param batch_size: number of images :param nb_caption: number of caption used per image ''' ############################### # Build Network Configuration # ############################### print('... Integrating captions to the model') # Input of the network : shape = (nb_caption, seq_length) network = layers.InputLayer(shape=(None, None), input_var=input_var) # Embedding layer : shape = (nb_caption, seq_length, 400) vocab_length = get_vocab_length() network = layers.EmbeddingLayer(network, vocab_length, output_size=400) # LSTM layer : shape = (nb_caption, 500) gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.)) cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh) network = layers.LSTMLayer(network, num_units=500, ingate=gate_parameters, forgetgate=gate_parameters, cell=cell_parameters, outgate=gate_parameters, grad_clipping=100., only_return_final=True) # Dense Layer : shape = (nb_caption, 500) network = layers.DenseLayer(network, num_units=500) # Reshape layer : shape = (nb_caption, 500, 1, 1) network = layers.ReshapeLayer(network, (-1, 500, 1, 1)) return network
def makeRNN(xInputRNN, hiddenInitRNN, hidden2InitRNN, sequenceLen, vocabularySize, neuralNetworkSz):
    input_Layer = L.InputLayer(input_var=xInputRNN, shape=(None, sequenceLen))
    hidden_Layer = L.InputLayer(input_var=hiddenInitRNN, shape=(None, neuralNetworkSz))
    hidden_Layer2 = L.InputLayer(input_var=hidden2InitRNN, shape=(None, neuralNetworkSz))
    input_Layer = L.EmbeddingLayer(input_Layer, input_size=vocabularySize, output_size=neuralNetworkSz)
    RNN_Layer = L.LSTMLayer(input_Layer, num_units=neuralNetworkSz, hid_init=hidden_Layer)
    h = L.DropoutLayer(RNN_Layer, p=dropOutProbability)
    RNN_Layer2 = L.LSTMLayer(h, num_units=neuralNetworkSz, hid_init=hidden_Layer2)
    h = L.DropoutLayer(RNN_Layer2, p=dropOutProbability)
    layerShape = L.ReshapeLayer(h, (-1, neuralNetworkSz))
    predictions = NCE(layerShape, num_units=vocabularySize, Z=Z)
    predictions = L.ReshapeLayer(predictions, (-1, sequenceLen, vocabularySize))
    return RNN_Layer, RNN_Layer2, predictions
def rnn_encoder(x_sym, x_mask): name = "Encoder" n_layers = 1 n_units = 128 emb_size = 128 rnn = DropoutLSTMLayer l_in = L.InputLayer((None, None), input_var=x_sym) l_mask = L.InputLayer((None, None), input_var=x_mask) l_emb = DropoutEmbeddingLayer(l_in, dict_size, emb_size, name=name + '.Embedding', dropout=0.25) l_onehot = L.EmbeddingLayer(l_in, dict_size, dict_size, W=np.eye(dict_size, dtype='float32'), name=name + '.OneHot') l_onehot.params[l_onehot.W].remove('trainable') l_enc_forwards = rnn(l_emb, num_units=n_units, mask_input=l_mask, name=name + '.0.Forward') l_enc_backwards = rnn(l_emb, num_units=n_units, mask_input=l_mask, backwards=True, name=name + '.0.Backward') l_enc = L.ConcatLayer([l_enc_forwards, l_enc_backwards], axis=2) for i in range(n_layers - 1): l_enc = rnn(l_enc, num_units=n_units, mask_input=l_mask, name="%s.%d.Forward" % (name, i + 1), dropout=0.25) return l_onehot, l_enc
def __init__(self, incomings, vocab_size, emb_size, W, WT=None, **kwargs): super(EncodingFullLayer, self).__init__(incomings, **kwargs) # if len(self.input_shapes[0]) == 3: # batch_size, w_count, w_length = self.input_shapes[0] shape = tuple(self.input_shapes[0]) # else: # shape = tuple(self.input_shapes[0]) self.WT = None # self.reset_zero() self.l_in = LL.InputLayer(shape=shape) self.l_in_pe = LL.InputLayer(shape=shape + (emb_size, )) self.l_emb = LL.EmbeddingLayer(self.l_in, input_size=vocab_size, output_size=emb_size, W=W) self.W = self.l_emb.W self.l_emb = LL.ElemwiseMergeLayer((self.l_emb, self.l_in_pe), merge_function=T.mul) self.l_emb_res = LL.ExpressionLayer(self.l_emb, lambda X: X.sum(2), output_shape='auto') # self.l_emb_res = SumLayer(self.l_emb, axis=2) if np.any(WT): self.l_emb_res = TemporalEncodicgLayer(self.l_emb_res, T=WT) self.WT = self.l_emb_res.T params = LL.helper.get_all_params(self.l_emb_res, trainable=True) values = LL.helper.get_all_param_values(self.l_emb_res, trainable=True) for p, v in zip(params, values): self.add_param(p, v.shape, name=p.name) zero_vec_tensor = T.vector() self.zero_vec = np.zeros(emb_size, dtype=theano.config.floatX) self.set_zero = theano.function( [zero_vec_tensor], updates=[(x, T.set_subtensor(x[-1, :], zero_vec_tensor)) for x in [self.W]])
def build_network(W, number_unique_tags, longest_word, longest_sentence, input_var=None):
    print("Building network ...")
    input_layer = L.InputLayer((None, longest_sentence, longest_word), input_var=input_var)
    embed_layer = L.EmbeddingLayer(input_layer, input_size=103, output_size=101, W=W)
    reshape_embed = L.reshape(embed_layer, (-1, longest_word, 101))
    conv_layer_1 = L.Conv1DLayer(reshape_embed, longest_word, 2)
    conv_layer_2 = L.Conv1DLayer(reshape_embed, longest_word, 3)
    pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=longest_word - 1)
    pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=longest_word - 2)
    merge_layer = L.ConcatLayer([pool_layer_1, pool_layer_2], 1)
    flatten_merge = L.flatten(merge_layer, 2)
    reshape_merge = L.reshape(flatten_merge, (-1, longest_sentence, int(longest_word * 2)))
    l_re = lasagne.layers.RecurrentLayer(reshape_merge, N_HIDDEN,
                                         nonlinearity=lasagne.nonlinearities.sigmoid,
                                         mask_input=None)
    l_out = lasagne.layers.DenseLayer(l_re, number_unique_tags,
                                      nonlinearity=lasagne.nonlinearities.softmax)
    print "DONE BUILDING NETWORK"
    return l_out
class decoder_step: #inputs encoder = L.InputLayer((None, None, CODE_SIZE), name='encoded sequence') encoder_mask = L.InputLayer((None, None), name='encoded sequence') inp = L.InputLayer((None, ), name='current character') l_target_emb = L.EmbeddingLayer(inp, dst_voc.len, 128) #recurrent part l_rnn1 = AutoLSTMCell(l_target_emb, 128, name="lstm1") query = L.DenseLayer(l_rnn1.out, 128, nonlinearity=None) attn = AttentionLayer(encoder, query, 128, mask_input=encoder_mask)['attn'] l_rnn = L.concat([attn, l_rnn1.out, l_target_emb]) l_rnn2 = AutoLSTMCell(l_rnn, 128, name="lstm1") next_token_probas = L.DenseLayer(l_rnn2.out, dst_voc.len, nonlinearity=T.nnet.softmax) #pick next token from predicted probas next_token = ProbabilisticResolver(next_token_probas) tau = T.scalar("sample temperature", "float32") next_token_temperatured = TemperatureResolver(next_token_probas, tau) next_token_greedy = GreedyResolver(next_token_probas) auto_updates = { **l_rnn1.get_automatic_updates(), **l_rnn2.get_automatic_updates() }
def get_char2word(self, ic, avg=False): suf = '_avg' if avg else '' ec = L.EmbeddingLayer( ic, self.args.vc, self.args.nc, name='ec' + suf, W=HeNormal() if not avg else Constant()) # (100, 24, 32, 16) ec.params[ec.W].remove('regularizable') if self.args.char_model == 'CNN': lds = L.dimshuffle(ec, (0, 3, 1, 2)) # (100, 16, 24, 32) ls = [] for n in self.args.ngrams: lconv = L.Conv2DLayer( lds, self.args.nf, (1, n), untie_biases=True, W=HeNormal('relu') if not avg else Constant(), name='conv_%d' % n + suf) # (100, 64/4, 24, 32-n+1) lpool = L.MaxPool2DLayer( lconv, (1, self.args.max_len - n + 1)) # (100, 64, 24, 1) lpool = L.flatten(lpool, outdim=3) # (100, 16, 24) lpool = L.dimshuffle(lpool, (0, 2, 1)) # (100, 24, 16) ls.append(lpool) xc = L.concat(ls, axis=2) # (100, 24, 64) return xc elif self.args.char_model == 'LSTM': ml = L.ExpressionLayer( ic, lambda x: T.neq(x, 0)) # mask layer (100, 24, 32) ml = L.reshape(ml, (-1, self.args.max_len)) # (2400, 32) gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal()) cell_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=None, nonlinearity=tanh) lstm_in = L.reshape( ec, (-1, self.args.max_len, self.args.nc)) # (2400, 32, 16) lstm_f = L.LSTMLayer( lstm_in, self.args.nw / 2, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, name='forward' + suf) # (2400, 64) lstm_b = L.LSTMLayer( lstm_in, self.args.nw / 2, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, backwards=True, name='backward' + suf) # (2400, 64) remove_reg(lstm_f) remove_reg(lstm_b) if avg: set_zero(lstm_f) set_zero(lstm_b) xc = L.concat([lstm_f, lstm_b], axis=1) # (2400, 128) xc = L.reshape(xc, (-1, self.args.sw, self.args.nw)) # (100, 24, 256) return xc
def build_network(self, vocab_size, doc_var, query_var, docmask_var, qmask_var, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var) l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var) l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var) l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var) l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=EMBED_DIM, W=W_init) l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=EMBED_DIM, W=l_docembed.W) l_fwd_doc = L.GRULayer(l_docembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_docembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1) l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1) l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice]) d = L.get_output(l_doc) # B x N x D q = L.get_output(l_q) # B x D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax( T.set_subtensor( T.alloc(-20., p.shape[0], p.shape[1])[docmask_var.nonzero()], p[docmask_var.nonzero()])) index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape) final = T.inc_subtensor( T.alloc(0., p.shape[0], vocab_size)[index, T.flatten(doc_var, outdim=2)], pm) #qv = T.flatten(query_var,outdim=2) #index2 = T.reshape(T.repeat(T.arange(qv.shape[0]),qv.shape[1]),qv.shape) #xx = index2[qmask_var.nonzero()] #yy = qv[qmask_var.nonzero()] #pV = T.set_subtensor(final[xx,yy], T.zeros_like(qv[xx,yy])) return final, l_doc, l_q
def rnn_decoder(l_input_one_hot, l_encoder_hid, encoder_mask, out_sym, out_mask, out_go_sym, name="Decoder"): n_layers = 1 n_units = 256 n_attention_units = 256 emb_size = 256 rnn = DropoutLSTMLayer l_go_out = L.InputLayer((None, None), input_var=out_go_sym) l_out_mask = L.InputLayer((None, None), input_var=out_mask) l_in_mask = L.InputLayer((None, None), input_var=encoder_mask) l_emb = L.EmbeddingLayer(l_go_out, dict_size, emb_size, name=name + '.Embedding') last_hid_encoded = L.SliceLayer(rnn(l_encoder_hid, num_units=n_units, mask_input=l_in_mask, name=name + '.Summarizer', dropout=0.25), indices=-1, axis=1) encoder_last_hid_repeat = RepeatLayer(last_hid_encoded, n=T.shape(out_go_sym)[1], axis=1) l_dec = L.ConcatLayer([l_emb, encoder_last_hid_repeat], axis=2) for i in range(n_layers): l_dec = rnn(l_dec, num_units=n_units, mask_input=l_out_mask, name="%s.%d.Forward" % (name, i), learn_init=True, dropout=0.25) l_attention = BahdanauKeyValueAttentionLayer( [l_encoder_hid, l_input_one_hot, l_in_mask, l_dec], n_attention_units, name=name + '.Attention') # (bs, seq_out, dict) l_out = L.ReshapeLayer(l_attention, (-1, [2])) out_random = L.get_output( l_out, deterministic=False) # (batch * seq_out) x dict out_deterministic = L.get_output( l_out, deterministic=True) # (batch * seq_out) x dict params = L.get_all_params([l_out], trainable=True) rcrossentropy = T.nnet.categorical_crossentropy( out_random + 1e-8, out_sym.flatten()) # (batch * seq) x 1 crossentropy = T.reshape(rcrossentropy, (bs, -1)) # batch x seq loss = T.sum(out_mask * crossentropy) / T.sum(out_mask) # scalar argmax = T.argmax(T.reshape(out_deterministic, (bs, -1, dict_size)), axis=-1) # batch x seq x 1 return {'loss': loss, 'argmax': argmax, 'params': params}
def clone(src_net, dst_net, mask_input): """ Clones a lasagne neural network, keeping weights tied. For all layers of src_net in turn, starting at the first: 1. creates a copy of the layer, 2. reuses the original objects for weights and 3. appends the new layer to dst_net. InputLayers are ignored. Recurrent layers (LSTMLayer) are passed mask_input. """ logger.info("Net to be cloned:") for l in layers.get_all_layers(src_net): logger.info(" - {} ({}):".format(l.name, l)) logger.info("Starting to clone..") for l in layers.get_all_layers(src_net): logger.info("src_net[...]: {} ({}):".format(l.name, l)) if type(l) == layers.InputLayer: logger.info(' - skipping') continue if type(l) == layers.DenseLayer: dst_net = layers.DenseLayer( dst_net, num_units=l.num_units, W=l.W, b=l.b, nonlinearity=l.nonlinearity, name=l.name+'2', ) elif type(l) == layers.EmbeddingLayer: dst_net = layers.EmbeddingLayer( dst_net, l.input_size, l.output_size, W=l.W, name=l.name+'2', ) elif type(l) == layers.LSTMLayer: dst_net = layers.LSTMLayer( dst_net, l.num_units, ingate=layers.Gate( W_in=l.W_in_to_ingate, W_hid=l.W_hid_to_ingate, W_cell=l.W_cell_to_ingate, b=l.b_ingate, nonlinearity=l.nonlinearity_ingate ), forgetgate=layers.Gate( W_in=l.W_in_to_forgetgate, W_hid=l.W_hid_to_forgetgate, W_cell=l.W_cell_to_forgetgate, b=l.b_forgetgate, nonlinearity=l.nonlinearity_forgetgate ), cell=layers.Gate( W_in=l.W_in_to_cell, W_hid=l.W_hid_to_cell, W_cell=None, b=l.b_cell, nonlinearity=l.nonlinearity_cell ), outgate=layers.Gate( W_in=l.W_in_to_outgate, W_hid=l.W_hid_to_outgate, W_cell=l.W_cell_to_outgate, b=l.b_outgate, nonlinearity=l.nonlinearity_outgate ), nonlinearity=l.nonlinearity, cell_init=l.cell_init, hid_init=l.hid_init, backwards=l.backwards, learn_init=l.learn_init, peepholes=l.peepholes, gradient_steps=l.gradient_steps, grad_clipping=l.grad_clipping, unroll_scan=l.unroll_scan, precompute_input=l.precompute_input, # mask_input=l.mask_input, # AttributeError: 'LSTMLayer' object has no attribute 'mask_input' name=l.name+'2', mask_input=mask_input, ) elif type(l) == layers.SliceLayer: dst_net = layers.SliceLayer( dst_net, indices=l.slice, axis=l.axis, name=l.name+'2', ) else: raise ValueError("Unhandled layer: {}".format(l)) new_layer = layers.get_all_layers(dst_net)[-1] logger.info('dst_net[...]: {} ({})'.format(new_layer, new_layer.name)) logger.info("Result of cloning:") for l in layers.get_all_layers(dst_net): logger.info(" - {} ({}):".format(l.name, l)) return dst_net
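# --- Usage sketch for clone() (hypothetical layer names and sizes, not from the
# original repo): build a named source network, then clone it on top of a second
# input so the embedding/LSTM/dense weights stay tied between the two copies.
import theano.tensor as T
import lasagne.layers as layers

x1, x2 = T.imatrix('x1'), T.imatrix('x2')
mask2 = layers.InputLayer((None, None), input_var=T.matrix('mask2'), name='mask2')

# every layer gets a name because clone() derives the copy's name from it
src = layers.InputLayer((None, None), input_var=x1, name='in')
src = layers.EmbeddingLayer(src, input_size=1000, output_size=32, name='emb')
src = layers.LSTMLayer(src, num_units=64, name='lstm')
src = layers.SliceLayer(src, indices=-1, axis=1, name='last')
src = layers.DenseLayer(src, num_units=10, name='out')

# the clone starts from a fresh input layer and reuses the original weight variables
dst = layers.InputLayer((None, None), input_var=x2, name='in2')
dst = clone(src, dst, mask_input=mask2)

src_emb = [l for l in layers.get_all_layers(src) if isinstance(l, layers.EmbeddingLayer)][0]
dst_emb = [l for l in layers.get_all_layers(dst) if isinstance(l, layers.EmbeddingLayer)][0]
assert src_emb.W is dst_emb.W  # weights are shared, not copied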
def build_network(self, K, vocab_size, W_init):
    l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
    l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
    l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
    l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
    l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
    l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8])
    l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9])
    l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

    doc_shp = self.inps[1].shape
    qry_shp = self.inps[3].shape

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=self.embed_dim, W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=self.embed_dim, W=l_docembed.W)
    l_qembed = L.ReshapeLayer(l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2)  # B x N x 2

    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')

    # char embeddings
    if self.use_chars:
        l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2 * self.char_dim)  # T x L x D
        l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, only_return_final=True)
        l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, backwards=True, only_return_final=True)  # T x 2D
        l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None)  # T x DE/2
        l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None)  # T x DE/2
        l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
        l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
        l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2
        l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
        l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

    l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                         gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=False)
    l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                         gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True,
                         only_return_final=False)
    l_q = L.ConcatLayer([l_fwd_q, l_bkd_q])  # B x Q x 2D
    q = L.get_output(l_q)  # B x Q x 2D
    q = q[T.arange(q.shape[0]), self.inps[12], :]  # B x 2D

    l_qs = [l_q]
    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x DE

        l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2)  # B x Q x DE
        l_qs.append(l_q_c_1)

        qd = L.get_output(l_q_c_1)  # B x Q x DE
        dd = L.get_output(l_doc_1)  # B x N x DE
        M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
        alphas = T.nnet.softmax(T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
        alphas_r = T.reshape(alphas, (M.shape[0], M.shape[1], M.shape[2])) * \
            self.inps[7][:, np.newaxis, :]  # B x N x Q
        alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis]  # B x N x Q
        q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden), input_var=q_rep)
        l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
        l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE

    if self.use_feat:
        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2

    l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask,
                           gradient_steps=GRAD_STEPS, precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask,
                           gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    d = L.get_output(l_doc)  # B x N x 2D
    p = T.batched_dot(d, q)  # B x N
    pm = T.nnet.softmax(p) * self.inps[10]
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    final = T.batched_dot(pm, self.inps[4])

    dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
    p = T.batched_dot(dv, q)  # B x N
    pm = T.nnet.softmax(p) * self.inps[10]
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    final_v = T.batched_dot(pm, self.inps[4])

    return final, final_v, l_doc, l_qs, l_docembed.W
def __init__(self): self.batch_size = 32 self.embedding_size = 50 self.nb_max_sentences = 10 self.length_max_sentences = 30 self.vocab_size = 10000 self.nb_hidden = 32 self.nb_hops = 5 # Dimension of the input context is (batch_size, number of sentences, max size of sentences) self.context = T.itensor3('context') self.mask_context = T.imatrix('context_mask') # Dimension of the question input is (batch_size, max size of sentences) self.question = T.itensor3('question') self.mask_question = T.imatrix('question_mask') """ Building the Input context module """ mask_context = layers.InputLayer( (self.batch_size * self.nb_max_sentences, self.length_max_sentences), input_var=self.mask_context) # (batch_size, nb_sentences, length_max_sentences) input_module = layers.InputLayer( (self.batch_size, self.nb_max_sentences, self.length_max_sentences), input_var=self.context) # (batch_size, nb_sentences * length_max_sentences) input_module = layers.ReshapeLayer(input_module, (self.batch_size, -1)) # (batch_size, nb_sentences * length_max_sequences, embedding_size) input_module = layers.EmbeddingLayer(input_module, self.vocab_size, self.embedding_size) # (batch_size, nb_sentences, length_max_sequences, embedding_size) input_module = layers.ReshapeLayer( input_module, (self.batch_size, self.nb_max_sentences, self.length_max_sentences, self.embedding_size)) # (batch_size * nb_sentences, length_sentences, embedding_size) input_module = layers.ReshapeLayer( input_module, (self.batch_size * self.nb_max_sentences, self.length_max_sentences, self.embedding_size)) # (batch_size * nb_sentences, nb_hidden) input_module = layers.GRULayer(input_module, self.nb_hidden, mask_input=mask_context, only_return_final=True) context = layers.get_output(input_module) # input_module = layers.ReshapeLayer(input_module, (self.batch_size, self.nb_max_sentences, self.nb_hidden)) """ Building the Input context module """ # (bach_size, length_sentences) mask_question = layers.InputLayer( (self.batch_size, self.length_max_sentences), input_var=self.mask_question) # (batch_size, length_sentences) question_module = layers.InputLayer( (self.batch_size, self.length_max_sentences)) # (batch_size, length_sentences, embedding_size) question_module = layers.EmbeddingLayer(question_module, self.vocab_size, self.embedding_size) # (batch_size, nb_hidden) question_module = layers.GRULayer(question_module, self.nb_hidden, mask_input=mask_question, only_return_final=True) question = layers.get_output(question_module) """ Building the Memory module """ memory = question self._M = utils.get_shared('glorot_uniform', self.nb_hidden, self.nb_hidden) for step in xrange(self.nb_hops): z_score_vector = T.concatenate([ context, question, memory, context * question, context * memory, T.abs_(context - question), T.abs_(context - memory), T.dot(T.dot(context, self._M), question), T.dot(T.dot(context, self._M), memory) ]) self._M1 = utils.get_shared('glorot_uniform', self.nb_hidden * 9, self.nb_hidden) self._B1 = utils.get_shared('constant_zero', self.nb_hidden, None) z1 = T.tanh(T.dot(self._M1, z_score_vector) + self._B1) self._M2 = utils.get_shared('glorot_uniform', self.nb_hidden, 1) self._B2 = utils.get_shared('constant_zero', self.nb_hidden, None) z2 = T.nnet.sigmoid(T.dot(self._M2, z1) + self._B2)
def buildModel(self):
    print(' -- Building...')
    x_init = sparse.csr_matrix('x', dtype='float32')
    y_init = T.imatrix('y')
    g_init = T.imatrix('g')
    ind_init = T.ivector('ind')
    sub_path_init = T.imatrix('subPathsBatch')
    mask_init = T.fmatrix('subMask')

    # step train
    x_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=x_init)
    g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init)
    ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init)
    pair_second = lgl.SliceLayer(g_input, indices=1, axis=1)
    pair_first = lgl.SliceLayer(g_input, indices=0, axis=1)
    pair_first_emd = lgl.EmbeddingLayer(pair_first, input_size=self.num_ver,
                                        output_size=self.embedding_size)
    emd_to_numver = layers.DenseLayer(pair_first_emd, self.num_ver,
                                      nonlinearity=lg.nonlinearities.softmax)
    index_emd = lgl.EmbeddingLayer(ind_input, input_size=self.num_ver,
                                   output_size=self.embedding_size, W=pair_first_emd.W)
    x_to_ydim = layers.SparseLayer(x_input, self.y.shape[1],
                                   nonlinearity=lg.nonlinearities.softmax)
    index_emd = layers.DenseLayer(index_emd, self.y.shape[1],
                                  nonlinearity=lg.nonlinearities.softmax)
    concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1)
    concat_two = layers.DenseLayer(concat_two, self.y.shape[1],
                                   nonlinearity=lg.nonlinearities.softmax)
    concat_two_output = lgl.get_output(concat_two)
    step_loss = lgo.categorical_crossentropy(concat_two_output, y_init).mean()
    hid_loss = lgl.get_output(x_to_ydim)
    step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
    emd_loss = lgl.get_output(index_emd)
    step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
    step_params = [index_emd.W, index_emd.b, x_to_ydim.W, x_to_ydim.b,
                   concat_two.W, concat_two.b]
    step_updates = lg.updates.sgd(step_loss, step_params,
                                  learning_rate=self.step_learning_rate)
    self.step_train = theano.function([x_init, y_init, ind_init], step_loss,
                                      updates=step_updates, on_unused_input='ignore')
    self.test_fn = theano.function([x_init, ind_init], concat_two_output,
                                   on_unused_input='ignore')

    # supervised train
    fc_output = lgl.get_output(emd_to_numver)
    pair_second_output = lgl.get_output(pair_second)
    sup_loss = lgo.categorical_crossentropy(fc_output, pair_second_output).sum()
    sup_params = lgl.get_all_params(emd_to_numver, trainable=True)
    sup_updates = lg.updates.sgd(sup_loss, sup_params,
                                 learning_rate=self.sup_learning_rate)
    self.sup_train = theano.function([g_init], sup_loss, updates=sup_updates,
                                     on_unused_input='ignore')

    cross_entropy = lgo.categorical_crossentropy(fc_output, pair_second_output)
    cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size), ndim=None)

    mask_input = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=mask_init)
    subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=sub_path_init)
    sub_path_emd = lgl.EmbeddingLayer(subPath_in, input_size=self.num_ver,
                                      output_size=self.embedding_size, W=pair_first_emd.W)
    lstm_layer = lgl.LSTMLayer(sub_path_emd, self.lstm_hidden_units, grad_clipping=3,
                               mask_input=mask_input)

    # handle path weight
    max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
    max2 = T.mean(max1, axis=1)
    max2_init = T.fcol('max2')
    max2_init = T.reshape(max2, ((self.subpath_num, 1)))
    max2_input = lgl.InputLayer(shape=(self.subpath_num, 1), input_var=max2_init)
    max2_input = lgl.BatchNormLayer(max2_input)
    path_weight = lgl.get_output(max2_input)
    path_weight = lg.nonlinearities.sigmoid(path_weight)
    path_weight = 1 + 0.3 * path_weight

    # unsupervised train
    reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
    lstm_params_all = lgl.get_all_params(lstm_layer, trainable=True)
    lstm_params = list(set(lstm_params_all).difference(set(sup_params)))
    lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01)
    self.lstm_fn = theano.function([sub_path_init, g_init, mask_init], reweight_loss,
                                   updates=lstm_updates, on_unused_input='ignore')
    alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001)
    self.alpha_fn = theano.function([sub_path_init, g_init, mask_init], reweight_loss,
                                    updates=alpha_updates, on_unused_input='ignore')
    print(' -- Done!')
def build_network(self, vocab_size, input_var, mask_var, docidx_var, docidx_mask, skip_connect=True):
    l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
    l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
    l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=EMBED_DIM,
                               W=self.params['W_emb'])
    l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE)

    # NOTE: Moved initialization of forget gate biases to init_params
    # forget_gate_1 = L.Gate(b=lasagne.init.Constant(3))
    # forget_gate_2 = L.Gate(b=lasagne.init.Constant(3))

    # NOTE: LSTM layer provided by Lasagne is slightly different from that used in
    # DeepMind's paper. In the paper the cell-to-* weights are not diagonal.

    # the 1st lstm layer
    in_gate = L.Gate(W_in=self.params['W_lstm1_xi'], W_hid=self.params['W_lstm1_hi'],
                     W_cell=self.params['W_lstm1_ci'], b=self.params['b_lstm1_i'],
                     nonlinearity=lasagne.nonlinearities.sigmoid)
    forget_gate = L.Gate(W_in=self.params['W_lstm1_xf'], W_hid=self.params['W_lstm1_hf'],
                         W_cell=self.params['W_lstm1_cf'], b=self.params['b_lstm1_f'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
    out_gate = L.Gate(W_in=self.params['W_lstm1_xo'], W_hid=self.params['W_lstm1_ho'],
                      W_cell=self.params['W_lstm1_co'], b=self.params['b_lstm1_o'],
                      nonlinearity=lasagne.nonlinearities.sigmoid)
    cell_gate = L.Gate(W_in=self.params['W_lstm1_xc'], W_hid=self.params['W_lstm1_hc'],
                       W_cell=None, b=self.params['b_lstm1_c'],
                       nonlinearity=lasagne.nonlinearities.tanh)
    l_fwd_1 = L.LSTMLayer(l_embed_noise, NUM_HIDDEN, ingate=in_gate, forgetgate=forget_gate,
                          cell=cell_gate, outgate=out_gate, peepholes=True,
                          grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)

    # the 2nd lstm layer
    if skip_connect:
        # construct skip connection from the lookup table to the 2nd layer
        batch_size, seq_len, _ = input_var.shape
        # concatenate the last dimension of l_fwd_1 and embed
        l_fwd_1_shp = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN))
        l_embed_shp = L.ReshapeLayer(l_embed, (-1, EMBED_DIM))
        to_next_layer = L.ReshapeLayer(L.concat([l_fwd_1_shp, l_embed_shp], axis=1),
                                       (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM))
    else:
        to_next_layer = l_fwd_1
    to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE)

    in_gate = L.Gate(W_in=self.params['W_lstm2_xi'], W_hid=self.params['W_lstm2_hi'],
                     W_cell=self.params['W_lstm2_ci'], b=self.params['b_lstm2_i'],
                     nonlinearity=lasagne.nonlinearities.sigmoid)
    forget_gate = L.Gate(W_in=self.params['W_lstm2_xf'], W_hid=self.params['W_lstm2_hf'],
                         W_cell=self.params['W_lstm2_cf'], b=self.params['b_lstm2_f'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
    out_gate = L.Gate(W_in=self.params['W_lstm2_xo'], W_hid=self.params['W_lstm2_ho'],
                      W_cell=self.params['W_lstm2_co'], b=self.params['b_lstm2_o'],
                      nonlinearity=lasagne.nonlinearities.sigmoid)
    cell_gate = L.Gate(W_in=self.params['W_lstm2_xc'], W_hid=self.params['W_lstm2_hc'],
                       W_cell=None, b=self.params['b_lstm2_c'],
                       nonlinearity=lasagne.nonlinearities.tanh)
    l_fwd_2 = L.LSTMLayer(to_next_layer_noise, NUM_HIDDEN, ingate=in_gate,
                          forgetgate=forget_gate, cell=cell_gate, outgate=out_gate,
                          peepholes=True, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)

    # slice final states of both lstm layers
    l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
    l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)

    # g will be used to score the words based on their embeddings
    g = L.DenseLayer(L.concat([l_fwd_1_slice, l_fwd_2_slice], axis=1),
                     num_units=EMBED_DIM, W=self.params['W_dense'],
                     b=self.params['b_dense'], nonlinearity=lasagne.nonlinearities.tanh)

    ## get outputs
    # g_out = L.get_output(g) # B x D
    # g_out_val = L.get_output(g, deterministic=True) # B x D
    ## compute softmax probs
    # probs,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
    #                       outputs_info=None,
    #                       sequences=[g_out,docidx_var,docidx_mask],
    #                       non_sequences=self.params['W_emb'])
    # predicted_probs = probs.reshape(docidx_var.shape) # B x N
    # probs_val,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
    #                           outputs_info=None,
    #                           sequences=[g_out_val,docidx_var,docidx_mask],
    #                           non_sequences=self.params['W_emb'])
    # predicted_probs_val = probs_val.reshape(docidx_var.shape) # B x N
    # return predicted_probs, predicted_probs_val

    # W is shared with the lookup table
    l_out = L.DenseLayer(g, num_units=vocab_size, W=self.params['W_emb'].T,
                         nonlinearity=lasagne.nonlinearities.softmax, b=None)
    return l_out
def build_model(vocab_size, doc_var, qry_var, doc_mask_var, qry_mask_var, W_init=lasagne.init.Normal()): l_doc_in = L.InputLayer(shape=(None, None, 1), input_var=doc_var) l_qry_in = L.InputLayer(shape=(None, None, 1), input_var=qry_var) l_doc_embed = L.EmbeddingLayer(l_doc_in, vocab_size, EMBED_DIM, W=W_init) l_qry_embed = L.EmbeddingLayer(l_qry_in, vocab_size, EMBED_DIM, W=l_doc_embed.W) l_doc_mask = L.InputLayer(shape=(None, None), input_var=doc_mask_var) l_qry_mask = L.InputLayer(shape=(None, None), input_var=qry_mask_var) l_doc_fwd = L.LSTMLayer(l_doc_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_doc_mask, gradient_steps=GRAD_STEPS, precompute_input=True) l_doc_bkd = L.LSTMLayer(l_doc_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_doc_mask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_qry_fwd = L.LSTMLayer(l_qry_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qry_mask, gradient_steps=GRAD_STEPS, precompute_input=True) l_qry_bkd = L.LSTMLayer(l_qry_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qry_mask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc_fwd_slice = L.SliceLayer(l_doc_fwd, -1, 1) l_doc_bkd_slice = L.SliceLayer(l_doc_bkd, 0, 1) l_qry_fwd_slice = L.SliceLayer(l_qry_fwd, -1, 1) l_qry_bkd_slice = L.SliceLayer(l_qry_bkd, 0, 1) r = L.DenseLayer(L.ElemwiseSumLayer([l_doc_fwd_slice, l_doc_bkd_slice]), num_units=NUM_HIDDEN, nonlinearity=lasagne.nonlinearities.tanh) u = L.DenseLayer(L.ElemwiseSumLayer([l_qry_fwd_slice, l_qry_bkd_slice]), num_units=NUM_HIDDEN, nonlinearity=lasagne.nonlinearities.tanh) g = L.DenseLayer(L.concat([r, u], axis=1), num_units=EMBED_DIM, W=lasagne.init.GlorotNormal(), nonlinearity=lasagne.nonlinearities.tanh) l_out = L.DenseLayer(g, num_units=vocab_size, W=l_doc_embed.W.T, nonlinearity=lasagne.nonlinearities.softmax, b=None) return l_out
def build_model(hyparams, vocab, nclasses=2, batchsize=None, invar=None, maskvar=None, maxlen=MAXLEN): embedding_dim = hyparams.embedding_dim nhidden = hyparams.nhidden bidirectional = hyparams.bidirectional pool = hyparams.pool grad_clip = hyparams.grad_clip init = hyparams.init net = OrderedDict() V = len(vocab) W = lasagne.init.Normal() gate_params = layer.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.) ) cell_params = layer.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.tanh ) # define model net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar) net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar) net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V, output_size=embedding_dim, W=W) net['fwd1'] = layer.LSTMLayer( net['emb'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True ) if bidirectional: net['bwd1'] = layer.LSTMLayer( net['emb'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, backwards=True ) def tmean(a, b): agg = theano.tensor.add(a, b) agg /= 2. return agg net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']], tmean) else: net['pool'] = layer.ConcatLayer([net['fwd1']]) net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5) net['fwd2'] = layer.LSTMLayer( net['dropout1'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, only_return_final=True ) net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6) net['softmax'] = layer.DenseLayer( net['dropout2'], num_units=nclasses, nonlinearity=lasagne.nonlinearities.softmax ) ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)} logstr = '========== MODEL ========== \n' logstr += 'vocab size: %d\n' % V logstr += 'embedding dim: %d\n' % embedding_dim logstr += 'nhidden: %d\n' % nhidden logstr += 'pooling: %s\n' % pool for lname, lyr in net.items(): logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME))) logstr += '=========================== \n' print logstr return net
def get_actor(self, avg=False): suf = '_avg' if avg else '' iw = L.InputLayer(shape=(None, self.args.sw)) # (100, 24) ew = L.EmbeddingLayer( iw, self.args.vw, self.args.nw, name='ew' + suf, W=HeNormal() if not avg else Constant()) # (100, 24, 256) ew.params[ew.W].remove('regularizable') if 'w' in self.args.freeze: ew.params[ew.W].remove('trainable') # for access from outside if not avg: self.Ew = ew.W # char embedding with CNN/LSTM ic = L.InputLayer(shape=(None, self.args.sw, self.args.max_len)) # (100, 24, 32) ec = self.get_char2word(ic, avg) # (100, 24, 256) it = L.InputLayer(shape=(None, self.args.st)) et = L.EmbeddingLayer(it, self.args.vt, self.args.nt, name='et' + suf, W=HeNormal() if not avg else Constant()) et.params[et.W].remove('regularizable') il = L.InputLayer(shape=(None, self.args.sl)) el = L.EmbeddingLayer(il, self.args.vl, self.args.nl, name='el' + suf, W=HeNormal() if not avg else Constant()) el.params[el.W].remove('regularizable') to_concat = [] if self.args.type == 'word': to_concat.append(ew) elif self.args.type == 'char': to_concat.append(ec) elif self.args.type == 'both': to_concat += [ew, ec] elif self.args.type == 'mix': to_concat.append(L.ElemwiseSumLayer([ew, ec])) if not self.args.untagged: to_concat.append(et) if not self.args.unlabeled: to_concat.append(el) x = L.concat(to_concat, axis=2) # (100, 24, 64+16+16) # additional: # get the more compact representation of each token by its word, tag and label, # before putting into the hidden layer if self.args.squeeze: x = L.DenseLayer( x, num_units=self.args.squeeze, name='h0' + suf, num_leading_axes=2, W=HeNormal('relu') if not avg else Constant()) # (100, 24, 64) h1 = L.DenseLayer( x, num_units=self.args.nh1, name='h1' + suf, W=HeNormal('relu') if not avg else Constant()) # (100, 512) h1 = L.dropout(h1, self.args.p1) h2 = L.DenseLayer( h1, num_units=self.args.nh2, name='h2' + suf, W=HeNormal('relu') if not avg else Constant()) # (100, 256) h2 = L.dropout(h2, self.args.p2) h3 = L.DenseLayer(h2, num_units=self.args.nh3, name='h3' + suf, W=HeNormal() if not avg else Constant(), nonlinearity=softmax) # (100, 125) num of actions return iw, ic, it, il, h3
def build_network(self, K, vocab_size, W_init):
    l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
    l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
    l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
    l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
    l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
    l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8])
    l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9])
    l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])
    l_match_feat = L.InputLayer(shape=(None, None, None), input_var=self.inps[13])
    l_match_feat = L.EmbeddingLayer(l_match_feat, 2, 1)
    l_match_feat = L.ReshapeLayer(l_match_feat, (-1, [1], [2]))
    l_use_char = L.InputLayer(shape=(None, None, self.feat_cnt), input_var=self.inps[14])
    l_use_char_q = L.InputLayer(shape=(None, None, self.feat_cnt), input_var=self.inps[15])

    doc_shp = self.inps[1].shape
    qry_shp = self.inps[3].shape

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=self.embed_dim, W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=self.embed_dim, W=l_docembed.W)

    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')
        l_qembed.params[l_qembed.W].remove('trainable')

    l_qembed = L.ReshapeLayer(l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2)  # B x N x 2

    # char embeddings
    if self.use_chars:
        # ====== concatenation ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2*self.char_dim) # T x L x D
        # l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
        #                     mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
        #                     only_return_final=True)
        # l_bgru = L.GRULayer(l_lookup, 2*self.char_dim, grad_clipping=GRAD_CLIP,
        #                     mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
        #                     backwards=True, only_return_final=True) # T x 2D
        # l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
        # l_bckembed = L.DenseLayer(l_bgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
        # l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
        # l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2
        # l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2
        # l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
        # l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

        # ====== bidir feat concat ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
        # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2)
        # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2)

        # ====== char concat ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = L.ConcatLayer([l_docchar_embed, l_doce], axis = 2)
        # l_qembed = L.ConcatLayer([l_qchar_embed, l_qembed], axis = 2)

        # ====== feat concat ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2)
        # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2)

        # ====== gating ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
        # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

        # ====== tie gating ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
        # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed], W = l_doce.W, b = l_doce.b)

        # ====== scalar gating ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = ScalarDymLayer([l_use_char, l_docchar_embed, l_doce])
        # l_qembed = ScalarDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

        # ====== bidirectional gating ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
        # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
        # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

        # ====== gate + concat ======
        l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                                precompute_input=True, only_return_final=True)
        l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
        l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])
        l_doce = L.ConcatLayer([l_use_char, l_doce], axis=2)
        l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis=2)

        # ====== bidirectional gate + concat ======
        # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
        # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
        # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
        # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
        # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
        # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])
        # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
        # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])
        # l_doce = L.ConcatLayer([l_use_char, l_doce], axis = 2)
        # l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis = 2)

    attentions = []
    if self.save_attn:
        l_m = PairwiseInteractionLayer([l_doce, l_qembed])
        attentions.append(L.get_output(l_m, deterministic=True))

    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x DE

        l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2)  # B x Q x DE

        l_doce = MatrixAttentionLayer([l_doc_1, l_q_c_1, l_qmask, l_match_feat])
        # l_doce = MatrixAttentionLayer([l_doc_1, l_q_c_1, l_qmask])

        # === begin GA ===
        # l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
        # l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m], mask_input=self.inps[7])
        # l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE
        # === end GA ===
        # if self.save_attn:
        #     attentions.append(L.get_output(l_m, deterministic=True))

    if self.use_feat:
        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2

    l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask,
                           gradient_steps=GRAD_STEPS, precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask,
                           gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                         gradient_steps=GRAD_STEPS, precompute_input=True,
                         only_return_final=False)
    l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                         gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True,
                         only_return_final=False)
    l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D

    if self.save_attn:
        l_m = PairwiseInteractionLayer([l_doc, l_q])
        attentions.append(L.get_output(l_m, deterministic=True))

    l_prob = AttentionSumLayer([l_doc, l_q], self.inps[4], self.inps[12],
                               mask_input=self.inps[10])
    final = L.get_output(l_prob)
    final_v = L.get_output(l_prob, deterministic=True)
    return final, final_v, l_prob, l_docembed.W, attentions