def build_network(self):
    l_char1_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[0])
    l_char2_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[1])
    l_mask1_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[2])
    l_mask2_in = L.InputLayer(shape=(None, None, self.max_word_len), input_var=self.inps[3])
    l_char_in = L.ConcatLayer([l_char1_in, l_char2_in], axis=1)   # B x (ND+NQ) x L
    l_char_mask = L.ConcatLayer([l_mask1_in, l_mask2_in], axis=1)

    shp = (self.inps[0].shape[0],
           self.inps[0].shape[1] + self.inps[1].shape[1],
           self.inps[1].shape[2])
    l_index_reshaped = L.ReshapeLayer(l_char_in, (shp[0] * shp[1], shp[2]))   # BN x L
    l_mask_reshaped = L.ReshapeLayer(l_char_mask, (shp[0] * shp[1], shp[2]))  # BN x L

    l_lookup = L.EmbeddingLayer(l_index_reshaped, self.num_chars, self.char_dim)  # BN x L x D
    l_fgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10, gradient_steps=-1,
                        precompute_input=True, only_return_final=True,
                        mask_input=l_mask_reshaped)
    l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10, gradient_steps=-1,
                        precompute_input=True, backwards=True, only_return_final=True,
                        mask_input=l_mask_reshaped)  # BN x 2D
    l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None)  # BN x DE
    l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None)  # BN x DE
    l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
    l_char_embed = L.ReshapeLayer(l_embed, (shp[0], shp[1], self.embed_dim / 2))

    l_embed1 = L.SliceLayer(l_char_embed, slice(0, self.inps[0].shape[1]), axis=1)
    l_embed2 = L.SliceLayer(l_char_embed, slice(-self.inps[1].shape[1], None), axis=1)

    return l_embed1, l_embed2
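# A self-contained sketch of the pattern above (toy vocabulary and sizes, not the
# author's exact model): one bidirectional character-level GRU is shared across all
# words, and the two final states are projected and summed into a word embedding.
import numpy as np
import theano
import theano.tensor as T
import lasagne.layers as L

chars = T.imatrix('chars')   # (num_words, max_word_len) character ids
cmask = T.matrix('cmask')    # (num_words, max_word_len) 1/0 mask

l_in = L.InputLayer((None, None), input_var=chars)
l_mask = L.InputLayer((None, None), input_var=cmask)
l_emb = L.EmbeddingLayer(l_in, input_size=50, output_size=16)
l_fwd = L.GRULayer(l_emb, 32, mask_input=l_mask, only_return_final=True)
l_bwd = L.GRULayer(l_emb, 32, mask_input=l_mask, only_return_final=True, backwards=True)
l_word = L.ElemwiseSumLayer([L.DenseLayer(l_fwd, 64, nonlinearity=None),
                             L.DenseLayer(l_bwd, 64, nonlinearity=None)])

f = theano.function([chars, cmask], L.get_output(l_word))
w = f(np.random.randint(0, 50, (6, 10)).astype('int32'),
      np.ones((6, 10), dtype=theano.config.floatX))
print(w.shape)   # (6, 64): one embedding per word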
def __init__(self, insize, vocoder, mlpg_wins=[], hiddensize=256,
             nonlinearity=lasagne.nonlinearities.very_leaky_rectify, nblayers=3,
             bn_axes=None, dropout_p=-1.0, grad_clipping=50):
    if bn_axes is None:
        bn_axes = []  # Batch normalisation tends to destabilise recurrent nets [ref. needed]

    model.Model.__init__(self, insize, vocoder, hiddensize)

    if len(bn_axes) > 0:
        warnings.warn('ModelBGRU: You are using bn_axes={}, but batch normalisation '
                      'is known to make recurrent neural networks (RNNs) unstable '
                      '[ref. needed]'.format(bn_axes))

    l_hid = ll.InputLayer(shape=(None, None, insize), input_var=self._input_values,
                          name='input_conditional')

    for layi in xrange(nblayers):
        layerstr = 'l' + str(1 + layi) + '_BGRU{}'.format(hiddensize)
        fwd = ll.GRULayer(l_hid, num_units=hiddensize, backwards=False,
                          name=layerstr + '.fwd', grad_clipping=grad_clipping)
        bck = ll.GRULayer(l_hid, num_units=hiddensize, backwards=True,
                          name=layerstr + '.bck', grad_clipping=grad_clipping)
        l_hid = ll.ConcatLayer((fwd, bck), axis=2)

        # Add batch normalisation (may hurt recurrent nets, see warning above)
        if len(bn_axes) > 0:
            l_hid = ll.batch_norm(l_hid, axes=bn_axes)

        # Add dropout (after batch norm)
        if dropout_p > 0.0:
            l_hid = ll.dropout(l_hid, p=dropout_p)

    l_out = layer_final(l_hid, vocoder, mlpg_wins)

    # Must be called at the end of __init__ to print the architecture,
    # collect the trainable parameters, etc.
    self.init_finish(l_out)
def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005,
                learning_rate_rl=0.005, batch_size=32, ment=0.1):
    # 2-layer MLP
    self.in_size = in_size      # x and y coordinate
    self.out_size = out_size    # up, down, right, left
    self.batch_size = batch_size
    self.learning_rate = learning_rate_rl
    self.n_hid = n_hid

    input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.imatrix('tm'), \
            T.itensor3('am'), T.fvector('r')
    in_var = T.reshape(input_var, (input_var.shape[0] * input_var.shape[1], self.in_size))

    l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
    pol_in = T.fmatrix('pol-h')
    l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var)
    l_pol_rnn = L.GRULayer(l_in, n_hid, hid_init=pol_in, mask_input=l_mask_in)  # B x H x D
    pol_out = L.get_output(l_pol_rnn)[:, -1, :]
    l_den_in = L.ReshapeLayer(l_pol_rnn,
                              (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
    l_out = L.DenseLayer(l_den_in, self.out_size, nonlinearity=lasagne.nonlinearities.softmax)

    self.network = l_out
    self.params = L.get_all_params(self.network)

    # rl
    probs = L.get_output(self.network)  # BH x A
    out_probs = T.reshape(probs, (input_var.shape[0], input_var.shape[1], self.out_size))  # B x H x A
    log_probs = T.log(out_probs)
    act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
    ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
    H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
    self.loss = 0. - T.mean(ep_probs * reward_var + ment * H_probs)

    updates = lasagne.updates.rmsprop(self.loss, self.params,
                                      learning_rate=learning_rate_rl, epsilon=1e-4)
    self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in]
    self.train_fn = theano.function(self.inps, self.loss, updates=updates)
    self.obj_fn = theano.function(self.inps, self.loss)
    self.act_fn = theano.function([input_var, turn_mask, pol_in], [out_probs, pol_out])

    # sl
    sl_loss = 0. - T.mean(ep_probs)
    sl_updates = lasagne.updates.rmsprop(sl_loss, self.params,
                                         learning_rate=learning_rate_sl, epsilon=1e-4)
    self.sl_train_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss,
                                       updates=sl_updates)
    self.sl_obj_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss)
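# A numpy-only sketch (hypothetical shapes, separate from the class above) of the
# REINFORCE-style objective being assembled there: log-probabilities of the chosen
# actions, masked over padded turns, weighted by the episode reward, plus an entropy
# bonus scaled by `ment`.
import numpy as np

B, H, A = 2, 3, 4                                           # batch, max turns, actions
probs = np.full((B, H, A), 0.25)                            # policy output, B x H x A
act_mask = np.eye(A)[np.random.randint(0, A, (B, H))]       # one-hot chosen actions
turn_mask = np.array([[1., 1., 0.], [1., 1., 1.]])          # valid turns
reward = np.array([1.0, -1.0])
ment = 0.1

log_probs = np.log(probs)                                   # B x H x A
act_logp = (log_probs * act_mask).sum(axis=2)               # B x H, log pi(a_t | s_t)
ep_logp = (act_logp * turn_mask).sum(axis=1)                # B, summed over valid turns
entropy = -(probs * log_probs).sum(axis=2).sum(axis=1)      # B
loss = -np.mean(ep_logp * reward + ment * entropy)
print(loss)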
def build_network(self, K, vocab_size, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0]) l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1]) l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2]) l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3]) l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6]) l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7]) l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8]) l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9]) l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11]) l_match_feat = L.InputLayer(shape=(None, None, None), input_var=self.inps[13]) l_match_feat = L.EmbeddingLayer(l_match_feat, 2, 1) l_match_feat = L.ReshapeLayer(l_match_feat, (-1, [1], [2])) l_use_char = L.InputLayer(shape=(None, None, self.feat_cnt), input_var=self.inps[14]) l_use_char_q = L.InputLayer(shape=(None, None, self.feat_cnt), input_var=self.inps[15]) doc_shp = self.inps[1].shape qry_shp = self.inps[3].shape l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=self.embed_dim, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=self.embed_dim, W=l_docembed.W) if self.train_emb == 0: l_docembed.params[l_docembed.W].remove('trainable') l_qembed.params[l_qembed.W].remove('trainable') l_qembed = L.ReshapeLayer( l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim)) # B x N x DE l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 # char embeddings if self.use_chars: # ====== concatenation ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2*self.char_dim) # T x L x D # l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, # mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, # only_return_final=True) # l_bgru = L.GRULayer(l_lookup, 2*self.char_dim, grad_clipping=GRAD_CLIP, # mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, # backwards=True, only_return_final=True) # T x 2D # l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim/2, nonlinearity=None) # T x DE/2 # l_bckembed = L.DenseLayer(l_bgru, self.embed_dim/2, nonlinearity=None) # T x DE/2 # l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1) # l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2 # l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2 # l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2) # l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2) # ====== bidir feat concat ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True) # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru]) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2) # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2) # ====== char concat ====== # l_lookup = 
L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = L.ConcatLayer([l_docchar_embed, l_doce], axis = 2) # l_qembed = L.ConcatLayer([l_qchar_embed, l_qembed], axis = 2) # ====== feat concat ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2) # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2) # ====== gating ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce]) # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed]) # ====== tie gating ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce]) # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed], W = l_doce.W, b = l_doce.b) # ====== scalar gating ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = ScalarDymLayer([l_use_char, l_docchar_embed, l_doce]) # l_qembed = ScalarDymLayer([l_use_char_q, l_qchar_embed, l_qembed]) # ====== dibirectional gating ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True) # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru]) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce]) # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed]) # ====== gate + concat ====== l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, 
only_return_final=True) l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce]) l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed]) l_doce = L.ConcatLayer([l_use_char, l_doce], axis=2) l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis=2) # ====== bidirectional gate + concat ====== # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32) # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True) # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True) # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru]) # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru]) # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru]) # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce]) # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed]) # l_doce = L.ConcatLayer([l_use_char, l_doce], axis = 2) # l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis = 2) attentions = [] if self.save_attn: l_m = PairwiseInteractionLayer([l_doce, l_qembed]) attentions.append(L.get_output(l_m, deterministic=True)) for i in range(K - 1): l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) # B x N x DE l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2) # B x Q x DE l_doce = MatrixAttentionLayer( [l_doc_1, l_q_c_1, l_qmask, l_match_feat]) # l_doce = MatrixAttentionLayer([l_doc_1, l_q_c_1, l_qmask]) # === begin GA === # l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1]) # l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m], mask_input=self.inps[7]) # l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE # === end GA === # if self.save_attn: # attentions.append(L.get_output(l_m, deterministic=True)) if self.use_feat: l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2 l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=False) l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=False) l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2) # B x Q x 2D if self.save_attn: l_m = PairwiseInteractionLayer([l_doc, l_q]) 
attentions.append(L.get_output(l_m, deterministic=True)) l_prob = AttentionSumLayer([l_doc, l_q], self.inps[4], self.inps[12], mask_input=self.inps[10]) final = L.get_output(l_prob) final_v = L.get_output(l_prob, deterministic=True) return final, final_v, l_prob, l_docembed.W, attentions
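# A numpy sketch (made-up sizes) of the attention-sum readout that AttentionSumLayer
# computes at the end of the network above: score each document token against the
# query, softmax over candidate positions only, then pool the probability mass of
# tokens that refer to the same candidate answer.
import numpy as np

N, D, C = 6, 4, 3                                     # doc length, hidden size, candidates
doc = np.random.randn(N, D)                           # token representations
query = np.random.randn(D)                            # query representation
cand_mask = np.array([1., 0., 1., 1., 0., 1.])        # which tokens are candidates
token_to_cand = np.array([0, -1, 1, 2, -1, 0])        # candidate id per token (-1: none)

scores = doc.dot(query)
p = np.exp(scores - scores.max()) * cand_mask
p /= p.sum()                                          # attention restricted to candidates
answer_probs = np.zeros(C)
for t, c in enumerate(token_to_cand):
    if c >= 0:
        answer_probs[c] += p[t]                       # repeated candidates accumulate mass
print(answer_probs)                                   # sums to 1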
def __init__(self):
    self.batch_size = 32
    self.embedding_size = 50
    self.nb_max_sentences = 10
    self.length_max_sentences = 30
    self.vocab_size = 10000
    self.nb_hidden = 32
    self.nb_hops = 5

    # Dimension of the context input is (batch_size, number of sentences, max sentence length)
    self.context = T.itensor3('context')
    self.mask_context = T.imatrix('context_mask')
    # Dimension of the question input is (batch_size, max sentence length)
    self.question = T.imatrix('question')
    self.mask_question = T.imatrix('question_mask')

    """ Building the Input context module """
    mask_context = layers.InputLayer(
        (self.batch_size * self.nb_max_sentences, self.length_max_sentences),
        input_var=self.mask_context)
    # (batch_size, nb_sentences, length_max_sentences)
    input_module = layers.InputLayer(
        (self.batch_size, self.nb_max_sentences, self.length_max_sentences),
        input_var=self.context)
    # (batch_size, nb_sentences * length_max_sentences)
    input_module = layers.ReshapeLayer(input_module, (self.batch_size, -1))
    # (batch_size, nb_sentences * length_max_sentences, embedding_size)
    input_module = layers.EmbeddingLayer(input_module, self.vocab_size, self.embedding_size)
    # (batch_size, nb_sentences, length_max_sentences, embedding_size)
    input_module = layers.ReshapeLayer(
        input_module,
        (self.batch_size, self.nb_max_sentences, self.length_max_sentences, self.embedding_size))
    # (batch_size * nb_sentences, length_max_sentences, embedding_size)
    input_module = layers.ReshapeLayer(
        input_module,
        (self.batch_size * self.nb_max_sentences, self.length_max_sentences, self.embedding_size))
    # (batch_size * nb_sentences, nb_hidden)
    input_module = layers.GRULayer(input_module, self.nb_hidden,
                                   mask_input=mask_context, only_return_final=True)
    context = layers.get_output(input_module)
    # input_module = layers.ReshapeLayer(input_module, (self.batch_size, self.nb_max_sentences, self.nb_hidden))

    """ Building the Question module """
    # (batch_size, length_max_sentences)
    mask_question = layers.InputLayer(
        (self.batch_size, self.length_max_sentences), input_var=self.mask_question)
    # (batch_size, length_max_sentences)
    question_module = layers.InputLayer(
        (self.batch_size, self.length_max_sentences), input_var=self.question)
    # (batch_size, length_max_sentences, embedding_size)
    question_module = layers.EmbeddingLayer(question_module, self.vocab_size, self.embedding_size)
    # (batch_size, nb_hidden)
    question_module = layers.GRULayer(question_module, self.nb_hidden,
                                      mask_input=mask_question, only_return_final=True)
    question = layers.get_output(question_module)

    """ Building the Memory module """
    memory = question
    self._M = utils.get_shared('glorot_uniform', self.nb_hidden, self.nb_hidden)
    for step in xrange(self.nb_hops):
        z_score_vector = T.concatenate([
            context, question, memory, context * question, context * memory,
            T.abs_(context - question), T.abs_(context - memory),
            T.dot(T.dot(context, self._M), question),
            T.dot(T.dot(context, self._M), memory)
        ])
        self._M1 = utils.get_shared('glorot_uniform', self.nb_hidden * 9, self.nb_hidden)
        self._B1 = utils.get_shared('constant_zero', self.nb_hidden, None)
        z1 = T.tanh(T.dot(self._M1, z_score_vector) + self._B1)
        self._M2 = utils.get_shared('glorot_uniform', self.nb_hidden, 1)
        self._B2 = utils.get_shared('constant_zero', self.nb_hidden, None)
        z2 = T.nnet.sigmoid(T.dot(self._M2, z1) + self._B2)
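# A rough numpy sketch (single sentence, made-up dimension) of the DMN-style attention
# gate this loop is building: a feature vector combining the sentence encoding c, the
# question q and the current memory m is pushed through a small two-layer network that
# ends in a sigmoid gate. The feature layout used here (bilinear terms as scalars,
# giving 7*d + 2 features) is one common formulation and is only illustrative.
import numpy as np

d = 8
rng = np.random.RandomState(0)
c, q, m = rng.randn(d), rng.randn(d), rng.randn(d)
M = rng.randn(d, d)

z = np.concatenate([c, q, m, c * q, c * m,
                    np.abs(c - q), np.abs(c - m),
                    [c.dot(M).dot(q)], [c.dot(M).dot(m)]])
W1, b1 = rng.randn(d, z.shape[0]), np.zeros(d)
W2, b2 = rng.randn(1, d), np.zeros(1)
gate = 1.0 / (1.0 + np.exp(-(W2.dot(np.tanh(W1.dot(z) + b1)) + b2)))
print(gate)   # scalar attention gate for this sentence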
target_var = T.ivector('targets')
index = T.iscalar("index")
batch_size = 500
n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size
n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size

l_in = layers.InputLayer(shape=(None, seq_len, feature_num), input_var=input_var)
#l_rec = layers.RecurrentLayer(incoming=l_in,
#                              num_units=100,
#                              W_hid_to_hid=init_diagnal(100),
#                              b=init_constant(size=(100,)),
#                              nonlinearity=lasagne.nonlinearities.rectify,
#                              grad_clipping=1)
l_rec = layers.GRULayer(incoming=l_in, num_units=hidden_unit)
l_out = layers.DenseLayer(incoming=l_rec, num_units=10,
                          nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
sum = 0
for p in params:
    shape = p.shape.eval()
    print shape
    if len(shape) > 1:
        sum += shape[0] * shape[1]
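# Side note, reusing `l_out` and `lasagne` from the snippet above: the loop there only
# counts 2-D weight matrices and skips bias vectors. A sketch that counts every
# trainable tensor, whatever its rank, looks like this:
n_params = 0
for p in lasagne.layers.get_all_params(l_out, trainable=True):
    n_params += p.get_value(borrow=True).size
print n_params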
def make_model(): image = ll.InputLayer((BS, CH, IH, IW), name='step1.image') h_read_init = ll.InputLayer( (HS, ), lasagne.utils.create_param(li.Uniform(), (HS, ), name='step1.tensor.h_read_init'), name='step1.h_read_init') h_read_init.add_param(h_read_init.input_var, (HS, )) h_write_init = ll.InputLayer( (HS, ), lasagne.utils.create_param(li.Uniform(), (HS, ), name='step1.tensor.h_write_init'), name='step1.h_write_init') h_write_init.add_param(h_write_init.input_var, (HS, )) h_read = ll.ExpressionLayer(h_read_init, lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)), (BS, HS), name='step1.h_read') h_write = ll.ExpressionLayer(h_write_init, lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)), (BS, HS), name='step1.h_write') canvas = ll.InputLayer( (BS, CH, IH, IW), lasagne.utils.create_param(li.Constant(0.0), (BS, CH, IH, IW), name='step1.tensor.canvas'), name='step1.canvas') image_prev = ll.NonlinearityLayer(canvas, ln.sigmoid, name='step1.image_prev') image_error = ll.ElemwiseSumLayer([image, image_prev], coeffs=[1, -1], name='step1.image_error') image_stack = ll.ConcatLayer([image, image_error], name='step1.image_stack') read_params = ll.DenseLayer(h_write, 6, nonlinearity=None, name='step1.read_params') read_window = advanced_layers.AttentionLayer([read_params, image_stack], (WH, WW), name='step1.read_window') read_flat = ll.FlattenLayer(read_window, name='step1.read_flat') read_code = ll.ConcatLayer([read_flat, h_write], name='step1.read_code') read_code_sequence = ll.ReshapeLayer(read_code, (BS, 1, read_code.output_shape[-1]), name='step1.read_code_sequence') read_rnn = ll.GRULayer( read_code_sequence, HS, only_return_final=True, hid_init=h_read, name='step1.read_rnn', ) sample_mean = ll.DenseLayer(read_rnn, ENC_NDIM, nonlinearity=None, name='step1.sample_mean') sample_logvar2 = ll.DenseLayer(read_rnn, ENC_NDIM, nonlinearity=None, name='step1.sample_logvar2') sample = advanced_layers.SamplingLayer([sample_mean, sample_logvar2], ENC_VAR, name='step1.sample') write_code = ll.DenseLayer(sample, HS, name='step1.write_code') write_code_sequence = ll.ReshapeLayer(write_code, (BS, 1, write_code.output_shape[-1]), name='step1.write_code_sequence') write_rnn = ll.GRULayer( write_code_sequence, HS, only_return_final=True, hid_init=h_write, name='step1.write_rnn', ) write_window_flat = ll.DenseLayer(write_rnn, CH * WH * WW, name='step1.write_window_flat') write_window = ll.ReshapeLayer(write_window_flat, (BS, CH, WH, WW), name='step1.write_window') write_params = ll.DenseLayer(h_write, 6, nonlinearity=None, name='step1.write_params') write_image = advanced_layers.AttentionLayer([write_params, write_window], (IH, IW), name='step1.write_image') canvas_next = ll.ElemwiseSumLayer([canvas, write_image], name='step1.canvas_next') def rename(name): if name is None: return None step, real_name = name.split('.', 1) step = int(step[4:]) return 'step%d.%s' % (step + 1, real_name) for step in xrange(1, TIME_ROUNDS): sample_random_variable_next = sample.random_stream.normal( sample.input_shapes[0], std=sample.variation_coeff, ) sample_random_variable_next.name = 'step%d.sample.random_variable' % \ (step + 1) canvas, canvas_next = (canvas_next, utils.modified_copy( canvas_next, modify={ h_read: read_rnn, h_write: write_rnn, canvas: canvas_next, sample.random_stream: sample.random_stream, sample.random_variable: sample_random_variable_next, }, rename=rename, )) h_read = read_rnn h_write = write_rnn read_rnn = utils.layer_by_name(canvas_next, 'step%d.read_rnn' % (step + 1)) write_rnn = 
utils.layer_by_name(canvas_next, 'step%d.write_rnn' % (step + 1)) sample = utils.layer_by_name(canvas_next, 'step%d.sample' % (step + 1)) output = ll.NonlinearityLayer(canvas_next, ln.sigmoid, name='output') return output
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs): print ("==> not used params in DMN class:", kwargs.keys()) self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.dropout = dropout self.l2 = l2 self.mode = mode self.batch_norm = batch_norm self.num_units = rnn_num_units self.input_var = T.tensor4('input_var') self.answer_var = T.ivector('answer_var') print ("==> building network") example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) ######### network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var) print (layers.get_output(network).eval({self.input_var:example}).shape) # CONV-RELU-POOL 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 3 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 4 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print (layers.get_output(network).eval({self.input_var:example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print (layers.get_output(network).eval({self.input_var:example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) self.params = layers.get_all_params(network, trainable=True) output = layers.get_output(network) num_channels = 32 filter_W = 54 filter_H = 8 # NOTE: these constants are shapes of last pool layer, it can be symbolic # explicit values are better for optimizations channels = [] for channel_index in range(num_channels): channels.append(output[:, channel_index, :, :].transpose((0, 2, 1))) rnn_network_outputs = [] W_in_to_updategate = None W_hid_to_updategate = None b_updategate = None W_in_to_resetgate = None W_hid_to_resetgate = None b_resetgate = None W_in_to_hidden_update = None W_hid_to_hidden_update = None b_hidden_update = None W_in_to_updategate1 = None W_hid_to_updategate1 = None b_updategate1 = None W_in_to_resetgate1 = None W_hid_to_resetgate1 = None b_resetgate1 = None W_in_to_hidden_update1 = None W_hid_to_hidden_update1 = None b_hidden_update1 = None for channel_index in range(num_channels): 
rnn_input_var = channels[channel_index] # InputLayer network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var) if (channel_index == 0): # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False) W_in_to_updategate = network.W_in_to_updategate W_hid_to_updategate = network.W_hid_to_updategate b_updategate = network.b_updategate W_in_to_resetgate = network.W_in_to_resetgate W_hid_to_resetgate = network.W_hid_to_resetgate b_resetgate = network.b_resetgate W_in_to_hidden_update = network.W_in_to_hidden_update W_hid_to_hidden_update = network.W_hid_to_hidden_update b_hidden_update = network.b_hidden_update # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) W_in_to_updategate1 = network.W_in_to_updategate W_hid_to_updategate1 = network.W_hid_to_updategate b_updategate1 = network.b_updategate W_in_to_resetgate1 = network.W_in_to_resetgate W_hid_to_resetgate1 = network.W_hid_to_resetgate b_resetgate1 = network.b_resetgate W_in_to_hidden_update1 = network.W_in_to_hidden_update W_hid_to_hidden_update1 = network.W_hid_to_hidden_update b_hidden_update1 = network.b_hidden_update # add params self.params += layers.get_all_params(network, trainable=True) else: # GRULayer, but shared network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False, resetgate=layers.Gate(W_in=W_in_to_resetgate, W_hid=W_hid_to_resetgate, b=b_resetgate), updategate=layers.Gate(W_in=W_in_to_updategate, W_hid=W_hid_to_updategate, b=b_updategate), hidden_update=layers.Gate(W_in=W_in_to_hidden_update, W_hid=W_hid_to_hidden_update, b=b_hidden_update)) # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # GRULayer, but shared network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True, resetgate=layers.Gate(W_in=W_in_to_resetgate1, W_hid=W_hid_to_resetgate1, b=b_resetgate1), updategate=layers.Gate(W_in=W_in_to_updategate1, W_hid=W_hid_to_updategate1, b=b_updategate1), hidden_update=layers.Gate(W_in=W_in_to_hidden_update1, W_hid=W_hid_to_hidden_update1, b=b_hidden_update1)) rnn_network_outputs.append(layers.get_output(network)) all_output_var = T.concatenate(rnn_network_outputs, axis=1) print (all_output_var.eval({self.input_var:example}).shape) # InputLayer network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var) # Dropout Layer if (self.dropout > 0): network = layers.dropout(network, self.dropout) # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # Last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print (layers.get_output(network).eval({self.input_var:example}).shape) self.params += layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) #print "==> param shapes", [x.eval().shape for x in self.params] self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.momentum(self.loss, self.params, 
learning_rate=0.003) if self.mode == 'train': print ("==> compiling train_fn") self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print ("==> compiling test_fn") self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
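# A minimal sketch (toy shapes, not the full class above) of the weight-sharing trick
# used for the per-channel GRUs: the gate parameters created by a first GRULayer are
# passed back in through lasagne.layers.Gate, so a second GRULayer updates the very
# same shared variables.
import theano.tensor as T
import lasagne.layers as layers

x1, x2 = T.tensor3('x1'), T.tensor3('x2')
l_in1 = layers.InputLayer((None, None, 8), input_var=x1)
l_in2 = layers.InputLayer((None, None, 8), input_var=x2)

gru1 = layers.GRULayer(l_in1, num_units=16)
gru2 = layers.GRULayer(
    l_in2, num_units=16,
    resetgate=layers.Gate(W_in=gru1.W_in_to_resetgate,
                          W_hid=gru1.W_hid_to_resetgate,
                          b=gru1.b_resetgate),
    updategate=layers.Gate(W_in=gru1.W_in_to_updategate,
                           W_hid=gru1.W_hid_to_updategate,
                           b=gru1.b_updategate),
    hidden_update=layers.Gate(W_in=gru1.W_in_to_hidden_update,
                              W_hid=gru1.W_hid_to_hidden_update,
                              b=gru1.b_hidden_update))

assert gru2.W_in_to_resetgate is gru1.W_in_to_resetgate   # same shared variable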
def build_network(self, K, vocab_size, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0]) l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1]) l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2]) l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3]) l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6]) l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7]) l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8]) l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9]) l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11]) doc_shp = self.inps[1].shape qry_shp = self.inps[3].shape l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=self.embed_dim, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=self.embed_dim, W=l_docembed.W) l_qembed = L.ReshapeLayer( l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim)) # B x N x DE l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 if self.train_emb == 0: l_docembed.params[l_docembed.W].remove('trainable') # char embeddings if self.use_chars: l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2 * self.char_dim) # T x L x D l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=True) l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=True) # T x 2D l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None) # T x DE/2 l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None) # T x DE/2 l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1) l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2 l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2 l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2) l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2) l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=False) l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=False) l_q = L.ConcatLayer([l_fwd_q, l_bkd_q]) # B x Q x 2D q = L.get_output(l_q) # B x Q x 2D q = q[T.arange(q.shape[0]), self.inps[12], :] # B x 2D l_qs = [l_q] for i in range(K - 1): l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) # B x N x DE l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2) # B x Q x DE l_qs.append(l_q_c_1) qd = 
L.get_output(l_q_c_1) # B x Q x DE dd = L.get_output(l_doc_1) # B x N x DE M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1))) # B x N x Q alphas = T.nnet.softmax( T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2]))) alphas_r = T.reshape(alphas, (M.shape[0],M.shape[1],M.shape[2]))* \ self.inps[7][:,np.newaxis,:] # B x N x Q alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis] # B x N x Q q_rep = T.batched_dot(alphas_r, qd) # B x N x DE l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden), input_var=q_rep) l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul) l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE if self.use_feat: l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2 l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) d = L.get_output(l_doc) # B x N x 2D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax(p) * self.inps[10] pm = pm / pm.sum(axis=1)[:, np.newaxis] final = T.batched_dot(pm, self.inps[4]) dv = L.get_output(l_doc, deterministic=True) # B x N x 2D p = T.batched_dot(dv, q) # B x N pm = T.nnet.softmax(p) * self.inps[10] pm = pm / pm.sum(axis=1)[:, np.newaxis] final_v = T.batched_dot(pm, self.inps[4]) return final, final_v, l_doc, l_qs, l_docembed.W
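# A numpy sketch (toy sizes) of the gated-attention step done symbolically above:
# token-vs-query match scores, a masked softmax over query positions, a query-aware
# vector for every document token, and an element-wise gate on the document states.
import numpy as np

B, N, Q, D = 1, 5, 3, 4
doc = np.random.randn(B, N, D)                        # B x N x D document states
qry = np.random.randn(B, Q, D)                        # B x Q x D query states
qmask = np.array([[1.0, 1.0, 0.0]])                   # B x Q, last query position is padding

M = np.matmul(doc, qry.transpose(0, 2, 1))            # B x N x Q match scores
alphas = np.exp(M - M.max(axis=2, keepdims=True))
alphas *= qmask[:, None, :]                           # drop padded query positions
alphas /= alphas.sum(axis=2, keepdims=True)           # normalised attention, B x N x Q
q_rep = np.matmul(alphas, qry)                        # B x N x D query-aware representation
gated_doc = doc * q_rep                               # element-wise gating of the document
print(gated_doc.shape)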
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2,
             mode, rnn_num_units, batch_norm, **kwargs):

    print "==> not used params in DMN class:", kwargs.keys()
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.l2 = l2
    self.mode = mode
    self.num_units = rnn_num_units
    self.batch_norm = batch_norm

    self.input_var = T.tensor3('input_var')
    self.answer_var = T.ivector('answer_var')

    # scale inputs to be in [-1, 1]
    input_var_norm = 2 * self.input_var - 1

    print "==> building network"
    example = np.random.uniform(size=(self.batch_size, 858, 256),
                                low=0.0, high=1.0).astype(np.float32)          #########
    answer = np.random.randint(low=0, high=176, size=(self.batch_size, ))      #########

    # InputLayer
    network = layers.InputLayer(shape=(None, 858, 256), input_var=input_var_norm)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # GRULayer
    network = layers.GRULayer(incoming=network, num_units=self.num_units)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # BatchNormalization Layer
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)
        print layers.get_output(network).eval({self.input_var: example}).shape

    # GRULayer
    network = layers.GRULayer(incoming=network, num_units=self.num_units,
                              only_return_final=True)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # Last layer: classification
    network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
    print layers.get_output(network).eval({self.input_var: example}).shape

    self.params = layers.get_all_params(network, trainable=True)
    self.prediction = layers.get_output(network)

    self.loss_ce = lasagne.objectives.categorical_crossentropy(
        self.prediction, self.answer_var).mean()
    if (self.l2 > 0):
        self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(
            network, lasagne.regularization.l2)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    #updates = lasagne.updates.adadelta(self.loss, self.params)
    updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.input_var, self.answer_var],
        outputs=[self.prediction, self.loss])
# NOTE: these constants are the shapes of the last pool layer; they could be symbolic,
# but explicit values are better for optimizations
num_channels = 32
filter_W = 852
filter_H = 8

# InputLayer
network = layers.InputLayer(shape=(None, filter_W, num_channels * filter_H),
                            input_var=output)
print layers.get_output(network).eval({input_var: example}).shape

# GRULayer
network = layers.GRULayer(incoming=network, num_units=num_units, only_return_final=True)
print layers.get_output(network).eval({input_var: example}).shape

if (batch_norm):
    network = layers.BatchNormLayer(incoming=network)
if (dropout > 0):
    network = layers.dropout(network, dropout)

# Last layer: classification
network = layers.DenseLayer(incoming=network, num_units=3, nonlinearity=softmax)
print layers.get_output(network).eval({input_var: example}).shape

params += layers.get_all_params(network, trainable=True)
prediction = layers.get_output(network)
index = T.iscalar("index")
batch_size = 500
n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size
n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size

l_in = layers.InputLayer(shape=(None, seq_len, feature_num), input_var=input_var)
#l_rec = layers.RecurrentLayer(incoming=l_in,
#                              num_units=100,
#                              W_hid_to_hid=init_diagnal(100),
#                              b=init_constant(size=(100,)),
#                              nonlinearity=lasagne.nonlinearities.rectify,
#                              grad_clipping=1)
l_rec = layers.GRULayer(incoming=l_in, num_units=hidden_unit, only_return_final=True)
l_out = layers.DenseLayer(incoming=l_rec, num_units=2,
                          nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
sum = 0
for p in params:
    shape = p.shape.eval()
    print shape
    if len(shape) > 1:
def build_network(self, K, vocab_size, doc_var, query_var, docmask_var, qmask_var, candmask_var, feat_var, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var) l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var) l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var) l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var) l_featin = L.InputLayer(shape=(None, None), input_var=feat_var) l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=EMBED_DIM, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_var.shape[0], doc_var.shape[1], EMBED_DIM)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=EMBED_DIM, W=l_docembed.W) l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 if not EMB_TRAIN: l_docembed.params[l_docembed.W].remove('trainable') l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1) l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1) l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice]) # B x 2D q = L.get_output(l_q) # B x 2D l_qs = [l_q] for i in range(K - 1): l_fwd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) l_fwd_q_1 = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_1 = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1) l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1) l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1, l_bkd_q_slice_1]) # B x DE l_qs.append(l_q_c_1) qd = L.get_output(l_q_c_1) q_rep = T.reshape(T.tile(qd, (1, doc_var.shape[1])), (doc_var.shape[0], doc_var.shape[1], 2 * NUM_HIDDEN)) # B x N x DE l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN), input_var=q_rep) l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul) l_doce = L.dropout(l_doc_2_in, p=DROPOUT_RATE) # B x N x DE l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2 l_fwd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) d = L.get_output(l_doc) # B x N x 2D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax(p) * candmask_var pm = pm / pm.sum(axis=1)[:, np.newaxis] index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape) final = T.inc_subtensor(T.alloc(0.,p.shape[0],vocab_size)[index,T.flatten(doc_var,outdim=2)],\ pm) dv = L.get_output(l_doc, deterministic=True) # B x N x 2D p = T.batched_dot(dv, q) # B x N pm = T.nnet.softmax(p) * candmask_var pm = pm / pm.sum(axis=1)[:, np.newaxis] index = T.reshape(T.repeat(T.arange(p.shape[0]), 
p.shape[1]), p.shape) final_v = T.inc_subtensor(T.alloc(0.,p.shape[0],vocab_size)[index,\ T.flatten(doc_var,outdim=2)],pm) return final, final_v, l_doc, l_qs
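# A numpy sketch (toy sizes) of the inc_subtensor trick used just above: the
# per-position attention pm is scattered into a vocabulary-sized distribution, so
# repeated words in the document accumulate their probability mass.
import numpy as np

vocab_size = 10
doc_ids = np.array([[3, 7, 3, 0],                     # B x N word ids
                    [5, 5, 2, 9]])
pm = np.array([[0.5, 0.3, 0.2, 0.0],                  # B x N attention over positions
               [0.25, 0.25, 0.4, 0.1]])

final = np.zeros((doc_ids.shape[0], vocab_size))
rows = np.repeat(np.arange(doc_ids.shape[0]), doc_ids.shape[1]).reshape(doc_ids.shape)
np.add.at(final, (rows, doc_ids), pm)                 # accumulates like T.inc_subtensor
print(final[0, 3])                                    # 0.7: word 3 occurs twice in doc 0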
# Recurrent layers expect input of shape
# (batch size, max sequence length, number of features)
l_in = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 2))
# The network also needs a way to provide a mask for each sequence. We'll
# use a separate input layer for that. Since the mask only determines
# which indices are part of the sequence for each batch entry, they are
# supplied as matrices of dimensionality (N_BATCH, MAX_LENGTH)
l_mask = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH))
# We're using a bidirectional network, which means we will combine two
# GRULayers, one with the backwards=True keyword argument.
# Setting a value for grad_clipping will clip the gradients in the layer.
# Setting only_return_final=True makes the layers only return their output
# for the final time step, which is all we need for this task.
l_forward = layers.GRULayer(l_in, N_HIDDEN, mask_input=l_mask,
                            grad_clipping=GRAD_CLIP, only_return_final=True)
l_backward = layers.GRULayer(l_in, N_HIDDEN, mask_input=l_mask,
                             grad_clipping=GRAD_CLIP, only_return_final=True,
                             backwards=True)
# Now, we'll concatenate the outputs to combine them.
l_concat = layers.ConcatLayer([l_forward, l_backward])
# Our output layer is a simple dense connection, with 1 output unit
l_out = layers.DenseLayer(l_concat, num_units=1,
                          nonlinearity=lasagne.nonlinearities.tanh)
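# A possible continuation (reusing l_in, l_mask and l_out from above): how this toy
# bidirectional-GRU regressor would typically be compiled and trained with
# Lasagne/Theano. The squared-error cost and the Adam optimiser are assumptions, not
# part of the original snippet.
import theano
import theano.tensor as T
import lasagne

target_values = T.vector('target_output')
network_output = lasagne.layers.get_output(l_out)
cost = T.mean((network_output.flatten() - target_values) ** 2)
all_params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(cost, all_params, learning_rate=1e-3)

train = theano.function([l_in.input_var, target_values, l_mask.input_var],
                        cost, updates=updates)
compute_cost = theano.function([l_in.input_var, target_values, l_mask.input_var], cost)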
def build_network(self, K, vocab_size, doc_var, query_var, cand_var, docmask_var, qmask_var, candmask_var, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var) l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var) l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var) l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var) l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=self.embed_dim, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_var.shape[0], doc_var.shape[1], self.embed_dim)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=self.embed_dim, W=l_docembed.W) if self.train_emb == 0: l_docembed.params[l_docembed.W].remove('trainable') l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1) l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1) l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice]) # B x 2D q = L.get_output(l_q) # B x 2D l_qs = [l_q] for i in range(K - 1): l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1) l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1) l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1, l_bkd_q_slice_1]) # B x DE l_qs.append(l_q_c_1) qd = L.get_output(l_q_c_1) q_rep = T.reshape(T.tile(qd, (1, doc_var.shape[1])), (doc_var.shape[0], doc_var.shape[1], 2 * self.nhidden)) # B x N x DE l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden), input_var=q_rep) l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul) l_doce = L.dropout(l_doc_2_in, p=self.dropout) l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) d = L.get_output(l_doc) # B x N x 2D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax(p) * candmask_var pm = pm / pm.sum(axis=1)[:, np.newaxis] final = T.batched_dot(pm, cand_var) dv = L.get_output(l_doc, deterministic=True) # B x N x 2D p = T.batched_dot(dv, q) # B x N pm = T.nnet.softmax(p) * candmask_var pm = pm / pm.sum(axis=1)[:, np.newaxis] final_v = T.batched_dot(pm, cand_var) return final, final_v, l_doc, l_qs, l_docembed.W
def _init_model(self, in_size, out_size, slot_sizes, db, \ n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \ inputtype='full', sl='e2e', rl='e2e'): self.in_size = in_size self.out_size = out_size self.slot_sizes = slot_sizes self.batch_size = batch_size self.learning_rate = learning_rate_rl self.n_hid = n_hid self.r_hid = self.n_hid self.sl = sl self.rl = rl table = db.table counts = db.counts m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots] prior = [db.priors[s] for s in dialog_config.inform_slots] unknown = [db.unks[s] for s in dialog_config.inform_slots] ids = [db.ids[s] for s in dialog_config.inform_slots] input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \ T.btensor3('am'), T.fvector('r') T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable( counts) db_index_var = T.imatrix('db') db_index_switch = T.bvector('s') l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask) flat_mask = T.reshape(turn_mask, (turn_mask.shape[0] * turn_mask.shape[1], 1)) def _smooth(p): p_n = p + EPS return p_n / (p_n.sum(axis=1)[:, np.newaxis]) def _add_unk(p, m, N): # p: B x V, m- num missing, N- total, p0: 1 x V t_unk = T.as_tensor_variable(float(m) / N) ps = p * (1. - t_unk) return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1) def kl_divergence(p, q): p_n = _smooth(p) return -T.sum(q * T.log(p_n), axis=1) # belief tracking l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var) p_vars = [] pu_vars = [] phi_vars = [] p_targets = [] phi_targets = [] hid_in_vars = [] hid_out_vars = [] bt_loss = T.as_tensor_variable(0.) kl_loss = [] x_loss = [] self.trackers = [] for i, s in enumerate(dialog_config.inform_slots): hid_in = T.fmatrix('h') l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in, \ mask_input=l_mask_in, grad_clipping=10.) 
# B x H x D l_b_in = L.ReshapeLayer(l_rnn, (input_var.shape[0] * input_var.shape[1], self.r_hid)) # BH x D hid_out = L.get_output(l_rnn)[:, -1, :] p_targ = T.ftensor3('p_target_' + s) p_t = T.reshape( p_targ, (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i])) phi_targ = T.fmatrix('phi_target' + s) phi_t = T.reshape(phi_targ, (phi_targ.shape[0] * phi_targ.shape[1], 1)) l_b = L.DenseLayer(l_b_in, self.slot_sizes[i], nonlinearity=lasagne.nonlinearities.softmax) l_phi = L.DenseLayer(l_b_in, 1, nonlinearity=lasagne.nonlinearities.sigmoid) phi = T.clip(L.get_output(l_phi), 0.01, 0.99) p = L.get_output(l_b) p_u = _add_unk(p, m_unk[i], db.N) kl_loss.append( T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) / T.sum(flat_mask)) x_loss.append( T.sum(flat_mask * lasagne.objectives.binary_crossentropy(phi, phi_t)) / T.sum(flat_mask)) bt_loss += kl_loss[-1] + x_loss[-1] p_vars.append(p) pu_vars.append(p_u) phi_vars.append(phi) p_targets.append(p_targ) phi_targets.append(phi_targ) hid_in_vars.append(hid_in) hid_out_vars.append(hid_out) self.trackers.append(l_b) self.trackers.append(l_phi) self.bt_params = L.get_all_params(self.trackers) def check_db(pv, phi, Tb, N): O = T.alloc(0., pv[0].shape[0], Tb.shape[0]) # BH x T.shape[0] for i, p in enumerate(pv): p_dc = T.tile(phi[i], (1, Tb.shape[0])) O += T.log(p_dc*(1./db.table.shape[0]) + \ (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i])) Op = T.exp(O) #+EPS # BH x T.shape[0] Os = T.sum(Op, axis=1)[:, np.newaxis] # BH x 1 return Op / Os def entropy(p): p = _smooth(p) return -T.sum(p * T.log(p), axis=-1) def weighted_entropy(p, q, p0, unks, idd): w = T.dot(idd, q.transpose()) # Pi x BH u = p0[np.newaxis, :] * (q[:, unks].sum(axis=1)[:, np.newaxis] ) # BH x Pi p_tilde = w.transpose() + u return entropy(p_tilde) p_db = check_db(pu_vars, phi_vars, T_var, N_var) # BH x T.shape[0] if inputtype == 'entropy': H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \ for i,pv in enumerate(p_vars)] H_db = entropy(p_db) phv = [ph[:, 0] for ph in phi_vars] t_in = T.stacklists(H_vars + phv + [H_db]).transpose() # BH x 2M+1 t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \ t_in.shape[1])) # B x H x 2M+1 l_in_pol = L.InputLayer( shape=(None,None,2*len(dialog_config.inform_slots)+1), \ input_var=t_in_resh) else: in_reshaped = T.reshape(input_var, (input_var.shape[0]*input_var.shape[1], \ input_var.shape[2])) prev_act = in_reshaped[:, -len(dialog_config.inform_slots):] t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act], axis=1) # BH x D-sum+A t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \ t_in.shape[1])) # B x H x D-sum l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \ 3*len(dialog_config.inform_slots)+ \ table.shape[0]), input_var=t_in_resh) pol_in = T.fmatrix('pol-h') l_pol_rnn = L.GRULayer(l_in_pol, n_hid, hid_init=pol_in, mask_input=l_mask_in, grad_clipping=10.) 
# B x H x D pol_out = L.get_output(l_pol_rnn)[:, -1, :] l_den_in = L.ReshapeLayer( l_pol_rnn, (turn_mask.shape[0] * turn_mask.shape[1], n_hid)) # BH x D l_out = L.DenseLayer(l_den_in, self.out_size, \ nonlinearity=lasagne.nonlinearities.softmax) # BH x A self.network = l_out self.pol_params = L.get_all_params(self.network) self.params = self.bt_params + self.pol_params # db loss p_db_reshaped = T.reshape( p_db, (turn_mask.shape[0], turn_mask.shape[1], table.shape[0])) p_db_final = p_db_reshaped[:, -1, :] # B x T.shape[0] p_db_final = _smooth(p_db_final) ix = T.tile(T.arange(p_db_final.shape[0]), (db_index_var.shape[1], 1)).transpose() sample_probs = p_db_final[ix, db_index_var] # B x K if dialog_config.SUCCESS_MAX_RANK == 1: log_db_probs = T.log(sample_probs).sum(axis=1) else: cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \ outputs_info=T.zeros_like(sample_probs[:,0]), \ sequences=sample_probs[:,:-1].transpose()) cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5) # B x K-1 log_db_probs = T.log(sample_probs).sum( axis=1) - T.log(1. - cum_probs).sum(axis=1) # B log_db_probs = log_db_probs * db_index_switch # rl probs = L.get_output(self.network) # BH x A probs = _smooth(probs) out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1], self.out_size)) # B x H x A log_probs = T.log(out_probs) act_probs = (log_probs * act_mask).sum(axis=2) # B x H ep_probs = (act_probs * turn_mask).sum(axis=1) # B H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1) # B self.act_loss = -T.mean(ep_probs * reward_var) self.db_loss = -T.mean(log_db_probs * reward_var) self.reg_loss = -T.mean(ment * H_probs) self.loss = self.act_loss + self.db_loss + self.reg_loss self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \ pol_in] + hid_in_vars self.obj_fn = theano.function(self.inps, self.loss, on_unused_input='warn') self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \ [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn') self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss], on_unused_input='warn') self._rl_train_fn(self.learning_rate) ## sl sl_loss = 0. + bt_loss - T.mean(ep_probs) if self.sl == 'e2e': sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) elif self.sl == 'bel': sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) else: sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) sl_inps = [input_var, turn_mask, act_mask, pol_in ] + p_targets + phi_targets + hid_in_vars self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \ on_unused_input='warn') self.sl_obj_fn = theano.function(sl_inps, sl_loss, on_unused_input='warn')
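# A numpy sketch (tiny hand-made table, and ignoring the unknown-value bucket) of the
# soft-KB lookup that check_db above performs: every database row gets a score mixing,
# per slot, a "don't care" uniform term of weight phi with the tracked belief over
# that row's slot value, normalised over rows.
import numpy as np

table = np.array([[0, 1],          # 3 rows x 2 slots, entries are value ids
                  [0, 0],
                  [1, 1]])
counts = np.array([[2, 1],         # counts[v, s]: rows taking value v in slot s
                   [1, 2]])
p = [np.array([0.7, 0.3]),         # tracked belief over values of slot 0
     np.array([0.4, 0.6])]         # tracked belief over values of slot 1
phi = [0.1, 0.2]                   # P("don't care") per slot

log_score = np.zeros(table.shape[0])
for s in range(table.shape[1]):
    v = table[:, s]
    log_score += np.log(phi[s] / table.shape[0] +
                        (1.0 - phi[s]) * p[s][v] / counts[v, s])
post = np.exp(log_score)
post /= post.sum()
print(post)                        # posterior over the 3 rows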
def __init__(self, n_inputs, n_outputs, n_components=1, n_filters=[], n_hiddens=[10, 10], n_rnn=None, impute_missing=True, seed=None, svi=True): """Initialize a mixture density network with custom layers Parameters ---------- n_inputs : int or tuple of ints or list of ints Dimensionality of input n_outputs : int Dimensionality of output n_components : int Number of components of the mixture density n_filters : list of ints Number of filters per convolutional layer n_hiddens : list of ints Number of hidden units per fully connected layer n_rnn : None or int Number of RNN units impute_missing : bool If set to True, learns replacement value for NaNs, otherwise those inputs are set to zero seed : int or None If provided, random number generator will be seeded svi : bool Whether to use SVI version or not """ self.impute_missing = impute_missing self.n_components = n_components self.n_filters = n_filters self.n_hiddens = n_hiddens self.n_outputs = n_outputs self.svi = svi self.iws = tt.vector('iws', dtype=dtype) if n_rnn is None: self.n_rnn = 0 else: self.n_rnn = n_rnn if self.n_rnn > 0 and len(self.n_filters) > 0: raise NotImplementedError self.seed = seed if seed is not None: self.rng = np.random.RandomState(seed=seed) else: self.rng = np.random.RandomState() lasagne.random.set_rng(self.rng) # cast n_inputs to tuple if type(n_inputs) is int: self.n_inputs = (n_inputs, ) elif type(n_inputs) is list: self.n_inputs = tuple(n_inputs) elif type(n_inputs) is tuple: self.n_inputs = n_inputs else: raise ValueError('n_inputs type not supported') # compose layers self.layer = collections.OrderedDict() # stats : input placeholder, (batch, *self.n_inputs) if len(self.n_inputs) + 1 == 2: self.stats = tt.matrix('stats', dtype=dtype) elif len(self.n_inputs) + 1 == 3: self.stats = tt.tensor3('stats', dtype=dtype) elif len(self.n_inputs) + 1 == 4: self.stats = tt.tensor4('stats', dtype=dtype) else: raise NotImplementedError # input layer self.layer['input'] = ll.InputLayer((None, *self.n_inputs), input_var=self.stats) # learn replacement values if self.impute_missing: self.layer['missing'] = dl.ImputeMissingLayer( last(self.layer), n_inputs=self.n_inputs) else: self.layer['missing'] = dl.ReplaceMissingLayer( last(self.layer), n_inputs=self.n_inputs) # recurrent neural net # expects shape (batch, sequence_length, num_inputs) if self.n_rnn > 0: if len(self.n_inputs) == 1: rs = (-1, *self.n_inputs, 1) self.layer['rnn_reshape'] = ll.ReshapeLayer( last(self.layer), rs) self.layer['rnn'] = ll.GRULayer(last(self.layer), n_rnn, only_return_final=True) # convolutional layers # expects shape (batch, num_input_channels, input_rows, input_columns) if len(self.n_filters) > 0: # reshape if len(self.n_inputs) == 1: raise NotImplementedError elif len(self.n_inputs) == 2: rs = (-1, 1, *self.n_inputs) else: rs = None if rs is not None: self.layer['conv_reshape'] = ll.ReshapeLayer( last(self.layer), rs) # add layers for l in range(len(n_filters)): self.layer['conv_' + str(l + 1)] = ll.Conv2DLayer( name='c' + str(l + 1), incoming=last(self.layer), num_filters=n_filters[l], filter_size=3, stride=(2, 2), pad=0, untie_biases=False, W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.), nonlinearity=lnl.rectify, flip_filters=True, convolution=tt.nnet.conv2d) # flatten self.layer['flatten'] = ll.FlattenLayer(incoming=last(self.layer), outdim=2) # hidden layers for l in range(len(n_hiddens)): self.layer['hidden_' + str(l + 1)] = dl.FullyConnectedLayer( last(self.layer), n_units=n_hiddens[l], svi=svi, name='h' + str(l + 1)) 
last_hidden = last(self.layer) # mixture layers self.layer['mixture_weights'] = dl.MixtureWeightsLayer( last_hidden, n_units=n_components, actfun=lnl.softmax, svi=svi, name='weights') self.layer['mixture_means'] = dl.MixtureMeansLayer( last_hidden, n_components=n_components, n_dim=n_outputs, svi=svi, name='means') self.layer['mixture_precisions'] = dl.MixturePrecisionsLayer( last_hidden, n_components=n_components, n_dim=n_outputs, svi=svi, name='precisions') last_mog = [ self.layer['mixture_weights'], self.layer['mixture_means'], self.layer['mixture_precisions'] ] # output placeholder self.params = tt.matrix('params', dtype=dtype) # (batch, self.n_outputs) # mixture parameters # a : weights, matrix with shape (batch, n_components) # ms : means, list of len n_components with (batch, n_dim, n_dim) # Us : precision factors, n_components list with (batch, n_dim, n_dim) # ldetUs : log determinants of precisions, n_comp list with (batch, ) self.a, self.ms, precision_out = ll.get_output(last_mog, deterministic=False) self.Us = precision_out['Us'] self.ldetUs = precision_out['ldetUs'] self.comps = { **{ 'a': self.a }, **{'m' + str(i): self.ms[i] for i in range(self.n_components)}, **{'U' + str(i): self.Us[i] for i in range(self.n_components)} } # log probability of y given the mixture distribution # lprobs_comps : log probs per component, list of len n_components with (batch, ) # probs : log probs of mixture, (batch, ) self.lprobs_comps = [ -0.5 * tt.sum(tt.sum( (self.params - m).dimshuffle([0, 'x', 1]) * U, axis=2)**2, axis=1) + ldetU for m, U, ldetU in zip(self.ms, self.Us, self.ldetUs) ] self.lprobs = (MyLogSumExp(tt.stack(self.lprobs_comps, axis=1) + tt.log(self.a), axis=1) \ - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze() # the quantities from above again, but with deterministic=True # --- in the svi case, this will disable injection of randomness; # the mean of weights is used instead self.da, self.dms, dprecision_out = ll.get_output(last_mog, deterministic=True) self.dUs = dprecision_out['Us'] self.dldetUs = dprecision_out['ldetUs'] self.dcomps = { **{ 'a': self.da }, **{'m' + str(i): self.dms[i] for i in range(self.n_components)}, **{'U' + str(i): self.dUs[i] for i in range(self.n_components)} } self.dlprobs_comps = [ -0.5 * tt.sum(tt.sum( (self.params - m).dimshuffle([0, 'x', 1]) * U, axis=2)**2, axis=1) + ldetU for m, U, ldetU in zip(self.dms, self.dUs, self.dldetUs) ] self.dlprobs = (MyLogSumExp(tt.stack(self.dlprobs_comps, axis=1) + tt.log(self.da), axis=1) \ - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze() # parameters of network self.aps = ll.get_all_params(last_mog) # all parameters self.mps = ll.get_all_params(last_mog, mp=True) # means self.sps = ll.get_all_params(last_mog, sp=True) # log stds # weight and bias parameter sets as seperate lists self.mps_wp = ll.get_all_params(last_mog, mp=True, wp=True) self.sps_wp = ll.get_all_params(last_mog, sp=True, wp=True) self.mps_bp = ll.get_all_params(last_mog, mp=True, bp=True) self.sps_bp = ll.get_all_params(last_mog, sp=True, bp=True) # theano functions self.compile_funs() self.iws = tt.vector('iws', dtype=dtype)
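# A NumPy sketch of the mixture log-density assembled above: per-component
# log N(x; m_k, (U_k^T U_k)^-1) terms combined by log-sum-exp with the mixture
# weights a_k. The precision factors U_k are assumed to have positive
# determinant; the Theano graph receives log|det U_k| directly from the layer.
import numpy as np
from scipy.special import logsumexp

def mog_logprob(x, a, ms, Us):
    D = x.shape[-1]
    comps = []
    for m, U in zip(ms, Us):
        z = U @ (x - m)                                   # whitened residual
        comps.append(-0.5 * (z @ z) + np.linalg.slogdet(U)[1])
    return logsumexp(np.array(comps) + np.log(a)) - 0.5 * D * np.log(2 * np.pi)

x = np.zeros(2)
a = np.array([0.5, 0.5])
ms = [np.zeros(2), np.ones(2)]
Us = [np.eye(2), 2.0 * np.eye(2)]
print(mog_logprob(x, a, ms, Us))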
def __init__(self, n_inputs=None, n_outputs=None, input_shape=None, n_bypass=0, density='mog', n_hiddens=(10, 10), impute_missing=True, seed=None, n_filters=(), filter_sizes=3, pool_sizes=2, n_rnn=0, **density_opts): """Initialize a mixture density network with custom layers Parameters ---------- n_inputs : int Total input dimensionality (data/summary stats) n_outputs : int Dimensionality of output (simulator parameters) input_shape : tuple Size to which data are reshaped before CNN or RNN n_bypass : int Number of elements at end of input which bypass CNN or RNN density : string Type of density condition on the network, can be 'mog' or 'maf' n_components : int Number of components of the mixture density n_filters : list of ints Number of filters per convolutional layer n_hiddens : list of ints Number of hidden units per fully connected layer n_rnn : None or int Number of RNN units impute_missing : bool If set to True, learns replacement value for NaNs, otherwise those inputs are set to zero seed : int or None If provided, random number generator will be seeded density_opts : dict Options for the density estimator """ if n_rnn > 0 and len(n_filters) > 0: raise NotImplementedError assert isint(n_inputs) and isint(n_outputs)\ and n_inputs > 0 and n_outputs > 0 self.density = density.lower() self.impute_missing = impute_missing self.n_hiddens = list(n_hiddens) self.n_outputs, self.n_inputs = n_outputs, n_inputs self.n_bypass = n_bypass self.n_rnn = n_rnn self.n_filters, self.filter_sizes, self.pool_sizes, n_cnn = \ list(n_filters), filter_sizes, pool_sizes, len(n_filters) if type(self.filter_sizes) is int: self.filter_sizes = [self.filter_sizes for _ in range(n_cnn)] else: assert len(self.filter_sizes) >= n_cnn if type(self.pool_sizes) is int: self.pool_sizes = [self.pool_sizes for _ in range(n_cnn)] else: assert len(self.pool_sizes) >= n_cnn self.iws = tt.vector('iws', dtype=dtype) self.seed = seed if seed is not None: self.rng = np.random.RandomState(seed=seed) else: self.rng = np.random.RandomState() lasagne.random.set_rng(self.rng) self.input_shape = (n_inputs,) if input_shape is None else input_shape assert np.prod(self.input_shape) + self.n_bypass == self.n_inputs assert 1 <= len(self.input_shape) <= 3 # params: output placeholder (batch, self.n_outputs) self.params = tensorN(2, name='params', dtype=dtype) # stats : input placeholder, (batch, self.n_inputs) self.stats = tensorN(2, name='stats', dtype=dtype) # compose layers self.layer = collections.OrderedDict() # input layer, None indicates batch size not fixed at compile time self.layer['input'] = ll.InputLayer( (None, self.n_inputs), input_var=self.stats) # learn replacement values if self.impute_missing: self.layer['missing'] = \ dl.ImputeMissingLayer(last(self.layer), n_inputs=(self.n_inputs,)) else: self.layer['missing'] = \ dl.ReplaceMissingLayer(last(self.layer), n_inputs=(self.n_inputs,)) if self.n_bypass > 0 and (self.n_rnn > 0 or n_cnn > 0): last_layer = last(self.layer) bypass_slice = slice(self.n_inputs - self.n_bypass, self.n_inputs) direct_slice = slice(0, self.n_inputs - self.n_bypass) self.layer['bypass'] = ll.SliceLayer(last_layer, bypass_slice) self.layer['direct'] = ll.SliceLayer(last_layer, direct_slice) # reshape inputs prior to RNN or CNN step if self.n_rnn > 0 or n_cnn > 0: if len(n_filters) > 0 and len(self.input_shape) == 2: # 1 channel rs = (-1, 1, *self.input_shape) else: if self.n_rnn > 0: assert len(self.input_shape) == 2 # time, dim else: assert len(self.input_shape) == 3 # channel, row, col rs = (-1, 
*self.input_shape) # last layer is 'missing' or 'direct' self.layer['reshape'] = ll.ReshapeLayer(last(self.layer), rs) # recurrent neural net, input: (batch, sequence_length, num_inputs) if self.n_rnn > 0: self.layer['rnn'] = ll.GRULayer(last(self.layer), n_rnn, only_return_final=True) # convolutional net, input: (batch, channels, rows, columns) if n_cnn > 0: for l in range(n_cnn): # add layers if self.pool_sizes[l] == 1: padding = (self.filter_sizes[l] - 1) // 2 else: padding = 0 self.layer['conv_' + str(l + 1)] = ll.Conv2DLayer( name='c' + str(l + 1), incoming=last(self.layer), num_filters=self.n_filters[l], filter_size=self.filter_sizes[l], stride=(1, 1), pad=padding, untie_biases=False, W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.), nonlinearity=lnl.rectify, flip_filters=True, convolution=tt.nnet.conv2d) if self.pool_sizes[l] > 1: self.layer['pool_' + str(l + 1)] = ll.MaxPool2DLayer( name='p' + str(l + 1), incoming=last(self.layer), pool_size=self.pool_sizes[l], stride=None, ignore_border=True) # flatten self.layer['flatten'] = ll.FlattenLayer( incoming=last(self.layer), outdim=2) # incorporate bypass inputs if self.n_bypass > 0 and (self.n_rnn > 0 or n_cnn > 0): self.layer['bypass_merge'] = lasagne.layers.ConcatLayer( [self.layer['bypass'], last(self.layer)], axis=1) if self.density == 'mog': self.init_mdn(**density_opts) elif self.density == 'maf': self.init_maf(**density_opts) else: raise NotImplementedError self.compile_funs() # theano functions
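# Quick arithmetic check of the padding rule used above when pool_sizes[l] == 1:
# with stride 1 and an odd filter size f, padding p = (f - 1) // 2 keeps the
# spatial size unchanged (out = (in + 2p - f) // stride + 1).
def conv_out(n, f, p, s=1):
    return (n + 2 * p - f) // s + 1

for f in (3, 5, 7):
    assert conv_out(32, f, (f - 1) // 2) == 32
print('same-padding check passed')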
def build_network(self, vocab_size, doc_var, query_var, docmask_var, qmask_var, candmask_var, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var) l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var) l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var) l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var) l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=EMBED_DIM, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_var.shape[0], doc_var.shape[1], EMBED_DIM)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=EMBED_DIM, W=l_docembed.W) l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1) l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1) l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice]) # B x 2D q = L.get_output(l_q) # B x 2D l_fwd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) l_doc_1 = L.dropout(l_doc_1, p=DROPOUT_RATE) l_fwd_q_c = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_c = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice_c = L.SliceLayer(l_fwd_q_c, -1, 1) l_bkd_q_slice_c = L.SliceLayer(l_bkd_q_c, 0, 1) l_q_c = L.ConcatLayer([l_fwd_q_slice_c, l_bkd_q_slice_c]) # B x DE qd = L.get_output(l_q_c) q_rep = T.reshape( T.tile(qd, (1, doc_var.shape[1])), (doc_var.shape[0], doc_var.shape[1], 2 * NUM_HIDDEN)) # B x N x DE l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN), input_var=q_rep) l_doc_gru_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul) l_fwd_doc = L.GRULayer(l_doc_gru_in, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doc_gru_in, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) d = L.get_output(l_doc) # B x N x 2D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax( T.set_subtensor( T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()], p[candmask_var.nonzero()])) index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape) final = T.inc_subtensor( T.alloc(0., p.shape[0], vocab_size)[index, T.flatten(doc_var, outdim=2)], pm) dv = L.get_output(l_doc, deterministic=True) # B x N x 2D p = T.batched_dot(dv, q) # B x N pm = T.nnet.softmax( T.set_subtensor( T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()], p[candmask_var.nonzero()])) index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape) final_v = T.inc_subtensor( T.alloc(0., p.shape[0], vocab_size)[index, T.flatten(doc_var, outdim=2)], pm) return final, final_v, l_doc, [l_q, l_q_c]
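# A NumPy sketch (assumed shapes) of the two final steps above: a softmax over
# document positions restricted to candidates (non-candidates get logit -20),
# followed by a scatter-add of the token probabilities onto vocabulary ids.
import numpy as np

def candidate_softmax(scores, cand_mask):
    logits = np.where(cand_mask > 0, scores, -20.0)       # B x N
    e = np.exp(logits - logits.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def to_vocab_probs(pm, doc_ids, vocab_size):
    out = np.zeros((pm.shape[0], vocab_size))             # B x V
    for b in range(pm.shape[0]):
        np.add.at(out[b], doc_ids[b], pm[b])              # accumulate repeated ids
    return out

pm = candidate_softmax(np.random.randn(2, 5),
                       np.array([[1, 0, 1, 1, 0], [1, 1, 0, 0, 1]]))
vp = to_vocab_probs(pm, np.array([[3, 1, 3, 0, 2], [2, 2, 4, 1, 0]]), vocab_size=6)
print(vp.sum(axis=1))                                     # each row sums to 1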
def build_network(): batch_norm = False num_units = 500 # rnn hidden units number l2 = 0.0 # l2 regularization dropout = 0.5 input_var = T.tensor4('input_var') answer_var = T.ivector('answer_var') print('==> building network') example = np.random.uniform(size=(batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) answer = np.random.randint(low=0, high=176, size=(batch_size, )) network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=input_var) print(layers.get_output(network).eval({input_var: example}).shape) # conv-relu-pool 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) # conv-relu-pool 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) # conv-relu-pool 3 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) # conv-relu-pool 4 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) params = layers.get_all_params(network, trainable=True) output = layers.get_output(network) output = output.transpose((0, 3, 1, 2)) output = output.flatten(ndim=3) # This params is important num_channels = 32 filter_w = 54 filter_h = 8 network = layers.InputLayer(shape=(None, filter_w, num_channels * filter_h), input_var=output) print(layers.get_output(network).eval({input_var: example}).shape) network = layers.GRULayer(incoming=network, num_units=num_units, only_return_final=True) print(layers.get_output(network).eval({input_var: example}).shape) if batch_norm: network = layers.BatchNormLayer(incoming=network) if dropout > 0: network = layers.dropout(network, dropout) # last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print(layers.get_output(network).eval({input_var: example}).shape) params += layers.get_all_params(network, trainable=True) prediction = layers.get_output(network) print('==> param shapes', [x.eval().shape for x in params]) loss_ce = lasagne.objectives.categorical_crossentropy( prediction, answer_var).mean() if l2 > 0: loss_l2 = l2 * lasagne.regularization.apply_penalty( params, lasagne.regularization.l2) else: loss_l2 = 0 loss = loss_ce + loss_l2 # updates = lasagne.updates.adadelta(loss, params) updates = lasagne.updates.momentum(loss, params, 
learning_rate=0.003) # good one # updates = lasagne.updates.momentum(loss, params, learning_rate=0.0003) # good one print('==> compiling train_fn') train_fn = theano.function(inputs=[input_var, answer_var], outputs=[prediction, loss], updates=updates) test_fn = theano.function(inputs=[input_var, answer_var], outputs=[prediction, loss]) return train_fn, test_fn
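# A minimal usage sketch for the two functions returned above, with random
# stand-in data; `batch_size` must exist at module scope because build_network()
# uses it for its internal shape printouts.
import numpy as np

batch_size = 4
train_fn, test_fn = build_network()
x = np.random.uniform(size=(batch_size, 1, 128, 858)).astype(np.float32)
y = np.random.randint(0, 176, size=(batch_size,)).astype(np.int32)
pred, loss = train_fn(x, y)
print('train loss %.4f' % float(loss))
pred, loss = test_fn(x, y)
print('test loss %.4f' % float(loss))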
def build_network(self, K, vocab_size, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0]) l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1]) l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2]) l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3]) l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6]) l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7]) l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8]) l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[9]) l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11]) doc_shp = self.inps[1].shape qry_shp = self.inps[3].shape l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=self.embed_dim, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer( l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim)) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=self.embed_dim, W=l_docembed.W) l_qembed = L.ReshapeLayer( l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim)) # B x N x DE l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 if self.train_emb == 0: l_docembed.params[l_docembed.W].remove('trainable') # char embeddings if self.use_chars: l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2 * self.char_dim) # T x L x D l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=True) l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=True) # T x 2D l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2, nonlinearity=None) # T x DE/2 l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2, nonlinearity=None) # T x DE/2 l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1) l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2 l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2 l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2) l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2) l_qembed = L.DropoutLayer(l_qembed, p=self.dropout) l_doce = L.DropoutLayer(l_doce, p=self.dropout) l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=True) l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=True) l_q = L.ConcatLayer([l_fwd_q, l_bkd_q]) # B x 2D if self.use_feat: l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2 l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) # B x N x 2D l_o = BilinearAttentionLayer([l_doc_1, l_q], 2 * self.nhidden, mask_input=self.inps[6]) # B x 2D #odim = self.embed_dim #if self.use_chars: odim += self.embed_dim/2 #if self.use_feat: odim += 2 #l_od = L.DenseLayer(l_o, odim) l_od = l_o oo = L.get_output(l_od) # B x OD d = L.get_output(l_doc_1) # B x N x OD p = T.batched_dot(d, oo) # B x N 
pm = T.nnet.softmax(p) * self.inps[10] pm = pm / pm.sum(axis=1)[:, np.newaxis] final = T.batched_dot(pm, self.inps[4]) ov = L.get_output(l_od, deterministic=True) dv = L.get_output(l_doc_1, deterministic=True) # B x N x OD p = T.batched_dot(dv, ov) # B x N pm = T.nnet.softmax(p) * self.inps[10] pm = pm / pm.sum(axis=1)[:, np.newaxis] final_v = T.batched_dot(pm, self.inps[4]) return final, final_v, l_od, [], l_docembed.W
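# NumPy sketch of the masked renormalisation above: the softmax over document
# positions is zeroed outside the candidate indicator (inps[10]) and rescaled
# so each row sums to one before the attention-sum over candidates.
import numpy as np

def masked_renorm_softmax(scores, mask):
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    p = e / e.sum(axis=1, keepdims=True)
    p = p * mask
    return p / p.sum(axis=1, keepdims=True)

print(masked_renorm_softmax(np.array([[1.0, 2.0, 3.0]]),
                            np.array([[1.0, 0.0, 1.0]])))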
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.dropout = dropout self.l2 = l2 self.mode = mode self.batch_norm = batch_norm self.num_units = rnn_num_units self.input_var = T.tensor4('input_var') self.answer_var = T.ivector('answer_var') print "==> building network" example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size, )) ######### network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var) print layers.get_output(network).eval({self.input_var: example}).shape # CONV-RELU-POOL 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var: example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(2, 1), pad=2) print layers.get_output(network).eval({self.input_var: example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var: example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(2, 1), pad=2) print layers.get_output(network).eval({self.input_var: example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 3 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var: example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(2, 1), pad=2) print layers.get_output(network).eval({self.input_var: example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 4 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print layers.get_output(network).eval({self.input_var: example}).shape network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(2, 1), pad=2) print layers.get_output(network).eval({self.input_var: example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) self.params = layers.get_all_params(network, trainable=True) output = layers.get_output(network) output = output.transpose((0, 3, 1, 2)) output = output.flatten(ndim=3) # NOTE: these constants are shapes of last pool layer, it can be symbolic # explicit values are better for optimizations num_channels = 32 filter_W = 852 filter_H = 8 # InputLayer network = layers.InputLayer(shape=(None, filter_W, num_channels * filter_H), input_var=output) print layers.get_output(network).eval({self.input_var: example}).shape # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) print layers.get_output(network).eval({self.input_var: example}).shape if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) if (self.dropout > 0): network = layers.dropout(network, self.dropout) # Last layer: classification network = layers.DenseLayer(incoming=network, 
num_units=176, nonlinearity=softmax) print layers.get_output(network).eval({self.input_var: example}).shape self.params += layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) #print "==> param shapes", [x.eval().shape for x in self.params] self.loss_ce = lasagne.objectives.categorical_crossentropy( self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty( self.params, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) # good one updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
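# NumPy sketch of the CNN-to-RNN reshape above: a (B, C, H, W) feature map is
# transposed to (B, W, C, H) and flattened to (B, W, C*H), so the GRU steps over
# the time (width) axis with C*H features per step; with C=32, H=8, W=852 this
# gives the (None, 852, 256) input layer declared above.
import numpy as np

feat = np.zeros((2, 32, 8, 852))                      # B x C x H x W
seq = feat.transpose(0, 3, 1, 2).reshape(2, 852, 32 * 8)
print(seq.shape)                                      # (2, 852, 256)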
def build_network(self, K, vocab_size, W_init): l_docin = L.InputLayer(shape=(None,None,1), input_var=self.inps[0]) l_doctokin = L.InputLayer(shape=(None,None), input_var=self.inps[1]) l_qin = L.InputLayer(shape=(None,None,1), input_var=self.inps[2]) l_qtokin = L.InputLayer(shape=(None,None), input_var=self.inps[3]) l_docmask = L.InputLayer(shape=(None,None), input_var=self.inps[6]) l_qmask = L.InputLayer(shape=(None,None), input_var=self.inps[7]) l_tokin = L.InputLayer(shape=(None,MAX_WORD_LEN), input_var=self.inps[8]) l_tokmask = L.InputLayer(shape=(None,MAX_WORD_LEN), input_var=self.inps[9]) l_featin = L.InputLayer(shape=(None,None), input_var=self.inps[11]) doc_shp = self.inps[1].shape qry_shp = self.inps[3].shape l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=self.embed_dim, W=W_init) # B x N x 1 x DE l_doce = L.ReshapeLayer(l_docembed, (doc_shp[0],doc_shp[1],self.embed_dim)) # B x N x DE l_qemb = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=self.embed_dim, W=l_docembed.W) l_qembed = L.ReshapeLayer(l_qemb, (qry_shp[0],qry_shp[1],self.embed_dim)) # B x N x DE l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 if self.train_emb==0: l_docembed.params[l_docembed.W].remove('trainable') l_qemb.params[l_qemb.W].remove('trainable') # char embeddings if self.use_chars: l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, self.char_dim) # T x L x D l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=True) l_bgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=True) # T x 2D l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim/2, nonlinearity=None) # T x DE/2 l_bckembed = L.DenseLayer(l_bgru, self.embed_dim/2, nonlinearity=None) # T x DE/2 l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1) l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2 l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2 l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2) l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2) attentions = [] if self.save_attn: l_m = PairwiseInteractionLayer([l_doce,l_qembed]) attentions.append(L.get_output(l_m, deterministic=True)) for i in range(K-1): l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) # B x N x DE l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2) # B x Q x DE l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1]) l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m], gating_fn=self.gating_fn, mask_input=self.inps[7]) l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE if self.save_attn: attentions.append(L.get_output(l_m, deterministic=True)) if self.use_feat: l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2 
# final layer l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=False) l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, only_return_final=False) l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2) # B x Q x 2D if self.save_attn: l_m = PairwiseInteractionLayer([l_doc, l_q]) attentions.append(L.get_output(l_m, deterministic=True)) l_prob = AttentionSumLayer([l_doc,l_q], self.inps[4], self.inps[12], mask_input=self.inps[10]) final = L.get_output(l_prob) final_v = L.get_output(l_prob, deterministic=True) return final, final_v, l_prob, l_docembed.W, attentions
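# A NumPy sketch (assumed shapes) of the gated-attention step applied at each
# hop in the loop above: pairwise scores between document and query tokens, a
# softmax over query positions (masked), and an element-wise gate on the
# document representation, as in GatedAttentionLayer with a multiplicative
# gating_fn.
import numpy as np

def gated_attention(D, Q, qmask):
    M = np.einsum('bnh,bqh->bnq', D, Q)               # B x N x Q pairwise scores
    M = np.where(qmask[:, None, :] > 0, M, -1e30)     # mask padded query tokens
    alpha = np.exp(M - M.max(axis=2, keepdims=True))
    alpha /= alpha.sum(axis=2, keepdims=True)         # attention over query
    q_tilde = np.einsum('bnq,bqh->bnh', alpha, Q)     # per-token query summary
    return D * q_tilde                                # gated document tokens

out = gated_attention(np.random.randn(2, 4, 6), np.random.randn(2, 3, 6),
                      np.ones((2, 3)))
print(out.shape)                                      # (2, 4, 6)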
def build_network(self, vocab_size, doc_var, query_var, docmask_var, qmask_var, W_init): l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var) l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var) l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var) l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var) l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, output_size=EMBED_DIM, W=W_init) l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size, output_size=EMBED_DIM, W=l_docembed.W) l_fwd_doc = L.GRULayer(l_docembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_doc = L.GRULayer(l_docembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2) l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True) l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True) l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1) l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1) l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice]) d = L.get_output(l_doc) # B x N x D q = L.get_output(l_q) # B x D p = T.batched_dot(d, q) # B x N pm = T.nnet.softmax( T.set_subtensor( T.alloc(-20., p.shape[0], p.shape[1])[docmask_var.nonzero()], p[docmask_var.nonzero()])) index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape) final = T.inc_subtensor( T.alloc(0., p.shape[0], vocab_size)[index, T.flatten(doc_var, outdim=2)], pm) #qv = T.flatten(query_var,outdim=2) #index2 = T.reshape(T.repeat(T.arange(qv.shape[0]),qv.shape[1]),qv.shape) #xx = index2[qmask_var.nonzero()] #yy = qv[qmask_var.nonzero()] #pV = T.set_subtensor(final[xx,yy], T.zeros_like(qv[xx,yy])) return final, l_doc, l_q
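# NumPy sketch of the query encoding above: the last time step of the forward
# GRU is concatenated with the first time step of the backward GRU (which
# summarises the whole sequence read in reverse), giving one fixed-size query
# vector per batch element.
import numpy as np

fwd = np.random.randn(2, 5, 7)                        # B x Q x H forward states
bkd = np.random.randn(2, 5, 7)                        # B x Q x H backward states
q = np.concatenate([fwd[:, -1, :], bkd[:, 0, :]], axis=1)
print(q.shape)                                        # (2, 14), i.e. B x 2H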
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2, mode, rnn_num_units, **kwargs): self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.l2 = l2 self.mode = mode self.num_units = rnn_num_units self.input_var = T.tensor3('input_var') self.answer_var = T.ivector('answer_var') ### GRU network architecture example = np.random.uniform(size=(self.batch_size, 768, 256), low=0.0, high=1.0).astype(np.float32) answer = np.random.randint(low=0, high=176, size=(self.batch_size, )) # Input layer network = layers.InputLayer(shape=(None, 768, 256), input_var=self.input_var) # GRU layer: network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) # Last layer of the network network = layers.DenseLayer(incoming=network, num_units=122, nonlinearity=softmax) self.params = layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) self.loss_ce = lasagne.objectives.categorical_crossentropy( self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params( network, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0005) if self.mode == 'train': self.train_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) self.test_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
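# Minimal usage sketch with random stand-in data; `model` is a hypothetical
# instance of the class whose __init__ is shown above, constructed with
# mode='train' so that train_fn/test_fn are compiled.
import numpy as np

x = np.random.uniform(size=(4, 768, 256)).astype(np.float32)
y = np.random.randint(0, 122, size=(4,)).astype(np.int32)
pred, loss = model.train_fn(x, y)
print(float(loss))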