def _get_normalised_relevance_layer(self, layer, feeder):

    def add_epsilon(Zs):
        tmp = (T.cast(Zs >= 0, theano.config.floatX) * 2.0 - 1.0)
        return Zs + self.epsilon * tmp

    if isinstance(layer, L.DenseLayer):
        forward_layer = L.DenseLayer(layer.input_layer, layer.num_units,
                                     W=layer.W, b=layer.b, nonlinearity=None)
    elif isinstance(layer, L.Conv2DLayer):
        forward_layer = L.Conv2DLayer(layer.input_layer,
                                      num_filters=layer.num_filters,
                                      W=layer.W, b=layer.b,
                                      stride=layer.stride,
                                      filter_size=layer.filter_size,
                                      flip_filters=layer.flip_filters,
                                      untie_biases=layer.untie_biases,
                                      pad=layer.pad, nonlinearity=None)
    else:
        raise NotImplementedError()

    forward_layer = L.ExpressionLayer(forward_layer,
                                      lambda x: 1.0 / add_epsilon(x))
    feeder = L.ElemwiseMergeLayer([forward_layer, feeder],
                                  merge_function=T.mul)
    return feeder
def layer_GatedConv2DLayer(incoming, num_filters, filter_size, stride=(1, 1),
                           pad=0,
                           nonlinearity=lasagne.nonlinearities.very_leaky_rectify,
                           name=''):
    # Activation branch.
    la = ll.Conv2DLayer(incoming, num_filters=num_filters,
                        filter_size=filter_size, stride=stride, pad=pad,
                        nonlinearity=nonlinearity, name=name + '.activation')
    # Gate branch with a sigmoid nonlinearity.
    lg = ll.Conv2DLayer(incoming, num_filters=num_filters,
                        filter_size=filter_size, stride=stride, pad=pad,
                        nonlinearity=theano.tensor.nnet.nnet.sigmoid,
                        name=name + '.gate')
    # Element-wise product of activation and gate.
    lout = ll.ElemwiseMergeLayer([la, lg], T.mul, cropping=None,
                                 name=name + '.mul_merge')
    return lout
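# A minimal usage sketch (assumed shapes and layer names) for the gated
# convolution helper above: two gated blocks stacked on an image input.
# `lasagne.layers` is assumed to be imported as `ll`, as in the snippet.
import theano.tensor as T
import lasagne
import lasagne.layers as ll

images = T.tensor4('images')
l_in = ll.InputLayer(shape=(None, 3, 64, 64), input_var=images)
l_g1 = layer_GatedConv2DLayer(l_in, num_filters=32, filter_size=(3, 3),
                              pad='same', name='gated1')
l_g2 = layer_GatedConv2DLayer(l_g1, num_filters=32, filter_size=(3, 3),
                              pad='same', name='gated2')
output = ll.get_output(l_g2)  # symbolic (batch, 32, 64, 64) tensor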
def _invert_DenseLayer(self, layer, feeder):
    # Warning they are swapped here
    feeder = self._put_rectifiers(feeder, layer)
    feeder = self._get_normalised_relevance_layer(layer, feeder)

    output_units = np.prod(L.get_output_shape(layer.input_layer)[1:])
    output_layer = L.DenseLayer(feeder, num_units=output_units)
    W = output_layer.W

    tmp_shape = np.asarray((-1, ) + L.get_output_shape(output_layer)[1:])
    x_layer = L.ReshapeLayer(layer.input_layer, tmp_shape.tolist())

    output_layer = L.ElemwiseMergeLayer(incomings=[x_layer, output_layer],
                                        merge_function=T.mul)
    output_layer.W = W
    return output_layer
def _invert_Conv2DLayer(self, layer, feeder):
    # Warning they are swapped here
    feeder = self._put_rectifiers(feeder, layer)
    feeder = self._get_normalised_relevance_layer(layer, feeder)

    f_s = layer.filter_size
    if layer.pad == 'same':
        pad = 'same'
    elif layer.pad == 'valid' or layer.pad == (0, 0):
        pad = 'full'
    else:
        raise RuntimeError("Define your padding as full or same.")

    # By definition, flip_filters must be on for this to be a proper
    # deconvolution.
    num_filters = L.get_output_shape(layer.input_layer)[1]
    if layer.stride == (4, 4):
        # Todo: similar code in the gradient-based explainers. Merge.
        feeder = L.Upscale2DLayer(feeder, layer.stride, mode='dilate')
        output_layer = L.Conv2DLayer(feeder, num_filters=num_filters,
                                     filter_size=f_s, stride=1, pad=pad,
                                     nonlinearity=None, b=None,
                                     flip_filters=True)
        conv_layer = output_layer
        tmp = L.SliceLayer(output_layer, slice(0, -3), axis=3)
        output_layer = L.SliceLayer(tmp, slice(0, -3), axis=2)
        output_layer.W = conv_layer.W
    else:
        output_layer = L.Conv2DLayer(feeder, num_filters=num_filters,
                                     filter_size=f_s, stride=1, pad=pad,
                                     nonlinearity=None, b=None,
                                     flip_filters=True)
    W = output_layer.W

    # Do the multiplication.
    x_layer = L.ReshapeLayer(layer.input_layer,
                             (-1, ) + L.get_output_shape(output_layer)[1:])
    output_layer = L.ElemwiseMergeLayer(incomings=[x_layer, output_layer],
                                        merge_function=T.mul)
    output_layer.W = W
    return output_layer
def model(self, query_input, batch_size, query_vocab_size,
          context_vocab_size, emb_dim_size):
    l_input = L.InputLayer(shape=(batch_size, ), input_var=query_input)
    l_embed_continuous = L.EmbeddingLayer(l_input,
                                          input_size=query_vocab_size,
                                          output_size=emb_dim_size)
    l_values_discrete = L.EmbeddingLayer(l_input,
                                         input_size=query_vocab_size,
                                         output_size=emb_dim_size)
    l_probabilities_discrete = L.NonlinearityLayer(
        l_values_discrete, nonlinearity=lasagne.nonlinearities.softmax)
    l_embed_discrete = StochasticLayer(l_probabilities_discrete,
                                       estimator='MF')
    l_merge = L.ElemwiseSumLayer([l_embed_continuous, l_embed_discrete])
    l_out = L.DenseLayer(l_merge, num_units=emb_dim_size,
                         nonlinearity=lasagne.nonlinearities.softmax)
    l_merge_2 = L.ElemwiseMergeLayer([l_out, l_embed_discrete],
                                     merge_function=T.mul)
    l_final_out = L.DenseLayer(l_merge_2, num_units=context_vocab_size)
    return l_values_discrete, l_final_out
def __init__(self, incomings, vocab_size, emb_size, W, WT=None, **kwargs):
    super(EncodingFullLayer, self).__init__(incomings, **kwargs)
    # if len(self.input_shapes[0]) == 3:
    #     batch_size, w_count, w_length = self.input_shapes[0]
    shape = tuple(self.input_shapes[0])
    # else:
    #     shape = tuple(self.input_shapes[0])
    self.WT = None
    # self.reset_zero()
    self.l_in = LL.InputLayer(shape=shape)
    self.l_in_pe = LL.InputLayer(shape=shape + (emb_size, ))
    self.l_emb = LL.EmbeddingLayer(self.l_in, input_size=vocab_size,
                                   output_size=emb_size, W=W)
    self.W = self.l_emb.W
    self.l_emb = LL.ElemwiseMergeLayer((self.l_emb, self.l_in_pe),
                                       merge_function=T.mul)
    self.l_emb_res = LL.ExpressionLayer(self.l_emb, lambda X: X.sum(2),
                                        output_shape='auto')
    # self.l_emb_res = SumLayer(self.l_emb, axis=2)
    if np.any(WT):
        self.l_emb_res = TemporalEncodicgLayer(self.l_emb_res, T=WT)
        self.WT = self.l_emb_res.T

    params = LL.helper.get_all_params(self.l_emb_res, trainable=True)
    values = LL.helper.get_all_param_values(self.l_emb_res, trainable=True)
    for p, v in zip(params, values):
        self.add_param(p, v.shape, name=p.name)

    zero_vec_tensor = T.vector()
    self.zero_vec = np.zeros(emb_size, dtype=theano.config.floatX)
    self.set_zero = theano.function(
        [zero_vec_tensor],
        updates=[(x, T.set_subtensor(x[-1, :], zero_vec_tensor))
                 for x in [self.W]])
def build_model(hyparams, vocab, nclasses=2, batchsize=None, invar=None,
                maskvar=None, maxlen=MAXLEN):

    embedding_dim = hyparams.embedding_dim
    nhidden = hyparams.nhidden
    bidirectional = hyparams.bidirectional
    pool = hyparams.pool
    grad_clip = hyparams.grad_clip
    init = hyparams.init

    net = OrderedDict()

    V = len(vocab)
    W = lasagne.init.Normal()

    gate_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        b=lasagne.init.Constant(0.)
    )
    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh
    )

    # define model
    net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar)
    net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V,
                                      output_size=embedding_dim, W=W)
    net['fwd1'] = layer.LSTMLayer(
        net['emb'], num_units=nhidden, grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'],
        ingate=gate_params, forgetgate=gate_params, cell=cell_params,
        outgate=gate_params, learn_init=True
    )
    if bidirectional:
        net['bwd1'] = layer.LSTMLayer(
            net['emb'], num_units=nhidden, grad_clipping=grad_clip,
            nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'],
            ingate=gate_params, forgetgate=gate_params, cell=cell_params,
            outgate=gate_params, learn_init=True, backwards=True
        )

        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']],
                                               tmean)
    else:
        net['pool'] = layer.ConcatLayer([net['fwd1']])
    net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5)
    net['fwd2'] = layer.LSTMLayer(
        net['dropout1'], num_units=nhidden, grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'],
        ingate=gate_params, forgetgate=gate_params, cell=cell_params,
        outgate=gate_params, learn_init=True, only_return_final=True
    )
    net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6)
    net['softmax'] = layer.DenseLayer(
        net['dropout2'], num_units=nclasses,
        nonlinearity=lasagne.nonlinearities.softmax
    )

    ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)}
    logstr = '========== MODEL ========== \n'
    logstr += 'vocab size: %d\n' % V
    logstr += 'embedding dim: %d\n' % embedding_dim
    logstr += 'nhidden: %d\n' % nhidden
    logstr += 'pooling: %s\n' % pool
    for lname, lyr in net.items():
        logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME)))
    logstr += '=========================== \n'
    print logstr
    return net
def __init__(self, incomings, vocab_size, emb_size,
             A=lasagne.init.Normal(std=0.1), C=lasagne.init.Normal(std=0.1),
             AT=lasagne.init.Normal(std=0.1), CT=lasagne.init.Normal(std=0.1),
             nonlin=lasagne.nonlinearities.softmax, RN=0., **kwargs):
    super(MemoryLayer, self).__init__(incomings, **kwargs)

    self.vocab_size, self.emb_size = vocab_size, emb_size
    self.nonlin = nonlin
    self.RN = RN
    # self.A, self.C, self.AT, self.CT = A, C, AT, CT

    batch_size, c_count, c_length = self.input_shapes[0]
    _, q_count, _ = self.input_shapes[2]

    self.l_c_in = LL.InputLayer(shape=(batch_size, c_count, c_length))
    self.l_c_in_pe = LL.InputLayer(shape=(batch_size, c_count, c_length,
                                          self.emb_size))
    self.l_u_in = LL.InputLayer(shape=(batch_size, q_count, self.emb_size))

    self.l_c_A_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                       self.vocab_size, self.emb_size, A, AT)
    self.l_c_C_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                       self.vocab_size, self.emb_size, C, CT)
    self.A, self.C = self.l_c_A_enc.W, self.l_c_C_enc.W
    self.AT, self.CT = self.l_c_A_enc.WT, self.l_c_C_enc.WT

    if len(incomings) == 4:  # if there are also probabilities over sentences
        self.l_in_ac_prob = LL.InputLayer(shape=(batch_size, c_count,
                                                 emb_size))
        self.l_c_A_enc_ = LL.ElemwiseMergeLayer(
            (self.l_c_A_enc, self.l_in_ac_prob), merge_function=T.mul)
        self.l_c_C_enc_ = LL.ElemwiseMergeLayer(
            (self.l_c_C_enc, self.l_in_ac_prob), merge_function=T.mul)

    self.l_u_in_tr = LL.DimshuffleLayer(self.l_u_in, pattern=(0, 2, 1))
    if len(incomings) == 4:
        self.l_p = BatchedDotLayer((self.l_c_A_enc_, self.l_u_in_tr))
    else:
        self.l_p = BatchedDotLayer((self.l_c_A_enc, self.l_u_in_tr))

    if self.l_p.output_shape[2] == 1:
        self.l_p = LL.FlattenLayer(self.l_p, outdim=2)
        # self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1))

    if self.nonlin == 'MaxOut':
        raise NotImplementedError
    self.l_p = LL.NonlinearityLayer(self.l_p, nonlinearity=nonlin)
    self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1, 'x'))
    # self.l_p = LL.ReshapeLayer(self.l_p, self.l_p.output_shape + (1,))
    self.l_p = LL.ExpressionLayer(self.l_p, lambda X: X.repeat(emb_size, 2),
                                  output_shape='auto')
    # self.l_p = RepeatDimLayer(self.l_p, emb_size, axis=2)
    if len(incomings) == 4:
        self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc_),
                                          merge_function=T.mul)
    else:
        self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc),
                                          merge_function=T.mul)
    self.l_o = LL.ExpressionLayer(self.l_pc, lambda X: X.sum(1),
                                  output_shape='auto')
    # self.l_o = SumLayer(self.l_pc, axis=1)
    self.l_o = LL.DimshuffleLayer(self.l_o, pattern=(0, 'x', 1))
    self.l_o_u = LL.ElemwiseMergeLayer((self.l_o, self.l_u_in),
                                       merge_function=T.add)

    params = LL.helper.get_all_params(self.l_o_u, trainable=True)
    values = LL.helper.get_all_param_values(self.l_o_u, trainable=True)
    for p, v in zip(params, values):
        self.add_param(p, v.shape, name=p.name)
from lasagne import layers, nonlinearities
import theano.tensor as T

__all__ = [
    'take', 'minimum', 'maximum', 'concat', 'noise', 'nothing', 'dropout',
    'dense', 'select', 'batch_norm', 'elementwise', 'elementwise_sum',
    'elementwise_mean', 'flatten', 'feature_pool', 'nonlinearity'
]

get_common_nonlinearity = lambda f=None: (
    nonlinearities.LeakyRectify(0.1) if f is None else f)

minimum = lambda: lambda incomings: layers.ElemwiseMergeLayer(
    incomings, merge_function=T.minimum)

maximum = lambda: lambda incomings: layers.ElemwiseMergeLayer(
    incomings, merge_function=T.maximum)

concat = lambda axis=1: lambda incomings: layers.ConcatLayer(incomings,
                                                             axis=axis)

noise = lambda sigma=0.1: lambda incoming: (
    layers.GaussianNoiseLayer(incoming, sigma=sigma)
    if sigma is not None and sigma > 0 else incoming)

nothing = lambda incoming: incoming

dense = lambda num_units, f=None: lambda incoming: layers.DenseLayer(
    incoming, num_units=num_units,
    nonlinearity=(nonlinearities.LeakyRectify(0.05) if f is None else f))

dropout = lambda p=0.1, rescale=True: lambda incoming: (
    layers.DropoutLayer(incoming, p=p, rescale=rescale)
    if p is not None else incoming)
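# A minimal usage sketch (assumed shapes) for the combinators above: each
# factory returns a function that wraps one or more incoming layers, so they
# can be applied to a list of layers or chained.
import theano.tensor as T
from lasagne import layers

l_a = layers.InputLayer(shape=(None, 16))
l_b = layers.InputLayer(shape=(None, 16))

l_min = minimum()([l_a, l_b])            # element-wise minimum of the two inputs
l_max = maximum()([l_a, l_b])            # element-wise maximum
l_head = dense(32)(dropout(0.2)(l_min))  # dropout followed by a dense layer

y = layers.get_output(l_head)            # symbolic output tensor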
def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                  qmask_var, candmask_var, W_init):

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=EMBED_DIM,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed, (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=EMBED_DIM, W=l_docembed.W)

    l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True)
    l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True)

    l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
    l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
    l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    l_fwd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                             mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                             precompute_input=True)
    l_bkd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                             mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                             precompute_input=True, backwards=True)
    l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)
    l_doc_1 = L.dropout(l_doc_1, p=DROPOUT_RATE)

    l_fwd_q_c = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_q_c = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_fwd_q_slice_c = L.SliceLayer(l_fwd_q_c, -1, 1)
    l_bkd_q_slice_c = L.SliceLayer(l_bkd_q_c, 0, 1)
    l_q_c = L.ConcatLayer([l_fwd_q_slice_c, l_bkd_q_slice_c])  # B x DE
    qd = L.get_output(l_q_c)
    q_rep = T.reshape(
        T.tile(qd, (1, doc_var.shape[1])),
        (doc_var.shape[0], doc_var.shape[1], 2 * NUM_HIDDEN))  # B x N x DE

    l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                              input_var=q_rep)
    l_doc_gru_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)

    l_fwd_doc = L.GRULayer(l_doc_gru_in, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doc_gru_in, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    d = L.get_output(l_doc)  # B x N x 2D
    p = T.batched_dot(d, q)  # B x N
    pm = T.nnet.softmax(
        T.set_subtensor(
            T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()],
            p[candmask_var.nonzero()]))
    index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
    final = T.inc_subtensor(
        T.alloc(0., p.shape[0], vocab_size)[index,
                                            T.flatten(doc_var, outdim=2)], pm)

    dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
    p = T.batched_dot(dv, q)  # B x N
    pm = T.nnet.softmax(
        T.set_subtensor(
            T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()],
            p[candmask_var.nonzero()]))
    index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
    final_v = T.inc_subtensor(
        T.alloc(0., p.shape[0], vocab_size)[index,
                                            T.flatten(doc_var, outdim=2)], pm)

    return final, final_v, l_doc, [l_q, l_q_c]
def build_network(self, K, vocab_size, W_init, C_init):

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
    l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
    l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
    l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
    l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
    l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8])
    l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                             input_var=self.inps[9])
    l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

    doc_shp = self.inps[1].shape
    qry_shp = self.inps[3].shape

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=self.embed_dim,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
    l_qemb = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                              output_size=self.embed_dim, W=l_docembed.W)
    l_qembed = L.ReshapeLayer(
        l_qemb, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                output_size=2)  # B x N x 2

    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')
        l_qemb.params[l_qemb.W].remove('trainable')

    # char embeddings
    if self.use_subs:
        if C_init != None:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, self.sub_dim,
                                        W=C_init)  # T x L x D
        else:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                        self.sub_dim)  # T x L x D
        l_fgru = L.GRULayer(l_lookup, self.sub_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, only_return_final=True)
        l_bgru = L.GRULayer(l_lookup, self.sub_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, backwards=True,
                            only_return_final=True)  # T x 2D
        l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim,
                                  nonlinearity=None)  # T x DE/2
        l_bckembed = L.DenseLayer(l_bgru, self.embed_dim,
                                  nonlinearity=None)  # T x DE/2
        l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
        l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
        l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

        # l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
        # l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)
        # l_doce = L.ElemwiseSumLayer([l_doce, l_docchar_embed], coeffs=1)
        # l_qembed = L.ElemwiseSumLayer([l_qembed, l_qchar_embed], coeffs=1)
        l_doce = L.ElemwiseMergeLayer([l_doce, l_docchar_embed],
                                      merge_function=T.mul)
        l_qembed = L.ElemwiseMergeLayer([l_qembed, l_qchar_embed],
                                        merge_function=T.mul)

    attentions = []
    if self.save_attn:
        l_m = PairwiseInteractionLayer([l_doce, l_qembed])
        attentions.append(L.get_output(l_m, deterministic=True))

    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x DE

        l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden,
                               grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden,
                               grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2)  # B x Q x DE

        l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
        l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m],
                                         gating_fn=self.gating_fn,
                                         mask_input=self.inps[7])
        l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE
        if self.save_attn:
            attentions.append(L.get_output(l_m, deterministic=True))

    if self.use_feat:
        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2

    # final layer
    l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, only_return_final=False)
    l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True,
                         only_return_final=False)
    l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D

    if self.save_attn:
        l_m = PairwiseInteractionLayer([l_doc, l_q])
        attentions.append(L.get_output(l_m, deterministic=True))

    l_prob = AttentionSumLayer([l_doc, l_q], self.inps[4], self.inps[12],
                               mask_input=self.inps[10])
    final = L.get_output(l_prob)
    final_v = L.get_output(l_prob, deterministic=True)

    return final, final_v, l_prob, l_docembed.W, attentions
def build_network(self, K, vocab_size, doc_var, query_var, docmask_var,
                  qmask_var, candmask_var, feat_var, W_init):

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
    l_featin = L.InputLayer(shape=(None, None), input_var=feat_var)
    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=EMBED_DIM,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed, (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=EMBED_DIM, W=l_docembed.W)
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                output_size=2)  # B x N x 2

    if not EMB_TRAIN:
        l_docembed.params[l_docembed.W].remove('trainable')

    l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True)
    l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True)

    l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
    l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
    l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    l_qs = [l_q]
    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)

        l_fwd_q_1 = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1)
        l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1, l_bkd_q_slice_1])  # B x DE
        l_qs.append(l_q_c_1)

        qd = L.get_output(l_q_c_1)
        q_rep = T.reshape(
            T.tile(qd, (1, doc_var.shape[1])),
            (doc_var.shape[0], doc_var.shape[1], 2 * NUM_HIDDEN))  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                  input_var=q_rep)
        l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
        l_doce = L.dropout(l_doc_2_in, p=DROPOUT_RATE)  # B x N x DE

    l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
    l_fwd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    d = L.get_output(l_doc)  # B x N x 2D
    p = T.batched_dot(d, q)  # B x N
    pm = T.nnet.softmax(p) * candmask_var
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
    final = T.inc_subtensor(
        T.alloc(0., p.shape[0], vocab_size)[index,
                                            T.flatten(doc_var, outdim=2)], pm)

    dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
    p = T.batched_dot(dv, q)  # B x N
    pm = T.nnet.softmax(p) * candmask_var
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
    final_v = T.inc_subtensor(
        T.alloc(0., p.shape[0], vocab_size)[index,
                                            T.flatten(doc_var, outdim=2)], pm)

    return final, final_v, l_doc, l_qs
def additional_layer(self, idx_layer, emb_layer, avg=False):
    suf = '_avg' if avg else ''
    if self.name == 'char':
        if self.args.char_model == 'cnn':
            lds = L.dimshuffle(emb_layer, (0, 3, 1, 2))  # (100, 16, 26, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(
                    lds, self.args.conv_dim, (1, n), untie_biases=False,
                    # W=HeNormal('relu') if not avg else Constant(),
                    W=GlorotNormal('relu') if not avg else Constant(),
                    name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)
                lpool = L.MaxPool2DLayer(
                    lconv, (1, self.args.max_word_len - n + 1))  # (100, 64, 26, 1)
                lpool = L.flatten(lpool, outdim=3)  # (100, 16, 26)
                lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 26, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
            # additional
            # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None,
            #                   name='echar_affine', num_leading_axes=2,
            #                   W=HeNormal() if not avg else Constant())  # (100, 26, 100)
            return xc
        elif self.args.char_model == 'lstm':
            ml = L.ExpressionLayer(
                idx_layer, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_word_len))  # (1500, 32)

            gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal(),
                                           W_cell=None, nonlinearity=tanh)

            lstm_in = L.reshape(
                emb_layer, (-1, self.args.max_word_len,
                            self.config['char']['emb_dim']))  # (1500, 32, 16)
            lstm_f = L.LSTMLayer(
                lstm_in, 32, mask_input=ml, grad_clipping=10.,
                learn_init=True, peepholes=False, precompute_input=True,
                ingate=gate_params, forgetgate=gate_params,
                cell=cell_params, outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True, name='forward' + suf)  # (1500, 32)
            lstm_b = L.LSTMLayer(
                lstm_in, 32, mask_input=ml, grad_clipping=10.,
                learn_init=True, peepholes=False, precompute_input=True,
                ingate=gate_params, forgetgate=gate_params,
                cell=cell_params, outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True, backwards=True,
                name='backward' + suf)  # (1500, 32)
            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            xc = L.concat([lstm_f, lstm_b], axis=1)  # (1500, 64)
            if self.args.lstm_tagger:
                xc = L.reshape(
                    xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
            elif self.args.trans_tagger:
                xc = L.reshape(
                    xc, (-1, self.args.window_size, 64))  # (100, 15, 64)
            else:
                xc = L.reshape(xc, (-1, 26, 64))  # (100, 26, 64)
            return xc
    elif self.name == 'morph':
        # idx (100, 26/161, 16), emb (100, 26/161, 16, 32)
        if self.args.morph_model == 'max':
            xm = L.MaxPool2DLayer(
                emb_layer, (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
            # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim']))  # (100, 26/161, 32)
            xm = L.flatten(xm, outdim=3)  # (100, 26/161, 32)
            # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2))
        elif self.args.morph_model == 'avg':
            mask = L.ExpressionLayer(
                idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
            mask = L.dimshuffle(mask, (0, 1, 2, 'x'))  # (100, 26, 16, 1)
            mask = L.ExpressionLayer(
                mask,
                lambda x: T.extra_ops.repeat(
                    x, self.config['morph']['emb_dim'], 3))  # (100, 26, 16, 1)
            xm = L.ElemwiseMergeLayer(
                [emb_layer, mask],
                lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
            # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim']))  # (100, 26, 32)
        return xm
    else:
        return emb_layer
def build_network(self, K, vocab_size, doc_var, query_var, cand_var,
                  docmask_var, qmask_var, candmask_var, W_init):

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=self.embed_dim,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed,
        (doc_var.shape[0], doc_var.shape[1], self.embed_dim))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=self.embed_dim, W=l_docembed.W)

    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')

    l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True)
    l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True)

    l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
    l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
    l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    l_qs = [l_q]
    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)

        l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden,
                               grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden,
                               grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1)
        l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1, l_bkd_q_slice_1])  # B x DE
        l_qs.append(l_q_c_1)

        qd = L.get_output(l_q_c_1)
        q_rep = T.reshape(
            T.tile(qd, (1, doc_var.shape[1])),
            (doc_var.shape[0], doc_var.shape[1],
             2 * self.nhidden))  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                  input_var=q_rep)
        l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
        l_doce = L.dropout(l_doc_2_in, p=self.dropout)

    l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    d = L.get_output(l_doc)  # B x N x 2D
    p = T.batched_dot(d, q)  # B x N
    pm = T.nnet.softmax(p) * candmask_var
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    final = T.batched_dot(pm, cand_var)

    dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
    p = T.batched_dot(dv, q)  # B x N
    pm = T.nnet.softmax(p) * candmask_var
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    final_v = T.batched_dot(pm, cand_var)

    return final, final_v, l_doc, l_qs, l_docembed.W
def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

    batch_size = self.mask_context_var.shape[0]
    context_len = self.mask_context_var.shape[1]
    question_len = self.question_var.shape[1]
    context_word_len = self.context_char_var.shape[2]
    question_word_len = self.question_char_var.shape[2]

    self.batch_size = batch_size
    self.context_len = context_len

    ''' Inputs and word embeddings'''

    l_context_char = LL.InputLayer(shape=(None, None, None),
                                   input_var=self.context_char_var)
    l_question_char = LL.InputLayer(shape=(None, None, None),
                                    input_var=self.question_char_var)

    l_c_mask = LL.InputLayer(shape=(None, None),
                             input_var=self.mask_context_var)
    l_q_mask = LL.InputLayer(shape=(None, None),
                             input_var=self.mask_question_var)

    l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_context_char_var)
    l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_question_char_var)

    l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.context_var)
    l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.question_var)

    if self.train_unk:
        l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_context_unk_var)
        l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_question_unk_var)

        l_c_emb = TrainUnkLayer(l_c_emb, l_c_unk_mask,
                                output_size=self.emb_size,
                                W=self.word_embeddings[0])
        l_q_emb = TrainUnkLayer(l_q_emb, l_q_unk_mask,
                                output_size=self.emb_size, W=l_c_emb.W)

    if self.negative:
        l_c_emb = TrainNAWLayer(l_c_emb, l_c_mask, output_size=self.emb_size)

    ''' Char-embeddings '''

    # (batch_size x context_len x context_word_len x emb_char_size)
    l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                     input_size=self.alphabet_size,
                                     output_size=self.emb_char_size)
    l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                     input_size=self.alphabet_size,
                                     output_size=self.emb_char_size,
                                     W=l_c_char_emb.W)

    # here I do multiplication of character embeddings with masks,
    # because I want to pad them with constant zeros
    l_c_char_mask = ForgetSizeLayer(
        LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
    l_q_char_mask = ForgetSizeLayer(
        LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

    l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask], T.mul)
    l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask], T.mul)

    # convolutions
    l_c_char_emb = LL.dimshuffle(
        LL.reshape(l_c_char_emb, (batch_size * context_len, context_word_len,
                                  self.emb_char_size)), (0, 2, 1))
    l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   pad=self.conv)
    # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)
    l_c_char_emb = LL.ExpressionLayer(l_c_char_conv, lambda X: X.max(2),
                                      output_shape='auto')
    l_c_char_emb = LL.reshape(
        l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))

    l_q_char_emb = LL.dimshuffle(
        LL.reshape(l_q_char_emb, (batch_size * question_len,
                                  question_word_len, self.emb_char_size)),
        (0, 2, 1))
    l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   W=l_c_char_conv.W, b=l_c_char_conv.b,
                                   pad=self.conv)
    # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)
    l_q_char_emb = LL.ExpressionLayer(l_q_char_conv, lambda X: X.max(2),
                                      output_shape='auto')
    l_q_char_emb = LL.reshape(
        l_q_char_emb, (batch_size, question_len, self.num_emb_char_filters))

    ''' Concatenating both embeddings '''

    l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
    l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

    # originally I had dropout here

    ''' Highway layer allowing for interaction between embeddings '''

    l_c_P = LL.reshape(l_c_emb, (batch_size * context_len,
                                 self.emb_size + self.num_emb_char_filters))
    l_c_P = LL.DenseLayer(l_c_P, num_units=self.rec_size, b=None,
                          nonlinearity=None)
    l_c_high = HighwayLayer(l_c_P)
    l_c_emb = LL.reshape(l_c_high, (batch_size, context_len, self.rec_size))

    l_q_P = LL.reshape(l_q_emb, (batch_size * question_len,
                                 self.emb_size + self.num_emb_char_filters))
    l_q_P = LL.DenseLayer(l_q_P, num_units=self.rec_size, W=l_c_P.W, b=None,
                          nonlinearity=None)
    l_q_high = HighwayLayer(l_q_P, W1=l_c_high.W1, b1=l_c_high.b1,
                            W2=l_c_high.W2, b2=l_c_high.b2)
    l_q_emb = LL.reshape(l_q_high, (batch_size, question_len, self.rec_size))

    ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

    # batch_size x context_len
    l_weighted_feat = WeightedFeatureLayer(
        [l_c_emb, l_q_emb, l_c_mask, l_q_mask])
    l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

    # batch_size x context_len
    l_bin_feat = LL.InputLayer(shape=(None, None),
                               input_var=self.bin_feat_var)
    l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))

    ''' Dropout at the embeddings '''

    if emb_dropout:
        print('Using dropout after wiq calculation.')
        l_c_emb = LL.dropout(l_c_emb)
        l_q_emb = LL.dropout(l_q_emb)

    ''' Here we concatenate wiq features to embeddings'''

    # both features are concatenated to the embeddings
    # for the question we fix the features to 1
    l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
    l_q_emb = LL.pad(l_q_emb, width=[(0, 2)], val=L.utils.floatX(1),
                     batch_ndim=2)

    ''' Context and question encoding using the same BiLSTM for both '''

    # output shape is (batch_size x context_len x rec_size)
    l_c_enc_forw = LL.LSTMLayer(l_c_emb, num_units=self.rec_size,
                                grad_clipping=100, mask_input=l_c_mask)
    l_c_enc_back = LL.LSTMLayer(l_c_emb, num_units=self.rec_size,
                                grad_clipping=100, mask_input=l_c_mask,
                                backwards=True)

    # output shape is (batch_size x question_len x rec_size)
    l_q_enc_forw = LL.LSTMLayer(
        l_q_emb, num_units=self.rec_size, grad_clipping=100,
        mask_input=l_q_mask,
        ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                       W_hid=l_c_enc_forw.W_hid_to_ingate,
                       W_cell=l_c_enc_forw.W_cell_to_ingate,
                       b=l_c_enc_forw.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                           W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                           W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                           b=l_c_enc_forw.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                        W_hid=l_c_enc_forw.W_hid_to_outgate,
                        W_cell=l_c_enc_forw.W_cell_to_outgate,
                        b=l_c_enc_forw.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                     W_hid=l_c_enc_forw.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_forw.b_cell,
                     nonlinearity=L.nonlinearities.tanh))
    l_q_enc_back = LL.LSTMLayer(
        l_q_emb, num_units=self.rec_size, grad_clipping=100,
        mask_input=l_q_mask, backwards=True,
        ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                       W_hid=l_c_enc_back.W_hid_to_ingate,
                       W_cell=l_c_enc_back.W_cell_to_ingate,
                       b=l_c_enc_back.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                           W_hid=l_c_enc_back.W_hid_to_forgetgate,
                           W_cell=l_c_enc_back.W_cell_to_forgetgate,
                           b=l_c_enc_back.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                        W_hid=l_c_enc_back.W_hid_to_outgate,
                        W_cell=l_c_enc_back.W_cell_to_outgate,
                        b=l_c_enc_back.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                     W_hid=l_c_enc_back.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_back.b_cell,
                     nonlinearity=L.nonlinearities.tanh))

    # batch_size x context_len x 2*rec_size
    l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
    # batch_size x question_len x 2*rec_size
    l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

    def proj_init():
        return np.vstack([
            np.eye(self.rec_size, dtype=theano.config.floatX),
            np.eye(self.rec_size, dtype=theano.config.floatX)
        ])

    # this is H from the paper, shape: (batch_size * context_len x rec_size)
    l_c_proj = LL.reshape(l_c_enc,
                          (batch_size * context_len, 2 * self.rec_size))
    l_c_proj = LL.DenseLayer(l_c_proj, num_units=self.rec_size,
                             W=proj_init(), b=None,
                             nonlinearity=L.nonlinearities.tanh)

    # this is Z from the paper, shape: (batch_size * question_len x rec_size)
    l_q_proj = LL.reshape(l_q_enc,
                          (batch_size * question_len, 2 * self.rec_size))
    l_q_proj = LL.DenseLayer(l_q_proj, num_units=self.rec_size,
                             W=proj_init(), b=None,
                             nonlinearity=L.nonlinearities.tanh)

    ''' Additional, weighted question encoding (alphas from paper) '''

    # batch_size * question_len x 1
    l_alpha = LL.DenseLayer(l_q_proj, num_units=1, b=None, nonlinearity=None)

    # batch_size x question_len
    l_alpha = MaskedSoftmaxLayer(
        LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

    # batch_size x rec_size
    l_z_hat = BatchedDotLayer([
        LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
        l_alpha
    ])

    return l_c_proj, l_z_hat
def build_network(self, K, vocab_size, W_init):

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
    l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
    l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
    l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
    l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
    l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN), input_var=self.inps[8])
    l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                             input_var=self.inps[9])
    l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

    doc_shp = self.inps[1].shape
    qry_shp = self.inps[3].shape

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=self.embed_dim,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=self.embed_dim, W=l_docembed.W)
    l_qembed = L.ReshapeLayer(
        l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                output_size=2)  # B x N x 2

    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')

    # char embeddings
    if self.use_chars:
        l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                    2 * self.char_dim)  # T x L x D
        l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
                            mask_input=l_tokmask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, only_return_final=True)
        l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim,
                            grad_clipping=GRAD_CLIP, mask_input=l_tokmask,
                            gradient_steps=GRAD_STEPS, precompute_input=True,
                            backwards=True, only_return_final=True)  # T x 2D
        l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim / 2,
                                  nonlinearity=None)  # T x DE/2
        l_bckembed = L.DenseLayer(l_bgru, self.embed_dim / 2,
                                  nonlinearity=None)  # T x DE/2
        l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
        l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
        l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

        l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
        l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

    l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, only_return_final=False)
    l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True,
                         only_return_final=False)
    l_q = L.ConcatLayer([l_fwd_q, l_bkd_q])  # B x Q x 2D
    q = L.get_output(l_q)  # B x Q x 2D
    q = q[T.arange(q.shape[0]), self.inps[12], :]  # B x 2D

    l_qs = [l_q]
    for i in range(K - 1):
        l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x DE

        l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden,
                               grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden,
                               grad_clipping=GRAD_CLIP, mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2)  # B x Q x DE
        l_qs.append(l_q_c_1)

        qd = L.get_output(l_q_c_1)  # B x Q x DE
        dd = L.get_output(l_doc_1)  # B x N x DE
        M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
        alphas = T.nnet.softmax(
            T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
        alphas_r = T.reshape(alphas, (M.shape[0], M.shape[1], M.shape[2])) * \
            self.inps[7][:, np.newaxis, :]  # B x N x Q
        alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis]  # B x N x Q
        q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                  input_var=q_rep)
        l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
        l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE

    if self.use_feat:
        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2

    l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    d = L.get_output(l_doc)  # B x N x 2D
    p = T.batched_dot(d, q)  # B x N
    pm = T.nnet.softmax(p) * self.inps[10]
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    final = T.batched_dot(pm, self.inps[4])

    dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
    p = T.batched_dot(dv, q)  # B x N
    pm = T.nnet.softmax(p) * self.inps[10]
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    final_v = T.batched_dot(pm, self.inps[4])

    return final, final_v, l_doc, l_qs, l_docembed.W
def __init__(self, image_shape, filter_shape, num_class, conv_type,
             kernel_size, kernel_pool_size, dropout_rate):
    """
    """
    self.filter_shape = filter_shape
    self.n_visible = numpy.prod(image_shape)
    self.n_layers = len(filter_shape)
    self.rng = RandomStreams(123)

    self.x = T.matrix()
    self.y = T.ivector()

    self.conv_layers = []

    NoiseLayer = layers.DropoutLayer
    dropout_rate = float(dropout_rate)

    self.l_input = layers.InputLayer((None, self.n_visible), self.x)
    this_layer = layers.ReshapeLayer(self.l_input, ([0], ) + image_shape)

    for l in range(self.n_layers):
        activation = lasagne.nonlinearities.rectify
        if len(filter_shape[l]) == 3:
            if conv_type == 'double' and filter_shape[l][1] > kernel_size:
                this_layer = DoubleConvLayer(
                    this_layer, filter_shape[l][0], filter_shape[l][1:],
                    pad='same', nonlinearity=activation,
                    kernel_size=kernel_size,
                    kernel_pool_size=kernel_pool_size)
                this_layer = layers.batch_norm(this_layer)
            elif conv_type == 'maxout':
                this_layer = layers.Conv2DLayer(
                    this_layer, filter_shape[l][0], filter_shape[l][1:],
                    b=None, pad='same', nonlinearity=None)
                this_layer = layers.FeaturePoolLayer(
                    this_layer, pool_size=kernel_pool_size**2)
                this_layer = layers.BatchNormLayer(this_layer)
                this_layer = layers.NonlinearityLayer(this_layer, activation)
            elif conv_type == 'cyclic':
                this_layers = []
                this_layers.append(
                    layers.Conv2DLayer(this_layer, filter_shape[l][0],
                                       filter_shape[l][1:], b=None,
                                       pad='same', nonlinearity=None))
                for _ in range(3):
                    W = this_layers[-1].W.dimshuffle(0, 1, 3, 2)[:, :, :, ::-1]
                    this_layers.append(
                        layers.Conv2DLayer(this_layer, filter_shape[l][0],
                                           filter_shape[l][1:], W=W, b=None,
                                           pad='same', nonlinearity=None))
                this_layer = layers.ElemwiseMergeLayer(this_layers, T.maximum)
                this_layer = layers.BatchNormLayer(this_layer)
                this_layer = layers.NonlinearityLayer(this_layer, activation)
            elif conv_type == 'standard' \
                    or (conv_type == 'double'
                        and filter_shape[l][1] <= kernel_size):
                this_layer = layers.Conv2DLayer(
                    this_layer, filter_shape[l][0], filter_shape[l][1:],
                    pad='same', nonlinearity=activation)
                this_layer = layers.batch_norm(this_layer)
            else:
                raise NotImplementedError
            self.conv_layers.append(this_layer)
        elif len(filter_shape[l]) == 2:
            this_layer = layers.MaxPool2DLayer(this_layer, filter_shape[l])
            this_layer = NoiseLayer(this_layer, dropout_rate)
        elif len(filter_shape[l]) == 1:
            raise NotImplementedError

    self.top_conv_layer = this_layer
    this_layer = layers.GlobalPoolLayer(this_layer, T.mean)
    self.clf_layer = layers.DenseLayer(this_layer, num_class,
                                       W=lasagne.init.Constant(0.),
                                       nonlinearity=T.nnet.softmax)

    self.params = layers.get_all_params(self.clf_layer, trainable=True)
    self.params_all = layers.get_all_params(self.clf_layer)
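# A minimal, standalone sketch (assumed shapes) of the 'cyclic' branch above:
# four convolutions whose filter banks are 90-degree rotations of each other
# are reduced with an element-wise maximum via ElemwiseMergeLayer, i.e. a
# maxout over orientations. Passing a Theano expression as W mirrors the
# weight-sharing trick used in the snippet.
import theano.tensor as T
import lasagne
from lasagne import layers

x = T.tensor4('x')
l_in = layers.InputLayer((None, 3, 32, 32), input_var=x)

rotated = [layers.Conv2DLayer(l_in, 16, (3, 3), b=None, pad='same',
                              nonlinearity=None)]
for _ in range(3):
    # Rotate the previous filter bank by 90 degrees (transpose + flip).
    W_rot = rotated[-1].W.dimshuffle(0, 1, 3, 2)[:, :, :, ::-1]
    rotated.append(layers.Conv2DLayer(l_in, 16, (3, 3), W=W_rot, b=None,
                                      pad='same', nonlinearity=None))

l_max = layers.ElemwiseMergeLayer(rotated, T.maximum)  # orientation maxout
y = layers.get_output(l_max)  # (batch, 16, 32, 32)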
def build_model(vmap, nclasses=2, embedding_dim=50, nhidden=256,
                batchsize=None, invar=None, maskvar=None, bidirectional=True,
                pool=True, grad_clip=100, maxlen=MAXLEN):

    V = len(vmap)
    W = lasagne.init.Normal()

    # Input Layer
    # TODO: should be (batchsize, maxlen, vocab_size)
    l_in = layer.InputLayer((batchsize, maxlen, V), input_var=invar)
    l_mask = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    ASSUME = {l_in: (200, 140, 94), l_mask: (200, 140)}
    print 'Input Layer'
    print 'output:', get_output_shape(l_in, ASSUME)
    print 'output(mask):', get_output_shape(l_mask, ASSUME)
    print

    # Embedding Layer
    l_emb = layer.EmbeddingLayer(l_in, input_size=V,
                                 output_size=embedding_dim, W=W)
    print 'Embedding Layer'
    print 'output:', get_output_shape(l_emb, ASSUME)

    gate_params = layer.recurrent.Gate(W_in=lasagne.init.Orthogonal(),
                                       W_hid=lasagne.init.Orthogonal(),
                                       b=lasagne.init.Constant(0.))
    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh)

    l_fwd = layer.LSTMLayer(l_emb, num_units=nhidden,
                            grad_clipping=grad_clip,
                            nonlinearity=lasagne.nonlinearities.tanh,
                            mask_input=l_mask, ingate=gate_params,
                            forgetgate=gate_params, cell=cell_params,
                            outgate=gate_params, learn_init=True)
    print 'Forward LSTM'
    print 'output:', get_output_shape(l_fwd, ASSUME)

    l_concat = None
    if bidirectional:
        l_bwd = layer.LSTMLayer(l_emb, num_units=nhidden,
                                grad_clipping=grad_clip,
                                nonlinearity=lasagne.nonlinearities.tanh,
                                mask_input=l_mask, ingate=gate_params,
                                forgetgate=gate_params, cell=cell_params,
                                outgate=gate_params, learn_init=True,
                                backwards=True)
        print 'Backward LSTM'
        print 'output:', get_output_shape(l_bwd, ASSUME)

        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        if pool:
            l_concat = layer.ElemwiseMergeLayer([l_fwd, l_bwd], tmean)
        else:
            l_concat = layer.ConcatLayer([l_fwd, l_bwd])
    else:
        l_concat = layer.ConcatLayer([l_fwd])
    print 'Concat'
    print 'output:', get_output_shape(l_concat, ASSUME)

    l_concat = layer.DropoutLayer(l_concat, p=0.5)

    l_lstm2 = layer.LSTMLayer(l_concat, num_units=nhidden,
                              grad_clipping=grad_clip,
                              nonlinearity=lasagne.nonlinearities.tanh,
                              mask_input=l_mask, ingate=gate_params,
                              forgetgate=gate_params, cell=cell_params,
                              outgate=gate_params, learn_init=True,
                              only_return_final=True)
    print 'LSTM #2'
    print 'output:', get_output_shape(l_lstm2, ASSUME)

    l_lstm2 = layer.DropoutLayer(l_lstm2, p=0.6)

    network = layer.DenseLayer(l_lstm2, num_units=nclasses,
                               nonlinearity=lasagne.nonlinearities.softmax)
    print 'Dense Layer'
    print 'output:', get_output_shape(network, ASSUME)

    return network
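# A small self-contained sketch (assumed shapes) of the `tmean` pooling used
# above: ElemwiseMergeLayer applies the binary merge function across its
# inputs, so a two-argument mean averages the forward and backward LSTM
# outputs instead of concatenating them, keeping the feature dimension at
# nhidden.
import theano.tensor as T
import lasagne.layers as layer

l_a = layer.InputLayer((None, 140, 256))   # e.g. forward LSTM output
l_b = layer.InputLayer((None, 140, 256))   # e.g. backward LSTM output

def tmean(a, b):
    return (a + b) / 2.

l_pool = layer.ElemwiseMergeLayer([l_a, l_b], tmean)
print layer.get_output_shape(l_pool)       # (None, 140, 256)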