def __init__(self, output_size, meta_size, depth=2):
    """Build the mean/log-variance network and compile its Theano functions.

    The network concatenates an ``output_size``-wide input matrix with a
    ``meta_size``-wide meta matrix, passes the result through ``depth``
    dense layers (each wrapped by ``layers.batch_norm``) and ends in a
    linear layer of ``2 * output_size`` units which is split into the
    ``mu`` and ``log_var`` heads.

    Three compiled functions are stored on the instance:

    * ``_train_fn(input, meta, target)`` -- applies Adadelta updates and
      returns the training loss.
    * ``_loss_fn(input, meta, target)`` -- returns the deterministic loss.
    * ``_predict_fn(input, meta)`` -- returns ``[mu, log_var]`` computed
      deterministically.

    Args:
        output_size: Width of the input matrix and of each output head.
        meta_size: Width of the meta-feature matrix.
        depth: Number of hidden dense layers. Widths are read from a
            three-entry list, so a value above 3 raises ``IndexError``.
    """
    encoder_sizes = [64, 64, 64]

    input_var = TT.matrix()
    meta_var = TT.matrix()
    target_var = TT.matrix()
    mask_var = TT.matrix()

    input_layer = layers.InputLayer((None, output_size), input_var=input_var)
    meta_layer = layers.InputLayer((None, meta_size), input_var=meta_var)
    concat_input_layer = layers.ConcatLayer([input_layer, meta_layer])

    dense = concat_input_layer
    # ``range`` instead of the Python-2-only ``xrange``; identical iteration.
    for idx in range(depth):
        dense = layers.DenseLayer(dense, encoder_sizes[idx])
        dense = layers.batch_norm(dense)

    mu_and_logvar = layers.DenseLayer(
        dense, 2 * output_size, nonlinearity=nonlinearities.linear)
    mu = layers.SliceLayer(mu_and_logvar, slice(0, output_size), axis=1)
    log_var = layers.SliceLayer(
        mu_and_logvar, slice(output_size, None), axis=1)

    # NOTE(review): ``mask_var`` is handed to both losses but is not listed
    # among the inputs of the compiled functions below -- confirm that
    # ``neg_log_likelihood2`` does not need it as a runtime input.
    loss = neg_log_likelihood2(
        target_var,
        layers.get_output(mu),
        layers.get_output(log_var),
        mask_var
    ).mean()
    test_loss = neg_log_likelihood2(
        target_var,
        layers.get_output(mu, deterministic=True),
        layers.get_output(log_var, deterministic=True),
        mask_var
    ).mean()

    params = layers.get_all_params(mu_and_logvar, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._train_fn = theano.function(
        [input_var, meta_var, target_var],
        updates=param_updates,
        outputs=loss
    )
    self._loss_fn = theano.function(
        [input_var, meta_var, target_var],
        outputs=test_loss
    )
    self._predict_fn = theano.function(
        [input_var, meta_var],
        outputs=[
            layers.get_output(mu, deterministic=True),
            layers.get_output(log_var, deterministic=True)
        ]
    )
def build_network(self, vocab_size, input_var, mask_var, W_init):
    """Assemble a two-level bidirectional LSTM reader with a tied softmax.

    Both LSTM levels run forwards and backwards over the masked sequence.
    For each level the forward layer's last step and the backward layer's
    first step are summed; the two sums are concatenated, projected to
    ``EMBED_DIM`` units with ``tanh`` and scored against the transposed
    embedding matrix through a softmax over ``vocab_size`` units.

    Returns:
        The softmax output layer.
    """
    tokens = L.InputLayer(shape=(None, None, 1), input_var=input_var)
    mask = L.InputLayer(shape=(None, None), input_var=mask_var)
    embedding = L.EmbeddingLayer(tokens, input_size=vocab_size,
                                 output_size=EMBED_DIM, W=W_init)

    # Options shared by all four recurrent layers.
    lstm_opts = dict(grad_clipping=GRAD_CLIP, mask_input=mask,
                     gradient_steps=GRAD_STEPS, precompute_input=True)

    # Level 1 reads the embeddings in both directions.
    fwd_1 = L.LSTMLayer(embedding, NUM_HIDDEN, **lstm_opts)
    bkd_1 = L.LSTMLayer(embedding, NUM_HIDDEN, backwards=True, **lstm_opts)
    level_1 = L.concat([fwd_1, bkd_1], axis=2)

    # Level 2 reads the concatenated level-1 states.
    fwd_2 = L.LSTMLayer(level_1, NUM_HIDDEN, **lstm_opts)
    bkd_2 = L.LSTMLayer(level_1, NUM_HIDDEN, backwards=True, **lstm_opts)

    # Per level: forward state at the last step + backward state at step 0.
    summary_1 = L.ElemwiseSumLayer([
        L.SliceLayer(fwd_1, -1, 1),
        L.SliceLayer(bkd_1, 0, 1),
    ])
    summary_2 = L.ElemwiseSumLayer([
        L.SliceLayer(fwd_2, -1, 1),
        L.SliceLayer(bkd_2, 0, 1),
    ])
    summary = L.concat([summary_1, summary_2], axis=1)

    projected = L.DenseLayer(summary, num_units=EMBED_DIM,
                             nonlinearity=lasagne.nonlinearities.tanh)
    # Output weights are the transposed embedding matrix.
    return L.DenseLayer(projected, num_units=vocab_size,
                        W=embedding.W.T,
                        nonlinearity=lasagne.nonlinearities.softmax)
def _invert_PadLayer(self, layer, feeder):
    """Undo a ``PadLayer`` by cropping the padded border off ``feeder``.

    Args:
        layer: The ``L.PadLayer`` being inverted. It must have
            ``batch_ndim == 2`` and a 4-D output shape, and ``layer.width``
            is indexed as ``width[axis][0]`` / ``width[axis][1]`` (leading
            and trailing pad) for the two padded axes.
        feeder: Layer whose output is cropped along axes 2 and 3.

    Returns:
        A ``SliceLayer`` stack over ``feeder`` with the padding removed.
    """
    assert isinstance(layer, L.PadLayer)
    assert layer.batch_ndim == 2
    assert len(L.get_output_shape(layer)) == 4

    def crop(leading, trailing):
        # ``slice(leading, -0)`` equals ``slice(leading, 0)`` and would
        # select nothing, so a zero trailing pad must map to an open end.
        return slice(leading, -trailing if trailing else None)

    rows = L.SliceLayer(
        feeder, crop(layer.width[0][0], layer.width[0][1]), axis=2)
    return L.SliceLayer(
        rows, crop(layer.width[1][0], layer.width[1][1]), axis=3)
def build_network(self):
    """Build the character-level word encoder shared by two inputs.

    ``self.inps[0]``/``self.inps[1]`` are character-index tensors and
    ``self.inps[2]``/``self.inps[3]`` their masks. Both inputs are
    concatenated along axis 1, flattened to one word per row, embedded,
    read by a forward and a backward GRU (final state only), projected to
    ``self.embed_dim // 2`` units each and summed. The result is reshaped
    back and split into the parts belonging to each input.

    Returns:
        Tuple ``(l_embed1, l_embed2)`` of layers for the first and second
        input respectively.
    """
    l_char1_in = L.InputLayer(shape=(None, None, self.max_word_len),
                              input_var=self.inps[0])
    l_char2_in = L.InputLayer(shape=(None, None, self.max_word_len),
                              input_var=self.inps[1])
    l_mask1_in = L.InputLayer(shape=(None, None, self.max_word_len),
                              input_var=self.inps[2])
    l_mask2_in = L.InputLayer(shape=(None, None, self.max_word_len),
                              input_var=self.inps[3])
    l_char_in = L.ConcatLayer([l_char1_in, l_char2_in],
                              axis=1)  # B x (ND+NQ) x L
    l_char_mask = L.ConcatLayer([l_mask1_in, l_mask2_in], axis=1)

    # Symbolic shape of the concatenated input.
    shp = (self.inps[0].shape[0],
           self.inps[0].shape[1] + self.inps[1].shape[1],
           self.inps[1].shape[2])
    l_index_reshaped = L.ReshapeLayer(
        l_char_in, (shp[0] * shp[1], shp[2]))  # BN x L
    l_mask_reshaped = L.ReshapeLayer(
        l_char_mask, (shp[0] * shp[1], shp[2]))  # BN x L

    l_lookup = L.EmbeddingLayer(l_index_reshaped, self.num_chars,
                                self.char_dim)  # BN x L x D
    l_fgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10,
                        gradient_steps=-1, precompute_input=True,
                        only_return_final=True,
                        mask_input=l_mask_reshaped)
    l_bgru = L.GRULayer(l_lookup, 2 * self.char_dim, grad_clipping=10,
                        gradient_steps=-1, precompute_input=True,
                        backwards=True, only_return_final=True,
                        mask_input=l_mask_reshaped)  # BN x 2D

    # Floor division keeps the unit count an integer under Python 3 as
    # well; ``/`` would yield a float there.
    half_embed = self.embed_dim // 2
    l_fwdembed = L.DenseLayer(l_fgru, half_embed,
                              nonlinearity=None)  # BN x DE/2
    l_bckembed = L.DenseLayer(l_bgru, half_embed,
                              nonlinearity=None)  # BN x DE/2
    l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
    l_char_embed = L.ReshapeLayer(l_embed, (shp[0], shp[1], half_embed))

    # Split the joint encoding back into its two sources.
    l_embed1 = L.SliceLayer(l_char_embed,
                            slice(0, self.inps[0].shape[1]), axis=1)
    l_embed2 = L.SliceLayer(l_char_embed,
                            slice(-self.inps[1].shape[1], None), axis=1)
    return l_embed1, l_embed2
def _invert_Conv2DLayer(self, layer, feeder):
    """Invert a ``Conv2DLayer`` and weight the result by the layer input.

    ``feeder`` is first passed through ``self._put_rectifiers`` and
    ``self._get_normalised_relevance_layer``, then convolved with flipped
    filters (stride 1, no bias, no nonlinearity) back to the channel count
    of ``layer.input_layer``. For stride ``(4, 4)`` the feeder is dilated
    beforehand and the last three rows and columns are cropped afterwards.
    Finally the result is multiplied element-wise with the reshaped output
    of ``layer.input_layer``.

    Returns:
        The ``ElemwiseMergeLayer``; the inverse convolution's weights are
        attached to it as ``.W``.

    Raises:
        RuntimeError: If ``layer.pad`` is neither ``'same'``, ``'valid'``
            nor ``(0, 0)``.
    """
    # Warning they are swapped here
    feeder = self._put_rectifiers(feeder, layer)
    feeder = self._get_normalised_relevance_layer(layer, feeder)

    kernel = layer.filter_size
    if layer.pad == 'same':
        inverse_pad = 'same'
    elif layer.pad == 'valid' or layer.pad == (0, 0):
        inverse_pad = 'full'
    else:
        raise RuntimeError("Define your padding as full or same.")

    # By definition the
    # Flip filters must be on to be a proper deconvolution.
    in_channels = L.get_output_shape(layer.input_layer)[1]

    strided = layer.stride == (4, 4)
    if strided:
        # Todo: similar code gradient based explainers. Merge.
        feeder = L.Upscale2DLayer(feeder, layer.stride, mode='dilate')

    deconv = L.Conv2DLayer(feeder,
                           num_filters=in_channels,
                           filter_size=kernel,
                           stride=1,
                           pad=inverse_pad,
                           nonlinearity=None,
                           b=None,
                           flip_filters=True)

    if strided:
        # Crop the last three columns, then the last three rows, and keep
        # the weights reachable on the cropped layer.
        columns_cropped = L.SliceLayer(deconv, slice(0, -3), axis=3)
        inverted = L.SliceLayer(columns_cropped, slice(0, -3), axis=2)
        inverted.W = deconv.W
    else:
        inverted = deconv
    weights = inverted.W

    # Do the multiplication.
    target_shape = (-1, ) + L.get_output_shape(inverted)[1:]
    layer_input = L.ReshapeLayer(layer.input_layer, target_shape)
    product = L.ElemwiseMergeLayer(incomings=[layer_input, inverted],
                                   merge_function=T.mul)
    product.W = weights
    return product
def recurrent(input_var=None, num_units=512, batch_size=64, seq_length=1,
              grad_clip=100):
    """Build a single-LSTM stack and print each layer's output shape.

    The stack is: input layer -> ``LSTMLayer`` -> slice of the last time
    step -> reshape that re-inserts a length-1 axis at position 1.

    Args:
        input_var: Theano variable fed to the input layer. When ``None``,
            uniform noise in ``[-1, 1]`` of shape
            ``(batch_size, seq_length, num_units)`` is used instead.
        num_units: Feature size of the input and number of LSTM units.
        batch_size: Fixed batch dimension of the input layer.
        seq_length: Fixed sequence length of the input layer.
        grad_clip: Value passed as ``grad_clipping`` to the LSTM.

    Returns:
        List of the four layers, in construction order.
    """
    stack = []
    theano_rng = RandomStreams(rng.randint(2**15))
    # we want noise to match tanh range of activation ([-1,1])
    noise = theano_rng.uniform(size=(batch_size, seq_length, num_units),
                               low=-1.0,
                               high=1.0)
    input_var = noise if input_var is None else input_var

    stack.append(
        ll.InputLayer(shape=(batch_size, seq_length, num_units),
                      input_var=input_var))
    stack.append(
        ll.LSTMLayer(stack[-1], num_units,
                     grad_clipping=grad_clip))  # tanh is default
    stack.append(ll.SliceLayer(stack[-1], -1, 1))
    stack.append(ll.ReshapeLayer(stack[-1], ([0], 1, [1])))

    # Call-form print is valid on Python 2 and 3 and prints the same text.
    for layer in stack:
        print(layer.output_shape)
    print("")
    return stack
def concat_tn(_top, _seed, start=0, num_slices=1):
    """Append ``num_slices`` slices of ``_seed`` to ``_top`` along axis 1.

    Args:
        _top: Layer to extend, or ``None`` to start a new stack.
        _seed: Layer the slices are taken from (along axis 1).
        start: First index of ``_seed`` to consume.
        num_slices: Number of slices to take.

    Returns:
        Tuple ``(layer, next_start)``.

        * ``_top is None``: a plain ``SliceLayer`` over
          ``_seed[:, start:start + num_slices]`` and
          ``start + num_slices``.
        * ``num_slices > 0``: ``_top`` concatenated with slices reshaped by
          ``create_slices_from`` to match ``_top.output_shape``, and
          ``start`` advanced by the number of seed columns consumed.
        * otherwise: ``_top`` unchanged and ``start`` unchanged.
    """
    # Identity test: ``== None`` would defer to a custom ``__eq__``.
    if _top is None:
        first = L.SliceLayer(_seed,
                             indices=slice(start, start + num_slices),
                             axis=1)
        return first, start + num_slices
    if num_slices > 0:
        _seed1, n = create_slices_from(_seed, _top.output_shape,
                                       start=start, num_slices=num_slices)
        return L.ConcatLayer([_top, _seed1], axis=1), start + n
    return _top, start
def create_slices_from(_source, ish, start=0, num_slices=1):
    """Slice columns out of ``_source`` and reshape them to match ``ish``.

    Args:
        _source: Layer to slice along axis 1.
        ish: Target shape tuple. A length of 2 is treated as
            ``(batch, features)``; any other length is indexed at
            positions 2 and 3 for the trailing dimensions.
        start: First column to take.
        num_slices: Number of output slices (features or channels).

    Returns:
        Tuple ``(layer, consumed)``: the reshaped layer and the number of
        columns of ``_source`` that were consumed.
    """
    if len(ish) == 2:
        consumed = num_slices
        tail_shape = (num_slices,)
    else:
        consumed = num_slices * ish[2] * ish[3]
        tail_shape = (num_slices, ish[2], ish[3])

    columns = L.SliceLayer(_source,
                           indices=slice(start, start + consumed),
                           axis=1)
    # ``[0]`` keeps the batch dimension of the sliced layer.
    reshaped = L.ReshapeLayer(columns, ([0],) + tail_shape)
    return reshaped, consumed
def build_network(self, K, vocab_size, doc_var, query_var, docmask_var,
                  qmask_var, candmask_var, feat_var, W_init):
    """Build a K-hop gated reader over a document and a query.

    The query is encoded once by a bidirectional GRU (``l_q``). Then, for
    each of ``K - 1`` hops, the document is read by a bidirectional GRU, a
    fresh bidirectional GRU re-encodes the query, and the document states
    are multiplied element-wise by that query encoding and passed through
    dropout. After the hops the feature embedding is appended and a final
    bidirectional GRU produces ``l_doc``. Document positions are scored by
    a dot product with the first query encoding, soft-maxed, masked by
    ``candmask_var``, renormalised and summed per vocabulary id.

    Returns:
        Tuple ``(final, final_v, l_doc, l_qs)``: the per-vocabulary scores
        with stochastic layers active, the same computed with
        ``deterministic=True``, the final document layer, and the list of
        query-encoding layers (initial one first).
    """

    def bi_gru(incoming, mask):
        # Forward layer is created before the backward one.
        common = dict(grad_clipping=GRAD_CLIP, mask_input=mask,
                      gradient_steps=GRAD_STEPS, precompute_input=True)
        forward = L.GRULayer(incoming, NUM_HIDDEN, **common)
        backward = L.GRULayer(incoming, NUM_HIDDEN, backwards=True,
                              **common)
        return forward, backward

    def end_states(forward, backward):
        # Last forward step joined with first backward step.
        return L.ConcatLayer([L.SliceLayer(forward, -1, 1),
                              L.SliceLayer(backward, 0, 1)])

    def pointer_sum(doc_states):
        scores = T.batched_dot(doc_states, q)  # B x N
        probs = T.nnet.softmax(scores) * candmask_var
        probs = probs / probs.sum(axis=1)[:, np.newaxis]
        rows = T.reshape(
            T.repeat(T.arange(scores.shape[0]), scores.shape[1]),
            scores.shape)
        zeros = T.alloc(0., scores.shape[0], vocab_size)
        return T.inc_subtensor(
            zeros[rows, T.flatten(doc_var, outdim=2)], probs)

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
    l_featin = L.InputLayer(shape=(None, None), input_var=feat_var)

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=EMBED_DIM,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed,
        (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
    # Query embedding shares the document embedding matrix.
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=EMBED_DIM, W=l_docembed.W)
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                output_size=2)  # B x N x 2

    if not EMB_TRAIN:
        l_docembed.params[l_docembed.W].remove('trainable')

    l_q = end_states(*bi_gru(l_qembed, l_qmask))  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    l_qs = [l_q]
    for _ in range(K - 1):
        doc_fwd, doc_bkd = bi_gru(l_doce, l_docmask)
        l_doc_1 = L.concat([doc_fwd, doc_bkd], axis=2)

        l_q_c_1 = end_states(*bi_gru(l_qembed, l_qmask))  # B x DE
        l_qs.append(l_q_c_1)

        # Tile the query encoding across all document positions.
        qd = L.get_output(l_q_c_1)
        q_rep = T.reshape(
            T.tile(qd, (1, doc_var.shape[1])),
            (doc_var.shape[0], doc_var.shape[1],
             2 * NUM_HIDDEN))  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                  input_var=q_rep)
        l_gated = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
        l_doce = L.dropout(l_gated, p=DROPOUT_RATE)  # B x N x DE

    l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
    last_fwd, last_bkd = bi_gru(l_doce, l_docmask)
    l_doc = L.concat([last_fwd, last_bkd], axis=2)

    final = pointer_sum(L.get_output(l_doc))  # from B x N x 2D
    final_v = pointer_sum(L.get_output(l_doc, deterministic=True))
    return final, final_v, l_doc, l_qs
def __init__(self,
             input_shape,
             output_dim,
             hidden_sizes,
             conv_filters,
             conv_filter_sizes,
             conv_strides,
             conv_pads,
             encoding_levels=None,
             num_encoding_levels=5,
             xd_dim=32,
             hidden_W_init=LI.GlorotUniform(),
             hidden_b_init=LI.Constant(0.),
             output_W_init=LI.GlorotUniform(),
             output_b_init=LI.Constant(0.),
             hidden_nonlinearity=LN.rectify,
             output_nonlinearity=None,
             name=None,
             input_var=None):
    """Build a two-stream VGG-encoded network followed by conv/dense layers.

    The input is split in half along the channel axis. Each half is run
    through the same stack of encoding levels, whose parameters are read
    from ``models/theano/vgg16_levelsall_nodyn_model.h5`` for the first
    half and reused (same shared variables) for the second. The selected,
    standardised levels of both halves are concatenated and fed to the
    configurable convolutional and dense layers.

    Stores ``_l_in``, ``_l_out``, ``_input_var`` and ``_conv_out`` on the
    instance.

    Args:
        input_shape: Tuple. Length 3 or 2 means the input arrives
            flattened and is reshaped; any other length is used as is.
            After an optional leading ``1`` is added for 2-D shapes, the
            first entry must be even (asserted).
        output_dim: Number of units in the output layer.
        hidden_sizes: Sizes of the dense hidden layers.
        conv_filters, conv_filter_sizes, conv_strides, conv_pads: Parallel
            sequences describing the convolutional hidden layers.
        encoding_levels: Levels whose standardised features are used;
            defaults to ``[num_encoding_levels]``. Its maximum must equal
            ``num_encoding_levels`` (asserted).
        num_encoding_levels: Highest encoding level to build.
        xd_dim: Spatial size every level is downscaled to.
        hidden_W_init, hidden_b_init, output_W_init, output_b_init:
            Initialisers for the dense layers.
        hidden_nonlinearity, output_nonlinearity: Nonlinearities.
        name: Optional prefix for layer names.
        input_var: Optional Theano variable for the input layer.

    Raises:
        NotImplementedError: If a level is spatially smaller than
            ``xd_dim``.
    """
    prefix = "" if name is None else name + "_"

    if len(input_shape) == 3:
        l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                            input_var=input_var)
        l_hid = L.reshape(l_in, ([0], ) + input_shape)
    elif len(input_shape) == 2:
        l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                            input_var=input_var)
        input_shape = (1, ) + input_shape
        l_hid = L.reshape(l_in, ([0], ) + input_shape)
    else:
        l_in = L.InputLayer(shape=(None, ) + input_shape,
                            input_var=input_var)
        l_hid = l_in

    assert input_shape[0] % 2 == 0
    l_hid0 = L.SliceLayer(l_hid, slice(None, input_shape[0] // 2), axis=1)
    l_hid1 = L.SliceLayer(l_hid, slice(input_shape[0] // 2, None), axis=1)
    l_hids = [l_hid0, l_hid1]

    if encoding_levels is None:
        encoding_levels = [num_encoding_levels]
    else:
        assert max(encoding_levels) == num_encoding_levels

    xlevels_c_dim = OrderedDict(
        zip(range(num_encoding_levels + 1), [3, 64, 128, 256, 512, 512]))

    import h5py

    # FIFO of shared-variable kwargs: filled while building the first
    # stream, drained in the same order while building the second.
    params_kwargs_list = []

    def tied_params(ihid, load_raw, *load_args):
        """Create shared params for stream 0, reuse them for stream 1."""
        if ihid != 0:
            return params_kwargs_list.pop(0)
        params_kwargs = load_raw(*load_args)
        for k, v in params_kwargs.items():
            # Size-1 axes are marked broadcastable.
            bcast = tuple(s == 1 for s in v.shape)
            params_kwargs[k] = theano.shared(v, broadcastable=bcast)
        params_kwargs_list.append(params_kwargs)
        return params_kwargs

    def bgr_to_rgb_params():
        """Return W/b of the 1x1 conv that flips channels and shifts."""
        # change from BGR to RGB and subtract mean pixel values
        # (X - mean_pixel_bgr[None, :, None, None])[:, ::-1, :, :]
        # X[:, ::-1, :, :] - mean_pixel_rgb[None, :, None, None]
        mean_pixel_bgr = np.array([103.939, 116.779, 123.68],
                                  dtype=np.float32)
        mean_pixel_rgb = mean_pixel_bgr[::-1]
        W = np.eye(3)[::-1, :].reshape((3, 3, 1, 1)).astype(np.float32)
        b = -mean_pixel_rgb
        return dict(W=W, b=b)

    # The file is only needed while the encoders are built; every value is
    # read into memory with ``[()]``, so it can be closed right after.
    with h5py.File("models/theano/vgg16_levelsall_nodyn_model.h5",
                   'r') as params_file:

        def conv_params(level, num_convs):
            """Read convN_1..convN_<num_convs> weights and biases."""
            raw = {}
            for conv_idx in range(1, num_convs + 1):
                for kind in ('W', 'b'):
                    raw['conv%d_%s' % (conv_idx, kind)] = params_file[
                        'conv%d_%d.%s' % (level, conv_idx, kind)][()]
            return raw

        def standardize_params(level):
            """Read the offset and scale of the given level."""
            return dict(offset=params_file['y%d.offset' % level][()],
                        scale=params_file['y%d.scale' % level][()])

        # encoding
        for ihid, l_hid in enumerate(l_hids):
            l_xlevels = OrderedDict()
            # downsampled version of l_xlevels at the resolution for
            # servoing
            l_xdlevels = OrderedDict()
            for level in range(num_encoding_levels + 1):
                if level == 0:
                    l_xlevel = l_hid
                elif level < 3:
                    l_xlevelm1 = l_xlevels[level - 1]
                    if level == 1:
                        params_kwargs = tied_params(ihid,
                                                    bgr_to_rgb_params)
                        l_xlevelm1 = L.Conv2DLayer(l_xlevelm1,
                                                   num_filters=3,
                                                   filter_size=1,
                                                   nonlinearity=nl.identity,
                                                   **params_kwargs)
                        l_xlevelm1.W.name = 'x0.W'
                        l_xlevelm1.params[l_xlevelm1.W].remove('trainable')
                        l_xlevelm1.b.name = 'x0.b'
                        l_xlevelm1.params[l_xlevelm1.b].remove('trainable')
                    params_kwargs = tied_params(ihid, conv_params, level, 2)
                    l_xlevel = LT.VggEncodingLayer(l_xlevelm1,
                                                   xlevels_c_dim[level],
                                                   level=str(level),
                                                   **params_kwargs)
                else:
                    params_kwargs = tied_params(ihid, conv_params, level, 3)
                    l_xlevel = LT.VggEncoding3Layer(
                        l_xlevels[level - 1],
                        xlevels_c_dim[level],
                        dilation=(2**(level - 3), ) * 2,
                        level=str(level),
                        **params_kwargs)
                # TODO: LT.set_layer_param_tags(l_xlevel, trainable=False)
                # NOTE(review): kept as a comment, as in the visible
                # source -- confirm it was not meant to be a live call.

                # downsample to servoing resolution
                xlevel_shape = L.get_output_shape(l_xlevel)
                xlevel_dim = xlevel_shape[-1]
                assert xlevel_shape[-2] == xlevel_dim
                scale_factor = xlevel_dim // xd_dim
                if scale_factor > 1:
                    l_xdlevel = LT.Downscale2DLayer(
                        l_xlevel,
                        scale_factor=scale_factor,
                        name='x%dd' % level)
                elif scale_factor == 1:
                    l_xdlevel = l_xlevel
                else:
                    raise NotImplementedError

                if 0 < level < 3:
                    l_xlevel = L.MaxPool2DLayer(l_xlevel,
                                                pool_size=2,
                                                stride=2,
                                                pad=0,
                                                name='pool%d' % level)

                l_xlevels[level] = l_xlevel
                l_xdlevels[level] = l_xdlevel

            # standarized version of l_xdlevels used as the feature for
            # servoing
            l_ylevels = OrderedDict()
            for level in encoding_levels:
                params_kwargs = tied_params(ihid, standardize_params, level)
                l_ylevels[level] = LT.StandarizeLayer(l_xdlevels[level],
                                                      name='y%d' % level,
                                                      **params_kwargs)

            l_hids[ihid] = L.ConcatLayer(
                [l_ylevels[level] for level in encoding_levels], axis=1)

    # Every parameter set created for stream 0 must have been reused.
    assert not params_kwargs_list

    l_hid = L.ConcatLayer(l_hids, axis=1)

    for idx, conv_filter, filter_size, stride, pad in zip(
            range(len(conv_filters)),
            conv_filters,
            conv_filter_sizes,
            conv_strides,
            conv_pads,
    ):
        l_hid = L.Conv2DLayer(
            l_hid,
            num_filters=conv_filter,
            filter_size=filter_size,
            stride=(stride, stride),
            pad=pad,
            nonlinearity=hidden_nonlinearity,
            name="%sconv_hidden_%d" % (prefix, idx),
        )
    conv_out = l_hid

    for idx, hidden_size in enumerate(hidden_sizes):
        l_hid = L.DenseLayer(
            l_hid,
            num_units=hidden_size,
            nonlinearity=hidden_nonlinearity,
            name="%shidden_%d" % (prefix, idx),
            W=hidden_W_init,
            b=hidden_b_init,
        )

    l_out = L.DenseLayer(
        l_hid,
        num_units=output_dim,
        nonlinearity=output_nonlinearity,
        name="%soutput" % (prefix, ),
        W=output_W_init,
        b=output_b_init,
    )

    self._l_in = l_in
    self._l_out = l_out
    self._input_var = l_in.input_var
    self._conv_out = conv_out
def rnn_decoder(l_input_one_hot, l_encoder_hid, encoder_mask, out_sym,
                out_mask, out_go_sym, name="Decoder"):
    """Build an attention decoder and its loss / prediction expressions.

    The encoder states are summarised by a ``DropoutLSTMLayer`` whose last
    step is repeated along the output sequence and concatenated with the
    embedded ``out_go_sym`` tokens. A ``DropoutLSTMLayer`` decodes that
    sequence and a ``BahdanauKeyValueAttentionLayer`` produces the output
    distribution.

    Relies on the names ``dict_size`` and ``bs`` from the enclosing scope.

    Returns:
        Dict with keys ``'loss'`` (masked mean cross-entropy computed with
        stochastic layers active), ``'argmax'`` (predictions computed
        deterministically) and ``'params'`` (trainable parameters).
    """
    n_layers = 1
    n_units = 256
    n_attention_units = 256
    emb_size = 256
    rnn = DropoutLSTMLayer

    go_layer = L.InputLayer((None, None), input_var=out_go_sym)
    target_mask_layer = L.InputLayer((None, None), input_var=out_mask)
    source_mask_layer = L.InputLayer((None, None), input_var=encoder_mask)

    embedded_go = L.EmbeddingLayer(go_layer, dict_size, emb_size,
                                   name=name + '.Embedding')

    # Summarise the encoder and keep only its last time step.
    summarizer = rnn(l_encoder_hid,
                     num_units=n_units,
                     mask_input=source_mask_layer,
                     name=name + '.Summarizer',
                     dropout=0.25)
    summary = L.SliceLayer(summarizer, indices=-1, axis=1)
    # Repeat the summary once per output step.
    summary_per_step = RepeatLayer(summary,
                                   n=T.shape(out_go_sym)[1],
                                   axis=1)

    decoder = L.ConcatLayer([embedded_go, summary_per_step], axis=2)
    for layer_idx in range(n_layers):
        decoder = rnn(decoder,
                      num_units=n_units,
                      mask_input=target_mask_layer,
                      name="%s.%d.Forward" % (name, layer_idx),
                      learn_init=True,
                      dropout=0.25)

    attention = BahdanauKeyValueAttentionLayer(
        [l_encoder_hid, l_input_one_hot, source_mask_layer, decoder],
        n_attention_units,
        name=name + '.Attention')  # (bs, seq_out, dict)
    flat_out = L.ReshapeLayer(attention, (-1, [2]))

    # (batch * seq_out) x dict, with and without stochastic layers.
    probs_train = L.get_output(flat_out, deterministic=False)
    probs_eval = L.get_output(flat_out, deterministic=True)

    params = L.get_all_params([flat_out], trainable=True)

    # (batch * seq) x 1; the small constant avoids log(0).
    flat_xent = T.nnet.categorical_crossentropy(probs_train + 1e-8,
                                                out_sym.flatten())
    xent = T.reshape(flat_xent, (bs, -1))  # batch x seq
    loss = T.sum(out_mask * xent) / T.sum(out_mask)  # scalar

    per_step_probs = T.reshape(probs_eval, (bs, -1, dict_size))
    argmax = T.argmax(per_step_probs, axis=-1)  # batch x seq x 1
    return {'loss': loss, 'argmax': argmax, 'params': params}
def __init__(self,
             input_shape,
             output_dim,
             hidden_sizes,
             conv_filters,
             conv_filter_sizes,
             conv_strides,
             conv_pads,
             hidden_W_init=LI.GlorotUniform(),
             hidden_b_init=LI.Constant(0.),
             output_W_init=LI.GlorotUniform(),
             output_b_init=LI.Constant(0.),
             hidden_nonlinearity=LN.rectify,
             output_nonlinearity=LN.softmax,
             name=None,
             input_var=None):
    """Build a weight-tied two-stream conv network on squared differences.

    The input is split in half along the channel axis. Both halves go
    through the same convolutional stack (the second stream reuses the
    first stream's ``W`` and ``b``). The streams are subtracted, squared
    element-wise and passed through the dense hidden layers and the output
    layer.

    Stores ``_l_in``, ``_l_out`` and ``_input_var`` on the instance.

    Args:
        input_shape: Tuple. Length 3 or 2 means the input arrives
            flattened and is reshaped (2-D shapes get a leading ``1``);
            other lengths are used directly. The first entry must then be
            even (asserted).
        output_dim: Number of output units.
        hidden_sizes: Sizes of the dense hidden layers.
        conv_filters, conv_filter_sizes, conv_strides, conv_pads: Parallel
            sequences describing the convolutional layers.
        hidden_W_init, hidden_b_init, output_W_init, output_b_init:
            Initialisers for the dense layers.
        hidden_nonlinearity, output_nonlinearity: Nonlinearities.
        name: Optional prefix for layer names.
        input_var: Optional Theano variable for the input layer.
    """
    prefix = "" if name is None else name + "_"

    rank = len(input_shape)
    if rank == 3:
        l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                            input_var=input_var)
        l_hid = L.reshape(l_in, ([0], ) + input_shape)
    elif rank == 2:
        l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                            input_var=input_var)
        # Add a singleton channel axis before un-flattening.
        input_shape = (1, ) + input_shape
        l_hid = L.reshape(l_in, ([0], ) + input_shape)
    else:
        l_in = L.InputLayer(shape=(None, ) + input_shape,
                            input_var=input_var)
        l_hid = l_in

    assert input_shape[0] % 2 == 0
    half = input_shape[0] // 2
    l_hids = [
        L.SliceLayer(l_hid, slice(None, half), axis=1),
        L.SliceLayer(l_hid, slice(half, None), axis=1),
    ]

    conv_specs = zip(conv_filters, conv_filter_sizes, conv_strides,
                     conv_pads)
    for idx, (conv_filter, filter_size, stride, pad) in enumerate(
            conv_specs):
        for ihid, incoming in enumerate(list(l_hids)):
            # Stream 0 creates the parameters; later streams borrow the
            # ones of the stream-0 layer built in this same round.
            if ihid > 0:
                tied = dict(W=l_hids[0].W, b=l_hids[0].b)
            else:
                tied = dict()
            l_hids[ihid] = L.Conv2DLayer(
                incoming,
                num_filters=conv_filter,
                filter_size=filter_size,
                stride=(stride, stride),
                pad=pad,
                nonlinearity=hidden_nonlinearity,
                name="%sconv_hidden_%d_%d" % (prefix, idx, ihid),
                convolution=wrapped_conv,
                **tied)

    # (second stream - first stream) squared.
    l_hid = L.ElemwiseSumLayer(l_hids, coeffs=[-1, 1])
    l_hid = L.ExpressionLayer(l_hid, lambda X: X * X)

    for idx, hidden_size in enumerate(hidden_sizes):
        l_hid = L.DenseLayer(
            l_hid,
            num_units=hidden_size,
            nonlinearity=hidden_nonlinearity,
            name="%shidden_%d" % (prefix, idx),
            W=hidden_W_init,
            b=hidden_b_init,
        )

    l_out = L.DenseLayer(
        l_hid,
        num_units=output_dim,
        nonlinearity=output_nonlinearity,
        name="%soutput" % (prefix, ),
        W=output_W_init,
        b=output_b_init,
    )

    self._l_in = l_in
    self._l_out = l_out
    self._input_var = l_in.input_var
def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                  qmask_var, candmask_var, W_init):
    """Build a two-hop gated reader over a document and a query.

    The query is encoded twice by independent bidirectional GRUs (``l_q``
    and ``l_q_c``). The document is read by a bidirectional GRU with
    dropout, multiplied element-wise by the tiled second query encoding
    and read again by a final bidirectional GRU (``l_doc``). Positions are
    scored by a dot product with the first query encoding; non-candidate
    positions get a fixed score of ``-20`` before the softmax, and the
    probabilities are summed per vocabulary id.

    Returns:
        Tuple ``(final, final_v, l_doc, [l_q, l_q_c])``: per-vocabulary
        scores with stochastic layers active, the same computed with
        ``deterministic=True``, the final document layer and the two
        query-encoding layers.
    """

    def bi_gru(incoming, mask):
        # Forward layer is created before the backward one.
        common = dict(grad_clipping=GRAD_CLIP, mask_input=mask,
                      gradient_steps=GRAD_STEPS, precompute_input=True)
        forward = L.GRULayer(incoming, NUM_HIDDEN, **common)
        backward = L.GRULayer(incoming, NUM_HIDDEN, backwards=True,
                              **common)
        return forward, backward

    def end_states(forward, backward):
        # Last forward step joined with first backward step.
        return L.ConcatLayer([L.SliceLayer(forward, -1, 1),
                              L.SliceLayer(backward, 0, 1)])

    def pointer_sum(doc_states):
        scores = T.batched_dot(doc_states, q)  # B x N
        candidates = candmask_var.nonzero()
        masked = T.set_subtensor(
            T.alloc(-20., scores.shape[0], scores.shape[1])[candidates],
            scores[candmask_var.nonzero()])
        probs = T.nnet.softmax(masked)
        rows = T.reshape(
            T.repeat(T.arange(scores.shape[0]), scores.shape[1]),
            scores.shape)
        zeros = T.alloc(0., scores.shape[0], vocab_size)
        return T.inc_subtensor(
            zeros[rows, T.flatten(doc_var, outdim=2)], probs)

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)

    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=EMBED_DIM,
                                  W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed,
        (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
    # Query embedding shares the document embedding matrix.
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=EMBED_DIM, W=l_docembed.W)

    l_q = end_states(*bi_gru(l_qembed, l_qmask))  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    doc_fwd, doc_bkd = bi_gru(l_doce, l_docmask)
    l_doc_1 = L.concat([doc_fwd, doc_bkd], axis=2)
    l_doc_1 = L.dropout(l_doc_1, p=DROPOUT_RATE)

    l_q_c = end_states(*bi_gru(l_qembed, l_qmask))  # B x DE

    # Tile the second query encoding across all document positions.
    qd = L.get_output(l_q_c)
    q_rep = T.reshape(
        T.tile(qd, (1, doc_var.shape[1])),
        (doc_var.shape[0], doc_var.shape[1],
         2 * NUM_HIDDEN))  # B x N x DE

    l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                              input_var=q_rep)
    l_doc_gru_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)

    last_fwd, last_bkd = bi_gru(l_doc_gru_in, l_docmask)
    l_doc = L.concat([last_fwd, last_bkd], axis=2)

    final = pointer_sum(L.get_output(l_doc))  # from B x N x 2D
    final_v = pointer_sum(L.get_output(l_doc, deterministic=True))
    return final, final_v, l_doc, [l_q, l_q_c]
def clone(src_net, dst_net, mask_input):
    """Clone a lasagne neural network, keeping weights tied.

    For all layers of ``src_net`` in turn, starting at the first:

    1. creates a copy of the layer,
    2. reuses the original objects for weights and
    3. appends the new layer to ``dst_net``.

    InputLayers are ignored. Recurrent layers (LSTMLayer) are passed
    ``mask_input``. Cloned layers are named after the original with a
    ``'2'`` suffix; unnamed layers stay unnamed.

    Args:
        src_net: Output layer of the network to copy.
        dst_net: Layer the copies are stacked on top of.
        mask_input: Mask layer handed to every cloned ``LSTMLayer``.

    Returns:
        The last layer of the extended ``dst_net``.

    Raises:
        ValueError: If ``src_net`` contains a layer whose exact type is
            not InputLayer, DenseLayer, EmbeddingLayer, LSTMLayer or
            SliceLayer.
    """

    def cloned_name(layer):
        # Lasagne layers may have ``name=None``; ``None + '2'`` would
        # raise a TypeError.
        return None if layer.name is None else layer.name + '2'

    logger.info("Net to be cloned:")
    for l in layers.get_all_layers(src_net):
        logger.info(" - {} ({}):".format(l.name, l))

    logger.info("Starting to clone..")
    for l in layers.get_all_layers(src_net):
        logger.info("src_net[...]: {} ({}):".format(l.name, l))
        # Exact type matches on purpose: a subclass may need constructor
        # arguments that are not copied here.
        layer_type = type(l)
        if layer_type is layers.InputLayer:
            logger.info(' - skipping')
            continue
        if layer_type is layers.DenseLayer:
            dst_net = layers.DenseLayer(
                dst_net,
                num_units=l.num_units,
                W=l.W,
                b=l.b,
                nonlinearity=l.nonlinearity,
                name=cloned_name(l),
            )
        elif layer_type is layers.EmbeddingLayer:
            dst_net = layers.EmbeddingLayer(
                dst_net,
                l.input_size,
                l.output_size,
                W=l.W,
                name=cloned_name(l),
            )
        elif layer_type is layers.LSTMLayer:
            # NOTE(review): ``only_return_final`` is not copied; confirm
            # no cloned network relies on it.
            dst_net = layers.LSTMLayer(
                dst_net,
                l.num_units,
                ingate=layers.Gate(
                    W_in=l.W_in_to_ingate,
                    W_hid=l.W_hid_to_ingate,
                    W_cell=l.W_cell_to_ingate,
                    b=l.b_ingate,
                    nonlinearity=l.nonlinearity_ingate
                ),
                forgetgate=layers.Gate(
                    W_in=l.W_in_to_forgetgate,
                    W_hid=l.W_hid_to_forgetgate,
                    W_cell=l.W_cell_to_forgetgate,
                    b=l.b_forgetgate,
                    nonlinearity=l.nonlinearity_forgetgate
                ),
                cell=layers.Gate(
                    W_in=l.W_in_to_cell,
                    W_hid=l.W_hid_to_cell,
                    W_cell=None,
                    b=l.b_cell,
                    nonlinearity=l.nonlinearity_cell
                ),
                outgate=layers.Gate(
                    W_in=l.W_in_to_outgate,
                    W_hid=l.W_hid_to_outgate,
                    W_cell=l.W_cell_to_outgate,
                    b=l.b_outgate,
                    nonlinearity=l.nonlinearity_outgate
                ),
                nonlinearity=l.nonlinearity,
                cell_init=l.cell_init,
                hid_init=l.hid_init,
                backwards=l.backwards,
                learn_init=l.learn_init,
                peepholes=l.peepholes,
                gradient_steps=l.gradient_steps,
                grad_clipping=l.grad_clipping,
                unroll_scan=l.unroll_scan,
                precompute_input=l.precompute_input,
                # The source layer's mask cannot be read back
                # (AttributeError: 'LSTMLayer' object has no attribute
                # 'mask_input'), hence the explicit argument.
                name=cloned_name(l),
                mask_input=mask_input,
            )
        elif layer_type is layers.SliceLayer:
            dst_net = layers.SliceLayer(
                dst_net,
                indices=l.slice,
                axis=l.axis,
                name=cloned_name(l),
            )
        else:
            raise ValueError("Unhandled layer: {}".format(l))
        new_layer = layers.get_all_layers(dst_net)[-1]
        logger.info('dst_net[...]: {} ({})'.format(new_layer,
                                                   new_layer.name))

    logger.info("Result of cloning:")
    for l in layers.get_all_layers(dst_net):
        logger.info(" - {} ({}):".format(l.name, l))

    return dst_net
def _invert_Conv2DLayer(self, layer, feeder):
    """Invert a ``Conv2DLayer`` with a flipped-filter convolution.

    ``feeder`` is first passed through ``self._put_rectifiers`` and then
    convolved (stride 1, no bias, no nonlinearity, flipped filters) back
    to the channel count of ``layer.input_layer``. For strides ``(4, 4)``
    and ``(2, 2)`` the feeder is dilated beforehand; for ``(4, 4)`` the
    last three rows and columns are cropped afterwards and the conv
    weights are attached to the cropped layer as ``.W``.

    Raises:
        RuntimeError: If the layer's padding is neither "same" (by name or
            by value), ``'valid'`` nor ``(0, 0)``.
    """

    def _check_padding_same():
        # "Same" padding given numerically: odd filter size and a pad of
        # exactly half the filter size, on every axis.
        return all(size % 2 == 1 and size // 2 == amount
                   for size, amount in zip(layer.filter_size, layer.pad))

    # Warning they are swapped here.
    feeder = self._put_rectifiers(feeder, layer)

    kernel = layer.filter_size
    if layer.pad == 'same' or _check_padding_same():
        inverse_pad = 'same'
    elif layer.pad == 'valid' or layer.pad == (0, 0):
        inverse_pad = 'full'
    else:
        raise RuntimeError("Define your padding as full or same.")

    # By definition the
    # Flip filters must be on to be a proper deconvolution.
    in_channels = L.get_output_shape(layer.input_layer)[1]

    alexnet_stride = layer.stride == (4, 4)
    googlenet_stride = (not alexnet_stride) and layer.stride == (2, 2)

    # Todo: clean this! Both strided cases dilate the feeder first.
    if alexnet_stride:
        print("Applying alexnet hack.")
    elif googlenet_stride:
        print("Applying GoogLeNet hack.")
    if alexnet_stride or googlenet_stride:
        feeder = L.Upscale2DLayer(feeder, layer.stride, mode='dilate')

    deconv = L.Conv2DLayer(feeder,
                           num_filters=in_channels,
                           filter_size=kernel,
                           stride=1,
                           pad=inverse_pad,
                           nonlinearity=None,
                           b=None,
                           flip_filters=True)
    if not alexnet_stride:
        return deconv

    print("Applying alexnet hack part 2.")
    columns_cropped = L.SliceLayer(deconv, slice(0, -3), axis=3)
    cropped = L.SliceLayer(columns_cropped, slice(0, -3), axis=2)
    cropped.W = deconv.W
    return cropped
def build_network(self, vocab_size, input_var, mask_var, docidx_var,
                  docidx_mask, skip_connect=True):
    """Build a two-layer peephole LSTM reader with a tied softmax output.

    All weights come from ``self.params``. The embedded input passes
    through dropout into the first LSTM; the second LSTM reads the first
    one's states, optionally concatenated with the raw embeddings
    (``skip_connect``), again after dropout. The last states of both LSTMs
    are concatenated, projected with ``tanh`` to ``EMBED_DIM`` units and
    scored against the transposed embedding matrix by a bias-free softmax
    layer.

    ``docidx_var`` and ``docidx_mask`` are not used by the active code;
    they belonged to a disabled ``theano.scan`` variant that restricted
    the softmax to the words of each document.

    Returns:
        The softmax output layer over ``vocab_size`` units.
    """
    params = self.params
    sigmoid = lasagne.nonlinearities.sigmoid
    tanh = lasagne.nonlinearities.tanh

    def gate(w_in, w_hid, w_cell, bias, nonlinearity=sigmoid):
        # ``w_cell=None`` builds a gate without a cell connection.
        return L.Gate(W_in=params[w_in],
                      W_hid=params[w_hid],
                      W_cell=None if w_cell is None else params[w_cell],
                      b=params[bias],
                      nonlinearity=nonlinearity)

    def peephole_lstm(incoming, ingate, forgetgate, cellgate, outgate):
        return L.LSTMLayer(incoming, NUM_HIDDEN,
                           ingate=ingate, forgetgate=forgetgate,
                           cell=cellgate, outgate=outgate,
                           peepholes=True, grad_clipping=GRAD_CLIP,
                           mask_input=l_mask,
                           gradient_steps=GRAD_STEPS,
                           precompute_input=True)

    l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
    l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
    l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size,
                               output_size=EMBED_DIM, W=params['W_emb'])
    l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE)

    # NOTE: Moved initialization of forget gate biases to init_params
    # NOTE: LSTM layer provided by Lasagne is slightly different from that
    # used in DeepMind's paper. In the paper the cell-to-* weights are not
    # diagonal.

    # the 1st lstm layer
    in_gate = gate('W_lstm1_xi', 'W_lstm1_hi', 'W_lstm1_ci', 'b_lstm1_i')
    forget_gate = gate('W_lstm1_xf', 'W_lstm1_hf', 'W_lstm1_cf',
                       'b_lstm1_f')
    out_gate = gate('W_lstm1_xo', 'W_lstm1_ho', 'W_lstm1_co', 'b_lstm1_o')
    cell_gate = gate('W_lstm1_xc', 'W_lstm1_hc', None, 'b_lstm1_c',
                     nonlinearity=tanh)
    l_fwd_1 = peephole_lstm(l_embed_noise, in_gate, forget_gate,
                            cell_gate, out_gate)

    # the 2nd lstm layer
    if skip_connect:
        # construct skip connection from the lookup table to the 2nd layer
        batch_size, seq_len, _ = input_var.shape
        # concatenate the last dimension of l_fwd_1 and embed
        flat_states = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN))
        flat_embed = L.ReshapeLayer(l_embed, (-1, EMBED_DIM))
        to_next_layer = L.ReshapeLayer(
            L.concat([flat_states, flat_embed], axis=1),
            (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM))
    else:
        to_next_layer = l_fwd_1
    to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE)

    in_gate = gate('W_lstm2_xi', 'W_lstm2_hi', 'W_lstm2_ci', 'b_lstm2_i')
    forget_gate = gate('W_lstm2_xf', 'W_lstm2_hf', 'W_lstm2_cf',
                       'b_lstm2_f')
    out_gate = gate('W_lstm2_xo', 'W_lstm2_ho', 'W_lstm2_co', 'b_lstm2_o')
    cell_gate = gate('W_lstm2_xc', 'W_lstm2_hc', None, 'b_lstm2_c',
                     nonlinearity=tanh)
    l_fwd_2 = peephole_lstm(to_next_layer_noise, in_gate, forget_gate,
                            cell_gate, out_gate)

    # slice final states of both lstm layers
    last_1 = L.SliceLayer(l_fwd_1, -1, 1)
    last_2 = L.SliceLayer(l_fwd_2, -1, 1)

    # g will be used to score the words based on their embeddings
    g = L.DenseLayer(L.concat([last_1, last_2], axis=1),
                     num_units=EMBED_DIM,
                     W=params['W_dense'],
                     b=params['b_dense'],
                     nonlinearity=tanh)

    # W is shared with the lookup table
    return L.DenseLayer(g,
                        num_units=vocab_size,
                        W=params['W_emb'].T,
                        nonlinearity=lasagne.nonlinearities.softmax,
                        b=None)
def buildModel(self):
    """Build the symbolic graph and compile the Theano functions.

    The following compiled functions are stored on the instance:

    * ``step_train(x, y, ind)`` -- one SGD step on the sum of the mean
      cross-entropies of the three label predictors (sparse features,
      vertex embedding, and their merge); returns the loss.
    * ``test_fn(x, ind)`` -- output of the merged label predictor.
    * ``sup_train(g)`` -- one SGD step on the summed cross-entropy between
      the softmax over vertices computed from column 0 of ``g`` and the
      vertex ids in column 1 of ``g``.
    * ``lstm_fn(sub_paths, g, mask)`` -- one SGD step (learning rate 0.01)
      on the LSTM parameters that are not part of ``sup_train``'s
      parameters, using the path-reweighted loss.
    * ``alpha_fn(sub_paths, g, mask)`` -- one SGD step (learning rate
      0.001) on ``sup_train``'s parameters using the same reweighted loss.
    """
    print(' -- Building...')
    x_init = sparse.csr_matrix('x', dtype='float32')
    y_init = T.imatrix('y')
    g_init = T.imatrix('g')
    ind_init = T.ivector('ind')
    sub_path_init = T.imatrix('subPathsBatch')
    mask_init = T.fmatrix('subMask')

    # ---- step train -----------------------------------------------------
    x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                             input_var=x_init)
    g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init)
    ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init)
    pair_second = lgl.SliceLayer(g_input, indices=1, axis=1)
    pair_first = lgl.SliceLayer(g_input, indices=0, axis=1)
    pair_first_emd = lgl.EmbeddingLayer(pair_first,
                                        input_size=self.num_ver,
                                        output_size=self.embedding_size)
    emd_to_numver = layers.DenseLayer(
        pair_first_emd, self.num_ver,
        nonlinearity=lg.nonlinearities.softmax)
    # The index embedding shares its lookup table with the pair embedding.
    index_emd = lgl.EmbeddingLayer(ind_input,
                                   input_size=self.num_ver,
                                   output_size=self.embedding_size,
                                   W=pair_first_emd.W)
    x_to_ydim = layers.SparseLayer(x_input, self.y.shape[1],
                                   nonlinearity=lg.nonlinearities.softmax)
    # From here on ``index_emd`` is the dense softmax head on top of the
    # embedding, so ``index_emd.W`` / ``index_emd.b`` below are its params.
    index_emd = layers.DenseLayer(index_emd, self.y.shape[1],
                                  nonlinearity=lg.nonlinearities.softmax)
    concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1)
    concat_two = layers.DenseLayer(concat_two, self.y.shape[1],
                                   nonlinearity=lg.nonlinearities.softmax)

    concat_two_output = lgl.get_output(concat_two)
    step_loss = lgo.categorical_crossentropy(concat_two_output,
                                             y_init).mean()
    hid_loss = lgl.get_output(x_to_ydim)
    step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
    emd_loss = lgl.get_output(index_emd)
    step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
    step_params = [
        index_emd.W, index_emd.b,
        x_to_ydim.W, x_to_ydim.b,
        concat_two.W, concat_two.b
    ]
    step_updates = lg.updates.sgd(step_loss, step_params,
                                  learning_rate=self.step_learning_rate)
    self.step_train = theano.function([x_init, y_init, ind_init],
                                      step_loss,
                                      updates=step_updates,
                                      on_unused_input='ignore')
    self.test_fn = theano.function([x_init, ind_init],
                                   concat_two_output,
                                   on_unused_input='ignore')

    # ---- supervised train -----------------------------------------------
    fc_output = lgl.get_output(emd_to_numver)
    pair_second_output = lgl.get_output(pair_second)
    sup_loss = lgo.categorical_crossentropy(fc_output,
                                            pair_second_output).sum()
    sup_params = lgl.get_all_params(emd_to_numver, trainable=True)
    sup_updates = lg.updates.sgd(sup_loss, sup_params,
                                 learning_rate=self.sup_learning_rate)
    self.sup_train = theano.function([g_init],
                                     sup_loss,
                                     updates=sup_updates,
                                     on_unused_input='ignore')

    # Per-pair cross-entropy laid out as a single row so it can be dotted
    # with the column of path weights below.
    cross_entropy = lgo.categorical_crossentropy(fc_output,
                                                 pair_second_output)
    cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size),
                              ndim=None)

    mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                input_var=mask_init)
    subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1),
                                input_var=sub_path_init)
    sub_path_emd = lgl.EmbeddingLayer(subPath_in,
                                      input_size=self.num_ver,
                                      output_size=self.embedding_size,
                                      W=pair_first_emd.W)
    lstm_layer = lgl.LSTMLayer(sub_path_emd,
                               self.lstm_hidden_units,
                               grad_clipping=3,
                               mask_input=mask_input)

    # ---- handle path weight ---------------------------------------------
    # Average the LSTM output over axis 1 twice, leaving one value per
    # sub-path, then batch-normalise and squash it into (1, 1.3).
    max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
    max2 = T.mean(max1, axis=1)
    # The original created an unused ``T.fcol('max2')`` placeholder here
    # that was overwritten immediately; only the reshape is needed.
    max2_init = T.reshape(max2, ((self.subpath_num, 1)))
    max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                input_var=max2_init)
    max2_input = lgl.BatchNormLayer(max2_input)
    path_weight = lgl.get_output(max2_input)
    path_weight = lg.nonlinearities.sigmoid(path_weight)
    path_weight = 1 + 0.3 * path_weight

    # ---- unsupervised train ---------------------------------------------
    # (1, unsup_batch_size) . (subpath_num, 1) -> scalar; the dot product
    # requires unsup_batch_size == subpath_num.
    reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
    lstm_params_all = lgl.get_all_params(lstm_layer, trainable=True)
    # Exclude parameters already owned by the supervised head (the shared
    # embedding table) so the two update rules do not overlap.
    lstm_params = list(set(lstm_params_all).difference(set(sup_params)))
    lstm_updates = lg.updates.sgd(reweight_loss, lstm_params,
                                  learning_rate=0.01)
    self.lstm_fn = theano.function([sub_path_init, g_init, mask_init],
                                   reweight_loss,
                                   updates=lstm_updates,
                                   on_unused_input='ignore')
    alpha_updates = lg.updates.sgd(reweight_loss, sup_params,
                                   learning_rate=0.001)
    self.alpha_fn = theano.function([sub_path_init, g_init, mask_init],
                                    reweight_loss,
                                    updates=alpha_updates,
                                    on_unused_input='ignore')
    print(' -- Done!')
def build_network(self, K, vocab_size, doc_var, query_var, cand_var,
                  docmask_var, qmask_var, candmask_var, W_init):
    """Build a K-hop reader whose document encoding is gated by the query.

    A shared embedding feeds bidirectional GRUs over the document and the
    query.  For each of the first ``K - 1`` hops the document encoding is
    multiplied element-wise by a fresh query summary (tiled along the
    document axis) and passed through dropout.  The last hop's document
    encoding is scored against the first query summary, softmaxed, masked
    with ``candmask_var``, renormalised and aggregated with ``cand_var``.

    Returns
    -------
    tuple
        ``(final, final_v, l_doc, l_qs, W_emb)``: the stochastic and
        deterministic output expressions, the last document layer, the
        list of query summary layers, and the embedding matrix.
    """
    # Options shared by every recurrent layer in this network.
    gru_opts = dict(grad_clipping=GRAD_CLIP,
                    gradient_steps=GRAD_STEPS,
                    precompute_input=True)
    n_batch, n_doc = doc_var.shape[0], doc_var.shape[1]

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)

    l_docembed = L.EmbeddingLayer(l_docin,
                                  input_size=vocab_size,
                                  output_size=self.embed_dim,
                                  W=W_init)  # B x N x 1 x DE
    doc_repr = L.ReshapeLayer(
        l_docembed, (n_batch, n_doc, self.embed_dim))  # B x N x DE
    l_qembed = L.EmbeddingLayer(l_qin,
                                input_size=vocab_size,
                                output_size=self.embed_dim,
                                W=l_docembed.W)

    # Optionally freeze the (shared) embedding table.
    if self.train_emb == 0:
        l_docembed.params[l_docembed.W].remove('trainable')

    # Query summary used for the final scoring step.
    q_fwd = L.GRULayer(l_qembed, self.nhidden, mask_input=l_qmask,
                       **gru_opts)
    q_bkd = L.GRULayer(l_qembed, self.nhidden, mask_input=l_qmask,
                       backwards=True, **gru_opts)
    l_q = L.ConcatLayer([L.SliceLayer(q_fwd, -1, 1),
                         L.SliceLayer(q_bkd, 0, 1)])  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    l_qs = [l_q]
    for _ in range(K - 1):
        hop_doc_fwd = L.GRULayer(doc_repr, self.nhidden,
                                 mask_input=l_docmask, **gru_opts)
        hop_doc_bkd = L.GRULayer(doc_repr, self.nhidden,
                                 mask_input=l_docmask, backwards=True,
                                 **gru_opts)
        hop_doc = L.concat([hop_doc_fwd, hop_doc_bkd], axis=2)

        hop_q_fwd = L.GRULayer(l_qembed, self.nhidden,
                               mask_input=l_qmask, **gru_opts)
        hop_q_bkd = L.GRULayer(l_qembed, self.nhidden,
                               mask_input=l_qmask, backwards=True,
                               **gru_opts)
        hop_q = L.ConcatLayer([L.SliceLayer(hop_q_fwd, -1, 1),
                               L.SliceLayer(hop_q_bkd, 0, 1)])
        l_qs.append(hop_q)

        # Repeat the query summary along the document axis so it can gate
        # every document position.
        hop_q_out = L.get_output(hop_q)
        tiled_q = T.reshape(T.tile(hop_q_out, (1, n_doc)),
                            (n_batch, n_doc, 2 * self.nhidden))
        l_tiled_q = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                 input_var=tiled_q)
        gated = L.ElemwiseMergeLayer([hop_doc, l_tiled_q], T.mul)
        doc_repr = L.dropout(gated, p=self.dropout)

    # Final document encoder.
    l_fwd_doc = L.GRULayer(doc_repr, self.nhidden, mask_input=l_docmask,
                           **gru_opts)
    l_bkd_doc = L.GRULayer(doc_repr, self.nhidden, mask_input=l_docmask,
                           backwards=True, **gru_opts)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    def _aggregate(doc_out):
        """Score ``doc_out`` (B x N x 2D) against ``q`` and pool by candidate."""
        scores = T.batched_dot(doc_out, q)  # B x N
        weights = T.nnet.softmax(scores) * candmask_var
        weights = weights / weights.sum(axis=1)[:, np.newaxis]
        return T.batched_dot(weights, cand_var)

    final = _aggregate(L.get_output(l_doc))
    final_v = _aggregate(L.get_output(l_doc, deterministic=True))

    return final, final_v, l_doc, l_qs, l_docembed.W
def __init__(self, input_size, img_width, img_height, channel_size=1,
             action_size=None, feature_dim=10, hidden_sizes=(32,32),
             conv_args={}, l2_reg=0, kl_weight=1, learning_rate=1e-4,
             hidden_act=nonlinearities.tanh, use_actions=False,
             set_norm_constant=None):
    """Build a two-branch variational encoder with a pair classifier.

    Each of the two inputs is reshaped to an image, encoded by its own
    ``ConvNet`` and mapped by separate MLPs to the mean and log-variance
    of a latent Gaussian.  Samples from both latents are concatenated and
    fed to an MLP with a single sigmoid output.  ``train_model`` and
    ``test_model`` are compiled and stored on the instance.

    Parameters
    ----------
    input_size : int
        Width of one flat input row (images plus, optionally, actions).
    img_width, img_height, channel_size : int
        Target shape ``(channel_size, img_width, img_height)`` each input
        row is reshaped to before the convolutional encoder.
    action_size : int or None
        Number of trailing columns split off as the action vector.  Must
        be given when ``use_actions`` is true.
    feature_dim : int
        Output size of each mean / log-variance MLP.
    hidden_sizes : tuple of int
        Hidden layer sizes passed to every ``MLP``.
    conv_args : dict
        Keyword arguments forwarded to ``ConvNet``.  The dict is only
        unpacked, never mutated.
    l2_reg : float
        Weight of the L2 penalty on the network parameters (0 disables).
    kl_weight : float
        Forwarded to ``latent_gaussian_x_bernoulli``.
    learning_rate : float
        Adam learning rate.
    hidden_act : callable
        Hidden nonlinearity of the mean / log-variance MLPs.
    use_actions : bool
        If true, the last ``action_size`` columns bypass the ConvNet and
        are concatenated to its output.
    set_norm_constant
        Stored on the instance; not used in this constructor.
    """
    self.input_size = input_size
    self.set_norm_constant = set_norm_constant

    self.sym_x1 = T.matrix()
    self.sym_x2 = T.matrix()
    self.sym_labels = T.matrix()

    self.lin1 = lasagne.layers.InputLayer((None, input_size))
    self.lin2 = lasagne.layers.InputLayer((None, input_size))

    if use_actions:
        # Split each row into [image pixels | action vector].
        lin1 = L.SliceLayer(self.lin1, slice(0, -action_size))
        lin2 = L.SliceLayer(self.lin2, slice(0, -action_size))
        lact1 = L.SliceLayer(self.lin1, slice(-action_size, None))
        lact2 = L.SliceLayer(self.lin2, slice(-action_size, None))
    else:
        lin1 = self.lin1
        lin2 = self.lin2

    lin1 = L.ReshapeLayer(lin1, (-1, channel_size, img_width, img_height))
    lin2 = L.ReshapeLayer(lin2, (-1, channel_size, img_width, img_height))

    self.base1 = ConvNet(lin1, **conv_args)
    self.base2 = ConvNet(lin2, **conv_args)
    l1_enc_h2 = self.base1.output_layer()
    l2_enc_h2 = self.base2.output_layer()

    if use_actions:
        l1_enc_h2 = L.ConcatLayer([l1_enc_h2, lact1])
        l2_enc_h2 = L.ConcatLayer([l2_enc_h2, lact2])

    self.mean_net1 = MLP(l1_enc_h2, feature_dim, hidden_sizes, hidden_act)
    self.mean_net2 = MLP(l2_enc_h2, feature_dim, hidden_sizes, hidden_act)
    self.logvar_net1 = MLP(l1_enc_h2, feature_dim, hidden_sizes,
                           hidden_act)
    # Fix: the second branch's log-variance must be computed from the
    # second branch's encoding.  It was previously wired to l1_enc_h2, so
    # the variance of z2 did not depend on x2 at all.
    self.logvar_net2 = MLP(l2_enc_h2, feature_dim, hidden_sizes,
                           hidden_act)

    l1_mu = self.mean_net1.output_layer()
    l1_log_var = self.logvar_net1.output_layer()
    l2_mu = self.mean_net2.output_layer()
    l2_log_var = self.logvar_net2.output_layer()

    # Sample latent variables
    l1_z = SimpleSampleLayer(mean=l1_mu, log_var=l1_log_var)
    l2_z = SimpleSampleLayer(mean=l2_mu, log_var=l2_log_var)
    combined_z = L.ConcatLayer([l1_z, l2_z])

    # Classify from latent
    self.class_net = MLP(combined_z, 1, hidden_sizes,
                         output_act=nonlinearities.sigmoid)
    l_output = self.class_net.output_layer()

    combined_mu = L.ConcatLayer([l1_mu, l2_mu])
    combined_logvar = L.ConcatLayer([l1_log_var, l2_log_var])

    sym_inputs = {self.lin1: self.sym_x1, self.lin2: self.sym_x2}
    z_train, z_mu_train, z_log_var_train, output_train = L.get_output(
        [combined_z, combined_mu, combined_logvar, l_output],
        inputs=sym_inputs,
        deterministic=False
    )
    # Kept from the original even though the results are unused: removing
    # the call could change how random streams are allocated.
    l1_z_t, l2_z_t = L.get_output(
        [l1_z, l2_z],
        inputs=sym_inputs,
        deterministic=False
    )
    output_test = L.get_output(l_output, inputs=sym_inputs,
                               deterministic=True)

    self.LL_train, self.class_loss, self.kl_loss = \
        latent_gaussian_x_bernoulli(z_train, z_mu_train, z_log_var_train,
                                    output_train, self.sym_labels, True,
                                    kl_weight)
    # The helper returns a quantity to maximise; negate it to get a loss.
    self.LL_train *= -1

    if l2_reg != 0:
        self.LL_train += l2_reg * \
            lasagne.regularization.regularize_network_params(
                l_output, lasagne.regularization.l2)

    # self.params() reads these attributes, so set them before calling it.
    self.l_output = l_output
    self.output_test = output_test

    params = self.params()
    grads = T.grad(self.LL_train, params)
    updates = lasagne.updates.adam(grads, params,
                                   learning_rate=learning_rate)

    with compile_timer('train_fn'):
        self.train_model = theano.function(
            [self.sym_x1, self.sym_x2, self.sym_labels],
            [self.LL_train, self.class_loss, self.kl_loss],
            updates=updates)

    with compile_timer('test_fn'):
        self.test_model = theano.function([self.sym_x1, self.sym_x2],
                                          self.output_test)
def _invert_layer_recursion(self, layer, prev_layer):
    """Recursively build the inverse of ``layer`` inside the inverted graph.

    The layers that consume ``layer`` (``self.output_map[layer]``) are
    inverted first and their results are merged into a single "feeder":

    * no consumer   -> ``layer`` is an output; the relevance starting
      values are injected through a ``NonlinearityLayer``;
    * one consumer  -> its inverse is used directly;
    * several slices -> their inverses are concatenated along the slice
      axis (assumed to be non-overlapping);
    * anything else -> the inverses are summed pairwise.

    Results are cached in ``self.inverse_map``, except for concatenation
    layers: those are called once per input and must return a different
    slice every time, so caching would corrupt the result.

    :param layer: Layer at which the inversion recursion starts.
    :param prev_layer: The input of ``layer`` on whose behalf the call is
        made; only used to select the slice of a concatenation layer.
    :return: the inverted layer, part of the entire graph.
    """
    # Did we already invert this?
    if self.inverse_map[layer] is not None:
        return self.inverse_map[layer]

    consumers = self.output_map[layer]
    inverted = [
        self._invert_layer_recursion(consumer, layer)
        for consumer in consumers
    ]

    n_inverted = len(inverted)
    if n_inverted == 0:
        # It feeds nothing, so it must be the output layer: inject the
        # explanation starting point.
        def nonlinearity(x):
            return 0 * x + self.relevance_values

        feeder = L.NonlinearityLayer(layer, nonlinearity=nonlinearity)
    elif n_inverted == 1:
        feeder = inverted[0]
    elif type(consumers[0]) is SliceLayer:
        # Multiple feeders that are slices of this layer.
        print("Assuming all slices and non-overlapping")
        # TODO CHECK ASSUMPTIONS ARE APPLICABLE
        cat_axis = consumers[0].axis
        print([l.slice for l in consumers])
        feeder = L.ConcatLayer(inverted, axis=cat_axis)
    else:
        # Multiple generic feeders: accumulate them with pairwise sums.
        feeder = inverted[0]
        for extra in inverted[1:]:
            feeder = L.ElemwiseSumLayer([feeder, extra])

    if not isinstance(layer, L.ConcatLayer):
        self.inverse_map[layer] = self._invert_layer(layer, feeder)
        return self.inverse_map[layer]

    # Concatenation layer: locate the segment that belongs to prev_layer
    # and hand back just that slice (deliberately not cached).
    axis = layer.axis
    start_slice = 0
    for incoming in layer.input_layers:
        if incoming == prev_layer:
            break
        start_slice += L.get_output_shape(incoming)[axis]
    end_slice = start_slice + L.get_output_shape(prev_layer)[axis]
    return L.SliceLayer(feeder, slice(start_slice, end_slice), axis=axis)
def build_model(vocab_size, doc_var, qry_var, doc_mask_var, qry_mask_var,
                W_init=lasagne.init.Normal()):
    """Build a document/query reader that outputs a softmax over the vocabulary.

    Document and query share one embedding table.  Each is encoded by a
    forward and a backward LSTM; the last forward step and the first
    backward step are summed and passed through a tanh dense layer.  Both
    summaries are concatenated, projected to ``EMBED_DIM`` and scored
    against the transposed embedding table.

    Returns the output ``DenseLayer`` (softmax over ``vocab_size``).
    """
    tanh = lasagne.nonlinearities.tanh
    # Settings common to all four LSTM layers.
    lstm_opts = dict(grad_clipping=GRAD_CLIP,
                     gradient_steps=GRAD_STEPS,
                     precompute_input=True)

    doc_in = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    qry_in = L.InputLayer(shape=(None, None, 1), input_var=qry_var)

    doc_embed = L.EmbeddingLayer(doc_in, vocab_size, EMBED_DIM, W=W_init)
    # The query reuses the document's lookup table.
    qry_embed = L.EmbeddingLayer(qry_in, vocab_size, EMBED_DIM,
                                 W=doc_embed.W)

    doc_mask = L.InputLayer(shape=(None, None), input_var=doc_mask_var)
    qry_mask = L.InputLayer(shape=(None, None), input_var=qry_mask_var)

    doc_fwd = L.LSTMLayer(doc_embed, NUM_HIDDEN, mask_input=doc_mask,
                          **lstm_opts)
    doc_bkd = L.LSTMLayer(doc_embed, NUM_HIDDEN, mask_input=doc_mask,
                          backwards=True, **lstm_opts)
    qry_fwd = L.LSTMLayer(qry_embed, NUM_HIDDEN, mask_input=qry_mask,
                          **lstm_opts)
    qry_bkd = L.LSTMLayer(qry_embed, NUM_HIDDEN, mask_input=qry_mask,
                          backwards=True, **lstm_opts)

    # Final state of each direction: last step forward, first step backward.
    doc_fwd_last = L.SliceLayer(doc_fwd, -1, 1)
    doc_bkd_last = L.SliceLayer(doc_bkd, 0, 1)
    qry_fwd_last = L.SliceLayer(qry_fwd, -1, 1)
    qry_bkd_last = L.SliceLayer(qry_bkd, 0, 1)

    doc_summary = L.ElemwiseSumLayer([doc_fwd_last, doc_bkd_last])
    r = L.DenseLayer(doc_summary, num_units=NUM_HIDDEN, nonlinearity=tanh)
    qry_summary = L.ElemwiseSumLayer([qry_fwd_last, qry_bkd_last])
    u = L.DenseLayer(qry_summary, num_units=NUM_HIDDEN, nonlinearity=tanh)

    joint = L.concat([r, u], axis=1)
    g = L.DenseLayer(joint,
                     num_units=EMBED_DIM,
                     W=lasagne.init.GlorotNormal(),
                     nonlinearity=tanh)

    # Output weights are tied to the embedding table; no bias.
    return L.DenseLayer(g,
                        num_units=vocab_size,
                        W=doc_embed.W.T,
                        nonlinearity=lasagne.nonlinearities.softmax,
                        b=None)
def build_critic(self, critic_input_var, condition_var, vocoder, ctxsize,
                 nonlinearity=lasagne.nonlinearities.very_leaky_rectify,
                 postlayers_nb=6, use_LSweighting=True,
                 LSWGANtransfreqcutoff=4000, LSWGANtranscoef=1.0 / 8.0,
                 use_WGAN_incnoisefeature=False):
    """Build the critic network over vocoder features and context.

    The spectral slice of the input (and optionally the noise slice) is
    weighted, run through a stack of gated 2D convolutions and flattened
    to three dimensions.  These feature branches are concatenated with the
    context branch along axis 2 and passed through ``postlayers_nb`` dense
    layers followed by a linear projection to one unit.

    Returns
    -------
    list
        ``[output_layer, critic_input_layer, context_input_layer]``.
    """
    useLRN = False  # TODO

    layer_critic = ll.InputLayer(
        shape=(None, None, vocoder.featuressize()),
        input_var=critic_input_var,
        name='input')

    winlen = int(0.5 * self._windur / 0.005) * 2 + 1

    branches = []

    # ---- Amplitude spectrum branch --------------------------------------
    # Assumed feature order: [f0 | spectrum | noise | ...]
    spec_slice = slice(vocoder.f0size(),
                       vocoder.f0size() + vocoder.specsize())
    layer = ll.SliceLayer(layer_critic, indices=spec_slice, axis=2,
                          name='spec_slice')

    if use_LSweighting:  # Using weighted WGAN+LS
        print('WGAN Weighted LS - critic - SPEC (trans cutoff {}Hz)'.format(
            LSWGANtransfreqcutoff))
        spec_bins = np.arange(vocoder.specsize(),
                              dtype=theano.config.floatX)
        spec_cutidx = sp.freq2fwspecidx(LSWGANtransfreqcutoff, vocoder.fs,
                                        vocoder.specsize())
        wganls_spec_weights_ = nonlin_sigmoidparm(spec_bins, spec_cutidx,
                                                  LSWGANtranscoef)
        wganls_weights = theano.shared(
            value=np.asarray(wganls_spec_weights_),
            name='wganls_spec_weights_')
        layer = CstMulLayer(layer, cstW=wganls_weights,
                            name='cstdot_wganls_weights')

    # Insert a singleton channel axis for the 2D convolutions.
    layer = ll.dimshuffle(layer, [0, 'x', 1, 2], name='spec_dimshuffle')
    for layi in xrange(self._nbcnnlayers):
        layerstr = 'spec_l{}_GC{}x{}x{}'.format(
            1 + layi, self._nbfilters, winlen, self._spec_freqlen)
        # strides>1 make the first two Conv layers pyramidal. Increase
        # patches' effects here and there, bad.
        layer = layer_GatedConv2DLayer(layer, self._nbfilters,
                                       [winlen, self._spec_freqlen],
                                       pad='same',
                                       nonlinearity=nonlinearity,
                                       name=layerstr)
        if useLRN:
            layer = ll.LocalResponseNormalization2DLayer(layer)
    # Move the channel axis last, then merge the two trailing axes.
    layer = ll.dimshuffle(layer, [0, 2, 3, 1], name='spec_dimshuffle')
    layer_spec = ll.flatten(layer, outdim=3, name='spec_flatten')
    branches.append(layer_spec)

    # ---- Optional noise branch ------------------------------------------
    if use_WGAN_incnoisefeature and vocoder.noisesize() > 0:
        noise_start = vocoder.f0size() + vocoder.specsize()
        noise_slice = slice(
            noise_start,
            vocoder.f0size() + vocoder.specsize() + vocoder.noisesize())
        layer = ll.SliceLayer(layer_critic, indices=noise_slice, axis=2,
                              name='nm_slice')

        if use_LSweighting:  # Using weighted WGAN+LS
            print('WGAN Weighted LS - critic - NM (trans cutoff {}Hz)'.
                  format(LSWGANtransfreqcutoff))
            noise_bins = np.arange(vocoder.noisesize(),
                                   dtype=theano.config.floatX)
            noise_cutidx = sp.freq2fwspecidx(LSWGANtransfreqcutoff,
                                             vocoder.fs,
                                             vocoder.noisesize())
            wganls_spec_weights_ = nonlin_sigmoidparm(
                noise_bins, noise_cutidx, LSWGANtranscoef)
            wganls_weights = theano.shared(
                value=np.asarray(wganls_spec_weights_),
                name='wganls_spec_weights_')
            layer = CstMulLayer(layer, cstW=wganls_weights,
                                name='cstdot_wganls_weights')

        layer = ll.dimshuffle(layer, [0, 'x', 1, 2], name='nm_dimshuffle')
        # The noise branch uses about half as many conv layers (at least 1).
        nb_noise_layers = np.max(
            (1, int(np.ceil(self._nbcnnlayers / 2))))
        for layi in xrange(nb_noise_layers):
            layerstr = 'nm_l{}_GC{}x{}x{}'.format(
                1 + layi, self._nbfilters, winlen, self._noise_freqlen)
            layer = layer_GatedConv2DLayer(layer, self._nbfilters,
                                           [winlen, self._noise_freqlen],
                                           pad='same',
                                           nonlinearity=nonlinearity,
                                           name=layerstr)
            if useLRN:
                layer = ll.LocalResponseNormalization2DLayer(layer)
        layer = ll.dimshuffle(layer, [0, 2, 3, 1], name='nm_dimshuffle')
        layer_bndnm = ll.flatten(layer, outdim=3, name='nm_flatten')
        branches.append(layer_bndnm)

    # ---- Context branch -------------------------------------------------
    layer_ctx_input = ll.InputLayer(shape=(None, None, ctxsize),
                                    input_var=condition_var,
                                    name='ctx_input')
    layer_ctx = layer_context(layer_ctx_input,
                              ctx_nblayers=self._ctx_nblayers,
                              ctx_nbfilters=self._ctx_nbfilters,
                              ctx_winlen=self._ctx_winlen,
                              hiddensize=self._hiddensize,
                              nonlinearity=nonlinearity,
                              bn_axes=None,
                              bn_cnn_axes=None,
                              critic=True,
                              useLRN=useLRN)
    branches.append(layer_ctx)

    # Concatenate the features analysis with the contexts...
    layer = ll.ConcatLayer(branches, axis=2, name='ctx_features.concat')

    # ... and finalize with a common FC network
    for layi in xrange(postlayers_nb):
        layerstr = 'post.l{}_FC{}'.format(1 + layi, self._hiddensize)
        layer = ll.DenseLayer(layer, self._hiddensize,
                              nonlinearity=nonlinearity,
                              num_leading_axes=2,
                              name=layerstr)

    # output layer (linear): no nonlinearity for this projection
    layer = ll.DenseLayer(layer, 1, nonlinearity=None,
                          num_leading_axes=2, name='projection')

    return [layer, layer_critic, layer_ctx_input]
def __init__(self, n_inputs=None, n_outputs=None, input_shape=None,
             n_bypass=0, density='mog', n_hiddens=(10, 10),
             impute_missing=True, seed=None, n_filters=(), filter_sizes=3,
             pool_sizes=2, n_rnn=0, **density_opts):
    """Initialize a mixture density network with custom layers

    Parameters
    ----------
    n_inputs : int
        Total input dimensionality (data/summary stats)
    n_outputs : int
        Dimensionality of output (simulator parameters)
    input_shape : tuple
        Size to which data are reshaped before CNN or RNN
    n_bypass : int
        Number of elements at end of input which bypass CNN or RNN
    density : string
        Type of density condition on the network, can be 'mog' or 'maf'
    n_hiddens : list of ints
        Number of hidden units per fully connected layer
    impute_missing : bool
        If set to True, learns replacement value for NaNs, otherwise those
        inputs are set to zero
    seed : int or None
        If provided, random number generator will be seeded
    n_filters : list of ints
        Number of filters per convolutional layer
    filter_sizes : int or list of ints
        Filter size per convolutional layer; a single int is used for
        every layer
    pool_sizes : int or list of ints
        Pooling size per convolutional layer; a single int is used for
        every layer, and a size of 1 disables pooling for that layer
    n_rnn : None or int
        Number of RNN units; None is treated as 0 (no RNN)
    density_opts : dict
        Options for the density estimator, forwarded to ``init_mdn`` or
        ``init_maf`` (e.g. the number of mixture components)

    Raises
    ------
    NotImplementedError
        If both an RNN and a CNN are requested, or ``density`` is neither
        'mog' nor 'maf'.
    """
    # The documented contract allows n_rnn=None, but every check below is
    # a numeric comparison (TypeError on Python 3); normalise it once.
    if n_rnn is None:
        n_rnn = 0

    if n_rnn > 0 and len(n_filters) > 0:
        raise NotImplementedError
    assert isint(n_inputs) and isint(n_outputs)\
        and n_inputs > 0 and n_outputs > 0

    self.density = density.lower()
    self.impute_missing = impute_missing
    self.n_hiddens = list(n_hiddens)
    self.n_outputs, self.n_inputs = n_outputs, n_inputs
    self.n_bypass = n_bypass

    self.n_rnn = n_rnn

    self.n_filters, self.filter_sizes, self.pool_sizes, n_cnn = \
        list(n_filters), filter_sizes, pool_sizes, len(n_filters)
    # Broadcast scalar filter / pool sizes to one entry per conv layer.
    if isinstance(self.filter_sizes, int):
        self.filter_sizes = [self.filter_sizes for _ in range(n_cnn)]
    else:
        assert len(self.filter_sizes) >= n_cnn
    if isinstance(self.pool_sizes, int):
        self.pool_sizes = [self.pool_sizes for _ in range(n_cnn)]
    else:
        assert len(self.pool_sizes) >= n_cnn

    self.iws = tt.vector('iws', dtype=dtype)

    self.seed = seed
    if seed is not None:
        self.rng = np.random.RandomState(seed=seed)
    else:
        self.rng = np.random.RandomState()
    lasagne.random.set_rng(self.rng)

    self.input_shape = (n_inputs,) if input_shape is None else input_shape
    assert np.prod(self.input_shape) + self.n_bypass == self.n_inputs
    assert 1 <= len(self.input_shape) <= 3

    # params: output placeholder (batch, self.n_outputs)
    self.params = tensorN(2, name='params', dtype=dtype)

    # stats : input placeholder, (batch, self.n_inputs)
    self.stats = tensorN(2, name='stats', dtype=dtype)

    # compose layers
    self.layer = collections.OrderedDict()

    # input layer, None indicates batch size not fixed at compile time
    self.layer['input'] = ll.InputLayer(
        (None, self.n_inputs), input_var=self.stats)

    # learn replacement values
    if self.impute_missing:
        self.layer['missing'] = \
            dl.ImputeMissingLayer(last(self.layer),
                                  n_inputs=(self.n_inputs,))
    else:
        self.layer['missing'] = \
            dl.ReplaceMissingLayer(last(self.layer),
                                   n_inputs=(self.n_inputs,))

    uses_bypass = self.n_bypass > 0 and (self.n_rnn > 0 or n_cnn > 0)
    if uses_bypass:
        # Split the input into the part that goes through the CNN/RNN
        # ('direct') and the trailing part that skips it ('bypass').
        # 'direct' is added last so it becomes last(self.layer).
        last_layer = last(self.layer)
        bypass_slice = slice(self.n_inputs - self.n_bypass, self.n_inputs)
        direct_slice = slice(0, self.n_inputs - self.n_bypass)
        self.layer['bypass'] = ll.SliceLayer(last_layer, bypass_slice)
        self.layer['direct'] = ll.SliceLayer(last_layer, direct_slice)

    # reshape inputs prior to RNN or CNN step
    if self.n_rnn > 0 or n_cnn > 0:
        if len(n_filters) > 0 and len(self.input_shape) == 2:
            # 1 channel
            rs = (-1, 1, *self.input_shape)
        else:
            if self.n_rnn > 0:
                assert len(self.input_shape) == 2  # time, dim
            else:
                assert len(self.input_shape) == 3  # channel, row, col
            rs = (-1, *self.input_shape)

        # last layer is 'missing' or 'direct'
        self.layer['reshape'] = ll.ReshapeLayer(last(self.layer), rs)

    # recurrent neural net, input: (batch, sequence_length, num_inputs)
    if self.n_rnn > 0:
        self.layer['rnn'] = ll.GRULayer(last(self.layer), n_rnn,
                                        only_return_final=True)

    # convolutional net, input: (batch, channels, rows, columns)
    if n_cnn > 0:
        for idx in range(n_cnn):  # add layers
            # Without pooling, pad so the convolution preserves the size.
            if self.pool_sizes[idx] == 1:
                padding = (self.filter_sizes[idx] - 1) // 2
            else:
                padding = 0
            self.layer['conv_' + str(idx + 1)] = ll.Conv2DLayer(
                name='c' + str(idx + 1),
                incoming=last(self.layer),
                num_filters=self.n_filters[idx],
                filter_size=self.filter_sizes[idx],
                stride=(1, 1),
                pad=padding,
                untie_biases=False,
                W=lasagne.init.GlorotUniform(),
                b=lasagne.init.Constant(0.),
                nonlinearity=lnl.rectify,
                flip_filters=True,
                convolution=tt.nnet.conv2d)

            if self.pool_sizes[idx] > 1:
                self.layer['pool_' + str(idx + 1)] = ll.MaxPool2DLayer(
                    name='p' + str(idx + 1),
                    incoming=last(self.layer),
                    pool_size=self.pool_sizes[idx],
                    stride=None,
                    ignore_border=True)

        # flatten
        self.layer['flatten'] = ll.FlattenLayer(
            incoming=last(self.layer),
            outdim=2)

    # incorporate bypass inputs
    if uses_bypass:
        self.layer['bypass_merge'] = lasagne.layers.ConcatLayer(
            [self.layer['bypass'], last(self.layer)], axis=1)

    if self.density == 'mog':
        self.init_mdn(**density_opts)
    elif self.density == 'maf':
        self.init_maf(**density_opts)
    else:
        raise NotImplementedError

    self.compile_funs()  # theano functions
def build_RNN(self, n_hidden_list=(100, ), bidirectional=False,
              addDenseLayers=False, seed=int(time.time()), debug=False,
              logger=logger_RNNtools):
    """Build a stack of (optionally bidirectional) LSTMs with a softmax head.

    The layers are stored in a dict on ``self.network``; the output layers
    are also exposed as ``self.network_lout`` (flattened),
    ``self.network_lout_batch``, ``self.network_lout_valid`` and
    ``self.network_lout_valid_flattened``.

    Some inspiration from
    http://colinraffel.com/talks/hammer2015recurrent.pdf

    Parameters
    ----------
    n_hidden_list : sequence of int
        Number of LSTM units for each stacked level.
    bidirectional : bool
        If true, each level adds a backward LSTM and an element-wise sum
        of the forward and backward outputs.
    addDenseLayers : bool
        If true, two rectified dense layers (256 and 64 units, with 0.3
        dropout in between) precede the softmax layer.
    seed : int
        Not used by this method.
    debug : bool
        If true, evaluate intermediate outputs on ``self.X`` /
        ``self.masks`` and log their shapes.
    logger
        Not used by this method; messages go to ``logger_RNNtools``.
    """
    # Fix these at initialization because it allows compiler optimizations.
    num_output_units = self.num_output_units
    num_features = self.num_features
    batch_size = self.batch_size

    audio_inputs = self.audio_inputs_var
    # Must be a float matrix, not an int matrix: otherwise all mask
    # calculations are done on the CPU and everything is ~2x slower
    # (see also general_tools.generate_masks()).
    audio_masks = self.audio_masks_var
    valid_indices = self.audio_valid_indices_var

    net = {}

    # shape = (batch_size, batch_max_seq_length, num_features)
    # Leaving every dimension as None would also work but is slower, hence
    # batch_size and num_features are fixed.
    net['l1_in'] = L.InputLayer(shape=(batch_size, None, num_features),
                                input_var=audio_inputs)

    # This input will be used to provide the network with masks.
    # Masks are matrices of shape (batch_size, n_time_steps).
    net['l1_mask'] = L.InputLayer(shape=(batch_size, None),
                                  input_var=audio_masks)

    if debug:
        get_l_in = L.get_output(net['l1_in'])
        l_in_val = get_l_in.eval({net['l1_in'].input_var: self.X})
        logger_RNNtools.debug(' l_in size: %s', l_in_val.shape)

        get_l_mask = L.get_output(net['l1_mask'])
        l_mask_val = get_l_mask.eval(
            {net['l1_mask'].input_var: self.masks})
        logger_RNNtools.debug(' l_mask size: %s', l_mask_val.shape)

        n_batch, n_time_steps, n_features = net['l1_in'].input_var.shape
        logger_RNNtools.debug(
            " n_batch: %s | n_time_steps: %s | n_features: %s",
            n_batch, n_time_steps, n_features)

    ## LSTM parameters
    # All gates have initializers for the input-to-gate and hidden
    # state-to-gate weight matrices, the cell-to-gate weight vector, the
    # bias vector, and the nonlinearity.  By convention gates use the
    # standard sigmoid nonlinearity, which is the default for Gate.
    gate_parameters = L.recurrent.Gate(W_in=lasagne.init.Orthogonal(),
                                       W_hid=lasagne.init.Orthogonal(),
                                       b=lasagne.init.Constant(0.))
    cell_parameters = L.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection is used.
        W_cell=None,
        b=lasagne.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=lasagne.nonlinearities.tanh)

    # Generate layers of stacked LSTMs, possibly bidirectional.
    net['l2_lstm'] = []
    for i, n_hidden in enumerate(n_hidden_list):
        if i == 0:
            layer_input = net['l1_in']
        else:
            # Fix: feed the most recent layer.  The previous index
            # ``[i - 1]`` is only correct when each level appends exactly
            # one layer; in bidirectional mode three are appended per
            # level, so deeper levels were wired to an earlier forward
            # LSTM instead of the merged output.
            layer_input = net['l2_lstm'][-1]

        nextForwardLSTMLayer = L.recurrent.LSTMLayer(
            layer_input, n_hidden,
            # We need to specify a separate input for masks
            mask_input=net['l1_mask'],
            # Here, we supply the gate parameters for each gate
            ingate=gate_parameters, forgetgate=gate_parameters,
            cell=cell_parameters, outgate=gate_parameters,
            # We'll learn the initialization and use gradient clipping
            learn_init=True, grad_clipping=100.)
        net['l2_lstm'].append(nextForwardLSTMLayer)

        if bidirectional:
            # NOTE(review): the backward LSTM reads the forward LSTM's
            # output rather than the same input as the forward layer.
            # Left unchanged because rewiring it would change the weight
            # shapes of existing models -- confirm whether it is intended.
            backward_input = net['l2_lstm'][-1]
            nextBackwardLSTMLayer = L.recurrent.LSTMLayer(
                backward_input, n_hidden,
                ingate=gate_parameters,
                mask_input=net['l1_mask'],
                forgetgate=gate_parameters,
                cell=cell_parameters, outgate=gate_parameters,
                learn_init=True, grad_clipping=100.,
                backwards=True)
            net['l2_lstm'].append(nextBackwardLSTMLayer)

            # Combine the forward and backward layer output by summing.
            # The result has shape (n_batch, max_n_time_steps, n_hidden).
            net['l2_lstm'].append(
                L.ElemwiseSumLayer(
                    [net['l2_lstm'][-2], net['l2_lstm'][-1]]))

    # Convert (batch_size, seq_length, num_features) to
    # (batch_size * seq_length, num_features) because dense layers cannot
    # deal with two unknown sizes.
    net['l3_reshape'] = L.ReshapeLayer(net['l2_lstm'][-1],
                                       (-1, n_hidden_list[-1]))

    if addDenseLayers:
        net['l4_dense'] = L.DenseLayer(
            net['l3_reshape'],
            nonlinearity=lasagne.nonlinearities.rectify,
            num_units=256)
        dropoutLayer = L.DropoutLayer(net['l4_dense'], p=0.3)
        net['l5_dense'] = L.DenseLayer(
            dropoutLayer,
            nonlinearity=lasagne.nonlinearities.rectify,
            num_units=64)
        # Now we can apply feed-forward layers as usual for classification
        net['l6_dense'] = L.DenseLayer(
            net['l5_dense'],
            num_units=num_output_units,
            nonlinearity=lasagne.nonlinearities.softmax)
    else:
        # Now we can apply feed-forward layers as usual for classification
        net['l6_dense'] = L.DenseLayer(
            net['l3_reshape'],
            num_units=num_output_units,
            nonlinearity=lasagne.nonlinearities.softmax)

    # The shape is now (n_batch * n_timesteps, num_output_units); reshape
    # back to get num_output_units values per timestep per sequence.
    net['l7_out_flattened'] = L.ReshapeLayer(net['l6_dense'],
                                             (-1, num_output_units))
    net['l7_out'] = L.ReshapeLayer(net['l6_dense'],
                                   (batch_size, -1, num_output_units))

    # Keep only the time steps listed in valid_indices.
    net['l7_out_valid_basic'] = L.SliceLayer(net['l7_out'],
                                             indices=valid_indices,
                                             axis=1)
    net['l7_out_valid'] = L.ReshapeLayer(
        net['l7_out_valid_basic'], (batch_size, -1, num_output_units))
    net['l7_out_valid_flattened'] = L.ReshapeLayer(
        net['l7_out_valid_basic'], (-1, num_output_units))

    if debug:
        get_l_out = theano.function(
            [net['l1_in'].input_var, net['l1_mask'].input_var],
            L.get_output(net['l7_out']))
        l_out = get_l_out(self.X, self.masks)

        # this only works for batch_size == 1
        get_l_out_valid = theano.function(
            [audio_inputs, audio_masks, valid_indices],
            L.get_output(net['l7_out_valid']))
        try:
            l_out_valid = get_l_out_valid(self.X, self.masks,
                                          self.valid_frames)
            logger_RNNtools.debug('\n\n\n l_out: %s | l_out_valid: %s',
                                  l_out.shape, l_out_valid.shape)
        except Exception:
            # Best-effort debug output; unlike a bare ``except`` this no
            # longer swallows KeyboardInterrupt / SystemExit.
            logger_RNNtools.warning(
                "batchsize not 1, get_valid not working")

    if debug:
        self.print_network_structure(net)

    self.network_lout = net['l7_out_flattened']
    self.network_lout_batch = net['l7_out']
    self.network_lout_valid = net['l7_out_valid']
    self.network_lout_valid_flattened = net['l7_out_valid_flattened']

    self.network = net
def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                  qmask_var, W_init):
    """Build an attention reader that sums attention mass per vocabulary id.

    Document and query share an embedding and are each encoded by a
    bidirectional GRU.  Every document position is scored against the
    query summary; positions outside ``docmask_var`` get a fixed score of
    -20 before the softmax.  The attention weights are then accumulated
    into a ``(batch, vocab_size)`` matrix indexed by the token id at each
    document position.

    Returns
    -------
    tuple
        ``(final, l_doc, l_q)``: the accumulated probabilities, the
        document encoder layer and the query summary layer.
    """
    # Options shared by all four GRU layers.
    gru_opts = dict(grad_clipping=GRAD_CLIP,
                    gradient_steps=GRAD_STEPS,
                    precompute_input=True)

    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)

    l_docembed = L.EmbeddingLayer(l_docin,
                                  input_size=vocab_size,
                                  output_size=EMBED_DIM,
                                  W=W_init)
    l_qembed = L.EmbeddingLayer(l_qin,
                                input_size=vocab_size,
                                output_size=EMBED_DIM,
                                W=l_docembed.W)

    doc_fwd = L.GRULayer(l_docembed, NUM_HIDDEN, mask_input=l_docmask,
                         **gru_opts)
    doc_bkd = L.GRULayer(l_docembed, NUM_HIDDEN, mask_input=l_docmask,
                         backwards=True, **gru_opts)
    l_doc = L.concat([doc_fwd, doc_bkd], axis=2)

    q_fwd = L.GRULayer(l_qembed, NUM_HIDDEN, mask_input=l_qmask,
                       **gru_opts)
    q_bkd = L.GRULayer(l_qembed, NUM_HIDDEN, mask_input=l_qmask,
                       backwards=True, **gru_opts)
    # Query summary: last forward step joined with first backward step.
    l_q = L.ConcatLayer([L.SliceLayer(q_fwd, -1, 1),
                         L.SliceLayer(q_bkd, 0, 1)])

    d = L.get_output(l_doc)  # B x N x D
    q = L.get_output(l_q)  # B x D
    p = T.batched_dot(d, q)  # B x N
    n_rows, n_cols = p.shape[0], p.shape[1]

    # Start from a matrix of -20 and copy in the real scores at the
    # unmasked positions, then normalise each row.
    valid = docmask_var.nonzero()
    masked_scores = T.set_subtensor(
        T.alloc(-20., n_rows, n_cols)[valid], p[valid])
    pm = T.nnet.softmax(masked_scores)

    # Row index of every (row, position) cell, same shape as p.
    index = T.reshape(T.repeat(T.arange(n_rows), n_cols), p.shape)
    token_ids = T.flatten(doc_var, outdim=2)
    final = T.inc_subtensor(
        T.alloc(0., n_rows, vocab_size)[index, token_ids], pm)

    #qv = T.flatten(query_var,outdim=2)
    #index2 = T.reshape(T.repeat(T.arange(qv.shape[0]),qv.shape[1]),qv.shape)
    #xx = index2[qmask_var.nonzero()]
    #yy = qv[qmask_var.nonzero()]
    #pV = T.set_subtensor(final[xx,yy], T.zeros_like(qv[xx,yy]))

    return final, l_doc, l_q
def __init__(self, full_length, output_size, meta_size, depth=2, encoder_size=64, decoder_size=64):
    """Build the recurrent encoder / dense decoder and compile its functions.

    The input and (dropout-regularised) meta sequences are concatenated on
    the last axis, passed through two recurrent layers, flattened over
    time, projected to a 16-unit batch-normalised code, and decoded by
    ``depth`` dense + batch-norm blocks into a mean and a log-variance
    head.

    Three Theano functions are compiled and stored on the instance:

    * ``_train_fn(input, meta, target, cut_weights)`` -- one Adadelta step,
      returns the mean training loss (dropout / batch-norm in training mode).
    * ``_loss_fn(input, meta, target, cut_weights)`` -- mean loss evaluated
      deterministically (no dropout, batch-norm in inference mode).
    * ``_predict_fn(input, meta)`` -- deterministic ``[mu, log_var]``.

    Args:
        full_length: Width of each of the mean and log-variance halves of
            the final dense layer; also both trailing dimensions of the
            reshaped outputs.
        output_size: Size of the last axis of the input tensor.
        meta_size: Size of the last axis of the meta tensor.
        depth: Number of dense + batch-norm decoder blocks.
        encoder_size: The recurrent layers use ``encoder_size // 2`` units.
        decoder_size: Number of units in each decoder dense layer.
    """
    latent_size = 16
    # Floor division keeps the unit count an integer under both Python 2
    # and Python 3 division semantics.
    recurrent_units = encoder_size // 2

    input_var = TT.tensor3(dtype='float32')
    meta_var = TT.tensor3(dtype='float32')
    target_var = TT.matrix()
    cut_weights = TT.vector(dtype='float32')

    input_layer = layers.InputLayer((None, None, output_size), input_var=input_var)
    meta_layer = layers.InputLayer((None, None, meta_size), input_var=meta_var)
    meta_layer = layers.DropoutLayer(meta_layer, p=0.2)
    concat_input_layer = layers.ConcatLayer([input_layer, meta_layer], axis=-1)

    # Encoder: two stacked recurrent layers, then every timestep becomes
    # its own row for the dense layers that follow.
    recurrent = layers.RecurrentLayer(concat_input_layer, recurrent_units, learn_init=True)
    recurrent = layers.RecurrentLayer(recurrent, recurrent_units, learn_init=True)
    recurrent = layers.ReshapeLayer(recurrent, (-1, recurrent_units))
    encoded = layers.DenseLayer(recurrent, latent_size)
    encoded = layers.batch_norm(encoded)

    # Decoder.
    dense = encoded
    for _ in range(depth):
        dense = layers.DenseLayer(dense, decoder_size)
        dense = layers.batch_norm(dense)

    # One linear layer emits mean and log-variance side by side; each half
    # is sliced off and reshaped.
    # NOTE(review): the (-1, full_length, full_length) reshape presumably
    # relies on the sequence length being equal to full_length -- confirm
    # against callers.
    mu_and_logvar_x_layer = layers.DenseLayer(
        dense, full_length * 2, nonlinearity=nonlinearities.linear)
    mu_x_layer = layers.SliceLayer(
        mu_and_logvar_x_layer, slice(0, full_length), axis=1)
    mu_x_layer = layers.ReshapeLayer(mu_x_layer, (-1, full_length, full_length))
    logvar_x_layer = layers.SliceLayer(
        mu_and_logvar_x_layer, slice(full_length, None), axis=1)
    logvar_x_layer = layers.ReshapeLayer(logvar_x_layer, (-1, full_length, full_length))

    l2_norm = regularization.regularize_network_params(
        mu_and_logvar_x_layer, regularization.l2)

    # Training graph: stochastic pass (dropout on, batch statistics).
    loss = neg_log_likelihood(
        target_var,
        layers.get_output(mu_x_layer, deterministic=False),
        layers.get_output(logvar_x_layer, deterministic=False),
        cut_weights
    ) + 1e-4 * l2_norm

    # Evaluation graph: deterministic pass, shared by the test loss and the
    # prediction function so that both describe the same network state.
    mu_eval = layers.get_output(mu_x_layer, deterministic=True)
    logvar_eval = layers.get_output(logvar_x_layer, deterministic=True)
    test_loss = neg_log_likelihood(
        target_var, mu_eval, logvar_eval, cut_weights
    ) + 1e-4 * l2_norm

    params = layers.get_all_params(mu_and_logvar_x_layer, trainable=True)
    mean_loss = loss.mean()
    param_updates = updates.adadelta(mean_loss, params)

    self._train_fn = theano.function(
        [input_var, meta_var, target_var, cut_weights],
        updates=param_updates,
        outputs=mean_loss
    )
    self._loss_fn = theano.function(
        [input_var, meta_var, target_var, cut_weights],
        outputs=test_loss.mean()
    )
    self._predict_fn = theano.function(
        [input_var, meta_var],
        outputs=[mu_eval, logvar_eval]
    )