def create_attention(self, gru_con, in_con_mask, condition, batch_size,
                     n_hidden_con, **kwargs):
    """Build an additive attention read-out of ``gru_con`` given ``condition``.

    Both inputs are projected to ``self.n_attention`` units, summed, passed
    through ``tanh`` and scored with a one-unit projection; the scores are
    normalised by ``SequenceSoftmax`` and used to weight ``gru_con`` before
    summing over axis 1.

    Note: the mask is always taken from ``self.in_con_mask``. The
    ``in_con_mask``, ``batch_size``, ``n_hidden_con`` and ``kwargs``
    arguments are accepted but not read here.

    Returns an ``ExpressionLayer`` whose declared output shape is the input
    shape with axis 1 removed.
    """
    seq_mask = self.in_con_mask
    n_att = self.n_attention

    # Projection of the sequence; the original author annotated this step
    # with "(batch_size, n_attention)".
    context_proj = non_flattening_dense_layer(
        gru_con, seq_mask, n_att, nonlinearity=None)
    # Projection of the condition, with a broadcastable axis inserted at 1.
    query_proj = dimshuffle(
        DenseLayer(condition, n_att, nonlinearity=None), (0, 'x', 1))

    hidden = NonlinearityLayer(
        ElemwiseSumLayer([context_proj, query_proj]), T.tanh)
    scores = SliceLayer(
        non_flattening_dense_layer(hidden, seq_mask, 1, nonlinearity=None),
        indices=0, axis=2)
    weights = SequenceSoftmax(scores, seq_mask)

    weights_column = ForgetSizeLayer(dimshuffle(weights, (0, 1, 'x')))
    weighted = ElemwiseMergeLayer([weights_column, gru_con], T.mul)

    def sum_over_axis1(x):
        return T.sum(x, axis=1)

    def shape_without_axis1(shape):
        return (shape[0], ) + shape[2:]

    return ExpressionLayer(weighted, sum_over_axis1, shape_without_axis1)
def get_embedding_layer(self, l_in, extra_vars):
    """Build English and Chinese embedding branches joined by a switch layer.

    ``extra_vars[0]`` is used as the input variable of the language input
    layer; the remaining entries are returned untouched. Each branch embeds
    ``l_in`` and applies a linear ``NINLayer`` of
    ``options.listener_cell_size`` units between two ``(0, 2, 1)``
    dimshuffles. Embedding weights come from the configured embedding file
    when one is set, otherwise from a ``Normal()`` initializer of width
    ``options.bilingual_embed_size``.

    Returns ``(switch_layer, extra_vars[1:])``.
    """
    lang_var = extra_vars[0]
    remaining_vars = extra_vars[1:]
    prefix = (self.id + '/') if self.id else ''

    lang_layer = InputLayer(shape=(None, ), input_var=lang_var,
                            name=prefix + 'lang_input')
    opts = self.options

    def resolve_weights(embed_file):
        # Pretrained matrix (width taken from its second axis) or a random
        # initializer with the configured width.
        if embed_file:
            matrix = load_embeddings(embed_file, self.seq_vec)
            return matrix, matrix.shape[1]
        return Normal(), opts.bilingual_embed_size

    def build_branch(lang, weights, width):
        embed_name = prefix + 'desc_embed_' + lang
        embedded = EmbeddingLayer(l_in,
                                  input_size=len(self.seq_vec.tokens),
                                  output_size=width,
                                  W=weights,
                                  name=embed_name)
        projected = NINLayer(dimshuffle(embedded, (0, 2, 1)),
                             num_units=opts.listener_cell_size,
                             nonlinearity=None,
                             name=embed_name + '_transformed')
        return dimshuffle(projected, (0, 2, 1))

    # Resolve both weight sources before creating any embedding layer, in
    # the same order as before (en, then zh).
    en_weights, en_width = resolve_weights(opts.bilingual_en_embed_file)
    zh_weights, zh_width = resolve_weights(opts.bilingual_zh_embed_file)

    en_branch = build_branch('en', en_weights, en_width)
    zh_branch = build_branch('zh', zh_weights, zh_width)

    switched = SwitchLayer(lang_layer, [en_branch, zh_branch],
                           name=prefix + 'desc_embed_switch')
    return (switched, remaining_vars)
def broadcast_dot_layer(l_pred, l_targets, feature_dim, id_tag):
    """Multiply ``l_pred`` against ``l_targets`` and sum-pool over axis 1.

    ``l_pred`` gets a trailing broadcastable axis, is multiplied elementwise
    with ``l_targets``, and the product is pooled with ``T.sum`` over axis 1
    using a pool size of ``feature_dim``. The result is reshaped to keep
    only input axes 0 and 2. Layer names are prefixed with ``id_tag``.
    """
    def tagged(suffix):
        return id_tag + suffix

    pred_column = ForgetSizeLayer(
        dimshuffle(l_pred, (0, 1, 'x'), name=tagged('dot_broadcast')),
        axis=2,
        name=tagged('dot_nosize'))
    product = ElemwiseMergeLayer((pred_column, l_targets), T.mul,
                                 name=tagged('dot_elemwise_mul'))
    pooled = FeaturePoolLayer(product,
                              pool_size=feature_dim,
                              axis=1,
                              pool_function=T.sum,
                              name=tagged('dot_pool'))
    return reshape(pooled, ([0], [2]), name=tagged('broadcast_dot'))
def broadcast_sub_layer(l_pred, l_targets, feature_dim, id_tag):
    """Subtract ``l_targets`` from ``l_pred`` broadcast along a new last axis.

    ``l_pred`` gets a trailing broadcastable axis and is combined with
    ``l_targets`` using ``T.sub`` (prediction first). ``feature_dim`` is
    accepted for signature parity with ``broadcast_dot_layer`` but is not
    read here. Layer names are prefixed with ``id_tag``.
    """
    pred_column = dimshuffle(l_pred, (0, 1, 'x'),
                             name=id_tag + 'sub_broadcast')
    pred_unsized = ForgetSizeLayer(pred_column, axis=2,
                                   name=id_tag + 'sub_nosize')
    operands = (pred_unsized, l_targets)
    return ElemwiseMergeLayer(operands, T.sub, name=id_tag + 'broadcast_sub')
def apply_mask(layer_seq, layer_seq_mask):
    """Multiply a sequence layer elementwise by its mask.

    layer_seq: layer of shape (batch_size, length_seq, n_features)
    layer_seq_mask: layer of shape (batch_size, length_seq)

    The mask receives a trailing broadcastable axis so that it lines up
    with the feature axis of ``layer_seq``.
    """
    mask_column = dimshuffle(layer_seq_mask, (0, 1, 'x'))
    mask_unsized = ForgetSizeLayer(mask_column)
    return ElemwiseMergeLayer([mask_unsized, layer_seq], T.mul)
def layer_context(layer_ctx, ctx_nblayers, ctx_nbfilters, ctx_winlen,
                  hiddensize, nonlinearity, bn_axes=None, bn_cnn_axes=None,
                  critic=False, useLRN=True):
    """Stack 2D convolutions and two dense layers on top of ``layer_ctx``.

    The input gets a broadcastable axis at position 1, goes through
    ``ctx_nblayers`` ``Conv2DLayer``s with ``ctx_nbfilters`` filters of size
    ``[ctx_winlen, 1]``, is shuffled back and flattened to 3 dimensions, and
    is finally passed through two ``DenseLayer``s of ``hiddensize`` units.

    Normalisation:
      * batch norm after each convolution when ``critic`` is false and
        ``bn_cnn_axes`` is given;
      * local response normalisation after each convolution when both
        ``critic`` and ``useLRN`` are true;
      * batch norm after each dense layer when ``critic`` is false and
        ``bn_axes`` is given.

    Returns the last layer of the stack.
    """
    cnn_batch_norm = (not critic) and (bn_cnn_axes is not None)
    fc_batch_norm = (not critic) and (bn_axes is not None)
    local_response_norm = critic and useLRN

    net = ll.dimshuffle(layer_ctx, [0, 'x', 1, 2],
                        name='ctx.dimshuffle_to_2DCNN')

    for conv_idx in xrange(ctx_nblayers):
        conv_name = 'ctx.l{}_CNN{}x{}x{}'.format(
            conv_idx + 1, ctx_nbfilters, ctx_winlen, 1)
        net = ll.Conv2DLayer(net,
                             num_filters=ctx_nbfilters,
                             filter_size=[ctx_winlen, 1],
                             stride=1,
                             pad='same',
                             nonlinearity=nonlinearity,
                             name=conv_name)
        if cnn_batch_norm:
            net = ll.batch_norm(net, axes=bn_cnn_axes)
        if local_response_norm:
            net = ll.LocalResponseNormalization2DLayer(net)

    net = ll.dimshuffle(net, [0, 2, 3, 1], name='ctx.dimshuffle_back')
    net = ll.flatten(net, outdim=3, name='ctx.flatten')

    for fc_idx in xrange(2):
        # Dense layers continue the numbering started by the conv layers.
        fc_name = 'ctx.l{}_FC{}'.format(
            ctx_nblayers + fc_idx + 1, hiddensize)
        net = ll.DenseLayer(net,
                            hiddensize,
                            nonlinearity=nonlinearity,
                            num_leading_axes=2,
                            name=fc_name)
        if fc_batch_norm:
            net = ll.batch_norm(net, axes=bn_axes)

    return net
def build_cnn():
    """Build a small character-CNN test network and return its output layer.

    The network embeds an int64 input of shape (batch, 10, 100) into 101
    dimensions, runs two parallel ``Conv1DLayer``s (50 filters, widths 2 and
    3) over the 100-long axis, max-pools each over its full output length,
    concatenates the two 50-unit results and reshapes to (-1, 10, 100).

    Intermediate test-value shapes are printed as a debugging aid; the
    single-argument ``print(...)`` form prints the same text on Python 2
    and Python 3.

    Returns:
        The final reshape layer.
    """
    # (batch, 10, 100) index tensor; the embedding adds a trailing axis of
    # size 101 which is then folded into (-1, 100, 101) below.
    data_size = (None, 10, 100)
    input_var = T.tensor3(name="input", dtype='int64')
    # NOTE: randint's upper bound is exclusive, so this test value is all
    # zeros; the call is kept so the global RNG state advances as before.
    values = np.array(np.random.randint(0, 1, (5, 10, 100)))
    input_var.tag.test_value = values

    input_layer = L.InputLayer(data_size, input_var=input_var)
    W = create_char_embedding_matrix()
    embed_layer = L.EmbeddingLayer(input_layer, input_size=102,
                                   output_size=101, W=W)
    flat_embed = L.reshape(embed_layer, (-1, 100, 101))
    # Conv1DLayer convolves over the last axis, so move the 100-long axis
    # there and make the 101 embedding dimensions the channels.
    dim_shuffle = L.dimshuffle(flat_embed, (0, 2, 1))
    print(L.get_output(dim_shuffle).tag.test_value.shape)

    conv_layer_1 = L.Conv1DLayer(dim_shuffle, 50, 2, 1)
    print(L.get_output(conv_layer_1).tag.test_value.shape)
    print("TEST")
    # Pool size 99 == 100 - 2 + 1, i.e. the whole conv output.
    pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=99)
    print(L.get_output(pool_layer_1).tag.test_value.shape)
    reshape_conv_1 = L.reshape(pool_layer_1, (-1, 50))

    conv_layer_2 = L.Conv1DLayer(dim_shuffle, 50, 3, 1)
    # Pool size 98 == 100 - 3 + 1, i.e. the whole conv output.
    pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=98)
    reshape_conv_2 = L.reshape(pool_layer_2, (-1, 50))

    merge_layer = L.ConcatLayer([reshape_conv_1, reshape_conv_2], 1)
    print(L.get_output(merge_layer).tag.test_value.shape)
    reshape_output = L.reshape(merge_layer, (-1, 10, 100))
    print(L.get_output(reshape_output).tag.test_value.shape)

    return reshape_output
def create_attention(self, gru_con, in_con_mask, condition, batch_size,
                     n_hidden_con, **kwargs):
    """Return an attention-weighted sum of ``gru_con`` given ``condition``.

    Steps: project ``gru_con`` and ``condition`` to ``self.n_attention``
    units, add them, apply ``tanh``, score each position with a one-unit
    projection, normalise with ``SequenceSoftmax`` and use the weights to
    scale ``gru_con`` before summing over axis 1.

    Note: the mask used throughout is ``self.in_con_mask``; the
    ``in_con_mask``, ``batch_size``, ``n_hidden_con`` and ``kwargs``
    arguments are not read by this method.
    """
    mask_layer = self.in_con_mask
    attention_units = self.n_attention

    # The original author annotated this step "(batch_size, n_attention)".
    seq_projection = non_flattening_dense_layer(
        gru_con, mask_layer, attention_units, nonlinearity=None)
    cond_projection = DenseLayer(condition, attention_units,
                                 nonlinearity=None)
    # Insert a broadcastable axis at position 1 before adding to the
    # sequence projection.
    cond_projection = dimshuffle(cond_projection, (0, 'x', 1))

    activated = NonlinearityLayer(
        ElemwiseSumLayer([seq_projection, cond_projection]), T.tanh)
    raw_scores = non_flattening_dense_layer(
        activated, mask_layer, 1, nonlinearity=None)
    scores = SliceLayer(raw_scores, indices=0, axis=2)
    attention = SequenceSoftmax(scores, mask_layer)

    attention_column = ForgetSizeLayer(dimshuffle(attention, (0, 1, 'x')))
    scaled = ElemwiseMergeLayer([attention_column, gru_con], T.mul)

    def collapse_axis1(x):
        return T.sum(x, axis=1)

    def collapsed_shape(shape):
        return (shape[0],) + shape[2:]

    return ExpressionLayer(scaled, collapse_axis1, collapsed_shape)
def get_conv_input(self, sidx, tidx, avg=False):
    """Assemble the convolution input from feature embeddings and position.

    The embeddings of every feature in ``self.args.source_feats`` are summed
    (``self.args.lex == 'mix'``) or concatenated on axis 2, then joined with
    a centre-position indicator and, when ``self.args.pos_emb`` is set, with
    an 8-dimensional position embedding. The result is shuffled to
    ``(0, 2, 1)`` so features come before window positions.

    Args:
        sidx, tidx: index variables forwarded to each feature's
            ``get_emb_layer``; ``sidx.shape[0]`` is used as the number of
            rows of the position indicator.
        avg: when true, layer names get an ``'_avg'`` suffix and the
            position embedding is initialised with ``Constant()`` instead
            of ``Normal(0.01)``.

    Returns:
        The dimshuffled concatenation layer.

    Shape comments below are the original author's example values.
    """
    suf = '_avg' if avg else ''
    feat_embs = [
        self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg)
        for name in self.args.source_feats
    ]

    # TODO: change the meaning
    if self.args.lex == 'mix':
        concat_emb = L.ElemwiseSumLayer(feat_embs)  # (100, 15, 256)
    else:
        concat_emb = L.concat(feat_embs, axis=2)  # (100, 15, 256+100)

    # One-hot marker of the window centre. Floor division keeps the repeat
    # count an integer regardless of the division semantics in effect.
    # NOTE(review): the marker has window_size entries only when
    # window_size is odd - confirm callers never pass an even size.
    half_window = self.args.window_size // 2
    pos = np.array([0] * half_window + [1] + [0] * half_window).astype(
        theano.config.floatX)
    post = theano.shared(pos[np.newaxis, :, np.newaxis],
                         borrow=True)  # (1, 15, 1)
    posl = L.InputLayer(
        (None, self.args.window_size, 1),
        input_var=T.extra_ops.repeat(post, sidx.shape[0],
                                     axis=0))  # (100, 15, 1)

    parts = [concat_emb, posl]  # (100, 15, 256+1) once concatenated
    if self.args.pos_emb:
        posint = L.flatten(
            L.ExpressionLayer(posl,
                              lambda x: T.cast(x, 'int64')))  # (100, 15)
        pos_emb = L.EmbeddingLayer(
            posint,
            self.args.window_size,
            8,
            name='epos' + suf,
            W=Normal(0.01) if not avg else Constant())  # (100, 15, 8)
        # Exclude the position embedding from weight regularisation.
        pos_emb.params[pos_emb.W].remove('regularizable')
        parts.append(pos_emb)  # (100, 15, 256+1+8) once concatenated

    # Concatenate once, instead of building a two-input concat that is
    # thrown away whenever the position embedding is enabled.
    conv_in = L.concat(parts, axis=2)
    conv_in = L.dimshuffle(conv_in, (0, 2, 1))  # (100, 256+1, 15)
    return conv_in
def get_char2word(self, ic, avg=False):
    """Compose word representations from character embeddings.

    Characters from ``ic`` are embedded (``self.args.vc`` symbols,
    ``self.args.nc`` dimensions) and combined per word by the model chosen
    in ``self.args.char_model``:

    * ``'CNN'``: one ``Conv2DLayer`` per n-gram width in
      ``self.args.ngrams``, max-pooled over the character axis and
      concatenated on axis 2.
    * ``'LSTM'``: a forward and a backward ``LSTMLayer`` of
      ``self.args.nw // 2`` units each, masked where the character index is
      0, returning only the final state; the two are concatenated and
      reshaped to ``(-1, self.args.sw, self.args.nw)``.

    Args:
        ic: character index input layer.
        avg: when true, layer names get an ``'_avg'`` suffix, weights are
            initialised with ``Constant()`` and (LSTM only) ``set_zero`` is
            applied to both recurrent layers.

    Returns:
        The word-level layer, or ``None`` when ``char_model`` is neither
        ``'CNN'`` nor ``'LSTM'`` (unchanged fall-through behaviour).

    Shape comments below are the original author's example values.
    """
    suf = '_avg' if avg else ''
    ec = L.EmbeddingLayer(
        ic,
        self.args.vc,
        self.args.nc,
        name='ec' + suf,
        W=HeNormal() if not avg else Constant())  # (100, 24, 32, 16)
    # Exclude the character embedding from weight regularisation.
    ec.params[ec.W].remove('regularizable')

    if self.args.char_model == 'CNN':
        lds = L.dimshuffle(ec, (0, 3, 1, 2))  # (100, 16, 24, 32)
        ls = []
        for n in self.args.ngrams:
            lconv = L.Conv2DLayer(
                lds,
                self.args.nf, (1, n),
                untie_biases=True,
                W=HeNormal('relu') if not avg else Constant(),
                name='conv_%d' % n + suf)  # (100, 64/4, 24, 32-n+1)
            # Pool over every remaining character position.
            lpool = L.MaxPool2DLayer(
                lconv, (1, self.args.max_len - n + 1))  # (100, 64, 24, 1)
            lpool = L.flatten(lpool, outdim=3)  # (100, 16, 24)
            lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 24, 16)
            ls.append(lpool)
        xc = L.concat(ls, axis=2)  # (100, 24, 64)
        return xc
    elif self.args.char_model == 'LSTM':
        ml = L.ExpressionLayer(
            ic, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
        ml = L.reshape(ml, (-1, self.args.max_len))  # (2400, 32)
        gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                       W_hid=Orthogonal())
        cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                       W_hid=Orthogonal(),
                                       W_cell=None,
                                       nonlinearity=tanh)
        lstm_in = L.reshape(
            ec, (-1, self.args.max_len, self.args.nc))  # (2400, 32, 16)

        # Floor division: num_units must stay an integer even when true
        # division is in effect.
        half_width = self.args.nw // 2
        shared_kwargs = dict(
            mask_input=ml,
            grad_clipping=10.,
            learn_init=True,
            peepholes=False,
            precompute_input=True,
            ingate=gate_params,
            forgetgate=gate_params,
            cell=cell_params,
            outgate=gate_params,
            only_return_final=True)
        lstm_f = L.LSTMLayer(lstm_in, half_width,
                             name='forward' + suf,
                             **shared_kwargs)  # (2400, 64)
        lstm_b = L.LSTMLayer(lstm_in, half_width,
                             backwards=True,
                             name='backward' + suf,
                             **shared_kwargs)  # (2400, 64)
        remove_reg(lstm_f)
        remove_reg(lstm_b)
        if avg:
            set_zero(lstm_f)
            set_zero(lstm_b)
        xc = L.concat([lstm_f, lstm_b], axis=1)  # (2400, 128)
        xc = L.reshape(xc, (-1, self.args.sw, self.args.nw))  # (100, 24, 256)
        return xc
def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):
    """Build the embedding and encoding part of the network.

    Word embeddings are combined with convolutional character embeddings,
    passed through a shared dense + highway layer, extended with two
    "wiq" features, and encoded by forward/backward LSTMs whose parameters
    are shared between context and question. Both encodings are then
    projected to ``self.rec_size`` units and the question projection is
    summarised with a masked softmax weighting.

    Args:
        emb_char_filter_size: ``filter_size`` of the character
            ``Conv1DLayer``s.
        emb_dropout: when true, ``LL.dropout`` is applied to the context
            and question embeddings before the wiq features are appended
            (a message is printed).
        **kwargs: not read in this method.

    Side effects:
        Stores the symbolic ``batch_size`` and ``context_len`` on ``self``.

    Returns:
        ``(l_c_proj, l_z_hat)``: the projected context encoding (rows are
        ``batch_size * context_len``) and the weighted question encoding.
    """
    # Symbolic sizes read from the input variables; used for the reshapes
    # below.
    batch_size = self.mask_context_var.shape[0]
    context_len = self.mask_context_var.shape[1]
    question_len = self.question_var.shape[1]
    context_word_len = self.context_char_var.shape[2]
    question_word_len = self.question_char_var.shape[2]
    self.batch_size = batch_size
    self.context_len = context_len
    ''' Inputs and word embeddings'''
    l_context_char = LL.InputLayer(shape=(None, None, None),
                                   input_var=self.context_char_var)
    l_question_char = LL.InputLayer(shape=(None, None, None),
                                    input_var=self.question_char_var)
    l_c_mask = LL.InputLayer(shape=(None, None),
                             input_var=self.mask_context_var)
    l_q_mask = LL.InputLayer(shape=(None, None),
                             input_var=self.mask_question_var)
    l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_context_char_var)
    l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_question_char_var)
    l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.context_var)
    l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.question_var)
    if self.train_unk:
        l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_context_unk_var)
        l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_question_unk_var)
        l_c_emb = TrainUnkLayer(l_c_emb,
                                l_c_unk_mask,
                                output_size=self.emb_size,
                                W=self.word_embeddings[0])
        # The question layer reuses the context layer's W.
        l_q_emb = TrainUnkLayer(l_q_emb,
                                l_q_unk_mask,
                                output_size=self.emb_size,
                                W=l_c_emb.W)
    # NOTE(review): this check is read as independent of `train_unk`; the
    # nesting is not recoverable from the collapsed source - confirm.
    if self.negative:
        l_c_emb = TrainNAWLayer(l_c_emb,
                                l_c_mask,
                                output_size=self.emb_size)
    ''' Char-embeddings '''
    # (batch_size x context_len x context_word_len x emb_char_size)
    l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                     input_size=self.alphabet_size,
                                     output_size=self.emb_char_size)
    # Question characters share the context embedding matrix.
    l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                     input_size=self.alphabet_size,
                                     output_size=self.emb_char_size,
                                     W=l_c_char_emb.W)
    # here I do multiplication of character embeddings with masks,
    # because I want to pad them with constant zeros
    l_c_char_mask = ForgetSizeLayer(
        LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
    l_q_char_mask = ForgetSizeLayer(
        LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))
    l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask],
                                         T.mul)
    l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask],
                                         T.mul)
    # convolutions
    # Fold batch and word axes together, then put the embedding axis
    # before the character axis for Conv1DLayer.
    l_c_char_emb = LL.dimshuffle(
        LL.reshape(l_c_char_emb, (batch_size * context_len,
                                  context_word_len, self.emb_char_size)),
        (0, 2, 1))
    l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   pad=self.conv)
    # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)
    # Max over the character axis gives one vector per word.
    l_c_char_emb = LL.ExpressionLayer(l_c_char_conv,
                                      lambda X: X.max(2),
                                      output_shape='auto')
    l_c_char_emb = LL.reshape(
        l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))
    l_q_char_emb = LL.dimshuffle(
        LL.reshape(l_q_char_emb, (batch_size * question_len,
                                  question_word_len, self.emb_char_size)),
        (0, 2, 1))
    # The question convolution shares W and b with the context one.
    l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   W=l_c_char_conv.W,
                                   b=l_c_char_conv.b,
                                   pad=self.conv)
    # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)
    l_q_char_emb = LL.ExpressionLayer(l_q_char_conv,
                                      lambda X: X.max(2),
                                      output_shape='auto')
    l_q_char_emb = LL.reshape(
        l_q_char_emb, (batch_size, question_len, self.num_emb_char_filters))
    ''' Concatenating both embeddings '''
    l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
    l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)
    # originally I had dropout here
    ''' Highway layer allowing for interaction between embeddings '''
    l_c_P = LL.reshape(l_c_emb,
                       (batch_size * context_len,
                        self.emb_size + self.num_emb_char_filters))
    l_c_P = LL.DenseLayer(l_c_P,
                          num_units=self.rec_size,
                          b=None,
                          nonlinearity=None)
    l_c_high = HighwayLayer(l_c_P)
    l_c_emb = LL.reshape(l_c_high,
                         (batch_size, context_len, self.rec_size))
    l_q_P = LL.reshape(l_q_emb,
                       (batch_size * question_len,
                        self.emb_size + self.num_emb_char_filters))
    # Dense and highway parameters are shared with the context branch.
    l_q_P = LL.DenseLayer(l_q_P,
                          num_units=self.rec_size,
                          W=l_c_P.W,
                          b=None,
                          nonlinearity=None)
    l_q_high = HighwayLayer(l_q_P,
                            W1=l_c_high.W1,
                            b1=l_c_high.b1,
                            W2=l_c_high.W2,
                            b2=l_c_high.b2)
    l_q_emb = LL.reshape(l_q_high,
                         (batch_size, question_len, self.rec_size))
    ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''
    l_weighted_feat = WeightedFeatureLayer(
        [l_c_emb, l_q_emb, l_c_mask, l_q_mask])  # batch_size x context_len
    l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))
    # batch_size x context_len
    l_bin_feat = LL.InputLayer(shape=(None, None),
                               input_var=self.bin_feat_var)
    l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))
    ''' Dropout at the embeddings '''
    if emb_dropout:
        print('Using dropout after wiq calculation.')
        l_c_emb = LL.dropout(l_c_emb)
        l_q_emb = LL.dropout(l_q_emb)
    ''' Here we concatenate wiq features to embeddings'''
    # both features are concatenated to the embeddings
    # for the question we fix the features to 1
    l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
    # Pads two entries of value 1 after the existing features so the
    # question width matches the context width.
    l_q_emb = LL.pad(l_q_emb,
                     width=[(0, 2)],
                     val=L.utils.floatX(1),
                     batch_ndim=2)
    ''' Context and question encoding using the same BiLSTM for both '''
    # output shape is (batch_size x context_len x rec_size)
    l_c_enc_forw = LL.LSTMLayer(l_c_emb,
                                num_units=self.rec_size,
                                grad_clipping=100,
                                mask_input=l_c_mask)
    l_c_enc_back = LL.LSTMLayer(l_c_emb,
                                num_units=self.rec_size,
                                grad_clipping=100,
                                mask_input=l_c_mask,
                                backwards=True)
    # output shape is (batch_size x question_len x rec_size)
    # Every gate of the question LSTMs is built from the matching context
    # LSTM's parameters, so both sequences go through the same weights.
    l_q_enc_forw = LL.LSTMLayer(
        l_q_emb,
        num_units=self.rec_size,
        grad_clipping=100,
        mask_input=l_q_mask,
        ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                       W_hid=l_c_enc_forw.W_hid_to_ingate,
                       W_cell=l_c_enc_forw.W_cell_to_ingate,
                       b=l_c_enc_forw.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                           W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                           W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                           b=l_c_enc_forw.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                        W_hid=l_c_enc_forw.W_hid_to_outgate,
                        W_cell=l_c_enc_forw.W_cell_to_outgate,
                        b=l_c_enc_forw.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                     W_hid=l_c_enc_forw.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_forw.b_cell,
                     nonlinearity=L.nonlinearities.tanh))
    l_q_enc_back = LL.LSTMLayer(
        l_q_emb,
        num_units=self.rec_size,
        grad_clipping=100,
        mask_input=l_q_mask,
        backwards=True,
        ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                       W_hid=l_c_enc_back.W_hid_to_ingate,
                       W_cell=l_c_enc_back.W_cell_to_ingate,
                       b=l_c_enc_back.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                           W_hid=l_c_enc_back.W_hid_to_forgetgate,
                           W_cell=l_c_enc_back.W_cell_to_forgetgate,
                           b=l_c_enc_back.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                        W_hid=l_c_enc_back.W_hid_to_outgate,
                        W_cell=l_c_enc_back.W_cell_to_outgate,
                        b=l_c_enc_back.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                     W_hid=l_c_enc_back.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_back.b_cell,
                     nonlinearity=L.nonlinearities.tanh))
    # batch_size x context_len x 2*rec_size
    l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
    # batch_size x question_len x 2*rec_size
    l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

    def proj_init():
        # Two stacked identity matrices, shape (2 * rec_size, rec_size):
        # initially the projection adds the forward and backward halves.
        return np.vstack([
            np.eye(self.rec_size, dtype=theano.config.floatX),
            np.eye(self.rec_size, dtype=theano.config.floatX)
        ])

    # this is H from the paper, shape: (batch_size * context_len x
    # rec_size)
    l_c_proj = LL.reshape(l_c_enc,
                          (batch_size * context_len, 2 * self.rec_size))
    l_c_proj = LL.DenseLayer(l_c_proj,
                             num_units=self.rec_size,
                             W=proj_init(),
                             b=None,
                             nonlinearity=L.nonlinearities.tanh)
    # this is Z from the paper, shape: (batch_size * question_len x
    # rec_size)
    l_q_proj = LL.reshape(l_q_enc,
                          (batch_size * question_len, 2 * self.rec_size))
    # Unlike the layers above, this projection gets its own freshly
    # initialised W (a second proj_init() array), not the context one.
    l_q_proj = LL.DenseLayer(l_q_proj,
                             num_units=self.rec_size,
                             W=proj_init(),
                             b=None,
                             nonlinearity=L.nonlinearities.tanh)
    ''' Additional, weighted question encoding (alphas from paper) '''
    l_alpha = LL.DenseLayer(
        l_q_proj,  # batch_size * question_len x 1
        num_units=1,
        b=None,
        nonlinearity=None)
    # batch_size x question_len
    l_alpha = MaskedSoftmaxLayer(
        LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)
    # batch_size x rec_size
    l_z_hat = BatchedDotLayer([
        LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
        l_alpha
    ])
    return l_c_proj, l_z_hat
def additional_layer(self, idx_layer, emb_layer, avg=False):
    """Post-process an embedding layer according to the feature type.

    Behaviour depends on ``self.name``:

    * ``'char'``: compose characters into word vectors with a CNN
      (``self.args.char_model == 'cnn'``) or a bidirectional LSTM
      (``'lstm'``). Any other ``char_model`` falls through and returns
      ``None`` (unchanged behaviour).
    * ``'morph'``: reduce the morpheme axis by max pooling
      (``self.args.morph_model == 'max'``) or by a masked average
      (``'avg'``).
    * anything else: ``emb_layer`` is returned unchanged.

    Args:
        idx_layer: index input layer; entries equal to 0 are treated as
            padding when building masks.
        emb_layer: embedding layer built on top of ``idx_layer``.
        avg: when true, layer names get an ``'_avg'`` suffix, convolution
            weights use ``Constant()`` and ``set_zero`` is applied to the
            LSTM layers.

    Returns:
        The resulting layer (see above).

    Raises:
        ValueError: if ``self.name == 'morph'`` and ``morph_model`` is not
            ``'max'`` or ``'avg'``. Previously this surfaced as an
            UnboundLocalError on the return statement.

    Shape comments below are the original author's example values.
    """
    suf = '_avg' if avg else ''
    if self.name == 'char':
        if self.args.char_model == 'cnn':
            lds = L.dimshuffle(emb_layer, (0, 3, 1, 2))  # (100, 16, 26, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(
                    lds,
                    self.args.conv_dim, (1, n),
                    untie_biases=False,
                    W=GlorotNormal('relu') if not avg else Constant(),
                    name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)
                # Pool over every remaining character position.
                lpool = L.MaxPool2DLayer(
                    lconv,
                    (1, self.args.max_word_len - n + 1))  # (100, 64, 26, 1)
                lpool = L.flatten(lpool, outdim=3)  # (100, 16, 26)
                lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 26, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
            return xc
        elif self.args.char_model == 'lstm':
            ml = L.ExpressionLayer(
                idx_layer, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_word_len))  # (1500, 32)
            gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal(),
                                           W_cell=None,
                                           nonlinearity=tanh)
            lstm_in = L.reshape(
                emb_layer,
                (-1, self.args.max_word_len,
                 self.config['char']['emb_dim']))  # (1500, 32, 16)
            lstm_f = L.LSTMLayer(
                lstm_in,
                32,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                only_return_final=True,
                name='forward' + suf)  # (1500, 32)
            lstm_b = L.LSTMLayer(
                lstm_in,
                32,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                only_return_final=True,
                backwards=True,
                name='backward' + suf)  # (1500, 32)
            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            xc = L.concat([lstm_f, lstm_b], axis=1)  # (1500, 64)
            # Restore the per-sentence / per-window grouping of words.
            if self.args.lstm_tagger:
                xc = L.reshape(
                    xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
            elif self.args.trans_tagger:
                xc = L.reshape(
                    xc, (-1, self.args.window_size, 64))  # (100, 15, 64)
            else:
                xc = L.reshape(xc, (-1, 26, 64))  # (100, 26, 64)
            return xc
        # Unknown char_model: falls through and returns None, as before.
    elif self.name == 'morph':
        # idx (100, 26/161, 16) emb (100, 26/161, 16, 32)
        morph_model = self.args.morph_model
        if morph_model == 'max':
            xm = L.MaxPool2DLayer(
                emb_layer,
                (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
            xm = L.flatten(xm, outdim=3)  # (100, 26/161, 32)
        elif morph_model == 'avg':
            mask = L.ExpressionLayer(
                idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
            mask = L.dimshuffle(mask, (0, 1, 2, 'x'))  # (100, 26, 16, 1)
            # Repeat the mask along the last axis so it matches the
            # embedding width.
            mask = L.ExpressionLayer(mask, lambda x: T.extra_ops.repeat(
                x, self.config['morph']['emb_dim'], 3))  # (100, 26, 16, 32)
            # NOTE(review): a row whose indices are all 0 divides by zero
            # here - confirm inputs always contain a non-padding entry.
            xm = L.ElemwiseMergeLayer(
                [emb_layer, mask],
                lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
        else:
            raise ValueError(
                "unsupported morph_model %r (expected 'max' or 'avg')"
                % (morph_model,))
        return xm
    else:
        return emb_layer
def _get_l_out(self, input_vars):
    """Build the speaker network and return ``(output_layer, input_layers)``.

    The last two entries of ``input_vars`` are the previous-output and mask
    variables; everything before them is handed to
    ``self.color_vec.get_input_layer``. The colour representation goes
    through ``speaker_hidden_color_layers`` ``NINLayer``s, is concatenated
    with an embedding of the previous outputs, fed through
    ``speaker_recurrent_layers`` recurrent cells (with dropout between them
    when ``speaker_dropout > 0``), then through
    ``speaker_hidden_out_layers`` dense layers and a softmax over the
    vocabulary, reshaped to ``(-1, max_len - 1, vocab_size)``.
    """
    check_options(self.options)
    prefix = (self.id + '/') if self.id else ''

    prev_output_var, mask_var = input_vars[-2:]
    color_vars = input_vars[:-2]
    # Instances without a `context_len` attribute use a single context.
    context_len = self.context_len if hasattr(self, 'context_len') else 1

    color_repr, color_inputs = self.color_vec.get_input_layer(
        color_vars,
        recurrent_length=self.seq_vec.max_len - 1,
        cell_size=self.options.speaker_cell_size,
        context_len=context_len,
        id=self.id)

    # NINLayer acts on axis 1, so swap axes 1 and 2 around the stack.
    hidden_color = dimshuffle(color_repr, (0, 2, 1))
    for depth in range(1, self.options.speaker_hidden_color_layers + 1):
        hidden_color = NINLayer(
            hidden_color,
            num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=prefix + 'hidden_color%d' % depth)
    hidden_color = dimshuffle(hidden_color, (0, 2, 1))

    prev_out_layer = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=prefix + 'prev_input')
    prev_embed = EmbeddingLayer(
        prev_out_layer,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.speaker_cell_size,
        name=prefix + 'prev_embed')
    rnn_input = ConcatLayer([hidden_color, prev_embed], axis=2,
                            name=prefix + 'color_prev')
    mask_layer = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                            input_var=mask_var,
                            name=prefix + 'mask_input')

    make_cell = CELLS[self.options.speaker_cell]
    cell_kwargs = {
        'mask_input': (None if self.options.speaker_no_mask else mask_layer),
        'grad_clipping': self.options.speaker_grad_clipping,
        'num_units': self.options.speaker_cell_size,
    }
    if self.options.speaker_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.speaker_forget_bias))
    if self.options.speaker_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.speaker_nonlinearity]

    # All recurrent layers except the last, each optionally followed by
    # dropout.
    current = rnn_input
    for depth in range(1, self.options.speaker_recurrent_layers):
        recurrent = make_cell(current, name=prefix + 'rec%d' % depth,
                              **cell_kwargs)
        if self.options.speaker_dropout > 0.0:
            current = DropoutLayer(recurrent,
                                   p=self.options.speaker_dropout,
                                   name=prefix + 'rec%d_drop' % depth)
        else:
            current = recurrent
    # The final recurrent layer has no dropout after it.
    last_recurrent = make_cell(
        current,
        name=prefix + 'rec%d' % self.options.speaker_recurrent_layers,
        **cell_kwargs)

    hidden_out = ReshapeLayer(last_recurrent,
                              (-1, self.options.speaker_cell_size),
                              name=prefix + 'reshape')
    for depth in range(1, self.options.speaker_hidden_out_layers + 1):
        hidden_out = DenseLayer(
            hidden_out,
            num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=prefix + 'hidden_out%d' % depth)

    token_probs = DenseLayer(hidden_out,
                             num_units=len(self.seq_vec.tokens),
                             nonlinearity=softmax,
                             name=prefix + 'softmax')
    output_layer = ReshapeLayer(
        token_probs,
        (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
        name=prefix + 'out')

    return output_layer, color_inputs + [prev_out_layer, mask_layer]
def _get_l_out(self, input_vars):
    """Build the context-scoring listener and return ``(scores, inputs)``.

    ``input_vars[0]`` is the description input; the remaining variables are
    passed to ``self.color_vec.get_input_layer``. The description embedding
    is concatenated with the mean-pooled context representation and encoded
    by two recurrent layers (the second returning only its final state).
    Each context item is represented by its hidden features concatenated
    with the difference to the pooled context, and is scored against the
    encoded description via ``broadcast_dot_layer``. Scores go through the
    configured nonlinearity and a softmax.
    """
    check_options(self.options)
    prefix = (self.id + '/') if self.id else ''
    desc_var = input_vars[0]
    context_vars = input_vars[1:]

    def with_dropout(layer, name):
        # Wrap `layer` in dropout only when a positive rate is configured.
        if self.options.listener_dropout > 0.0:
            return DropoutLayer(layer, p=self.options.listener_dropout,
                                name=prefix + name)
        return layer

    desc_input = InputLayer(shape=(None, self.seq_vec.max_len),
                            input_var=desc_var,
                            name=prefix + 'desc_input')
    desc_embed = EmbeddingLayer(
        desc_input,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.listener_cell_size,
        name=prefix + 'desc_embed')

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=self.options.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    context_repr = reshape(
        context_repr,
        ([0], [1], self.context_len, self.color_vec.output_size))
    # Move the representation axis to position 1, where NINLayer acts.
    hidden_context = dimshuffle(context_repr, (0, 3, 1, 2),
                                name=prefix + 'shuffle_in')
    for depth in range(1, self.options.listener_hidden_color_layers + 1):
        hidden_context = NINLayer(
            hidden_context,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            b=Constant(0.1),
            name=prefix + 'hidden_context%d' % depth)

    # Mean over the context axis, then drop that (now size-1) axis.
    pooled = FeaturePoolLayer(hidden_context,
                              pool_size=self.context_len,
                              axis=3,
                              pool_function=T.mean,
                              name=prefix + 'pool')
    pooled_squeezed = reshape(pooled, ([0], [1], [2]),
                              name=prefix + 'pool_squeezed')
    pooled_seq = dimshuffle(pooled_squeezed, (0, 2, 1),
                            name=prefix + 'shuffle_out')
    rnn_input = ConcatLayer([pooled_seq, desc_embed], axis=2,
                            name=prefix + 'concat_inp_context')

    make_cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.listener_nonlinearity]

    rec1 = make_cell(rnn_input, name=prefix + 'rec1', **cell_kwargs)
    rec1_out = with_dropout(rec1, 'rec1_drop')
    rec2 = make_cell(rec1_out, name=prefix + 'rec2',
                     only_return_final=True, **cell_kwargs)
    rec2_out = with_dropout(rec2, 'rec2_drop')
    desc_encoding = NINLayer(rec2_out,
                             num_units=self.options.listener_cell_size,
                             nonlinearity=None,
                             name=prefix + 'rec2_dense')

    # Context is fed into the RNN as one copy for each time step; just use
    # the first time step for output.
    # Input shape: (batch_size, repr_size, seq_len, context_len)
    # Output shape: (batch_size, repr_size, context_len)
    context_first = SliceLayer(hidden_context, indices=0, axis=2,
                               name=prefix + 'context_nonrec')
    pooled_first = SliceLayer(pooled_squeezed, indices=0, axis=2,
                              name=prefix + 'pool_nonrec')
    # Output shape: (batch_size, repr_size, context_len)
    context_diff = broadcast_sub_layer(
        pooled_first,
        context_first,
        feature_dim=self.options.listener_cell_size,
        id_tag=prefix)
    # Output shape: (batch_size, repr_size * 2, context_len)
    context_features = ConcatLayer([context_first, context_diff], axis=1,
                                   name=prefix + 'concat_inp_context')
    # Output shape: (batch_size, cell_size, context_len)
    context_hidden = NINLayer(context_features,
                              num_units=self.options.listener_cell_size,
                              nonlinearity=None,
                              name=prefix + 'hidden')
    context_hidden_out = with_dropout(context_hidden, 'hidden_drop')

    dot_scores = broadcast_dot_layer(
        desc_encoding,
        context_hidden_out,
        feature_dim=self.options.listener_cell_size,
        id_tag=prefix)
    # No bias layer is applied to the dot products.
    clipped_scores = NonlinearityLayer(
        dot_scores,
        nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
        name=prefix + 'dot_clipped')
    scores = NonlinearityLayer(clipped_scores, nonlinearity=softmax,
                               name=prefix + 'scores')
    return scores, [desc_input] + context_inputs
def _get_l_out(self, input_vars):
    """Build the recurrent listener and return ``(scores, inputs)``.

    ``input_vars[0]`` is the description input; the remaining variables are
    passed to ``self.color_vec.get_input_layer``. The context
    representation goes through ``listener_hidden_color_layers``
    ``NINLayer``s, is concatenated with the description embedding, and is
    fed through two recurrent layers, one dense hidden layer and a softmax
    dense layer with ``self.context_len`` units. Dropout follows each of
    the three hidden stages when ``listener_dropout > 0``.
    """
    check_options(self.options)
    prefix = (self.id + '/') if self.id else ''
    desc_var = input_vars[0]
    context_vars = input_vars[1:]

    def with_dropout(layer, name):
        # Wrap `layer` in dropout only when a positive rate is configured.
        if self.options.listener_dropout > 0.0:
            return DropoutLayer(layer, p=self.options.listener_dropout,
                                name=prefix + name)
        return layer

    desc_input = InputLayer(shape=(None, self.seq_vec.max_len),
                            input_var=desc_var,
                            name=prefix + 'desc_input')
    desc_embed = EmbeddingLayer(
        desc_input,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.listener_cell_size,
        name=prefix + 'desc_embed')

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=self.options.listener_cell_size,
        context_len=self.context_len,
        id=self.id)

    # NINLayer acts on axis 1, so swap axes 1 and 2 around the stack.
    hidden_context = dimshuffle(context_repr, (0, 2, 1))
    for depth in range(1, self.options.listener_hidden_color_layers + 1):
        hidden_context = NINLayer(
            hidden_context,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=prefix + 'hidden_context%d' % depth)
    hidden_context = dimshuffle(hidden_context, (0, 2, 1))

    rnn_input = ConcatLayer([hidden_context, desc_embed], axis=2,
                            name=prefix + 'concat_inp_context')

    make_cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.listener_nonlinearity]

    rec1 = make_cell(rnn_input, name=prefix + 'rec1', **cell_kwargs)
    rec1_out = with_dropout(rec1, 'rec1_drop')
    rec2 = make_cell(rec1_out, name=prefix + 'rec2', **cell_kwargs)
    rec2_out = with_dropout(rec2, 'rec2_drop')

    hidden = DenseLayer(
        rec2_out,
        num_units=self.options.listener_cell_size,
        nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
        name=prefix + 'hidden')
    hidden_out = with_dropout(hidden, 'hidden_drop')

    scores = DenseLayer(hidden_out,
                        num_units=self.context_len,
                        nonlinearity=softmax,
                        name=prefix + 'scores')
    return scores, [desc_input] + context_inputs
def __init__(self, insize, vocoder, hiddensize=256,
             nonlinearity=lasagne.nonlinearities.very_leaky_rectify,
             ctx_nblayers=1, ctx_nbfilters=2, ctx_winlen=21,
             nbcnnlayers=8, nbfilters=16, spec_freqlen=5, noise_freqlen=5,
             windur=0.025, bn_axes=None, noisesize=100):
    """Build the generator network and register it via ``init_finish``.

    The input is concatenated with a ``UniformNoiseLayer`` of width
    ``noisesize`` and passed through ``layer_context``. One output branch
    is then built for each vocoder feature whose size is positive (f0,
    spectrum, noise, vuv, in that order) and the branches are
    concatenated along axis 2.

    Args:
        insize: Size of the last axis of the input layer.
        vocoder: Object providing ``f0size()``, ``specsize()``,
            ``noisesize()``, ``vuvsize()`` and ``nm_size``.
        hiddensize: Forwarded to ``model.Model.__init__``; also the width
            of the fully-connected layers of the noise branch.
        nonlinearity: Nonlinearity used by the hidden layers.
        ctx_nblayers, ctx_nbfilters, ctx_winlen: Forwarded to
            ``layer_context``.
        nbcnnlayers: Number of gated conv layers in the spectrum branch;
            the noise branch depth is derived from it.
        nbfilters: Number of filters of each gated conv layer.
        spec_freqlen: Second dimension of the spectrum conv filter size.
        noise_freqlen: Stored on the instance; not used in this method.
        windur: Used to derive the odd conv window length ``winlen``.
        bn_axes: Batch-norm axes of the spectrum projection; ``[0, 1]``
            when ``None``.
        noisesize: Width passed to ``UniformNoiseLayer``.
    """
    spec_bn_axes = [0, 1] if bn_axes is None else bn_axes

    model.Model.__init__(self, insize, vocoder, hiddensize)

    self._ctx_nblayers = ctx_nblayers
    self._ctx_nbfilters = ctx_nbfilters
    self._ctx_winlen = ctx_winlen
    self._nbcnnlayers = nbcnnlayers
    self._nbfilters = nbfilters
    self._spec_freqlen = spec_freqlen
    self._noise_freqlen = noise_freqlen
    self._windur = windur

    # Always odd. NOTE(review): 0.005 is presumably the frame shift in
    # seconds - confirm.
    winlen = int(0.5 * self._windur / 0.005) * 2 + 1

    def blstm_branch(stem, out_name, out_size):
        # Forward + backward LSTM over the context, concatenated, then a
        # linear projection to `out_size` units.
        branch = self._layer_ctx
        grad_clipping = 50
        for layi in xrange(1):
            layerstr = stem + str(1 + layi) + '_BLSTM{}'.format(
                self._hiddensize)
            fwd = models_basic.layer_LSTM(branch, self._hiddensize,
                                          nonlinearity, backwards=False,
                                          grad_clipping=grad_clipping,
                                          name=layerstr + '.fwd')
            bck = models_basic.layer_LSTM(branch, self._hiddensize,
                                          nonlinearity, backwards=True,
                                          grad_clipping=grad_clipping,
                                          name=layerstr + '.bck')
            branch = ll.ConcatLayer((fwd, bck), axis=2,
                                    name=layerstr + '.concat')
        return ll.DenseLayer(branch, num_units=out_size, nonlinearity=None,
                             num_leading_axes=2, name=out_name)

    # Input and noise, concatenated on the feature axis.
    # TODO Put the noise later on
    raw_input = ll.InputLayer(shape=(None, None, insize),
                              input_var=self._input_values,
                              name='ctx.input')
    noise_input = UniformNoiseLayer(raw_input, noisesize,
                                    name='noise.input')
    layer_ctx_input = ll.ConcatLayer((raw_input, noise_input), axis=2,
                                     name='concat.input')

    # NOTE(review): bn_axes is hard-coded here rather than taken from the
    # bn_axes argument - confirm this is intended.
    self._layer_ctx = layer_context(layer_ctx_input,
                                    ctx_nblayers=self._ctx_nblayers,
                                    ctx_nbfilters=self._ctx_nbfilters,
                                    ctx_winlen=self._ctx_winlen,
                                    hiddensize=self._hiddensize,
                                    nonlinearity=nonlinearity,
                                    bn_axes=[0, 1],
                                    bn_cnn_axes=[0, 2, 3])

    layers_toconcat = []

    if vocoder.f0size() > 0:
        # F0 - BLSTM layer
        # TODO Replace by CNN ?? It didn't work well, maybe didn't work
        # well with WGAN loss, but f0 is not more on WGAN loss
        layers_toconcat.append(
            blstm_branch('f0_l', 'f0_lout_projection', vocoder.f0size()))

    if vocoder.specsize() > 0:
        # Amplitude spectrum - 2D Gated Conv layers
        spec_dense = ll.DenseLayer(self._layer_ctx, vocoder.specsize(),
                                   nonlinearity=nonlinearity,
                                   num_leading_axes=2,
                                   name='spec_projection')
        layer_spec_proj = ll.batch_norm(spec_dense, axes=spec_bn_axes)
        # Insert a broadcastable axis at position 1 for the 2D convs.
        layer_spec = ll.dimshuffle(layer_spec_proj, [0, 'x', 1, 2],
                                   name='spec_dimshuffle')
        spec_filter = [winlen, self._spec_freqlen]
        for layi in xrange(nbcnnlayers):
            layerstr = 'spec_l' + str(1 + layi) + '_GC{}x{}x{}'.format(
                self._nbfilters, winlen, self._spec_freqlen)
            gated = layer_GatedConv2DLayer(layer_spec, self._nbfilters,
                                           spec_filter, stride=1,
                                           pad='same',
                                           nonlinearity=nonlinearity,
                                           name=layerstr)
            layer_spec = ll.batch_norm(gated)
        # Single-filter linear conv, then move axis 1 to the end and
        # flatten to 3 dimensions.
        layer_spec = ll.Conv2DLayer(layer_spec, 1,
                                    [winlen, self._spec_freqlen],
                                    pad='same', nonlinearity=None,
                                    name='spec_lout_2DC')
        layer_spec = ll.dimshuffle(layer_spec, [0, 2, 3, 1],
                                   name='spec_dimshuffle')
        layer_spec = ll.flatten(layer_spec, outdim=3, name='spec_flatten')
        layers_toconcat.append(layer_spec)

    if vocoder.noisesize() > 0:
        layer_noise = self._layer_ctx
        # NOTE(review): without true division (Python 2), `/` floors
        # before np.ceil is applied - confirm intended.
        nb_noise_layers = np.max((1, int(np.ceil(nbcnnlayers / 2))))
        for layi in xrange(nb_noise_layers):
            layerstr = 'noise_l' + str(1 + layi) + '_FC{}'.format(hiddensize)
            layer_noise = ll.DenseLayer(layer_noise, num_units=hiddensize,
                                        nonlinearity=nonlinearity,
                                        num_leading_axes=2, name=layerstr)
        # Sigmoid output for the PML vocoder, linear output otherwise.
        # sig is best among nonlin_saturatedsigmoid nonlin_tanh_saturated
        # nonlin_tanh_bysigmoid
        if isinstance(vocoder, vocoders.VocoderPML):
            noise_out_nonlin = lasagne.nonlinearities.sigmoid
        else:
            noise_out_nonlin = None
        layer_noise = ll.DenseLayer(layer_noise, num_units=vocoder.nm_size,
                                    nonlinearity=noise_out_nonlin,
                                    num_leading_axes=2, name='lo_noise')
        layers_toconcat.append(layer_noise)

    if vocoder.vuvsize() > 0:
        # VUV - BLSTM layer
        layers_toconcat.append(
            blstm_branch('vuv_l', 'vuv_lout_projection', vocoder.vuvsize()))

    layer = ll.ConcatLayer(layers_toconcat, axis=2, name='lout.concat')

    # Has to be called at the end of the __init__ to print out the
    # architecture, get the trainable params, etc.
    self.init_finish(layer)
def __init__(
        self,
        input_shape,
        output_dim,
        hidden_sizes,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_W_init=LI.GlorotUniform(),
        hidden_b_init=LI.Constant(0.),
        output_W_init=LI.GlorotUniform(),
        output_b_init=LI.Constant(0.),
        # conv_W_init=LI.GlorotUniform(), conv_b_init=LI.Constant(0.),
        hidden_nonlinearity=LN.rectify,
        output_nonlinearity=LN.softmax,
        name=None,
        input_var=None):
    """Build a conv -> dense -> output network.

    Args:
        input_shape: Shape of one input sample, as any sequence of ints
            (tuple or list). With 3 entries the flat input is reshaped
            and its last axis is moved to position 1; with 2 entries a
            singleton axis is inserted at position 1; otherwise the input
            layer takes ``(None,) + input_shape`` directly.
        output_dim: Number of units of the output layer.
        hidden_sizes: Sizes of the dense hidden layers.
        conv_filters, conv_filter_sizes, conv_strides, conv_pads:
            Per-conv-layer settings, iterated in lockstep (iteration
            stops at the shortest of the four).
        hidden_W_init, hidden_b_init: Initialisers of the dense hidden
            layers.
        output_W_init, output_b_init: Initialisers of the output layer.
        hidden_nonlinearity: Nonlinearity of the conv and hidden layers.
        output_nonlinearity: Nonlinearity of the output layer.
        name: Optional prefix for layer names (``name + "_"``).
        input_var: Optional variable bound to the input layer.
    """
    prefix = "" if name is None else name + "_"

    # Normalise to a tuple: the tuple concatenations below would raise
    # TypeError if a list were passed.
    input_shape = tuple(input_shape)
    flat_size = np.prod(input_shape)
    ndim = len(input_shape)

    if ndim == 3:
        l_in = L.InputLayer(shape=(None, flat_size), input_var=input_var)
        l_hid = L.reshape(l_in, ([0],) + input_shape)
        l_hid = L.dimshuffle(l_hid, (0, 3, 1, 2))  # theano ordering
    elif ndim == 2:
        l_in = L.InputLayer(shape=(None, flat_size), input_var=input_var)
        # Insert a singleton axis at position 1.
        l_hid = L.reshape(l_in, ([0], 1) + input_shape)
    else:
        l_in = L.InputLayer(shape=(None,) + input_shape,
                            input_var=input_var)
        l_hid = l_in

    conv_specs = zip(conv_filters, conv_filter_sizes, conv_strides,
                     conv_pads)
    for idx, (conv_filter, filter_size, stride, pad) in enumerate(conv_specs):
        l_hid = L.Conv2DLayer(
            l_hid,
            num_filters=conv_filter,
            filter_size=filter_size,
            stride=(stride, stride),
            pad=pad,
            nonlinearity=hidden_nonlinearity,
            name="%sconv_hidden_%d" % (prefix, idx),
            convolution=wrapped_conv,
        )

    for idx, hidden_size in enumerate(hidden_sizes):
        l_hid = L.DenseLayer(
            l_hid,
            num_units=hidden_size,
            nonlinearity=hidden_nonlinearity,
            name="%shidden_%d" % (prefix, idx),
            W=hidden_W_init,
            b=hidden_b_init,
        )

    l_out = L.DenseLayer(
        l_hid,
        num_units=output_dim,
        nonlinearity=output_nonlinearity,
        name="%soutput" % (prefix,),
        W=output_W_init,
        b=output_b_init,
    )

    self._l_in = l_in
    self._l_out = l_out
    self._input_var = l_in.input_var
def _get_l_out(self, input_vars, multi_utt='ignored'):
    """Build the listener network that scores each context item.

    The description is embedded, concatenated with the pooled context
    representation and run through two recurrent layers. The final
    recurrent state is combined with per-item context features through
    ``broadcast_dot_layer``, and the result is passed through the
    configured nonlinearity and a softmax.

    ``input_vars[0]`` feeds the description input layer; the remaining
    entries are handed to ``self.color_vec.get_input_layer``.
    ``multi_utt`` is accepted but not used in this method.

    Returns a tuple ``(l_scores, input_layers)``, where ``input_layers``
    is the description input layer followed by the context input layers.
    """
    check_options(self.options)
    opts = self.options
    prefix = (self.id + '/') if self.id else ''

    def maybe_dropout(layer, name):
        # Dropout is only inserted when a positive rate is configured.
        if opts.listener_dropout > 0.0:
            return DropoutLayer(layer, p=opts.listener_dropout, name=name)
        return layer

    desc_var = input_vars[0]
    context_vars = input_vars[1:]

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=desc_var,
                      name=prefix + 'desc_input')
    l_in_embed = EmbeddingLayer(l_in,
                                input_size=len(self.seq_vec.tokens),
                                output_size=opts.listener_cell_size,
                                name=prefix + 'desc_embed')

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=opts.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    # Split the last axis into (context_len, output_size), then move the
    # feature axis to position 1 for the NIN layers.
    item_shape = ([0], [1], self.context_len, self.color_vec.output_size)
    l_context_repr = reshape(l_context_repr, item_shape)
    l_hidden_context = dimshuffle(l_context_repr, (0, 3, 1, 2),
                                  name=prefix + 'shuffle_in')
    for depth in range(1, opts.listener_hidden_color_layers + 1):
        l_hidden_context = NINLayer(
            l_hidden_context,
            num_units=opts.listener_cell_size,
            nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
            b=Constant(0.1),
            name=prefix + 'hidden_context%d' % depth)

    # Mean-pool over axis 3 with pool size context_len, drop that axis,
    # then swap the last two axes before concatenating with the embedding.
    l_pool = FeaturePoolLayer(l_hidden_context,
                              pool_size=self.context_len,
                              axis=3,
                              pool_function=T.mean,
                              name=prefix + 'pool')
    l_pool_squeezed = reshape(l_pool, ([0], [1], [2]),
                              name=prefix + 'pool_squeezed')
    l_pool_shuffle = dimshuffle(l_pool_squeezed, (0, 2, 1),
                                name=prefix + 'shuffle_out')

    l_concat = ConcatLayer([l_pool_shuffle, l_in_embed], axis=2,
                           name=prefix + 'concat_inp_context')

    # Recurrent cell class and the keyword arguments shared by both layers.
    cell = CELLS[opts.listener_cell]
    rnn_args = dict(grad_clipping=opts.listener_grad_clipping,
                    num_units=opts.listener_cell_size)
    if opts.listener_cell == 'LSTM':
        rnn_args['forgetgate'] = Gate(b=Constant(opts.listener_forget_bias))
    if opts.listener_cell != 'GRU':
        rnn_args['nonlinearity'] = NONLINEARITIES[opts.listener_nonlinearity]

    l_rec1 = cell(l_concat, name=prefix + 'rec1', **rnn_args)
    l_rec1_drop = maybe_dropout(l_rec1, prefix + 'rec1_drop')

    # Only the final state of the second recurrent layer is kept.
    l_rec2 = cell(l_rec1_drop, name=prefix + 'rec2',
                  only_return_final=True, **rnn_args)
    l_rec2_drop = maybe_dropout(l_rec2, prefix + 'rec2_drop')
    l_rec2_drop = NINLayer(l_rec2_drop,
                           num_units=opts.listener_cell_size,
                           nonlinearity=None,
                           name=prefix + 'rec2_dense')

    # Context is fed into the RNN as one copy for each time step; just use
    # the first time step for output.
    # Input shape: (batch_size, repr_size, seq_len, context_len)
    # Output shape: (batch_size, repr_size, context_len)
    l_context_nonrec = SliceLayer(l_hidden_context, indices=0, axis=2,
                                  name=prefix + 'context_nonrec')
    l_pool_nonrec = SliceLayer(l_pool_squeezed, indices=0, axis=2,
                               name=prefix + 'pool_nonrec')

    # Output shape: (batch_size, repr_size, context_len)
    l_sub = broadcast_sub_layer(l_pool_nonrec, l_context_nonrec,
                                feature_dim=opts.listener_cell_size,
                                id_tag=prefix)
    # Output shape: (batch_size, repr_size * 2, context_len)
    l_concat_sub = ConcatLayer([l_context_nonrec, l_sub], axis=1,
                               name=prefix + 'concat_inp_context')
    # Output shape: (batch_size, cell_size, context_len)
    l_hidden = NINLayer(l_concat_sub,
                        num_units=opts.listener_cell_size,
                        nonlinearity=None,
                        name=prefix + 'hidden')
    l_hidden_drop = maybe_dropout(l_hidden, prefix + 'hidden_drop')

    l_dot = broadcast_dot_layer(l_rec2_drop, l_hidden_drop,
                                feature_dim=opts.listener_cell_size,
                                id_tag=prefix)
    # No bias layer is applied to the dot product.
    l_dot_clipped = NonlinearityLayer(
        l_dot,
        nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
        name=prefix + 'dot_clipped')
    l_scores = NonlinearityLayer(l_dot_clipped, nonlinearity=softmax,
                                 name=prefix + 'scores')

    return l_scores, [l_in] + context_inputs
def _get_l_out(self, input_vars, multi_utt='ignored'):
    """Build the listener network on top of ``self.get_embedding_layer``.

    ``input_vars[0]`` feeds the description input layer. The remaining
    entries are passed to ``self.get_embedding_layer``, which returns the
    embedded description layer together with the context variables that
    are then handed to ``self.color_vec.get_input_layer``.
    ``multi_utt`` is accepted but not used in this method.

    Returns a tuple ``(l_scores, input_layers)``: ``l_scores`` is the final
    softmax ``DenseLayer`` with ``self.context_len`` units, and
    ``input_layers`` is the description input layer followed by the
    context input layers.
    """
    check_options(self.options)
    opts = self.options
    prefix = (self.id + '/') if self.id else ''

    def maybe_dropout(layer, name):
        # Dropout is only inserted when a positive rate is configured.
        if opts.listener_dropout > 0.0:
            return DropoutLayer(layer, p=opts.listener_dropout, name=name)
        return layer

    desc_var = input_vars[0]
    extra_vars = input_vars[1:]

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=desc_var,
                      name=prefix + 'desc_input')
    l_in_embed, context_vars = self.get_embedding_layer(l_in, extra_vars)

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=opts.listener_cell_size,
        context_len=self.context_len,
        id=self.id)

    # Swap the last two axes for the NIN layers, then swap them back.
    l_ctx = dimshuffle(l_context_repr, (0, 2, 1))
    for depth in range(1, opts.listener_hidden_color_layers + 1):
        l_ctx = NINLayer(
            l_ctx,
            num_units=opts.listener_cell_size,
            nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
            name=prefix + 'hidden_context%d' % depth)
    l_ctx = dimshuffle(l_ctx, (0, 2, 1))

    l_concat = ConcatLayer([l_ctx, l_in_embed], axis=2,
                           name=prefix + 'concat_inp_context')

    # Recurrent cell class and the keyword arguments shared by both layers.
    cell = CELLS[opts.listener_cell]
    rnn_args = dict(grad_clipping=opts.listener_grad_clipping,
                    num_units=opts.listener_cell_size)
    if opts.listener_cell == 'LSTM':
        rnn_args['forgetgate'] = Gate(b=Constant(opts.listener_forget_bias))
    if opts.listener_cell != 'GRU':
        rnn_args['nonlinearity'] = NONLINEARITIES[opts.listener_nonlinearity]

    l_rec1 = cell(l_concat, name=prefix + 'rec1', **rnn_args)
    l_rec1_drop = maybe_dropout(l_rec1, prefix + 'rec1_drop')

    l_rec2 = cell(l_rec1_drop, name=prefix + 'rec2', **rnn_args)
    l_rec2_drop = maybe_dropout(l_rec2, prefix + 'rec2_drop')

    l_hidden = DenseLayer(
        l_rec2_drop,
        num_units=opts.listener_cell_size,
        nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
        name=prefix + 'hidden')
    l_hidden_drop = maybe_dropout(l_hidden, prefix + 'hidden_drop')

    l_scores = DenseLayer(l_hidden_drop,
                          num_units=self.context_len,
                          nonlinearity=softmax,
                          name=prefix + 'scores')

    return l_scores, [l_in] + context_inputs
def build_critic(self, critic_input_var, condition_var, vocoder, ctxsize,
                 nonlinearity=lasagne.nonlinearities.very_leaky_rectify,
                 postlayers_nb=6, use_LSweighting=True,
                 LSWGANtransfreqcutoff=4000, LSWGANtranscoef=1.0 / 8.0,
                 use_WGAN_incnoisefeature=False):
    """Build the critic network.

    The spectrum band (and optionally the noise band) is sliced out of
    the critic input, optionally multiplied by constant LS weights, and
    analysed by a stack of gated 2D conv layers. The flattened results
    are concatenated with the output of ``layer_context`` applied to the
    condition input, and a stack of dense layers ends in a single linear
    output unit.

    Args:
        critic_input_var: Variable bound to the critic input layer.
        condition_var: Variable bound to the context input layer.
        vocoder: Object providing ``featuressize()``, ``f0size()``,
            ``specsize()``, ``noisesize()`` and ``fs``.
        ctxsize: Size of the last axis of the context input layer.
        nonlinearity: Nonlinearity of the hidden layers.
        postlayers_nb: Number of dense layers after the concatenation.
        use_LSweighting: Multiply each sliced band by weights computed
            with ``nonlin_sigmoidparm`` (a banner is printed when done).
        LSWGANtransfreqcutoff: Passed to ``sp.freq2fwspecidx`` and
            printed in the banner.
        LSWGANtranscoef: Last argument of ``nonlin_sigmoidparm``.
        use_WGAN_incnoisefeature: Also analyse the noise band when
            ``vocoder.noisesize() > 0``.

    Returns:
        ``[output_layer, critic_input_layer, context_input_layer]``.
    """
    useLRN = False  # TODO

    layer_critic = ll.InputLayer(shape=(None, None, vocoder.featuressize()),
                                 input_var=critic_input_var, name='input')

    # Always odd. NOTE(review): 0.005 is presumably the frame shift in
    # seconds - confirm.
    winlen = int(0.5 * self._windur / 0.005) * 2 + 1

    def ls_weighted(band, band_size, banner):
        # Multiply a sliced band by constant weights (weighted WGAN+LS).
        print(banner.format(LSWGANtransfreqcutoff))
        weights = nonlin_sigmoidparm(
            np.arange(band_size(), dtype=theano.config.floatX),
            sp.freq2fwspecidx(LSWGANtransfreqcutoff, vocoder.fs,
                              band_size()),
            LSWGANtranscoef)
        shared_weights = theano.shared(value=np.asarray(weights),
                                       name='wganls_spec_weights_')
        return CstMulLayer(band, cstW=shared_weights,
                           name='cstdot_wganls_weights')

    def gated_stack(bottom, depth, freqlen, stem):
        # Stack `depth` gated 2D conv layers of size [winlen, freqlen].
        top = bottom
        for layi in xrange(depth):
            layerstr = stem + str(1 + layi) + '_GC{}x{}x{}'.format(
                self._nbfilters, winlen, freqlen)
            # strides>1 make the first two Conv layers pyramidal.
            # Increase patches' effects here and there, bad.
            top = layer_GatedConv2DLayer(top, self._nbfilters,
                                         [winlen, freqlen], pad='same',
                                         nonlinearity=nonlinearity,
                                         name=layerstr)
            if useLRN:
                top = ll.LocalResponseNormalization2DLayer(top)
        return top

    layerstoconcats = []

    # Amplitude spectrum (assumed feature order: f0 first, then spectrum)
    spec_band = slice(vocoder.f0size(),
                      vocoder.f0size() + vocoder.specsize())
    layer = ll.SliceLayer(layer_critic, indices=spec_band, axis=2,
                          name='spec_slice')
    if use_LSweighting:
        layer = ls_weighted(
            layer, vocoder.specsize,
            'WGAN Weighted LS - critic - SPEC (trans cutoff {}Hz)')
    layer = ll.dimshuffle(layer, [0, 'x', 1, 2], name='spec_dimshuffle')
    layer = gated_stack(layer, self._nbcnnlayers, self._spec_freqlen,
                        'spec_l')
    layer = ll.dimshuffle(layer, [0, 2, 3, 1], name='spec_dimshuffle')
    layer_spec = ll.flatten(layer, outdim=3, name='spec_flatten')
    layerstoconcats.append(layer_spec)

    # Add noise in critic
    if use_WGAN_incnoisefeature and vocoder.noisesize() > 0:
        nm_band = slice(
            vocoder.f0size() + vocoder.specsize(),
            vocoder.f0size() + vocoder.specsize() + vocoder.noisesize())
        layer = ll.SliceLayer(layer_critic, indices=nm_band, axis=2,
                              name='nm_slice')
        if use_LSweighting:
            layer = ls_weighted(
                layer, vocoder.noisesize,
                'WGAN Weighted LS - critic - NM (trans cutoff {}Hz)')
        layer = ll.dimshuffle(layer, [0, 'x', 1, 2], name='nm_dimshuffle')
        # NOTE(review): without true division (Python 2), `/` floors
        # before np.ceil is applied - confirm intended.
        nm_depth = np.max((1, int(np.ceil(self._nbcnnlayers / 2))))
        layer = gated_stack(layer, nm_depth, self._noise_freqlen, 'nm_l')
        layer = ll.dimshuffle(layer, [0, 2, 3, 1], name='nm_dimshuffle')
        layer_bndnm = ll.flatten(layer, outdim=3, name='nm_flatten')
        layerstoconcats.append(layer_bndnm)

    # Add the contexts
    layer_ctx_input = ll.InputLayer(shape=(None, None, ctxsize),
                                    input_var=condition_var,
                                    name='ctx_input')
    layer_ctx = layer_context(layer_ctx_input,
                              ctx_nblayers=self._ctx_nblayers,
                              ctx_nbfilters=self._ctx_nbfilters,
                              ctx_winlen=self._ctx_winlen,
                              hiddensize=self._hiddensize,
                              nonlinearity=nonlinearity,
                              bn_axes=None, bn_cnn_axes=None,
                              critic=True, useLRN=useLRN)
    layerstoconcats.append(layer_ctx)

    # Concatenate the features analysis with the contexts...
    layer = ll.ConcatLayer(layerstoconcats, axis=2,
                           name='ctx_features.concat')

    # ... and finalize with a common FC network
    for layi in xrange(postlayers_nb):
        layerstr = 'post.l' + str(1 + layi) + '_FC' + str(self._hiddensize)
        layer = ll.DenseLayer(layer, self._hiddensize,
                              nonlinearity=nonlinearity,
                              num_leading_axes=2, name=layerstr)

    # output layer (linear): no nonlinearity for this output
    layer = ll.DenseLayer(layer, 1, nonlinearity=None, num_leading_axes=2,
                          name='projection')

    return [layer, layer_critic, layer_ctx_input]