def example2():
    """GRU"""
    input_sequence = tensor.tensor3('x')
    dim = 3

    # The fork produces both the linear inputs and the gate inputs the
    # GRU transition expects (gates need twice the state dimension).
    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2], name='fork',
                output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim,
                         weights_init=initialization.Identity(),
                         biases_init=Constant(0))
    for brick in (fork, gru):
        brick.initialize()

    linear, gate_inputs = fork.apply(input_sequence)
    states = gru.apply(linear, gate_inputs)
    run = theano.function([input_sequence], states)
    print(run(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    # Same network, but the input is first scaled by a doubling layer.
    doubler = Linear(input_dim=dim, output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()
    doubled_linear, doubled_gates = fork.apply(doubler.apply(input_sequence))
    doubled_states = gru.apply(doubled_linear, doubled_gates)
    run = theano.function([input_sequence], doubled_states)
    print(run(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3
    fork = Fork(
        input_dim=dim,
        output_dims=[dim, dim * 2],
        name='fork',
        output_names=["linear", "gates"],
        weights_init=initialization.Identity(),
        biases_init=Constant(0))
    gru = GatedRecurrent(
        dim=dim,
        weights_init=initialization.Identity(),
        biases_init=Constant(0))
    fork.initialize()
    gru.initialize()

    def run_and_show(variable):
        # Compile the graph and evaluate it on an all-ones batch.
        fn = theano.function([x], variable)
        print(fn(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    linear, gate_inputs = fork.apply(x)
    run_and_show(gru.apply(linear, gate_inputs))

    # Same network, fed through a linear layer that doubles its input.
    doubler = Linear(
        input_dim=dim,
        output_dim=dim,
        weights_init=initialization.Identity(2),
        biases_init=initialization.Constant(0))
    doubler.initialize()
    lin, gate = fork.apply(doubler.apply(x))
    run_and_show(gru.apply(lin, gate))
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    """Build GRU layer ``n`` of width ``dim`` on top of sequence ``h``."""
    suffix = str(n)
    fork = Fork(output_names=['linear' + suffix, 'gates' + suffix],
                name='fork' + suffix,
                input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + suffix)
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    # Only the first layer of the stack applies the sequence mask.
    if first:
        return gru.apply(linear, gates, mask=x_mask, **kwargs)
    return gru.apply(linear, gates, **kwargs)
def gru_layer(dim, h, n):
    """Stack a fresh GRU layer (index ``n``) on top of the sequence ``h``."""
    layer_id = str(n)
    fork = Fork(
        output_names=['linear' + layer_id, 'gates' + layer_id],
        name='fork' + layer_id,
        input_dim=dim,
        output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + layer_id)
    initialize([fork, gru])
    # Fork splits h into the linear and the gate input streams.
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
def gru_layer(dim, h, n):
    """Return the state sequence of GRU layer ``n`` applied to ``h``."""
    idx = str(n)
    fork = Fork(output_names=["linear" + idx, "gates" + idx],
                name="fork" + idx,
                input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name="gru" + idx)
    initialize([fork, gru])
    fork_linear, fork_gates = fork.apply(h)
    return gru.apply(fork_linear, fork_gates)
class Encoder(Initializable):
    """Sentence encoder: embeds source tokens and runs a GRU over them.

    The representation returned is the final hidden state of the
    recurrent transition over the (optionally reversed) sentence.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        # If True, the input sequence (and its mask) is processed reversed.
        self.reverse = reverse
        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        # One fork output per (non-mask) sequence input of the transition.
        self.fork = Fork([
            name for name in self.transition.apply.sequences
            if name != 'mask'
        ], prototype=Linear())
        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        # Propagate dimensions down to the child bricks before allocation.
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [
            self.state_dim for _ in self.fork.output_names
        ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Encode a batch of sentences into one vector each."""
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]
        embeddings = self.lookup.apply(source_sentence)
        representation = self.transition.apply(
            **merge(self.fork.apply(embeddings, as_dict=True),
                    {'mask': source_sentence_mask}))
        # Keep only the last state as the sentence summary.
        return representation[-1]
class InnerRecurrent(BaseRecurrent, Initializable):
    """A GRU whose per-step input is the sum of a sequence input and a
    fixed context input.

    Both inputs are forked into the GRU's (non-mask) input streams and
    added elementwise before each transition step.
    """

    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')
        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')
        super(InnerRecurrent, self).__init__(**kwargs)
        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]

    def _push_allocation_config(self):
        # Fork output sizes must match the GRU's expected input sizes.
        self.inner_input_fork.output_dims = self.inner_gru.get_dims(
            self.inner_input_fork.output_names)
        self.outer_input_fork.output_dims = self.inner_gru.get_dims(
            self.outer_input_fork.output_names)

    @recurrent(sequences=['inner_inputs'], states=['states'],
               contexts=['outer_inputs'], outputs=['states'])
    def apply(self, inner_inputs, states, outer_inputs):
        """One transition step: fork both inputs, sum, feed the GRU."""
        forked_inputs = self.inner_input_fork.apply(inner_inputs, as_dict=True)
        forked_states = self.outer_input_fork.apply(outer_inputs, as_dict=True)

        gru_inputs = {key: forked_inputs[key] + forked_states[key]
                      for key in forked_inputs.keys()}

        new_states = self.inner_gru.apply(
            iterate=False,
            **dict_union(gru_inputs, {'states': states}))
        return new_states  # mean according to the time axis

    def get_dim(self, name):
        """Return the dimension of ``states``; raise for unknown names."""
        if name == 'states':
            return self.inner_gru.get_dim(name)
        # BUG FIX: the original did `return AttributeError`, handing the
        # exception class back to the caller instead of signalling the
        # error. Unknown names must raise.
        raise AttributeError(name)
class GatedRecurrentWithContext(Initializable):
    """Wrap a ``GatedRecurrent`` so that fixed context vectors are added
    to its input, update and reset streams at every time step.

    NOTE(review): ``__init__`` never calls ``super().__init__`` — confirm
    the ``Initializable`` machinery is set up by the caller or framework.
    """

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded to the wrapped brick.
        self.gated_recurrent = GatedRecurrent(*args, **kwargs)
        self.children = [self.gated_recurrent]

    @application(states=['states'], outputs=['states'], contexts=[
        'readout_context', 'transition_context', 'update_context',
        'reset_context'
    ])
    def apply(self, transition_context, update_context, reset_context,
              *args, **kwargs):
        """Add the contexts to the per-step inputs and delegate."""
        kwargs['inputs'] += transition_context
        kwargs['update_inputs'] += update_context
        kwargs['reset_inputs'] += reset_context
        # readout_context was only added for the Readout brick, discard it
        kwargs.pop('readout_context')
        return self.gated_recurrent.apply(*args, **kwargs)

    def get_dim(self, name):
        # All four contexts share the wrapped GRU's state dimension
        # (``self.dim`` resolves through ``__getattr__`` below).
        if name in [
                'readout_context', 'transition_context', 'update_context',
                'reset_context'
        ]:
            return self.dim
        return self.gated_recurrent.get_dim(name)

    def __getattr__(self, name):
        # Delegate unknown attributes to the wrapped brick; the guard
        # prevents infinite recursion before ``gated_recurrent`` exists.
        if name == 'gated_recurrent':
            raise AttributeError
        return getattr(self.gated_recurrent, name)

    @apply.property('sequences')
    def apply_inputs(self):
        # The sequence inputs depend on which gates the GRU actually uses.
        sequences = ['mask', 'inputs']
        if self.use_update_gate:
            sequences.append('update_inputs')
        if self.use_reset_gate:
            sequences.append('reset_inputs')
        return sequences
class Encoder(Initializable):
    """Sentence encoder: GRU over embedded tokens, returning the final
    hidden state as the sentence representation.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        # If True, sentences are consumed back-to-front.
        self.reverse = reverse
        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        # One fork output for every non-mask sequence input of the GRU.
        self.fork = Fork([name for name in self.transition.apply.sequences
                          if name != 'mask'], prototype=Linear())
        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        # Push dimensions into the child bricks before allocation.
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Encode a batch of sentences; one vector per sentence."""
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]
        embeddings = self.lookup.apply(source_sentence)
        representation = self.transition.apply(**merge(
            self.fork.apply(embeddings, as_dict=True),
            {'mask': source_sentence_mask}
        ))
        # Last state only: the summary of the whole sequence.
        return representation[-1]
class Encoder(Initializable):
    """Encoder of RNNsearch model."""

    def __init__(self, blockid, vocab_size, embedding_dim, state_dim,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        # blockid suffixes brick names so several encoders can coexist.
        self.blockid = blockid

        self.lookup = LookupTable(name='embeddings' + '_' + self.blockid)
        self.gru = GatedRecurrent(activation=Tanh(), dim=state_dim,
                                  name="GatedRNN" + self.blockid)
        # Forward fork feeds every non-mask sequence input of the GRU.
        self.fwd_fork = Fork(
            [name for name in self.gru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='fwd_fork' + '_' + self.blockid)

        self.children = [self.lookup, self.gru, self.fwd_fork]

    def _push_allocation_config(self):
        # Push dimensions into child bricks before allocation.
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [self.gru.get_dim(name)
                                     for name in self.fwd_fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Run the GRU over the embedded sentence; return all states."""
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        embeddings = self.lookup.apply(source_sentence)
        grupara = merge(self.fwd_fork.apply(embeddings, as_dict=True),
                        {'mask': source_sentence_mask})
        # Unlike encoders that keep only the last state, the full
        # sequence of hidden states is returned here.
        representation = self.gru.apply(**grupara)
        return representation
class Encoder(Initializable):
    """Minimal encoder: embedding lookup followed by a GRU."""

    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.lookup = LookupTable(name='embeddings')
        self.GRU = GatedRecurrent(activation=Tanh(), dim=state_dim)
        self.children = [self.lookup, self.GRU]

    def _push_allocation_config(self):
        # Push dimensions into the lookup table before allocation.
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Return the full sequence of GRU hidden states."""
        # Time as first dimension.
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T
        embeddings = self.lookup.apply(source_sentence)
        # NOTE(review): embeddings are passed both as inputs and as
        # gate inputs; GatedRecurrent gate inputs normally need size
        # 2 * dim — confirm the dimensions actually match here.
        representation = self.GRU.apply(embeddings, embeddings)
        return representation
class GatedRecurrentWithContext(Initializable):
    """``GatedRecurrent`` wrapper that adds fixed context vectors to the
    input, update and reset streams at every step.

    NOTE(review): ``__init__`` does not call ``super().__init__`` —
    verify that ``Initializable`` set-up happens elsewhere.
    """

    def __init__(self, *args, **kwargs):
        # Forward all constructor arguments to the wrapped brick.
        self.gated_recurrent = GatedRecurrent(*args, **kwargs)
        self.children = [self.gated_recurrent]

    @application(states=['states'], outputs=['states'],
                 contexts=['readout_context', 'transition_context',
                           'update_context', 'reset_context'])
    def apply(self, transition_context, update_context, reset_context,
              *args, **kwargs):
        """Mix the contexts into the step inputs, then delegate."""
        kwargs['inputs'] += transition_context
        kwargs['update_inputs'] += update_context
        kwargs['reset_inputs'] += reset_context
        # readout_context was only added for the Readout brick, discard it
        kwargs.pop('readout_context')
        return self.gated_recurrent.apply(*args, **kwargs)

    def get_dim(self, name):
        # Every context shares the GRU's state dimension (``self.dim``
        # resolves via ``__getattr__`` below).
        if name in ['readout_context', 'transition_context',
                    'update_context', 'reset_context']:
            return self.dim
        return self.gated_recurrent.get_dim(name)

    def __getattr__(self, name):
        # Delegate everything else to the wrapped brick; the guard stops
        # infinite recursion before ``gated_recurrent`` is assigned.
        if name == 'gated_recurrent':
            raise AttributeError
        return getattr(self.gated_recurrent, name)

    @apply.property('sequences')
    def apply_inputs(self):
        # Sequence inputs depend on which gates the wrapped GRU uses.
        sequences = ['mask', 'inputs']
        if self.use_update_gate:
            sequences.append('update_inputs')
        if self.use_reset_gate:
            sequences.append('reset_inputs')
        return sequences
iteration = 300  # number of epochs of gradient descent

# FIX: this was a Python-2 `print` statement; the rest of the file uses
# the print() function, so make it consistent (and Python-3 compatible).
print("Building Model")

# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model: linear -> GRU -> linear -> sigmoid
linear = Linear(input_dim=n_u, output_dim=n_h, name="first_layer")
rnn = GatedRecurrent(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = rnn.apply(x_transform)
predict = sigm.apply(linear2.apply(h))

# only for generation B x h_dim
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = rnn.apply(x_transform, h_initial, iterate=False)
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'

# Cost function
cost = SquaredError().apply(predict, target)

# Initialization
class Parrot(Initializable, Random):
    """Attention-based speech-synthesis model (3 stacked GRUs with a
    Graves-style location attention over encoded text labels).

    The model reads encoded character labels, attends over them with a
    mixture-of-gaussians (or softmax) attention driven by the first GRU,
    and emits vocoder frames through a readout layer, trained with
    either an MSE or a GMM cost.
    """

    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of vocoder fram
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feedback to the top rnn layer
            full_feedback=False,  # Feedback to all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # How many speakers there are?
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stabilities
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at initialization
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            **kwargs):
        super(Parrot, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim

        # A bidirectional encoder doubles the effective label dimension.
        self.encoded_input_dim = input_dim
        if self.encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim

        if self.feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

        self.h1_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')
        self.h2_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h2_to_readout')
        self.h3_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h3_to_readout')

        # Skip connections between GRU layers: each fork produces the
        # linear inputs and the (2x wide) gate inputs of the target GRU.
        self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h2')
        self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h3')
        self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h2_to_h3')

        if which_cost == 'MSE':
            self.readout_to_output = Linear(input_dim=readouts_dim,
                                            output_dim=output_dim,
                                            name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            # GMM head: per-component means, scales and mixture weights.
            self.readout_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=readouts_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='readout_to_output')

        self.encoder = Encoder(encoder_type, num_characters, input_dim,
                               encoder_dim, name='encoder')

        self.children = [
            self.encoder, self.rnn1, self.rnn2, self.rnn3,
            self.h1_to_readout, self.h2_to_readout, self.h3_to_readout,
            self.h1_to_h2, self.h1_to_h3, self.h2_to_h3,
            self.readout_to_output
        ]

        # Attention-window (w) projections into each GRU layer.
        self.inp_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h1')
        self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h2')
        self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h3')
        self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3]

        # h1 drives the attention parameters (alpha, beta, kappa).
        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rnn_h_dim,
                              output_dims=[attention_size] * 3,
                              name='h1_to_att')
        self.att_to_readout = Linear(input_dim=self.encoded_input_dim,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')
        self.children += [self.h1_to_att, self.att_to_readout]

        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            # Speaker embedding is injected into every GRU layer, the
            # readout, and the output head.
            self.speaker_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h1')
            self.speaker_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h2')
            self.speaker_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h3')

            self.speaker_to_readout = Linear(input_dim=speaker_dim,
                                             output_dim=readouts_dim,
                                             name='speaker_to_readout')

            if which_cost == 'MSE':
                self.speaker_to_output = Linear(input_dim=speaker_dim,
                                                output_dim=output_dim,
                                                name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = Fork(
                    output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                    input_dim=speaker_dim,
                    output_dims=[
                        output_dim * k_gmm, output_dim * k_gmm, k_gmm
                    ],
                    name='speaker_to_output')

            self.children += [
                self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2,
                self.speaker_to_h3, self.speaker_to_readout,
                self.speaker_to_output
            ]

        if full_feedback:
            # Previous output frame fed into layers 2 and 3 as well.
            self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h2')
            self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h3')
            self.children += [self.out_to_h2, self.out_to_h3]
            # Full feedback implies (at least) weak feedback.
            weak_feedback = True

        self.weak_feedback = weak_feedback

        if weak_feedback:
            self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h1')
            self.children += [self.out_to_h1]

    def _allocate(self):
        # Trainable initial attention window.
        self.initial_w = shared_floatx_zeros(
            (self.encoded_input_dim, ), name="initial_w")

        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        """Create the symbolic inputs shared by training and sampling."""
        features = tensor.tensor3('features')
        features_mask = tensor.matrix('features_mask')
        labels = tensor.imatrix('labels')
        labels_mask = tensor.matrix('labels_mask')
        start_flag = tensor.scalar('start_flag')

        if self.use_speaker:
            speaker = tensor.imatrix('speaker_index')
        else:
            speaker = None

        return features, features_mask, labels, labels_mask, \
            speaker, start_flag

    def initial_states(self, batch_size):
        """Initial and persisted (last-minibatch) states for all layers,
        the attention window w and the attention position k."""
        initial_h1 = self.rnn1.initial_states(batch_size)
        initial_h2 = self.rnn2.initial_states(batch_size)
        initial_h3 = self.rnn3.initial_states(batch_size)

        last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h3 = shared_floatx_zeros((batch_size, self.rnn_h_dim))

        # Defining for all
        initial_k = tensor.zeros(
            (batch_size, self.attention_size), dtype=floatX)
        last_k = shared_floatx_zeros((batch_size, self.attention_size))

        # Trainable initial state for w. Why not for k?
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)

        last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim))

        return initial_h1, last_h1, initial_h2, last_h2, initial_h3, \
            last_h3, initial_w, last_w, initial_k, last_k

    @application
    def compute_cost(
            self, features, features_mask, labels, labels_mask,
            speaker, start_flag, batch_size):
        """Build the training cost graph.

        Returns ``(cost, updates, attention_vars)`` where ``updates``
        carry the last states across minibatches of the same utterance.
        """
        if speaker is None:
            assert not self.use_speaker

        # Predict frame t+1 from everything up to frame t.
        target_features = features[1:]
        mask = features_mask[1:]

        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            input_features = features[:-1]

            if self.feedback_noise_level:
                noise = self.theano_rng.normal(
                    size=input_features.shape, avg=0., std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [out_cell_h1, out_gat_h1]

            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]

            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2,
                spk_cell_h3, spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(
            start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(
            start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(
            start_flag, initial_h3, last_h3)
        input_w = tensor.switch(
            start_flag, initial_w, last_w)
        input_k = tensor.switch(
            start_flag, initial_k, last_k)

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t,
                 inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1,
                 k_tm1, w_tm1, context_oh):
            # One decoder step: h1 -> attention -> h2 -> h3.
            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(
                inp_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            # Attention position only moves forward (monotonic).
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)

            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]

            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                inp_h2_t + h1inp_h2, gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]

            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                inp_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1, input_h2, input_h3, input_k, input_w, None, None
            ])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [h1_out, h2_out, h3_out]

        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            cost = tensor.sum((predicted - target_features)**2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            coeff = tensor.nnet.softmax(coeff.reshape(
                (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        # `+ 0. * start_flag` keeps start_flag in the graph so it is a
        # legitimate input of the compiled function.
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars

    @application
    def sample_model_fun(
            self, labels, labels_mask, speaker, num_samples, seq_size):
        """Build the free-running sampling graph (feeds back its own
        predictions instead of ground-truth frames)."""
        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(num_samples)

        initial_x = numpy.zeros(
            (num_samples, self.output_dim), dtype=floatX)

        cell_shape = (seq_size, num_samples, self.rnn_h_dim)
        gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)

            # Applied before the broadcast.
            spk_readout = self.speaker_to_readout.apply(emb_speaker)
            spk_output = self.speaker_to_output.apply(emb_speaker)

            # Add dimension to repeat with time.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2,
                spk_cell_h3, spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += spk_cell_h1
            cell_h2 += spk_cell_h2
            cell_h3 += spk_cell_h3
            gat_h1 += spk_gat_h1
            gat_h2 += spk_gat_h2
            gat_h3 += spk_gat_h3

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def sample_step(
                inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t,
                inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1,
                k_tm1, w_tm1):
            # Mirrors compute_cost's step, with the previous *sampled*
            # frame x_tm1 used as feedback.
            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [out_cell_h1_t, out_gat_h1_t]

                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t
                ]

                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(
                cell_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            # Sampling-time sharpening/timing knobs differ from training.
            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)

            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]

            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                cell_h2_t + h1inp_h2, gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]

            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                cell_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [h1_out_t, h2_out_t, h3_out_t]

            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                # sampling_bias sharpens the mixture (Graves 2013).
                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape(
                            coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(
                    mu_t, sigma_t, coeff_t, self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[],
            outputs_info=[
                initial_x, initial_h1, initial_h2, initial_h3,
                initial_k, initial_w, None, None, None
            ])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(
            self, labels_tr, labels_mask_tr, features_mask_tr,
            speaker_tr, num_samples, num_steps):
        """Compile and run the sampler on numpy inputs."""
        features, features_mask, labels, labels_mask, speaker, start_flag = \
            self.symbolic_input_variables()

        sample_x, k, w, pi, phi, pi_att, updates = \
            self.sample_model_fun(
                labels, labels_mask, speaker, num_samples, num_steps)

        theano_inputs = [labels, labels_mask]
        numpy_inputs = (labels_tr, labels_mask_tr)

        if self.use_speaker:
            theano_inputs += [speaker]
            numpy_inputs += (speaker_tr, )

        return function(
            theano_inputs, [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)

    def sample_using_input(self, data_tr, num_samples):
        # Used to predict the values using the dataset
        features, features_mask, labels, labels_mask, speaker, start_flag = \
            self.symbolic_input_variables()

        cost, updates, attention_vars = self.compute_cost(
            features, features_mask, labels, labels_mask,
            speaker, start_flag, num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        theano_vars = [
            features, features_mask, labels, labels_mask, speaker, start_flag
        ]
        theano_vars = [x for x in theano_vars if x is not None]
        theano_vars = list(set(theano_vars))
        # Match symbolic variables to dataset entries by variable name.
        theano_vars = {x.name: x for x in theano_vars}

        theano_inputs = []
        numpy_inputs = []

        for key in data_tr.keys():
            theano_inputs.append(theano_vars[key])
            numpy_inputs.append(data_tr[key])

        return function(
            theano_inputs, [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)
class TestGatedRecurrent(unittest.TestCase):
    """Tests for GatedRecurrent against a NumPy reference implementation.

    NOTE(review): this version passes ``use_update_gate`` and ``seed``
    keywords and feeds the update/reset gate inputs as two separate
    arguments — presumably an older Blocks API; confirm against the
    installed Blocks version.
    """

    def setUp(self):
        # Fully gated network with constant weights (deterministic check).
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2),
            activation=Tanh(), gate_activation=Tanh())
        self.gated.initialize()
        # Reset-gate-only network; seeded so the random weights are
        # reproducible across runs.
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(),
            activation=Tanh(), gate_activation=Tanh(),
            use_update_gate=False, seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        """One transition of the fully-gated network matches NumPy."""
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        z = tensor.matrix('z')  # update-gate input
        r = tensor.matrix('r')  # reset-gate input
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype=floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        # All recurrent weight matrices were initialized to Constant(2).
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)

        # NumPy reference: tanh gates (gate_activation=Tanh above), then
        # the standard GRU convex combination of candidate and old state.
        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
                  + (1 - z_val) * h0_val)
        assert_allclose(h1_val, next_h(h0_val, x_val, zi_val, ri_val)[0],
                        rtol=1e-6)

    def test_reset_only_many_steps(self):
        """Masked 24-step run of the reset-only network matches NumPy."""
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        # 24 time steps x 4 sequences x 3 features, derived from the
        # 4! = 24 permutations so every sequence differs.
        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=floatX)
        x_val = numpy.ones((24, 4, 3), dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        mask_val = numpy.ones((24, 4), dtype=floatX)
        # Mask out the second half of the last sequence to exercise the
        # state carry-through.
        mask_val[12:24, 3] = 0
        # Row 0 is the zero initial state; rows 1..24 are the outputs.
        h_val = numpy.zeros((25, 4, 3), dtype=floatX)
        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        for i in range(1, 25):
            r_val = numpy.tanh(h_val[i - 1].dot(U) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W)
                                  + x_val[i - 1])
            # Where masked out, keep the previous state unchanged.
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val, mask_val)[0], 1e-03)
class GatedRecurrentFull(Initializable):
    """A wrapper around the GatedRecurrent brick that improves usability.

    It contains:

    * A fork to initialize the reset and the update units.
    * Better initialization to initialize the different pieces

    While this works, there is probably a better more elegant way to do
    this.

    Parameters
    ----------
    hidden_dim : int
        dimension of the hidden state
    activation : :class:`.Brick`
    gate_activation : :class:`.Brick`
    state_to_state_init : object
        Weight Initialization
    state_to_reset_init : object
        Weight Initialization
    state_to_update_init : object
        Weight Initialization
    input_to_state_transform : :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_reset_transform : :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_update_transform : :class:`.Brick`
        [CvMG14] uses Linear transform

    References
    ----------
    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.
    """
    @lazy(allocation=['hidden_dim', 'state_to_state_init',
                      'state_to_update_init', 'state_to_reset_init'],
          initialization=['input_to_state_transform',
                          'input_to_update_transform',
                          'input_to_reset_transform'])
    def __init__(self, hidden_dim, activation=None, gate_activation=None,
                 state_to_state_init=None, state_to_update_init=None,
                 state_to_reset_init=None, input_to_state_transform=None,
                 input_to_update_transform=None,
                 input_to_reset_transform=None, **kwargs):
        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform

        # Suffix the brick names so their parameters are telling them
        # apart in the computation graph.
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        self.use_mine = True
        # The NaN weights are placeholders: initialize() overwrites them,
        # and NaNs make any accidental use of uninitialized weights loud.
        if self.use_mine:
            self.rnn = GatedRecurrentFast(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=activation, gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=activation, gate_activation=gate_activation)

        self.children = [self.rnn,
                         self.input_to_state_transform,
                         self.input_to_update_transform,
                         self.input_to_reset_transform]
        self.children.extend(self.rnn.children)

    def initialize(self):
        """Initialize children, then overwrite the recurrent matrices.

        The state-to-state / update / reset weights are drawn from the
        user-supplied initialization schemes instead of the NaN
        placeholders set in ``__init__``.
        """
        super(GatedRecurrentFull, self).initialize()

        self.input_to_state_transform.initialize()
        self.input_to_update_transform.initialize()
        self.input_to_reset_transform.initialize()

        self.rnn.initialize()

        weight_shape = (self.hidden_dim, self.hidden_dim)
        state_to_state = self.state_to_state_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_update = self.state_to_update_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_reset = self.state_to_reset_init.generate(
            rng=self.rng, shape=weight_shape)

        self.rnn.state_to_state.set_value(state_to_state)

        if self.use_mine:
            self.rnn.state_to_update.set_value(state_to_update)
            self.rnn.state_to_reset.set_value(state_to_reset)
        else:
            # Plain GatedRecurrent keeps both gate matrices fused in one
            # (dim, 2*dim) matrix: update first, then reset.
            self.rnn.state_to_gates.set_value(
                np.hstack((state_to_update, state_to_reset)))

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, mask=None):
        """
        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            sequence to feed into GRU. Axes are mb, sequence, features
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array with 1 or 0 to represent data given
            available.

        Returns
        -------
        output: :class:`theano.tensor.TensorVariable`
            sequence to feed out. Axes are batch, sequence, features
        """
        states_from_in = self.input_to_state_transform.apply(input_)
        update_from_in = self.input_to_update_transform.apply(input_)
        reset_from_in = self.input_to_reset_transform.apply(input_)

        if self.use_mine:
            output = self.rnn.apply(
                inputs=states_from_in,
                update_inputs=update_from_in,
                reset_inputs=reset_from_in,
                mask=mask)
        else:
            # Fuse the gate inputs only on the path that needs them
            # (previously computed unconditionally), update gate first to
            # match the state_to_gates layout above.
            gate_inputs = tensor.concatenate(
                [update_from_in, reset_from_in], axis=2)
            # Fix: the mask was silently dropped on this path before.
            output = self.rnn.apply(
                inputs=states_from_in, gate_inputs=gate_inputs, mask=mask)

        return output
class Parrot(Initializable, Random):
    """Attention-based recurrent generator of vocoder feature frames.

    Three stacked GatedRecurrent layers attend over encoded text labels
    (Graves-style Gaussian-mixture attention or a softmax variant) and
    emit either point predictions (MSE cost) or GMM parameters (GMM
    cost), optionally conditioned on a speaker embedding.
    """

    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of vocoder frame
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feedback to the top rnn layer
            full_feedback=False,  # Feedback to all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # How many speakers there are?
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stabilities
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at init
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            raw_output=False,
            **kwargs):
        super(Parrot, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim
        # A bidirectional encoder concatenates both directions, doubling
        # the dimensionality seen by everything downstream.
        self.encoded_input_dim = input_dim
        self.raw_output = raw_output

        if self.encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim

        if self.feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

        self.h1_to_readout = Linear(
            input_dim=rnn_h_dim,
            output_dim=readouts_dim,
            name='h1_to_readout')

        self.h2_to_readout = Linear(
            input_dim=rnn_h_dim,
            output_dim=readouts_dim,
            name='h2_to_readout')

        self.h3_to_readout = Linear(
            input_dim=rnn_h_dim,
            output_dim=readouts_dim,
            name='h3_to_readout')

        # Skip connections between the stacked GRUs; each Fork produces
        # the (inputs, gates) pair a GatedRecurrent consumes.
        self.h1_to_h2 = Fork(
            output_names=['rnn2_inputs', 'rnn2_gates'],
            input_dim=rnn_h_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='h1_to_h2')

        self.h1_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=rnn_h_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='h1_to_h3')

        self.h2_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=rnn_h_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='h2_to_h3')

        if which_cost == 'MSE':
            self.readout_to_output = Linear(
                input_dim=readouts_dim,
                output_dim=output_dim,
                name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            self.readout_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=readouts_dim,
                output_dims=[
                    output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='readout_to_output')

        self.encoder = Encoder(
            encoder_type,
            num_characters,
            input_dim,
            encoder_dim,
            name='encoder')

        self.children = [
            self.encoder,
            self.rnn1,
            self.rnn2,
            self.rnn3,
            self.h1_to_readout,
            self.h2_to_readout,
            self.h3_to_readout,
            self.h1_to_h2,
            self.h1_to_h3,
            self.h2_to_h3,
            self.readout_to_output]

        # Attention context (w) into each layer.
        self.inp_to_h1 = Fork(
            output_names=['rnn1_inputs', 'rnn1_gates'],
            input_dim=self.encoded_input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='inp_to_h1')

        self.inp_to_h2 = Fork(
            output_names=['rnn2_inputs', 'rnn2_gates'],
            input_dim=self.encoded_input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='inp_to_h2')

        self.inp_to_h3 = Fork(
            output_names=['rnn3_inputs', 'rnn3_gates'],
            input_dim=self.encoded_input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name='inp_to_h3')

        self.children += [
            self.inp_to_h1,
            self.inp_to_h2,
            self.inp_to_h3]

        # First layer drives the attention window parameters.
        self.h1_to_att = Fork(
            output_names=['alpha', 'beta', 'kappa'],
            input_dim=rnn_h_dim,
            output_dims=[attention_size] * 3,
            name='h1_to_att')

        self.att_to_readout = Linear(
            input_dim=self.encoded_input_dim,
            output_dim=readouts_dim,
            name='att_to_readout')

        self.children += [
            self.h1_to_att,
            self.att_to_readout]

        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            self.speaker_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h1')

            self.speaker_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h2')

            self.speaker_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h3')

            self.speaker_to_readout = Linear(
                input_dim=speaker_dim,
                output_dim=readouts_dim,
                name='speaker_to_readout')

            if which_cost == 'MSE':
                self.speaker_to_output = Linear(
                    input_dim=speaker_dim,
                    output_dim=output_dim,
                    name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = Fork(
                    output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                    input_dim=speaker_dim,
                    output_dims=[
                        output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                    name='speaker_to_output')

            self.children += [
                self.embed_speaker,
                self.speaker_to_h1,
                self.speaker_to_h2,
                self.speaker_to_h3,
                self.speaker_to_readout,
                self.speaker_to_output]

        if full_feedback:
            self.out_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=output_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='out_to_h2')

            self.out_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=output_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='out_to_h3')

            self.children += [
                self.out_to_h2,
                self.out_to_h3]
            # Full feedback implies weak feedback (layer 1 included).
            weak_feedback = True

        self.weak_feedback = weak_feedback

        if weak_feedback:
            self.out_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=output_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='out_to_h1')

            self.children += [
                self.out_to_h1]

        if self.raw_output:
            self.sampleRnn = SampleRnn()
            self.children += [self.sampleRnn]

    def _allocate(self):
        # Trainable initial attention context.
        self.initial_w = shared_floatx_zeros(
            (self.encoded_input_dim,), name="initial_w")

        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        """Return the symbolic inputs; speaker / raw_sequence are None
        when the corresponding feature is disabled."""
        features = tensor.tensor3('features')
        features_mask = tensor.matrix('features_mask')
        labels = tensor.imatrix('labels')
        labels_mask = tensor.matrix('labels_mask')
        start_flag = tensor.scalar('start_flag')

        if self.use_speaker:
            speaker = tensor.imatrix('speaker_index')
        else:
            speaker = None

        if self.raw_output:
            raw_sequence = tensor.itensor3('raw_audio')
        else:
            raw_sequence = None

        return features, features_mask, labels, labels_mask, \
            speaker, start_flag, raw_sequence

    def initial_states(self, batch_size):
        """Return (initial, last) pairs for h1-h3, w and k.

        The ``last_*`` shared variables carry state across consecutive
        minibatches (truncated BPTT); ``initial_*`` are the fresh values
        used when a new example starts.
        """
        initial_h1 = self.rnn1.initial_states(batch_size)
        initial_h2 = self.rnn2.initial_states(batch_size)
        initial_h3 = self.rnn3.initial_states(batch_size)

        last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim))
        last_h3 = shared_floatx_zeros((batch_size, self.rnn_h_dim))

        # Defining for all
        initial_k = tensor.zeros(
            (batch_size, self.attention_size), dtype=floatX)
        last_k = shared_floatx_zeros((batch_size, self.attention_size))

        # Trainable initial state for w. Why not for k?
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)
        last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim))

        return initial_h1, last_h1, initial_h2, last_h2, \
            initial_h3, last_h3, initial_w, last_w, initial_k, last_k

    @application
    def compute_cost(
            self, features, features_mask, labels, labels_mask,
            speaker, start_flag, batch_size, raw_audio=None):
        """Build the training cost graph.

        Returns ``(cost, updates, attention_vars, cost_raw)`` where
        ``updates`` carries both scan and cross-batch state updates and
        ``attention_vars`` is ``[next_x, k, w, coeff, phi, pi_att]``.
        """
        if speaker is None:
            assert not self.use_speaker

        # Predict frame t+1 from frames up to t.
        target_features = features[1:]
        mask = features_mask[1:]

        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            input_features = features[:-1]

            if self.feedback_noise_level:
                noise = self.theano_rng.normal(
                    size=input_features.shape,
                    avg=0., std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [
                out_cell_h1, out_gat_h1]

            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [
                out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]

            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2,
                spk_gat_h2, spk_cell_h3, spk_gat_h3]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(
            start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(
            start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(
            start_flag, initial_h3, last_h3)
        input_w = tensor.switch(
            start_flag, initial_w, last_w)
        input_k = tensor.switch(
            start_flag, initial_k, last_k)

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        # Character positions used by the attention window.
        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def step(
                inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t,
                gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1,
                context_oh):

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(
                inp_h1_t,
                gat_h1_t,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            # Monotonically advancing window position (Graves 2013).
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [
                h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]

            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                inp_h2_t + h1inp_h2,
                gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [
                h2inp_h3, h2gat_h3]

            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                inp_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1,
                input_h2,
                input_h3,
                input_k,
                input_w,
                None,
                None])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [
            h1_out, h2_out, h3_out]

        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            cost = tensor.sum((predicted - target_features) ** 2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            coeff = tensor.nnet.softmax(
                coeff.reshape(
                    (-1, self.k_gmm))).reshape(
                        coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        # "+ 0. * start_flag" keeps start_flag in the graph so it remains
        # a valid theano.function input.
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        cost_raw = None
        if self.raw_output:
            # 80 vocoder-rate steps per raw-audio step here — TODO
            # confirm against the dataset's frame/sample rates.
            raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0)
            raw_mask = raw_mask.dimshuffle(1, 0)

            predicted_transposed = predicted.dimshuffle(1, 0, 2)

            last_h0, last_big_h0 = self.sampleRnn.initial_states(
                batch_size)
            raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2)
            raw_audio_reshaped = raw_audio_reshaped.reshape(
                (raw_audio_reshaped.shape[0], -1))
            cost_raw, ip_cost, all_params, ip_params, other_params, \
                new_h0, new_big_h0 = self.sampleRnn.apply(
                    raw_audio_reshaped,
                    predicted_transposed,
                    last_h0,
                    last_big_h0,
                    start_flag,
                    raw_mask)

            if self.sampleRnn.N_RNN == 1:
                new_h0 = tensor.unbroadcast(new_h0, 1)
                new_big_h0 = tensor.unbroadcast(new_big_h0, 1)

            updates.append((last_h0, new_h0))
            updates.append((last_big_h0, new_big_h0))

            # NOTE(review): alpha_=0 discards the frame-level cost and
            # trains on the raw cost only — confirm this is intended.
            alpha_ = numpy.float32(0.)
            beta_ = numpy.float32(1.)
            cost = alpha_ * cost + beta_ * cost_raw

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars, cost_raw

    @application
    def sample_model_fun(
            self, labels, labels_mask, speaker, num_samples, seq_size):
        """Build the free-running sampling graph (output fed back as
        input); returns (sample_x, k, w, pi, phi, pi_att, updates)."""
        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(num_samples)

        initial_x = numpy.zeros(
            (num_samples, self.output_dim), dtype=floatX)

        cell_shape = (seq_size, num_samples, self.rnn_h_dim)
        gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)

            # Applied before the broadcast.
            spk_readout = self.speaker_to_readout.apply(emb_speaker)
            spk_output = self.speaker_to_output.apply(emb_speaker)

            # Add dimension to repeat with time.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2,
                spk_gat_h2, spk_cell_h3, spk_gat_h3]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += spk_cell_h1
            cell_h2 += spk_cell_h2
            cell_h3 += spk_cell_h3
            gat_h1 += spk_gat_h1
            gat_h2 += spk_gat_h2
            gat_h3 += spk_gat_h3

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def sample_step(
                inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t,
                inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1,
                h3_tm1, k_tm1, w_tm1):

            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [
                    out_cell_h1_t, out_gat_h1_t]

                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm)
                     for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t,
                    out_cell_h3_t, out_gat_h3_t]

                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm)
                     for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(
                cell_h1_t,
                gat_h1_t,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            # Sampling-time window controls (sharpening / timing) that
            # compute_cost does not use.
            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [
                h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]

            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                cell_h2_t + h1inp_h2,
                gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [
                h2inp_h3, h2gat_h3]

            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                cell_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [
                h1_out_t, h2_out_t, h3_out_t]

            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                # sampling_bias narrows the distribution (Graves 2013).
                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)
                    ).reshape(coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(
                    mu_t, sigma_t, coeff_t, self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = \
            theano.scan(
                fn=sample_step,
                sequences=[
                    cell_h1,
                    gat_h1,
                    cell_h2,
                    gat_h2,
                    cell_h3,
                    gat_h3],
                non_sequences=[],
                outputs_info=[
                    initial_x,
                    initial_h1,
                    initial_h2,
                    initial_h3,
                    initial_k,
                    initial_w,
                    None,
                    None,
                    None])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(
            self, labels_tr, labels_mask_tr, features_mask_tr,
            speaker_tr, num_samples, num_steps):
        """Compile the sampling graph and run it on numpy inputs."""
        features, features_mask, labels, labels_mask, speaker, \
            start_flag, raw_sequence = self.symbolic_input_variables()

        sample_x, k, w, pi, phi, pi_att, updates = \
            self.sample_model_fun(
                labels, labels_mask, speaker,
                num_samples, num_steps)

        theano_inputs = [labels, labels_mask]
        numpy_inputs = (labels_tr, labels_mask_tr)

        if self.use_speaker:
            theano_inputs += [speaker]
            numpy_inputs += (speaker_tr,)

        return function(
            theano_inputs,
            [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)

    def sample_using_input(self, data_tr, num_samples):
        """Teacher-forced prediction: run the cost graph on a dataset
        batch and return its attention/sample outputs."""
        features, features_mask, labels, labels_mask, speaker, \
            start_flag, raw_sequence = self.symbolic_input_variables()

        # compute_cost returns four values; the trailing raw-output cost
        # is unused here. (Fix: the previous three-name unpack raised
        # ValueError at runtime.)
        cost, updates, attention_vars, _ = self.compute_cost(
            features, features_mask, labels, labels_mask, speaker,
            start_flag, num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        theano_vars = [
            features, features_mask, labels, labels_mask, speaker,
            start_flag]
        theano_vars = [x for x in theano_vars if x is not None]
        theano_vars = list(set(theano_vars))
        # Match symbolic inputs to dataset sources by variable name.
        theano_vars = {x.name: x for x in theano_vars}

        theano_inputs = []
        numpy_inputs = []

        for key in data_tr.keys():
            theano_inputs.append(theano_vars[key])
            numpy_inputs.append(data_tr[key])

        return function(
            theano_inputs,
            [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)
class TestGatedRecurrent(unittest.TestCase):
    """Tests for GatedRecurrent (fused gate-inputs API) against a NumPy
    reference implementation."""

    def setUp(self):
        # Fully gated network with constant weights (deterministic check).
        self.gated = GatedRecurrent(
            dim=3, activation=Tanh(),
            gate_activation=Tanh(), weights_init=Constant(2))
        self.gated.initialize()
        # Seeded Gaussian-initialized network for the multi-step test.
        # NOTE(review): despite the name, this brick still has both
        # gates — see the Wz/Wr split of state_to_gates below.
        self.reset_only = GatedRecurrent(
            dim=3, activation=Tanh(),
            gate_activation=Tanh(),
            weights_init=IsotropicGaussian(), seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        """One transition matches NumPy; gate inputs are fed as one
        matrix with update and reset halves stacked on the last axis."""
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        gi = tensor.matrix('gi')  # fused (update, reset) gate inputs
        h1 = self.gated.apply(x, gi, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, gi], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=theano.config.floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        # All weights were initialized to Constant(2).
        W_val = 2 * numpy.ones((3, 3), dtype=theano.config.floatX)

        # NumPy reference: tanh gates, then the GRU convex combination.
        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val) +
                  (1 - z_val) * h0_val)
        assert_allclose(
            h1_val,
            next_h(h0_val, x_val, numpy.hstack([zi_val, ri_val]))[0],
            rtol=1e-6)

    def test_many_steps(self):
        """Masked 24-step run matches a NumPy loop; also checks that the
        initial state is exposed as a parameter."""
        x = tensor.tensor3('x')
        gi = tensor.tensor3('gi')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, gi, mask=mask)
        calc_h = theano.function(inputs=[x, gi, mask], outputs=[h])

        # 24 time steps x 4 sequences x 3 features, derived from the
        # 4! = 24 permutations so every sequence differs.
        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones((24, 4, 3),
                           dtype=theano.config.floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        zi_val = 2 * ri_val
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        # Mask out the second half of the last sequence.
        mask_val[12:24, 3] = 0
        # Row 0 is the zero initial state; rows 1..24 are the outputs.
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        W = self.reset_only.state_to_state.get_value()
        # state_to_gates is (dim, 2*dim): update half first, then reset.
        Wz = self.reset_only.state_to_gates.get_value()[:, :3]
        Wr = self.reset_only.state_to_gates.get_value()[:, 3:]

        for i in range(1, 25):
            z_val = numpy.tanh(h_val[i - 1].dot(Wz) + zi_val[i - 1])
            r_val = numpy.tanh(h_val[i - 1].dot(Wr) + ri_val[i - 1])
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W) +
                                  x_val[i - 1])
            h_val[i] = z_val * h_val[i] + (1 - z_val) * h_val[i - 1]
            # Where masked out, keep the previous state unchanged.
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(
            h_val,
            calc_h(x_val,
                   numpy.concatenate([zi_val, ri_val], axis=2),
                   mask_val)[0],
            1e-04)

        # Also test that initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
# NOTE(review): this class has the same name as the earlier
# TestGatedRecurrent; if both live in one module the second definition
# shadows the first at import time — confirm they belong to separate files.
class TestGatedRecurrent(unittest.TestCase):
    """GatedRecurrent tests for the split-gate API.

    This variant passes update (z) and reset (r) inputs as separate
    tensors and exercises ``use_update_gate=False`` for the multi-step
    reset-only recurrence.
    """

    def setUp(self):
        # Constant weights so the single-step expectation is closed form.
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2), activation=Tanh(),
            gate_activation=Tanh())
        self.gated.initialize()
        # Reset-gate-only GRU with seeded Gaussian weights, read back via
        # get_value() to drive the NumPy reference loop below.
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(), activation=Tanh(),
            gate_activation=Tanh(), use_update_gate=False,
            rng=numpy.random.RandomState(1))
        self.reset_only.initialize()

    def test_one_step(self):
        """One step (iterate=False) with separate z/r inputs matches NumPy."""
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        z = tensor.matrix('z')
        r = tensor.matrix('r')
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype=floatX)
        zi_val = (h0_val + x_val) / 2   # update-gate inputs
        ri_val = -x_val                 # reset-gate inputs
        # All recurrent weights were initialised to the constant 2.
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)

        # gate_activation is Tanh here (not the usual sigmoid), matching
        # the brick configured in setUp.
        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
                  + (1 - z_val) * h0_val)
        assert_allclose(h1_val,
                        next_h(h0_val, x_val, zi_val, ri_val)[0],
                        rtol=1e-6)

    def test_reset_only_many_steps(self):
        """24 masked reset-only steps match a NumPy recurrence."""
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        # 24 permutations of range(4) -> sequence 24, batch 4, broadcast
        # to feature dim 3.
        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=floatX)
        x_val = numpy.ones((24, 4, 3), dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        mask_val = numpy.ones((24, 4), dtype=floatX)
        # Batch element 3 is masked out for the second half of the sequence.
        mask_val[12:24, 3] = 0
        # Row 0 is the (zero) initial state; rows 1..24 are filled below.
        h_val = numpy.zeros((25, 4, 3), dtype=floatX)
        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        for i in range(1, 25):
            r_val = numpy.tanh(h_val[i - 1].dot(U) + ri_val[i - 1])
            # No update gate: new state overwrites the old one directly.
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W)
                                  + x_val[i - 1])
            # Masked positions keep the previous state unchanged.
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val, mask_val)[0], 1e-03)
class Scribe(Initializable):
    """Recurrent attention model over a character transcript.

    One GatedRecurrent cell is driven jointly by the input features and
    by an attention read (``w``) over a one-hot encoded transcript; the
    cell state feeds a readout consumed by a bivariate-GMM emitter.
    NOTE(review): this looks like Graves-style handwriting synthesis
    (input_dim=3 for pen offsets + pen-up) — confirm against the caller.

    Parameters
    ----------
    k : int
        Number of mixture components in the emitter.
    rec_h_dim : int
        Hidden dimension of the recurrent cell.
    att_size : int
        Number of attention components (alpha/beta/kappa per component).
    num_letters : int
        Size of the transcript alphabet (one-hot width).
    sampling_bias : float
        Bias passed to the emitter at sampling time.
    attention_type : str
        Either "graves" (exponential kernel) or "softmax" (normalised
        Gaussian-like kernel); asserted below.
    epsilon : float
        Floor added to the attention beta for numerical stability.
    attention_alignment : float
        Scale on the monotonic advance of the attention position kappa.
    """

    def __init__(self, k=20, rec_h_dim=400, att_size=10, num_letters=68,
                 sampling_bias=0., attention_type="graves", epsilon=1e-6,
                 attention_alignment=1., **kwargs):
        super(Scribe, self).__init__(**kwargs)

        # For now only softmax and graves are supported.
        assert attention_type in ["graves", "softmax"]

        # Emitter readout: 6 parameters per mixture component plus one
        # extra scalar (presumably the pen-up logit — confirm in emitter).
        readouts_dim = 1 + 6 * k

        self.k = k
        self.rec_h_dim = rec_h_dim
        self.att_size = att_size
        self.num_letters = num_letters
        self.sampling_bias = sampling_bias
        self.attention_type = attention_type
        self.epsilon = epsilon
        self.attention_alignment = attention_alignment

        self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')

        # Input features -> cell inputs and gate inputs (dim and 2*dim).
        self.inp_to_h1 = Fork(
            output_names=['cell1_inputs', 'cell1_gates'],
            input_dim=3,
            output_dims=[rec_h_dim, 2 * rec_h_dim],
            name='inp_to_h1')

        self.h1_to_readout = Linear(
            input_dim=rec_h_dim,
            output_dim=readouts_dim,
            name='h1_to_readout')

        # Cell state -> attention parameters (mixture weight, width,
        # position increment), one vector of size att_size each.
        self.h1_to_att = Fork(
            output_names=['alpha', 'beta', 'kappa'],
            input_dim=rec_h_dim,
            output_dims=[att_size] * 3,
            name='h1_to_att')

        # Previous attention read w -> extra cell and gate inputs.
        self.att_to_h1 = Fork(
            output_names=['cell1_inputs', 'cell1_gates'],
            input_dim=num_letters,
            output_dims=[rec_h_dim, 2 * rec_h_dim],
            name='att_to_h1')

        self.att_to_readout = Linear(
            input_dim=num_letters,
            output_dim=readouts_dim,
            name='att_to_readout')

        self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

        self.children = [
            self.cell1, self.inp_to_h1, self.h1_to_readout, self.h1_to_att,
            self.att_to_h1, self.att_to_readout, self.emitter]

    def _allocate(self):
        # Learned initial attention read; tagged as an initial state so
        # VariableFilter(roles=[INITIAL_STATE]) can find it.
        self.initial_w = shared_floatx_zeros((self.num_letters,),
                                             name="initial_w")
        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        """Create the symbolic inputs this model's graphs are built on."""
        data = tensor.tensor3('features')
        data_mask = tensor.matrix('features_mask')
        context = tensor.imatrix('transcripts')
        context_mask = tensor.matrix('transcripts_mask')
        start_flag = tensor.scalar('start_flag')
        return data, data_mask, context, context_mask, start_flag

    def initial_states(self, batch_size):
        """Fresh initial states plus shared carry-over state buffers.

        The ``last_*`` shareds and ``use_last_states`` flag let training
        continue from the previous batch's final states (see the updates
        appended in :meth:`compute_cost`).
        """
        initial_h1 = self.cell1.initial_states(batch_size)
        initial_kappa = shared_floatx_zeros((batch_size, self.att_size))
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)
        last_h1 = shared_floatx_zeros((batch_size, self.rec_h_dim))
        last_w = shared_floatx_zeros((batch_size, self.num_letters))
        use_last_states = shared(numpy.asarray(0., dtype=floatX))
        return initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states

    @application
    def compute_cost(self, data, data_mask, context, context_mask,
                     start_flag, batch_size):
        """Build the training cost graph (teacher-forced next-step NLL)."""
        # Predict data[t+1] from data[t].
        x = data[:-1]
        target = data[1:]
        mask = data_mask[1:]

        xinp_h1, xgat_h1 = self.inp_to_h1.apply(x)
        # One-hot transcript, zeroed where the transcript mask is 0.
        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        # Carry state over from the previous batch unless start_flag reset it.
        input_h1 = tensor.switch(use_last_states, last_h1, initial_h1)
        input_w = tensor.switch(use_last_states, last_w, initial_w)

        # Character positions 0..len(context)-1, padded for broadcasting
        # against (batch, att_size, len context).
        u = tensor.shape_padleft(
            tensor.arange(context.shape[1], dtype=floatX), 2)

        def step(xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1, ctx):
            # Previous attention read contributes to both cell inputs.
            attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

            h1_t = self.cell1.apply(
                xinp_h1_t + attinp_h1,
                xgat_h1_t + attgat_h1,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t)
            else:
                a_t = tensor.exp(a_t)

            b_t = tensor.exp(b_t) + self.epsilon
            # Monotonic attention: kappa only ever moves forward.
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u) ** 2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u) ** 2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

            return h1_t, k_t, w_t

        (h1, kappa, w), scan_updates = theano.scan(
            fn=step,
            sequences=[xinp_h1, xgat_h1],
            non_sequences=[context_oh],
            outputs_info=[input_h1, initial_kappa, input_w])

        readouts = self.h1_to_readout.apply(h1) + \
            self.att_to_readout.apply(w)

        cost = self.emitter.cost(readouts, target)
        # Mask-weighted mean; "+ 0. * start_flag" keeps start_flag in the
        # graph so it remains a valid input of the compiled function.
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        # Persist final states for the next batch; start_flag clears kappa.
        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((initial_kappa,
                        tensor.switch(start_flag, 0. * initial_kappa,
                                      kappa[-1])))
        updates.append((last_w, w[-1]))
        updates.append((use_last_states, 1. - start_flag))

        return cost, scan_updates + updates

    @application
    def sample_model(self, context, context_mask, n_steps, batch_size):
        """Build the free-running sampling graph (feeds outputs back in)."""
        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        initial_x = self.emitter.initial_outputs(batch_size)

        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        u = tensor.shape_padleft(
            tensor.arange(context.shape[1], dtype=floatX), 2)

        def sample_step(x_tm1, h1_tm1, k_tm1, w_tm1, ctx):
            # Same recurrence as compute_cost's step, except the input is
            # the previously emitted sample rather than teacher-forced data.
            xinp_h1_t, xgat_h1_t = self.inp_to_h1.apply(x_tm1)
            attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

            h1_t = self.cell1.apply(
                xinp_h1_t + attinp_h1,
                xgat_h1_t + attgat_h1,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t)
            else:
                a_t = tensor.exp(a_t)

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u) ** 2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u) ** 2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

            readout_t = self.h1_to_readout.apply(h1_t) + \
                self.att_to_readout.apply(w_t)

            x_t = self.emitter.emit(readout_t)

            # Mixture components returned for monitoring; only pi_t and
            # the attention terms are propagated out of the scan.
            mu_t, sigma_t, corr_t, pi_t, penup_t = \
                self.emitter.components(readout_t)

            return x_t, h1_t, k_t, w_t, pi_t, phi_t, a_t

        # NOTE(review): initial_x.eval() forces a host-side evaluation at
        # graph-construction time — presumably intentional, confirm.
        (sample_x, h1, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            n_steps=n_steps,
            sequences=[],
            non_sequences=[context_oh],
            outputs_info=[
                initial_x.eval(), initial_h1, initial_kappa,
                initial_w, None, None, None])

        return sample_x, pi, phi, pi_att, updates
def __init__(self, config):
    """Build the training graph of a hierarchical GRU byte predictor.

    Each level of the hierarchy predicts (a gradient-disconnected copy
    of) its input sequence; the residual prediction error, ternarised to
    {-1, 0, +1}, becomes the next level's target. All hidden states are
    then combined through per-level linear maps (and an optional MLP)
    into a softmax over the io_dim output symbols.

    NOTE(review): the source indentation was lost; the MLP construction
    has been placed inside the ``len(out_dims) > 1`` branch because
    ``config.out_hidden_act[0]`` and a one-layer MLP are only meaningful
    there — confirm against the original file.
    """
    inp = tensor.imatrix('bytes')

    # Fixed (shared, not brick-managed) embedding lookup table.
    embed = theano.shared(
        config.embedding_matrix.astype(theano.config.floatX),
        name='embedding_matrix')
    in_repr = embed[inp.flatten(), :].reshape(
        (inp.shape[0], inp.shape[1], config.repr_dim))
    in_repr.name = 'in_repr'

    bricks = []
    # (init_state shared, final-state expression) pairs, used elsewhere
    # to carry recurrent state across batches.
    states = []

    # Construct predictive GRU hierarchy
    hidden = []
    costs = []
    # Recurrent bricks expect time-major input: (seq, batch, features).
    next_target = in_repr.dimshuffle(1, 0, 2)
    for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                          config.cost_factors,
                                          config.hidden_q)):
        init_state = theano.shared(
            numpy.zeros((config.num_seqs, hdim))
                 .astype(theano.config.floatX),
            name='st0_%d' % i)

        # Naming says "lstm" but these are GatedRecurrent (GRU) bricks.
        linear = Linear(input_dim=config.repr_dim, output_dim=3 * hdim,
                        name="lstm_in_%d" % i)
        lstm = GatedRecurrent(dim=hdim,
                              activation=config.activation_function,
                              name="lstm_rec_%d" % i)
        linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim,
                         name='lstm_out_%d' % i)
        tanh = Tanh('lstm_out_tanh_%d' % i)
        bricks += [linear, lstm, linear2, tanh]
        if i > 0:
            # Levels above the first also receive the previous level's
            # hidden states as input.
            linear1 = Linear(input_dim=config.hidden_dims[i - 1],
                             output_dim=3 * hdim, name='lstm_in2_%d' % i)
            bricks += [linear1]

        # Targets are treated as constants: no gradient flows from a
        # level's cost into the level below it.
        next_target = tensor.cast(next_target, dtype=theano.config.floatX)
        inter = linear.apply(theano.gradient.disconnected_grad(next_target))
        if i > 0:
            inter += linear1.apply(
                theano.gradient.disconnected_grad(hidden[-1][:-1, :, :]))
        # First hdim features drive the state, remaining 2*hdim the gates.
        new_hidden = lstm.apply(inputs=inter[:, :, :hdim],
                                gate_inputs=inter[:, :, hdim:],
                                states=init_state)
        states.append((init_state, new_hidden[-1, :, :]))

        # Prepend the initial state so hidden[-1][:-1] aligns each state
        # with the *next* target step.
        hidden += [tensor.concatenate([init_state[None, :, :], new_hidden],
                                      axis=0)]
        pred = tanh.apply(linear2.apply(hidden[-1][:-1, :, :]))
        # Per-level cost: negative inner product with the target, plus an
        # L1-style sparsity term weighted by q.
        costs += [numpy.float32(cf) *
                  (-next_target * pred).sum(axis=2).mean()]
        costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]

        # Ternarised residual {-1, 0, +1} is the next level's target.
        diff = next_target - pred
        next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)

    # Construct output from hidden states
    hidden = [s.dimshuffle(1, 0, 2) for s in hidden]   # back to batch-major
    out_parts = []
    out_dims = config.out_hidden + [config.io_dim]
    for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
        pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                             name='pred_linear_%d' % i)
        bricks.append(pred_linear)
        # Output head does not backprop into the recurrent hierarchy.
        lin = theano.gradient.disconnected_grad(state)
        out_parts.append(pred_linear.apply(lin))

    # Do prediction and calculate cost
    out = sum(out_parts)

    if len(out_dims) > 1:
        out = config.out_hidden_act[0](name='out_act0').apply(out)
        mlp = MLP(dims=out_dims,
                  activations=[x(name='out_act%d' % i)
                               for i, x in
                               enumerate(config.out_hidden_act[1:])]
                              + [Identity()],
                  name='out_mlp')
        bricks.append(mlp)
        # Flatten (batch, seq+1) for the MLP, then restore the shape.
        out = mlp.apply(
            out.reshape((inp.shape[0] * (inp.shape[1] + 1), -1))
        ).reshape((inp.shape[0], inp.shape[1] + 1, -1))

    pred = out.argmax(axis=2)

    # out[:, :-1, :] predicts inp (one-step-ahead alignment).
    cost = Softmax().categorical_cross_entropy(
        inp.flatten(),
        out[:, :-1, :].reshape((inp.shape[0] * inp.shape[1],
                                config.io_dim))).mean()
    error_rate = tensor.neq(inp.flatten(), pred[:, :-1].flatten()).mean()

    sgd_cost = cost + sum(costs)

    # Initialize all bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()

    # Apply weight noise (rebuilds the graph, so re-read the outputs).
    cg = ComputationGraph([sgd_cost, cost, error_rate] + costs)
    if config.weight_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.weight_noise)
    sgd_cost = cg.outputs[0]
    cost = cg.outputs[1]
    error_rate = cg.outputs[2]
    costs = cg.outputs[3:]

    # Put everything useful for training or extensions on self.
    self.sgd_cost = sgd_cost
    sgd_cost.name = 'sgd_cost'
    for i in range(len(costs)):
        costs[i].name = 'pred_cost_%d' % i
    cost.name = 'cost'
    error_rate.name = 'error_rate'
    self.monitor_vars = [costs, [cost], [error_rate]]

    self.out = out[:, 1:, :]
    self.pred = pred[:, 1:]

    self.states = states
class GatedRecurrentFull(Initializable):
    """A wrapper around the GatedRecurrent brick that improves usability.

    It contains:
        * A fork to map to initialize the reset and the update units.
        * Better initialization to initialize the different pieces

    While this works, there is probably a better more elegant way to do
    this.

    Parameters
    ----------
    hidden_dim : int
        dimension of the hidden state
    activation : :class:`.Brick`
    gate_activation : :class:`.Brick`
    state_to_state_init : object
        Weight Initialization
    state_to_reset_init : object
        Weight Initialization
    state_to_update_init : object
        Weight Initialization
    input_to_state_transform : :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_reset_transform : :class:`.Brick`
        [CvMG14] uses Linear transform
    input_to_update_transform : :class:`.Brick`
        [CvMG14] uses Linear transform

    References
    ----------
    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN
        Encoder-Decoder for Statistical Machine Translation*,
        EMNLP (2014), pp. 1724-1734.
    """
    @lazy(allocation=['hidden_dim', 'state_to_state_init',
                      'state_to_update_init', 'state_to_reset_init'],
          initialization=['input_to_state_transform',
                          'input_to_update_transform',
                          'input_to_reset_transform'])
    def __init__(self, hidden_dim, activation=None, gate_activation=None,
                 state_to_state_init=None, state_to_update_init=None,
                 state_to_reset_init=None,
                 input_to_state_transform=None,
                 input_to_update_transform=None,
                 input_to_reset_transform=None,
                 **kwargs):
        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform
        # Suffix the brick names so the three transforms are
        # distinguishable in the parameter hierarchy.
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        # Toggle between the custom GatedRecurrentFast brick and the
        # stock GatedRecurrent brick. Weights are deliberately NaN here;
        # initialize() overwrites them, so uninitialised use fails loudly.
        self.use_mine = True
        if self.use_mine:
            self.rnn = GatedRecurrentFast(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=activation,
                gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=activation,
                gate_activation=gate_activation)

        self.children = [self.rnn,
                         self.input_to_state_transform,
                         self.input_to_update_transform,
                         self.input_to_reset_transform]
        # NOTE(review): grafting the rnn's children onto this brick as
        # well gives them two parents — confirm this is intended.
        self.children.extend(self.rnn.children)

    def initialize(self):
        """Initialize the transforms, then overwrite the recurrent
        weights with the dedicated state_to_* initialization schemes."""
        super(GatedRecurrentFull, self).initialize()

        self.input_to_state_transform.initialize()
        self.input_to_update_transform.initialize()
        self.input_to_reset_transform.initialize()

        self.rnn.initialize()

        weight_shape = (self.hidden_dim, self.hidden_dim)
        state_to_state = self.state_to_state_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_update = self.state_to_update_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_reset = self.state_to_reset_init.generate(
            rng=self.rng, shape=weight_shape)

        self.rnn.state_to_state.set_value(state_to_state)
        if self.use_mine:
            self.rnn.state_to_update.set_value(state_to_update)
            self.rnn.state_to_reset.set_value(state_to_reset)
        else:
            # Stock GatedRecurrent stores both gate matrices side by side.
            self.rnn.state_to_gates.set_value(
                np.hstack((state_to_update, state_to_reset)))

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, mask=None):
        """Apply the GRU to a sequence.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            sequence to feed into GRU. Axes are mb, sequence, features
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array with 1 or 0 to represent data given
            available.

        Returns
        -------
        output : :class:`theano.tensor.TensorVariable`
            sequence to feed out. Axes are batch, sequence, features
        """
        states_from_in = self.input_to_state_transform.apply(input_)
        update_from_in = self.input_to_update_transform.apply(input_)
        reset_from_in = self.input_to_reset_transform.apply(input_)

        gate_inputs = tensor.concatenate([update_from_in, reset_from_in],
                                         axis=2)

        if self.use_mine:
            output = self.rnn.apply(inputs=states_from_in,
                                    update_inputs=update_from_in,
                                    reset_inputs=reset_from_in,
                                    mask=mask)
        else:
            # Fix: the mask was previously dropped on this branch, so
            # padded timesteps leaked into the recurrence.
            output = self.rnn.apply(inputs=states_from_in,
                                    gate_inputs=gate_inputs,
                                    mask=mask)

        return output