# Shared imports assumed by the snippets below (Blocks + Theano). A few
# snippets additionally rely on project-specific names that are not
# shown here (e.g. BidirectionalWMT15, BidirectionalFromDict, floatX).
import numpy
import numpy as np
import theano
from theano import tensor
from toolz import merge

from blocks import initialization
from blocks.bricks import Initializable, Linear, Tanh
from blocks.bricks.base import application, lazy
from blocks.bricks.lookup import LookupTable
from blocks.bricks.parallel import Fork
from blocks.bricks.recurrent import BaseRecurrent, GatedRecurrent, recurrent
from blocks.initialization import Constant
from blocks.utils import dict_union


def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2],
                name='fork', output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(),
                         biases_init=Constant(0))
    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=dim, output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='embeddings') self.bidir = NewBidirectional( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='fwd_fork') self.back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='back_fork') self.children = [ self.lookup, self.bidir, self.fwd_fork, self.back_fork ] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension. source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) representation = self.bidir.apply( # Conversion to embedding representation here. merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask})) self.representation = representation return representation
class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) # self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim # self.lookup = LookupTable(name='embeddings') self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='fwd_fork') self.back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='back_fork') self.children = [self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): # self.lookup.length = self.vocab_size # self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] @application(inputs=['image_embedding'], outputs=['representation']) def apply(self, image_embedding): # Time as first dimension image_embedding_mask = tensor.ones(image_embedding.shape[:2]) # print image_embedding.type # embeddings = self.lookup.apply(source_sentence) representation = self.bidir.apply( merge(self.fwd_fork.apply(image_embedding, as_dict=True), {'mask': image_embedding_mask}), merge(self.back_fork.apply(image_embedding, as_dict=True), {'mask': image_embedding_mask})) return representation
class BidirectionalEncoder(Initializable): """ Bidirectional GRU encoder. """ def __init__(self, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) # Dimension of the word embeddings taken as input self.embedding_dim = embedding_dim # Hidden state dimension self.state_dim = state_dim # The bidir GRU self.bidir = BidirectionalFromDict( GatedRecurrent(activation=Tanh(), dim=state_dim)) # Forks to administer the inputs of GRU gates self.fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='fwd_fork') self.back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='back_fork') self.children = [self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] @application(inputs=['source_sentence_tbf', 'source_sentence_mask_tb'], outputs=['representation']) def apply(self, source_sentence_tbf, source_sentence_mask_tb=None): representation_tbf = self.bidir.apply( merge(self.fwd_fork.apply(source_sentence_tbf, as_dict=True), {'mask': source_sentence_mask_tb}), merge(self.back_fork.apply(source_sentence_tbf, as_dict=True), {'mask': source_sentence_mask_tb})) return representation_tbf
class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='embeddings') self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='back_fork') self.children = [self.lookup, self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}) ) return representation
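# --- Usage sketch (an assumption, not from the original source) ---
# Wiring the encoder above into a Theano function. BidirectionalWMT15 is
# assumed to come from the surrounding project; blocks' own Bidirectional
# exposes the same interface. All dimensions below are made up.
def _encoder_usage_sketch():
    encoder = BidirectionalEncoder(vocab_size=100, embedding_dim=8,
                                   state_dim=16)
    encoder.weights_init = initialization.IsotropicGaussian(0.01)
    encoder.biases_init = initialization.Constant(0)
    encoder.initialize()

    source = tensor.lmatrix('source')           # (batch, time) word ids
    source_mask = tensor.matrix('source_mask')  # (batch, time) 0/1 mask
    representation = encoder.apply(source, source_mask)
    # representation: (time, batch, 2 * state_dim)
    return theano.function([source, source_mask], representation)
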
def gru_layer(dim, h, n):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)

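# --- Usage sketch (an assumption, not from the original source) ---
# gru_layer relies on an `initialize` helper that is not shown; the
# hypothetical stand-in below just sets the inits and allocates each
# brick before two layers are stacked.
def initialize(bricks):
    for brick in bricks:
        brick.weights_init = initialization.IsotropicGaussian(0.1)
        brick.biases_init = initialization.Constant(0)
        brick.initialize()


def _gru_layer_usage_sketch():
    x = tensor.tensor3('x')  # (time, batch, dim)
    h = gru_layer(dim=3, h=x, n=0)
    h = gru_layer(dim=3, h=h, n=1)
    f = theano.function([x], h)
    # One hidden state per time step: shape (5, 2, 3).
    return f(np.ones((5, 2, 3), dtype=theano.config.floatX)).shape
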
class RecurrentWithFork(Initializable):
    # Obtained from Dima's code. @rizar
    # https://github.com/rizar/attention-lvcsr/blob/master/lvsr/bricks/__init__.py

    @lazy(allocation=['input_dim'])
    def __init__(self, recurrent, input_dim, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.recurrent = recurrent
        self.input_dim = input_dim
        self.fork = Fork(
            [name for name in self.recurrent.sequences if name != 'mask'],
            prototype=Linear())
        self.children = [recurrent.brick, self.fork]

    def _push_allocation_config(self):
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [
            self.recurrent.brick.get_dim(name)
            for name in self.fork.output_names]

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        return self.recurrent(
            mask=mask,
            **dict_union(self.fork.apply(input_, as_dict=True), kwargs))

    @apply.property('outputs')
    def apply_outputs(self):
        return self.recurrent.states

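# --- Usage sketch (an assumption, not from the original source) ---
# Note that `recurrent` in this variant is an *application method*, not
# a brick: the class reads self.recurrent.sequences and recurrent.brick.
def _recurrent_with_fork_usage_sketch():
    rwf = RecurrentWithFork(
        GatedRecurrent(dim=8, activation=Tanh()).apply,
        input_dim=4,
        weights_init=initialization.IsotropicGaussian(0.1),
        biases_init=initialization.Constant(0))
    rwf.initialize()
    x = tensor.tensor3('x')  # (time, batch, input_dim)
    return rwf.apply(x)      # (time, batch, 8)
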
class RecurrentWithFork(Initializable):

    @lazy(allocation=['input_dim'])
    def __init__(self, transition, input_dim, hidden_dim,
                 rec_weights_init, ff_weights_init, biases_init, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.rec_weights_init = rec_weights_init
        self.ff_weights_init = ff_weights_init
        self.biases_init = biases_init
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.transition = transition
        self.transition.dim = self.hidden_dim
        self.transition.weights_init = self.rec_weights_init
        self.transition.biases_init = self.biases_init

        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'],
            prototype=Linear())
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [
            self.transition.apply.brick.get_dim(name)
            for name in self.fork.output_names]
        self.fork.weights_init = self.ff_weights_init
        self.fork.biases_init = self.biases_init

        self.children = [transition, self.fork]

    # def _push_allocation_config(self):
    #     # super(RecurrentWithFork, self)._push_allocation_config()
    #     self.transition.dim = self.hidden_dim
    #     self.fork.input_dim = self.input_dim
    #     self.fork.output_dims = [self.transition.apply.brick.get_dim(name)
    #                              for name in self.fork.output_names]

    # def _push_initialization_config(self):
    #     # super(RecurrentWithFork, self)._push_initialization_config()
    #     self.fork.weights_init = self.ff_weights_init
    #     self.fork.biases_init = self.biases_init
    #     self.transition.weights_init = self.rec_weights_init
    #     self.transition.biases_init = self.biases_init

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        states = self.transition.apply(
            mask=mask,
            **dict_union(self.fork.apply(input_, as_dict=True), kwargs))
        # Blocks returns a list [states, cells] for an LSTM but just the
        # states (no list) for a GRU or vanilla RNN. We only want the
        # LSTM's states; cells should not be visible from outside.
        return states[0] if isinstance(states, list) else states

    @apply.property('outputs')
    def apply_outputs(self):
        return self.transition.apply.states

class RecurrentWithFork(Initializable):

    @lazy(allocation=['input_dim'])
    def __init__(self, proto, input_dim, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.recurrent = proto
        self.input_dim = input_dim
        self.fork = Fork(
            [name for name in self.recurrent.apply.sequences
             if name != 'mask'],
            prototype=Linear())
        self.children = [self.recurrent, self.fork]

    def _push_allocation_config(self):
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [
            self.recurrent.get_dim(name)
            for name in self.fork.output_names]

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        return self.recurrent.apply(
            mask=mask,
            **dict_union(self.fork.apply(input_, as_dict=True), kwargs))

    @apply.property('outputs')
    def apply_outputs(self):
        # Here self.recurrent is the brick itself, so the state names
        # live on its application method.
        return self.recurrent.apply.states

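# --- Usage sketch (an assumption, not from the original source) ---
# Unlike the variant above, this one takes the recurrent brick itself
# (`proto`) rather than its application method, but is driven the same way.
def _recurrent_with_fork_proto_usage_sketch():
    rwf = RecurrentWithFork(
        GatedRecurrent(dim=8, activation=Tanh()),
        input_dim=4,
        weights_init=initialization.IsotropicGaussian(0.1),
        biases_init=initialization.Constant(0))
    rwf.initialize()
    x = tensor.tensor3('x')  # (time, batch, input_dim)
    return rwf.apply(x)
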
class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='words_embeddings') self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='words_fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='words_back_fork') self.children = [self.lookup, self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['words', 'words_mask'], outputs=['representation']) def apply(self, words, words_mask): # Time as first dimension words = words.T words_mask = words_mask.T embeddings = self.lookup.apply(words) representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': words_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': words_mask}) ) return representation
class BidirectionalEncoder(Initializable): """ Bidirectional GRU encoder. """ def __init__(self, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) # Dimension of the word embeddings taken as input self.embedding_dim = embedding_dim # Hidden state dimension self.state_dim = state_dim # The bidir GRU self.bidir = BidirectionalFromDict( GatedRecurrent(activation=Tanh(), dim=state_dim)) # Forks to administer the inputs of GRU gates self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='back_fork') self.children = [self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['source_sentence_tbf', 'source_sentence_mask_tb'], outputs=['representation']) def apply(self, source_sentence_tbf, source_sentence_mask_tb=None): representation_tbf = self.bidir.apply( merge(self.fwd_fork.apply(source_sentence_tbf, as_dict=True), {'mask': source_sentence_mask_tb}), merge(self.back_fork.apply(source_sentence_tbf, as_dict=True), {'mask': source_sentence_mask_tb}) ) return representation_tbf
class InnerRecurrent(BaseRecurrent, Initializable):
    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')
        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')

        super(InnerRecurrent, self).__init__(**kwargs)

        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]

    def _push_allocation_config(self):
        self.inner_input_fork.output_dims = self.inner_gru.get_dims(
            self.inner_input_fork.output_names)
        self.outer_input_fork.output_dims = self.inner_gru.get_dims(
            self.outer_input_fork.output_names)

    @recurrent(sequences=['inner_inputs'], states=['states'],
               contexts=['outer_inputs'], outputs=['states'])
    def apply(self, inner_inputs, states, outer_inputs):
        forked_inputs = self.inner_input_fork.apply(inner_inputs,
                                                    as_dict=True)
        forked_states = self.outer_input_fork.apply(outer_inputs,
                                                    as_dict=True)

        gru_inputs = {key: forked_inputs[key] + forked_states[key]
                      for key in forked_inputs.keys()}

        new_states = self.inner_gru.apply(
            iterate=False,
            **dict_union(gru_inputs, {'states': states}))
        return new_states  # mean according to the time axis

    def get_dim(self, name):
        if name == 'states':
            return self.inner_gru.get_dim(name)
        else:
            raise AttributeError(name)

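# --- Usage sketch (an assumption, not from the original source) ---
# The outer input acts as a per-sequence context: at every step it is
# forked and added to the forked inner inputs before the GRU update.
def _inner_recurrent_usage_sketch():
    inner = InnerRecurrent(
        inner_input_dim=4, outer_input_dim=5, inner_dim=6,
        weights_init=initialization.IsotropicGaussian(0.1),
        biases_init=initialization.Constant(0))
    inner.initialize()
    inner_inputs = tensor.tensor3('inner_inputs')  # (time, batch, 4)
    outer_inputs = tensor.matrix('outer_inputs')   # (batch, 5)
    states = inner.apply(inner_inputs=inner_inputs,
                         outer_inputs=outer_inputs)
    return theano.function([inner_inputs, outer_inputs], states)
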
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    if first:
        gruApply = gru.apply(linear, gates, mask=x_mask, **kwargs)
    else:
        gruApply = gru.apply(linear, gates, **kwargs)
    return gruApply

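# --- Usage sketch (an assumption, not from the original source) ---
# Only the first layer of the stack receives the mask; deeper layers
# just pass through whatever **kwargs the caller threads in. Reuses the
# hypothetical `initialize` stand-in defined earlier.
def _masked_gru_stack_sketch():
    x = tensor.tensor3('x')         # (time, batch, 3)
    x_mask = tensor.matrix('mask')  # (time, batch)
    h = gru_layer(3, x, 0, x_mask, first=True)
    h = gru_layer(3, h, 1, x_mask, first=False)
    return theano.function([x, x_mask], h)
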
class Feedback(Initializable): """Feedback. Attributes ---------- output_names : list output_dims : dict """ @lazy(allocation=['output_names', 'output_dims']) def __init__(self, output_names, output_dims, embedding=None, input_dim=0, **kwargs): super(Feedback, self).__init__(**kwargs) self.output_names = output_names self.output_dims = output_dims self.input_dim = input_dim self.embedding = embedding self.fork = Fork(self.output_names) self.apply.inputs = ['input'] self.apply.outputs = output_names self.children = [self.embedding, self.fork] self.children = [child for child in self.children if child] def _push_allocation_config(self): if self.fork: self.fork.output_dims = self.output_dims else: self.embedding.output_dim, = self.output_dims if self.embedding: self.embedding.input_dim = self.input_dim self.fork.input_dim = self.embedding.output_dim else: self.fork.input_dim = self.input_dim @application def apply(self, symbols): embedded_symbols = symbols if self.embedding: embedded_symbols = self.embedding.apply(symbols) if self.fork: return self.fork.apply(embedded_symbols) return embedded_symbols
class Encoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 reverse=True, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'],
            prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]

        embeddings = self.lookup.apply(source_sentence)

        representation = self.transition.apply(**merge(
            self.fork.apply(embeddings, as_dict=True),
            {'mask': source_sentence_mask}))
        return representation[-1]

def build_fork_lookup(vocab_size, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    time_length = 5
    mini_batch_size = 2
    skip_connections = True
    layers = 3

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    print(output_names)
    print(output_dims)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Return a list of 3D tensors, one for each layer:
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)
    fork.initialize()

    f = theano.function([x], pre_rnn)
    return f

class Encoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, blockid, vocab_size, embedding_dim, state_dim, **kwargs): super(Encoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.blockid = blockid self.lookup = LookupTable(name='embeddings' + '_' + self.blockid) self.gru = GatedRecurrent(activation=Tanh(), dim=state_dim, name = "GatedRNN" + self.blockid) self.fwd_fork = Fork( [name for name in self.gru.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork' + '_' + self.blockid) self.children = [self.lookup, self.gru, self.fwd_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.gru.get_dim(name) for name in self.fwd_fork.output_names] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) grupara = merge( self.fwd_fork.apply(embeddings, as_dict=True) , {'mask': source_sentence_mask}) representation = self.gru.apply(**grupara) return representation
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper.
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return a list of 3D tensors, one for each layer:
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()
    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h

class RecurrentEncoder(Initializable):
    def __init__(self, config, output_dim, activation, **kwargs):
        super(RecurrentEncoder, self).__init__(**kwargs)

        self.config = config
        self.context_embedder = ContextEmbedder(config)

        self.rec = SegregatedBidirectional(
            LSTM(dim=config.rec_state_dim, name='encoder_recurrent'))

        self.fwd_fork = Fork(
            [name for name in self.rec.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork')
        self.bkwd_fork = Fork(
            [name for name in self.rec.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='bkwd_fork')

        rto_in = config.rec_state_dim * 2 + sum(
            x[2] for x in config.dim_embeddings)
        self.rec_to_output = MLP(
            activations=[Rectifier() for _ in config.dim_hidden] +
                        [activation],
            dims=[rto_in] + config.dim_hidden + [output_dim],
            name='encoder_rto')

        self.children = [self.context_embedder, self.rec,
                         self.fwd_fork, self.bkwd_fork, self.rec_to_output]

        self.rec_inputs = ['latitude', 'longitude', 'latitude_mask']
        self.inputs = self.context_embedder.inputs + self.rec_inputs

    def _push_allocation_config(self):
        for i, fork in enumerate([self.fwd_fork, self.bkwd_fork]):
            fork.input_dim = 2
            fork.output_dims = [self.rec.children[i].get_dim(name)
                                for name in fork.output_names]

    def _push_initialization_config(self):
        for brick in self.children:
            brick.weights_init = self.config.weights_init
            brick.biases_init = self.config.biases_init

    @application
    def apply(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = (latitude.T - data.train_gps_mean[0]) \
            / data.train_gps_std[0]
        longitude = (longitude.T - data.train_gps_mean[1]) \
            / data.train_gps_std[1]
        latitude_mask = latitude_mask.T

        rec_in = tensor.concatenate(
            (latitude[:, :, None], longitude[:, :, None]), axis=2)
        path = self.rec.apply(
            merge(self.fwd_fork.apply(rec_in, as_dict=True),
                  {'mask': latitude_mask}),
            merge(self.bkwd_fork.apply(rec_in, as_dict=True),
                  {'mask': latitude_mask}))[0]

        last_id = tensor.cast(latitude_mask.sum(axis=0) - 1, dtype='int64')
        path_representation = (
            path[0][:, -self.config.rec_state_dim:],
            path[last_id - 1,
                 tensor.arange(last_id.shape[0])]
            [:, :self.config.rec_state_dim])

        embeddings = tuple(self.context_embedder.apply(
            **{k: kwargs[k] for k in self.context_embedder.inputs}))

        inputs = tensor.concatenate(path_representation + embeddings,
                                    axis=1)
        outputs = self.rec_to_output.apply(inputs)

        return outputs

    @apply.property('inputs')
    def apply_inputs(self):
        return self.inputs

class Scribe(Initializable):
    def __init__(self, k=20, rec_h_dim=400, att_size=10, num_letters=68,
                 sampling_bias=0., attention_type="graves", epsilon=1e-6,
                 attention_alignment=1., **kwargs):
        super(Scribe, self).__init__(**kwargs)

        # For now only softmax and graves are supported.
        assert attention_type in ["graves", "softmax"]

        readouts_dim = 1 + 6 * k

        self.k = k
        self.rec_h_dim = rec_h_dim
        self.att_size = att_size
        self.num_letters = num_letters
        self.sampling_bias = sampling_bias
        self.attention_type = attention_type
        self.epsilon = epsilon
        self.attention_alignment = attention_alignment

        self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')

        self.inp_to_h1 = Fork(
            output_names=['cell1_inputs', 'cell1_gates'],
            input_dim=3,
            output_dims=[rec_h_dim, 2 * rec_h_dim],
            name='inp_to_h1')

        self.h1_to_readout = Linear(
            input_dim=rec_h_dim,
            output_dim=readouts_dim,
            name='h1_to_readout')

        self.h1_to_att = Fork(
            output_names=['alpha', 'beta', 'kappa'],
            input_dim=rec_h_dim,
            output_dims=[att_size] * 3,
            name='h1_to_att')

        self.att_to_h1 = Fork(
            output_names=['cell1_inputs', 'cell1_gates'],
            input_dim=num_letters,
            output_dims=[rec_h_dim, 2 * rec_h_dim],
            name='att_to_h1')

        self.att_to_readout = Linear(
            input_dim=num_letters,
            output_dim=readouts_dim,
            name='att_to_readout')

        self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

        self.children = [
            self.cell1, self.inp_to_h1, self.h1_to_readout, self.h1_to_att,
            self.att_to_h1, self.att_to_readout, self.emitter]

    def _allocate(self):
        self.initial_w = shared_floatx_zeros(
            (self.num_letters,), name="initial_w")
        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        data = tensor.tensor3('features')
        data_mask = tensor.matrix('features_mask')
        context = tensor.imatrix('transcripts')
        context_mask = tensor.matrix('transcripts_mask')
        start_flag = tensor.scalar('start_flag')
        return data, data_mask, context, context_mask, start_flag

    def initial_states(self, batch_size):
        initial_h1 = self.cell1.initial_states(batch_size)
        initial_kappa = shared_floatx_zeros((batch_size, self.att_size))
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)
        last_h1 = shared_floatx_zeros((batch_size, self.rec_h_dim))
        last_w = shared_floatx_zeros((batch_size, self.num_letters))
        use_last_states = shared(numpy.asarray(0., dtype=floatX))
        return initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states

    @application
    def compute_cost(self, data, data_mask, context, context_mask,
                     start_flag, batch_size):
        x = data[:-1]
        target = data[1:]
        mask = data_mask[1:]

        xinp_h1, xgat_h1 = self.inp_to_h1.apply(x)
        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        input_h1 = tensor.switch(use_last_states, last_h1, initial_h1)
        input_w = tensor.switch(use_last_states, last_w, initial_w)

        u = tensor.shape_padleft(
            tensor.arange(context.shape[1], dtype=floatX), 2)

        def step(xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1, ctx):
            attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

            h1_t = self.cell1.apply(
                xinp_h1_t + attinp_h1,
                xgat_h1_t + attgat_h1,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t)
            else:
                a_t = tensor.exp(a_t)

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1 / (2 * numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u) ** 2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u) ** 2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

            return h1_t, k_t, w_t

        (h1, kappa, w), scan_updates = theano.scan(
            fn=step,
            sequences=[xinp_h1, xgat_h1],
            non_sequences=[context_oh],
            outputs_info=[input_h1, initial_kappa, input_w])

        readouts = self.h1_to_readout.apply(h1) + \
            self.att_to_readout.apply(w)

        cost = self.emitter.cost(readouts, target)
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((
            initial_kappa,
            tensor.switch(start_flag, 0. * initial_kappa, kappa[-1])))
        updates.append((last_w, w[-1]))
        updates.append((use_last_states, 1. - start_flag))

        return cost, scan_updates + updates

    @application
    def sample_model(self, context, context_mask, n_steps, batch_size):
        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        initial_x = self.emitter.initial_outputs(batch_size)

        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        u = tensor.shape_padleft(
            tensor.arange(context.shape[1], dtype=floatX), 2)

        def sample_step(x_tm1, h1_tm1, k_tm1, w_tm1, ctx):
            xinp_h1_t, xgat_h1_t = self.inp_to_h1.apply(x_tm1)
            attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

            h1_t = self.cell1.apply(
                xinp_h1_t + attinp_h1,
                xgat_h1_t + attgat_h1,
                h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t)
            else:
                a_t = tensor.exp(a_t)

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1 / (2 * numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u) ** 2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u) ** 2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

            readout_t = self.h1_to_readout.apply(h1_t) + \
                self.att_to_readout.apply(w_t)

            x_t = self.emitter.emit(readout_t)

            mu_t, sigma_t, corr_t, pi_t, penup_t = \
                self.emitter.components(readout_t)

            return x_t, h1_t, k_t, w_t, pi_t, phi_t, a_t

        (sample_x, h1, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            n_steps=n_steps,
            sequences=[],
            non_sequences=[context_oh],
            outputs_info=[initial_x.eval(), initial_h1, initial_kappa,
                          initial_w, None, None, None])

        return sample_x, pi, phi, pi_att, updates

def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return a list of 3D tensors, one for each layer:
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates

class BidirectionalEncoder(Initializable): """A generalized version of the vanilla encoder of the RNNsearch model which supports different numbers of layers. Zero layers represent non-recurrent encoders. """ def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections, state_dim, **kwargs): """Sole constructor. Args: vocab_size (int): Source vocabulary size embedding_dim (int): Dimension of the embedding layer n_layers (int): Number of layers. Layers share the same weight matrices. skip_connections (bool): Skip connections connect the source word embeddings directly with deeper layers to propagate the gradient more efficiently state_dim (int): Number of hidden units in the recurrent layers. """ super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.n_layers = n_layers self.state_dim = state_dim self.skip_connections = skip_connections self.lookup = LookupTable(name='embeddings') if self.n_layers >= 1: self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='fwd_fork') self.back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='back_fork') self.children = [ self.lookup, self.bidir, self.fwd_fork, self.back_fork ] if self.n_layers > 1: # Deep encoder self.mid_fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='mid_fwd_fork') self.mid_back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='mid_back_fork') self.children.append(self.mid_fwd_fork) self.children.append(self.mid_back_fork) elif self.n_layers == 0: self.embedding_dim = state_dim * 2 self.children = [self.lookup] else: logging.fatal("Number of encoder layers must be non-negative") def _push_allocation_config(self): """Sets the parameters of sub bricks """ self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim if self.n_layers >= 1: self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] if self.n_layers > 1: # Deep encoder inp_dim = self.state_dim * 2 if self.skip_connections: inp_dim += self.embedding_dim self.mid_fwd_fork.input_dim = inp_dim self.mid_fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.mid_back_fork.input_dim = inp_dim self.mid_back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation', 'representation_mask']) def apply(self, source_sentence, source_sentence_mask): """Produces source annotations, either non-recurrently or with a bidirectional RNN architecture. 
""" # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) if self.n_layers >= 1: representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask})) for _ in xrange(self.n_layers - 1): if self.skip_connections: inp = tensor.concatenate([representation, embeddings], axis=2) else: inp = representation representation = self.bidir.apply( merge(self.mid_fwd_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask}), merge(self.mid_back_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask})) else: representation = embeddings return representation, source_sentence_mask
class BidirectionalEncoderSigmoid(Initializable):
    """Encoder of RNNsearch model."""

    def __init__(self, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoderSigmoid, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        curSeed = 1791095845
        self.rng = numpy.random.RandomState(curSeed)

        self.bidir = BidirectionalWMT15(
            GatedRecurrentWithZerosAtMask(activation=Logistic(),
                                          dim=state_dim))
        self.fwd_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork')
        self.back_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='back_fork')

        # self.children = [self.lookup, self.bidir,
        self.children = [self.bidir, self.fwd_fork, self.back_fork]

        # maybe not necessary? (maybe only necessary for decoder)
        self._push_allocation_config()

        print("RNN seed: " + str(self.rng.get_state()[1][0]))

        # initialization of parameters
        self.weights_init = IsotropicGaussian()
        self.biases_init = Constant(0)
        self.push_initialization_config()
        self.bidir.prototype.weights_init = Orthogonal()
        self.initialize()

    def _push_allocation_config(self):
        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            self.bidir.children[0].get_dim(name)
            for name in self.fwd_fork.output_names]
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            self.bidir.children[1].get_dim(name)
            for name in self.back_fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        embeddings = source_sentence

        representation = self.bidir.apply(
            # Conversion to embedding representation here.
            # TODO: Less than the current number of dimensions should be
            # totally fine.
            merge(self.fwd_fork.apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}),
            merge(self.back_fork.apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}))
        self.representation = representation
        return representation

class BidirectionalEncoder(Initializable): """A generalized version of the vanilla encoder of the RNNsearch model which supports different numbers of layers. Zero layers represent non-recurrent encoders. """ def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections, state_dim, **kwargs): """Sole constructor. Args: vocab_size (int): Source vocabulary size embedding_dim (int): Dimension of the embedding layer n_layers (int): Number of layers. Layers share the same weight matrices. skip_connections (bool): Skip connections connect the source word embeddings directly with deeper layers to propagate the gradient more efficiently state_dim (int): Number of hidden units in the recurrent layers. """ super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.n_layers = n_layers self.state_dim = state_dim self.skip_connections = skip_connections self.lookup = LookupTable(name='embeddings') if self.n_layers >= 1: self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='back_fork') self.children = [self.lookup, self.bidir, self.fwd_fork, self.back_fork] if self.n_layers > 1: # Deep encoder self.mid_fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='mid_fwd_fork') self.mid_back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='mid_back_fork') self.children.append(self.mid_fwd_fork) self.children.append(self.mid_back_fork) elif self.n_layers == 0: self.embedding_dim = state_dim*2 self.children = [self.lookup] else: logging.fatal("Number of encoder layers must be non-negative") def _push_allocation_config(self): """Sets the parameters of sub bricks """ self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim if self.n_layers >= 1: self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] if self.n_layers > 1: # Deep encoder inp_dim = self.state_dim * 2 if self.skip_connections: inp_dim += self.embedding_dim self.mid_fwd_fork.input_dim = inp_dim self.mid_fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.mid_back_fork.input_dim = inp_dim self.mid_back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation', 'representation_mask']) def apply(self, source_sentence, source_sentence_mask): """Produces source annotations, either non-recurrently or with a bidirectional RNN architecture. 
""" # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) if self.n_layers >= 1: representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}) ) for _ in xrange(self.n_layers-1): if self.skip_connections: inp = tensor.concatenate([representation, embeddings], axis=2) else: inp = representation representation = self.bidir.apply( merge(self.mid_fwd_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask}), merge(self.mid_back_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask}) ) else: representation = embeddings return representation, source_sentence_mask
class NoLookupEncoder(Initializable):
    """This is a variation of ``BidirectionalEncoder`` which works with
    sparse feature maps. It does not use a lookup table but directly
    feeds the predefined distributed representations into the encoder
    network."""

    def __init__(self, embedding_dim, state_dim, **kwargs):
        """Constructor. Note that this implementation only supports
        single layer architectures.

        Args:
            embedding_dim (int): Dimensionality of the word vectors
                                 defined by the sparse feature map.
            state_dim (int): Size of the recurrent layer.
        """
        super(NoLookupEncoder, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))
        self.fwd_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork')
        self.back_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='back_fork')
        self.children = [self.bidir, self.fwd_fork, self.back_fork]

    def _push_allocation_config(self):
        """Sets the dimensions of the forward and backward forks."""
        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            self.bidir.children[0].get_dim(name)
            for name in self.fwd_fork.output_names]
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            self.bidir.children[1].get_dim(name)
            for name in self.back_fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Creates bidirectional RNN source annotations.

        Args:
            source_sentence (Variable): Source sentence with words in
                                        vector representation.
            source_sentence_mask (Variable): Source mask

        Returns:
            Variable. source annotations
        """
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        representation = self.bidir.apply(
            merge(self.fwd_fork.apply(source_sentence, as_dict=True),
                  {'mask': source_sentence_mask}),
            merge(self.back_fork.apply(source_sentence, as_dict=True),
                  {'mask': source_sentence_mask}))
        return representation, source_sentence_mask

class SimplePyramidLayer(Initializable):
    """Basic unit for the pyramid model."""

    def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
        super(SimplePyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        activations_x = [Rectifier()] * depth_x
        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta
        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        self.mlp_x = MLP(activations=activations_x,
                         dims=dims_x,
                         name="mlp_x")

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]

        self.transition = RecurrentStack(transition,
                                         name="transition",
                                         skip_connections=True)

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta,
                         dim=target_size,
                         k=k,
                         const=0.00001,
                         name="gmm_wrap")

        self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                      output_size=frame_size,
                                      k=k)

        normal_inputs = [name for name in self.transition.apply.sequences
                         if 'mask' not in name]

        self.fork = Fork(normal_inputs,
                         input_dim=4 * hidden_size_recurrent,
                         output_dims=self.transition.get_dims(normal_inputs))

        self.children = [self.mlp_x, self.transition,
                         self.gmm_emitter, self.fork]

    def monitoring_vars(self, cg):
        mu, sigma, coeff = VariableFilter(
            applications=[self.gmm_emitter.gmmmlp.apply],
            name_regex="output")(cg.variables)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma,
                           min_mu, max_mu,
                           mean_mu, max_sigma]
        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))

        self.final_states = []
        for var in h:
            self.final_states.append(
                var[-1].copy(name=var.name + "_final_value"))

        cost = self.gmm_emitter.cost(h[-1], x)
        return cost.mean()

    @application
    def generate(self, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))
        return self.gmm_emitter.emit(h[-1])

def main(mode, save_path, num_batches, from_dump): if mode == "train": # Experiment configuration dimension = 100 readout_dimension = len(char2code) # Data processing pipeline data_stream = DataStreamMapping( mapping=lambda data: tuple(array.T for array in data), data_stream=PaddingDataStream( BatchDataStream( iteration_scheme=ConstantScheme(10), data_stream=DataStreamMapping( mapping=reverse_words, add_sources=("targets", ), data_stream=DataStreamFilter( predicate=lambda data: len(data[0]) <= 100, data_stream=OneBillionWord( "training", [99], char2code, level="character", preprocess=str.lower).get_default_stream()))))) # Build the model chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") encoder = Bidirectional(GatedRecurrent(dim=dimension, activation=Tanh()), weights_init=Orthogonal()) encoder.initialize() fork = Fork([ name for name in encoder.prototype.apply.sequences if name != 'mask' ], weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) fork.input_dim = dimension fork.fork_dims = {name: dimension for name in fork.fork_names} fork.initialize() lookup = LookupTable(readout_dimension, dimension, weights_init=IsotropicGaussian(0.1)) lookup.initialize() transition = Transition(activation=Tanh(), dim=dimension, attended_dim=2 * dimension, name="transition") attention = SequenceContentAttention( state_names=transition.apply.states, match_dim=dimension, name="attention") readout = LinearReadout(readout_dim=readout_dimension, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback( readout_dimension, dimension), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() bricks = [encoder, fork, lookup, generator] # Give an idea of what's going on params = Selector(bricks).get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) # Build the cost computation graph batch_cost = generator.cost( targets, targets_mask, attended=encoder.apply(**dict_union(fork.apply( lookup.lookup(chars), return_dict=True), mask=chars_mask)), attended_mask=chars_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") cg = ComputationGraph(cost) energies = unpack(VariableFilter(application=readout.readout, name="output")(cg.variables), singleton=True) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") (activations, ) = VariableFilter( application=generator.transition.apply, name="states")(cg.variables) mean_activation = named_copy(activations.mean(), "mean_activation") # Define the training algorithm. 
algorithm = GradientDescent(cost=cost, step_rule=CompositeRule([ GradientClipping(10.0), SteepestDescent(0.01) ])) observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm ] for name, param in params.items(): observables.append(named_copy(param.norm(2), name + "_norm")) observables.append( named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm")) main_loop = MainLoop( model=bricks, data_stream=data_stream, algorithm=algorithm, extensions=([LoadFromDump(from_dump)] if from_dump else []) + [ Timing(), TrainingDataMonitoring(observables, after_every_batch=True), TrainingDataMonitoring( observables, prefix="average", every_n_batches=10), FinishAfter(after_n_batches=num_batches).add_condition( "after_batch", lambda log: math.isnan( log.current_row.total_gradient_norm)), Plot(os.path.basename(save_path), [["average_" + cost.name], ["average_" + cost_per_character.name]], every_n_batches=10), SerializeMainLoop(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1) ]) main_loop.run() elif mode == "test": with open(save_path, "rb") as source: encoder, fork, lookup, generator = dill.load(source) logger.info("Model is loaded") chars = tensor.lmatrix("features") generated = generator.generate( n_steps=3 * chars.shape[0], batch_size=chars.shape[1], attended=encoder.apply(**dict_union( fork.apply(lookup.lookup(chars), return_dict=True))), attended_mask=tensor.ones(chars.shape)) sample_function = ComputationGraph(generated).get_theano_function() logging.info("Sampling function is compiled") while True: # Python 2-3 compatibility line = input("Enter a sentence\n") batch_size = int(input("Enter a number of samples\n")) encoded_input = [ char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip() ] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input, ))[0] print("Target: ", target) states, samples, glimpses, weights, costs = sample_function( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for i in range(samples.shape[1]): sample = list(samples[:, i]) try: true_length = sample.index(char2code['</S>']) + 1 except ValueError: true_length = len(sample) sample = sample[:true_length] cost = costs[:true_length, i].sum() message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=lambda tuple_: -tuple_[0]) for _, message in messages: print(message)
def main(): nvis, nhid, nlat, learn_prior = 784, 200, 100, False theano_rng = MRG_RandomStreams(134663) # Initialize prior prior_mu = shared_floatx(numpy.zeros(nlat), name='prior_mu') prior_log_sigma = shared_floatx(numpy.zeros(nlat), name='prior_log_sigma') if learn_prior: add_role(prior_mu, PARAMETER) add_role(prior_log_sigma, PARAMETER) # Initialize encoding network encoding_network = MLP(activations=[Rectifier()], dims=[nvis, nhid], weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) encoding_network.initialize() encoding_parameter_mapping = Fork( output_names=['mu_phi', 'log_sigma_phi'], input_dim=nhid, output_dims=dict(mu_phi=nlat, log_sigma_phi=nlat), prototype=Linear(), weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) encoding_parameter_mapping.initialize() # Initialize decoding network decoding_network = MLP(activations=[Rectifier()], dims=[nlat, nhid], weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) decoding_network.initialize() decoding_parameter_mapping = Linear( input_dim=nhid, output_dim=nvis, name='mu_theta', weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) decoding_parameter_mapping.initialize() # Encode / decode x = tensor.matrix('features') h_phi = encoding_network.apply(x) mu_phi, log_sigma_phi = encoding_parameter_mapping.apply(h_phi) epsilon = theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype) epsilon.name = 'epsilon' z = mu_phi + epsilon * tensor.exp(log_sigma_phi) z.name = 'z' h_theta = decoding_network.apply(z) mu_theta = decoding_parameter_mapping.apply(h_theta) # Compute cost kl_term = ( prior_log_sigma - log_sigma_phi + 0.5 * ( tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2 ) / tensor.exp(2 * prior_log_sigma) - 0.5 ).sum(axis=1) kl_term.name = 'kl_term' kl_term_mean = kl_term.mean() kl_term_mean.name = 'avg_kl_term' reconstruction_term = - ( x * tensor.nnet.softplus(-mu_theta) + (1 - x) * tensor.nnet.softplus(mu_theta)).sum(axis=1) reconstruction_term.name = 'reconstruction_term' reconstruction_term_mean = -reconstruction_term.mean() reconstruction_term_mean.name = 'avg_reconstruction_term' cost = -(reconstruction_term - kl_term).mean() cost.name = 'nll_upper_bound' # Datasets and data streams mnist_train = MNIST( 'train', start=0, stop=50000, binary=True, sources=('features',)) train_loop_stream = DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)) train_monitor_stream = DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)) mnist_valid = MNIST( 'train', start=50000, stop=60000, binary=True, sources=('features',)) valid_monitor_stream = DataStream( dataset=mnist_valid, iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)) mnist_test = MNIST('test', binary=True, sources=('features',)) test_monitor_stream = DataStream( dataset=mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)) # Get parameters computation_graph = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(computation_graph.variables) # Training loop step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95) algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule) monitored_quantities = [cost, reconstruction_term_mean, kl_term_mean] main_loop = MainLoop( model=None, data_stream=train_loop_stream, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=200), DataStreamMonitoring( monitored_quantities, train_monitor_stream, prefix="train"), 
DataStreamMonitoring( monitored_quantities, valid_monitor_stream, prefix="valid"), DataStreamMonitoring( monitored_quantities, test_monitor_stream, prefix="test"), Printing()]) main_loop.run()
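# Plain-NumPy sanity check (toy dimensions, independent of the model above) that
# the closed-form kl_term used in the cost matches a Monte Carlo estimate of
# E_q[log q(z) - log p(z)] for diagonal Gaussians q = N(mu_phi, sigma_phi) and
# p = N(prior_mu, prior_sigma).
import numpy

rng = numpy.random.RandomState(0)
nlat = 5
mu_q, log_sigma_q = rng.randn(nlat), 0.1 * rng.randn(nlat)
mu_p, log_sigma_p = numpy.zeros(nlat), numpy.zeros(nlat)  # standard normal prior

closed_form = (log_sigma_p - log_sigma_q
               + 0.5 * (numpy.exp(2 * log_sigma_q) + (mu_q - mu_p) ** 2)
               / numpy.exp(2 * log_sigma_p) - 0.5).sum()

z = mu_q + numpy.exp(log_sigma_q) * rng.randn(200000, nlat)
log_q = (-0.5 * ((z - mu_q) / numpy.exp(log_sigma_q)) ** 2
         - log_sigma_q - 0.5 * numpy.log(2 * numpy.pi)).sum(axis=1)
log_p = (-0.5 * ((z - mu_p) / numpy.exp(log_sigma_p)) ** 2
         - log_sigma_p - 0.5 * numpy.log(2 * numpy.pi)).sum(axis=1)
# The two numbers should agree to roughly two decimal places.
print(closed_form, (log_q - log_p).mean())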
def main(): x = T.tensor3('features') m = T.matrix('features_mask') y = T.imatrix('targets') x = m.mean() + x #stupid mask not always needed... #embedding_size = 300 #glove_version = "glove.6B.300d.txt" embedding_size = 50 glove_version = "vectors.6B.50d.txt" wstd = 0.02 conv1 = Conv1D(filter_length=5, num_filters=128, input_dim=embedding_size, weights_init=IsotropicGaussian(std=wstd), biases_init=Constant(0.0)) conv1.initialize() o = conv1.apply(x) o = Rectifier(name="conv1red").apply(o) o = MaxPooling1D(pooling_length=5 #, step=2 ).apply(o) conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128, weights_init=IsotropicGaussian(std=wstd), biases_init=Constant(0.0), step=3, name="conv2") conv2.initialize() o = conv2.apply(o) o = Rectifier(name="conv2rec").apply(o) conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128, weights_init=IsotropicGaussian(std=wstd), biases_init=Constant(0.0), step=3, name="conv3") conv2.initialize() o = conv2.apply(o) o = Rectifier(name="conv3rec").apply(o) fork = Fork(weights_init=IsotropicGaussian(0.02), biases_init=Constant(0.), input_dim=128, output_dims=[128]*3, output_names=['inputs', 'reset_inputs', 'update_inputs'] ) fork.initialize() inputs, reset_inputs, update_inputs = fork.apply(o) out = o.mean(axis=1) #gru = GatedRecurrent(dim=128, #weights_init=IsotropicGaussian(0.02), #biases_init=IsotropicGaussian(0.0)) #gru.initialize() #states = gru.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs) #out = states[:, -1, :] hidden = Linear( input_dim = 128, output_dim = 128, weights_init = Uniform(std=0.01), biases_init = Constant(0.)) hidden.initialize() o = hidden.apply(out) o = Rectifier().apply(o) #hidden = Linear( #input_dim = 128, #output_dim = 128, #weights_init = IsotropicGaussian(std=0.02), #biases_init = Constant(0.), #name="hiddenmap2") #hidden.initialize() #o = hidden.apply(o) #o = Rectifier(name="rec2").apply(o) score_layer = Linear( input_dim = 128, output_dim = 1, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.), name="linear2") score_layer.initialize() o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval( #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).sum(axis=1).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #raw_input() # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cost, params=params, step_rule = CompositeRule([ StepClipping(threshold=10), AdaM(), #AdaDelta(), ]) ) # ======== print "setting up data" ports = { 'gpu0_train' : 5557, 'gpu0_test' : 5558, 'gpu1_train' : 5559, 'gpu1_test' : 5560, } batch_size = 16 def start_server(port, which_set): fuel.server.logger.setLevel('WARN') dataset = IMDBText(which_set) n_train = dataset.num_examples stream = DataStream( dataset=dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) print "loading glove" glove = GloveTransformer(glove_version, data_stream=stream) padded = Padding( data_stream=glove, mask_sources=('features',) ) fuel.server.start_server(padded, port=port, hwm=20) train_port = 
ports[theano.config.device + '_train'] train_p = Process(target=start_server, args=(train_port, 'train')) train_p.start() test_port = ports[theano.config.device + '_test'] test_p = Process(target=start_server, args=(test_port, 'test')) test_p.start() train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port) test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port) print "setting up model" #import ipdb #ipdb.set_trace() n_examples = 25000 #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True )) extensions.append(DataStreamMonitoring( [cost, misclassification], data_stream=test_stream, prefix='test', after_epoch=True )) extensions.append(Timing()) extensions.append(Printing()) #extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True)) extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True)) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
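# Tiny NumPy check (made-up numbers, nothing model-specific) of the two monitored
# quantities above: the binary cross entropy and the misclassification rate, which
# the indicator-product form computes as mean(round(p) != y).
import numpy

y = numpy.array([1, 0, 1, 0], dtype='float32')
p = numpy.array([0.9, 0.2, 0.4, 0.7], dtype='float32')
cost = -(y * numpy.log(p) + (1 - y) * numpy.log(1 - p)).mean()
misclassification = (y * (p < 0.5) + (1 - y) * (p > 0.5)).mean()
print(cost, misclassification)                            # error rate 0.5 here
print(misclassification == (numpy.round(p) != y).mean())  # True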
class DRAW(BaseRecurrent, Initializable, Random): def __init__(self, nvis, nhid, encoding_mlp, encoding_lstm, decoding_mlp, decoding_lstm, T=1, **kwargs): super(DRAW, self).__init__(**kwargs) self.nvis = nvis self.nhid = nhid self.T = T self.encoding_mlp = encoding_mlp self.encoding_mlp.name = 'encoder_mlp' for i, child in enumerate(self.encoding_mlp.children): child.name = '{}_{}'.format(self.encoding_mlp.name, i) self.encoding_lstm = encoding_lstm self.encoding_lstm.name = 'encoder_lstm' self.encoding_parameter_mapping = Fork( output_names=['mu_phi', 'log_sigma_phi'], prototype=Linear()) self.decoding_mlp = decoding_mlp self.decoding_mlp.name = 'decoder_mlp' for i, child in enumerate(self.decoding_mlp.children): child.name = '{}_{}'.format(self.decoding_mlp.name, i) self.decoding_lstm = decoding_lstm self.decoding_lstm.name = 'decoder_lstm' self.decoding_parameter_mapping = Linear(name='mu_theta') self.prior_mu = tensor.zeros((self.nhid,)) self.prior_mu.name = 'prior_mu' self.prior_log_sigma = tensor.zeros((self.nhid,)) self.prior_log_sigma.name = 'prior_log_sigma' self.children = [self.encoding_mlp, self.encoding_lstm, self.encoding_parameter_mapping, self.decoding_mlp, self.decoding_lstm, self.decoding_parameter_mapping] def _push_allocation_config(self): # The attention-less read operation concatenates x and x_hat, and # we feed the decoder back into the encoder, which is why the input # to the encoding MLP is twice the size of x plus the size of the # decoding LSTM. self.encoding_mlp.dims[0] = 2 * self.nvis + self.decoding_lstm.dim self.encoding_mlp.dims[-1] = 4 * self.encoding_lstm.dim self.encoding_parameter_mapping.input_dim = self.encoding_lstm.dim self.encoding_parameter_mapping.output_dims = dict( mu_phi=self.nhid, log_sigma_phi=self.nhid) self.decoding_mlp.dims[0] = self.nhid self.decoding_mlp.dims[-1] = 4 * self.decoding_lstm.dim self.decoding_parameter_mapping.input_dim = self.decoding_lstm.dim self.decoding_parameter_mapping.output_dim = self.nvis def sample(self, num_samples): z = self.theano_rng.normal(size=(self.T, num_samples, self.nhid), avg=self.prior_mu, std=tensor.exp(self.prior_log_sigma)) return tensor.nnet.sigmoid(self.decode_z(z)[0][-1]) @application(inputs=['x'], outputs=['x_hat']) def reconstruct(self, x): x_sequence = tensor.tile(x.dimshuffle('x', 0, 1), (self.T, 1, 1)) rval = self.apply(x_sequence) return tensor.nnet.sigmoid(rval[0][-1]) @recurrent(sequences=['z'], contexts=[], states=['c_states', 'decoding_states', 'decoding_cells'], outputs=['c_states', 'decoding_states', 'decoding_cells']) def decode_z(self, z, c_states=None, decoding_states=None, decoding_cells=None): h_mlp_theta = self.decoding_mlp.apply(z) h_lstm_theta, cells_theta = self.decoding_lstm.apply( inputs=h_mlp_theta, states=decoding_states, cells=decoding_cells, iterate=False) new_c_states = ( c_states + self.decoding_parameter_mapping.apply(h_lstm_theta)) return new_c_states, h_lstm_theta, cells_theta @recurrent(sequences=['x'], contexts=[], states=['c_states', 'encoding_states', 'encoding_cells', 'decoding_states', 'decoding_cells'], outputs=['c_states', 'encoding_states', 'encoding_cells', 'decoding_states', 'decoding_cells', 'mu_phi', 'log_sigma_phi']) def apply(self, x, c_states=None, encoding_states=None, encoding_cells=None, decoding_states=None, decoding_cells=None): x_hat = x - tensor.nnet.sigmoid(c_states) # Concatenate x and x_hat r = tensor.concatenate([x, x_hat], axis=1) # Concatenate r and h_dec h_mlp_phi = self.encoding_mlp.apply( tensor.concatenate([r, decoding_states], 
                                  axis=1))
        h_lstm_phi, cells_phi = self.encoding_lstm.apply(
            inputs=h_mlp_phi, states=encoding_states, cells=encoding_cells,
            iterate=False)
        phi = self.encoding_parameter_mapping.apply(h_lstm_phi)
        mu_phi, log_sigma_phi = phi
        epsilon = self.theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
        epsilon.name = 'epsilon'
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        z.name = 'z'
        h_mlp_theta = self.decoding_mlp.apply(z)
        h_lstm_theta, cells_theta = self.decoding_lstm.apply(
            inputs=h_mlp_theta, states=decoding_states, cells=decoding_cells,
            iterate=False)
        new_c_states = (
            c_states + self.decoding_parameter_mapping.apply(h_lstm_theta))

        return (new_c_states, h_lstm_phi, cells_phi, h_lstm_theta, cells_theta,
                mu_phi, log_sigma_phi)

    @application(inputs=['x'], outputs=['log_likelihood_lower_bound'])
    def log_likelihood_lower_bound(self, x):
        x_sequence = tensor.tile(x.dimshuffle('x', 0, 1), (self.T, 1, 1))
        rval = self.apply(x_sequence)
        c_states, mu_phi, log_sigma_phi = rval[0], rval[-2], rval[-1]

        prior_mu = self.prior_mu.dimshuffle('x', 'x', 0)
        prior_log_sigma = self.prior_log_sigma.dimshuffle('x', 'x', 0)
        kl_term = (
            prior_log_sigma - log_sigma_phi +
            0.5 * (
                tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2
            ) / tensor.exp(2 * prior_log_sigma) - 0.5).sum(axis=2).sum(axis=0)
        kl_term.name = 'kl_term'

        reconstruction_term = - (
            x * tensor.nnet.softplus(-c_states[-1]) +
            (1 - x) * tensor.nnet.softplus(c_states[-1])).sum(axis=1)
        reconstruction_term.name = 'reconstruction_term'

        log_likelihood_lower_bound = reconstruction_term - kl_term
        log_likelihood_lower_bound.name = 'log_likelihood_lower_bound'

        annotation = Annotation()
        annotation.add_auxiliary_variable(kl_term, name='kl_term')
        annotation.add_auxiliary_variable(-reconstruction_term,
                                          name='reconstruction_term')
        add_annotation(log_likelihood_lower_bound, annotation)

        return log_likelihood_lower_bound

    def get_dim(self, name):
        if name == 'c_states':
            return self.nvis
        elif name == 'encoding_states':
            return self.encoding_lstm.get_dim('states')
        elif name == 'encoding_cells':
            return self.encoding_lstm.get_dim('cells')
        elif name == 'decoding_states':
            return self.decoding_lstm.get_dim('states')
        elif name == 'decoding_cells':
            return self.decoding_lstm.get_dim('cells')
        else:
            return super(DRAW, self).get_dim(name)
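# NumPy-only sketch (toy sizes) of the input replication used by reconstruct()
# and log_likelihood_lower_bound() above: a (batch, nvis) input is tiled into a
# (T, batch, nvis) sequence so the scan-based apply() re-reads the same image at
# every one of the T canvas-update steps.
import numpy

T_steps, batch, nvis = 3, 2, 4
x = numpy.arange(batch * nvis, dtype='float32').reshape(batch, nvis)
x_sequence = numpy.tile(x[None, :, :], (T_steps, 1, 1))
print(x_sequence.shape)         # (3, 2, 4)
print((x_sequence == x).all())  # every time step sees the same x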
class AddParameters(Brick): """Adds dependency on parameters to a transition function. In fact an improved version of this brick should be moved to the main body of the library, because it is clearly reusable (e.g. it can be a part of Encoder-Decoder translation model. """ @lazy def __init__(self, transition, num_params, params_name, weights_init, biases_init, **kwargs): super(AddParameters, self).__init__(**kwargs) update_instance(self, locals()) self.input_names = [name for name in transition.apply.sequences if name != 'mask'] self.state_name = transition.apply.states[0] assert len(transition.apply.states) == 1 self.fork = Fork(self.input_names) # Could be also several init bricks, one for each of the states self.init = MLP([Identity()], name="init") self.children = [self.transition, self.fork, self.init] def _push_allocation_config(self): self.fork.input_dim = self.num_params self.fork.fork_dims = {name: self.transition.get_dim(name) for name in self.input_names} self.init.dims[0] = self.num_params self.init.dims[-1] = self.transition.get_dim(self.state_name) def _push_initialization_config(self): for child in self.children: if self.weights_init: child.weights_init = self.weights_init if self.biases_init: child.biases_init = self.biases_init @application def apply(self, **kwargs): inputs = {name: kwargs.pop(name) for name in self.input_names} params = kwargs.pop("params") forks = self.fork.apply(params, return_dict=True) for name in self.input_names: inputs[name] = inputs[name] + forks[name] kwargs.update(inputs) if kwargs.get('iterate', True): kwargs[self.state_name] = self.initial_state(None, params=params) return self.transition.apply(**kwargs) @apply.delegate def apply_delegate(self): return self.transition.apply @apply.property('contexts') def apply_contexts(self): return [self.params_name] + self.transition.apply.contexts @application def initial_state(self, batch_size, *args, **kwargs): return self.init.apply(kwargs['params']) def get_dim(self, name): if name == 'params': return self.num_params return self.transition.get_dim(name)
def get_prernn(args): # time x batch x_mask = tensor.fmatrix('mask') # Compute the state dim if args.rnn_type == 'lstm': state_dim = 4 * args.state_dim else: state_dim = args.state_dim # Prepare the arguments for the fork output_names = [] output_dims = [] for d in range(args.layers): if d > 0: suffix = RECURRENTSTACK_SEPARATOR + str(d) else: suffix = '' if d == 0 or args.skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) # Prepare the brick to be forked (LookupTable or Linear) # Check if the dataset provides indices (in the case of a # fixed vocabulary, x is 2D tensor) or if it gives raw values # (x is 3D tensor) if has_indices(args.dataset): features = args.mini_batch_size x = tensor.lmatrix('features') vocab_size = get_output_size(args.dataset) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) forked = FeedforwardSequence([lookup.apply]) if not has_mask(args.dataset): x_mask = tensor.ones_like(x, dtype=floatX) else: x = tensor.tensor3('features', dtype=floatX) if args.used_inputs is not None: x = tensor.set_subtensor( x[args.used_inputs:, :, :], tensor.zeros_like(x[args.used_inputs:, :, :], dtype=floatX)) features = get_output_size(args.dataset) forked = Linear(input_dim=features, output_dim=state_dim) forked.weights_init = initialization.IsotropicGaussian(0.1) forked.biases_init = initialization.Constant(0) if not has_mask(args.dataset): x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX) # Define the fork fork = Fork(output_names=output_names, input_dim=features, output_dims=output_dims, prototype=forked) fork.initialize() # Apply the fork prernn = fork.apply(x) # Give a name to the input of each layer if args.skip_connections: for t in range(len(prernn)): prernn[t].name = "pre_rnn_" + str(t) else: prernn.name = "pre_rnn" return prernn, x_mask
class Decoder(Initializable): def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim, **kwargs): super(Decoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.representation_dim = representation_dim readout = Readout( source_names=['states', 'feedback', 'readout_context'], readout_dim=self.vocab_size, emitter=SoftmaxEmitter(), feedback_brick=LookupFeedback(vocab_size, embedding_dim), post_merge=InitializableFeedforwardSequence( [Bias(dim=1000).apply, Maxout(num_pieces=2).apply, Linear(input_dim=state_dim / 2, output_dim=100, use_bias=False).apply, Linear(input_dim=100).apply]), merged_dim=1000) self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim, name='decoder') # Readout will apply the linear transformation to 'readout_context' # with a Merge brick, so no need to fork it here self.fork = Fork([name for name in self.transition.apply.contexts + self.transition.apply.states if name != 'readout_context'], prototype=Linear()) self.tanh = Tanh() self.sequence_generator = SequenceGenerator( readout=readout, transition=self.transition, fork_inputs=[name for name in self.transition.apply.sequences if name != 'mask'], ) self.children = [self.fork, self.sequence_generator, self.tanh] def _push_allocation_config(self): self.fork.input_dim = self.representation_dim self.fork.output_dims = [self.state_dim for _ in self.fork.output_names] @application(inputs=['representation', 'target_sentence_mask', 'target_sentence'], outputs=['cost']) def cost(self, representation, target_sentence, target_sentence_mask): target_sentence = target_sentence.dimshuffle(1, 0) target_sentence_mask = target_sentence_mask.T # The initial state and contexts, all functions of the representation contexts = {key: value.dimshuffle('x', 0, 1) if key not in self.transition.apply.states else value for key, value in self.fork.apply(representation, as_dict=True).items()} contexts['states'] = self.tanh.apply(contexts['states']) cost = self.sequence_generator.cost(**merge( contexts, {'mask': target_sentence_mask, 'outputs': target_sentence, 'readout_context': representation.dimshuffle('x', 0, 1)} )) return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]
class TargetWordEncoder(Initializable):
    """Word encoder on the target side; uses a single RNN to map a
    character-level word to a vector."""

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(TargetWordEncoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        self.dgru = RecurrentStack([
            DGRU(activation=Tanh(), dim=self.dgru_state_dim)
            for _ in range(dgru_depth)
        ], skip_connections=True)

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)

        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True),
            {'mask': char_aux}))
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        # Time as first dimension
        # only one batch
        embeddings = self.lookup.apply(target_single_char)

        if states is None:
            states = self.dgru.initial_states(batch_size)
        states_dict = {'states': states[0]}
        for i in range(1, self.dgru_depth):
            states_dict['states' + RECURRENTSTACK_SEPARATOR + str(i)] = states[i]
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), states_dict, {
                'mask': mask,
                'iterate': False
            }))
        return gru_out

    @single_emit.property('outputs')
    def single_emit_outputs(self):
        return [
            'gru_out' + RECURRENTSTACK_SEPARATOR + str(i)
            for i in range(self.dgru_depth)
        ]

    def get_dim(self, name):
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        return super(TargetWordEncoder, self).get_dim(name)
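# Small NumPy sketch (toy sizes, made-up word boundaries) of the
# sample_matrix / batched_dot step in apply() above: for each batch element, a
# 0/1 matrix of shape (num_words, char_seq_len) picks the DGRU state at the last
# character of every word; the real code then dimshuffles the result back to
# (num_words, batch, dim).
import numpy

batch, seq_len, num_words, dim = 2, 5, 2, 3
gru_out = numpy.random.RandomState(0).randn(seq_len, batch, dim)  # time-major
sample_matrix = numpy.zeros((batch, num_words, seq_len))
sample_matrix[:, 0, 2] = 1.  # first word ends at character index 2
sample_matrix[:, 1, 4] = 1.  # second word ends at character index 4
# Equivalent of tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])):
picked = numpy.matmul(sample_matrix, gru_out.transpose(1, 0, 2))
print(picked.shape)                              # (batch, num_words, dim)
print(numpy.allclose(picked[:, 0], gru_out[2]))  # True
print(numpy.allclose(picked[:, 1], gru_out[4]))  # True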
class Interpolator(AbstractReadout): """Readout char by char.""" def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth, trg_dgru_depth, emitter, feedback_brick, merge=None, merge_prototype=None, post_merge=None, **kwargs): merged_dim = igru_state_dim if not merge: merge = Merge(input_names=kwargs['source_names'], prototype=merge_prototype) if not post_merge: post_merge = Bias(dim=merged_dim) # for compatible if igru_depth == 1: self.igru = IGRU(dim=igru_state_dim) else: self.igru = RecurrentStack( [IGRU(dim=igru_state_dim, name='igru')] + [ UpperIGRU(dim=igru_state_dim, activation=Tanh(), name='upper_igru' + str(i)) for i in range(1, igru_depth) ], skip_connections=True) self.embedding_dim = embedding_dim self.emitter = emitter self.feedback_brick = feedback_brick self.merge = merge self.post_merge = post_merge self.merged_dim = merged_dim self.igru_depth = igru_depth self.trg_dgru_depth = trg_dgru_depth self.lookup = LookupTable(name='embeddings') self.vocab_size = vocab_size self.igru_state_dim = igru_state_dim self.gru_to_softmax = Linear(input_dim=igru_state_dim, output_dim=vocab_size) self.gru_fork = Fork([ name for name in self.igru.apply.sequences if name != 'mask' and name != 'input_states' ], prototype=Linear(), name='gru_fork') children = [ self.emitter, self.feedback_brick, self.merge, self.post_merge, self.igru, self.lookup, self.gru_to_softmax, self.gru_fork ] kwargs.setdefault('children', []).extend(children) super(Interpolator, self).__init__(**kwargs) def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.emitter.readout_dim = self.get_dim('readouts') self.merge.input_names = self.source_names self.merge.input_dims = self.source_dims self.merge.output_dim = self.merged_dim self.post_merge.input_dim = self.merged_dim self.post_merge.output_dim = self.igru_state_dim self.gru_fork.input_dim = self.embedding_dim self.gru_fork.output_dims = [ self.igru.get_dim(name) for name in self.gru_fork.output_names ] @application def initial_igru_outputs(self, batch_size): return self.igru.initial_states(batch_size) @application def emit(self, readouts): return self.emitter.emit(readouts) @application def cost(self, readouts, outputs): return self.emitter.cost(readouts, outputs) @application def initial_outputs(self, batch_size): return self.emitter.initial_outputs(batch_size) @application(outputs=['feedback']) def feedback(self, outputs): return self.feedback_brick.feedback(outputs) @application(outputs=['feedback']) def feedback_apply(self, target_char_seq, target_sample_matrix, target_char_aux): return self.feedback_brick.apply(target_char_seq, target_sample_matrix, target_char_aux) @application def single_feedback(self, target_single_char, batch_size, mask=None, states=None): return self.feedback_brick.single_emit(target_single_char, batch_size, mask, states) @single_feedback.property('outputs') def single_feedback_outputs(self): return [ 'single_feedback' + RECURRENTSTACK_SEPARATOR + str(i) for i in range(self.trg_dgru_depth) ] @application(outputs=['gru_out', 'readout_chars']) def single_readout_gru(self, target_prev_char, target_prev_char_aux, input_states, states): embeddings = self.lookup.apply(target_prev_char) states_dict = {'states': states[0]} if self.igru_depth > 1: for i in range(1, self.igru_depth): states_dict['states' + RECURRENTSTACK_SEPARATOR + str(i)] = states[i] gru_out = self.igru.apply(**merge( self.gru_fork.apply(embeddings, as_dict=True), states_dict, { 'mask': target_prev_char_aux, 
'input_states': input_states, 'iterate': False })) if self.igru_depth > 1: readout_chars = self.gru_to_softmax.apply(gru_out[-1]) else: readout_chars = self.gru_to_softmax.apply(gru_out) return gru_out, readout_chars @application def readout(self, **kwargs): merged = self.merge.apply( **{name: kwargs[name] for name in self.merge.input_names}) merged = self.post_merge.apply(merged) return merged @application(outputs=['readout_chars']) def readout_gru(self, target_prev_char_seq, target_prev_char_aux, input_states): embeddings = self.lookup.apply(target_prev_char_seq) gru_out = self.igru.apply( **merge(self.gru_fork.apply(embeddings, as_dict=True), { 'mask': target_prev_char_aux, 'input_states': input_states })) if self.igru_depth > 1: gru_out = gru_out[-1] readout_chars = self.gru_to_softmax.apply(gru_out) return readout_chars def get_dim(self, name): if name == 'outputs': return self.emitter.get_dim(name) elif name == 'feedback': return self.feedback_brick.get_dim(name) elif name == 'readouts': return self.readout_dim return super(AbstractReadout, self).get_dim(name)
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
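# Minimal sketch (standard Blocks LSTM brick, toy dimensions) of the convention
# behind virtual_dim = 4 * state_dim above: the LSTM's `inputs` sequence must
# already contain one slab per gate, so whatever feeds it (here the fork over the
# lookup table) has to emit 4 * state_dim features per time step.
import numpy
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import LSTM
from blocks import initialization

state_dim = 3
lstm = LSTM(dim=state_dim, activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.1))
lstm.initialize()

x = tensor.tensor3('x')  # (time, batch, 4 * state_dim)
states, cells = lstm.apply(inputs=x)
f = theano.function([x], [states, cells])
out_states, out_cells = f(numpy.ones((5, 2, 4 * state_dim),
                                     dtype=theano.config.floatX))
print(out_states.shape, out_cells.shape)  # (5, 2, 3) (5, 2, 3)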
def build_model_hard(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence([lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] for i in range(layers - 1): mlp = MLP(activations=[Logistic()], dims=[2 * state_dim, 1], weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear(input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared(numpy.zeros( (args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have correctly: # h = [state_1, state_2, state_3 ...] # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
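# Quick NumPy reminder (toy numbers only) of the `cross_entropy / tensor.log(2)`
# step used by both build functions above: Softmax().categorical_cross_entropy
# returns nats, and dividing by ln 2 converts the monitored value to bits per
# symbol.
import numpy

nats = -numpy.log(1.0 / 4)        # uniform prediction over 4 symbols
print(nats, nats / numpy.log(2))  # ~1.386 nats == 2.0 bits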
class Parrot(Initializable, Random): def __init__( self, input_dim=420, # Dimension of the text labels output_dim=63, # Dimension of vocoder fram rnn_h_dim=1024, # Size of rnn hidden state readouts_dim=1024, # Size of readouts (summary of rnn) weak_feedback=False, # Feedback to the top rnn layer full_feedback=False, # Feedback to all rnn layers feedback_noise_level=None, # Amount of noise in feedback layer_norm=False, # Use simple normalization? use_speaker=False, # Condition on the speaker id? num_speakers=21, # How many speakers there are? speaker_dim=128, # Size of speaker embedding which_cost='MSE', # Train with MSE or GMM k_gmm=20, # How many components in the GMM sampling_bias=0, # Make samples more likely (Graves13) epsilon=1e-5, # Numerical stabilities num_characters=43, # how many chars in the labels attention_type='graves', # graves or softmax attention_size=10, # number of gaussians in the attention attention_alignment=1., # audio steps per letter at initialization sharpening_coeff=1., timing_coeff=1., encoder_type=None, encoder_dim=128, raw_output=False, **kwargs): super(Parrot, self).__init__(**kwargs) self.input_dim = input_dim self.output_dim = output_dim self.rnn_h_dim = rnn_h_dim self.readouts_dim = readouts_dim self.layer_norm = layer_norm self.which_cost = which_cost self.use_speaker = use_speaker self.full_feedback = full_feedback self.feedback_noise_level = feedback_noise_level self.epsilon = epsilon self.num_characters = num_characters self.attention_type = attention_type self.attention_alignment = attention_alignment self.attention_size = attention_size self.sharpening_coeff = sharpening_coeff self.timing_coeff = timing_coeff self.encoder_type = encoder_type self.encoder_dim = encoder_dim self.encoded_input_dim = input_dim self.raw_output = raw_output if self.encoder_type == 'bidirectional': self.encoded_input_dim = 2 * encoder_dim if self.feedback_noise_level is not None: self.noise_level_var = tensor.scalar('feedback_noise_level') self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1') self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2') self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3') self.h1_to_readout = Linear( input_dim=rnn_h_dim, output_dim=readouts_dim, name='h1_to_readout') self.h2_to_readout = Linear( input_dim=rnn_h_dim, output_dim=readouts_dim, name='h2_to_readout') self.h3_to_readout = Linear( input_dim=rnn_h_dim, output_dim=readouts_dim, name='h3_to_readout') self.h1_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h2') self.h1_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h3') self.h2_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h2_to_h3') if which_cost == 'MSE': self.readout_to_output = Linear( input_dim=readouts_dim, output_dim=output_dim, name='readout_to_output') elif which_cost == 'GMM': self.sampling_bias = sampling_bias self.k_gmm = k_gmm self.readout_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=readouts_dim, output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm], name='readout_to_output') self.encoder = Encoder( encoder_type, num_characters, input_dim, encoder_dim, name='encoder') self.children = [ self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout, self.h2_to_readout, self.h3_to_readout, self.h1_to_h2, self.h1_to_h3, self.h2_to_h3, 
self.readout_to_output] self.inp_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h1') self.inp_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h2') self.inp_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h3') self.children += [ self.inp_to_h1, self.inp_to_h2, self.inp_to_h3] self.h1_to_att = Fork( output_names=['alpha', 'beta', 'kappa'], input_dim=rnn_h_dim, output_dims=[attention_size] * 3, name='h1_to_att') self.att_to_readout = Linear( input_dim=self.encoded_input_dim, output_dim=readouts_dim, name='att_to_readout') self.children += [ self.h1_to_att, self.att_to_readout] if use_speaker: self.num_speakers = num_speakers self.speaker_dim = speaker_dim self.embed_speaker = LookupTable(num_speakers, speaker_dim) self.speaker_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h1') self.speaker_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h2') self.speaker_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h3') self.speaker_to_readout = Linear( input_dim=speaker_dim, output_dim=readouts_dim, name='speaker_to_readout') if which_cost == 'MSE': self.speaker_to_output = Linear( input_dim=speaker_dim, output_dim=output_dim, name='speaker_to_output') elif which_cost == 'GMM': self.speaker_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=speaker_dim, output_dims=[ output_dim * k_gmm, output_dim * k_gmm, k_gmm], name='speaker_to_output') self.children += [ self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2, self.speaker_to_h3, self.speaker_to_readout, self.speaker_to_output] if full_feedback: self.out_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h2') self.out_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h3') self.children += [ self.out_to_h2, self.out_to_h3] weak_feedback = True self.weak_feedback = weak_feedback if weak_feedback: self.out_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h1') self.children += [ self.out_to_h1] if self.raw_output: self.sampleRnn = SampleRnn() self.children += [self.sampleRnn] def _allocate(self): self.initial_w = shared_floatx_zeros( (self.encoded_input_dim,), name="initial_w") add_role(self.initial_w, INITIAL_STATE) def symbolic_input_variables(self): features = tensor.tensor3('features') features_mask = tensor.matrix('features_mask') labels = tensor.imatrix('labels') labels_mask = tensor.matrix('labels_mask') start_flag = tensor.scalar('start_flag') if self.use_speaker: speaker = tensor.imatrix('speaker_index') else: speaker = None if self.raw_output: raw_sequence = tensor.itensor3('raw_audio') else: raw_sequence = None return features, features_mask, labels, labels_mask, \ speaker, start_flag, raw_sequence def initial_states(self, batch_size): initial_h1 = self.rnn1.initial_states(batch_size) initial_h2 = 
self.rnn2.initial_states(batch_size) initial_h3 = self.rnn3.initial_states(batch_size) last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h3 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) # Defining for all initial_k = tensor.zeros( (batch_size, self.attention_size), dtype=floatX) last_k = shared_floatx_zeros((batch_size, self.attention_size)) # Trainable initial state for w. Why not for k? initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0) last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim)) return initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k @application def compute_cost( self, features, features_mask, labels, labels_mask, speaker, start_flag, batch_size, raw_audio=None): if speaker is None: assert not self.use_speaker target_features = features[1:] mask = features_mask[1:] cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim) gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.weak_feedback: input_features = features[:-1] if self.feedback_noise_level: noise = self.theano_rng.normal( size=input_features.shape, avg=0., std=1.) input_features += self.noise_level_var * noise out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features) to_normalize = [ out_cell_h1, out_gat_h1] out_cell_h1, out_gat_h1 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += out_cell_h1 gat_h1 += out_gat_h1 if self.full_feedback: assert self.weak_feedback out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features) out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features) to_normalize = [ out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3] out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2 += out_cell_h2 gat_h2 += out_gat_h2 cell_h3 += out_cell_h3 gat_h3 += out_gat_h3 if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 = spk_cell_h1 + cell_h1 cell_h2 = spk_cell_h2 + cell_h2 cell_h3 = spk_cell_h3 + cell_h3 gat_h1 = spk_gat_h1 + gat_h1 gat_h2 = spk_gat_h2 + gat_h2 gat_h3 = spk_gat_h3 + gat_h3 initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(batch_size) # If it's a new example, use initial states. 
input_h1 = tensor.switch( start_flag, initial_h1, last_h1) input_h2 = tensor.switch( start_flag, initial_h2, last_h2) input_h3 = tensor.switch( start_flag, initial_h3, last_h3) input_w = tensor.switch( start_flag, initial_w, last_w) input_k = tensor.switch( start_flag, initial_k, last_k) context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft( tensor.arange(labels.shape[1], dtype=floatX), 2) def step( inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh): attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) inp_h1_t += attinp_h1 gat_h1_t += attgat_h1 h1_t = self.rnn1.apply( inp_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) + self.epsilon k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t) a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum( a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) inp_h2_t += attinp_h2 gat_h2_t += attgat_h2 inp_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [ h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply( inp_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [ h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply( inp_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_ (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan( fn=step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[context_oh], outputs_info=[ input_h1, input_h2, input_h3, input_k, input_w, None, None]) h1_out = self.h1_to_readout.apply(h1) h2_out = self.h2_to_readout.apply(h2) h3_out = self.h3_to_readout.apply(h3) to_normalize = [ h1_out, h2_out, h3_out] h1_out, h2_out, h3_out = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readouts = h1_out + h2_out + h3_out if self.use_speaker: readouts += self.speaker_to_readout.apply(emb_speaker) readouts += self.att_to_readout.apply(w) predicted = self.readout_to_output.apply(readouts) if self.which_cost == 'MSE': if self.use_speaker: predicted += self.speaker_to_output.apply(emb_speaker) cost = tensor.sum((predicted - target_features) ** 2, axis=-1) next_x = predicted # Dummy value for coeff coeff = predicted elif self.which_cost == 'GMM': mu, sigma, coeff = predicted if self.use_speaker: spk_to_out = self.speaker_to_output.apply(emb_speaker) mu += spk_to_out[0] sigma += spk_to_out[1] coeff += spk_to_out[2] # When training there should not be sampling_bias sigma = tensor.exp(sigma) + 
self.epsilon coeff = tensor.nnet.softmax( coeff.reshape( (-1, self.k_gmm))).reshape( coeff.shape) + self.epsilon cost = cost_gmm(target_features, mu, sigma, coeff) next_x = sample_gmm(mu, sigma, coeff, self.theano_rng) cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag updates = [] updates.append((last_h1, h1[-1])) updates.append((last_h2, h2[-1])) updates.append((last_h3, h3[-1])) updates.append((last_k, k[-1])) updates.append((last_w, w[-1])) cost_raw = None if self.raw_output: raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0) raw_mask = raw_mask.dimshuffle(1, 0) # breakpointOp = PdbBreakpoint("Raw mask breakpoint") # condition = tensor.gt(raw_mask.shape[0], 0) # raw_mask = breakpointOp(condition, raw_mask) predicted_transposed = predicted.dimshuffle(1, 0, 2) last_h0, last_big_h0 = self.sampleRnn.initial_states(batch_size) raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2) raw_audio_reshaped = raw_audio_reshaped.reshape((raw_audio_reshaped.shape[0], -1)) cost_raw, ip_cost, all_params, ip_params, other_params, new_h0, new_big_h0 =\ self.sampleRnn.apply(raw_audio_reshaped, predicted_transposed, last_h0, last_big_h0, start_flag, raw_mask) if self.sampleRnn.N_RNN == 1: new_h0 = tensor.unbroadcast(new_h0, 1) new_big_h0 = tensor.unbroadcast(new_big_h0, 1) updates.append((last_h0, new_h0)) updates.append((last_big_h0, new_big_h0)) # cost = cost + 80.*cost_raw alpha_ = numpy.float32(0.) beta_ = numpy.float32(1.) cost = alpha_*cost + beta_*cost_raw attention_vars = [next_x, k, w, coeff, phi, pi_att] return cost, scan_updates + updates, attention_vars, cost_raw @application def sample_model_fun( self, labels, labels_mask, speaker, num_samples, seq_size): initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(num_samples) initial_x = numpy.zeros( (num_samples, self.output_dim), dtype=floatX) cell_shape = (seq_size, num_samples, self.rnn_h_dim) gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) # Applied before the broadcast. spk_readout = self.speaker_to_readout.apply(emb_speaker) spk_output = self.speaker_to_output.apply(emb_speaker) # Add dimension to repeat with time. 
emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += spk_cell_h1 cell_h2 += spk_cell_h2 cell_h3 += spk_cell_h3 gat_h1 += spk_gat_h1 gat_h2 += spk_gat_h2 gat_h3 += spk_gat_h3 context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft( tensor.arange(labels.shape[1], dtype=floatX), 2) def sample_step( inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1): cell_h1_t = inp_cell_h1_t cell_h2_t = inp_cell_h2_t cell_h3_t = inp_cell_h3_t gat_h1_t = inp_gat_h1_t gat_h2_t = inp_gat_h2_t gat_h3_t = inp_gat_h3_t attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) cell_h1_t += attinp_h1 gat_h1_t += attgat_h1 if self.weak_feedback: out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1) to_normalize = [ out_cell_h1_t, out_gat_h1_t] out_cell_h1_t, out_gat_h1_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1_t += out_cell_h1_t gat_h1_t += out_gat_h1_t if self.full_feedback: out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1) out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1) to_normalize = [ out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t] out_cell_h2_t, out_gat_h2_t, \ out_cell_h3_t, out_gat_h3_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2_t += out_cell_h2_t cell_h3_t += out_cell_h3_t gat_h2_t += out_gat_h2_t gat_h3_t += out_gat_h3_t h1_t = self.rnn1.apply( cell_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon k_t = k_tm1 + self.attention_alignment * \ tensor.exp(k_t) / self.timing_coeff a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum( a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) cell_h2_t += attinp_h2 gat_h2_t += attgat_h2 cell_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [ h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply( cell_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [ h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply( cell_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, 
iterate=False)

            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [h1_out_t, h2_out_t, h3_out_t]
            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t
            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output
                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == 'GMM':
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]
                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon
                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape((-1, self.k_gmm)) *
                    (1. + self.sampling_bias)).reshape(
                        coeff_t.shape) + self.epsilon
                predicted_x_t = sample_gmm(
                    mu_t, sigma_t, coeff_t, self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[],
            outputs_info=[
                initial_x, initial_h1, initial_h2, initial_h3,
                initial_k, initial_w, None, None, None])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(
            self, labels_tr, labels_mask_tr, features_mask_tr,
            speaker_tr, num_samples, num_steps):
        features, features_mask, labels, labels_mask, \
            speaker, start_flag, raw_sequence = \
            self.symbolic_input_variables()

        sample_x, k, w, pi, phi, pi_att, updates = \
            self.sample_model_fun(
                labels, labels_mask, speaker, num_samples, num_steps)

        theano_inputs = [labels, labels_mask]
        numpy_inputs = (labels_tr, labels_mask_tr)

        if self.use_speaker:
            theano_inputs += [speaker]
            numpy_inputs += (speaker_tr,)

        return function(
            theano_inputs, [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)

    def sample_using_input(self, data_tr, num_samples):
        # Used to predict the values using the dataset.
        features, features_mask, labels, labels_mask, \
            speaker, start_flag, raw_sequence = \
            self.symbolic_input_variables()

        # compute_cost returns four values in this version, including
        # the raw-audio cost.
        cost, updates, attention_vars, cost_raw = self.compute_cost(
            features, features_mask, labels, labels_mask,
            speaker, start_flag, num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        theano_vars = [
            features, features_mask, labels, labels_mask,
            speaker, start_flag]
        theano_vars = [x for x in theano_vars if x is not None]
        theano_vars = list(set(theano_vars))
        theano_vars = {x.name: x for x in theano_vars}

        theano_inputs = []
        numpy_inputs = []
        for key in data_tr.keys():
            theano_inputs.append(theano_vars[key])
            numpy_inputs.append(data_tr[key])

        return function(
            theano_inputs, [sample_x, k, w, pi, phi, pi_att],
            updates=updates)(*numpy_inputs)
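# A minimal numpy sketch (not part of the model; all shapes and names here
# are illustrative assumptions) of the Graves-style location attention
# computed in `step` and `sample_step` above: `att_size` Gaussian components
# with weights a, widths b and monotonically advancing positions k are
# evaluated at every character index u, and the resulting window phi
# soft-selects rows of the one-hot label context.
import numpy


def graves_window(a, b, k, context_oh):
    """a, b, k: (batch, att_size); context_oh: (batch, len_context, num_letters)."""
    u = numpy.arange(context_oh.shape[1], dtype='float32')[None, None, :]
    # (batch, att_size, len_context) summed over components -> (batch, len_context)
    phi = numpy.sum(
        a[:, :, None] * numpy.exp(-b[:, :, None] * (k[:, :, None] - u) ** 2),
        axis=1)
    # (batch, num_letters): soft selection over the label sequence
    w = (phi[:, :, None] * context_oh).sum(axis=1)
    return phi, w


batch, att_size, len_context, num_letters = 2, 10, 7, 43
rng = numpy.random.RandomState(0)
a = numpy.exp(rng.randn(batch, att_size)).astype('float32')
b = numpy.exp(rng.randn(batch, att_size)).astype('float32')
k = numpy.abs(rng.randn(batch, att_size)).astype('float32')
context_oh = numpy.eye(num_letters, dtype='float32')[
    rng.randint(num_letters, size=(batch, len_context))]
phi, w = graves_window(a, b, k, context_oh)
print(phi.shape, w.shape)  # (2, 7) (2, 43)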
class BidirectionalPhonemeAudioEncoder(Initializable):
    """Hierarchical encoder: audio frames -> phonemes -> words."""

    def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalPhonemeAudioEncoder, self).__init__(**kwargs)
        self.feature_size = feature_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.audio_embedding = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name="audio_embeddings")
        self.audio_fwd_fork = Fork(
            [name for name in self.audio_embedding.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='audio_fwd_fork')
        self.audio_back_fork = Fork(
            [name for name in self.audio_embedding.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='audio_back_fork')

        self.phoneme_embedding = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name="phoneme_embeddings")
        self.phoneme_fwd_fork = Fork(
            [name for name in self.phoneme_embedding.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='phoneme_fwd_fork')
        self.phoneme_back_fork = Fork(
            [name for name in self.phoneme_embedding.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='phoneme_back_fork')

        self.words_embedding = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name="words_embeddings")
        self.words_fwd_fork = Fork(
            [name for name in self.words_embedding.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='words_fwd_fork')
        self.words_back_fork = Fork(
            [name for name in self.words_embedding.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='words_back_fork')

        self.children = [
            self.phoneme_embedding, self.audio_embedding,
            self.words_embedding, self.phoneme_fwd_fork,
            self.phoneme_back_fork, self.audio_fwd_fork,
            self.audio_back_fork, self.words_fwd_fork,
            self.words_back_fork]

    def _push_allocation_config(self):
        self.audio_fwd_fork.input_dim = self.feature_size
        self.audio_fwd_fork.output_dims = [
            self.audio_embedding.children[0].get_dim(name)
            for name in self.audio_fwd_fork.output_names]
        self.audio_back_fork.input_dim = self.feature_size
        self.audio_back_fork.output_dims = [
            self.audio_embedding.children[1].get_dim(name)
            for name in self.audio_back_fork.output_names]
        self.phoneme_fwd_fork.input_dim = 2 * self.embedding_dim
        self.phoneme_fwd_fork.output_dims = [
            self.phoneme_embedding.children[0].get_dim(name)
            for name in self.phoneme_fwd_fork.output_names]
        self.phoneme_back_fork.input_dim = 2 * self.embedding_dim
        self.phoneme_back_fork.output_dims = [
            self.phoneme_embedding.children[1].get_dim(name)
            for name in self.phoneme_back_fork.output_names]
        self.words_fwd_fork.input_dim = 2 * self.embedding_dim
        self.words_fwd_fork.output_dims = [
            self.words_embedding.children[0].get_dim(name)
            for name in self.words_fwd_fork.output_names]
        self.words_back_fork.input_dim = 2 * self.embedding_dim
        self.words_back_fork.output_dims = [
            self.words_embedding.children[1].get_dim(name)
            for name in self.words_back_fork.output_names]

    @application(inputs=['audio', 'audio_mask',
                         'phones_words_acoustic_ends',
                         'phones_words_acoustic_ends_mask',
                         'phoneme_words_ends', 'phoneme_words_ends_mask'],
                 outputs=['representation'])
    def apply(self, audio, audio_mask,
              phones_words_acoustic_ends, phones_words_acoustic_ends_mask,
              phoneme_words_ends, phoneme_words_ends_mask):
        batch_size = audio.shape[0]
        audio = audio.dimshuffle(1, 0, 2)
        audio_mask = audio_mask.dimshuffle(1, 0)
        audio_embeddings = self.audio_embedding.apply(
            merge(self.audio_fwd_fork.apply(audio, as_dict=True),
                  {'mask': audio_mask}),
            merge(self.audio_back_fork.apply(audio, as_dict=True),
                  {'mask': audio_mask}))

        rows = tensor.arange(batch_size).reshape((batch_size, 1))
        # Select frame-level states at phoneme boundaries.
        phoneme_embeddings = audio_embeddings.dimshuffle(1, 0, 2)[
            rows, phones_words_acoustic_ends].dimshuffle(1, 0, 2)
        phones_words_acoustic_ends_mask = \
            phones_words_acoustic_ends_mask.dimshuffle(1, 0)
        words_embeddings = self.phoneme_embedding.apply(
            merge(self.phoneme_fwd_fork.apply(phoneme_embeddings,
                                              as_dict=True),
                  {'mask': phones_words_acoustic_ends_mask}),
            merge(self.phoneme_back_fork.apply(phoneme_embeddings,
                                               as_dict=True),
                  {'mask': phones_words_acoustic_ends_mask}))

        # Select phoneme-level states at word boundaries.
        words_embeddings = words_embeddings.dimshuffle(1, 0, 2)[
            rows, phoneme_words_ends].dimshuffle(1, 0, 2)
        phoneme_words_ends_mask = phoneme_words_ends_mask.dimshuffle(1, 0)
        # The word-level encoder runs over the word embeddings, whose
        # sequence length matches phoneme_words_ends_mask.
        representation = self.words_embedding.apply(
            merge(self.words_fwd_fork.apply(words_embeddings, as_dict=True),
                  {'mask': phoneme_words_ends_mask}),
            merge(self.words_back_fork.apply(words_embeddings, as_dict=True),
                  {'mask': phoneme_words_ends_mask}))
        return representation
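# A small numpy illustration (assumed shapes, random data) of the indexing
# trick used in `apply` above: a (batch, 1) column of row indices broadcast
# against a (batch, n_ends) matrix of end positions picks, for every
# sequence in the batch, the encoder states at segment boundaries, turning
# frame-level states into phoneme-level (and then word-level) embeddings.
import numpy

batch_size, time_steps, dim, n_ends = 3, 12, 4, 5
states = numpy.random.rand(batch_size, time_steps, dim)   # batch-major states
ends = numpy.random.randint(0, time_steps, size=(batch_size, n_ends))
rows = numpy.arange(batch_size).reshape((batch_size, 1))  # broadcasts over ends
segment_states = states[rows, ends]                       # (batch, n_ends, dim)
assert segment_states.shape == (batch_size, n_ends, dim)
assert numpy.allclose(segment_states[1, 2], states[1, ends[1, 2]])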
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology. For compatibility
    with the previous version, we call it Decimator.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim,
                 dgru_depth, **kwargs):
        super(Decimator, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.dgru_depth = dgru_depth
        self.lookup = LookupTable(name='embeddings')
        # Character-level representation.
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)
        # Importance of this representation.
        self.bidir_w = Bidirectional(
            RecurrentWithFork(
                DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
                self.embedding_dim, name='src_word_with_fork'),
            name='bidir_src_word_encoder')
        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')
        # Map to an energy scalar.
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [
            self.lookup, self.dgru, self.gru_fork, self.bidir_w, self.wl]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as the first dimension.
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        if name == 'output':
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
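# Sketch (numpy, illustrative shapes and a hypothetical segmentation) of how
# the Decimator pools character states into word vectors: character GRU
# states are scaled by a learned positive weight per character, then a
# per-word selection matrix `sample_matrix` pools the characters belonging
# to each word via a batched matrix product, mirroring tensor.batched_dot
# in `apply` above.
import numpy

batch, chars, words, dim = 2, 6, 3, 4
gru_out = numpy.random.rand(chars, batch, dim)             # time-major, like the brick output
weights = numpy.exp(numpy.random.randn(chars, batch, 1))   # importance of each character
weighted = weights * gru_out
sample_matrix = numpy.zeros((batch, words, chars))
sample_matrix[:, 0, 0:2] = 0.5  # word 0 = mean of chars 0-1 (hypothetical)
sample_matrix[:, 1, 2:4] = 0.5
sample_matrix[:, 2, 4:6] = 0.5
# batched dot: (batch, words, chars) x (batch, chars, dim) -> (batch, words, dim)
word_repr = numpy.tanh(
    numpy.einsum('bwc,bcd->bwd', sample_matrix, weighted.transpose(1, 0, 2)))
print(word_repr.shape)  # (2, 3, 4)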
class Parrot(Initializable, Random): def __init__( self, input_dim=420, # Dimension of the text labels output_dim=63, # Dimension of vocoder fram rnn_h_dim=1024, # Size of rnn hidden state readouts_dim=1024, # Size of readouts (summary of rnn) weak_feedback=False, # Feedback to the top rnn layer full_feedback=False, # Feedback to all rnn layers feedback_noise_level=None, # Amount of noise in feedback layer_norm=False, # Use simple normalization? use_speaker=False, # Condition on the speaker id? num_speakers=21, # How many speakers there are? speaker_dim=128, # Size of speaker embedding which_cost='MSE', # Train with MSE or GMM k_gmm=20, # How many components in the GMM sampling_bias=0, # Make samples more likely (Graves13) epsilon=1e-5, # Numerical stabilities num_characters=43, # how many chars in the labels attention_type='graves', # graves or softmax attention_size=10, # number of gaussians in the attention attention_alignment=1., # audio steps per letter at initialization sharpening_coeff=1., timing_coeff=1., encoder_type=None, encoder_dim=128, **kwargs): super(Parrot, self).__init__(**kwargs) self.input_dim = input_dim self.output_dim = output_dim self.rnn_h_dim = rnn_h_dim self.readouts_dim = readouts_dim self.layer_norm = layer_norm self.which_cost = which_cost self.use_speaker = use_speaker self.full_feedback = full_feedback self.feedback_noise_level = feedback_noise_level self.epsilon = epsilon self.num_characters = num_characters self.attention_type = attention_type self.attention_alignment = attention_alignment self.attention_size = attention_size self.sharpening_coeff = sharpening_coeff self.timing_coeff = timing_coeff self.encoder_type = encoder_type self.encoder_dim = encoder_dim self.encoded_input_dim = input_dim if self.encoder_type == 'bidirectional': self.encoded_input_dim = 2 * encoder_dim if self.feedback_noise_level is not None: self.noise_level_var = tensor.scalar('feedback_noise_level') self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1') self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2') self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3') self.h1_to_readout = Linear(input_dim=rnn_h_dim, output_dim=readouts_dim, name='h1_to_readout') self.h2_to_readout = Linear(input_dim=rnn_h_dim, output_dim=readouts_dim, name='h2_to_readout') self.h3_to_readout = Linear(input_dim=rnn_h_dim, output_dim=readouts_dim, name='h3_to_readout') self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h2') self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h3') self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h2_to_h3') if which_cost == 'MSE': self.readout_to_output = Linear(input_dim=readouts_dim, output_dim=output_dim, name='readout_to_output') elif which_cost == 'GMM': self.sampling_bias = sampling_bias self.k_gmm = k_gmm self.readout_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=readouts_dim, output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm], name='readout_to_output') self.encoder = Encoder(encoder_type, num_characters, input_dim, encoder_dim, name='encoder') self.children = [ self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout, self.h2_to_readout, self.h3_to_readout, self.h1_to_h2, self.h1_to_h3, self.h2_to_h3, self.readout_to_output ] self.inp_to_h1 = 
Fork(output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h1') self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h2') self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h3') self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3] self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'], input_dim=rnn_h_dim, output_dims=[attention_size] * 3, name='h1_to_att') self.att_to_readout = Linear(input_dim=self.encoded_input_dim, output_dim=readouts_dim, name='att_to_readout') self.children += [self.h1_to_att, self.att_to_readout] if use_speaker: self.num_speakers = num_speakers self.speaker_dim = speaker_dim self.embed_speaker = LookupTable(num_speakers, speaker_dim) self.speaker_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h1') self.speaker_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h2') self.speaker_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h3') self.speaker_to_readout = Linear(input_dim=speaker_dim, output_dim=readouts_dim, name='speaker_to_readout') if which_cost == 'MSE': self.speaker_to_output = Linear(input_dim=speaker_dim, output_dim=output_dim, name='speaker_to_output') elif which_cost == 'GMM': self.speaker_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=speaker_dim, output_dims=[ output_dim * k_gmm, output_dim * k_gmm, k_gmm ], name='speaker_to_output') self.children += [ self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2, self.speaker_to_h3, self.speaker_to_readout, self.speaker_to_output ] if full_feedback: self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h2') self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h3') self.children += [self.out_to_h2, self.out_to_h3] weak_feedback = True self.weak_feedback = weak_feedback if weak_feedback: self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h1') self.children += [self.out_to_h1] def _allocate(self): self.initial_w = shared_floatx_zeros((self.encoded_input_dim, ), name="initial_w") add_role(self.initial_w, INITIAL_STATE) def symbolic_input_variables(self): features = tensor.tensor3('features') features_mask = tensor.matrix('features_mask') labels = tensor.imatrix('labels') labels_mask = tensor.matrix('labels_mask') start_flag = tensor.scalar('start_flag') if self.use_speaker: speaker = tensor.imatrix('speaker_index') else: speaker = None return features, features_mask, labels, labels_mask, \ speaker, start_flag def initial_states(self, batch_size): initial_h1 = self.rnn1.initial_states(batch_size) initial_h2 = self.rnn2.initial_states(batch_size) initial_h3 = self.rnn3.initial_states(batch_size) last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h3 = shared_floatx_zeros((batch_size, 
self.rnn_h_dim)) # Defining for all initial_k = tensor.zeros((batch_size, self.attention_size), dtype=floatX) last_k = shared_floatx_zeros((batch_size, self.attention_size)) # Trainable initial state for w. Why not for k? initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0) last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim)) return initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k @application def compute_cost(self, features, features_mask, labels, labels_mask, speaker, start_flag, batch_size): if speaker is None: assert not self.use_speaker target_features = features[1:] mask = features_mask[1:] cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim) gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.weak_feedback: input_features = features[:-1] if self.feedback_noise_level: noise = self.theano_rng.normal(size=input_features.shape, avg=0., std=1.) input_features += self.noise_level_var * noise out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features) to_normalize = [out_cell_h1, out_gat_h1] out_cell_h1, out_gat_h1 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += out_cell_h1 gat_h1 += out_gat_h1 if self.full_feedback: assert self.weak_feedback out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features) out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features) to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3] out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2 += out_cell_h2 gat_h2 += out_gat_h2 cell_h3 += out_cell_h3 gat_h3 += out_gat_h3 if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3 ] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 = spk_cell_h1 + cell_h1 cell_h2 = spk_cell_h2 + cell_h2 cell_h3 = spk_cell_h3 + cell_h3 gat_h1 = spk_gat_h1 + gat_h1 gat_h2 = spk_gat_h2 + gat_h2 gat_h3 = spk_gat_h3 + gat_h3 initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(batch_size) # If it's a new example, use initial states. 
input_h1 = tensor.switch(start_flag, initial_h1, last_h1) input_h2 = tensor.switch(start_flag, initial_h2, last_h2) input_h3 = tensor.switch(start_flag, initial_h3, last_h3) input_w = tensor.switch(start_flag, initial_w, last_w) input_k = tensor.switch(start_flag, initial_k, last_k) context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX), 2) def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh): attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) inp_h1_t += attinp_h1 gat_h1_t += attgat_h1 h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) + self.epsilon k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t) a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) inp_h2_t += attinp_h2 gat_h2_t += attgat_h2 inp_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_ (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan( fn=step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[context_oh], outputs_info=[ input_h1, input_h2, input_h3, input_k, input_w, None, None ]) h1_out = self.h1_to_readout.apply(h1) h2_out = self.h2_to_readout.apply(h2) h3_out = self.h3_to_readout.apply(h3) to_normalize = [h1_out, h2_out, h3_out] h1_out, h2_out, h3_out = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readouts = h1_out + h2_out + h3_out if self.use_speaker: readouts += self.speaker_to_readout.apply(emb_speaker) readouts += self.att_to_readout.apply(w) predicted = self.readout_to_output.apply(readouts) if self.which_cost == 'MSE': if self.use_speaker: predicted += self.speaker_to_output.apply(emb_speaker) cost = tensor.sum((predicted - target_features)**2, axis=-1) next_x = predicted # Dummy value for coeff coeff = predicted elif self.which_cost == 'GMM': mu, sigma, coeff = predicted if self.use_speaker: spk_to_out = self.speaker_to_output.apply(emb_speaker) mu += spk_to_out[0] sigma += spk_to_out[1] coeff += spk_to_out[2] # When training there should not be sampling_bias sigma = tensor.exp(sigma) + self.epsilon coeff = 
tensor.nnet.softmax(coeff.reshape( (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon cost = cost_gmm(target_features, mu, sigma, coeff) next_x = sample_gmm(mu, sigma, coeff, self.theano_rng) cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag updates = [] updates.append((last_h1, h1[-1])) updates.append((last_h2, h2[-1])) updates.append((last_h3, h3[-1])) updates.append((last_k, k[-1])) updates.append((last_w, w[-1])) attention_vars = [next_x, k, w, coeff, phi, pi_att] return cost, scan_updates + updates, attention_vars @application def sample_model_fun(self, labels, labels_mask, speaker, num_samples, seq_size): initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(num_samples) initial_x = numpy.zeros((num_samples, self.output_dim), dtype=floatX) cell_shape = (seq_size, num_samples, self.rnn_h_dim) gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) # Applied before the broadcast. spk_readout = self.speaker_to_readout.apply(emb_speaker) spk_output = self.speaker_to_output.apply(emb_speaker) # Add dimension to repeat with time. emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3 ] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += spk_cell_h1 cell_h2 += spk_cell_h2 cell_h3 += spk_cell_h3 gat_h1 += spk_gat_h1 gat_h2 += spk_gat_h2 gat_h3 += spk_gat_h3 context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX), 2) def sample_step(inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1): cell_h1_t = inp_cell_h1_t cell_h2_t = inp_cell_h2_t cell_h3_t = inp_cell_h3_t gat_h1_t = inp_gat_h1_t gat_h2_t = inp_gat_h2_t gat_h3_t = inp_gat_h3_t attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) cell_h1_t += attinp_h1 gat_h1_t += attgat_h1 if self.weak_feedback: out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1) to_normalize = [out_cell_h1_t, out_gat_h1_t] out_cell_h1_t, out_gat_h1_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1_t += out_cell_h1_t gat_h1_t += out_gat_h1_t if self.full_feedback: out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1) out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1) to_normalize = [ out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t ] out_cell_h2_t, out_gat_h2_t, \ out_cell_h3_t, out_gat_h3_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2_t += out_cell_h2_t cell_h3_t += out_cell_h3_t gat_h2_t += out_gat_h2_t gat_h3_t += out_gat_h3_t h1_t = self.rnn1.apply(cell_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + 
self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon k_t = k_tm1 + self.attention_alignment * \ tensor.exp(k_t) / self.timing_coeff a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) cell_h2_t += attinp_h2 gat_h2_t += attgat_h2 cell_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply(cell_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply(cell_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) h1_out_t = self.h1_to_readout.apply(h1_t) h2_out_t = self.h2_to_readout.apply(h2_t) h3_out_t = self.h3_to_readout.apply(h3_t) to_normalize = [h1_out_t, h2_out_t, h3_out_t] h1_out_t, h2_out_t, h3_out_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readout_t = h1_out_t + h2_out_t + h3_out_t readout_t += self.att_to_readout.apply(w_t) if self.use_speaker: readout_t += spk_readout output_t = self.readout_to_output.apply(readout_t) if self.which_cost == 'MSE': predicted_x_t = output_t if self.use_speaker: predicted_x_t += spk_output # Dummy value for coeff_t coeff_t = predicted_x_t elif self.which_cost == "GMM": mu_t, sigma_t, coeff_t = output_t if self.use_speaker: mu_t += spk_output[0] sigma_t += spk_output[1] coeff_t += spk_output[2] sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \ self.epsilon coeff_t = tensor.nnet.softmax( coeff_t.reshape( (-1, self.k_gmm)) * (1. 
+ self.sampling_bias)).reshape( coeff_t.shape) + self.epsilon predicted_x_t = sample_gmm(mu_t, sigma_t, coeff_t, self.theano_rng) return predicted_x_t, h1_t, h2_t, h3_t, \ k_t, w_t, coeff_t, phi_t, a_t_ (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan( fn=sample_step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[], outputs_info=[ initial_x, initial_h1, initial_h2, initial_h3, initial_k, initial_w, None, None, None ]) return sample_x, k, w, pi, phi, pi_att, updates def sample_model(self, labels_tr, labels_mask_tr, features_mask_tr, speaker_tr, num_samples, num_steps): features, features_mask, labels, labels_mask, speaker, start_flag = \ self.symbolic_input_variables() sample_x, k, w, pi, phi, pi_att, updates = \ self.sample_model_fun( labels, labels_mask, speaker, num_samples, num_steps) theano_inputs = [labels, labels_mask] numpy_inputs = (labels_tr, labels_mask_tr) if self.use_speaker: theano_inputs += [speaker] numpy_inputs += (speaker_tr, ) return function(theano_inputs, [sample_x, k, w, pi, phi, pi_att], updates=updates)(*numpy_inputs) def sample_using_input(self, data_tr, num_samples): # Used to predict the values using the dataset features, features_mask, labels, labels_mask, speaker, start_flag = \ self.symbolic_input_variables() cost, updates, attention_vars = self.compute_cost( features, features_mask, labels, labels_mask, speaker, start_flag, num_samples) sample_x, k, w, pi, phi, pi_att = attention_vars theano_vars = [ features, features_mask, labels, labels_mask, speaker, start_flag ] theano_vars = [x for x in theano_vars if x is not None] theano_vars = list(set(theano_vars)) theano_vars = {x.name: x for x in theano_vars} theano_inputs = [] numpy_inputs = [] for key in data_tr.keys(): theano_inputs.append(theano_vars[key]) numpy_inputs.append(data_tr[key]) return function(theano_inputs, [sample_x, k, w, pi, phi, pi_att], updates=updates)(*numpy_inputs)
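# Both Parrot variants rely on external cost_gmm / sample_gmm helpers that
# are imported elsewhere. Below is a hedged numpy sketch of what a diagonal
# GMM output layer typically computes (Graves 2013): per-component means mu,
# scales sigma and mixture weights coeff, a negative log-likelihood for
# training, and ancestral sampling for generation. Shapes and the exact
# factorization in the real helpers may differ.
import numpy


def gmm_nll(x, mu, sigma, coeff):
    """x: (batch, dim); mu, sigma: (batch, dim, k); coeff: (batch, k)."""
    z = (x[:, :, None] - mu) / sigma
    log_comp = -0.5 * (z ** 2) - numpy.log(sigma) - 0.5 * numpy.log(2 * numpy.pi)
    log_comp = log_comp.sum(axis=1) + numpy.log(coeff)       # (batch, k)
    m = log_comp.max(axis=1, keepdims=True)                  # log-sum-exp trick
    return -(m[:, 0] + numpy.log(numpy.exp(log_comp - m).sum(axis=1)))


def gmm_sample(mu, sigma, coeff, rng):
    idx = [rng.choice(coeff.shape[1], p=p) for p in coeff]   # pick a component
    comp_mu = mu[numpy.arange(len(idx)), :, idx]
    comp_sigma = sigma[numpy.arange(len(idx)), :, idx]
    return comp_mu + comp_sigma * rng.randn(*comp_mu.shape)


rng = numpy.random.RandomState(0)
batch, dim, k = 4, 63, 20
mu = rng.randn(batch, dim, k)
sigma = numpy.exp(rng.randn(batch, dim, k))
coeff = rng.dirichlet(numpy.ones(k), size=batch)
x = rng.randn(batch, dim)
print(gmm_nll(x, mu, sigma, coeff).shape,
      gmm_sample(mu, sigma, coeff, rng).shape)  # (4,) (4, 63)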
def build_model_soft(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence( [lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] # Build the MLP dims = [2 * state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear( input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared( numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] 
# gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates, gate_values
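# build_model_soft carries hidden state across mini-batches with theano
# shared variables and an explicit updates list (truncated BPTT). A minimal
# self-contained sketch of that pattern, independent of the model above;
# the tanh step is a stand-in for one RNN transition:
import numpy
import theano
import theano.tensor as tensor

floatX = theano.config.floatX
batch, dim = 4, 8
state0 = theano.shared(numpy.zeros((batch, dim), dtype=floatX), name='state0')
x = tensor.matrix('x')
new_state = tensor.tanh(x + state0)        # stand-in for one RNN step
f = theano.function([x], new_state, updates=[(state0, new_state)])
f(numpy.ones((batch, dim), dtype=floatX))  # state0 now holds the last state
print(state0.get_value().mean())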
class BidiRNN(Initializable): @lazy() def __init__(self, config, output_dim=2, **kwargs): super(BidiRNN, self).__init__(**kwargs) self.config = config self.context_embedder = ContextEmbedder(config) act = config.rec_activation() if hasattr(config, 'rec_activation') else None self.rec = SegregatedBidirectional(LSTM(dim=config.hidden_state_dim, activation=act, name='recurrent')) self.fwd_fork = Fork([name for name in self.rec.prototype.apply.sequences if name!='mask'], prototype=Linear(), name='fwd_fork') self.bkwd_fork = Fork([name for name in self.rec.prototype.apply.sequences if name!='mask'], prototype=Linear(), name='bkwd_fork') rto_in = config.hidden_state_dim * 2 + sum(x[2] for x in config.dim_embeddings) self.rec_to_output = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], dims=[rto_in] + config.dim_hidden + [output_dim]) self.sequences = ['latitude', 'latitude_mask', 'longitude'] self.inputs = self.sequences + self.context_embedder.inputs self.children = [ self.context_embedder, self.fwd_fork, self.bkwd_fork, self.rec, self.rec_to_output ] def _push_allocation_config(self): for i, fork in enumerate([self.fwd_fork, self.bkwd_fork]): fork.input_dim = 2 fork.output_dims = [ self.rec.children[i].get_dim(name) for name in fork.output_names ] def _push_initialization_config(self): for brick in [self.fwd_fork, self.bkwd_fork, self.rec, self.rec_to_output]: brick.weights_init = self.config.weights_init brick.biases_init = self.config.biases_init def process_outputs(self, outputs): pass # must be implemented in child class @application(outputs=['destination']) def predict(self, latitude, longitude, latitude_mask, **kwargs): latitude = (latitude.T - data.train_gps_mean[0]) / data.train_gps_std[0] longitude = (longitude.T - data.train_gps_mean[1]) / data.train_gps_std[1] latitude_mask = latitude_mask.T rec_in = tensor.concatenate((latitude[:, :, None], longitude[:, :, None]), axis=2) last_id = tensor.cast(latitude_mask.sum(axis=0) - 1, dtype='int64') path = self.rec.apply(merge(self.fwd_fork.apply(rec_in, as_dict=True), {'mask': latitude_mask}), merge(self.bkwd_fork.apply(rec_in, as_dict=True), {'mask': latitude_mask}))[0] path_representation = (path[0][:, -self.config.hidden_state_dim:], path[last_id - 1, tensor.arange(latitude_mask.shape[1])] [:, :self.config.hidden_state_dim]) embeddings = tuple(self.context_embedder.apply( **{k: kwargs[k] for k in self.context_embedder.inputs })) inputs = tensor.concatenate(path_representation + embeddings, axis=1) outputs = self.rec_to_output.apply(inputs) return self.process_outputs(outputs) @predict.property('inputs') def predict_inputs(self): return self.inputs @application(outputs=['cost']) def cost(self, **kwargs): y_hat = self.predict(**kwargs) y = tensor.concatenate((kwargs['destination_latitude'][:, None], kwargs['destination_longitude'][:, None]), axis=1) return error.erdist(y_hat, y).mean() @cost.property('inputs') def cost_inputs(self): return self.inputs + ['destination_latitude', 'destination_longitude']
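# The BidiRNN summary concatenates the backward half of the state at t=0
# (which has seen the whole trip) with the forward half at the last valid
# step of each variable-length sequence. A numpy sketch of that read-out,
# under the assumed layout (time, batch, 2*hidden) with [:hidden] forward
# and [hidden:] backward; lengths and data are made up:
import numpy

time_steps, batch, hidden = 9, 3, 5
path = numpy.random.rand(time_steps, batch, 2 * hidden)
lengths = numpy.array([9, 4, 7])                # valid steps per sequence
last_id = lengths - 1
backward_summary = path[0, :, hidden:]          # backward state at t=0
forward_summary = path[last_id, numpy.arange(batch), :hidden]
representation = numpy.concatenate([backward_summary, forward_summary],
                                   axis=1)
print(representation.shape)  # (3, 10)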