class LookupFeedback(AbstractFeedback, Initializable): """A feedback brick for the case when readouts are integers. Stores and retrieves distributed representations of integers. Notes ----- Currently works only with lazy initialization (cannot be initialized with a single constructor call). """ def __init__(self, num_outputs=None, feedback_dim=None, **kwargs): super(LookupFeedback, self).__init__(**kwargs) self.num_outputs = num_outputs self.feedback_dim = feedback_dim self.lookup = LookupTable(num_outputs, feedback_dim, weights_init=self.weights_init) self.children = [self.lookup] def _push_allocation_config(self): self.lookup.length = self.num_outputs self.lookup.dim = self.feedback_dim @application def feedback(self, outputs): assert self.output_dim == 0 return self.lookup.apply(outputs) def get_dim(self, name): if name == 'feedback': return self.feedback_dim return super(LookupFeedback, self).get_dim(name)
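The Notes above refer to Blocks' lazy initialization: a brick can be built without sizes, which are set later (exactly what _push_allocation_config does). A minimal standalone sketch of that pattern with a bare LookupTable (sizes and names here are illustrative):

import numpy
import theano
from theano import tensor
from blocks.bricks.lookup import LookupTable
from blocks.initialization import IsotropicGaussian

lookup = LookupTable(weights_init=IsotropicGaussian(0.01))
lookup.length = 6   # set lazily, mirroring _push_allocation_config
lookup.dim = 4
lookup.initialize()  # allocates and initializes W

outputs = tensor.lmatrix('outputs')
feedback = lookup.apply(outputs)
f = theano.function([outputs], feedback)
print(f(numpy.array([[0, 5], [2, 3]], dtype='int64')).shape)  # (2, 2, 4)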
class topicalq_transformer(Initializable): def __init__(self, vocab_size, topical_embedding_dim, state_dim, word_num, batch_size, **kwargs): super(topicalq_transformer, self).__init__(**kwargs) self.vocab_size = vocab_size self.word_embedding_dim = topical_embedding_dim self.state_dim = state_dim self.word_num = word_num self.batch_size = batch_size self.look_up = LookupTable(name='topical_embeddings') self.transformer = MLP(activations=[Tanh()], dims=[self.word_embedding_dim * self.word_num, self.state_dim], name='topical_transformer') self.children = [self.look_up, self.transformer] def _push_allocation_config(self): self.look_up.length = self.vocab_size self.look_up.dim = self.word_embedding_dim # unclear whether an explicit push_config is needed here @application(inputs=['source_topical_word_sequence'], outputs=['topical_embedding']) def apply(self, source_topical_word_sequence): # Time as first dimension source_topical_word_sequence = source_topical_word_sequence.T word_topical_embeddings = self.look_up.apply(source_topical_word_sequence) word_topical_embeddings = word_topical_embeddings.swapaxes(0, 1) # requires testing concatenated_topical_embeddings = tensor.reshape(word_topical_embeddings, [word_topical_embeddings.shape[0], word_topical_embeddings.shape[1] * word_topical_embeddings.shape[2]]) topical_embedding = self.transformer.apply(concatenated_topical_embeddings) return topical_embedding
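A hedged usage sketch for the brick above: the sizes and initializers are made up, and the brick is assumed to be importable from this module. It maps a (batch, word_num) matrix of topic-word ids to a (batch, state_dim) summary via one lookup and an MLP:

from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

transformer = topicalq_transformer(
    vocab_size=1000, topical_embedding_dim=8, state_dim=16,
    word_num=5, batch_size=32,
    weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
transformer.initialize()

topic_words = tensor.imatrix('source_topical_word_sequence')  # (batch, word_num)
topic_vector = transformer.apply(topic_words)                 # (batch, state_dim)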
def build_model(self, x, config): logger.info('building %s model for: %s ', self.nn_model, self.name) vocabsize = self.get_vocab_size() logger.info('%s vocab size is: %d', self.name, vocabsize) self.embeddings, self.dim_emb = self.get_embeddings() if self.tune_tune: logger.info('%s lookuptable with size (%d, %d) will be tuned.', self.name, vocabsize, self.dim_emb) lookup = LookupTable(length=vocabsize, dim=self.dim_emb) lookup.allocate() # add_role(lookup.W, WEIGHT) lookup.W.name = 'lt.W' else: logger.info('%s lookuptable with size (%d, %d) will NOT be tuned.', self.name, vocabsize, self.dim_emb) lookup = MyLookupTable(length=vocabsize, dim=self.dim_emb) lookup.allocate() lookup.name = self.name + 'lookuptable' lookup.W.set_value(self.embeddings) xemb = lookup.apply(x) xemb = debug_print(xemb, 'xemb', False) if 'cnn' in self.nn_model: logger.info('CNN') feature_vec, feature_vec_len = create_cnn_general(xemb, self.dim_emb, self.max_len, config, self.name) elif self.nn_model == 'lstm': feature_vec, feature_vec_len = create_lstm(xemb, self.dim_emb, False, config, self.name) elif self.nn_model == 'bilstm': feature_vec, feature_vec_len = create_lstm(xemb, self.dim_emb, True, config, self.name) elif self.nn_model == 'rnn': feature_vec, feature_vec_len = create_rnn(xemb, self.dim_emb, config, self.name) elif self.nn_model == 'ff': feature_vec, feature_vec_len = create_ff(xemb, self.dim_emb, self.max_len, config) elif self.nn_model == 'mean': feature_vec, feature_vec_len = create_mean(xemb, self.dim_emb, self.max_len, config) return feature_vec, feature_vec_len
def test_lookup_table(): lt = LookupTable(5, 3) lt.allocate() lt.W.set_value(numpy.arange(15).reshape(5, 3).astype(theano.config.floatX)) x = tensor.lmatrix("x") y = lt.apply(x) f = theano.function([x], [y]) x_val = [[1, 2], [0, 3]] desired = numpy.array([[[3, 4, 5], [6, 7, 8]], [[0, 1, 2], [9, 10, 11]]], dtype=theano.config.floatX) assert_equal(f(x_val)[0], desired) # Test get_dim assert_equal(lt.get_dim(lt.apply.inputs[0]), 0) assert_equal(lt.get_dim(lt.apply.outputs[0]), lt.dim) assert_raises(ValueError, lt.get_dim, 'random_name') # Test feedforward interface assert lt.input_dim == 0 assert lt.output_dim == 3 lt.output_dim = 4 assert lt.output_dim == 4 def assign_input_dim(): lt.input_dim = 11 assert_raises(ValueError, assign_input_dim) lt.input_dim = 0
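What the test pins down is plain fancy indexing: lt.apply(x) computes W[x], appending the embedding axis. The same check expressed directly in numpy (conceptually reusing f and the weights from the test above):

import numpy
W = numpy.arange(15).reshape(5, 3)
x_val = numpy.array([[1, 2], [0, 3]], dtype='int64')
assert (f(x_val)[0] == W[x_val]).all()  # lookup == numpy fancy indexing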
class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='embeddings') self.bidir = NewBidirectional( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='fwd_fork') self.back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='back_fork') self.children = [ self.lookup, self.bidir, self.fwd_fork, self.back_fork ] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension. source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) representation = self.bidir.apply( # Conversion to embedding representation here. merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask})) self.representation = representation return representation
class LookupFeedback(AbstractFeedback, Initializable): """A feedback brick for the case when readouts are integers. Stores and retrieves distributed representations of integers. """ def __init__(self, num_outputs=None, feedback_dim=None, **kwargs): self.num_outputs = num_outputs self.feedback_dim = feedback_dim self.lookup = LookupTable(num_outputs, feedback_dim) children = [self.lookup] kwargs.setdefault('children', []).extend(children) super(LookupFeedback, self).__init__(**kwargs) def _push_allocation_config(self): self.lookup.length = self.num_outputs self.lookup.dim = self.feedback_dim @application def feedback(self, outputs): assert self.output_dim == 0 return self.lookup.apply(outputs) def get_dim(self, name): if name == 'feedback': return self.feedback_dim return super(LookupFeedback, self).get_dim(name)
class CompositionalLayerToyWithTables(Initializable): def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size, subword_RNN_hidden_state_size, **kwargs): super(CompositionalLayerToyWithTables, self).__init__(**kwargs) self.batch_size = batch_size self.num_subwords = num_subwords # number of subwords which make up a word self.num_words = num_words # number of words in the sentence self.subword_embedding_size = subword_embedding_size self.input_vocab_size = input_vocab_size self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size # create the look up table self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup') self.lookup.weights_init = Uniform(width=0.08) self.lookup.biases_init = Constant(0) # has one RNN which reads the subwords into a word embedding self.compositional_subword_to_word_RNN = SimpleRecurrent( dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN', weights_init=Identity_init()) self.children = [self.lookup, self.compositional_subword_to_word_RNN] ''' subword_id_input_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint16 or equivalent subword_id_input_mask_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint8 or equivalent and has binary values of 1 when there is data and zero otherwise. The look up table will return a 4d tensor with shape = (num_words, num_subwords, batch_size, embedding size) The RNN will eat up the subwords dimension, resulting in a 3d tensor of shape = (num_words, batch_size, RNN_hidden_value_size), which is returned as 'word_embeddings' Also returned is a 2d tensor of shape = (num_words, batch_size), which is the remaining mask indicating the length of the sentence for each sentence in the batch, i.e., 1 when there is a word, 0 otherwise. ''' @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_mask']) def apply(self, subword_id_input_, subword_id_input_mask_): ## shape = (num_words, num_subwords, batch_size, embedding size) subword_embeddings = self.lookup.apply(subword_id_input_) result, updates = theano.scan( # loop over each word and have the RNN eat up the subwords fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN.apply(subword_embeddings, mask=subword_id_input_mask_), sequences=[subword_embeddings, subword_id_input_mask_]) word_embeddings = result.dimshuffle(1, 0, 2, 3) # put the states as the last dimension # remove this line to see the RNN states word_embeddings = word_embeddings[-1] # take only the last state, since we don't need the others # remove the subword dim from the mask # if the subword mask is empty then the word is empty; if not, the word is used word_embeddings_mask = subword_id_input_mask_.max(axis=1) return word_embeddings, word_embeddings_mask
def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512): self.hidden_size = hidden_size self.input1_size = input1_size self.input2_size = input2_size self.lookup1_dim = lookup1_dim self.lookup2_dim = lookup2_dim x1 = tensor.lmatrix('durations') x2 = tensor.lmatrix('syllables') y = tensor.lmatrix('pitches') lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1', weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup1.initialize() lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2', weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup2.initialize() merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) merge.initialize() recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3) recurrent_block.initialize() linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear.initialize() softmax = NDimensionalSoftmax() l1 = lookup1.apply(x1) l2 = lookup2.apply(x2) m = merge.apply(l1, l2) h = recurrent_block.apply(m) a = linear.apply(h) y_hat = softmax.apply(a, extra_ndim=1) # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D) self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean() self.ComputationGraph = ComputationGraph(self.Cost) self.Model = Model(y_hat)
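The commented ValueError above is what a plain Softmax raises on a 3-d input; NDimensionalSoftmax sidesteps it by flattening extra_ndim leading axes before the softmax and restoring them afterwards. A standalone sketch (shapes are made up):

import numpy
import theano
from theano import tensor
from blocks.bricks import NDimensionalSoftmax

a = tensor.tensor3('a')  # e.g. (time, batch, vocab)
softmax = NDimensionalSoftmax()
probs = softmax.apply(a, extra_ndim=1)  # flattens (time, batch) internally
f = theano.function([a], probs)
out = f(numpy.random.rand(7, 2, 5).astype(theano.config.floatX))
assert numpy.allclose(out.sum(axis=-1), 1.0)  # rows are proper distributions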
def create_model(self): input_dim = self.input_dim x = self.x y = self.y p = self.p mask = self.mask hidden_dim = self.hidden_dim embedding_dim = self.embedding_dim lookup = LookupTable(self.dict_size, embedding_dim, weights_init=IsotropicGaussian(0.001), name='LookupTable') x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(0.001), biases_init=Constant(0.0)) lstm = LSTM(hidden_dim, name='lstm', weights_init=IsotropicGaussian(0.001), biases_init=Constant(0.0)) h_to_o = MLP([Logistic()], [hidden_dim, 1], weights_init=IsotropicGaussian(0.001), biases_init=Constant(0), name='h_to_o') lookup.initialize() x_to_h.initialize() lstm.initialize() h_to_o.initialize() embed = lookup.apply(x).reshape( (x.shape[0], x.shape[1], self.embedding_dim)) embed.name = "embed_vec" x_transform = x_to_h.apply(embed.transpose(1, 0, 2)) x_transform.name = "Transformed X" self.lookup = lookup self.x_to_h = x_to_h self.lstm = lstm self.h_to_o = h_to_o #if mask is None: h, c = lstm.apply(x_transform) #else: #h, c = lstm.apply(x_transform, mask=mask) h.name = "hidden_state" c.name = "cell state" # only values of hidden units of the last timeframe are used for # the classification indices = T.sum(mask, axis=0) - 1 rel_hid = h[indices, T.arange(h.shape[1])] out = self.h_to_o.apply(rel_hid) probs = out return probs
def nn_fprop(x, y, vocab_size, hidden_size, num_layers, model): lookup = LookupTable(length=vocab_size, dim=hidden_size) initialize([lookup]) h = lookup.apply(x) for i in range(num_layers): if model == 'rnn': h = rnn_layer(hidden_size, h, i) if model == 'gru': h = gru_layer(hidden_size, h, i) if model == 'lstm': h = lstm_layer(hidden_size, h, i) return softmax_layer(h, y, vocab_size, hidden_size)
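rnn_layer, gru_layer, lstm_layer, softmax_layer and initialize are project helpers not shown here; one plausible minimal shape for rnn_layer, consistent with how it is called above and assuming the same initialize helper, might be (illustrative, not the original):

from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import SimpleRecurrent

def rnn_layer(dim, h, i):
    # Project the incoming sequence, then run a simple recurrence over it.
    linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(i))
    rnn = SimpleRecurrent(dim=dim, activation=Tanh(), name='rnn' + str(i))
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))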
def create_rnn(hidden_dim, vocab_dim, mode="rnn"): # input x = tensor.imatrix('inchar') y = tensor.imatrix('outchar') # W = LookupTable( name="W1", #dim = hidden_dim*4, dim=hidden_dim, length=vocab_dim, weights_init=initialization.IsotropicGaussian(0.01), biases_init=initialization.Constant(0)) if mode == "lstm": # Long Short Term Memory H = LSTM(hidden_dim, name='H', weights_init=initialization.IsotropicGaussian(0.01), biases_init=initialization.Constant(0.0)) else: # recurrent history weight H = SimpleRecurrent( name="H", dim=hidden_dim, activation=Tanh(), weights_init=initialization.IsotropicGaussian(0.01)) # S = Linear(name="W2", input_dim=hidden_dim, output_dim=vocab_dim, weights_init=initialization.IsotropicGaussian(0.01), biases_init=initialization.Constant(0)) A = NDimensionalSoftmax(name="softmax") initLayers([W, H, S]) activations = W.apply(x) hiddens = H.apply(activations) #[0] activations2 = S.apply(hiddens) y_hat = A.apply(activations2, extra_ndim=1) cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean() cg = ComputationGraph(cost) #print VariableFilter(roles=[WEIGHT])(cg.variables) #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables) layers = (x, W, H, S, A, y) return cg, layers, y_hat, cost
class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='embeddings') self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='back_fork') self.children = [self.lookup, self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}) ) return representation
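A hedged construction sketch for this encoder: the 620/1000 sizes are only illustrative defaults, and BidirectionalWMT15 is assumed to come from the surrounding project.

from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

encoder = BidirectionalEncoder(vocab_size=30000, embedding_dim=620,
                               state_dim=1000)
encoder.weights_init = IsotropicGaussian(0.01)
encoder.biases_init = Constant(0)
encoder.initialize()  # pushes allocation/initialization config to children

source = tensor.lmatrix('source_sentence')           # (batch, time) word ids
source_mask = tensor.matrix('source_sentence_mask')  # (batch, time) 0/1
annotations = encoder.apply(source, source_mask)     # (time, batch, 2 * state_dim)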
class BidirectionalEncoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='words_embeddings') self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='words_fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='words_back_fork') self.children = [self.lookup, self.bidir, self.fwd_fork, self.back_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['words', 'words_mask'], outputs=['representation']) def apply(self, words, words_mask): # Time as first dimension words = words.T words_mask = words_mask.T embeddings = self.lookup.apply(words) representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': words_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': words_mask}) ) return representation
class Encoder(Initializable): def __init__( self, encoder_type, num_characters, input_dim, encoder_dim, **kwargs): assert encoder_type in [None, 'lookup', 'bidirectional'] self.encoder_type = encoder_type super(Encoder, self).__init__(**kwargs) self.children = [] if encoder_type in ['lookup', 'bidirectional']: self.embed_label = LookupTable( num_characters, input_dim, name='embed_label') self.children += [ self.embed_label] else: # If there is no encoder. assert num_characters == input_dim if encoder_type == 'bidirectional': transition = RecurrentWithFork( GatedRecurrent(dim=encoder_dim).apply, input_dim, name='encoder_transition') self.encoder = Bidirectional(transition, name='encoder') self.children.append(self.encoder) @application def apply(self, x, x_mask=None): if self.encoder_type is None: return x if self.encoder_type in ['lookup', 'bidirectional']: embed_x = self.embed_label.apply(x) if self.encoder_type == 'lookup': encoded_x = embed_x if self.encoder_type == 'bidirectional': encoded_x = self.encoder.apply(embed_x, x_mask) return encoded_x
class Encoder(Initializable): def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True, **kwargs): super(Encoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.reverse = reverse self.lookup = LookupTable(name='embeddings') self.transition = GatedRecurrent(Tanh(), name='encoder_transition') self.fork = Fork([ name for name in self.transition.apply.sequences if name != 'mask' ], prototype=Linear()) self.children = [self.lookup, self.transition, self.fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.transition.dim = self.state_dim self.fork.input_dim = self.embedding_dim self.fork.output_dims = [ self.state_dim for _ in self.fork.output_names ] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension source_sentence = source_sentence.dimshuffle(1, 0) source_sentence_mask = source_sentence_mask.T if self.reverse: source_sentence = source_sentence[::-1] source_sentence_mask = source_sentence_mask[::-1] embeddings = self.lookup.apply(source_sentence) representation = self.transition.apply( **merge(self.fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask})) return representation[-1]
def nn_fprop(x, x_mask, y, y_mask, lens, vocab_size, hidden_size, num_layers, model, boosting=False, **kwargs): lookup = LookupTable(length=vocab_size, dim=hidden_size) initialize([lookup]) h = lookup.apply(x) first = True for i in range(num_layers): if model == 'rnn': h = rnn_layer(hidden_size, h, i, x_mask=x_mask, first=first, **kwargs) elif model == 'gru': h = gru_layer(hidden_size, h, i, x_mask=x_mask, first=first, **kwargs) elif model == 'lstm': h = lstm_layer(hidden_size, h, i, x_mask=x_mask, first=first, **kwargs) else: print("model must be one of 'rnn', 'gru' or 'lstm'") sys.exit(1) first = False return softmax_layer(h, y, x_mask, y_mask, lens, vocab_size, hidden_size, boosting)
class Encoder(Initializable): """Encoder of RNNsearch model.""" def __init__(self, blockid, vocab_size, embedding_dim, state_dim, **kwargs): super(Encoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.blockid = blockid self.lookup = LookupTable(name='embeddings' + '_' + self.blockid) self.gru = GatedRecurrent(activation=Tanh(), dim=state_dim, name = "GatedRNN" + self.blockid) self.fwd_fork = Fork( [name for name in self.gru.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork' + '_' + self.blockid) self.children = [self.lookup, self.gru, self.fwd_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.gru.get_dim(name) for name in self.fwd_fork.output_names] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) grupara = merge( self.fwd_fork.apply(embeddings, as_dict=True) , {'mask': source_sentence_mask}) representation = self.gru.apply(**grupara) return representation
def construct_model(vocab_size, embedding_dim, ngram_order, hidden_dims, activations): # Construct the model x = tensor.lmatrix('features') y = tensor.lvector('targets') lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup') hidden = MLP(activations=activations + [None], dims=[ngram_order * embedding_dim] + hidden_dims + [vocab_size]) embeddings = lookup.apply(x) embeddings = embeddings.flatten(ndim=2) # Concatenate embeddings activations = hidden.apply(embeddings) cost = Softmax().categorical_cross_entropy(y, activations) # Initialize parameters lookup.weights_init = IsotropicGaussian(0.001) hidden.weights_init = IsotropicGaussian(0.01) hidden.biases_init = Constant(0.001) lookup.initialize() hidden.initialize() return cost
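A quick smoke test of construct_model on toy sizes (made-up data; the input variables are recovered from the graph by name since they are local to the function):

import numpy
import theano
from blocks.bricks import Tanh
from blocks.graph import ComputationGraph

cost = construct_model(vocab_size=10, embedding_dim=4, ngram_order=3,
                       hidden_dims=[8], activations=[Tanh()])
cg = ComputationGraph(cost)
x_var, y_var = [v for n in ('features', 'targets')
                for v in cg.inputs if v.name == n]
f = theano.function([x_var, y_var], cost)
features = numpy.random.randint(10, size=(5, 3)).astype('int64')  # (batch, ngram_order)
targets = numpy.random.randint(10, size=(5,)).astype('int64')
print(f(features, targets))  # scalar cross-entropy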
class Encoder(Initializable): def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs): super(Encoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.lookup = LookupTable(name='embeddings') self.GRU = GatedRecurrent(activation=Tanh(), dim=state_dim) self.children = [self.lookup, self.GRU] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation']) def apply(self, source_sentence, source_sentence_mask): source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) # NOTE: GatedRecurrent.apply expects `inputs` of width state_dim and `gate_inputs` of width 2 * state_dim, so passing the raw embeddings for both only works if those dimensions happen to line up; the mask is also unused here (compare the Fork-based encoders above) representation = self.GRU.apply(embeddings, embeddings) return representation
class TargetWordEncoder(Initializable): """Word encoder on the target side; uses a single RNN to map a character-level word to a vector""" def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth, **kwargs): super(TargetWordEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.dgru_state_dim = dgru_state_dim self.lookup = LookupTable(name='embeddings') self.dgru_depth = dgru_depth self.dgru = RecurrentStack([ DGRU(activation=Tanh(), dim=self.dgru_state_dim) for _ in range(dgru_depth) ], skip_connections=True) self.gru_fork = Fork( [name for name in self.dgru.apply.sequences if name != 'mask'], prototype=Linear(), name='gru_fork') self.children = [self.lookup, self.dgru, self.gru_fork] def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.gru_fork.input_dim = self.embedding_dim self.gru_fork.output_dims = [ self.dgru.get_dim(name) for name in self.gru_fork.output_names ] @application(inputs=['char_seq', 'sample_matrix', 'char_aux'], outputs=['representation']) def apply(self, char_seq, sample_matrix, char_aux): # Time as first dimension embeddings = self.lookup.apply(char_seq) gru_out = self.dgru.apply(**merge( self.gru_fork.apply(embeddings, as_dict=True), {'mask': char_aux})) if self.dgru_depth > 1: gru_out = gru_out[-1] sampled_representation = tensor.batched_dot( sample_matrix, gru_out.dimshuffle([1, 0, 2])) return sampled_representation.dimshuffle([1, 0, 2]) @application(inputs=['target_single_char']) def single_emit(self, target_single_char, batch_size, mask, states=None): # Time as first dimension # only one batch embeddings = self.lookup.apply(target_single_char) if states is None: states = self.dgru.initial_states(batch_size) states_dict = {'states': states[0]} for i in range(1, self.dgru_depth): states_dict['states' + RECURRENTSTACK_SEPARATOR + str(i)] = states[i] gru_out = self.dgru.apply(**merge( self.gru_fork.apply(embeddings, as_dict=True), states_dict, { 'mask': mask, 'iterate': False })) return gru_out @single_emit.property('outputs') def single_emit_outputs(self): return [ 'gru_out' + RECURRENTSTACK_SEPARATOR + str(i) for i in range(self.dgru_depth) ] def get_dim(self, name): if name in ['output', 'feedback']: return self.dgru_state_dim return super(TargetWordEncoder, self).get_dim(name)
class BidirectionalEncoder(Initializable): """A generalized version of the vanilla encoder of the RNNsearch model which supports different numbers of layers. Zero layers represent non-recurrent encoders. """ def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections, state_dim, **kwargs): """Sole constructor. Args: vocab_size (int): Source vocabulary size embedding_dim (int): Dimension of the embedding layer n_layers (int): Number of layers. Layers share the same weight matrices. skip_connections (bool): Skip connections connect the source word embeddings directly with deeper layers to propagate the gradient more efficiently state_dim (int): Number of hidden units in the recurrent layers. """ super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.n_layers = n_layers self.state_dim = state_dim self.skip_connections = skip_connections self.lookup = LookupTable(name='embeddings') if self.n_layers >= 1: self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='fwd_fork') self.back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='back_fork') self.children = [self.lookup, self.bidir, self.fwd_fork, self.back_fork] if self.n_layers > 1: # Deep encoder self.mid_fwd_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='mid_fwd_fork') self.mid_back_fork = Fork( [name for name in self.bidir.prototype.apply.sequences if name != 'mask'], prototype=Linear(), name='mid_back_fork') self.children.append(self.mid_fwd_fork) self.children.append(self.mid_back_fork) elif self.n_layers == 0: self.embedding_dim = state_dim*2 self.children = [self.lookup] else: logging.fatal("Number of encoder layers must be non-negative") def _push_allocation_config(self): """Sets the parameters of sub bricks """ self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim if self.n_layers >= 1: self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] if self.n_layers > 1: # Deep encoder inp_dim = self.state_dim * 2 if self.skip_connections: inp_dim += self.embedding_dim self.mid_fwd_fork.input_dim = inp_dim self.mid_fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names] self.mid_back_fork.input_dim = inp_dim self.mid_back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation', 'representation_mask']) def apply(self, source_sentence, source_sentence_mask): """Produces source annotations, either non-recurrently or with a bidirectional RNN architecture. 
""" # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) if self.n_layers >= 1: representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}) ) for _ in xrange(self.n_layers-1): if self.skip_connections: inp = tensor.concatenate([representation, embeddings], axis=2) else: inp = representation representation = self.bidir.apply( merge(self.mid_fwd_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask}), merge(self.mid_back_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask}) ) else: representation = embeddings return representation, source_sentence_mask
x_tr = next(train_stream.get_epoch_iterator()) ################# # Model ################# f0 = tensor.matrix("f0") voiced = tensor.matrix("voiced") start_flag = tensor.scalar("start_flag") sp = tensor.tensor3("sp") phonemes = tensor.imatrix("phonemes") num_phonemes = 365 context_size = 1000 lookup = LookupTable(num_phonemes, context_size) context = lookup.apply(phonemes) f0s = f0.dimshuffle(0, 1, "x") voiceds = voiced.dimshuffle(0, 1, "x") x = tensor.concatenate([sp, f0s, voiceds], 2) # x = tensor.tensor3('features') activations_x = [Rectifier()] * depth_x dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + [hidden_size_recurrent] activations_theta = [Rectifier()] * depth_theta dims_theta = [hidden_size_recurrent] + [hidden_size_mlp_theta] * depth_theta
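Shape check for the phoneme context above: phonemes is an imatrix of ids and lookup.apply appends the embedding axis, so context has shape (batch, time, context_size). A tiny standalone sketch of the same lookup:

import numpy
import theano
from theano import tensor
from blocks.bricks.lookup import LookupTable
from blocks.initialization import IsotropicGaussian

lt = LookupTable(365, 1000, weights_init=IsotropicGaussian(0.01))
lt.initialize()
ids = tensor.imatrix('phonemes')
emb = lt.apply(ids)  # appends the embedding axis
print(theano.function([ids], emb.shape)(
    numpy.zeros((2, 11), dtype='int32')))  # [2, 11, 1000]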
rnn.initialize() gather.initialize() ## Now for the application of these units # Define the shape of x specifically... :: the data has format (sequence_length, batch). x.tag.test_value = np.random.randint(vocab_size, size=batch_of_sentences ).astype(np.int32) x_extra.tag.test_value = np.zeros( (max_sentence_length, mini_batch_size, 1) ).astype(np.float32) x_mask.tag.test_value = np.random.choice( [0.0, 1.0], size=batch_of_sentences ).astype(np.float32) print("x shape", x.shape.tag.test_value) # array([29, 16]) word_embedding = lookup.apply(x) print("word_embedding shape", word_embedding.shape.tag.test_value) # array([ 29, 16, 100]) print("x_extra shape", x_extra.shape.tag.test_value) # array([ 29, 16, 1]) embedding_extended = tensor.concatenate([ word_embedding, x_extra ], axis=-1) print("embedding_extended shape", embedding_extended.shape.tag.test_value) # array([ 29, 16, 101]) rnn_outputs = rnn.apply(embedding_extended, mask=x_mask) print("rnn_outputs shape", rnn_outputs.shape.tag.test_value) # array([ 29, 16, 202]) ### So : Need to reshape the rnn outputs to produce suitable input here... # Convert a tensor here into a long stream of vectors # The shape actually depends on the specific batch... (for instance, the last one in an epoch may be smaller) #rnn_outputs_reshaped = rnn_outputs.reshape( (max_sentence_length*mini_batch_size, hidden_dim*2) ) # not parameterized properly rnn_outputs_reshaped = rnn_outputs.reshape( (x.shape[0]*x.shape[1], hidden_dim*2) )
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate([ cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2) clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism Bilinear attention_clinear_1 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_1') bricks += [attention_clinear_1] att_start = qenc[None, :, :] * attention_clinear_1.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_start = att_start.sum(axis=2) att_start = tensor.nnet.softmax(att_start.T).T attention_clinear_2 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_2') bricks += [attention_clinear_2] att_end = qenc[None, :, :] * attention_clinear_2.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_end = att_end.sum(axis=2) att_end = tensor.nnet.softmax(att_end.T).T att_start = tensor.dot( tensor.le( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_start) att_end = tensor.dot( tensor.ge( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_end) # add attention from left and right att_weights = att_start * att_end att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) self.predictions = tensor.gt(att_weights, 0.25) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) att_weights = att_weights / (att_weights.sum(axis=0)
+ 0.00001) #cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() cost = (((att_weights - att_target)**2) * context_mask).sum() / context_mask.sum() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' att_start.name = 'att_start' att_end.name = 'att_end' att_weights.name = 'att_weights' att_target.name = 'att_target' self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] self.analyse_vars = [ cost, self.predictions, att_start, att_end, att_weights, att_target ] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
class NeuralLM: def __init__(self, x, y, vocab_size, hidden_size, num_layers, pretrained_embeds=None): """ Implements a neural language model using an LSTM. Word y_n+1 ~ Softmax(U * h_n) :param x A minibatch: each row is an instance (a sequence), with batch_size rows :param y x shifted by 1, which are the target words to predict for the language modeling objective based on the hidden LSTM state :param vocab_size The number of types in the training data :param hidden_size The dimensionality of the word embeddings :param pretrained_embeds Pretrained embeddings for initialization as an ND array """ self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_layers = num_layers # Initialize the word embedding table. If we have pretrained embeddings, we use those self.word_embedding_lookup = LookupTable(length=vocab_size, dim=hidden_size, name="word_embeddings") if pretrained_embeds is None: initialize(self.word_embedding_lookup, 0.8) else: assert pretrained_embeds.shape[0] == vocab_size and pretrained_embeds.shape[1] == hidden_size self.word_embedding_lookup.weights_init = Constant(pretrained_embeds) self.word_embedding_lookup.biases_init = Constant(0) self.word_embedding_lookup.initialize() self.word_embeddings = self.word_embedding_lookup.W self.y_hat, self._cost, self.cells = self.nn_fprop(x, y, num_layers) def lstm_layer(self, h, n): """ Performs the LSTM update for a batch of word sequences :param h The word embeddings for this update :param n The number of layers of the LSTM """ # Maps the word embedding to a dimensionality to be used in the LSTM linear = Linear(input_dim=self.hidden_size, output_dim=self.hidden_size * 4, name='linear_lstm' + str(n)) initialize(linear, sqrt(6.0 / (5 * self.hidden_size))) lstm = LSTM(dim=self.hidden_size, name='lstm' + str(n)) initialize(lstm, 0.08) return lstm.apply(linear.apply(h)) def softmax_layer(self, h, y): """ Perform Softmax over the hidden state in order to predict the next word in the sequence and compute the loss. :param h The hidden state sequence :param y The target words """ hidden_to_output = Linear(name='hidden_to_output', input_dim=self.hidden_size, output_dim=self.vocab_size) initialize(hidden_to_output, sqrt(6.0 / (self.hidden_size + self.vocab_size))) linear_output = hidden_to_output.apply(h) linear_output.name = 'linear_output' softmax = NDimensionalSoftmax(name="lm_softmax") y_hat = softmax.log_probabilities(linear_output, extra_ndim=1) y_hat.name = 'y_hat' cost = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1).mean() cost.name = 'cost' return y_hat, cost def nn_fprop(self, x, y, num_layers): h = T.nnet.sigmoid(self.word_embedding_lookup.apply(x)) # constrain the word embeddings cells = [] for i in range(num_layers): h, c = self.lstm_layer(h, i) cells.append(c) return self.softmax_layer(h, y) + (cells, ) @property def cost(self): return self._cost @property def embeddings(self): return self.word_embeddings
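The initialize helper used above is defined elsewhere in this codebase; one plausible definition consistent with the (brick, scale) call sites, plus a hypothetical instantiation, is sketched here (the sizes are made up):

from blocks.graph import ComputationGraph
from blocks.initialization import IsotropicGaussian, Constant

def initialize(brick, scale):
    # Assumed behaviour: Gaussian weights with the given scale, zero biases.
    brick.weights_init = IsotropicGaussian(scale)
    brick.biases_init = Constant(0)
    brick.initialize()

x = T.imatrix('features')
y = T.imatrix('targets')
lm = NeuralLM(x, y, vocab_size=10000, hidden_size=256, num_layers=2)
cg = ComputationGraph(lm.cost)  # graph over the LM cross-entropy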
from theano import tensor # ------------------------------------------------------------- # words = brown.words() V = list(set(words)) v = len(V) PARAM_H_SIZE = 100 table = LookupTable(length=v, dim=PARAM_H_SIZE, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) x = tensor.matrix('x', dtype="int64") y = tensor.lvector('y') in_to_h = table.apply(x) h = in_to_h.mean(axis=1) # axis 1 averages over the context-word positions, giving one vector per example # h_to_out = Linear(name='h_to_out', input_dim=PARAM_H_SIZE, output_dim=v, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) y_hat = Softmax().apply(h_to_out.apply(h)) cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) # is there something more to do with cost? cg = ComputationGraph(cost) train_set = # TODO data_stream = Flatten(DataStream.default_stream(train_set,
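For reference, a self-contained version of the CBOW-style wiring this fragment is building towards (toy vocabulary and made-up sizes; not the original experiment):

import numpy
import theano
from theano import tensor
from blocks.bricks import Linear, Softmax
from blocks.bricks.lookup import LookupTable
from blocks.initialization import IsotropicGaussian, Constant

v, emb_dim = 50, 100
table = LookupTable(length=v, dim=emb_dim,
                    weights_init=IsotropicGaussian(0.01))
h_to_out = Linear(name='h_to_out', input_dim=emb_dim, output_dim=v,
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0))
table.initialize()
h_to_out.initialize()

x = tensor.lmatrix('x')                      # (batch, context) word ids
h = table.apply(x).mean(axis=1)              # average the context embeddings
y_hat = Softmax().apply(h_to_out.apply(h))
print(theano.function([x], y_hat.shape)(
    numpy.zeros((3, 4), dtype='int64')))     # [3, 50]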
rnn = SimpleRecurrent(name='hidden', dim=hidden_layer_dim, activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) rnn.initialize() linear_output = Linear(name='linear_output', input_dim=hidden_layer_dim, output_dim=train_dataset.durations_vocab_size(), weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear_output.initialize() softmax = NDimensionalSoftmax(name='ndim_softmax') activation_input = lookup_input.apply(x) hidden = rnn.apply(linear_input.apply(activation_input)) activation_output = linear_output.apply(hidden) y_est = softmax.apply(activation_output, extra_ndim=1) cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean() from blocks.graph import ComputationGraph from blocks.algorithms import GradientDescent, Adam cg = ComputationGraph([cost]) step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)] algorithm = GradientDescent(cost=cost,
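The GradientDescent call above is cut off in the source; a typical completion in Blocks looks like the following sketch (note the keyword is parameters in recent Blocks releases and params in older ones):

from blocks.algorithms import CompositeRule, GradientDescent

# Hypothetical completion: combine the step rules defined above.
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))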
class Parrot(Initializable, Random): def __init__( self, input_dim=420, # Dimension of the text labels output_dim=63, # Dimension of a vocoder frame rnn_h_dim=1024, # Size of rnn hidden state readouts_dim=1024, # Size of readouts (summary of rnn) weak_feedback=False, # Feedback to the top rnn layer full_feedback=False, # Feedback to all rnn layers feedback_noise_level=None, # Amount of noise in feedback layer_norm=False, # Use simple normalization? use_speaker=False, # Condition on the speaker id? num_speakers=21, # Number of speakers speaker_dim=128, # Size of speaker embedding which_cost='MSE', # Train with MSE or GMM k_gmm=20, # Number of components in the GMM sampling_bias=0, # Make samples more likely (Graves13) epsilon=1e-5, # Numerical stability num_characters=43, # Number of characters in the labels attention_type='graves', # graves or softmax attention_size=10, # Number of gaussians in the attention attention_alignment=1., # Audio steps per letter at initialization sharpening_coeff=1., timing_coeff=1., encoder_type=None, encoder_dim=128, **kwargs): super(Parrot, self).__init__(**kwargs) self.input_dim = input_dim self.output_dim = output_dim self.rnn_h_dim = rnn_h_dim self.readouts_dim = readouts_dim self.layer_norm = layer_norm self.which_cost = which_cost self.use_speaker = use_speaker self.full_feedback = full_feedback self.feedback_noise_level = feedback_noise_level self.epsilon = epsilon self.num_characters = num_characters self.attention_type = attention_type self.attention_alignment = attention_alignment self.attention_size = attention_size self.sharpening_coeff = sharpening_coeff self.timing_coeff = timing_coeff self.encoder_type = encoder_type self.encoder_dim = encoder_dim self.encoded_input_dim = input_dim if self.encoder_type == 'bidirectional': self.encoded_input_dim = 2 * encoder_dim if self.feedback_noise_level is not None: self.noise_level_var = tensor.scalar('feedback_noise_level') self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1') self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2') self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3') self.h1_to_readout = Linear(input_dim=rnn_h_dim, output_dim=readouts_dim, name='h1_to_readout') self.h2_to_readout = Linear(input_dim=rnn_h_dim, output_dim=readouts_dim, name='h2_to_readout') self.h3_to_readout = Linear(input_dim=rnn_h_dim, output_dim=readouts_dim, name='h3_to_readout') self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h2') self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h3') self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h2_to_h3') if which_cost == 'MSE': self.readout_to_output = Linear(input_dim=readouts_dim, output_dim=output_dim, name='readout_to_output') elif which_cost == 'GMM': self.sampling_bias = sampling_bias self.k_gmm = k_gmm self.readout_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=readouts_dim, output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm], name='readout_to_output') self.encoder = Encoder(encoder_type, num_characters, input_dim, encoder_dim, name='encoder') self.children = [ self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout, self.h2_to_readout, self.h3_to_readout, self.h1_to_h2, self.h1_to_h3, self.h2_to_h3, self.readout_to_output ] self.inp_to_h1 = 
Fork(output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h1') self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h2') self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h3') self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3] self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'], input_dim=rnn_h_dim, output_dims=[attention_size] * 3, name='h1_to_att') self.att_to_readout = Linear(input_dim=self.encoded_input_dim, output_dim=readouts_dim, name='att_to_readout') self.children += [self.h1_to_att, self.att_to_readout] if use_speaker: self.num_speakers = num_speakers self.speaker_dim = speaker_dim self.embed_speaker = LookupTable(num_speakers, speaker_dim) self.speaker_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h1') self.speaker_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h2') self.speaker_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h3') self.speaker_to_readout = Linear(input_dim=speaker_dim, output_dim=readouts_dim, name='speaker_to_readout') if which_cost == 'MSE': self.speaker_to_output = Linear(input_dim=speaker_dim, output_dim=output_dim, name='speaker_to_output') elif which_cost == 'GMM': self.speaker_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=speaker_dim, output_dims=[ output_dim * k_gmm, output_dim * k_gmm, k_gmm ], name='speaker_to_output') self.children += [ self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2, self.speaker_to_h3, self.speaker_to_readout, self.speaker_to_output ] if full_feedback: self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h2') self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h3') self.children += [self.out_to_h2, self.out_to_h3] weak_feedback = True self.weak_feedback = weak_feedback if weak_feedback: self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h1') self.children += [self.out_to_h1] def _allocate(self): self.initial_w = shared_floatx_zeros((self.encoded_input_dim, ), name="initial_w") add_role(self.initial_w, INITIAL_STATE) def symbolic_input_variables(self): features = tensor.tensor3('features') features_mask = tensor.matrix('features_mask') labels = tensor.imatrix('labels') labels_mask = tensor.matrix('labels_mask') start_flag = tensor.scalar('start_flag') if self.use_speaker: speaker = tensor.imatrix('speaker_index') else: speaker = None return features, features_mask, labels, labels_mask, \ speaker, start_flag def initial_states(self, batch_size): initial_h1 = self.rnn1.initial_states(batch_size) initial_h2 = self.rnn2.initial_states(batch_size) initial_h3 = self.rnn3.initial_states(batch_size) last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h3 = shared_floatx_zeros((batch_size, 
self.rnn_h_dim)) # Defining for all initial_k = tensor.zeros((batch_size, self.attention_size), dtype=floatX) last_k = shared_floatx_zeros((batch_size, self.attention_size)) # Trainable initial state for w. Why not for k? initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0) last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim)) return initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k @application def compute_cost(self, features, features_mask, labels, labels_mask, speaker, start_flag, batch_size): if speaker is None: assert not self.use_speaker target_features = features[1:] mask = features_mask[1:] cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim) gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.weak_feedback: input_features = features[:-1] if self.feedback_noise_level: noise = self.theano_rng.normal(size=input_features.shape, avg=0., std=1.) input_features += self.noise_level_var * noise out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features) to_normalize = [out_cell_h1, out_gat_h1] out_cell_h1, out_gat_h1 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += out_cell_h1 gat_h1 += out_gat_h1 if self.full_feedback: assert self.weak_feedback out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features) out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features) to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3] out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2 += out_cell_h2 gat_h2 += out_gat_h2 cell_h3 += out_cell_h3 gat_h3 += out_gat_h3 if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3 ] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 = spk_cell_h1 + cell_h1 cell_h2 = spk_cell_h2 + cell_h2 cell_h3 = spk_cell_h3 + cell_h3 gat_h1 = spk_gat_h1 + gat_h1 gat_h2 = spk_gat_h2 + gat_h2 gat_h3 = spk_gat_h3 + gat_h3 initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(batch_size) # If it's a new example, use initial states. 
input_h1 = tensor.switch(start_flag, initial_h1, last_h1) input_h2 = tensor.switch(start_flag, initial_h2, last_h2) input_h3 = tensor.switch(start_flag, initial_h3, last_h3) input_w = tensor.switch(start_flag, initial_w, last_w) input_k = tensor.switch(start_flag, initial_k, last_k) context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX), 2) def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh): attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) inp_h1_t += attinp_h1 gat_h1_t += attgat_h1 h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) + self.epsilon k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t) a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) inp_h2_t += attinp_h2 gat_h2_t += attgat_h2 inp_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_ (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan( fn=step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[context_oh], outputs_info=[ input_h1, input_h2, input_h3, input_k, input_w, None, None ]) h1_out = self.h1_to_readout.apply(h1) h2_out = self.h2_to_readout.apply(h2) h3_out = self.h3_to_readout.apply(h3) to_normalize = [h1_out, h2_out, h3_out] h1_out, h2_out, h3_out = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readouts = h1_out + h2_out + h3_out if self.use_speaker: readouts += self.speaker_to_readout.apply(emb_speaker) readouts += self.att_to_readout.apply(w) predicted = self.readout_to_output.apply(readouts) if self.which_cost == 'MSE': if self.use_speaker: predicted += self.speaker_to_output.apply(emb_speaker) cost = tensor.sum((predicted - target_features)**2, axis=-1) next_x = predicted # Dummy value for coeff coeff = predicted elif self.which_cost == 'GMM': mu, sigma, coeff = predicted if self.use_speaker: spk_to_out = self.speaker_to_output.apply(emb_speaker) mu += spk_to_out[0] sigma += spk_to_out[1] coeff += spk_to_out[2] # When training there should not be sampling_bias sigma = tensor.exp(sigma) + self.epsilon coeff = 
tensor.nnet.softmax(coeff.reshape( (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon cost = cost_gmm(target_features, mu, sigma, coeff) next_x = sample_gmm(mu, sigma, coeff, self.theano_rng) cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag updates = [] updates.append((last_h1, h1[-1])) updates.append((last_h2, h2[-1])) updates.append((last_h3, h3[-1])) updates.append((last_k, k[-1])) updates.append((last_w, w[-1])) attention_vars = [next_x, k, w, coeff, phi, pi_att] return cost, scan_updates + updates, attention_vars @application def sample_model_fun(self, labels, labels_mask, speaker, num_samples, seq_size): initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(num_samples) initial_x = numpy.zeros((num_samples, self.output_dim), dtype=floatX) cell_shape = (seq_size, num_samples, self.rnn_h_dim) gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) # Applied before the broadcast. spk_readout = self.speaker_to_readout.apply(emb_speaker) spk_output = self.speaker_to_output.apply(emb_speaker) # Add dimension to repeat with time. emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3 ] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += spk_cell_h1 cell_h2 += spk_cell_h2 cell_h3 += spk_cell_h3 gat_h1 += spk_gat_h1 gat_h2 += spk_gat_h2 gat_h3 += spk_gat_h3 context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX), 2) def sample_step(inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1): cell_h1_t = inp_cell_h1_t cell_h2_t = inp_cell_h2_t cell_h3_t = inp_cell_h3_t gat_h1_t = inp_gat_h1_t gat_h2_t = inp_gat_h2_t gat_h3_t = inp_gat_h3_t attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) cell_h1_t += attinp_h1 gat_h1_t += attgat_h1 if self.weak_feedback: out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1) to_normalize = [out_cell_h1_t, out_gat_h1_t] out_cell_h1_t, out_gat_h1_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1_t += out_cell_h1_t gat_h1_t += out_gat_h1_t if self.full_feedback: out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1) out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1) to_normalize = [ out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t ] out_cell_h2_t, out_gat_h2_t, \ out_cell_h3_t, out_gat_h3_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2_t += out_cell_h2_t cell_h3_t += out_cell_h3_t gat_h2_t += out_gat_h2_t gat_h3_t += out_gat_h3_t h1_t = self.rnn1.apply(cell_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + 
self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon k_t = k_tm1 + self.attention_alignment * \ tensor.exp(k_t) / self.timing_coeff a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) cell_h2_t += attinp_h2 gat_h2_t += attgat_h2 cell_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply(cell_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply(cell_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) h1_out_t = self.h1_to_readout.apply(h1_t) h2_out_t = self.h2_to_readout.apply(h2_t) h3_out_t = self.h3_to_readout.apply(h3_t) to_normalize = [h1_out_t, h2_out_t, h3_out_t] h1_out_t, h2_out_t, h3_out_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readout_t = h1_out_t + h2_out_t + h3_out_t readout_t += self.att_to_readout.apply(w_t) if self.use_speaker: readout_t += spk_readout output_t = self.readout_to_output.apply(readout_t) if self.which_cost == 'MSE': predicted_x_t = output_t if self.use_speaker: predicted_x_t += spk_output # Dummy value for coeff_t coeff_t = predicted_x_t elif self.which_cost == "GMM": mu_t, sigma_t, coeff_t = output_t if self.use_speaker: mu_t += spk_output[0] sigma_t += spk_output[1] coeff_t += spk_output[2] sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \ self.epsilon coeff_t = tensor.nnet.softmax( coeff_t.reshape( (-1, self.k_gmm)) * (1. 
+ self.sampling_bias)).reshape( coeff_t.shape) + self.epsilon predicted_x_t = sample_gmm(mu_t, sigma_t, coeff_t, self.theano_rng) return predicted_x_t, h1_t, h2_t, h3_t, \ k_t, w_t, coeff_t, phi_t, a_t_ (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan( fn=sample_step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[], outputs_info=[ initial_x, initial_h1, initial_h2, initial_h3, initial_k, initial_w, None, None, None ]) return sample_x, k, w, pi, phi, pi_att, updates def sample_model(self, labels_tr, labels_mask_tr, features_mask_tr, speaker_tr, num_samples, num_steps): features, features_mask, labels, labels_mask, speaker, start_flag = \ self.symbolic_input_variables() sample_x, k, w, pi, phi, pi_att, updates = \ self.sample_model_fun( labels, labels_mask, speaker, num_samples, num_steps) theano_inputs = [labels, labels_mask] numpy_inputs = (labels_tr, labels_mask_tr) if self.use_speaker: theano_inputs += [speaker] numpy_inputs += (speaker_tr, ) return function(theano_inputs, [sample_x, k, w, pi, phi, pi_att], updates=updates)(*numpy_inputs) def sample_using_input(self, data_tr, num_samples): # Used to predict the values using the dataset features, features_mask, labels, labels_mask, speaker, start_flag = \ self.symbolic_input_variables() cost, updates, attention_vars = self.compute_cost( features, features_mask, labels, labels_mask, speaker, start_flag, num_samples) sample_x, k, w, pi, phi, pi_att = attention_vars theano_vars = [ features, features_mask, labels, labels_mask, speaker, start_flag ] theano_vars = [x for x in theano_vars if x is not None] theano_vars = list(set(theano_vars)) theano_vars = {x.name: x for x in theano_vars} theano_inputs = [] numpy_inputs = [] for key in data_tr.keys(): theano_inputs.append(theano_vars[key]) numpy_inputs.append(data_tr[key]) return function(theano_inputs, [sample_x, k, w, pi, phi, pi_att], updates=updates)(*numpy_inputs)
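# Illustrative sketch (added; not from the original code): a minimal numpy
# version of the Graves-style attention window computed in `step` above.
# alpha, beta and kappa parametrise a mixture of Gaussians over character
# positions u, and the window w_t is the phi-weighted sum of the one-hot
# context. All names and shapes here are assumptions chosen for clarity.
import numpy as np

def graves_window(a_t, b_t, k_t, context_oh):
    # a_t, b_t, k_t: (batch, att_size); context_oh: (batch, len_ctx, num_chars)
    u = np.arange(context_oh.shape[1], dtype='float32')
    # phi: (batch, len_ctx), summing over the mixture components
    phi = (a_t[:, :, None] *
           np.exp(-b_t[:, :, None] * (k_t[:, :, None] - u) ** 2)).sum(axis=1)
    # w_t: (batch, num_chars)
    return (phi[:, :, None] * context_oh).sum(axis=1), phi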
def __init__(self): inp = tensor.tensor3('input') inp = inp.dimshuffle(1,0,2) target = tensor.matrix('target') target = target.reshape((target.shape[0],)) product = tensor.lvector('product') missing = tensor.eq(inp, 0) train_input_mean = 1470614.1 train_input_std = 3256577.0 trans_1 = tensor.concatenate((inp[1:,:,:],tensor.zeros((1,inp.shape[1],inp.shape[2]))), axis=0) trans_2 = tensor.concatenate((tensor.zeros((1,inp.shape[1],inp.shape[2])), inp[:-1,:,:]), axis=0) inp = tensor.switch(missing,(trans_1+trans_2)/2, inp) lookup = LookupTable(length = 352, dim=4*hidden_dim) product_embed= lookup.apply(product) salut = tensor.concatenate((inp, missing),axis =2) linear = Linear(input_dim=input_dim+1, output_dim=4*hidden_dim, name="lstm_in") inter = linear.apply(salut) inter = inter + product_embed[None,:,:] lstm = LSTM(dim=hidden_dim, activation=activation_function, name="lstm") hidden, cells = lstm.apply(inter) linear2= Linear(input_dim = hidden_dim, output_dim = out_dim, name="ouput_linear") pred = linear2.apply(hidden[-1])*train_input_std + train_input_mean pred = pred.reshape((product.shape[0],)) cost = tensor.mean(abs((pred-target)/target)) # Initialize all bricks for brick in [linear, linear2, lstm, lookup]: brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() # Apply noise and dropout cg = ComputationGraph([cost]) if w_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, w_noise_std) if i_dropout > 0: cg = apply_dropout(cg, [hidden], i_dropout) [cost_reg] = cg.outputs cost_reg += 1e-20 if cost_reg is not cost: self.cost = cost self.cost_reg = cost_reg cost_reg.name = 'cost_reg' cost.name = 'cost' self.sgd_cost = cost_reg self.monitor_vars = [[cost, cost_reg]] else: self.cost = cost cost.name = 'cost' self.sgd_cost = cost self.monitor_vars = [[cost]] self.pred = pred pred.name = 'pred'
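# Illustrative sketch (added; not from the original code): a hedged numpy
# version of the missing-value trick above, where a zero entry is treated as
# missing and replaced by the average of its two temporal neighbours. The
# function name and shapes are assumptions made for the example.
import numpy as np

def impute_missing(inp):
    # inp: (time, batch, features); zeros mark missing values
    nxt = np.concatenate([inp[1:], np.zeros_like(inp[:1])], axis=0)
    prev = np.concatenate([np.zeros_like(inp[:1]), inp[:-1]], axis=0)
    return np.where(inp == 0, (nxt + prev) / 2.0, inp)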
print('Building model ...')

# ----------- THE MODEL --------------------------
inputt = tensor.lmatrix('input').T
input_mask = tensor.matrix('input_mask').T
y = tensor.lmatrix('output').T
y_mask = tensor.matrix('output_mask').T
y_len = y_mask.sum(axis=0)
# inputt : T x B
# input_mask : T x B
# y : L x B
# y_mask : L x B

# Linear bricks in
input_to_h = LookupTable(num_input_classes, h_dim, name='lookup')
h = input_to_h.apply(inputt)
# h : T x B x h_dim

# RNN bricks
pre_lstm = Linear(input_dim=h_dim, output_dim=4 * rec_dim, name='LSTM_linear')
lstm = LSTM(activation=Tanh(), dim=rec_dim, name="rnn")
rnn_out, _ = lstm.apply(pre_lstm.apply(h), mask=input_mask)

# Linear bricks out
rec_to_o = Linear(name='rec_to_o', input_dim=rec_dim,
                  output_dim=num_output_classes + 1)
y_hat_pre = rec_to_o.apply(rnn_out)
# y_hat_pre : T x B x C+1
class Interpolator(AbstractReadout): """Readout char by char.""" def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth, trg_dgru_depth, emitter, feedback_brick, merge=None, merge_prototype=None, post_merge=None, **kwargs): merged_dim = igru_state_dim if not merge: merge = Merge(input_names=kwargs['source_names'], prototype=merge_prototype) if not post_merge: post_merge = Bias(dim=merged_dim) # for compatible if igru_depth == 1: self.igru = IGRU(dim=igru_state_dim) else: self.igru = RecurrentStack( [IGRU(dim=igru_state_dim, name='igru')] + [ UpperIGRU(dim=igru_state_dim, activation=Tanh(), name='upper_igru' + str(i)) for i in range(1, igru_depth) ], skip_connections=True) self.embedding_dim = embedding_dim self.emitter = emitter self.feedback_brick = feedback_brick self.merge = merge self.post_merge = post_merge self.merged_dim = merged_dim self.igru_depth = igru_depth self.trg_dgru_depth = trg_dgru_depth self.lookup = LookupTable(name='embeddings') self.vocab_size = vocab_size self.igru_state_dim = igru_state_dim self.gru_to_softmax = Linear(input_dim=igru_state_dim, output_dim=vocab_size) self.gru_fork = Fork([ name for name in self.igru.apply.sequences if name != 'mask' and name != 'input_states' ], prototype=Linear(), name='gru_fork') children = [ self.emitter, self.feedback_brick, self.merge, self.post_merge, self.igru, self.lookup, self.gru_to_softmax, self.gru_fork ] kwargs.setdefault('children', []).extend(children) super(Interpolator, self).__init__(**kwargs) def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.emitter.readout_dim = self.get_dim('readouts') self.merge.input_names = self.source_names self.merge.input_dims = self.source_dims self.merge.output_dim = self.merged_dim self.post_merge.input_dim = self.merged_dim self.post_merge.output_dim = self.igru_state_dim self.gru_fork.input_dim = self.embedding_dim self.gru_fork.output_dims = [ self.igru.get_dim(name) for name in self.gru_fork.output_names ] @application def initial_igru_outputs(self, batch_size): return self.igru.initial_states(batch_size) @application def emit(self, readouts): return self.emitter.emit(readouts) @application def cost(self, readouts, outputs): return self.emitter.cost(readouts, outputs) @application def initial_outputs(self, batch_size): return self.emitter.initial_outputs(batch_size) @application(outputs=['feedback']) def feedback(self, outputs): return self.feedback_brick.feedback(outputs) @application(outputs=['feedback']) def feedback_apply(self, target_char_seq, target_sample_matrix, target_char_aux): return self.feedback_brick.apply(target_char_seq, target_sample_matrix, target_char_aux) @application def single_feedback(self, target_single_char, batch_size, mask=None, states=None): return self.feedback_brick.single_emit(target_single_char, batch_size, mask, states) @single_feedback.property('outputs') def single_feedback_outputs(self): return [ 'single_feedback' + RECURRENTSTACK_SEPARATOR + str(i) for i in range(self.trg_dgru_depth) ] @application(outputs=['gru_out', 'readout_chars']) def single_readout_gru(self, target_prev_char, target_prev_char_aux, input_states, states): embeddings = self.lookup.apply(target_prev_char) states_dict = {'states': states[0]} if self.igru_depth > 1: for i in range(1, self.igru_depth): states_dict['states' + RECURRENTSTACK_SEPARATOR + str(i)] = states[i] gru_out = self.igru.apply(**merge( self.gru_fork.apply(embeddings, as_dict=True), states_dict, { 'mask': target_prev_char_aux, 
'input_states': input_states, 'iterate': False })) if self.igru_depth > 1: readout_chars = self.gru_to_softmax.apply(gru_out[-1]) else: readout_chars = self.gru_to_softmax.apply(gru_out) return gru_out, readout_chars @application def readout(self, **kwargs): merged = self.merge.apply( **{name: kwargs[name] for name in self.merge.input_names}) merged = self.post_merge.apply(merged) return merged @application(outputs=['readout_chars']) def readout_gru(self, target_prev_char_seq, target_prev_char_aux, input_states): embeddings = self.lookup.apply(target_prev_char_seq) gru_out = self.igru.apply( **merge(self.gru_fork.apply(embeddings, as_dict=True), { 'mask': target_prev_char_aux, 'input_states': input_states })) if self.igru_depth > 1: gru_out = gru_out[-1] readout_chars = self.gru_to_softmax.apply(gru_out) return readout_chars def get_dim(self, name): if name == 'outputs': return self.emitter.get_dim(name) elif name == 'feedback': return self.feedback_brick.get_dim(name) elif name == 'readouts': return self.readout_dim return super(AbstractReadout, self).get_dim(name)
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology. For compatibility with
    the previous version, we call it Decimator.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.dgru_depth = dgru_depth
        self.lookup = LookupTable(name='embeddings')
        # representation
        self.dgru = RecurrentStack([
            DGRU(activation=Tanh(), dim=self.dgru_state_dim)
            for _ in range(dgru_depth)
        ], skip_connections=True)
        # importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(DGRU(activation=Tanh(),
                                   dim=self.dgru_state_dim // 2),
                              self.embedding_dim,
                              name='src_word_with_fork'),
            name='bidir_src_word_encoder')
        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')
        # map to an energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [
            self.lookup, self.dgru, self.gru_fork, self.bidir_w, self.wl
        ]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True),
            {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        if name == 'output':
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
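# Illustrative sketch (added; not from the original code): a hypothetical
# instantiation of the word encoder above. All dimensions are made up for the
# example and do not come from any config in this file.
decimator = Decimator(vocab_size=120, embedding_dim=64, dgru_state_dim=256,
                      dgru_depth=2,
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0.))
decimator.initialize()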
class DeepBidirectionalEncoder(Initializable):
    """This encoder is a multi-layered version of ``BidirectionalEncoder``
    where parameters between layers are not shared.
    """

    def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
                 state_dim, **kwargs):
        """Sole constructor.

        Args:
            vocab_size (int): Source vocabulary size
            embedding_dim (int): Dimension of the embedding layer
            n_layers (int): Number of layers. Each layer has its own
                            weight matrices.
            skip_connections (bool): Skip connections connect the source
                                     word embeddings directly with deeper
                                     layers to propagate the gradient more
                                     efficiently
            state_dim (int): Number of hidden units in the recurrent layers.
        """
        super(DeepBidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.state_dim = state_dim
        self.skip_connections = skip_connections
        self.lookup = LookupTable(name='embeddings')
        self.bidirs = []
        self.fwd_forks = []
        self.back_forks = []
        for i in xrange(self.n_layers):
            bidir = BidirectionalWMT15(
                GatedRecurrent(activation=Tanh(), dim=state_dim),
                name='bidir%d' % i)
            self.bidirs.append(bidir)
            self.fwd_forks.append(Fork(
                [name for name in bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='fwd_fork%d' % i))
            self.back_forks.append(Fork(
                [name for name in bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='back_fork%d' % i))
        self.children = [self.lookup] \
                        + self.bidirs \
                        + self.fwd_forks \
                        + self.back_forks

    def _push_allocation_config(self):
        """Sets the parameters of sub bricks """
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.fwd_forks[0].input_dim = self.embedding_dim
        self.fwd_forks[0].output_dims = [
            self.bidirs[0].children[0].get_dim(name)
            for name in self.fwd_forks[0].output_names]
        self.back_forks[0].input_dim = self.embedding_dim
        self.back_forks[0].output_dims = [
            self.bidirs[0].children[1].get_dim(name)
            for name in self.back_forks[0].output_names]
        for i in xrange(1, self.n_layers):
            inp_dim = self.state_dim * 2
            if self.skip_connections:
                inp_dim += self.embedding_dim
            self.fwd_forks[i].input_dim = inp_dim
            self.fwd_forks[i].output_dims = [
                self.bidirs[i].children[0].get_dim(name)
                for name in self.fwd_forks[i].output_names]
            self.back_forks[i].input_dim = inp_dim
            self.back_forks[i].output_dims = [
                self.bidirs[i].children[1].get_dim(name)
                for name in self.back_forks[i].output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Produces source annotations, either non-recurrently or with
        a bidirectional RNN architecture.
        """
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T
        embeddings = self.lookup.apply(source_sentence)
        representation = self.bidirs[0].apply(
            merge(self.fwd_forks[0].apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}),
            merge(self.back_forks[0].apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}))
        for i in xrange(1, self.n_layers):
            if self.skip_connections:
                inp = tensor.concatenate([representation, embeddings],
                                         axis=2)
            else:
                inp = representation
            representation = self.bidirs[i].apply(
                merge(self.fwd_forks[i].apply(inp, as_dict=True),
                      {'mask': source_sentence_mask}),
                merge(self.back_forks[i].apply(inp, as_dict=True),
                      {'mask': source_sentence_mask}))
        return representation, source_sentence_mask
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply( attention_clinear.apply( cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2] ))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply( layer1.reshape( (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))) att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights = tensor.nnet.sigmoid(att_weights.T).T att_weights.name = 'att_weights' att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() self.predictions = tensor.gt(att_weights, 0.1) * context # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # 
Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
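# Illustrative sketch (added; not from the original code): a hedged numpy
# version of the `att_target` construction above. A context position is a
# positive target (1) iff its token occurs anywhere in the answer. Shapes
# follow the time-first convention used in the model.
import numpy as np

def att_target_np(context_tb, answer_lb):
    # context_tb: (ctx_len, batch); answer_lb: (ans_len, batch)
    eq = context_tb[:, None, :] == answer_lb[None, :, :]  # (ctx, ans, batch)
    return eq.sum(axis=1).clip(0, 1)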
class BidirectionalEncoder(Initializable): """A generalized version of the vanilla encoder of the RNNsearch model which supports different numbers of layers. Zero layers represent non-recurrent encoders. """ def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections, state_dim, **kwargs): """Sole constructor. Args: vocab_size (int): Source vocabulary size embedding_dim (int): Dimension of the embedding layer n_layers (int): Number of layers. Layers share the same weight matrices. skip_connections (bool): Skip connections connect the source word embeddings directly with deeper layers to propagate the gradient more efficiently state_dim (int): Number of hidden units in the recurrent layers. """ super(BidirectionalEncoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.n_layers = n_layers self.state_dim = state_dim self.skip_connections = skip_connections self.lookup = LookupTable(name='embeddings') if self.n_layers >= 1: self.bidir = BidirectionalWMT15( GatedRecurrent(activation=Tanh(), dim=state_dim)) self.fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='fwd_fork') self.back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='back_fork') self.children = [ self.lookup, self.bidir, self.fwd_fork, self.back_fork ] if self.n_layers > 1: # Deep encoder self.mid_fwd_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='mid_fwd_fork') self.mid_back_fork = Fork([ name for name in self.bidir.prototype.apply.sequences if name != 'mask' ], prototype=Linear(), name='mid_back_fork') self.children.append(self.mid_fwd_fork) self.children.append(self.mid_back_fork) elif self.n_layers == 0: self.embedding_dim = state_dim * 2 self.children = [self.lookup] else: logging.fatal("Number of encoder layers must be non-negative") def _push_allocation_config(self): """Sets the parameters of sub bricks """ self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim if self.n_layers >= 1: self.fwd_fork.input_dim = self.embedding_dim self.fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.back_fork.input_dim = self.embedding_dim self.back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] if self.n_layers > 1: # Deep encoder inp_dim = self.state_dim * 2 if self.skip_connections: inp_dim += self.embedding_dim self.mid_fwd_fork.input_dim = inp_dim self.mid_fwd_fork.output_dims = [ self.bidir.children[0].get_dim(name) for name in self.fwd_fork.output_names ] self.mid_back_fork.input_dim = inp_dim self.mid_back_fork.output_dims = [ self.bidir.children[1].get_dim(name) for name in self.back_fork.output_names ] @application(inputs=['source_sentence', 'source_sentence_mask'], outputs=['representation', 'representation_mask']) def apply(self, source_sentence, source_sentence_mask): """Produces source annotations, either non-recurrently or with a bidirectional RNN architecture. 
""" # Time as first dimension source_sentence = source_sentence.T source_sentence_mask = source_sentence_mask.T embeddings = self.lookup.apply(source_sentence) if self.n_layers >= 1: representation = self.bidir.apply( merge(self.fwd_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask}), merge(self.back_fork.apply(embeddings, as_dict=True), {'mask': source_sentence_mask})) for _ in xrange(self.n_layers - 1): if self.skip_connections: inp = tensor.concatenate([representation, embeddings], axis=2) else: inp = representation representation = self.bidir.apply( merge(self.mid_fwd_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask}), merge(self.mid_back_fork.apply(inp, as_dict=True), {'mask': source_sentence_mask})) else: representation = embeddings return representation, source_sentence_mask
dataset = BrownCorpus(window_size=10)
#dataset = ToyCorpus()
print "done"

VOCAB_DIM = dataset.vocabulary_size
print "vocab size:", VOCAB_DIM
EMBEDDING_DIM = 100

Xs = tensor.imatrix("context")
y = tensor.ivector('center')

w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM)
w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM)

hidden = tensor.mean(w1.apply(Xs), axis=1)
y_hat = Softmax().apply(w2.apply(hidden))

w1.weights_init = w2.weights_init = IsotropicGaussian(0.01)
w1.biases_init = w2.biases_init = Constant(0)
w1.initialize()
w2.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = "loss"
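# Illustrative sketch (added; not from the original snippet): one plausible
# continuation that wraps the regularized cost in a training algorithm,
# mirroring the setup used elsewhere in this file. GradientDescent and Scale
# are assumed to be imported from blocks.algorithms; the data stream is
# elided.
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))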
def __init__(self, config, vocab_size, id_to_vocab, logger): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.ivector('answer') candidates = tensor.imatrix('candidates') candidates_mask = tensor.imatrix('candidates_mask') # question_actual = tensor.imatrix('question_actual') # context_actual = tensor.imatrix('context_actual') # answer_actual = tensor.imatrix('answer_actual') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) # Embed questions and cntext embed = LookupTable(vocab_size, config.embed_size, name='question_embed') bricks.append(embed) qembed = embed.apply(question) cembed = embed.apply(context) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + qlstms + clstms # Calculate question encoding (concatenate layer1) if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] #u qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) if config.ctx_skip_connections: cenc_dim = 2*sum(config.ctx_lstm_size) cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2*config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))) .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2]))) att_weights.name = 'att_weights_0' att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights.name = 'att_weights' #r attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0) attended.name = 'attended' # Now we can calculate our output out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities], activations=config.out_mlp_activations + [Identity()], name='out_mlp') bricks += [out_mlp] # g^AR probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1)) probs.name = 'probs' is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :], tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1) probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs)) # Calculate prediction, 
cost and error rate pred = probs.argmax(axis=1) cost = Softmax().categorical_cross_entropy(answer, probs).mean() error_rate = tensor.neq(answer, pred).mean() # Apply dropout cg = ComputationGraph([cost, error_rate]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg, error_rate_reg] = cg.outputs # Other stuff cost_reg.name = cost.name = 'cost' error_rate_reg.name = error_rate.name = 'error_rate' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg], [error_rate_reg]] self.monitor_vars_valid = [[cost], [error_rate]] # Initialize bricks for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def create_model(self, symbols_num = 500): # Hyperparameters # The dimension of the hidden state of the GRUs in each direction. hidden_states = self.args.encoder_hidden_dims # Dimension of the word-embedding space embedding_dims = self.args.source_embeddings_dim ################### # Declaration of the Theano variables that come from the data stream ################### # The context document. context_bt = tt.lmatrix('context') # Context document mask used to distinguish real symbols from the sequence and padding symbols that are at the end context_mask_bt = tt.matrix('context_mask') # The question question_bt = tt.lmatrix('question') question_mask_bt = tt.matrix('question_mask') # The correct answer y = tt.lmatrix('answer') y = y[:,0] # originally answers are in a 2d matrix, here we convert it to a vector # The candidates among which the answer is selected candidates_bi = tt.lmatrix("candidates") candidates_bi_mask = tt.matrix("candidates_mask") ################### # Network's components ################### # Lookup table with randomly initialized word embeddings lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2)) # bidirectional encoder that translates context context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states) # bidirectional encoder for question question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states) # Initialize the components (where not done upon creation) lookup.initialize() ################### # Wiring the components together # # Where present, the 3 letters at the end of the variable name identify its dimensions: # b ... position of the example within the batch # t ... position of the word within the document/question # f ... features of the embedding vector ################### ### Read the context document # Map token indices to word embeddings context_embedding_tbf = lookup.apply(context_bt.T) # Read the embedded context document using the bidirectional GRU and produce the contextual embedding of each word memory_encoded_btf = context_encoder.apply(context_embedding_tbf, context_mask_bt.T).dimshuffle(1,0,2) memory_encoded_btf.name = "memory_encoded_btf" ### Correspondingly, read the query x_embedded_tbf = lookup.apply(question_bt.T) x_encoded_btf = question_encoder.apply(x_embedded_tbf, question_mask_bt.T).dimshuffle(1,0,2) # The query encoding is a concatenation of the final states of the forward and backward GRU encoder x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states] x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2] query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1) # Compute the attention on each word in the context as a dot product of its contextual embedding and the query mem_attention_presoft_bt = tt.batched_dot(query_representation_bf, memory_encoded_btf.dimshuffle(0,2,1)) # TODO is this pre-masking necessary? 
mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt,context_mask_bt) # Normalize the attention using softmax mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_presoft_masked_bt,context_mask_bt) if self.args.weighted_att: # compute weighted attention over original word vectors att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2)) # compare desired response to all candidate responses # select relevant candidate answer words candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1) # convert it to output symbol probabilities y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi) y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask) else: # Sum the attention of each candidate word across the whole context document, # this is the key innovation of the model # TODO: Get rid of sentence-by-sentence processing? # TODO: Rewrite into matrix notation instead of scans? def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs): word_ixs_in_sentence = tt.eq(sentence_ixs,word_ix).nonzero()[0] return sentence_attention_probs[word_ixs_in_sentence].sum() def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t): result, updates = theano.scan( fn=sum_prob_of_word, sequences=[candidate_indices_i], non_sequences=[sentence_ixs_t, sentence_attention_probs_t]) return result def sum_probs_batch(candidate_indices_bt,sentence_ixs_bt, sentence_attention_probs_bt): result, updates = theano.scan( fn=sum_probs_single_sentence, sequences=[candidate_indices_bt, sentence_ixs_bt, sentence_attention_probs_bt], non_sequences=None) return result # Sum the attention of each candidate word across the whole context document y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt) y_hat.name = "y_hat" # We use the convention that ground truth is always at index 0, so the following are the target answers y = y.zeros_like() # We use Cross Entropy as the training objective cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) cost.name = "cost" predicted_response_index = tt.argmax(y_hat,axis=1) accuracy = tt.eq(y,predicted_response_index).mean() accuracy.name = "accuracy" return cost, accuracy, mem_attention_bt, y_hat, context_bt, candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt, question_mask_bt
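# Illustrative sketch (added; not from the original code): a hedged numpy
# equivalent of `sum_probs_batch` above. For every candidate token it
# accumulates the attention mass of all its occurrences in the context
# document; plain loops replace the theano scans for clarity.
import numpy as np

def sum_probs_batch_np(candidates_bi, context_bt, attention_bt):
    batch, n_cand = candidates_bi.shape
    y_hat = np.zeros((batch, n_cand), dtype=attention_bt.dtype)
    for b in range(batch):
        for i in range(n_cand):
            matches = context_bt[b] == candidates_bi[b, i]
            y_hat[b, i] = attention_bt[b][matches].sum()
    return y_hat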
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()
    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Compute the weight matrix for every word in the context and then
    # compute the average.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown,
                                            iteration_scheme=mini_batch)

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[
        #         [
        #             'cost_with_regularization'
        #         ]
        #     ])
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)
    main.run()

    pickle.dump(cg, open('%scg.pickle' % path, 'wb'))
    dim=hidden_layer_dim, activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output', input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
# RMSProp and StepClipping are used below, so they must be imported too.
from blocks.algorithms import GradientDescent, Adam, RMSProp, StepClipping

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
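# Illustrative sketch (added; not from the original snippet): a common
# continuation that combines the step rules into one rule and builds the
# training algorithm. This is an assumption, not shown in the original.
from blocks.algorithms import CompositeRule

algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))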
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str

    """

    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader,
                 reuse_word_embeddings, random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim,
                                          name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims,
                                name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the
            # definitions, we can be screwed.
def_reader_class = eval(def_reader) def_reader_kwargs = dict( num_input_words=def_num_input_words, dim=dim, emb_dim=emb_dim, vocab=vocab, lookup=self._lookup if reuse_word_embeddings else None) if def_reader_class == MeanPoolReadDefinitions: def_reader_kwargs.update(dict(normalize=True, translate=False)) self._def_reader = def_reader_class(**def_reader_kwargs) self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim, def_word_gating=def_word_gating, compose_type=compose_type) children.extend([self._def_reader, self._combiner]) super(ExtractiveQAModel, self).__init__(children=children, **kwargs) # create default input variables self.contexts = tensor.lmatrix('contexts') self.context_mask = tensor.matrix('contexts_mask') self.questions = tensor.lmatrix('questions') self.question_mask = tensor.matrix('questions_mask') self.answer_begins = tensor.lvector('answer_begins') self.answer_ends = tensor.lvector('answer_ends') input_vars = [ self.contexts, self.context_mask, self.questions, self.question_mask, self.answer_begins, self.answer_ends ] if self._use_definitions: self.defs = tensor.lmatrix('defs') self.def_mask = tensor.matrix('def_mask') self.contexts_def_map = tensor.lmatrix('contexts_def_map') self.questions_def_map = tensor.lmatrix('questions_def_map') input_vars.extend([ self.defs, self.def_mask, self.contexts_def_map, self.questions_def_map ]) self.input_vars = OrderedDict([(var.name, var) for var in input_vars]) def set_embeddings(self, embeddings): self._lookup.parameters[0].set_value( embeddings.astype(theano.config.floatX)) def embeddings_var(self): return self._lookup.parameters[0] def def_reading_parameters(self): parameters = Selector(self._def_reader).get_parameters().values() parameters.extend(Selector(self._combiner).get_parameters().values()) if self._reuse_word_embeddings: lookup_parameters = Selector( self._lookup).get_parameters().values() parameters = [p for p in parameters if p not in lookup_parameters] return parameters @application def _encode(self, application_call, text, mask, def_embs=None, def_map=None, text_name=None): if not self._random_unk: text = (tensor.lt(text, self._num_input_words) * text + tensor.ge(text, self._num_input_words) * self._vocab.unk) if text_name: application_call.add_auxiliary_variable( unk_ratio(text, mask, self._vocab.unk), name='{}_unk_ratio'.format(text_name)) embs = self._lookup.apply(text) if self._random_unk: embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs + tensor.ge(text, self._num_input_words)[:, :, None] * disconnected_grad(embs)) if def_embs: embs = self._combiner.apply(embs, mask, def_embs, def_map) add_role(embs, EMBEDDINGS) encoded = flip01( self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)), mask=mask.T)[0]) return encoded @application def apply(self, application_call, contexts, contexts_mask, questions, questions_mask, answer_begins, answer_ends, defs=None, def_mask=None, contexts_def_map=None, questions_def_map=None): def_embs = None if self._use_definitions: def_embs = self._def_reader.apply(defs, def_mask) context_enc = self._encode(contexts, contexts_mask, def_embs, contexts_def_map, 'context') question_enc_pre = self._encode(questions, questions_mask, def_embs, questions_def_map, 'question') question_enc = tensor.tanh( self._question_transform.apply(question_enc_pre)) # should be (batch size, context length, question_length) affinity = tensor.batched_dot(context_enc, flip12(question_enc)) affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :] affinity = affinity * 
affinity_mask - 1000.0 * (1 - affinity_mask) # soft-aligns every position in the context to positions in the question d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1) application_call.add_auxiliary_variable(d2q_att_weights.copy(), name='d2q_att_weights') # soft-aligns every position in the question to positions in the document q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1) application_call.add_auxiliary_variable(q2d_att_weights.copy(), name='q2d_att_weights') # question encoding "in the view of the document" question_enc_informed = tensor.batched_dot(q2d_att_weights, context_enc) question_enc_concatenated = tensor.concatenate( [question_enc, question_enc_informed], 2) # document encoding "in the view of the question" context_enc_informed = tensor.batched_dot(d2q_att_weights, question_enc_concatenated) if self._coattention: context_enc_concatenated = tensor.concatenate( [context_enc, context_enc_informed], 2) else: question_repr_repeated = tensor.repeat(question_enc[:, [-1], :], context_enc.shape[1], axis=1) context_enc_concatenated = tensor.concatenate( [context_enc, question_repr_repeated], 2) # note: forward and backward LSTMs share the # input weights in the current impl bidir_states = flip01( self._bidir.apply(self._bidir_fork.apply( flip01(context_enc_concatenated)), mask=contexts_mask.T)[0]) begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0] begin_readouts = begin_readouts * contexts_mask - 1000.0 * ( 1 - contexts_mask) begin_costs = self._softmax.categorical_cross_entropy( answer_begins, begin_readouts) end_readouts = self._end_readout.apply(bidir_states)[:, :, 0] end_readouts = end_readouts * contexts_mask - 1000.0 * (1 - contexts_mask) end_costs = self._softmax.categorical_cross_entropy( answer_ends, end_readouts) predicted_begins = begin_readouts.argmax(axis=-1) predicted_ends = end_readouts.argmax(axis=-1) exact_match = (tensor.eq(predicted_begins, answer_begins) * tensor.eq(predicted_ends, answer_ends)) application_call.add_auxiliary_variable(predicted_begins, name='predicted_begins') application_call.add_auxiliary_variable(predicted_ends, name='predicted_ends') application_call.add_auxiliary_variable(exact_match, name='exact_match') return begin_costs + end_costs def apply_with_default_vars(self): return self.apply(*self.input_vars.values())
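# Illustrative sketch (added; not from the original code): a hedged numpy
# version of the masking idiom used several times above. Masked logits are
# pushed to a large negative value so that the softmax assigns them
# (numerically) zero probability.
import numpy as np

def masked_softmax(logits, mask):
    logits = logits * mask - 1000.0 * (1 - mask)
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)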
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') context_bag = tensor.eq(context[:, :, None], tensor.arange(vocab_size)).sum(axis=1).clip( 0, 1) bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') # embed.weights_init = Constant(embeddings_initial_value) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' #embed size: 200, lstm_size = 256 #qenc: length * batch_size * (2*lstm_size) # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate( [ cembed, tensor.extra_ops.repeat( qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2 ) #length * batch_size * (embed+2*lstm_size) this is what goes into encoder clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' #cenc: length * batch_size * (2*lstm_size) #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() # for p in tparams.values(): # add_role(p, WEIGHT) # self.theano_params.append(p) #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, cenc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), 
                                 tensor.ones((n_samples,), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps,), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        # pick, at every decoding step, the probability the pointer network
        # assigned to the gold answer index
        probs = preds[idx_steps, ans_indices, idx_samples]
        # probs *= y_mask
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        # probs += (1 - y_mask)  # set masked positions to 1, since log(1) = 0
        probs += off  # numerical stability: avoid log(0)
        # probs_printed = theano.printing.Print('this is probs')(probs)
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        # average the negative log-likelihood over the unmasked steps of
        # each sequence, then over the batch
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
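
# A minimal numpy sketch (illustrative names, not part of the model above)
# of the indexing trick used for `probs`: with preds of shape
# (n_steps, context_length, n_samples), the two outer-product index grids
# select preds[t, ans_indices[t, b], b] for every step t and sample b, and
# the masked mean then gives a per-sequence negative log-likelihood.
import numpy

n_steps, ctx_len, n_samples = 3, 5, 2
preds = numpy.full((n_steps, ctx_len, n_samples), 0.2)  # uniform pointer distribution
ans_indices = numpy.array([[1, 4], [0, 2], [3, 3]])     # gold indices, (n_steps, n_samples)
mask = numpy.array([[1., 1.], [1., 1.], [1., 0.]])      # second sequence has only 2 steps

idx_steps = numpy.outer(numpy.arange(n_steps),
                        numpy.ones(n_samples, dtype='int64'))
idx_samples = numpy.outer(numpy.ones(n_steps, dtype='int64'),
                          numpy.arange(n_samples))
probs = preds[idx_steps, ans_indices, idx_samples]      # (n_steps, n_samples)

nll = -(numpy.log(probs + 1e-8) * mask).sum(axis=0) / mask.sum(axis=0)
print(nll.mean())  # ~log(5): the cost of a uniform pointer over 5 positions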
class DeepBidirectionalEncoder(Initializable):
    """This encoder is a multi-layered version of ``BidirectionalEncoder``
    where parameters between layers are not shared.
    """

    def __init__(self, vocab_size, embedding_dim, n_layers,
                 skip_connections, state_dim, **kwargs):
        """Sole constructor.

        Args:
            vocab_size (int): Source vocabulary size
            embedding_dim (int): Dimension of the embedding layer
            n_layers (int): Number of layers. Parameters are not shared
                            between layers
            skip_connections (bool): Skip connections connect the
                                     source word embeddings directly
                                     with deeper layers to propagate
                                     the gradient more efficiently
            state_dim (int): Number of hidden units in the recurrent
                             layers.
        """
        super(DeepBidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.state_dim = state_dim
        self.skip_connections = skip_connections
        self.lookup = LookupTable(name='embeddings')
        self.bidirs = []
        self.fwd_forks = []
        self.back_forks = []
        for i in xrange(self.n_layers):
            bidir = BidirectionalWMT15(
                GatedRecurrent(activation=Tanh(), dim=state_dim),
                name='bidir%d' % i)
            self.bidirs.append(bidir)
            self.fwd_forks.append(Fork(
                [name for name in bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='fwd_fork%d' % i))
            self.back_forks.append(Fork(
                [name for name in bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='back_fork%d' % i))
        self.children = [self.lookup] \
                        + self.bidirs \
                        + self.fwd_forks \
                        + self.back_forks

    def _push_allocation_config(self):
        """Sets the dimensions of the sub bricks. """
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.fwd_forks[0].input_dim = self.embedding_dim
        self.fwd_forks[0].output_dims = [
            self.bidirs[0].children[0].get_dim(name)
            for name in self.fwd_forks[0].output_names]
        self.back_forks[0].input_dim = self.embedding_dim
        self.back_forks[0].output_dims = [
            self.bidirs[0].children[1].get_dim(name)
            for name in self.back_forks[0].output_names]
        for i in xrange(1, self.n_layers):
            inp_dim = self.state_dim * 2
            if self.skip_connections:
                inp_dim += self.embedding_dim
            self.fwd_forks[i].input_dim = inp_dim
            self.fwd_forks[i].output_dims = [
                self.bidirs[i].children[0].get_dim(name)
                for name in self.fwd_forks[i].output_names]
            self.back_forks[i].input_dim = inp_dim
            self.back_forks[i].output_dims = [
                self.bidirs[i].children[1].get_dim(name)
                for name in self.back_forks[i].output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Produces source annotations with a stack of bidirectional
        RNN layers, optionally feeding the word embeddings into the
        deeper layers via skip connections.
        """
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T
        embeddings = self.lookup.apply(source_sentence)
        representation = self.bidirs[0].apply(
            merge(self.fwd_forks[0].apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}),
            merge(self.back_forks[0].apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}))
        for i in xrange(1, self.n_layers):
            if self.skip_connections:
                inp = tensor.concatenate([representation, embeddings],
                                         axis=2)
            else:
                inp = representation
            representation = self.bidirs[i].apply(
                merge(self.fwd_forks[i].apply(inp, as_dict=True),
                      {'mask': source_sentence_mask}),
                merge(self.back_forks[i].apply(inp, as_dict=True),
                      {'mask': source_sentence_mask}))
        return representation, source_sentence_mask
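
# A minimal usage sketch for DeepBidirectionalEncoder, assuming the usual
# Blocks imports are available; the hyperparameters (vocab size 30000,
# 620-dimensional embeddings, 2 layers of 1000 GRU units) are placeholders,
# not values from the original training script.
from blocks.initialization import IsotropicGaussian, Constant
from theano import tensor

encoder = DeepBidirectionalEncoder(
    vocab_size=30000, embedding_dim=620, n_layers=2,
    skip_connections=True, state_dim=1000,
    weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
encoder.initialize()  # pushes the init schemes down to all children

source_sentence = tensor.lmatrix('source')        # batch * time
source_sentence_mask = tensor.matrix('source_mask')
representation, representation_mask = encoder.apply(
    source_sentence, source_sentence_mask)
# representation: time * batch * (2 * state_dim)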
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')

    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))

    n_voc = len(train_dataset.dict.keys())

    # empirical distribution of sentence-initial words, used to seed the
    # sampler below; count how often each word w occurs in first position
    # (fixed: this previously summed the matching indices instead of
    # counting them)
    init_probs = numpy.array(
        [sum(1 for s in train_dataset.indexables[
            train_dataset.sources.index('features')] if s[0] == w)
         for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    # unused here: leftover forget-gate biases from an LSTM variant
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))

    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )

    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))

    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    # Gumbel-max sampling: argmax(log p + Gumbel noise) draws from the
    # categorical distribution p
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                states=states,
                                iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)
        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )

    sampling = theano.function([], samples.owner.inputs[0].T)

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
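
# A small self-contained numpy check (not from the script above) of the
# Gumbel-max trick the sampler relies on: argmax(log p + g), with
# g ~ Gumbel(0, 1) drawn as -log(-log(U)), is an exact sample from the
# categorical distribution p.
import numpy

rng = numpy.random.RandomState(0)
p = numpy.array([0.5, 0.3, 0.2])
n = 100000
u = rng.uniform(size=(n, p.shape[0]))
gumbel = -numpy.log(-numpy.log(u))
samples = (numpy.log(p) + gumbel).argmax(axis=-1)
# empirical frequencies approach p
print(numpy.bincount(samples, minlength=p.shape[0]) / float(n))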
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')

    x_int = x.astype(dtype='int32').T - 2
    train_dataset = IMDB()
    idx_sort = numpy.argsort(
        [len(s) for s in train_dataset.indexables[
            train_dataset.sources.index('features')]]
    )
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 10
    # the embedding is 4 * n_h wide because the LSTM brick expects its
    # input already projected to the four gates
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    # unused below: leftover forget-gate bias initialization
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = Bidirectional(LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    ))
    rnn.initialize()
    score_layer = Linear(
        input_dim=2 * n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    # note: no mask is passed to the LSTM here, and the mean below also
    # averages over padded time steps
    rnn_out = rnn.apply(embedding)
    rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0)

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs)
              + (1 - y) * tensor.log(1 - probs)
              ).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)
                         ).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()]
        )
    )

    # computed but unused below: the streams hard-code tiny index ranges
    # for a quick demonstration run
    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100, 110),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(110, 120),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], test_data_stream, prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], valid_data_stream, prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
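
# A minimal numpy sketch of mask-aware mean pooling, as an alternative to
# the plain tensor.mean over time used above; dividing by the per-example
# mask sum keeps padded steps from diluting the sentence representation.
# Names here are illustrative only.
import numpy

def masked_mean_pool(states, mask):
    # states: time * batch * features, mask: time * batch of 0/1
    summed = (states * mask[:, :, None]).sum(axis=0)
    return summed / mask.sum(axis=0)[:, None]

states = numpy.ones((4, 2, 3))
mask = numpy.array([[1., 1.], [1., 1.], [1., 0.], [1., 0.]])
print(masked_mean_pool(states, mask))  # all ones, regardless of padding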