class FeedbackRNN(BaseRecurrent):
    def __init__(self, dim, **kwargs):
        super(FeedbackRNN, self).__init__(**kwargs)
        self.dim = dim
        self.first_recurrent_layer = SimpleRecurrent(
            dim=self.dim, activation=Identity(), name='first_recurrent_layer',
            weights_init=initialization.Identity())
        self.second_recurrent_layer = SimpleRecurrent(
            dim=self.dim, activation=Identity(), name='second_recurrent_layer',
            weights_init=initialization.Identity())
        self.children = [self.first_recurrent_layer,
                         self.second_recurrent_layer]

    @recurrent(sequences=['inputs'], contexts=[],
               states=['first_states', 'second_states'],
               outputs=['first_states', 'second_states'])
    def apply(self, inputs, first_states=None, second_states=None):
        first_h = self.first_recurrent_layer.apply(
            inputs=inputs, states=first_states + second_states, iterate=False)
        second_h = self.second_recurrent_layer.apply(
            inputs=first_h, states=second_states, iterate=False)
        return first_h, second_h

    def get_dim(self, name):
        return (self.dim if name in ('inputs', 'first_states', 'second_states')
                else super(FeedbackRNN, self).get_dim(name))
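# A minimal usage sketch for the FeedbackRNN brick above (not part of the
# original snippet). It assumes the usual imports used elsewhere in these
# examples (theano, theano.tensor as tensor, numpy, blocks initialization);
# the toy input of shape (time, batch, dim) is purely illustrative.
x = tensor.tensor3('x')
feedback = FeedbackRNN(dim=3)
feedback.initialize()
first_h, second_h = feedback.apply(inputs=x)
f = theano.function([x], [first_h, second_h])
for states in f(numpy.ones((3, 1, 3), dtype=theano.config.floatX)):
    print(states)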
def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size,
             input_vocab_size, subword_RNN_hidden_state_size, add_one=True,
             **kwargs):
    super(CompositionalLayerToyBidirectional, self).__init__(**kwargs)
    self.batch_size = batch_size
    self.num_subwords = num_subwords  # number of subwords which make up a word
    self.num_words = num_words  # number of words in the sentence
    self.subword_embedding_size = subword_embedding_size
    self.input_vocab_size = input_vocab_size
    self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size
    self.add_one = add_one  # adds 1 to the backwards embeddings

    # create the lookup table
    self.lookup = LookupTable(length=self.input_vocab_size,
                              dim=self.subword_embedding_size,
                              name='input_lookup')
    self.lookup.weights_init = Uniform(width=0.08)
    self.lookup.biases_init = Constant(0)

    # has one RNN per direction which reads the subwords into a word embedding
    self.compositional_subword_to_word_RNN_forward = SimpleRecurrent(
        dim=self.subword_RNN_hidden_state_size, activation=Identity(),
        name='subword_RNN_forward', weights_init=Identity_init())

    self.compositional_subword_to_word_RNN_backward = SimpleRecurrent(
        dim=self.subword_RNN_hidden_state_size, activation=Identity(),
        name='subword_RNN_backward', weights_init=Identity_init())

    self.children = [self.lookup,
                     self.compositional_subword_to_word_RNN_forward,
                     self.compositional_subword_to_word_RNN_backward]
class TestBidirectional(unittest.TestCase):
    def setUp(self):
        self.bidir = Bidirectional(weights_init=Orthogonal(),
                                   prototype=SimpleRecurrent(
                                       dim=3, activation=Tanh()))
        self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                                      activation=Tanh(), seed=1)
        self.bidir.allocate()
        self.simple.initialize()
        self.bidir.children[0].params[0].set_value(
            self.simple.params[0].get_value())
        self.bidir.children[1].params[0].set_value(
            self.simple.params[0].get_value())
        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))), dtype=floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=floatX)
        self.mask_val[12:24, 3] = 0

    def test(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        calc_bidir = theano.function([x, mask],
                                     [self.bidir.apply(x, mask=mask)])
        calc_simple = theano.function([x, mask],
                                      [self.simple.apply(x, mask=mask)])
        h_bidir = calc_bidir(self.x_val, self.mask_val)[0]
        h_simple = calc_simple(self.x_val, self.mask_val)[0]
        h_simple_rev = calc_simple(self.x_val[::-1], self.mask_val[::-1])[0]

        assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
        assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)
def __init__(self, dim, **kwargs):
    super(FeedbackRNN, self).__init__(**kwargs)
    self.dim = dim
    self.first_recurrent_layer = SimpleRecurrent(
        dim=self.dim, activation=Identity(), name='first_recurrent_layer',
        weights_init=initialization.Identity())
    self.second_recurrent_layer = SimpleRecurrent(
        dim=self.dim, activation=Identity(), name='second_recurrent_layer',
        weights_init=initialization.Identity())
    self.children = [self.first_recurrent_layer,
                     self.second_recurrent_layer]
def example5():
    """Bidirectional + SimpleRecurrent. Adapted from a unit test in Blocks."""
    bidir = Bidirectional(weights_init=Orthogonal(),
                          prototype=SimpleRecurrent(
                              dim=3, activation=Tanh()))

    simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                             activation=Tanh(), seed=1)

    bidir.allocate()
    simple.initialize()

    bidir.children[0].parameters[0].set_value(
        simple.parameters[0].get_value())
    bidir.children[1].parameters[0].set_value(
        simple.parameters[0].get_value())

    # Initialize Theano variables and functions
    x = tensor.tensor3('x')
    mask = tensor.matrix('mask')

    calc_bidir = theano.function([x, mask],
                                 [bidir.apply(x, mask=mask)])
    calc_simple = theano.function([x, mask],
                                  [simple.apply(x, mask=mask)])

    # Testing time
    x_val = 0.1 * np.asarray(list(itertools.permutations(range(4))),
                             dtype=theano.config.floatX)
    x_val = (np.ones((24, 4, 3), dtype=theano.config.floatX) *
             x_val[..., None])
    mask_val = np.ones((24, 4), dtype=theano.config.floatX)
    mask_val[12:24, 3] = 0

    h_bidir = calc_bidir(x_val, mask_val)[0]
    h_simple = calc_simple(x_val, mask_val)[0]
    h_simple_rev = calc_simple(x_val[::-1], mask_val[::-1])[0]

    print(h_bidir)
    print(h_simple)
    print(h_simple_rev)
class MyRnn(BaseRecurrent):
    # Extend the base recurrent class to create one of your own
    def __init__(self, dim, **kwargs):
        super(MyRnn, self).__init__(**kwargs)
        self.dim = dim
        self.layer1 = SimpleRecurrent(dim=self.dim, activation=Identity(),
                                      name='recurrent layer 1',
                                      weights_init=initialization.Identity())
        self.layer2 = SimpleRecurrent(dim=self.dim, activation=Identity(),
                                      name='recurrent layer 2',
                                      weights_init=initialization.Identity())
        self.children = [self.layer1, self.layer2]

    def apply(self, inputs, first_states=None, second_states=None):
        first_h = self.layer1.apply(inputs=inputs, states=first_states,
                                    iterate=False)
        second_h = self.layer2.apply(inputs=first_h, states=second_states,
                                     iterate=False)
        return first_h, second_h

    def get_dim(self, name):  # BaseRecurrent.get_dim takes the dimension name
        pass
class TestSimpleRecurrent(unittest.TestCase):
    def setUp(self):
        self.simple = SimpleRecurrent(dim=3, weights_init=Constant(2),
                                      activation=Tanh())
        self.simple.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        mask = tensor.vector('mask')
        h1 = self.simple.apply(x, h0, mask=mask, iterate=False)
        next_h = theano.function(inputs=[h0, x, mask], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=theano.config.floatX)
        mask_val = numpy.array([1, 0]).astype(theano.config.floatX)
        h1_val = numpy.tanh(h0_val.dot(2 * numpy.ones((3, 3))) + x_val)
        h1_val = mask_val[:, None] * h1_val + (1 - mask_val[:, None]) * h0_val

        assert_allclose(h1_val, next_h(h0_val, x_val, mask_val)[0])

    def test_many_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        h = self.simple.apply(x, mask=mask, iterate=True)
        calc_h = theano.function(inputs=[x, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones((24, 4, 3),
                           dtype=theano.config.floatX) * x_val[..., None]
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        for i in range(1, 25):
            h_val[i] = numpy.tanh(h_val[i - 1].dot(
                2 * numpy.ones((3, 3))) + x_val[i - 1])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]

        assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04)

        # Also test that initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
def __init__(self, dim_in, dim_hidden, dim_out, **kwargs):
    self.dim_in = dim_in
    self.dim_hidden = dim_hidden
    self.dim_out = dim_out

    self.input_layer = Linear(
        input_dim=self.dim_in, output_dim=self.dim_hidden,
        weights_init=initialization.IsotropicGaussian(),
        biases_init=initialization.Constant(0))
    self.input_layer.initialize()

    sparse_init = initialization.Sparse(
        num_init=15, weights_init=initialization.IsotropicGaussian())
    self.recurrent_layer = SimpleRecurrent(
        dim=self.dim_hidden, activation=Tanh(),
        name="first_recurrent_layer",
        weights_init=sparse_init,
        biases_init=initialization.Constant(0.01))
    '''
    self.recurrent_layer = LSTM(
        dim=self.dim_hidden, activation=Tanh(),
        weights_init=initialization.IsotropicGaussian(std=0.001),
        biases_init=initialization.Constant(0.01))
    '''
    self.recurrent_layer.initialize()

    self.output_layer = Linear(
        input_dim=self.dim_hidden, output_dim=self.dim_out,
        weights_init=initialization.Uniform(width=0.01),
        biases_init=initialization.Constant(0.01))
    self.output_layer.initialize()

    self.children = [self.input_layer, self.recurrent_layer,
                     self.output_layer]
class CompositionalLayerToyWithTables(Initializable):
    def __init__(self, batch_size, num_subwords, num_words,
                 subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, **kwargs):
        super(CompositionalLayerToyWithTables, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.num_subwords = num_subwords  # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size

        # create the lookup table
        self.lookup = LookupTable(length=self.input_vocab_size,
                                  dim=self.subword_embedding_size,
                                  name='input_lookup')
        self.lookup.weights_init = Uniform(width=0.08)
        self.lookup.biases_init = Constant(0)

        # has one RNN which reads the subwords into a word embedding
        self.compositional_subword_to_word_RNN = SimpleRecurrent(
            dim=self.subword_RNN_hidden_state_size, activation=Identity(),
            name='subword_RNN', weights_init=Identity_init())

        self.children = [self.lookup, self.compositional_subword_to_word_RNN]

    '''
    subword_id_input_ is a 3d tensor of shape
    (num_words, num_subwords, batch_size), expected as dtype=uint16 or
    equivalent.

    subword_id_input_mask_ is a 3d tensor of shape
    (num_words, num_subwords, batch_size), expected as dtype=uint8 or
    equivalent, with binary values: 1 where there is data and 0 otherwise.

    The lookup table returns a 4d tensor of shape
    (num_words, num_subwords, batch_size, embedding_size).

    The RNN consumes the subword dimension, producing a 3d tensor of shape
    (num_words, batch_size, RNN_hidden_value_size), which is returned as
    'word_embeddings'.

    Also returned is a 2d tensor of shape (num_words, batch_size): the
    remaining mask indicating the length of each sentence in the batch,
    i.e. 1 where there is a word and 0 otherwise.
    '''
    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'],
                 outputs=['word_embeddings', 'word_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        # shape = (num_words, num_subwords, batch_size, embedding_size)
        subword_embeddings = self.lookup.apply(subword_id_input_)

        # loop over each word and have the RNN eat up the subwords
        result, updates = theano.scan(
            fn=lambda subword_embeddings, subword_id_input_mask_:
                self.compositional_subword_to_word_RNN.apply(
                    subword_embeddings, mask=subword_id_input_mask_),
            sequences=[subword_embeddings, subword_id_input_mask_])

        # put the states as the last dimension
        word_embeddings = result.dimshuffle(1, 0, 2, 3)
        # take only the last state, since we don't need the others
        # (remove this line to see all the RNN states)
        word_embeddings = word_embeddings[-1]

        # remove the subword dimension from the mask: if every subword is
        # empty then the word is empty, otherwise the word is used
        word_embeddings_mask = subword_id_input_mask_.max(axis=1)

        return word_embeddings, word_embeddings_mask
def test_saved_inner_graph():
    """Make sure that the original inner graph is saved."""
    x = tensor.tensor3()
    recurrent = SimpleRecurrent(dim=3, activation=Tanh())
    y = recurrent.apply(x)

    application_call = get_application_call(y)
    assert application_call.inner_inputs
    assert application_call.inner_outputs

    cg = ComputationGraph(application_call.inner_outputs)
    # Check that the inner scan graph is annotated
    # with `recurrent.apply`
    assert len(VariableFilter(application=recurrent.apply)(cg)) == 3
    # Check that the inner graph is equivalent to the one
    # produced by a stand-alone of `recurrent.apply`
    assert is_same_graph(application_call.inner_outputs[0],
                         recurrent.apply(*application_call.inner_inputs,
                                         iterate=False))
class TextRNN(object):
    def __init__(self, dim_in, dim_hidden, dim_out, **kwargs):
        self.dim_in = dim_in
        self.dim_hidden = dim_hidden
        self.dim_out = dim_out

        self.input_layer = Linear(
            input_dim=self.dim_in, output_dim=self.dim_hidden,
            weights_init=initialization.IsotropicGaussian(),
            biases_init=initialization.Constant(0))
        self.input_layer.initialize()

        sparse_init = initialization.Sparse(
            num_init=15, weights_init=initialization.IsotropicGaussian())
        self.recurrent_layer = SimpleRecurrent(
            dim=self.dim_hidden, activation=Tanh(),
            name="first_recurrent_layer",
            weights_init=sparse_init,
            biases_init=initialization.Constant(0.01))
        '''
        self.recurrent_layer = LSTM(
            dim=self.dim_hidden, activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(std=0.001),
            biases_init=initialization.Constant(0.01))
        '''
        self.recurrent_layer.initialize()

        self.output_layer = Linear(
            input_dim=self.dim_hidden, output_dim=self.dim_out,
            weights_init=initialization.Uniform(width=0.01),
            biases_init=initialization.Constant(0.01))
        self.output_layer.initialize()

        self.children = [self.input_layer, self.recurrent_layer,
                         self.output_layer]

    '''
    @recurrent(sequences=['inputs'], states=['states'],
               contexts=[], outputs=['states', 'output'])
    '''
    def run(self, inputs):
        output = self.output_layer.apply(
            self.recurrent_layer.apply(self.input_layer.apply(inputs)))
        return output
class LanguageModelToy(Initializable):
    """
    This takes the word embeddings from CompositionalLayerToyWithTables and
    creates sentence embeddings.

    Input is a 3d tensor with dimensions (num_words, num_subwords, batch_size)
    and a 3d tensor mask of size (num_words, num_subwords, batch_size).

    All hidden state sizes are the same as the subword embedding size.

    This returns a 3d tensor with dimensions
    (num_words = num RNN states, batch_size, sentence embedding size).
    """

    def __init__(self, batch_size, num_subwords, num_words,
                 subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, LM_RNN_hidden_state_size,
                 **kwargs):
        super(LanguageModelToy, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.num_subwords = num_subwords  # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size
        self.LM_RNN_hidden_state_size = LM_RNN_hidden_state_size

        self.compositional_layer = CompositionalLayerToyWithTables(
            self.batch_size, self.num_subwords, self.num_words,
            self.subword_embedding_size, self.input_vocab_size,
            self.subword_RNN_hidden_state_size, name='compositional_layer')

        # has one RNN which reads the word embeddings into a sentence embedding
        self.language_model_RNN = SimpleRecurrent(
            dim=self.LM_RNN_hidden_state_size, activation=Identity(),
            name='language_model_RNN', weights_init=Identity_init())

        self.children = [self.compositional_layer, self.language_model_RNN]

    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'],
                 outputs=['sentence_embeddings', 'sentence_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        """
        subword_id_input_ is a 3d tensor of shape
        (num_words, num_subwords, batch_size), expected as dtype=uint16 or
        equivalent.

        subword_id_input_mask_ is a 3d tensor of shape
        (num_words, num_subwords, batch_size), expected as dtype=uint8 or
        equivalent, with binary values: 1 where there is data and 0 otherwise.

        Returned is a 3d tensor of size
        (num_words = num RNN states, batch_size, sentence embedding size).

        Also returned is a 1d tensor of size (batch_size) describing whether
        each sentence in the batch is valid or empty.
        """
        word_embeddings, word_embeddings_mask = self.compositional_layer.apply(
            subword_id_input_, subword_id_input_mask_)
        sentence_embeddings = self.language_model_RNN.apply(
            word_embeddings, mask=word_embeddings_mask)
        sentence_embeddings_mask = word_embeddings_mask.max(axis=0).T

        return sentence_embeddings, sentence_embeddings_mask
def example():
    """Simple recurrent example. Taken from:
    https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb
    """
    x = tensor.tensor3('x')

    rnn = SimpleRecurrent(dim=3, activation=Identity(),
                          weights_init=initialization.Identity())
    rnn.initialize()
    h = rnn.apply(x)

    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=3, output_dim=3,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    h_doubler = rnn.apply(doubler.apply(x))

    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    # Initial state
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)

    f = theano.function([x, h0], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX)))
def test_saved_inner_graph():
    """Make sure that the original inner graph is saved."""
    x = tensor.tensor3()
    recurrent = SimpleRecurrent(dim=3, activation=Tanh())
    y = recurrent.apply(x)

    application_call = get_application_call(y)
    assert application_call.inner_inputs
    assert application_call.inner_outputs

    cg = ComputationGraph(application_call.inner_outputs)
    # Check that the inner scan graph is annotated
    # with `recurrent.apply`
    assert len(VariableFilter(applications=[recurrent.apply])(cg)) == 3
    # Check that the inner graph is equivalent to the one
    # produced by a stand-alone of `recurrent.apply`
    assert is_same_graph(application_call.inner_outputs[0],
                         recurrent.apply(*application_call.inner_inputs,
                                         iterate=False))
def setUp(self):
    self.bidir = Bidirectional(weights_init=Orthogonal(),
                               prototype=SimpleRecurrent(
                                   dim=3, activation=Tanh()))
    self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                                  activation=Tanh(), seed=1)
    self.bidir.allocate()
    self.simple.initialize()
    self.bidir.children[0].parameters[0].set_value(
        self.simple.parameters[0].get_value())
    self.bidir.children[1].parameters[0].set_value(
        self.simple.parameters[0].get_value())
    self.x_val = 0.1 * numpy.asarray(
        list(itertools.permutations(range(4))),
        dtype=theano.config.floatX)
    self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) *
                  self.x_val[..., None])
    self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
    self.mask_val[12:24, 3] = 0
def __init__(self, batch_size, num_subwords, num_words,
             subword_embedding_size, input_vocab_size,
             subword_RNN_hidden_state_size, LM_RNN_hidden_state_size,
             **kwargs):
    super(LanguageModelToy, self).__init__(**kwargs)
    self.batch_size = batch_size
    self.num_subwords = num_subwords  # number of subwords which make up a word
    self.num_words = num_words  # number of words in the sentence
    self.subword_embedding_size = subword_embedding_size
    self.input_vocab_size = input_vocab_size
    self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size
    self.LM_RNN_hidden_state_size = LM_RNN_hidden_state_size

    self.compositional_layer = CompositionalLayerToyWithTables(
        self.batch_size, self.num_subwords, self.num_words,
        self.subword_embedding_size, self.input_vocab_size,
        self.subword_RNN_hidden_state_size, name='compositional_layer')

    # has one RNN which reads the word embeddings into a sentence embedding
    self.language_model_RNN = SimpleRecurrent(
        dim=self.LM_RNN_hidden_state_size, activation=Identity(),
        name='language_model_RNN', weights_init=Identity_init())

    self.children = [self.compositional_layer, self.language_model_RNN]
def __init__(self, input_dim, state_dim, activation=Tanh(),
             state_weights_init=None, input_weights_init=None,
             biases_init=None, **kwargs):
    super(SimpleRecurrentLayer, self).__init__(biases_init=biases_init,
                                               **kwargs)
    if state_weights_init is None:
        state_weights_init = init.IsotropicGaussian(0.01)
    if input_weights_init is None:
        input_weights_init = init.IsotropicGaussian(0.01)
    if biases_init is None:
        biases_init = init.Constant(0)
    self.input_transformation = Linear(input_dim=input_dim,
                                       output_dim=state_dim,
                                       weights_init=input_weights_init,
                                       biases_init=biases_init)
    self.rnn = SimpleRecurrent(dim=state_dim, activation=activation,
                               weights_init=state_weights_init)
    self.children = [self.input_transformation, self.rnn]
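# The snippet above shows only the constructor. A plausible, hypothetical
# `apply` method for such a wrapper brick (not part of the original code),
# assuming Blocks' `application` decorator is imported, could be:
@application(inputs=['inputs', 'mask'], outputs=['states'])
def apply(self, inputs, mask=None):
    # project the input sequence into the state dimension, then run the RNN
    return self.rnn.apply(self.input_transformation.apply(inputs), mask=mask)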
def __init__(self, dimension, alphabet_size, **kwargs):
    super(SimpleGenerator, self).__init__(**kwargs)
    lookup = LookupTable(alphabet_size, dimension)
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension,
                                 name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states, attended_dim=dimension,
        match_dim=dimension, name="attention")
    readout = Readout(
        readout_dim=alphabet_size,
        source_names=[transition.apply.states[0],
                      attention.take_glimpses.outputs[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  attention=attention, name="generator")

    self.lookup = lookup
    self.generator = generator
    self.children = [lookup, generator]
def __init__(self, dims=(88, 100, 100), **kwargs):
    super(Rnn, self).__init__(**kwargs)
    self.dims = dims

    self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1],
                                  weights_init=IsotropicGaussian(0.01),
                                  # biases_init=Constant(0.0),
                                  use_bias=False,
                                  name="input_transfrom")

    self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(),
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0.0),
                                     use_bias=True,
                                     name="gru_rnn_layer")

    # TODO: find a way to automatically set the output dim in case of
    # lstm vs normal rnn
    self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4,
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0),
                               use_bias=False,
                               name="h2h_transform")

    self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(),
                           weights_init=IsotropicGaussian(0.01),
                           biases_init=Constant(0.0),
                           use_bias=True,
                           name="lstm_rnn_layer")

    self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]],
                             weights_init=IsotropicGaussian(0.01),
                             use_bias=True,
                             biases_init=Constant(0.0),
                             name="out_layer")

    self.children = [self.input_transform, self.gru_layer, self.linear_trans,
                     self.lstm_layer, self.out_transform]
def sgd(cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * learning_rate])
    return updates


# Computational graph
input = T.tensor3('input')
mask = T.fmatrix('mask')
target = T.tensor3('target')

linear1 = Linear(name='linear1', input_dim=300, output_dim=128)
recurrent = SimpleRecurrent(name='recurrent', activation=Tanh(), dim=128)
linear2 = Linear(name='linear2', input_dim=128, output_dim=9)
softmax = Softmax()

bricks = [linear1, recurrent, linear2]
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()

linear1_output = linear1.apply(input)
recurrent_output = recurrent.apply(linear1_output, mask=mask)
linear2_output = linear2.apply(recurrent_output)
shape = linear2_output.shape  # 100 * 29 * 9
# softmax over the last dimension (the one of size 9), keeping all other
# dimensions unchanged
output = softmax.apply(linear2_output.reshape((-1, 9))).reshape(shape)
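# A hypothetical continuation (not in the original snippet) showing how the
# `sgd` helper above might be wired into a training function. The cost
# expression and the `learning_rate` value are assumptions for illustration.
import theano
from blocks.graph import ComputationGraph

learning_rate = 0.01  # assumed value, referenced inside sgd()
# masked cross-entropy between the softmax output and one-hot targets
ce = T.nnet.categorical_crossentropy(output.reshape((-1, 9)),
                                     target.reshape((-1, 9)))
cost = (ce * mask.flatten()).sum() / mask.sum()
params = ComputationGraph(cost).parameters
train_fn = theano.function([input, mask, target], cost,
                           updates=sgd(cost, params))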
class Rnn(Initializable, BaseRecurrent): def __init__(self, dims=(88, 100, 100), **kwargs): super(Rnn, self).__init__(**kwargs) self.dims = dims self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1], weights_init=IsotropicGaussian(0.01), # biases_init=Constant(0.0), use_bias=False, name="input_transfrom") self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="gru_rnn_layer") # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=False, name="h2h_transform") self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="lstm_rnn_layer") self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]], weights_init=IsotropicGaussian(0.01), use_bias=True, biases_init=Constant(0.0), name="out_layer") self.children = [self.input_transform, self.gru_layer, self.linear_trans, self.lstm_layer, self.out_transform] # @recurrent(sequences=['inputs', 'input_mask'], contexts=[], # states=['gru_state', 'lstm_state', 'lstm_cells'], # outputs=['gru_state', 'lstm_state', 'lstm_cells']) def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None): input_transform = self.input_transform.apply(inputs) gru_state = self.gru_layer.apply( inputs=input_transform, # update_inputs=input_transform, # reset_inputs=input_transform, states=gru_state, mask=mask, iterate=False) lstm_transform = self.linear_trans.apply(gru_state) lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, cells=lstm_cells, mask=mask, iterate=False) return gru_state, lstm_state, lstm_cells @recurrent(sequences=[], contexts=[], states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'], outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells']) def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None, lstm_cells=None): output = self.apply(inputs=inputs, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells, iterate=False) return output, gru_state, lstm_state, lstm_cells @recurrent(sequences=['inputs', 'mask'], contexts=[], states=['gru_state', 'lstm_state', 'lstm_cells'], outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells']) def apply(self, inputs, mask, gru_state=None, lstm_state=None, lstm_cells=None): # input_transform = self.input_transform.apply(inputs) # gru_state = self.gru_layer.apply( # inputs=input_transform, # mask=mask, # states=gru_state, # iterate=False) # lstm_transform = self.linear_trans.apply(gru_state) # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, # cells=lstm_cells, # mask=mask, iterate=False) gru_state, lstm_state, lstm_cells = self.rnn_apply(inputs=inputs, mask=mask, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells) output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None] return output, gru_state, lstm_state, lstm_cells def get_dim(self, name): dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims)) dims['lstm_cells'] = dims['lstm_state'] return dims.get(name, None) or super(Rnn, self).get_dim(name)
def construct_model(vocab_size, embedding_dim, hidden_dim, activation): # Construct the model x = tensor.lmatrix('features') x_mask = tensor.fmatrix('features_mask') y = tensor.lmatrix('targets') # Batch X Time y_mask = tensor.fmatrix('targets_mask') # Batch X Time frequency_mask = tensor.fmatrix('frequency_mask') frequency_mask_mask = tensor.fmatrix('frequency_mask_mask') # Only for the validation last_word = tensor.lvector('last_word') lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup') linear = Linear(input_dim=embedding_dim, output_dim=hidden_dim, name="linear") hidden = SimpleRecurrent(dim=hidden_dim, activation=activation, name='hidden_recurrent') top_linear = Linear(input_dim=hidden_dim, output_dim=vocab_size, name="top_linear") # Return 3D Tensor: Batch X Time X embedding_dim embeddings = lookup.apply(x) # Give time as the first index: Time X Batch X embedding_dim embeddings = embeddings.dimshuffle(1, 0, 2) pre_recurrent = linear.apply(embeddings) after_recurrent = hidden.apply(inputs=pre_recurrent, mask=x_mask.T)[:-1] after_recurrent_last = after_recurrent[-1] presoft = top_linear.apply(after_recurrent) # Define the cost # Give y as a vector and reshape presoft to 2D tensor y = y.flatten() shape = presoft.shape presoft = presoft.dimshuffle(1, 0, 2) presoft = presoft.reshape((shape[0] * shape[1], shape[2])) # Build cost_matrix presoft = presoft - presoft.max(axis=1).dimshuffle(0, 'x') log_prob = presoft - \ tensor.log(tensor.exp(presoft).sum(axis=1).dimshuffle(0, 'x')) flat_log_prob = log_prob.flatten() range_ = tensor.arange(y.shape[0]) flat_indices = y + range_ * presoft.shape[1] cost_matrix = flat_log_prob[flat_indices] # Mask useless values from the cost_matrix cost_matrix = - cost_matrix * \ y_mask.flatten() * frequency_mask.flatten() * \ frequency_mask_mask.flatten() # Average the cost cost = cost_matrix.sum() cost = cost / (y_mask * frequency_mask).sum() # Initialize parameters for brick in (lookup, linear, hidden, top_linear): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.) brick.initialize() return cost
def __init__(self, rnn_dims, num_actions, data_X_np=None, data_y_np=None, width=32, height=32): ############################################################### # # Network and data setup # ############################################################## RNN_DIMS = 100 NUM_ACTIONS = num_actions tensor5 = T.TensorType('float32', [False, True, True, True, True]) self.x = T.tensor4('features') self.reward = T.tensor3('targets', dtype='float32') self.state = T.matrix('states', dtype='float32') self.hidden_states = [] # holds hidden states in np array form #data_X & data_Y supplied in init function now... if data_X_np is None or data_y_np is None: print 'you did not supply data at init' data_X_np = np.float32(np.random.normal(size=(1280, 1,1, width, height))) data_y_np = np.float32(np.random.normal(size=(1280, 1,1,1))) #data_states_np = np.float32(np.ones((1280, 1, 100))) state_shape = (data_X_np.shape[0],rnn_dims) self.data_states_np = np.float32(np.zeros(state_shape)) self.datastream = IterableDataset(dict(features=data_X_np, targets=data_y_np, states=self.data_states_np)).get_example_stream() self.datastream_test = IterableDataset(dict(features=data_X_np, targets=data_y_np, states=self.data_states_np)).get_example_stream() data_X = self.datastream # 2 conv inputs # we want to take our sequence of input images and convert them to convolutional # representations conv_layers = [ConvolutionalLayer(Rectifier().apply, (3, 3), 16, (2, 2), name='l1'), ConvolutionalLayer(Rectifier().apply, (3, 3), 32, (2, 2), name='l2'), ConvolutionalLayer(Rectifier().apply, (3, 3), 64, (2, 2), name='l3'), ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l4'), ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l5'), ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l6')] convnet = ConvolutionalSequence(conv_layers, num_channels=4, image_size=(width, height), weights_init=init.Uniform(0, 0.01), biases_init=init.Constant(0.0), tied_biases=False, border_mode='full') convnet.initialize() output_dim = np.prod(convnet.get_dim('output')) conv_out = convnet.apply(self.x) reshape_dims = (conv_out.shape[0], conv_out.shape[1]*conv_out.shape[2]*conv_out.shape[3]) hidden_repr = conv_out.reshape(reshape_dims) conv2rnn = Linear(input_dim=output_dim, output_dim=RNN_DIMS, weights_init=init.Uniform(width=0.01), biases_init=init.Constant(0.)) conv2rnn.initialize() conv2rnn_output = conv2rnn.apply(hidden_repr) # RNN hidden layer # then we want to feed those conv representations into an RNN rnn = SimpleRecurrent(dim=RNN_DIMS, activation=Rectifier(), weights_init=init.Uniform(width=0.01)) rnn.initialize() self.learned_state = rnn.apply(inputs=conv2rnn_output, states=self.state, iterate=False) # linear output from hidden layer # the RNN has two outputs, but only this one has a target. That is, this is "expected return" # which the network attempts to minimize difference between expected return and actual return lin_output = Linear(input_dim=RNN_DIMS, output_dim=1, weights_init=init.Uniform(width=0.01), biases_init=init.Constant(0.)) lin_output.initialize() self.exp_reward = lin_output.apply(self.learned_state) self.get_exp_reward = theano.function([self.x, self.state], self.exp_reward) # softmax output from hidden layer # this provides a softmax of action recommendations # the hypothesis is that adjusting the other outputs magically influences this set of outputs # to suggest smarter (or more realistic?) 
moves action_output = Linear(input_dim=RNN_DIMS, output_dim=NUM_ACTIONS, weights_init=init.Constant(.001), biases_init=init.Constant(0.)) action_output.initialize() self.suggested_actions = Softmax().apply(action_output.apply(self.learned_state[-1])) ###################### # use this to get suggested actions... it requires the state of the hidden units from the previous # timestep ##################### self.get_suggested_actions = theano.function([self.x, self.state], [self.suggested_actions, self.learned_state])
def main(mode, save_path, num_batches, data_path=None): # Experiment configuration dimension = 100 readout_dimension = len(char2code) # Build bricks encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()), weights_init=Orthogonal()) fork = Fork( [name for name in encoder.prototype.apply.sequences if name != 'mask'], weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) fork.input_dim = dimension fork.output_dims = {name: dimension for name in fork.input_names} lookup = LookupTable(readout_dimension, dimension, weights_init=IsotropicGaussian(0.1)) transition = SimpleRecurrent(activation=Tanh(), dim=dimension, name="transition") attention = SequenceContentAttention(state_names=transition.apply.states, sequence_dim=2 * dimension, match_dim=dimension, name="attention") readout = LinearReadout(readout_dim=readout_dimension, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback(readout_dimension, dimension), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = DataStreamMapping( mapping=_transpose, data_stream=PaddingDataStream( BatchDataStream( iteration_scheme=ConstantScheme(10), data_stream=DataStreamMapping( mapping=reverse_words, add_sources=("targets", ), data_stream=DataStreamFilter( predicate=_filter_long, data_stream=dataset.get_default_stream()))))) # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = generator.cost( targets, targets_mask, attended=encoder.apply(**dict_union(fork.apply( lookup.lookup(chars), return_dict=True), mask=chars_mask)), attended_mask=chars_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) params = model.get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Fetch variables useful for debugging max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") cg = ComputationGraph(cost) (energies, ) = VariableFilter(application=readout.readout, name="output")(cg.variables) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") (activations, ) = VariableFilter( application=generator.transition.apply, name="states")(cg.variables) mean_activation = named_copy( abs(activations).mean(), "mean_activation") # Define the training algorithm. 
algorithm = GradientDescent(cost=cost, step_rule=CompositeRule( [StepClipping(10.0), Scale(0.01)])) # More variables for debugging observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm ] for name, param in params.items(): observables.append(named_copy(param.norm(2), name + "_norm")) observables.append( named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring(observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_every_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches).add_condition( "after_batch", _is_nan), Plot(os.path.basename(save_path), [[average_monitoring.record_name(cost)], [average_monitoring.record_name(cost_per_character)]], every_n_batches=10), SerializeMainLoop(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1) ]) main_loop.run() elif mode == "test": logger.info("Model is loaded") chars = tensor.lmatrix("features") generated = generator.generate( n_steps=3 * chars.shape[0], batch_size=chars.shape[1], attended=encoder.apply(**dict_union( fork.apply(lookup.lookup(chars), return_dict=True))), attended_mask=tensor.ones(chars.shape)) model = Model(generated) model.set_param_values(load_parameter_values(save_path)) sample_function = model.get_theano_function() logging.info("Sampling function is compiled") while True: # Python 2-3 compatibility line = input("Enter a sentence\n") batch_size = int(input("Enter a number of samples\n")) encoded_input = [ char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip() ] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input, ))[0] print("Target: ", target) states, samples, glimpses, weights, costs = sample_function( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for i in range(samples.shape[1]): sample = list(samples[:, i]) try: true_length = sample.index(char2code['</S>']) + 1 except ValueError: true_length = len(sample) sample = sample[:true_length] cost = costs[:true_length, i].sum() message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
def test_sequence_generator_with_lm(): floatX = theano.config.floatX rng = numpy.random.RandomState(1234) readout_dim = 5 feedback_dim = 3 dim = 20 batch_size = 30 n_steps = 10 transition = GatedRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal()) language_model = SequenceGenerator(Readout( readout_dim=readout_dim, source_names=["states"], emitter=SoftmaxEmitter(theano_seed=1234), feedback_brick=LookupFeedback(readout_dim, dim, name='feedback')), SimpleRecurrent(dim, Tanh()), name='language_model') generator = SequenceGenerator(Readout( readout_dim=readout_dim, source_names=["states", "lm_states"], emitter=SoftmaxEmitter(theano_seed=1234), feedback_brick=LookupFeedback(readout_dim, feedback_dim)), transition, language_model=language_model, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), seed=1234) generator.initialize() # Test 'cost_matrix' method y = tensor.lmatrix('y') y.tag.test_value = numpy.zeros((15, batch_size), dtype='int64') mask = tensor.matrix('mask') mask.tag.test_value = numpy.ones((15, batch_size)) costs = generator.cost_matrix(y, mask) assert costs.ndim == 2 costs_fun = theano.function([y, mask], [costs]) y_test = rng.randint(readout_dim, size=(n_steps, batch_size)) m_test = numpy.ones((n_steps, batch_size), dtype=floatX) costs_val = costs_fun(y_test, m_test)[0] assert costs_val.shape == (n_steps, batch_size) assert_allclose(costs_val.sum(), 483.153, rtol=1e-5) # Test 'cost' method cost = generator.cost(y, mask) assert cost.ndim == 0 cost_val = theano.function([y, mask], cost)(y_test, m_test) assert_allclose(cost_val, 16.105, rtol=1e-5) # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method cg = ComputationGraph([cost]) var_filter = VariableFilter(roles=[AUXILIARY]) aux_var_name = '_'.join( [generator.name, generator.cost.name, 'per_sequence_element']) cost_per_el = [ el for el in var_filter(cg.variables) if el.name == aux_var_name ][0] assert cost_per_el.ndim == 0 cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test) assert_allclose(cost_per_el_val, 1.61051, rtol=1e-5) # Test generate states, outputs, lm_states, costs = generator.generate( iterate=True, batch_size=batch_size, n_steps=n_steps) cg = ComputationGraph([states, outputs, costs]) states_val, outputs_val, costs_val = theano.function( [], [states, outputs, costs], updates=cg.updates)() assert states_val.shape == (n_steps, batch_size, dim) assert outputs_val.shape == (n_steps, batch_size) assert outputs_val.dtype == 'int64' assert costs_val.shape == (n_steps, batch_size) assert_allclose(states_val.sum(), -4.88367, rtol=1e-5) assert_allclose(costs_val.sum(), 486.681, rtol=1e-5) assert outputs_val.sum() == 627 # Test masks agnostic results of cost cost1 = costs_fun([[1], [2]], [[1], [1]])[0] cost2 = costs_fun([[3, 1], [4, 2], [2, 0]], [[1, 1], [1, 1], [1, 0]])[0] assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def setUp(self):
    self.simple = SimpleRecurrent(dim=3, weights_init=Constant(2),
                                  activation=Tanh())
    self.simple.initialize()
def test_sequence_generator(): """Test a sequence generator with no contexts and continuous outputs. Such sequence generators can be used to model e.g. dynamical systems. """ rng = numpy.random.RandomState(1234) output_dim = 1 dim = 20 batch_size = 30 n_steps = 10 transition = SimpleRecurrent(activation=Tanh(), dim=dim, weights_init=Orthogonal()) generator = SequenceGenerator( Readout(readout_dim=output_dim, source_names=["states"], emitter=TestEmitter()), transition, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0), seed=1234) generator.initialize() # Test 'cost_matrix' method y = tensor.tensor3('y') mask = tensor.matrix('mask') costs = generator.cost_matrix(y, mask) assert costs.ndim == 2 y_test = rng.uniform(size=(n_steps, batch_size, output_dim)).astype(floatX) m_test = numpy.ones((n_steps, batch_size), dtype=floatX) costs_val = theano.function([y, mask], [costs])(y_test, m_test)[0] assert costs_val.shape == (n_steps, batch_size) assert_allclose(costs_val.sum(), 115.593, rtol=1e-5) # Test 'cost' method cost = generator.cost(y, mask) assert cost.ndim == 0 cost_val = theano.function([y, mask], [cost])(y_test, m_test) assert_allclose(cost_val, 3.8531, rtol=1e-5) # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method cg = ComputationGraph([cost]) var_filter = VariableFilter(roles=[AUXILIARY]) aux_var_name = '_'.join([generator.name, generator.cost.name, 'per_sequence_element']) cost_per_el = [el for el in var_filter(cg.variables) if el.name == aux_var_name][0] assert cost_per_el.ndim == 0 cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test) assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5) # Test 'generate' method states, outputs, costs = [variable.eval() for variable in generator.generate( states=rng.uniform( size=(batch_size, dim)).astype(floatX), iterate=True, batch_size=batch_size, n_steps=n_steps)] assert states.shape == (n_steps, batch_size, dim) assert outputs.shape == (n_steps, batch_size, output_dim) assert costs.shape == (n_steps, batch_size) assert_allclose(outputs.sum(), -0.33683, rtol=1e-5) assert_allclose(states.sum(), 15.7909, rtol=1e-5) # There is no generation cost in this case, since generation is # deterministic assert_allclose(costs.sum(), 0.0)
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') x_int = x.astype(dtype='int32').T train_dataset = TextFile('inspirational.txt') train_dataset.indexables[0] = numpy.array(sorted( train_dataset.indexables[0], key=len )) n_voc = len(train_dataset.dict.keys()) init_probs = numpy.array( [sum(filter(lambda idx:idx == w, [s[0] for s in train_dataset.indexables[ train_dataset.sources.index('features')]] )) for w in xrange(n_voc)], dtype=theano.config.floatX ) init_probs = init_probs / init_probs.sum() n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = SimpleRecurrent( dim=n_h, activation=Tanh(), weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=n_voc, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = (linear_embedding.apply(x_int[:-1]) * tensor.shape_padright(m.T[1:])) rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:]) probs = softmax( sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0] ) idx_mask = m.T[1:].nonzero() cost = CategoricalCrossEntropy().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) cost.name = 'cost' misclassification = MisclassificationRate().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=Adam() ) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=train_dataset.num_examples, batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) batch_size = 10 length = 30 trng = MRG_RandomStreams(18032015) u = trng.uniform(size=(length, batch_size, n_voc)) gumbel_noise = -tensor.log(-tensor.log(u)) init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) + gumbel_noise[0]).argmax(axis=-1) init_states = rnn.initial_state('states', batch_size) def sampling_step(g_noise, states, samples_step): embedding_step = linear_embedding.apply(samples_step) next_states = rnn.apply(inputs=embedding_step, states=states, iterate=False) probs_step = softmax(score_layer.apply(next_states)) next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1) return next_states, next_samples [_, samples], _ = theano.scan( fn=sampling_step, sequences=[gumbel_noise[1:]], outputs_info=[init_states, init_samples] ) sampling = theano.function([], samples.owner.inputs[0].T) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification']], titles=['Costs'])) extensions.append(PlotManager('Language modelling example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) extensions.append(PrintSamples(sampler=sampling, voc=train_dataset.inv_dict)) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
    so let's think about sizes of the arrays...
    """
    x = tensor.matrix('tokens', dtype="int32")
    x_mask = tensor.matrix('tokens_mask', dtype=floatX)

    # rnn.apply(inputs=input_to_hidden.apply(x), mask=x_mask)

    lookup = LookupTable(vocab_size, embedding_dim)

    x_extra = tensor.tensor3('extras', dtype=floatX)

    rnn = Bidirectional(
        SimpleRecurrent(
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0),
        ),
    )

    ### Will need to reshape the rnn outputs to produce suitable input here...
    gather = Linear(name='hidden_to_output', input_dim=hidden_dim * 2,
                    output_dim=labels_size,
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0))

    p_labels = Softmax()

    ## Let's initialize the variables
    lookup.allocate()
    # print("lookup.parameters=", lookup.parameters)
    # ('lookup.parameters=', [W])
floatX = theano.config.floatX

n_epochs = 30
x_dim = 1
h_dim = 100
o_dim = 10
batch_size = 50

print 'Building model ...'
# T x B x F
x = tensor.tensor3('x', dtype=floatX)
y = tensor.tensor3('y', dtype='int32')

x_to_h1 = Linear(name='x_to_h1',
                 input_dim=x_dim,
                 output_dim=h_dim)
pre_rnn = x_to_h1.apply(x)
rnn = SimpleRecurrent(activation=Rectifier(),
                      dim=h_dim, name="rnn")
h1 = rnn.apply(pre_rnn)
h1_to_o = Linear(name='h1_to_o',
                 input_dim=h_dim,
                 output_dim=o_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'
def rnn_layer(dim, h, n):
    linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n))
    rnn = SimpleRecurrent(dim=dim, activation=Tanh(), name='rnn' + str(n))
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))
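# A hedged usage sketch (not part of the original): stacking two recurrent
# layers with the helper above, assuming `x` is a (time, batch, dim) input
# variable with dim matching the layer size and `initialize` is the project's
# own initialization helper.
x = tensor.tensor3('features')
h1 = rnn_layer(dim=100, h=x, n=1)
h2 = rnn_layer(dim=100, h=h1, n=2)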
def main(model_path, recurrent_type): dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) dataset = OneBillionWord("training", [99], **dataset_options) data_stream = dataset.get_example_stream() data_stream = Filter(data_stream, _filter_long) data_stream = Mapping(data_stream, _make_target, add_sources=('target',)) data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100)) data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) features = tensor.lmatrix('features') features_mask = tensor.matrix('features_mask') target = tensor.lmatrix('target') target_mask = tensor.matrix('target_mask') dim = 100 lookup = LookupTable(len(all_chars), dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) if recurrent_type == 'lstm': rnn = LSTM(dim / 4, Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) elif recurrent_type == 'simple': rnn = SimpleRecurrent(dim, Tanh()) rnn = Bidirectional(rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) else: raise ValueError('Not known RNN type') rnn.initialize() lookup.initialize() y_hat = rnn.apply(lookup.apply(features), mask=features_mask) print len(all_chars) linear = Linear(2 * dim, len(all_chars), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.)) linear.initialize() y_hat = linear.apply(y_hat) seq_lenght = y_hat.shape[0] batch_size = y_hat.shape[1] y_hat = Softmax().apply(y_hat.reshape((seq_lenght * batch_size, -1))).reshape(y_hat.shape) cost = CategoricalCrossEntropy().apply( target.flatten(), y_hat.reshape((-1, len(all_chars)))) * seq_lenght * batch_size cost.name = 'cost' cost_per_character = cost / features_mask.sum() cost_per_character.name = 'cost_per_character' cg = ComputationGraph([cost, cost_per_character]) model = Model(cost) algorithm = GradientDescent(step_rule=Adam(), cost=cost, params=cg.parameters) train_monitor = TrainingDataMonitoring( [cost, cost_per_character], prefix='train', after_batch=True) extensions = [train_monitor, Printing(every_n_batches=40), Dump(model_path, every_n_batches=200), #Checkpoint('rnn.pkl', every_n_batches=200) ] main_loop = MainLoop(model=model, algorithm=algorithm, data_stream=data_stream, extensions=extensions) main_loop.run()
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(
    name='linear_input',
    input_dim=hidden_layer_dim,
    output_dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
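# A hypothetical continuation (not in the original snippet) chaining the
# bricks defined above; `x` is assumed to be the token matrix that is fed to
# `lookup_input` at the end of the snippet.
activation_linear = linear_input.apply(activation_input)
hidden_states = rnn.apply(activation_linear)
activation_output = linear_output.apply(hidden_states)
# NDimensionalSoftmax can normalise over the last axis of a 3d tensor
y_hat = softmax.apply(activation_output, extra_ndim=1)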
inputs_mask = numpy.max(data[b'mask_inputs'], axis=-1)
labels_mask = data[b'mask_labels']

print('Building model ...')

# T x B x F
x = tensor.tensor3('x', dtype=floatX)
# T x B
x_mask = tensor.matrix('x_mask', dtype=floatX)
# L x B
y = tensor.matrix('y', dtype=floatX)
# L x B
y_mask = tensor.matrix('y_mask', dtype=floatX)

x_to_h = Linear(name='x_to_h',
                input_dim=x_dim,
                output_dim=h_dim)
x_transform = x_to_h.apply(x)
rnn = SimpleRecurrent(activation=Tanh(),
                      dim=h_dim, name="rnn")
h = rnn.apply(x_transform)
h_to_o = Linear(name='h_to_o',
                input_dim=h_dim,
                output_dim=num_classes + 1)
h_transform = h_to_o.apply(h)

# T x B x C+1
y_hat = tensor.nnet.softmax(h_transform.reshape(
    (-1, num_classes + 1))).reshape((h.shape[0], h.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = x_mask

cost = CTC().apply(y, y_hat, y_mask, y_hat_mask, 'normal_scale')
cost.name = 'CTC'

# Initialization
for brick in (rnn, x_to_h, h_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
x = tensor.tensor3('features')
y = tensor.matrix('targets')

n_batchs = 1000
h_dim = 2
x_dim = 2

encode = Linear(name='encode',
                input_dim=x_dim,
                output_dim=h_dim)
gates = Linear(name='gates',
               input_dim=x_dim,
               output_dim=2 * h_dim)
# lstm = LSTM(activation=Tanh(),
#             dim=h_dim, name="lstm")
lstm = SimpleRecurrent(dim=h_dim,
                       activation=Tanh())
# lstm = GatedRecurrent(dim=h_dim,
#                       activation=Tanh())
decode = Linear(name='decode',
                input_dim=h_dim,
                output_dim=1)

for brick in (encode, gates, decode):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

lstm.weights_init = IsotropicGaussian(0.01)
# lstm.weights_init = Orthogonal()
lstm.biases_init = Constant(0.)
lstm.initialize()
n_batchs = 1000
h_dim = 2
x_dim = 2

encode = Linear(name='encode',
                input_dim=x_dim,
                output_dim=h_dim)
gates = Linear(name='gates',
               input_dim=x_dim,
               output_dim=2 * h_dim)
# lstm = LSTM(activation=Tanh(),
#             dim=h_dim, name="lstm")
lstm = SimpleRecurrent(dim=h_dim,
                       activation=Tanh())
# lstm = GatedRecurrent(dim=h_dim,
#                       activation=Tanh())
decode = Linear(name='decode',
                input_dim=h_dim,
                output_dim=1)

for brick in (encode, gates, decode):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

lstm.weights_init = IsotropicGaussian(0.01)
# lstm.weights_init = Orthogonal()
def __init__(self, dim, **kwargs):
    super(MyRnn, self).__init__(**kwargs)
    self.dim = dim
    self.layer1 = SimpleRecurrent(dim=self.dim, activation=Identity(),
                                  name='recurrent layer 1',
                                  weights_init=initialization.Identity())
    self.layer2 = SimpleRecurrent(dim=self.dim, activation=Identity(),
                                  name='recurrent layer 2',
                                  weights_init=initialization.Identity())
    self.children = [self.layer1, self.layer2]
class EUTHM(UTHM): ''' UTH model with extend information ''' def __init__(self, config, dataset, *args, **kwargs): super(EUTHM, self).__init__(config, dataset) def _define_inputs(self, *args, **kwargs): super(EUTHM, self)._define_inputs() self.user_word = tensor.ivector('user_word') self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask', dtype=theano.config.floatX) self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx') self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx') self.hashtag_word = tensor.ivector('hashtag_word') self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask', dtype=theano.config.floatX) self.hashtag_word_left_idx = tensor.ivector( 'hashtag_word_idx_left_idx') self.hashtag_word_right_idx = tensor.ivector( 'hashtag_word_idx_right_idx') self.sparse_word = tensor.imatrix('sparse_word') self.sparse_word_sparse_mask = tensor.vector( 'sparse_word_sparse_mask', dtype=theano.config.floatX) self.sparse_word_mask = tensor.matrix('sparse_word_mask', dtype=theano.config.floatX) self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx') self.sparse_word_right_idx = tensor.ivector( 'sparse_word_idx_right_idx') def _build_bricks(self, *args, **kwargs): # Build lookup tables super(EUTHM, self)._build_bricks() self.user2word = MLP( activations=[Tanh('user2word_tanh')], dims=[self.config.user_embed_dim, self.config.word_embed_dim], name='user2word_mlp') self.user2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.user2word.biases_init = Constant(0) self.user2word.initialize() self.hashtag2word = MLP( activations=[Tanh('hashtag2word_tanh')], dims=[ self.config.user_embed_dim + self.config.word_embed_dim, self.config.word_embed_dim ], name='hashtag2word_mlp') self.hashtag2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.hashtag2word.biases_init = Constant(0) self.hashtag2word.initialize() self.user2word_bias = Bias(dim=1, name='user2word_bias') self.user2word_bias.biases_init = Constant(0) self.user2word_bias.initialize() self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias') self.hashtag2word_bias.biases_init = Constant(0) self.hashtag2word_bias.initialize() #Build character embedding self.char_embed = self._embed(len(self.dataset.char2index), self.config.char_embed_dim, name='char_embed') # Build sparse word encoder self.rnn_ins = Linear(input_dim=self.config.char_embed_dim, output_dim=self.config.word_embed_dim, name='rnn_in') self.rnn_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim + self.config.word_embed_dim)) self.rnn_ins.biases_init = Constant(0) self.rnn_ins.initialize() self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim, activation=Tanh()) self.rnn.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.rnn.initialize() def _set_OV_value(self, *args, **kwargs): '''Train a <unk> representation''' tensor.set_subtensor( self.char_embed.W[self.dataset.char2index['<unk>']], numpy.zeros(self.config.char_embed_dim, dtype=theano.config.floatX)) def _get_text_vec(self, *args, **kwargs): # Transpose text self.text = self.text.dimshuffle(1, 0) self.text_mask = self.text_mask.dimshuffle(1, 0) self.sparse_word = self.sparse_word.dimshuffle(1, 0) self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0) # Turn word, user and hashtag into vector representation text_vec = self.word_embed.apply(self.text) # Apply user word, hashtag word and url text_vec = 
self._apply_user_word(text_vec) text_vec = self._apply_hashtag_word(text_vec) text_vec = self._apply_sparse_word(text_vec) return text_vec @abstractmethod def _apply_user_word(self, text_vec, *args, **kwargs): ''' Replace @a with transformed author vector :param text_vec: :param args: :param kwargs: :return: ''' user_word_vec = self.user2word.apply(self.user_embed.apply(self.user_word)) + \ self.user2word_bias.parameters[0][0] text_vec = tensor.set_subtensor( text_vec[self.user_word_right_idx, self.user_word_left_idx], text_vec[self.user_word_right_idx, self.user_word_left_idx] * (1 - self.user_word_sparse_mask[:, None]) + user_word_vec * self.user_word_sparse_mask[:, None]) return text_vec @abstractmethod def _apply_hashtag_word(self, text_vec, *args, **kwargs): ''' Replace #h with transformed hashtag vector :param text_vec: :param args: :param kwargs: :return: ''' hashtag_word_vec = self.hashtag2word.apply(self.hashtag_embed.apply(self.hashtag_word)) +\ self.hashtag2word_bias.parameters[0][0] text_vec = tensor.set_subtensor( text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx], text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx] * (1 - self.hashtag_sparse_mask[:, None]) + hashtag_word_vec * self.hashtag_sparse_mask[:, None]) return text_vec @abstractmethod def _apply_sparse_word(self, text_vec, *args, **kwargs): ''' Replace sparse word encoding with character embedding. (maybe lstm) :param text_vec: :param args: :param kwargs: :return: ''' sparse_word_vec = self.char_embed.apply(self.sparse_word) sparse_word_hiddens = self.rnn.apply( inputs=self.rnn_ins.apply(sparse_word_vec), mask=self.sparse_word_mask) tmp = sparse_word_hiddens[-1] text_vec = tensor.set_subtensor( text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx], text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx] * (1 - self.sparse_word_sparse_mask[:, None]) + tmp * self.sparse_word_sparse_mask[:, None]) return text_vec
class ETHM(EUTHM): '''Model with only textual-hashtag information''' def __init__(self, config, dataset, *args, **kwargs): super(ETHM, self).__init__(config, dataset) def _build_model(self, *args, **kwargs): # Define inputs self._define_inputs() self._build_bricks() self._set_OV_value() # Transpose text self.text = self.text.dimshuffle(1, 0) self.text_mask = self.text_mask.dimshuffle(1, 0) self.sparse_word = self.sparse_word.dimshuffle(1, 0) self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0) # Turn word, and hashtag into vector representation text_vec = self.word_embed.apply(self.text) # Apply word and hashtag word and url text_vec = self._apply_hashtag_word(text_vec) text_vec = self._apply_sparse_word(text_vec) # Encode text mlstm_hidden, mlstm_cell = self.mlstm.apply( inputs=self.mlstm_ins.apply(text_vec), mask=self.text_mask.astype(theano.config.floatX)) text_encodes = mlstm_hidden[-1] input_vec = text_encodes self._get_cost(input_vec, None, None) def _define_inputs(self, *args, **kwargs): self.hashtag = tensor.ivector('hashtag') self.text = tensor.imatrix('text') self.text_mask = tensor.matrix('text_mask', dtype=theano.config.floatX) self.hashtag_word = tensor.ivector('hashtag_word') self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask', dtype=theano.config.floatX) self.hashtag_word_left_idx = tensor.ivector( 'hashtag_word_idx_left_idx') self.hashtag_word_right_idx = tensor.ivector( 'hashtag_word_idx_right_idx') self.sparse_word = tensor.imatrix('sparse_word') self.sparse_word_sparse_mask = tensor.vector( 'sparse_word_sparse_mask', dtype=theano.config.floatX) self.sparse_word_mask = tensor.matrix('sparse_word_mask', dtype=theano.config.floatX) self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx') self.sparse_word_right_idx = tensor.ivector( 'sparse_word_idx_right_idx') def _build_bricks(self, *args, **kwargs): # Build lookup tables self.word_embed = self._embed(len(self.dataset.word2index), self.config.word_embed_dim, name='word_embed') self.hashtag_embed = self._embed(len(self.dataset.hashtag2index), self.config.lstm_dim, name='hashtag_embed') # Build text encoder self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim, output_dim=4 * self.config.lstm_dim, name='mlstm_in') self.mlstm_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim)) self.mlstm_ins.biases_init = Constant(0) self.mlstm_ins.initialize() self.mlstm = MLSTM(self.config.lstm_time, self.config.lstm_dim, shared=False) self.mlstm.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim)) self.mlstm.biases_init = Constant(0) self.mlstm.initialize() self.hashtag2word = MLP( activations=[Tanh('hashtag2word_tanh')], dims=[self.config.lstm_dim, self.config.word_embed_dim], name='hashtag2word_mlp') self.hashtag2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.hashtag2word.biases_init = Constant(0) self.hashtag2word.initialize() self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias') self.hashtag2word_bias.biases_init = Constant(0) self.hashtag2word_bias.initialize() #Build character embedding self.char_embed = self._embed(len(self.dataset.char2index), self.config.char_embed_dim, name='char_embed') # Build sparse word encoder self.rnn_ins = Linear(input_dim=self.config.char_embed_dim, output_dim=self.config.word_embed_dim, name='rnn_in') self.rnn_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / 
numpy.sqrt(self.config.char_embed_dim + self.config.word_embed_dim)) self.rnn_ins.biases_init = Constant(0) self.rnn_ins.initialize() self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim, activation=Tanh()) self.rnn.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.rnn.initialize() def _apply_dropout(self, outputs, *args, **kwargs): variables = [self.word_embed.W, self.hashtag_embed.W] cgs = ComputationGraph(outputs) cg_dropouts = apply_dropout(cgs, variables, drop_prob=self.config.dropout_prob, seed=123).outputs return cg_dropouts def _apply_reg(self, cost, params=None, *args, **kwargs): try: if self.config.l2_norm > 0: cost = cost + self.config.l2_norm * theano_expressions.l2_norm( tensors=[self.hashtag_embed.W, self.word_embed.W])**2 else: pass except Exception: pass return cost
def rnn_layer(in_dim, h, h_dim, n):
    linear = Linear(input_dim=in_dim, output_dim=h_dim,
                    name='linear' + str(n) + h.name)
    rnn = SimpleRecurrent(dim=h_dim, activation=Tanh(), name='rnn' + str(n))
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))
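# Hypothetical usage of the helper above: apply one recurrent layer to a named
# 3-D input (time x batch x features). The dimensions are made up, `tensor` is
# theano.tensor as in the other snippets, and `initialize` is assumed to be the
# project's own helper that sets weights_init / biases_init and calls
# brick.initialize().
x = tensor.tensor3('x')
h1 = rnn_layer(in_dim=50, h=x, h_dim=100, n=1)  # T x B x 100 hidden states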
class Rnn(Initializable, BaseRecurrent): def __init__(self, dims=(88, 100, 100), **kwargs): super(Rnn, self).__init__(**kwargs) self.dims = dims self.input_transform = Linear( input_dim=dims[0], output_dim=dims[1], weights_init=IsotropicGaussian(0.01), # biases_init=Constant(0.0), use_bias=False, name="input_transfrom") self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="gru_rnn_layer") # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=False, name="h2h_transform") self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="lstm_rnn_layer") self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]], weights_init=IsotropicGaussian(0.01), use_bias=True, biases_init=Constant(0.0), name="out_layer") self.children = [ self.input_transform, self.gru_layer, self.linear_trans, self.lstm_layer, self.out_transform ] # @recurrent(sequences=['inputs', 'input_mask'], contexts=[], # states=['gru_state', 'lstm_state', 'lstm_cells'], # outputs=['gru_state', 'lstm_state', 'lstm_cells']) def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None): input_transform = self.input_transform.apply(inputs) gru_state = self.gru_layer.apply( inputs=input_transform, # update_inputs=input_transform, # reset_inputs=input_transform, states=gru_state, mask=mask, iterate=False) lstm_transform = self.linear_trans.apply(gru_state) lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, cells=lstm_cells, mask=mask, iterate=False) return gru_state, lstm_state, lstm_cells @recurrent(sequences=[], contexts=[], states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'], outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells']) def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None, lstm_cells=None): output = self.apply(inputs=inputs, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells, iterate=False) return output, gru_state, lstm_state, lstm_cells @recurrent(sequences=['inputs', 'mask'], contexts=[], states=['gru_state', 'lstm_state', 'lstm_cells'], outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells']) def apply(self, inputs, mask, gru_state=None, lstm_state=None, lstm_cells=None): # input_transform = self.input_transform.apply(inputs) # gru_state = self.gru_layer.apply( # inputs=input_transform, # mask=mask, # states=gru_state, # iterate=False) # lstm_transform = self.linear_trans.apply(gru_state) # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, # cells=lstm_cells, # mask=mask, iterate=False) gru_state, lstm_state, lstm_cells = self.rnn_apply( inputs=inputs, mask=mask, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells) output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None] return output, gru_state, lstm_state, lstm_cells def get_dim(self, name): dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims)) dims['lstm_cells'] = dims['lstm_state'] return dims.get(name, None) or super(Rnn, self).get_dim(name)
lookup_input = LookupTable(name='lookup_input', length=train_dataset.syllables_vocab_size() + 1, dim=hidden_layer_dim, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup_input.initialize() linear_input = Linear(name='linear_input', input_dim=hidden_layer_dim, output_dim=hidden_layer_dim, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear_input.initialize() rnn = SimpleRecurrent(name='hidden', dim=hidden_layer_dim, activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) rnn.initialize() linear_output = Linear(name='linear_output', input_dim=hidden_layer_dim, output_dim=train_dataset.durations_vocab_size(), weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear_output.initialize() softmax = NDimensionalSoftmax(name='ndim_softmax') activation_input = lookup_input.apply(x) hidden = rnn.apply(linear_input.apply(activation_input)) activation_output = linear_output.apply(hidden)
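# One way to turn activation_output into a training cost, following the
# NDimensionalSoftmax pattern used elsewhere in these examples. The target
# variable `y` (an integer matrix of duration class indices aligned with `x`) is
# an assumption for illustration.
y = tensor.lmatrix('y')
cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean()
cost.name = 'cost'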
n_epochs = 30 x_dim = 1 h_dim = 100 o_dim = 10 batch_size = 50 print 'Building model ...' # T x B x F x = tensor.tensor3('x', dtype=floatX) y = tensor.tensor3('y', dtype='int32') x_to_h1 = Linear(name='x_to_h1', input_dim=x_dim, output_dim=h_dim) pre_rnn = x_to_h1.apply(x) rnn = SimpleRecurrent(activation=Rectifier(), dim=h_dim, name="rnn") h1 = rnn.apply(pre_rnn) h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=o_dim) pre_softmax = h1_to_o.apply(h1) softmax = Softmax() shape = pre_softmax.shape softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim))) softmax_out = softmax_out.reshape(shape) softmax_out.name = 'softmax_out' # comparing only last time-step cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1]) cost.name = 'CrossEntropy' error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
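# The bricks above are never initialized; a minimal initialization pass in the
# style of the other examples in this file (IsotropicGaussian weights, zero
# biases), assuming the usual blocks.initialization imports are in scope.
for brick in (x_to_h1, rnn, h1_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()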
def test_attention_recurrent(): rng = numpy.random.RandomState(1234) dim = 5 batch_size = 4 input_length = 20 attended_dim = 10 attended_length = 15 wrapped = SimpleRecurrent(dim, Identity()) attention = SequenceContentAttention(state_names=wrapped.apply.states, attended_dim=attended_dim, match_dim=attended_dim) recurrent = AttentionRecurrent(wrapped, attention, seed=1234) recurrent.weights_init = IsotropicGaussian(0.5) recurrent.biases_init = Constant(0) recurrent.initialize() attended = tensor.tensor3("attended") attended_mask = tensor.matrix("attended_mask") inputs = tensor.tensor3("inputs") inputs_mask = tensor.matrix("inputs_mask") outputs = recurrent.apply(inputs=inputs, mask=inputs_mask, attended=attended, attended_mask=attended_mask) states, glimpses, weights = outputs assert states.ndim == 3 assert glimpses.ndim == 3 assert weights.ndim == 3 # For values. def rand(size): return rng.uniform(size=size).astype(floatX) # For masks. def generate_mask(length, batch_size): mask = numpy.ones((length, batch_size), dtype=floatX) # To make it look like read data for i in range(batch_size): mask[1 + rng.randint(0, length - 1):, i] = 0.0 return mask input_vals = rand((input_length, batch_size, dim)) input_mask_vals = generate_mask(input_length, batch_size) attended_vals = rand((attended_length, batch_size, attended_dim)) attended_mask_vals = generate_mask(attended_length, batch_size) func = theano.function([inputs, inputs_mask, attended, attended_mask], [states, glimpses, weights]) states_vals, glimpses_vals, weight_vals = func(input_vals, input_mask_vals, attended_vals, attended_mask_vals) assert states_vals.shape == (input_length, batch_size, dim) assert glimpses_vals.shape == (input_length, batch_size, attended_dim) assert (len(ComputationGraph(outputs).shared_variables) == len( Selector(recurrent).get_params())) # weights for not masked position must be zero assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0) # weights for masked positions must be non-zero assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5) # weights from different steps should be noticeably different assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2 # weights for all state after the last masked position should be same for i in range(batch_size): last = int(input_mask_vals[:, i].sum()) for j in range(last, input_length): assert_allclose(weight_vals[last, i], weight_vals[j, i]) # freeze sums assert_allclose(weight_vals.sum(), input_length * batch_size, 1e-5) assert_allclose(states_vals.sum(), 113.429, rtol=1e-5) assert_allclose(glimpses_vals.sum(), 415.901, rtol=1e-5)
def test_attention_recurrent(): rng = numpy.random.RandomState(1234) dim = 5 batch_size = 4 input_length = 20 attended_dim = 10 attended_length = 15 wrapped = SimpleRecurrent(dim, Identity()) attention = SequenceContentAttention( state_names=wrapped.apply.states, attended_dim=attended_dim, match_dim=attended_dim) recurrent = AttentionRecurrent(wrapped, attention, seed=1234) recurrent.weights_init = IsotropicGaussian(0.5) recurrent.biases_init = Constant(0) recurrent.initialize() attended = tensor.tensor3("attended") attended_mask = tensor.matrix("attended_mask") inputs = tensor.tensor3("inputs") inputs_mask = tensor.matrix("inputs_mask") outputs = recurrent.apply( inputs=inputs, mask=inputs_mask, attended=attended, attended_mask=attended_mask) states, glimpses, weights = outputs assert states.ndim == 3 assert glimpses.ndim == 3 assert weights.ndim == 3 # For values. def rand(size): return rng.uniform(size=size).astype(theano.config.floatX) # For masks. def generate_mask(length, batch_size): mask = numpy.ones((length, batch_size), dtype=theano.config.floatX) # To make it look like read data for i in range(batch_size): mask[1 + rng.randint(0, length - 1):, i] = 0.0 return mask input_vals = rand((input_length, batch_size, dim)) input_mask_vals = generate_mask(input_length, batch_size) attended_vals = rand((attended_length, batch_size, attended_dim)) attended_mask_vals = generate_mask(attended_length, batch_size) func = theano.function([inputs, inputs_mask, attended, attended_mask], [states, glimpses, weights]) states_vals, glimpses_vals, weight_vals = func( input_vals, input_mask_vals, attended_vals, attended_mask_vals) assert states_vals.shape == (input_length, batch_size, dim) assert glimpses_vals.shape == (input_length, batch_size, attended_dim) assert (len(ComputationGraph(outputs).shared_variables) == len(Selector(recurrent).get_parameters())) # Manual reimplementation inputs2d = tensor.matrix() states2d = tensor.matrix() mask1d = tensor.vector() weighted_averages = tensor.matrix() distribute_func = theano.function( [inputs2d, weighted_averages], recurrent.distribute.apply( inputs=inputs2d, weighted_averages=weighted_averages)) wrapped_apply_func = theano.function( [states2d, inputs2d, mask1d], wrapped.apply( states=states2d, inputs=inputs2d, mask=mask1d, iterate=False)) attention_func = theano.function( [states2d, attended, attended_mask], attention.take_glimpses( attended=attended, attended_mask=attended_mask, states=states2d)) states_man = wrapped.initial_states(batch_size).eval() glimpses_man = numpy.zeros((batch_size, attended_dim), dtype=theano.config.floatX) for i in range(input_length): inputs_man = distribute_func(input_vals[i], glimpses_man) states_man = wrapped_apply_func(states_man, inputs_man, input_mask_vals[i]) glimpses_man, weights_man = attention_func( states_man, attended_vals, attended_mask_vals) assert_allclose(states_man, states_vals[i], rtol=1e-5) assert_allclose(glimpses_man, glimpses_vals[i], rtol=1e-5) assert_allclose(weights_man, weight_vals[i], rtol=1e-5) # weights for not masked position must be zero assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0) # weights for masked positions must be non-zero assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5) # weights from different steps should be noticeably different assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2 # weights for all state after the last masked position should be same for i in range(batch_size): last = int(input_mask_vals[:, i].sum()) for j in range(last, 
input_length): assert_allclose(weight_vals[last, i], weight_vals[j, i], 1e-5)
def build_model_hard(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence([lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] for i in range(layers - 1): mlp = MLP(activations=[Logistic()], dims=[2 * state_dim, 1], weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear(input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared(numpy.zeros( (args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have correctly: # h = [state_1, state_2, state_3 ...] # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
# Parameters
n_u = 225  # input vector size (not time at this point)
n_y = 225  # output vector size
n_h = 500  # number of hidden units
iteration = 300  # number of epochs of gradient descent

print "Building Model"

# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim=n_u, output_dim=n_h, name="first_layer")
rnn = SimpleRecurrent(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = rnn.apply(x_transform)
predict = sigm.apply(linear2.apply(h))

# only for generation B x h_dim
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = rnn.apply(x_transform, h_initial, iterate=False)
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'
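# A sketch of how the iterate=False path above can drive step-by-step generation:
# compile a single-step function and feed predictions back in. The (1, 1, n_u) and
# (1, 1, n_h) shapes and the feedback loop are assumptions for illustration;
# numpy and theano are assumed imported as in the other snippets.
step = theano.function([x, h_initial], [y_hat_testing, h_testing])

x_t = numpy.zeros((1, 1, n_u), dtype=floatX)  # one time step, one sequence
h_t = numpy.zeros((1, 1, n_h), dtype=floatX)
generated = []
for _ in range(10):
    y_t, h_t = step(x_t, h_t)
    generated.append(y_t)
    x_t = y_t  # feed the prediction back in as the next input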
def __init__(self, input_sources_list, input_sources_vocab_size_list, output_source, output_source_vocab_size, lookup_dim=200, hidden_size=256, recurrent_stack_size=1): self.InputSources = input_sources_list self.InputSourcesVocab = input_sources_vocab_size_list self.OutputSource = output_source self.OutputSourceVocab = output_source_vocab_size inputs = [tensor.lmatrix(source) for source in input_sources_list] output = tensor.lmatrix(output_source) lookups = self.get_lookups(lookup_dim, input_sources_vocab_size_list) for lookup in lookups: lookup.initialize() merge = Merge([lookup.name for lookup in lookups], [lookup.dim for lookup in lookups], hidden_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) merge.initialize() linear0 = Linear(input_dim=hidden_size, output_dim=hidden_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0), name='linear0') linear0.initialize() recurrent_blocks = [] for i in range(recurrent_stack_size): recurrent_blocks.append(SimpleRecurrent( dim=hidden_size, activation=Tanh(), weights_init=initialization.Uniform(width=0.01), use_bias=False)) for i, recurrent_block in enumerate(recurrent_blocks): recurrent_block.name = 'recurrent'+str(i+1) recurrent_block.initialize() linear_out = Linear(input_dim=hidden_size, output_dim=output_source_vocab_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0), name='linear_out') linear_out.initialize() softmax = NDimensionalSoftmax(name='softmax') lookup_outputs = [lookup.apply(input) for lookup, input in zip(lookups, inputs)] m = merge.apply(*lookup_outputs) r = linear0.apply(m) for block in recurrent_blocks: r = block.apply(r) a = linear_out.apply(r) self.Cost = softmax.categorical_cross_entropy(output, a, extra_ndim=1).mean() self.Cost.name = 'cost' y_hat = softmax.apply(a, extra_ndim=1) y_hat.name = 'y_hat' self.ComputationGraph = ComputationGraph(self.Cost) self.Function = None self.MainLoop = None self.Model = Model(y_hat)
def main(save_to, num_epochs): batch_size = 128 dim = 100 n_steps = 20 i2h1 = MLP([Identity()], [784, dim], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001)) h2o1 = MLP([Rectifier(), Logistic()], [dim, dim, 784], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001)) rec1 = SimpleRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal()) i2h1.initialize() h2o1.initialize() rec1.initialize() x = tensor.tensor3('features') x1 = x[1:, :, :] x2 = x[:-1, :, :] preproc = i2h1.apply(x1) h1 = rec1.apply(preproc) x_hat = h2o1.apply(h1) cost = tensor.nnet.binary_crossentropy(x_hat, x2).mean() # cost = CategoricalCrossEntropy().apply(y.flatten(), probs) cost.name = 'final_cost' cg = ComputationGraph([cost, ]) mnist_train = MNIST("train", subset=slice(0, 50000), sources=('features', )) mnist_valid = MNIST("train", subset=slice(50000, 60000), sources=('features',)) mnist_test = MNIST("test") trainstream = Mapping(Flatten(DataStream(mnist_train, iteration_scheme=SequentialScheme(50000, batch_size))), _meanize(n_steps)) validstream = Mapping(Flatten(DataStream(mnist_valid, iteration_scheme=SequentialScheme(10000, batch_size))), _meanize(n_steps)) teststream = Mapping(Flatten(DataStream(mnist_test, iteration_scheme=SequentialScheme(10000, batch_size))), _meanize(n_steps)) algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=CompositeRule([Adam(), StepClipping(100)])) main_loop = MainLoop( algorithm, trainstream, extensions=[Timing(), FinishAfter(after_n_epochs=num_epochs), # DataStreamMonitoring( # [cost, ], # teststream, # prefix="test"), DataStreamMonitoringAndSaving( [cost, ], validstream, [i2h1, h2o1, rec1], 'best_'+save_to+'.pkl', cost_name=cost.name, after_epoch=True, prefix='valid'), TrainingDataMonitoring( [cost, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), # Plot( # save_to, # channels=[ # ['test_final_cost', # 'test_misclassificationrate_apply_error_rate'], # ['train_total_gradient_norm']]), Printing()]) main_loop.run()
def build_model_soft(args, dtype=floatX): logger.info('Building model ...') # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn, x_mask = get_prernn(args) transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())] # Build the MLP dims = [2 * args.state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(args.state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(args.layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=args.state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=args.skip_connections) initialize_rnn(rnn, args) # Prepare inputs and initial states for the RNN kwargs, inits = get_rnn_kwargs(pre_rnn, args) # Apply the RNN to the inputs h = rnn.apply(low_memory=True, mask=x_mask, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] # gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} hidden_states = [] for d in range(args.layers): h[d] = h[d] * x_mask last_states[d] = h[d][-1, :, :] h[d].name = "hidden_state_" + str(d) hidden_states.append(h[d]) # Concatenate all the states if args.layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state_all" # The updates of the hidden states updates = [] for d in range(args.layers): updates.append((inits[0][d], last_states[d])) presoft = get_presoft(h, args) cost, cross_entropy = get_costs(presoft, args) return cost, cross_entropy, updates, gate_values, hidden_states
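# A tiny pure-Python illustration of the interleaved-output bookkeeping above:
# RecurrentStack with gated transitions returns [state, state_1, gate_value_1,
# state_2, gate_value_2, ...], and the slicing below separates states from gates.
h = ['state', 'state_1', 'gate_value_1', 'state_2', 'gate_value_2']
gate_values = h[2::2]        # ['gate_value_1', 'gate_value_2']
states = [h[0]] + h[1::2]    # ['state', 'state_1', 'state_2']
assert gate_values == ['gate_value_1', 'gate_value_2']
assert states == ['state', 'state_1', 'state_2']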