Example #1
class FeedbackRNN(BaseRecurrent):
    def __init__(self, dim, **kwargs):
        super(FeedbackRNN, self).__init__(**kwargs)
        self.dim = dim
        self.first_recurrent_layer = SimpleRecurrent(
            dim=self.dim,
            activation=Identity(),
            name='first_recurrent_layer',
            weights_init=initialization.Identity())
        self.second_recurrent_layer = SimpleRecurrent(
            dim=self.dim,
            activation=Identity(),
            name='second_recurrent_layer',
            weights_init=initialization.Identity())
        self.children = [
            self.first_recurrent_layer, self.second_recurrent_layer
        ]

    @recurrent(sequences=['inputs'],
               contexts=[],
               states=['first_states', 'second_states'],
               outputs=['first_states', 'second_states'])
    def apply(self, inputs, first_states=None, second_states=None):
        first_h = self.first_recurrent_layer.apply(inputs=inputs,
                                                   states=first_states +
                                                   second_states,
                                                   iterate=False)
        second_h = self.second_recurrent_layer.apply(inputs=first_h,
                                                     states=second_states,
                                                     iterate=False)
        return first_h, second_h

    def get_dim(self, name):
        return (self.dim if name in ('inputs', 'first_states', 'second_states')
                else super(FeedbackRNN, self).get_dim(name))
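This FeedbackRNN is the two-layer feedback example from the Blocks recurrent tutorial. A minimal usage sketch, assuming the numpy/theano/blocks imports used throughout these examples:

x = tensor.tensor3('x')
feedback = FeedbackRNN(dim=3)
feedback.initialize()
first_h, second_h = feedback.apply(inputs=x)
f = theano.function([x], [first_h, second_h])
for states in f(np.ones((3, 1, 3), dtype=theano.config.floatX)):
    print(states)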
Example #2
def example():
    """ Simple reccurent example. Taken from : https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb """
    x = tensor.tensor3('x')

    rnn = SimpleRecurrent(dim=3, activation=Identity(), weights_init=initialization.Identity())
    rnn.initialize()
    h = rnn.apply(x)

    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) 

    doubler = Linear(
                 input_dim=3, output_dim=3, weights_init=initialization.Identity(2),
                 biases_init=initialization.Constant(0))
    doubler.initialize()
    h_doubler = rnn.apply(doubler.apply(x))

    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) 

    # Initial state
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)

    f = theano.function([x, h0], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX))) 
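A quick sanity check on the expected output: with Identity weights and an Identity activation the recurrence reduces to h_t = h_{t-1} + x_t, so the all-ones input of length 3 prints running sums (1, 2, 3 in every component), the doubled input prints 2, 4, 6, and the run with initial state h0 = 1 shifts each running sum up by one.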
Example #4
def rnn_layer(in_size, dim, x, h, n, first_layer=False):
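    # connect_h_to_h, connect_x_to_h, task, layer_models, layer_resolutions
    # and layer_execution_time_offset are free variables here: they are
    # expected to be module-level configuration globals, and initialize()
    # is the caller's helper that sets the bricks' initializers.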
    if connect_h_to_h == 'all-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + [hidden for hidden in h], axis=2)
            linear = Linear(input_dim=in_size + dim * n,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
        else:
            rnn_input = T.concatenate([hidden for hidden in h], axis=2)
            linear = Linear(input_dim=dim * n,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
    elif connect_h_to_h == 'two-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=in_size + dim * 2 if n > 1 else in_size +
                            dim,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
        else:
            rnn_input = T.concatenate(h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=dim * 2 if n > 1 else dim,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
    elif connect_h_to_h == 'one-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + [h[n - 1]], axis=2)
            linear = Linear(input_dim=in_size + dim,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
        else:
            rnn_input = h[n - 1]  # hidden state of the previous layer
            linear = Linear(input_dim=dim,
                            output_dim=dim,
                            name='linear' + str(n) + '-' + str(task))
    rnn = SimpleRecurrent(dim=dim,
                          activation=Tanh(),
                          name=layer_models[n] + str(n) + '-' + str(task))
    initialize([linear, rnn])
    if layer_models[n] == 'rnn':
        return rnn.apply(linear.apply(rnn_input))
    elif layer_models[n] == 'mt_rnn':
        return rnn.apply(linear.apply(rnn_input),
                         time_scale=layer_resolutions[n],
                         time_offset=layer_execution_time_offset[n])
Example #6
def test_similar_scans():
    x = tensor.tensor3('x')
    r1 = SimpleRecurrent(activation=Tanh(), dim=10)
    y1 = r1.apply(x)
    r2 = SimpleRecurrent(activation=Tanh(), dim=10)
    y2 = r2.apply(x)
    cg = ComputationGraph([y1, y2])
    assert len(cg.scans) == 2
Example #7
class RNNwMini(BaseRecurrent):
    def __init__(self, dim, mini_dim, summary_dim, **kwargs):
        super(RNNwMini, self).__init__(**kwargs)
        self.dim = dim
        self.mini_dim = mini_dim
        self.summary_dim = summary_dim

        self.recurrent_layer = SimpleRecurrent(
            dim=self.summary_dim,
            activation=Rectifier(),
            name='recurrent_layer',
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0))
        self.mini_recurrent_layer = SimpleRecurrent(
            dim=self.mini_dim,
            activation=Rectifier(),
            name='mini_recurrent_layer',
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0))

        self.mini_to_main = Linear(self.dim + self.mini_dim,
                                   self.summary_dim,
                                   name='mini_to_main',
                                   weights_init=IsotropicGaussian(),
                                   biases_init=Constant(0.0))
        self.children = [
            self.recurrent_layer, self.mini_recurrent_layer, self.mini_to_main
        ]

    @recurrent(sequences=['x', 'xmini'],
               contexts=[],
               states=['states'],
               outputs=['states'])
    def apply(self, x, xmini, states=None):
        mini_h_all = self.mini_recurrent_layer.apply(inputs=xmini,
                                                     states=None,
                                                     iterate=True)
        # grab the last hidden state
        mini_h = mini_h_all[-1]

        combInput = T.concatenate([x, mini_h], axis=1)
        combTransform = self.mini_to_main.apply(combInput)

        h = self.recurrent_layer.apply(inputs=combTransform,
                                       states=states,
                                       iterate=False)

        return h

    def get_dim(self, name):
        dim = 1
        if name == 'x':
            dim = self.dim
        elif name == 'states':
            dim = self.summary_dim
        else:
            dim = super(RNNwMini, self).get_dim(name)
        return dim
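A hypothetical usage sketch for RNNwMini: since xmini is itself a sequence at every outer time step, it is a 4d tensor of shape (T, mini_T, batch, mini_dim), while x is the usual (T, batch, dim). Assuming the same numpy/theano/blocks imports as above:

x = tensor.tensor3('x')          # (T, batch, dim)
xmini = tensor.tensor4('xmini')  # (T, mini_T, batch, mini_dim)
rnn = RNNwMini(dim=3, mini_dim=2, summary_dim=5)
rnn.initialize()
states = rnn.apply(x=x, xmini=xmini)
f = theano.function([x, xmini], states)
out = f(np.ones((4, 1, 3), dtype=theano.config.floatX),
        np.ones((4, 2, 1, 2), dtype=theano.config.floatX))
print(out.shape)                 # expected: (4, 1, 5)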
Example #8
def rnn_layer(dim, h, n, x_mask, first, **kwargs):
    linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n))
    rnn = SimpleRecurrent(dim=dim, activation=Rectifier(), name='rnn' + str(n))
    initialize([linear, rnn])
    applyLin = linear.apply(h)
    if first:
        rnnApply = rnn.apply(applyLin, mask=x_mask, **kwargs)
    else:
        rnnApply = rnn.apply(applyLin, **kwargs)

    return rnnApply
Example #9
class TestSimpleRecurrent(unittest.TestCase):
    def setUp(self):
        self.simple = SimpleRecurrent(dim=3,
                                      weights_init=Constant(2),
                                      activation=Tanh())
        self.simple.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        mask = tensor.vector('mask')
        h1 = self.simple.apply(x, h0, mask=mask, iterate=False)
        next_h = theano.function(inputs=[h0, x, mask], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=theano.config.floatX)
        mask_val = numpy.array([1, 0]).astype(theano.config.floatX)
        h1_val = numpy.tanh(h0_val.dot(2 * numpy.ones((3, 3))) + x_val)
        h1_val = mask_val[:, None] * h1_val + (1 - mask_val[:, None]) * h0_val
        assert_allclose(h1_val, next_h(h0_val, x_val, mask_val)[0])

    def test_many_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        h = self.simple.apply(x, mask=mask, iterate=True)
        calc_h = theano.function(inputs=[x, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones(
            (24, 4, 3), dtype=theano.config.floatX) * x_val[..., None]
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        for i in range(1, 25):
            h_val[i] = numpy.tanh(h_val[i - 1].dot(2 * numpy.ones((3, 3))) +
                                  x_val[i - 1])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04)

        # Also test that initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
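Both tests exercise the same reference recurrence, h_t = tanh(h_{t-1}·W + x_t) with every entry of W set to 2 by Constant(2), and check the mask semantics: wherever the mask is 0, the freshly computed state is discarded and the previous state is carried through unchanged.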
Example #10
class MyRnn(BaseRecurrent):  # Extend the base recurrent class to create one of your own
    def __init__(self, dim, **kwargs):
        super(MyRnn, self).__init__(**kwargs)
        self.dim = dim
        self.layer1 = SimpleRecurrent(
            dim=self.dim, activation=Identity(), name='recurrent layer 1',
            weights_init=initialization.Identity())
        self.layer2 = SimpleRecurrent(
            dim=self.dim, activation=Identity(), name='recurrent layer 2',
            weights_init=initialization.Identity())
        self.children = [self.layer1, self.layer2]

    @recurrent(sequences=['inputs'], contexts=[],
               states=['first_states', 'second_states'],
               outputs=['first_states', 'second_states'])
    def apply(self, inputs, first_states=None, second_states=None):
        first_h = self.layer1.apply(
            inputs=inputs, states=first_states, iterate=False)
        second_h = self.layer2.apply(
            inputs=first_h, states=second_states, iterate=False)
        return first_h, second_h

    def get_dim(self, name):
        return (self.dim if name in ('inputs', 'first_states', 'second_states')
                else super(MyRnn, self).get_dim(name))
Example #12
class TestBidirectional(unittest.TestCase):
    def setUp(self):
        self.bidir = Bidirectional(weights_init=Orthogonal(),
                                   prototype=SimpleRecurrent(
                                       dim=3, activation=Tanh()))
        self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                                      activation=Tanh(), seed=1)
        self.bidir.allocate()
        self.simple.initialize()
        self.bidir.children[0].params[0].set_value(
            self.simple.params[0].get_value())
        self.bidir.children[1].params[0].set_value(
            self.simple.params[0].get_value())
        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))),
            dtype=floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=floatX)
        self.mask_val[12:24, 3] = 0

    def test(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        calc_bidir = theano.function([x, mask],
                                     [self.bidir.apply(x, mask=mask)])
        calc_simple = theano.function([x, mask],
                                      [self.simple.apply(x, mask=mask)])
        h_bidir = calc_bidir(self.x_val, self.mask_val)[0]
        h_simple = calc_simple(self.x_val, self.mask_val)[0]
        h_simple_rev = calc_simple(self.x_val[::-1], self.mask_val[::-1])[0]

        assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
        assert_allclose(h_simple_rev, h_bidir[::-1, ...,  3:], rtol=1e-04)
Example #13
def rnn_layer(in_dim, h, h_dim, n):
    linear = Linear(input_dim=in_dim,
                    output_dim=h_dim,
                    name='linear' + str(n) + h.name)
    # SimpleRecurrent requires an activation; Tanh() matches the sibling examples
    rnn = SimpleRecurrent(dim=h_dim, activation=Tanh(), name='rnn' + str(n))
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))
Example #14
class CompositionalLayerToyWithTables(Initializable):
    def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, **kwargs):

        super(CompositionalLayerToyWithTables, self).__init__(**kwargs)

        self.batch_size = batch_size
        self.num_subwords = num_subwords # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size

        # create the look up table
        self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup')
        self.lookup.weights_init = Uniform(width=0.08)
        self.lookup.biases_init = Constant(0)

        # has one RNN which reads the subwords into a word embedding
        self.compositional_subword_to_word_RNN = SimpleRecurrent(
            dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN',
            weights_init=Identity_init())

        self.children = [self.lookup, self.compositional_subword_to_word_RNN]


    '''
    subword_id_input_ is a 3d tensor of shape (num_words, num_subwords, batch_size).
    It is expected as dtype=uint16 or equivalent.

    subword_id_input_mask_ is a 3d tensor of shape (num_words, num_subwords, batch_size).
    It is expected as dtype=uint8 or equivalent, with binary values: 1 where there is data and 0 otherwise.

    The lookup table returns a 4d tensor of shape (num_words, num_subwords, batch_size, embedding_size).

    The RNN eats up the subword dimension, resulting in a
    3d tensor of shape (num_words, batch_size, RNN_hidden_value_size), which is returned as 'word_embeddings'.

    Also returned is a 2d tensor of shape (num_words, batch_size): the remaining mask, indicating
    the length of each sentence in the batch, i.e. 1 where there is a word, 0 otherwise.
    '''
    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        ##shape = (num_words, num_subwords, batch_size, embedding size)
        subword_embeddings = self.lookup.apply(subword_id_input_)

        result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords
            fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN.apply(subword_embeddings, mask=subword_id_input_mask_),
            sequences= [subword_embeddings, subword_id_input_mask_])

        # move the per-subword RNN states to the leading axis
        word_embeddings = result.dimshuffle(1, 0, 2, 3)
        # take only the last state per word; remove this line to see all RNN states
        word_embeddings = word_embeddings[-1]

        # remove the subword dim from the mask:
        # a word is empty iff all of its subwords are empty
        word_embeddings_mask = subword_id_input_mask_.max(axis=1)

        return word_embeddings, word_embeddings_mask
Example #15
def rnn_layer(in_dim, h, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim,
                    output_dim=h_dim,
                    name='linear' + str(n) + pref)
    rnn = SimpleRecurrent(dim=h_dim,
                          activation=Tanh(),
                          name='rnn' + str(n) + pref)
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))
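The rnn_layer helpers in Examples #8, #13 and #15 all follow the same pattern: a Linear transform feeding a SimpleRecurrent, with initialize() being the caller's own helper that sets the bricks' initializers. A hypothetical stacking sketch:

x = tensor.tensor3('x')                       # (T, batch, 16)
h1 = rnn_layer(16, x, 32, 1)                  # first recurrent layer
h2 = rnn_layer(32, h1, 32, 2, pref='_stack')  # second layer on top of the first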
Example #16
def test_saved_inner_graph():
    """Make sure that the original inner graph is saved."""
    x = tensor.tensor3()
    recurrent = SimpleRecurrent(dim=3, activation=Tanh())
    y = recurrent.apply(x)

    application_call = get_application_call(y)
    assert application_call.inner_inputs
    assert application_call.inner_outputs

    cg = ComputationGraph(application_call.inner_outputs)
    # Check that the inner scan graph is annotated
    # with `recurrent.apply`
    assert len(VariableFilter(application=recurrent.apply)(cg)) == 3
    # Check that the inner graph is equivalent to the one
    # produced by a stand-alone application of `recurrent.apply`
    assert is_same_graph(application_call.inner_outputs[0],
                         recurrent.apply(*application_call.inner_inputs,
                                         iterate=False))
Example #17
class LanguageModelToy(Initializable):
    """
    This takes the word embeddings from CompositionalLayerToyWithTables and creates sentence embeddings.

    Input is a 3d tensor of shape (num_words, num_subwords, batch_size) and
    a 3d mask tensor of the same shape.

    All hidden state sizes are the same as the subword embedding size.

    This returns a 3d tensor of shape (num_words = num RNN states, batch_size, sentence_embedding_size).
    """

    def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, LM_RNN_hidden_state_size, **kwargs):

        super(LanguageModelToy, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.num_subwords = num_subwords # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size
        self.LM_RNN_hidden_state_size = LM_RNN_hidden_state_size

        self.compositional_layer = CompositionalLayerToyWithTables(self.batch_size, self.num_subwords, self.num_words,
                                                              self.subword_embedding_size, self.input_vocab_size,
                                                              self.subword_RNN_hidden_state_size, name='compositional_layer')

        # has one RNN which reads the word embeddings into a sentence embedding
        self.language_model_RNN = SimpleRecurrent(
            dim=self.LM_RNN_hidden_state_size, activation=Identity(), name='language_model_RNN',
            weights_init=Identity_init())

        self.children = [self.compositional_layer, self.language_model_RNN]

    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['sentence_embeddings', 'sentence_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        """
        subword_id_input_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size).
        It is expected as a dtype=uint16 or equivalent

        subword_id_input_mask_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size).
        It is expected as a dtype=uint8 or equivalent and has binary values of 1 when there is data and zero otherwise.

        Returned is a 3d tensor of shape (num_words = num RNN states, batch_size, sentence_embedding_size).
        Also returned is a 1d tensor of size (batch_size) indicating whether each sentence in the batch is valid or empty.
        """

        word_embeddings, word_embeddings_mask = self.compositional_layer.apply(subword_id_input_, subword_id_input_mask_)
        sentence_embeddings = self.language_model_RNN.apply(word_embeddings, mask=word_embeddings_mask)

        sentence_embeddings_mask = word_embeddings_mask.max(axis=0).T

        return sentence_embeddings, sentence_embeddings_mask
Example #18
def example5():
    """Bidirectional + SimpleRecurrent. Adapted from a unit test in blocks."""

    bidir = Bidirectional(weights_init=Orthogonal(),
                          prototype=SimpleRecurrent(dim=3, activation=Tanh()))

    simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                             activation=Tanh(), seed=1)

    bidir.allocate()
    simple.initialize()

    bidir.children[0].parameters[0].set_value(simple.parameters[0].get_value())
    bidir.children[1].parameters[0].set_value(simple.parameters[0].get_value())

    # Initialize theano variables and functions
    x = tensor.tensor3('x')
    mask = tensor.matrix('mask')

    calc_bidir = theano.function([x, mask], [bidir.apply(x, mask=mask)])
    calc_simple = theano.function([x, mask], [simple.apply(x, mask=mask)])

    # Testing time
    x_val = 0.1 * np.asarray(list(itertools.permutations(range(4))),
                             dtype=theano.config.floatX)
    x_val = (np.ones((24, 4, 3), dtype=theano.config.floatX) *
             x_val[..., None])

    mask_val = np.ones((24, 4), dtype=theano.config.floatX)
    mask_val[12:24, 3] = 0

    h_bidir = calc_bidir(x_val, mask_val)[0]
    h_simple = calc_simple(x_val, mask_val)[0]
    h_simple_rev = calc_simple(x_val[::-1], mask_val[::-1])[0]

    print(h_bidir)
    print(h_simple)
    print(h_simple_rev)
Example #20
class TextRNN(object):

    def __init__(self, dim_in, dim_hidden, dim_out, **kwargs):

        self.dim_in = dim_in
        self.dim_hidden = dim_hidden
        self.dim_out = dim_out

        self.input_layer = Linear(input_dim=self.dim_in, output_dim=self.dim_hidden,
                                weights_init=initialization.IsotropicGaussian(),
                                biases_init=initialization.Constant(0))
        self.input_layer.initialize()

        sparse_init = initialization.Sparse(num_init=15, weights_init=initialization.IsotropicGaussian())
        self.recurrent_layer = SimpleRecurrent(
                                dim=self.dim_hidden, activation=Tanh(), name="first_recurrent_layer",
                                weights_init=sparse_init,
                                biases_init=initialization.Constant(0.01))
        '''
        self.recurrent_layer = LSTM(dim=self.dim_hidden, activation=Tanh(),
                                    weights_init=initialization.IsotropicGaussian(std=0.001),
                                    biases_init=initialization.Constant(0.01))
        '''
        self.recurrent_layer.initialize()

        self.output_layer = Linear(input_dim=self.dim_hidden, output_dim=self.dim_out,
                                weights_init=initialization.Uniform(width=0.01),
                                biases_init=initialization.Constant(0.01))
        self.output_layer.initialize()

        self.children = [self.input_layer, self.recurrent_layer, self.output_layer]

    '''
    @recurrent(sequences=['inputs'], 
            states=['states'],
            contexts=[],
            outputs=['states', 'output'])
    '''

    def run(self, inputs):
        output = self.output_layer.apply(
            self.recurrent_layer.apply(self.input_layer.apply(inputs)))
        return output
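A hypothetical usage sketch for TextRNN: all bricks are initialized in __init__, so run can be applied directly to a symbolic sequence.

x = tensor.tensor3('x')                            # (T, batch, dim_in)
net = TextRNN(dim_in=300, dim_hidden=128, dim_out=9)
y = net.run(x)                                     # (T, batch, dim_out)
f = theano.function([x], y)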
Example #21
# Computational Graph
input = T.tensor3('input')
mask = T.fmatrix('mask')
target = T.tensor3('target')
linear1 = Linear(name='linear1', input_dim=300, output_dim=128)
recurrent = SimpleRecurrent(name='recurrent', activation=Tanh(), dim=128)
linear2 = Linear(name='linear2', input_dim=128, output_dim=9)
softmax = Softmax()
bricks = [linear1, recurrent, linear2]
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()

linear1_output = linear1.apply(input)
recurrent_output = recurrent.apply(linear1_output, mask=mask)
linear2_output = linear2.apply(recurrent_output)
shape = linear2_output.shape  # e.g. 100 x 29 x 9
output = softmax.apply(linear2_output.reshape(
    (-1, 9))).reshape(shape)  # flatten every dimension except the last one, which is 9

# Cost and Functions
cost = T.nnet.categorical_crossentropy(output, target)  # 100 x 29
cost = cost * mask
cost = cost.mean()

params = Model(cost).parameters
updates = sgd(cost, params)
f_train = theano.function(inputs=[input, mask, target],
                          outputs=cost,
Example #22
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean()


from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]


algorithm = GradientDescent(
Example #23
def rnn_layer(dim, h, n):
    linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n))
    rnn = SimpleRecurrent(dim=dim, activation=Tanh(), name='rnn' + str(n))
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))
Example #24
class BaselineCompositionalLayerToyBidirectional(Initializable):
    def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, add_one = True, **kwargs):

        super(BaselineCompositionalLayerToyBidirectional, self).__init__(**kwargs)

        self.batch_size = batch_size
        self.num_subwords = num_subwords # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size
        self.add_one = add_one # adds 1 to the backwards embeddings

        # create the look up table
        self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup')
        self.lookup.weights_init = Uniform(width=0.08)
        self.lookup.biases_init = Constant(0)

        # has one RNN which reads the subwords into a word embedding
        self.compositional_subword_to_word_RNN_forward = SimpleRecurrent(
            dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_forward',
            weights_init=Identity_init())

        self.compositional_subword_to_word_RNN_backward = SimpleRecurrent(
            dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_backward',
            weights_init=Identity_init())

        self.children = [self.lookup, self.compositional_subword_to_word_RNN_forward,
                         self.compositional_subword_to_word_RNN_backward]


    '''
    The RNN eats up the subword dimension, resulting in a
    3d tensor of shape (num_subwords, batch_size, RNN_hidden_value_size * 2), which is returned as 'word_embeddings'.

    NOTE: the shape uses num_subwords, not num_words.

    The backward embedding elements have 1 added to them, to mark them as different from the forward ones.
    '''
    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_with_states', 'word_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        ##shape = (num_words, num_subwords, batch_size, embedding size)
        subword_embeddings = self.lookup.apply(subword_id_input_)

        #forward sequence
        forward_result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords
            fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_forward.apply(subword_embeddings, mask=subword_id_input_mask_),
            sequences= [subword_embeddings, subword_id_input_mask_])

        forward_word_embeddings_with_states = forward_result.dimshuffle(1,0,2,3) # keep to check for values as output

        # do not dimshuffle here: we want state 1, then state 2, then state 3, etc.
        s = forward_result.shape
        forward_word_embeddings = T.reshape(forward_result, (s[0]*s[1], s[2], s[3]))

        #backward sequence
        backward_result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords
            fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_backward.apply(subword_embeddings, mask=subword_id_input_mask_),
            sequences= [subword_embeddings[:,::-1,:], subword_id_input_mask_[:,::-1,:]])

        # NOTE: add 1 to the backward embeddings to mark them as different from the forward ones
        if self.add_one:
            backward_result = backward_result + 1


        backward_word_embeddings_with_states = backward_result.dimshuffle(1,0,2,3) # keep to check for values as output

        backward_word_embeddings = T.reshape(backward_result, (s[0]*s[1], s[2], s[3]))


        word_embeddings_with_states = T.concatenate([forward_word_embeddings_with_states, backward_word_embeddings_with_states], axis=3)
        word_embeddings_with_states = word_embeddings_with_states.dimshuffle(2,0,1,3)[-1]

        word_embeddings = T.concatenate([forward_word_embeddings, backward_word_embeddings], axis=2)

        # remove the subword dim from the mask:
        # a word is empty iff all of its subwords are empty
        word_embeddings_mask_with_states = subword_id_input_mask_.max(axis=1)

        return word_embeddings, word_embeddings_with_states, word_embeddings_mask_with_states
Example #25
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
Example #26
    def __init__(self, rnn_dims, num_actions, data_X_np=None, data_y_np=None, width=32, height=32):
        ###############################################################
        #
        #       Network and data setup
        #
        ##############################################################
        RNN_DIMS = 100
        NUM_ACTIONS = num_actions

        tensor5 = T.TensorType('float32', [False, True, True, True, True])
        self.x = T.tensor4('features')
        self.reward = T.tensor3('targets', dtype='float32')
        self.state = T.matrix('states', dtype='float32')

        self.hidden_states = [] # holds hidden states in np array form

        
        #data_X & data_Y supplied in init function now...

        if data_X_np is None or data_y_np is None:
            print('you did not supply data at init')
            data_X_np = np.float32(np.random.normal(size=(1280, 1,1, width, height)))
            data_y_np = np.float32(np.random.normal(size=(1280, 1,1,1)))
        #data_states_np = np.float32(np.ones((1280, 1, 100)))
        state_shape = (data_X_np.shape[0],rnn_dims)
        self.data_states_np = np.float32(np.zeros(state_shape))


        self.datastream = IterableDataset(dict(features=data_X_np,
                                            targets=data_y_np,
                                            states=self.data_states_np)).get_example_stream()
        self.datastream_test = IterableDataset(dict(features=data_X_np,
                                            targets=data_y_np,
                                            states=self.data_states_np)).get_example_stream()
        data_X = self.datastream


        # 2 conv inputs
        # we want to take our sequence of input images and convert them to convolutional
        # representations
        conv_layers = [ConvolutionalLayer(Rectifier().apply, (3, 3), 16, (2, 2), name='l1'),
                       ConvolutionalLayer(Rectifier().apply, (3, 3), 32, (2, 2), name='l2'),
                       ConvolutionalLayer(Rectifier().apply, (3, 3), 64, (2, 2), name='l3'),
                       ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l4'),
                       ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l5'),
                       ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l6')]
        convnet = ConvolutionalSequence(conv_layers, num_channels=4,
                                        image_size=(width, height),
                                        weights_init=init.Uniform(0, 0.01),
                                        biases_init=init.Constant(0.0),
                                        tied_biases=False,
                                        border_mode='full')
        convnet.initialize()
        output_dim = np.prod(convnet.get_dim('output'))

        conv_out = convnet.apply(self.x)

        reshape_dims = (conv_out.shape[0], conv_out.shape[1]*conv_out.shape[2]*conv_out.shape[3])
        hidden_repr = conv_out.reshape(reshape_dims)
        conv2rnn = Linear(input_dim=output_dim, output_dim=RNN_DIMS, 
                            weights_init=init.Uniform(width=0.01),
                            biases_init=init.Constant(0.))
        conv2rnn.initialize()
        conv2rnn_output = conv2rnn.apply(hidden_repr)

        # RNN hidden layer
        # then we want to feed those conv representations into an RNN
        rnn = SimpleRecurrent(dim=RNN_DIMS, activation=Rectifier(), weights_init=init.Uniform(width=0.01))
        rnn.initialize()
        self.learned_state = rnn.apply(inputs=conv2rnn_output, states=self.state, iterate=False)


        # linear output from hidden layer
        # the RNN has two outputs, but only this one has a target. That is, this is "expected return"
        # which the network attempts to minimize difference between expected return and actual return
        lin_output = Linear(input_dim=RNN_DIMS, output_dim=1, 
                            weights_init=init.Uniform(width=0.01),
                            biases_init=init.Constant(0.))
        lin_output.initialize()
        self.exp_reward = lin_output.apply(self.learned_state)
        self.get_exp_reward = theano.function([self.x, self.state], self.exp_reward)

        # softmax output from hidden layer
        # this provides a softmax of action recommendations
        # the hypothesis is that adjusting the other outputs magically influences this set of outputs
        # to suggest smarter (or more realistic?) moves
        action_output = Linear(input_dim=RNN_DIMS, output_dim=NUM_ACTIONS, 
                            weights_init=init.Constant(.001), 
                            biases_init=init.Constant(0.))
        action_output.initialize()

        self.suggested_actions = Softmax().apply(action_output.apply(self.learned_state[-1]))

        ######################
        # use this to get suggested actions... it requires the state of the hidden units from the previous
        # timestep
        #####################
        self.get_suggested_actions = theano.function([self.x, self.state], [self.suggested_actions, self.learned_state])
Example #27
def test_attention_recurrent():
    rng = numpy.random.RandomState(1234)

    dim = 5
    batch_size = 4
    input_length = 20

    attended_dim = 10
    attended_length = 15

    wrapped = SimpleRecurrent(dim, Identity())
    attention = SequenceContentAttention(state_names=wrapped.apply.states,
                                         attended_dim=attended_dim,
                                         match_dim=attended_dim)
    recurrent = AttentionRecurrent(wrapped, attention, seed=1234)
    recurrent.weights_init = IsotropicGaussian(0.5)
    recurrent.biases_init = Constant(0)
    recurrent.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    outputs = recurrent.apply(inputs=inputs,
                              mask=inputs_mask,
                              attended=attended,
                              attended_mask=attended_mask)
    states, glimpses, weights = outputs
    assert states.ndim == 3
    assert glimpses.ndim == 3
    assert weights.ndim == 3

    # For values.
    def rand(size):
        return rng.uniform(size=size).astype(theano.config.floatX)

    # For masks.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=theano.config.floatX)
        # To make it look like read data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    input_vals = rand((input_length, batch_size, dim))
    input_mask_vals = generate_mask(input_length, batch_size)
    attended_vals = rand((attended_length, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_length, batch_size)

    func = theano.function([inputs, inputs_mask, attended, attended_mask],
                           [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(input_vals, input_mask_vals,
                                                   attended_vals,
                                                   attended_mask_vals)
    assert states_vals.shape == (input_length, batch_size, dim)
    assert glimpses_vals.shape == (input_length, batch_size, attended_dim)

    assert (len(ComputationGraph(outputs).shared_variables) == len(
        Selector(recurrent).get_parameters()))

    # Manual reimplementation
    inputs2d = tensor.matrix()
    states2d = tensor.matrix()
    mask1d = tensor.vector()
    weighted_averages = tensor.matrix()
    distribute_func = theano.function([inputs2d, weighted_averages],
                                      recurrent.distribute.apply(
                                          inputs=inputs2d,
                                          weighted_averages=weighted_averages))
    wrapped_apply_func = theano.function([states2d, inputs2d, mask1d],
                                         wrapped.apply(states=states2d,
                                                       inputs=inputs2d,
                                                       mask=mask1d,
                                                       iterate=False))
    attention_func = theano.function([states2d, attended, attended_mask],
                                     attention.take_glimpses(
                                         attended=attended,
                                         attended_mask=attended_mask,
                                         states=states2d))
    states_man = wrapped.initial_states(batch_size).eval()
    glimpses_man = numpy.zeros((batch_size, attended_dim),
                               dtype=theano.config.floatX)
    for i in range(input_length):
        inputs_man = distribute_func(input_vals[i], glimpses_man)
        states_man = wrapped_apply_func(states_man, inputs_man,
                                        input_mask_vals[i])
        glimpses_man, weights_man = attention_func(states_man, attended_vals,
                                                   attended_mask_vals)
        assert_allclose(states_man, states_vals[i], rtol=1e-5)
        assert_allclose(glimpses_man, glimpses_vals[i], rtol=1e-5)
        assert_allclose(weights_man, weight_vals[i], rtol=1e-5)

    # attention weights at padded (mask == 0) attended positions must be zero
    assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0)
    # attention weights at valid (mask == 1) attended positions must be non-zero
    assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5)
    # weights from different steps should be noticeably different
    assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2
    # weights for all steps after the last valid input position should be the same
    for i in range(batch_size):
        last = int(input_mask_vals[:, i].sum())
        for j in range(last, input_length):
            assert_allclose(weight_vals[last, i], weight_vals[j, i], 1e-5)
Example #28
    brick.initialize()

lstm.weights_init = IsotropicGaussian(0.01)
#lstm.weights_init = Orthogonal()
lstm.biases_init = Constant(0.)
lstm.initialize()

#ComputationGraph(encode.apply(x)).get_theano_function()(features_test)[0].shape
#ComputationGraph(lstm.apply(encoded)).get_theano_function()(features_test)
#ComputationGraph(decode.apply(hiddens[-1])).get_theano_function()(features_test)[0].shape

#ComputationGraph(SquaredError().apply(y, y_hat.flatten())).get_theano_function()(features_test, targets_test)[0].shape

encoded = encode.apply(x)
#hiddens = lstm.apply(encoded, gates.apply(x))
hiddens = lstm.apply(encoded)
y_hat  = decode.apply(hiddens[-1])

cost = SquaredError().apply(y, y_hat)
cost.name = 'cost'

#ipdb.set_trace()

#ComputationGraph(y_hat).get_theano_function()(features_test)[0].shape
#ComputationGraph(cost).get_theano_function()(features_test, targets_test)[0].shape

cg = ComputationGraph(cost)

#cg = ComputationGraph(hiddens).get_theano_function()
#ipdb.set_trace()
algorithm = GradientDescent(cost=cost, 
Example #30
class Rnn(Initializable, BaseRecurrent):
    def __init__(self, dims=(88, 100, 100), **kwargs):
        super(Rnn, self).__init__(**kwargs)
        self.dims = dims

        self.input_transform = Linear(
            input_dim=dims[0],
            output_dim=dims[1],
            weights_init=IsotropicGaussian(0.01),
            # biases_init=Constant(0.0),
            use_bias=False,
            name="input_transfrom")

        self.gru_layer = SimpleRecurrent(dim=dims[1],
                                         activation=Tanh(),
                                         weights_init=IsotropicGaussian(0.01),
                                         biases_init=Constant(0.0),
                                         use_bias=True,
                                         name="gru_rnn_layer")

        # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn
        self.linear_trans = Linear(input_dim=dims[1],
                                   output_dim=dims[2] * 4,
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0.0),
                                   use_bias=False,
                                   name="h2h_transform")

        self.lstm_layer = LSTM(dim=dims[2],
                               activation=Tanh(),
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0),
                               use_bias=True,
                               name="lstm_rnn_layer")

        self.out_transform = MLP(activations=[Sigmoid()],
                                 dims=[dims[2], dims[0]],
                                 weights_init=IsotropicGaussian(0.01),
                                 use_bias=True,
                                 biases_init=Constant(0.0),
                                 name="out_layer")

        self.children = [
            self.input_transform, self.gru_layer, self.linear_trans,
            self.lstm_layer, self.out_transform
        ]

    # @recurrent(sequences=['inputs', 'input_mask'], contexts=[],
    # states=['gru_state', 'lstm_state', 'lstm_cells'],
    # outputs=['gru_state', 'lstm_state', 'lstm_cells'])
    def rnn_apply(self,
                  inputs,
                  mask=None,
                  gru_state=None,
                  lstm_state=None,
                  lstm_cells=None):
        input_transform = self.input_transform.apply(inputs)
        gru_state = self.gru_layer.apply(
            inputs=input_transform,
            # update_inputs=input_transform,
            # reset_inputs=input_transform,
            states=gru_state,
            mask=mask,
            iterate=False)
        lstm_transform = self.linear_trans.apply(gru_state)
        lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform,
                                                       states=lstm_state,
                                                       cells=lstm_cells,
                                                       mask=mask,
                                                       iterate=False)
        return gru_state, lstm_state, lstm_cells

    @recurrent(sequences=[],
               contexts=[],
               states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'],
               outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'])
    def rnn_generate(self,
                     inputs=None,
                     gru_state=None,
                     lstm_state=None,
                     lstm_cells=None):
        output = self.apply(inputs=inputs,
                            gru_state=gru_state,
                            lstm_state=lstm_state,
                            lstm_cells=lstm_cells,
                            iterate=False)
        return output, gru_state, lstm_state, lstm_cells

    @recurrent(sequences=['inputs', 'mask'],
               contexts=[],
               states=['gru_state', 'lstm_state', 'lstm_cells'],
               outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells'])
    def apply(self,
              inputs,
              mask,
              gru_state=None,
              lstm_state=None,
              lstm_cells=None):
        # input_transform = self.input_transform.apply(inputs)
        # gru_state = self.gru_layer.apply(
        # inputs=input_transform,
        #     mask=mask,
        #     states=gru_state,
        #     iterate=False)
        # lstm_transform = self.linear_trans.apply(gru_state)
        # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state,
        #                                                cells=lstm_cells,
        #                                                mask=mask, iterate=False)
        gru_state, lstm_state, lstm_cells = self.rnn_apply(
            inputs=inputs,
            mask=mask,
            gru_state=gru_state,
            lstm_state=lstm_state,
            lstm_cells=lstm_cells)

        output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None]
        return output, gru_state, lstm_state, lstm_cells

    def get_dim(self, name):
        dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims))
        dims['lstm_cells'] = dims['lstm_state']
        return dims.get(name, None) or super(Rnn, self).get_dim(name)
def test_attention_recurrent():
    rng = numpy.random.RandomState(1234)

    dim = 5
    batch_size = 4
    input_length = 20

    attended_dim = 10
    attended_length = 15

    wrapped = SimpleRecurrent(dim, Identity())
    attention = SequenceContentAttention(
        state_names=wrapped.apply.states,
        attended_dim=attended_dim, match_dim=attended_dim)
    recurrent = AttentionRecurrent(wrapped, attention, seed=1234)
    recurrent.weights_init = IsotropicGaussian(0.5)
    recurrent.biases_init = Constant(0)
    recurrent.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    outputs = recurrent.apply(
        inputs=inputs, mask=inputs_mask,
        attended=attended, attended_mask=attended_mask)
    states, glimpses, weights = outputs
    assert states.ndim == 3
    assert glimpses.ndim == 3
    assert weights.ndim == 3

    # For values.
    def rand(size):
        return rng.uniform(size=size).astype(theano.config.floatX)

    # For masks.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=theano.config.floatX)
        # To make it look like real data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    input_vals = rand((input_length, batch_size, dim))
    input_mask_vals = generate_mask(input_length, batch_size)
    attended_vals = rand((attended_length, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_length, batch_size)

    func = theano.function([inputs, inputs_mask, attended, attended_mask],
                           [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(
        input_vals, input_mask_vals,
        attended_vals, attended_mask_vals)
    assert states_vals.shape == (input_length, batch_size, dim)
    assert glimpses_vals.shape == (input_length, batch_size, attended_dim)

    assert (len(ComputationGraph(outputs).shared_variables) ==
            len(Selector(recurrent).get_parameters()))

    # Manual reimplementation
    inputs2d = tensor.matrix()
    states2d = tensor.matrix()
    mask1d = tensor.vector()
    weighted_averages = tensor.matrix()
    distribute_func = theano.function(
        [inputs2d, weighted_averages],
        recurrent.distribute.apply(
            inputs=inputs2d,
            weighted_averages=weighted_averages))
    wrapped_apply_func = theano.function(
        [states2d, inputs2d, mask1d], wrapped.apply(
            states=states2d, inputs=inputs2d, mask=mask1d, iterate=False))
    attention_func = theano.function(
        [states2d, attended, attended_mask],
        attention.take_glimpses(
            attended=attended, attended_mask=attended_mask,
            states=states2d))
    states_man = wrapped.initial_states(batch_size).eval()
    glimpses_man = numpy.zeros((batch_size, attended_dim),
                               dtype=theano.config.floatX)
    for i in range(input_length):
        inputs_man = distribute_func(input_vals[i], glimpses_man)
        states_man = wrapped_apply_func(states_man, inputs_man,
                                        input_mask_vals[i])
        glimpses_man, weights_man = attention_func(
            states_man, attended_vals, attended_mask_vals)
        assert_allclose(states_man, states_vals[i], rtol=1e-5)
        assert_allclose(glimpses_man, glimpses_vals[i], rtol=1e-5)
        assert_allclose(weights_man, weight_vals[i], rtol=1e-5)

    # weights at masked-out (padding) positions must be zero
    assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0)
    # weights at valid (unmasked) positions must be non-zero
    assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5)
    # weights from different steps should be noticeably different
    assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2
    # weights for all steps after the last valid position should be the same
    for i in range(batch_size):
        last = int(input_mask_vals[:, i].sum())
        for j in range(last, input_length):
            assert_allclose(weight_vals[last, i], weight_vals[j, i], 1e-5)
Beispiel #32
0
x_dim = 1  # as defined in the identical snippet in Beispiel #35
h_dim = 100
o_dim = 10
batch_size = 50

print('Building model ...')
# T x B x F
x = tensor.tensor3('x', dtype=floatX)
y = tensor.tensor3('y', dtype='int32')

x_to_h1 = Linear(name='x_to_h1',
                 input_dim=x_dim,
                 output_dim=h_dim)
pre_rnn = x_to_h1.apply(x)
rnn = SimpleRecurrent(activation=Rectifier(),
                      dim=h_dim, name="rnn")
h1 = rnn.apply(pre_rnn)
h1_to_o = Linear(name='h1_to_o',
                 input_dim=h_dim,
                 output_dim=o_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'

class SimpleRecurrentLayer(Initializable, Feedforward):
    """ Blocks implementation of SimpleRecurrent is general and only
    handles the "recursive part". This class wraps the SimpleRecurrent
    class and adds linear input transformation. It can be used for most basic
    cases as a layer in a sequence of layers.

    Parameters
    ----------
    input_dim : int
    state_dim : int
    activation : Brick
    state_weights_init : NdarrayInitialization
        Initialization of the recurrent (state-to-state) weights.
    input_weights_init : NdarrayInitialization
        Initialization of weights in linear transformation of input.
    biases_init : NdarrayInitialization
        Initialization of biases in linear transformation of input.
    """

    @lazy()
    def __init__(
        self,
        input_dim,
        state_dim,
        activation=Tanh(),
        state_weights_init=None,
        input_weights_init=None,
        biases_init=None,
        **kwargs
    ):
        if state_weights_init is None:
            state_weights_init = init.IsotropicGaussian(0.01)
        if input_weights_init is None:
            input_weights_init = init.IsotropicGaussian(0.01)
        if biases_init is None:
            biases_init = init.Constant(0)
        # Resolve the defaults before calling the parent constructor so that
        # the parent sees the actual initialization scheme, not None.
        super(SimpleRecurrentLayer, self).__init__(biases_init=biases_init,
                                                   **kwargs)

        self.input_transformation = Linear(
            input_dim=input_dim, output_dim=state_dim, weights_init=input_weights_init, biases_init=biases_init
        )
        self.rnn = SimpleRecurrent(dim=state_dim, activation=activation, weights_init=state_weights_init)
        self.children = [self.input_transformation, self.rnn]

    @application
    def apply(self, inputs, *args, **kwargs):
        """ Transforms input, sends to BasicRecurrent and returns output.

        Parameters
        ----------
        inputs : tensor.TensorVariable
            The 3 dimensional tensor of inputs in the shape (timesteps,
            batch_size, features).

        Returns
        -------
        outputs : tensor.TensorVariable
            The 3 dimensional tensor of outputs in the shape (timesteps,
            batch_size, features).
        """
        rnn_inputs = self.input_transformation.apply(inputs)
        outputs = self.rnn.apply(inputs=rnn_inputs, *args, **kwargs)

        return outputs

    @apply.delegate
    def apply_delegate(self):
        return self.children[0].apply

    @property
    def input_dim(self):
        return self.input_transformation.input_dim

    @input_dim.setter
    def input_dim(self, value):
        self.input_transformation.input_dim = value

    @property
    def output_dim(self):
        return self.rnn.dim

    @output_dim.setter
    def output_dim(self, value):
        self.rnn.dim = value
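
# A minimal usage sketch for SimpleRecurrentLayer (illustrative only, not part
# of the original source); the default initializations are used, and the
# `init` alias from the class above is assumed to be blocks.initialization.
import numpy
import theano
from theano import tensor

layer = SimpleRecurrentLayer(input_dim=4, state_dim=6)
layer.initialize()

x = tensor.tensor3('x')             # (timesteps, batch_size, input_dim)
h = layer.apply(x)                  # (timesteps, batch_size, state_dim)

f = theano.function([x], h)
out = f(numpy.ones((5, 3, 4), dtype=theano.config.floatX))
print(out.shape)                    # expected: (5, 3, 6)
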
Beispiel #34
0
class EUTHM(UTHM):
    '''
    UTH model with extended information
    '''
    def __init__(self, config, dataset, *args, **kwargs):
        super(EUTHM, self).__init__(config, dataset, *args, **kwargs)

    def _define_inputs(self, *args, **kwargs):
        super(EUTHM, self)._define_inputs()
        self.user_word = tensor.ivector('user_word')
        self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask',
                                                   dtype=theano.config.floatX)
        self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx')
        self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx')
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        # Build lookup tables
        super(EUTHM, self)._build_bricks()
        self.user2word = MLP(
            activations=[Tanh('user2word_tanh')],
            dims=[self.config.user_embed_dim, self.config.word_embed_dim],
            name='user2word_mlp')
        self.user2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.user2word.biases_init = Constant(0)
        self.user2word.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[
                self.config.user_embed_dim + self.config.word_embed_dim,
                self.config.word_embed_dim
            ],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.user2word_bias = Bias(dim=1, name='user2word_bias')
        self.user2word_bias.biases_init = Constant(0)
        self.user2word_bias.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        # Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                           self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _set_OV_value(self, *args, **kwargs):
        '''Reset the <unk> character representation to zeros'''
        # tensor.set_subtensor returns a new symbolic variable and does not
        # modify W in place, so update the shared value directly instead.
        W = self.char_embed.W.get_value()
        W[self.dataset.char2index['<unk>']] = numpy.zeros(
            self.config.char_embed_dim, dtype=theano.config.floatX)
        self.char_embed.W.set_value(W)

    def _get_text_vec(self, *args, **kwargs):
        # Transpose text
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, user and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply user word, hashtag word and url
        text_vec = self._apply_user_word(text_vec)
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        return text_vec

    def _apply_user_word(self, text_vec, *args, **kwargs):
        '''
        Replace @a mentions in text_vec with the transformed author vector.
        '''
        user_word_vec = self.user2word.apply(self.user_embed.apply(self.user_word)) + \
                        self.user2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.user_word_right_idx, self.user_word_left_idx],
            text_vec[self.user_word_right_idx, self.user_word_left_idx] *
            (1 - self.user_word_sparse_mask[:, None]) +
            user_word_vec * self.user_word_sparse_mask[:, None])
        return text_vec

    def _apply_hashtag_word(self, text_vec, *args, **kwargs):
        '''
        Replace #h tokens in text_vec with the transformed hashtag vector.
        '''
        hashtag_word_vec = self.hashtag2word.apply(self.hashtag_embed.apply(self.hashtag_word)) +\
                           self.hashtag2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx],
            text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx] *
            (1 - self.hashtag_sparse_mask[:, None]) +
            hashtag_word_vec * self.hashtag_sparse_mask[:, None])
        return text_vec

    def _apply_sparse_word(self, text_vec, *args, **kwargs):
        '''
        Replace sparse-word positions in text_vec with a character-level RNN
        encoding of the word (an LSTM could be used here instead).
        '''
        sparse_word_vec = self.char_embed.apply(self.sparse_word)
        sparse_word_hiddens = self.rnn.apply(
            inputs=self.rnn_ins.apply(sparse_word_vec),
            mask=self.sparse_word_mask)
        tmp = sparse_word_hiddens[-1]
        text_vec = tensor.set_subtensor(
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx],
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx] *
            (1 - self.sparse_word_sparse_mask[:, None]) +
            tmp * self.sparse_word_sparse_mask[:, None])
        return text_vec
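
# Toy sketch of the masked-replacement pattern used above (illustrative names,
# not from the original source): rows (time_idx, batch_idx) of a
# (time, batch, dim) tensor are blended with new vectors via a 0/1 mask.
import numpy
import theano
from theano import tensor

text_vec = tensor.tensor3('text_vec')   # (time, batch, dim)
new_vec = tensor.matrix('new_vec')      # (n_updates, dim)
time_idx = tensor.ivector('time_idx')
batch_idx = tensor.ivector('batch_idx')
blend = tensor.vector('blend')          # 1.0 = replace, 0.0 = keep

updated = tensor.set_subtensor(
    text_vec[time_idx, batch_idx],
    text_vec[time_idx, batch_idx] * (1 - blend[:, None])
    + new_vec * blend[:, None])

f = theano.function([text_vec, new_vec, time_idx, batch_idx, blend], updated)
tv = numpy.zeros((4, 2, 3), dtype=theano.config.floatX)
nv = numpy.ones((2, 3), dtype=theano.config.floatX)
out = f(tv, nv,
        numpy.array([0, 2], dtype='int32'),
        numpy.array([1, 0], dtype='int32'),
        numpy.array([1., 0.], dtype=theano.config.floatX))
print(out[0, 1])   # replaced -> [1. 1. 1.]; out[2, 0] stays all zeros
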
Beispiel #35
0
n_epochs = 30
x_dim = 1
h_dim = 100
o_dim = 10
batch_size = 50

print('Building model ...')
# T x B x F
x = tensor.tensor3('x', dtype=floatX)
y = tensor.tensor3('y', dtype='int32')

x_to_h1 = Linear(name='x_to_h1', input_dim=x_dim, output_dim=h_dim)
pre_rnn = x_to_h1.apply(x)
rnn = SimpleRecurrent(activation=Rectifier(), dim=h_dim, name="rnn")
h1 = rnn.apply(pre_rnn)
h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=o_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization

def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')

    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))

    n_voc = len(train_dataset.dict)

    # Empirical distribution over the first word of each example. (The
    # original sum(filter(...)) summed the matching indices, i.e. w * count,
    # rather than counting occurrences.)
    first_words = [s[0] for s in train_dataset.indexables[
        train_dataset.sources.index('features')]]
    init_probs = numpy.array(
        [first_words.count(w) for w in range(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    # Apparently a leftover from an LSTM variant; unused by the
    # SimpleRecurrent below, kept only for reference.
    # lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    # lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                states=states,
                                iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)

        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )

    sampling = theano.function([], samples.owner.inputs[0].T)

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
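
# The sampling loop above uses the Gumbel-max trick: argmax(log p + Gumbel
# noise) is an exact draw from the categorical distribution p. A small numpy
# check of that identity (illustrative only, not from the original source):
import numpy

rng = numpy.random.RandomState(0)
probs = numpy.array([0.2, 0.5, 0.3])

u = rng.uniform(size=(100000, probs.shape[0]))
gumbel_noise = -numpy.log(-numpy.log(u))
samples = (numpy.log(probs) + gumbel_noise).argmax(axis=-1)

freq = numpy.bincount(samples, minlength=probs.shape[0]) / float(samples.shape[0])
print(freq)   # approximately [0.2, 0.5, 0.3]
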
Beispiel #37
0

print('Building model ...')
# T x B x F
x = tensor.tensor3('x', dtype=floatX)
# T x B
x_mask = tensor.matrix('x_mask', dtype=floatX)
# L x B
y = tensor.matrix('y', dtype=floatX)
# L x B
y_mask = tensor.matrix('y_mask', dtype=floatX)

x_to_h = Linear(name='x_to_h', input_dim=x_dim, output_dim=h_dim)
x_transform = x_to_h.apply(x)
rnn = SimpleRecurrent(activation=Tanh(), dim=h_dim, name="rnn")
h = rnn.apply(x_transform)
h_to_o = Linear(name='h_to_o', input_dim=h_dim, output_dim=num_classes + 1)
h_transform = h_to_o.apply(h)
# T x B x C+1
y_hat = tensor.nnet.softmax(h_transform.reshape(
    (-1, num_classes + 1))).reshape((h.shape[0], h.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = x_mask
cost = CTC().apply(y, y_hat, y_mask, y_hat_mask, 'normal_scale')
cost.name = 'CTC'
# Initialization
for brick in (rnn, x_to_h, h_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()
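
# tensor.nnet.softmax only accepts a 2D input, hence the reshape round trip
# above. A shape sketch of the same trick (illustrative sizes, not from the
# original source):
import numpy
import theano
from theano import tensor

T_, B, C = 6, 3, 5                  # time, batch, classes
h3 = tensor.tensor3('h3')           # (T, B, C)
flat_softmax = tensor.nnet.softmax(h3.reshape((-1, C)))
y_hat3 = flat_softmax.reshape((h3.shape[0], h3.shape[1], -1))

f = theano.function([h3], y_hat3)
out = f(numpy.zeros((T_, B, C), dtype=theano.config.floatX))
print(out.shape, out[0, 0].sum())   # (6, 3, 5) 1.0
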
Beispiel #38
0
def construct_model(vocab_size, embedding_dim, hidden_dim, activation):

    # Construct the model
    x = tensor.lmatrix('features')
    x_mask = tensor.fmatrix('features_mask')
    y = tensor.lmatrix('targets')
    # Batch X Time
    y_mask = tensor.fmatrix('targets_mask')
    # Batch X Time
    frequency_mask = tensor.fmatrix('frequency_mask')
    frequency_mask_mask = tensor.fmatrix('frequency_mask_mask')

    # Only for the validation
    last_word = tensor.lvector('last_word')

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')

    linear = Linear(input_dim=embedding_dim,
                    output_dim=hidden_dim,
                    name="linear")
    hidden = SimpleRecurrent(dim=hidden_dim,
                             activation=activation,
                             name='hidden_recurrent')
    top_linear = Linear(input_dim=hidden_dim,
                        output_dim=vocab_size,
                        name="top_linear")

    # Return 3D Tensor: Batch X Time X embedding_dim
    embeddings = lookup.apply(x)
    # Give time as the first index: Time X Batch X embedding_dim
    embeddings = embeddings.dimshuffle(1, 0, 2)

    pre_recurrent = linear.apply(embeddings)

    after_recurrent = hidden.apply(inputs=pre_recurrent, mask=x_mask.T)[:-1]
    after_recurrent_last = after_recurrent[-1]

    presoft = top_linear.apply(after_recurrent)

    # Define the cost
    # Give y as a vector and reshape presoft to 2D tensor
    y = y.flatten()

    shape = presoft.shape
    presoft = presoft.dimshuffle(1, 0, 2)
    presoft = presoft.reshape((shape[0] * shape[1], shape[2]))

    # Build cost_matrix
    presoft = presoft - presoft.max(axis=1).dimshuffle(0, 'x')
    log_prob = presoft - \
        tensor.log(tensor.exp(presoft).sum(axis=1).dimshuffle(0, 'x'))
    flat_log_prob = log_prob.flatten()
    range_ = tensor.arange(y.shape[0])
    flat_indices = y + range_ * presoft.shape[1]
    cost_matrix = flat_log_prob[flat_indices]

    # Zero out the cost at masked (padding / filtered) positions
    cost_matrix = - cost_matrix * \
        y_mask.flatten() * frequency_mask.flatten() * \
        frequency_mask_mask.flatten()

    # Average the cost
    cost = cost_matrix.sum()
    cost = cost / (y_mask * frequency_mask).sum()

    # Initialize parameters
    for brick in (lookup, linear, hidden, top_linear):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.)
        brick.initialize()

    return cost
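
# The cost block above is a hand-rolled, numerically stable log-softmax
# followed by a flat gather of the target log-probabilities. The same
# arithmetic in numpy, for reference (illustrative only):
import numpy

rng = numpy.random.RandomState(0)
presoft = rng.randn(7, 4)                  # (examples, vocab)
y = rng.randint(0, 4, size=7)              # target indices

# Stable log-softmax: subtract the row max before exponentiating.
shifted = presoft - presoft.max(axis=1, keepdims=True)
log_prob = shifted - numpy.log(numpy.exp(shifted).sum(axis=1, keepdims=True))

# Flat index of element (i, y[i]) in the flattened row-major matrix.
flat_indices = y + numpy.arange(y.shape[0]) * presoft.shape[1]
picked = log_prob.flatten()[flat_indices]
assert numpy.allclose(picked, log_prob[numpy.arange(7), y])
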
class CompositionalLayerToyBidirectional(Initializable):
    def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, add_one = True, **kwargs):

        super(CompositionalLayerToyBidirectional, self).__init__(**kwargs)

        self.batch_size = batch_size
        self.num_subwords = num_subwords # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size
        self.add_one = add_one  # when True, add 1 to the backward embeddings

        # create the look up table
        self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup')
        self.lookup.weights_init = Uniform(width=0.08)
        self.lookup.biases_init = Constant(0)

        # has one RNN which reads the subwords into a word embedding
        self.compositional_subword_to_word_RNN_forward = SimpleRecurrent(
            dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_forward',
            weights_init=Identity_init())

        self.compositional_subword_to_word_RNN_backward = SimpleRecurrent(
            dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_backward',
            weights_init=Identity_init())

        self.children = [self.lookup, self.compositional_subword_to_word_RNN_forward,
                         self.compositional_subword_to_word_RNN_backward]


    '''
    The RNN eats up the subword dimension, producing a 3d tensor of shape
    (num_words, batch_size, subword_RNN_hidden_state_size * 2), which is
    returned as 'word_embeddings'.

    The backward embedding elements have 1 added to them (when `add_one` is
    set), to mark them as different from the forward ones.
    '''
    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        ##shape = (num_words, num_subwords, batch_size, embedding size)
        subword_embeddings = self.lookup.apply(subword_id_input_)

        forward_result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords
            fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_forward.apply(subword_embeddings, mask=subword_id_input_mask_),
            sequences= [subword_embeddings, subword_id_input_mask_])

        forward_word_embeddings = forward_result.dimshuffle(1, 0, 2, 3)  # move the subword (time) axis to the front
        forward_word_embeddings = forward_word_embeddings[-1]  # keep only the final state; the others are not needed

        backward_result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords
            fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_backward.apply(subword_embeddings, mask=subword_id_input_mask_),
            sequences= [subword_embeddings[:,::-1,:], subword_id_input_mask_[:,::-1,:]])

        backward_word_embeddings = backward_result.dimshuffle(1, 0, 2, 3)  # move the subword (time) axis to the front
        backward_word_embeddings = backward_word_embeddings[-1]  # keep only the final state; the others are not needed

        # NOTE: add 1 to the backward embeddings to mark them as different
        # from the forward embeddings (controlled by the add_one flag)
        if self.add_one:
            backward_word_embeddings = backward_word_embeddings + 1.0
        word_embeddings = T.concatenate([forward_word_embeddings, backward_word_embeddings], axis=2)

        # remove the subword dim from the mask:
        # a word counts as empty only if all of its subwords are masked out
        word_embeddings_mask = subword_id_input_mask_.max(axis=1)

        return word_embeddings, word_embeddings_mask
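
# The dimshuffle(1, 0, 2, 3) followed by [-1] above simply selects the last
# subword state per word; on a plain array the same selection is result[:, -1].
# A numpy check with illustrative sizes (not from the original source):
import numpy

num_words, num_subwords, batch, hidden = 4, 5, 2, 3
result = numpy.arange(num_words * num_subwords * batch * hidden,
                      dtype='float32').reshape(
                          (num_words, num_subwords, batch, hidden))

last_state = result.transpose(1, 0, 2, 3)[-1]
assert last_state.shape == (num_words, batch, hidden)
assert numpy.array_equal(last_state, result[:, -1])
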
Beispiel #40
0
def construct_model(vocab_size, embedding_dim, hidden_dim,
                    activation):

    # Construct the model
    x = tensor.lmatrix('features')
    x_mask = tensor.fmatrix('features_mask')
    y = tensor.lmatrix('targets')
    # Batch X Time
    y_mask = tensor.fmatrix('targets_mask')
    # Batch X Time
    frequency_mask = tensor.fmatrix('frequency_mask')
    frequency_mask_mask = tensor.fmatrix('frequency_mask_mask')

    # Only for the validation
    last_word = tensor.lvector('last_word')

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')

    linear = Linear(input_dim=embedding_dim, output_dim=hidden_dim,
                    name="linear")
    hidden = SimpleRecurrent(dim=hidden_dim, activation=activation,
                             name='hidden_recurrent')
    top_linear = Linear(input_dim=hidden_dim, output_dim=vocab_size,
                        name="top_linear")

    # Return 3D Tensor: Batch X Time X embedding_dim
    embeddings = lookup.apply(x)
    # Give time as the first index: Time X Batch X embedding_dim
    embeddings = embeddings.dimshuffle(1, 0, 2)

    pre_recurrent = linear.apply(embeddings)

    after_recurrent = hidden.apply(inputs=pre_recurrent,
                                   mask=x_mask.T)[:-1]
    after_recurrent_last = after_recurrent[-1]

    presoft = top_linear.apply(after_recurrent)

    # Define the cost
    # Give y as a vector and reshape presoft to 2D tensor
    y = y.flatten()

    shape = presoft.shape
    presoft = presoft.dimshuffle(1, 0, 2)
    presoft = presoft.reshape((shape[0] * shape[1], shape[2]))

    # Build cost_matrix
    presoft = presoft - presoft.max(axis=1).dimshuffle(0, 'x')
    log_prob = presoft - \
        tensor.log(tensor.exp(presoft).sum(axis=1).dimshuffle(0, 'x'))
    flat_log_prob = log_prob.flatten()
    range_ = tensor.arange(y.shape[0])
    flat_indices = y + range_ * presoft.shape[1]
    cost_matrix = flat_log_prob[flat_indices]

    # Zero out the cost at masked (padding / filtered) positions
    cost_matrix = - cost_matrix * \
        y_mask.flatten() * frequency_mask.flatten() * \
        frequency_mask_mask.flatten()

    # Average the cost
    cost = cost_matrix.sum()
    cost = cost / (y_mask * frequency_mask).sum()

    # Initialize parameters
    for brick in (lookup, linear, hidden, top_linear):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.)
        brick.initialize()

    return cost
Beispiel #41
0
iteration = 300  # number of epochs of gradient descent

print "Building Model"
# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim=n_u, output_dim=n_h, name="first_layer")
rnn = SimpleRecurrent(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = rnn.apply(x_transform)
predict = sigm.apply(linear2.apply(h))


# only for generation: initial state of shape B x h_dim
h_initial = tensor.matrix('h_initial', dtype=floatX)
h_testing = rnn.apply(x_transform, h_initial, iterate=False)
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'


# Cost function
cost = SquaredError().apply(predict, target)

# Initialization
Beispiel #42
0
class Rnn(Initializable, BaseRecurrent):
    def __init__(self, dims=(88, 100, 100), **kwargs):
        super(Rnn, self).__init__(**kwargs)
        self.dims = dims

        self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1],
                                      weights_init=IsotropicGaussian(0.01),
                                      # biases_init=Constant(0.0),
                                      use_bias=False,
                                      name="input_transfrom")

        # Note: despite the "gru" names, this is a plain SimpleRecurrent layer.
        self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(),
                                         weights_init=IsotropicGaussian(0.01),
                                         biases_init=Constant(0.0),
                                         use_bias=True,
                                         name="gru_rnn_layer")

        # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn
        self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4,
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0.0),
                                   use_bias=False,
                                   name="h2h_transform")

        self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(),
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0),
                               use_bias=True,
                               name="lstm_rnn_layer")

        self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]],
                                 weights_init=IsotropicGaussian(0.01),
                                 use_bias=True,
                                 biases_init=Constant(0.0),
                                 name="out_layer")

        self.children = [self.input_transform, self.gru_layer, self.linear_trans,
                         self.lstm_layer, self.out_transform]

    def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None):
        input_transform = self.input_transform.apply(inputs)
        gru_state = self.gru_layer.apply(
            inputs=input_transform,
            # update_inputs=input_transform,
            # reset_inputs=input_transform,
            states=gru_state,
            mask=mask,
            iterate=False)
        lstm_transform = self.linear_trans.apply(gru_state)
        lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state,
                                                       cells=lstm_cells,
                                                       mask=mask, iterate=False)
        return gru_state, lstm_state, lstm_cells

    @recurrent(sequences=[], contexts=[],
               states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'],
               outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'])
    def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None,
                     lstm_cells=None):
        # self.apply returns four values; unpack them instead of binding the
        # whole tuple to `output` (which would also return the stale states).
        output, gru_state, lstm_state, lstm_cells = self.apply(
            inputs=inputs,
            gru_state=gru_state,
            lstm_state=lstm_state,
            lstm_cells=lstm_cells,
            iterate=False)
        return output, gru_state, lstm_state, lstm_cells

    @recurrent(sequences=['inputs', 'mask'], contexts=[],
               states=['gru_state', 'lstm_state', 'lstm_cells'],
               outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells'])
    def apply(self, inputs, mask=None, gru_state=None, lstm_state=None,
              lstm_cells=None):
        gru_state, lstm_state, lstm_cells = self.rnn_apply(inputs=inputs,
                                                           mask=mask,
                                                           gru_state=gru_state,
                                                           lstm_state=lstm_state,
                                                           lstm_cells=lstm_cells)

        output = 1.17 * self.out_transform.apply(lstm_state)
        # mask is optional so that one-step calls (e.g. from rnn_generate,
        # where no mask sequence exists) do not crash.
        if mask is not None:
            output *= mask[:, None]
        return output, gru_state, lstm_state, lstm_cells

    def get_dim(self, name):
        dims = dict(zip(['inputs', 'gru_state', 'lstm_state'], self.dims))
        # The output layer maps back to the input dimensionality.
        dims['output'] = dims['inputs']
        dims['lstm_cells'] = dims['lstm_state']
        return dims.get(name) or super(Rnn, self).get_dim(name)
Beispiel #43
0
def main(save_to, num_epochs):
    batch_size = 128
    dim = 100
    n_steps = 20
    i2h1 = MLP([Identity()], [784, dim], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001))
    h2o1 = MLP([Rectifier(), Logistic()], [dim, dim, 784],
               biases_init=Constant(0.), weights_init=IsotropicGaussian(.001))
    rec1 = SimpleRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal())
    i2h1.initialize()
    h2o1.initialize()
    rec1.initialize()

    x = tensor.tensor3('features')
    x1 = x[1:, :, :]
    x2 = x[:-1, :, :]

    preproc = i2h1.apply(x1)
    h1 = rec1.apply(preproc)
    x_hat = h2o1.apply(h1)
    cost = tensor.nnet.binary_crossentropy(x_hat, x2).mean()
    # cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    cost.name = 'final_cost'

    cg = ComputationGraph([cost, ])

    mnist_train = MNIST("train", subset=slice(0, 50000), sources=('features', ))
    mnist_valid = MNIST("train", subset=slice(50000, 60000), sources=('features',))
    mnist_test = MNIST("test")
    trainstream = Mapping(Flatten(DataStream(mnist_train,
                          iteration_scheme=SequentialScheme(50000, batch_size))),
                          _meanize(n_steps))
    validstream = Mapping(Flatten(DataStream(mnist_valid,
                                             iteration_scheme=SequentialScheme(10000,
                                                                               batch_size))),
                          _meanize(n_steps))
    teststream = Mapping(Flatten(DataStream(mnist_test,
                                            iteration_scheme=SequentialScheme(10000,
                                                                              batch_size))),
                         _meanize(n_steps))

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([Adam(), StepClipping(100)]))
    main_loop = MainLoop(
        algorithm,
        trainstream,
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=num_epochs),
                    # DataStreamMonitoring(
                    #     [cost, ],
                    #     teststream,
                    #     prefix="test"),
                    DataStreamMonitoringAndSaving(
                    [cost, ],
                    validstream,
                    [i2h1, h2o1, rec1],
                    'best_'+save_to+'.pkl',
                    cost_name=cost.name,
                    after_epoch=True,
                    prefix='valid'),
                    TrainingDataMonitoring(
                        [cost,
                         aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    # Plot(
                    #     save_to,
                    #     channels=[
                    #         ['test_final_cost',
                    #          'test_misclassificationrate_apply_error_rate'],
                    #         ['train_total_gradient_norm']]),
                    Printing()])
    main_loop.run()

# T x B x F
x = tensor.tensor3('x', dtype=floatX)
# T x B
x_mask = tensor.matrix('x_mask', dtype=floatX)
# L x B
y = tensor.matrix('y', dtype=floatX)
# L x B
y_mask = tensor.matrix('y_mask', dtype=floatX)

x_to_h = Linear(name='x_to_h',
                input_dim=x_dim,
                output_dim=h_dim)
x_transform = x_to_h.apply(x)
rnn = SimpleRecurrent(activation=Tanh(),
                      dim=h_dim, name="rnn")
h = rnn.apply(x_transform)
h_to_o = Linear(name='h_to_o',
                input_dim=h_dim,
                output_dim=num_classes + 1)
h_transform = h_to_o.apply(h)
# T x B x C+1
y_hat = tensor.nnet.softmax(
    h_transform.reshape((-1, num_classes + 1))
).reshape((h.shape[0], h.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = x_mask
cost = CTC().apply(y, y_hat, y_mask, y_hat_mask, 'normal_scale')
cost.name = 'CTC'
# Initialization
for brick in (rnn, x_to_h, h_to_o):