Example no. 1
def example():
    """ Simple reccurent example. Taken from : https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb """
    x = tensor.tensor3('x')

    rnn = SimpleRecurrent(dim=3, activation=Identity(), weights_init=initialization.Identity())
    rnn.initialize()
    h = rnn.apply(x)

    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) 

    doubler = Linear(
                 input_dim=3, output_dim=3, weights_init=initialization.Identity(2),
                 biases_init=initialization.Constant(0))
    doubler.initialize()
    h_doubler = rnn.apply(doubler.apply(x))

    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) 

    #Initial State
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)

    f = theano.function([x, h0], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX))) 
Example no. 2
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim,
                output_dims=[dim, dim * 2],
                name='fork',
                output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim,
                         weights_init=initialization.Identity(),
                         biases_init=Constant(0))

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
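    # fork.apply returns the candidate input ("linear", size dim) and the
    # gate inputs ("gates", size 2 * dim: update and reset) for the GRU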
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=dim,
                     output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
Example no. 3
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        x = tensor.tensor3('x', dtype=floatX)
        y = tensor.tensor3('y', dtype=floatX)

        x_to_lstm = Linear(name="x_to_lstm", input_dim=input_size, output_dim=4 * hidden_size,
                           weights_init=IsotropicGaussian(), biases_init=Constant(0))
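        # x_to_lstm maps inputs to 4 * hidden_size: one slice per LSTM gate
        # (input, forget, cell, output)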
        lstm = LSTM(dim=hidden_size, name="lstm", weights_init=IsotropicGaussian(), biases_init=Constant(0))
        lstm_to_output = Linear(name="lstm_to_output", input_dim=hidden_size, output_dim=output_size,
                                weights_init=IsotropicGaussian(), biases_init=Constant(0))

        x_transform = x_to_lstm.apply(x)
        h, c = lstm.apply(x_transform)

        y_hat = lstm_to_output.apply(h)
        y_hat = Logistic(name="y_hat").apply(y_hat)

        self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat)

        x_to_lstm.initialize()
        lstm.initialize()
        lstm_to_output.initialize()

        self.computation_graph = ComputationGraph(self.cost)
Example no. 4
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(generate_data(max_seq_length, batch_size,
                                                  num_batches))
    dataset_test = IterableDataset(generate_data(max_seq_length, batch_size,
                                                 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test, prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ':', param.get_value())
        print()
Example no. 5
def example():
    """ Simple reccurent example. Taken from : https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb """
    x = tensor.tensor3('x')

    rnn = SimpleRecurrent(dim=3,
                          activation=Identity(),
                          weights_init=initialization.Identity())
    rnn.initialize()
    h = rnn.apply(x)

    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=3,
                     output_dim=3,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()
    h_doubler = rnn.apply(doubler.apply(x))

    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    #Initial State
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)

    f = theano.function([x, h0], h)
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
Example no. 6
class AttentionEUTHM(EUTHM):
    '''
    EUTHM with user attention
    '''
    def __init__(self, config, dataset, *args, **kwargs):
        super(EUTHM, self).__init__(config, dataset)

    def _get_doc_embed(self, *args, **kwargs):
        text_vec = self._get_text_vec()
        user_vec = self.user_embed.apply(self.user)
        text_vec = tensor.concatenate([
            text_vec, user_vec[None, :, :][tensor.zeros(
                shape=(text_vec.shape[0], ), dtype='int32')]
        ],
                                      axis=2)
        return self._encode_text_vec(text_vec)

    def _build_bricks(self, *args, **kwargs):
        super(AttentionEUTHM, self)._build_bricks()
        self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim +
                                self.config.user_embed_dim,
                                output_dim=4 * self.config.lstm_dim,
                                name='mlstm_in')
        self.mlstm_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim +
                       self.config.user_embed_dim + self.config.lstm_dim))
        self.mlstm_ins.biases_init = Constant(0)
        self.mlstm_ins.initialize()
Example no. 7
 def lllistool(i, inp, func):
     if func == LSTM:
         NUMS[i+1] *= 4
     sdim = DIMS[i]
     if func == SimpleRecurrent or func == LSTM:
         sdim = DIMS[i] + DIMS[i+1]
     l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1], 
                weights_init=IsotropicGaussian(std=sdim**(-0.5)), 
                biases_init=IsotropicGaussian(std=sdim**(-0.5)),
                name='Lin{}'.format(i))
     l.initialize()
     if func == SimpleRecurrent:
         gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
         gong.initialize()
         ret = gong.apply(l.apply(inp))
     elif func == LSTM:
         gong = func(dim=DIMS[i+1], activation=Tanh(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
         gong.initialize()
         print(inp)
         ret, _ = gong.apply(
             l.apply(inp), 
             T.zeros((inp.shape[1], DIMS[i+1])),
             T.zeros((inp.shape[1], DIMS[i+1])),
         )
     elif func == SequenceGenerator:
         gong = func(
             readout=None, 
             transition=SimpleRecurrent(dim=100, activation=Rectifier(), weights_init=IsotropicGaussian(std=0.1)))
         ret = None
     elif func == None:
         ret = l.apply(inp)
     else:
         gong = func()
         ret = gong.apply(l.apply(inp))
     return ret
Example no. 8
class embeddingLayer:
    def __init__(self, word_dim, visual_dim, joint_dim):
        self.word_embed = Linear(word_dim,
                                 joint_dim,
                                 name='word_to_joint',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.visual_embed = Linear(visual_dim,
                                   joint_dim,
                                   name='visual_to_joint',
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0))
        self.word_embed.initialize()
        self.visual_embed.initialize()

    # words: batch_size x q x word_dim
    # video: batch_size x video_length x visual_dim
    def apply(self, words, video, u1, u2):
        w = self.word_embed.apply(words)
        v = self.visual_embed.apply(video)
        w = T.tanh(w)
        v = T.tanh(v)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, v, u

    def apply_sentence(self, words, u1, u2):
        w = self.word_embed.apply(words)
        w = T.tanh(w)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, u
Example no. 9
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2], name='fork',
                output_names=["linear", "gates"],
                weights_init=initialization.Identity(), biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(),
                         biases_init=Constant(0))

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) 

    doubler = Linear(
                 input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2),
                 biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) 
Example no. 10
def decoder_network(latent_sample, latent_dim=J):
  # bernoulli case
  hidden2 = get_typical_layer(latent_sample, latent_dim, 500, Logistic())
  hidden2_to_output = Linear(name="last", input_dim=500, output_dim=784)
  hidden2_to_output.weights_init = IsotropicGaussian(0.01)
  hidden2_to_output.biases_init = Constant(0)
  hidden2_to_output.initialize()
  return Logistic().apply(hidden2_to_output.apply(hidden2))
Example no. 11
    def create_model(self):
        input_dim = self.input_dim
        x = self.x
        y = self.y
        p = self.p
        mask = self.mask
        hidden_dim = self.hidden_dim
        embedding_dim = self.embedding_dim
        lookup = LookupTable(self.dict_size,
                             embedding_dim,
                             weights_init=IsotropicGaussian(0.001),
                             name='LookupTable')
        x_to_h = Linear(embedding_dim,
                        hidden_dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(0.001),
                        biases_init=Constant(0.0))
        lstm = LSTM(hidden_dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
        h_to_o = MLP([Logistic()], [hidden_dim, 1],
                     weights_init=IsotropicGaussian(0.001),
                     biases_init=Constant(0),
                     name='h_to_o')

        lookup.initialize()
        x_to_h.initialize()
        lstm.initialize()
        h_to_o.initialize()

        embed = lookup.apply(x).reshape(
            (x.shape[0], x.shape[1], self.embedding_dim))
        embed.name = "embed_vec"
        x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
        x_transform.name = "Transformed X"
        self.lookup = lookup
        self.x_to_h = x_to_h
        self.lstm = lstm
        self.h_to_o = h_to_o

        #if mask is None:
        h, c = lstm.apply(x_transform)
        #else:
        #h, c = lstm.apply(x_transform, mask=mask)
        h.name = "hidden_state"
        c.name = "cell state"
        # only values of hidden units of the last timeframe are used for
        # the classification
        indices = T.sum(mask, axis=0) - 1
        rel_hid = h[indices, T.arange(h.shape[1])]
        out = self.h_to_o.apply(rel_hid)

        probs = out
        return probs
Example no. 12
class MyRecurrent(Brick):
    def __init__(self,
                 recurrent,
                 dims,
                 activations=[Identity(), Identity()],
                 **kwargs):
        super(MyRecurrent, self).__init__(**kwargs)
        self.dims = dims
        self.recurrent = recurrent
        self.activations = activations
        if isinstance(self.recurrent,
                      (SimpleRecurrent, SimpleRecurrentBatchNorm)):
            output_dim = dims[1]
        elif isinstance(self.recurrent, (LSTM, LSTMBatchNorm)):
            output_dim = 4 * dims[1]
        else:
            raise NotImplementedError
        self.input_trans = Linear(name='input_trans',
                                  input_dim=dims[0],
                                  output_dim=output_dim,
                                  weights_init=NormalizedInitialization(),
                                  biases_init=Constant(0))
        self.output_trans = Linear(name='output_trans',
                                   input_dim=dims[1],
                                   output_dim=dims[2],
                                   weights_init=NormalizedInitialization(),
                                   biases_init=Constant(0))
        self.children = (
            [self.input_trans, self.recurrent, self.output_trans] +
            self.activations)

    def _initialize(self):
        self.input_trans.initialize()
        self.output_trans.initialize()
        #self.recurrent.initialize()

    @application
    def apply(self, input_, input_mask=None, *args, **kwargs):
        input_recurrent = self.input_trans.apply(input_)
        try:
            input_recurrent = self.activations[0].apply(input_recurrent,
                                                        input_mask=input_mask)
        except TypeError:
            input_recurrent = self.activations[0].apply(input_recurrent)
        output_recurrent = self.recurrent.apply(inputs=input_recurrent,
                                                mask=input_mask)
        if isinstance(self.recurrent, (LSTM, LSTMBatchNorm)):
            output_recurrent = output_recurrent[0]
        output = self.output_trans.apply(output_recurrent)
        try:
            output = self.activations[1].apply(output, input_mask=input_mask)
        except TypeError:
            output = self.activations[1].apply(output)
        return output
Example no. 13
class questionEncoder:
    def __init__(self, word_dim, hidden_dim):
        self.forward_lstm = LSTM(hidden_dim,
                                name='question_forward_lstm',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim,
                                 name='question_backward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
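        # project word vectors to 4 * hidden_dim, the concatenated gate input
        # size the LSTM bricks expect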
        self.x_to_h_forward = Linear(word_dim,
                                     hidden_dim * 4,
                                     name='word_x_to_h_forward',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(word_dim,
                                      hidden_dim * 4,
                                      name='word_x_to_h_backward',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))

        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # variable question length
    # words: batch_size x q x word_dim
    # words_reverse: be the reverse sentence of words
    #                padding with 0 to max length q
    # mask: batch_size 
    def apply(self, words, words_reverse, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x q x hidden_dim
        Wx = self.x_to_h_forward.apply(words)
        Wx_r = self.x_to_h_backward.apply(words_reverse)
        # q x batch_size x hidden_dim
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # q x batch_size x hidden_dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        for i in range(batch_size):
            # set_subtensor is not in-place: assign the result back to hb
            hb = T.set_subtensor(hb[0:mask[i]+1, i, :], hb[0:mask[i]+1, i, :][::-1])

        # q x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb], axis=2)
        # batch_size x hidden_dim
        y_q = hf[mask, range(batch_size), :]
        y_1 = hb[0, range(batch_size), :]
        
        return h.swapaxes(0, 1), y_q, y_1
Example no. 14
 def lllistool(i, inp, func):
     l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1], 
                weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)), 
                biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                name='Lin{}'.format(i))
     l.initialize()
     func.name = 'Fun{}'.format(i)
     if func == SimpleRecurrent:
         gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=(DIMS[i]+DIMS[i+1])**(-0.5)))
     else:
         gong = func()
     ret = gong.apply(l.apply(inp))
     return ret
Example no. 15
    def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name):
        # Since we pass this path twice (clean and corr encoder), we
        # want to make sure that parameters of both layers are shared.
        layer = self.shareds.get(layer_name)
        if layer is None:
            if layer_type == "fc":
                linear = Linear(use_bias=False, name=layer_name, input_dim=in_dim, output_dim=out_dim, seed=1)
                linear.weights_init = Glorot(self.rng, in_dim, out_dim)
                linear.initialize()
                layer = linear
                self.shareds[layer_name] = layer

        return layer.apply(input_)
Example no. 16
def example4():
    """LSTM -> Plante lors de l'initialisation du lstm."""

    x = tensor.tensor3('x')
    dim = 3

    #    gate_inputs = theano.function([x],x*4)
    gate_inputs = Linear(input_dim=dim,
                         output_dim=dim * 4,
                         name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))

    lstm = LSTM(dim=dim,
                activation=Tanh(),
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))

    gate_inputs.initialize()
    hg = gate_inputs.apply(x)

    #print(gate_inputs.parameters)
    #print(gate_inputs.parameters[1].get_value())

    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    print("Good Job!")

    #    lstm_output =

    #Initial State
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(
        inputs=x, states=h0,
        cells=c)  # lstm.apply(states=h0,cells=cells,inputs=gate_inputs)

    f = theano.function([x, h0, c], h)
    print("a")
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
Example no. 17
class MyRecurrent(Brick):
    def __init__(self, recurrent, dims,
                 activations=[Identity(), Identity()], **kwargs):
        super(MyRecurrent, self).__init__(**kwargs)
        self.dims = dims
        self.recurrent = recurrent
        self.activations = activations
        if isinstance(self.recurrent, (SimpleRecurrent, SimpleRecurrentBatchNorm)):
            output_dim = dims[1]
        elif isinstance(self.recurrent, (LSTM, LSTMBatchNorm)):
            output_dim = 4*dims[1]
        else:
            raise NotImplementedError
        self.input_trans = Linear(name='input_trans',
                                  input_dim=dims[0],
                                  output_dim=output_dim,
                                  weights_init=NormalizedInitialization(),
                                  biases_init=Constant(0))
        self.output_trans = Linear(name='output_trans',
                                   input_dim=dims[1],
                                   output_dim=dims[2],
                                   weights_init=NormalizedInitialization(),
                                   biases_init=Constant(0))
        self.children = ([self.input_trans, self.recurrent, self.output_trans]
                         + self.activations)
        
    def _initialize(self):
        self.input_trans.initialize()
        self.output_trans.initialize()
        #self.recurrent.initialize()

    @application
    def apply(self, input_, input_mask=None, *args, **kwargs):
        input_recurrent = self.input_trans.apply(input_)
        try:
            input_recurrent = self.activations[0].apply(input_recurrent, input_mask=input_mask)
        except TypeError:
            input_recurrent = self.activations[0].apply(input_recurrent)
        output_recurrent = self.recurrent.apply(inputs=input_recurrent,
                                                mask=input_mask)
        if isinstance(self.recurrent, (LSTM, LSTMBatchNorm)):
            output_recurrent = output_recurrent[0]
        output = self.output_trans.apply(output_recurrent)
        try:
            output = self.activations[1].apply(output, input_mask=input_mask)
        except TypeError:
            output = self.activations[1].apply(output)
        return output
Example no. 18
    def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512):
        self.hidden_size = hidden_size
        self.input1_size = input1_size
        self.input2_size = input2_size
        self.lookup1_dim = lookup1_dim
        self.lookup2_dim = lookup2_dim

        x1 = tensor.lmatrix('durations')
        x2 = tensor.lmatrix('syllables')
        y = tensor.lmatrix('pitches')

        lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup1.initialize()
        lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup2.initialize()
        merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        merge.initialize()
        recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                              weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
        recurrent_block.initialize()
        linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        linear.initialize()
        softmax = NDimensionalSoftmax()

        l1 = lookup1.apply(x1)
        l2 = lookup2.apply(x2)
        m = merge.apply(l1, l2)
        h = recurrent_block.apply(m)
        a = linear.apply(h)

        y_hat = softmax.apply(a, extra_ndim=1)
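        # extra_ndim=1 lets NDimensionalSoftmax handle the extra leading axis of the 3-D input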
        # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

        self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Model = Model(y_hat)
Example no. 19
 def lllistool(i, inp, func):
     l = Linear(input_dim=DIMS[i],
                output_dim=DIMS[i + 1] * NUMS[i + 1],
                weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                name='Lin{}'.format(i))
     l.initialize()
     func.name = 'Fun{}'.format(i)
     if func == SimpleRecurrent:
         gong = func(dim=DIMS[i + 1],
                     activation=Rectifier(),
                     weights_init=IsotropicGaussian(
                         std=(DIMS[i] + DIMS[i + 1])**(-0.5)))
     else:
         gong = func()
     ret = gong.apply(l.apply(inp))
     return ret
Example no. 20
    def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name):
        # Since we pass this path twice (clean and corr encoder), we
        # want to make sure that parameters of both layers are shared.
        layer = self.shareds.get(layer_name)
        if layer is None:
            if layer_type == 'fc':
                linear = Linear(use_bias=False,
                                name=layer_name,
                                input_dim=in_dim,
                                output_dim=out_dim,
                                seed=1)
                linear.weights_init = Glorot(self.rng, in_dim, out_dim)
                linear.initialize()
                layer = linear
                self.shareds[layer_name] = layer

        return layer.apply(input_)
Example no. 21
def test_linear():
    x = tensor.matrix()

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2),
                    biases_init=Constant(1))
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(
        y.eval({x: x_val}),
        x_val.dot(2 * numpy.ones((16, 8))) + numpy.ones((4, 8)))

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2),
                    use_bias=False)
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(y.eval({x: x_val}), x_val.dot(2 * numpy.ones((16, 8))))
Example no. 22
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)
    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers *
        args.state_dim + (1 - use_all_states) * args.state_dim,
        output_dim=output_size, name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()
    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
Example no. 23
def test_linear():
    x = tensor.matrix()

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2),
                    biases_init=Constant(1))
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(
        y.eval({x: x_val}),
        x_val.dot(2 * numpy.ones((16, 8))) + numpy.ones((4, 8)))

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2),
                    use_bias=False)
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(y.eval({x: x_val}), x_val.dot(2 * numpy.ones((16, 8))))
Example no. 24
    def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name):
        # Since we pass this path twice (clean and corr encoder), we
        # want to make sure that parameters of both layers are shared.
        layer = self.shareds.get(layer_name)
        if layer is not None:
            return layer
        else:
            if layer_type == 'fc':
                linear = Linear(use_bias=False,
                                name=layer_name,
                                input_dim=in_dim,
                                output_dim=out_dim,
                                seed=1)
                layer = linear.apply(input_)
                linear.weights_init = IsotropicGaussian(1.0 / np.sqrt(in_dim))
                linear.initialize()
                self.shareds[layer_name] = layer

            return layer
Example no. 25
def example4():
    """LSTM -> Plante lors de l'initialisation du lstm."""

    x = tensor.tensor3('x')
    dim = 3

#    gate_inputs = theano.function([x],x*4)
    gate_inputs = Linear(input_dim=dim, output_dim=dim * 4, name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))

    lstm = LSTM(dim=dim, activation=Tanh(),
                weights_init=IsotropicGaussian(), biases_init=Constant(0))
    
    gate_inputs.initialize()
    hg = gate_inputs.apply(x)
    

    #print(gate_inputs.parameters)
    #print(gate_inputs.parameters[1].get_value())
    
    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)
    
    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4*np.ones((dim, 1, dim), dtype=theano.config.floatX)))
 
    print("Good Job!")


#    lstm_output = 

    #Initial State
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(inputs=x, states=h0, cells=c)  # lstm.apply(states=h0,cells=cells,inputs=gate_inputs)

    f = theano.function([x, h0, c], h)
    print("a")
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX))) 
Example no. 26
class TextRNN(object):

    def __init__(self, dim_in, dim_hidden, dim_out, **kwargs):

        self.dim_in = dim_in
        self.dim_hidden = dim_hidden
        self.dim_out = dim_out

        self.input_layer = Linear(input_dim=self.dim_in, output_dim=self.dim_hidden,
                                weights_init=initialization.IsotropicGaussian(),
                                biases_init=initialization.Constant(0))
        self.input_layer.initialize()

        sparse_init = initialization.Sparse(num_init=15, weights_init=initialization.IsotropicGaussian())
        self.recurrent_layer = SimpleRecurrent(
                                dim=self.dim_hidden, activation=Tanh(), name="first_recurrent_layer",
                                weights_init=sparse_init,
                                biases_init=initialization.Constant(0.01))
        '''
        self.recurrent_layer = LSTM(dim=self.dim_hidden, activation=Tanh(),
                                    weights_init=initialization.IsotropicGaussian(std=0.001),
                                    biases_init=initialization.Constant(0.01))
        '''
        self.recurrent_layer.initialize()

        self.output_layer = Linear(input_dim=self.dim_hidden, output_dim=self.dim_out,
                                weights_init=initialization.Uniform(width=0.01),
                                biases_init=initialization.Constant(0.01))
        self.output_layer.initialize()

        self.children = [self.input_layer, self.recurrent_layer, self.output_layer]

    '''
    @recurrent(sequences=['inputs'], 
            states=['states'],
            contexts=[],
            outputs=['states', 'output'])
    '''

    def run(self, inputs):
        output = self.output_layer.apply( self.recurrent_layer.apply(self.input_layer.apply(inputs)) )
        return output
Example no. 27
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)
    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (
        args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers * args.state_dim +
        (1 - use_all_states) * args.state_dim,
        output_dim=output_size,
        name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()
    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
Example no. 28
class seqDecoder:
    def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
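        # W projects GRU_A's hidden states (feature_dim) to 4 * memory_dim,
        # the gate input size expected by GRU_B (also an LSTM brick)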
        self.W = Linear(input_dim=feature_dim,
                        output_dim=memory_dim * 4,
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0),
                        use_bias=False,
                        name='seqDecoder_W')
        self.GRU_A = LSTM(feature_dim,
                          name='seqDecoder_A',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.GRU_B = LSTM(memory_dim,
                          name='seqDecoder_B',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.W.initialize()
        self.GRU_A.initialize()
        self.GRU_B.initialize()
        self.fc1 = Linear(input_dim=memory_dim,
                          output_dim=fc1_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc1')
        self.fc2 = Linear(input_dim=fc1_dim,
                          output_dim=fc2_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc2')

        self.fc1.initialize()
        self.fc2.initialize()

    # A: the encoding of GRU_A,
    # B: the encoding of GRU_B
    # padding: the tensor constant
    def apply(self, output_length, A, B, padding):
        A_, garbage = self.GRU_A.apply(padding, states=A)
        WA_ = self.W.apply(A_)
        # output_length x batch_size x output_dim
        B_, garbage = self.GRU_B.apply(WA_, states=B)
        # batch_size x output_length x output_dim
        B_ = B_.swapaxes(0, 1)
        fc1_r = relu(self.fc1.apply(B_))
        fc2_r = relu(self.fc2.apply(fc1_r))
        return fc2_r
Example no. 29
    def __init__(self):
        srng = MRG_RandomStreams(seed=123)

        X = T.matrix('features')
        self.X = X

        #drop = Dropout(p_drop=0.5)
        #o = drop.apply(X)
        o = (X - 128) / 128.0
        self.scaled = o

        #n_hidden = 64
        n_hidden = 2048 * 2
        n_zs = 1024
        self.n_zs = n_zs

        self.n_hidden = n_hidden

        l = Linear(input_dim=32 * 32 * 3,
                   output_dim=n_hidden,
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0))
        l.initialize()
        o = l.apply(o)
        o = Rectifier().apply(o)

        l = Linear(input_dim=n_hidden,
                   output_dim=n_hidden,
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0))
        l.initialize()
        o = l.apply(o)
        o = Rectifier().apply(o)

        l = Linear(input_dim=n_hidden,
                   output_dim=n_zs,
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0))
        l.initialize()
        mu_encoder = l.apply(o)

        l = Linear(input_dim=n_hidden,
                   output_dim=n_zs,
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0))
        l.initialize()
        log_sigma_encoder = l.apply(o)

        eps = srng.normal(log_sigma_encoder.shape)

        z = eps * T.exp(log_sigma_encoder) + mu_encoder
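        # reparameterization trick: z = mu + exp(log_sigma) * eps, eps ~ N(0, 1)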

        z_to_h1_decode = Linear(input_dim=n_zs,
                                output_dim=n_hidden,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0))
        z_to_h1_decode.initialize()

        h1_decode_to_h_decode = Linear(input_dim=n_hidden,
                                       output_dim=n_hidden,
                                       weights_init=IsotropicGaussian(0.01),
                                       biases_init=Constant(0))
        h1_decode_to_h_decode.initialize()

        h_decode_produce = Linear(input_dim=n_hidden,
                                  output_dim=32 * 32 * 3,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="linear4")
        h_decode_produce.initialize()
        #o = h_decode_produce.apply(h_decoder)

        h_decode_produce = Linear(input_dim=n_hidden,
                                  output_dim=32 * 32 * 3,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="linear4")
        h_decode_produce.initialize()
        #self.produced = Sigmoid().apply(o)

        seq = Sequence([
            z_to_h1_decode.apply,
            Rectifier().apply, h1_decode_to_h_decode.apply,
            Rectifier().apply, h_decode_produce.apply,
            Sigmoid().apply
        ])
        seq.initialize()

        self.produced = seq.apply(z)

        self.cost = T.mean(T.sqr(self.produced - self.scaled))
        #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, self.scaled)) #T.sum(T.sqr(self.produced - self.scaled))
        self.cost.name = "cost"

        self.variational_cost = - 0.5 * T.mean(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\
                - T.exp(2 * log_sigma_encoder)) + self.cost
        self.variational_cost.name = "variational_cost"

        self.Z = T.matrix('z')
        self.sampled = seq.apply(self.Z)

        cg = ComputationGraph([self.variational_cost])
        bricks = [
            get_brick(var) for var in cg.variables + cg.scan_variables
            if get_brick(var)
        ]
        for i, b in enumerate(bricks):
            b.name = b.name + "_" + str(i)
Example no. 30
## Let's initialize the variables
lookup.allocate()
#print("lookup.parameters=", lookup.parameters)                         # ('lookup.parameters=', [W])

#lookup.weights_init = FUNCTION
#lookup.initialize()

#lookup.params[0].set_value( np.random.normal( scale = 0.1, size=(vocab_size, embedding_dim) ).astype(np.float32) )
#lookup.params[0].set_value( embedding )

# See : https://github.com/mila-udem/blocks/blob/master/tests/bricks/test_lookup.py
#lookup.W.set_value(numpy.arange(15).reshape(5, 3).astype(theano.config.floatX))
lookup.W.set_value(embedding.astype(theano.config.floatX))

rnn.initialize()
gather.initialize()

## Now for the application of these units

# Define the shape of x specifically...  :: the data has format (batch, features).
x.tag.test_value = np.random.randint(vocab_size,
                                     size=batch_of_sentences).astype(np.int32)
x_extra.tag.test_value = np.zeros(
    (max_sentence_length, mini_batch_size, 1)).astype(np.float32)
x_mask.tag.test_value = np.random.choice(
    [0.0, 1.0], size=batch_of_sentences).astype(np.float32)

print("x shape", x.shape.tag.test_value)  # array([29, 16]))

word_embedding = lookup.apply(x)
print("word_embedding shape",
Example no. 31
## Let's initialize the variables
lookup.allocate()
#print("lookup.parameters=", lookup.parameters)                         # ('lookup.parameters=', [W])

#lookup.weights_init = FUNCTION
#lookup.initialize() 

#lookup.params[0].set_value( np.random.normal( scale = 0.1, size=(vocab_size, embedding_dim) ).astype(np.float32) )
#lookup.params[0].set_value( embedding )

# See : https://github.com/mila-udem/blocks/blob/master/tests/bricks/test_lookup.py
#lookup.W.set_value(numpy.arange(15).reshape(5, 3).astype(theano.config.floatX))
lookup.W.set_value( embedding.astype(theano.config.floatX) )
    
rnn.initialize()
gather.initialize()



## Now for the application of these units

# Define the shape of x specifically...  :: the data has format (batch, features).
x.tag.test_value       = np.random.randint(vocab_size, size=batch_of_sentences ).astype(np.int32)
x_extra.tag.test_value = np.zeros( (max_sentence_length, mini_batch_size, 1) ).astype(np.float32)
x_mask.tag.test_value  = np.random.choice( [0.0, 1.0], size=batch_of_sentences ).astype(np.float32)

print("x shape", x.shape.tag.test_value)                                # array([29, 16]))

word_embedding = lookup.apply(x)
print("word_embedding shape", word_embedding.shape.tag.test_value)      # array([ 29, 16, 100]))
print("x_extra shape", x_extra.shape.tag.test_value)                    # array([ 29, 16,   1]))
Example no. 32
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers *
        state_dim + (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
Example no. 33
x = tensor.tensor4('features')
y = tensor.lmatrix('targets')

l1 = Linear(name='l1', input_dim=784, output_dim=500,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l2 = Linear(name='l2', input_dim=500, output_dim=500,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l3 = Linear(name='l3', input_dim=500, output_dim=500,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l4 = Linear(name='l4', input_dim=500, output_dim=500,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l5 = Linear(name='l5', input_dim=500, output_dim=10,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))

l1.initialize()
l2.initialize()
l3.initialize()
l4.initialize()
l5.initialize()

a1 = l1.apply(x.flatten(2)/255)
a1.name = 'a1'
n1, M1, S1 = normalize(a1, output_dim = 500)
o1 = Logistic().apply(n1)

a2 = l2.apply(o1)
n2, M2, S2  = normalize(a2, output_dim = 500)
o2 = Logistic().apply(n2)

a3 = l3.apply(o2)
Example no. 34
x = tensor.lmatrix('syllables')
y = tensor.lmatrix('durations')

lookup_input = LookupTable(name='lookup_input',
                           length=train_dataset.syllables_vocab_size() + 1,
                           dim=hidden_layer_dim,
                           weights_init=initialization.Uniform(width=0.01),
                           biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(name='linear_input',
                      input_dim=hidden_layer_dim,
                      output_dim=hidden_layer_dim,
                      weights_init=initialization.Uniform(width=0.01),
                      biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(name='hidden',
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')
Example no. 35
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
Example no. 36
class ETHM(EUTHM):
    '''Model with only textual-hashtag information'''
    def __init__(self, config, dataset, *args, **kwargs):
        super(ETHM, self).__init__(config, dataset)

    def _build_model(self, *args, **kwargs):
        # Define inputs
        self._define_inputs()
        self._build_bricks()
        self._set_OV_value()
        # Transpose text
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply word and hashtag word and url
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        # Encode text
        mlstm_hidden, mlstm_cell = self.mlstm.apply(
            inputs=self.mlstm_ins.apply(text_vec),
            mask=self.text_mask.astype(theano.config.floatX))
        text_encodes = mlstm_hidden[-1]
        input_vec = text_encodes
        self._get_cost(input_vec, None, None)

    def _define_inputs(self, *args, **kwargs):
        self.hashtag = tensor.ivector('hashtag')
        self.text = tensor.imatrix('text')
        self.text_mask = tensor.matrix('text_mask', dtype=theano.config.floatX)
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        # Build lookup tables
        self.word_embed = self._embed(len(self.dataset.word2index),
                                      self.config.word_embed_dim,
                                      name='word_embed')

        self.hashtag_embed = self._embed(len(self.dataset.hashtag2index),
                                         self.config.lstm_dim,
                                         name='hashtag_embed')
        # Build text encoder
        self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim,
                                output_dim=4 * self.config.lstm_dim,
                                name='mlstm_in')
        self.mlstm_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm_ins.biases_init = Constant(0)
        self.mlstm_ins.initialize()
        self.mlstm = MLSTM(self.config.lstm_time,
                           self.config.lstm_dim,
                           shared=False)
        self.mlstm.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm.biases_init = Constant(0)
        self.mlstm.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[self.config.lstm_dim, self.config.word_embed_dim],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        #Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                           self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _apply_dropout(self, outputs, *args, **kwargs):
        variables = [self.word_embed.W, self.hashtag_embed.W]
        cgs = ComputationGraph(outputs)
        cg_dropouts = apply_dropout(cgs,
                                    variables,
                                    drop_prob=self.config.dropout_prob,
                                    seed=123).outputs
        return cg_dropouts

    def _apply_reg(self, cost, params=None, *args, **kwargs):
        try:
            if self.config.l2_norm > 0:
                cost = cost + self.config.l2_norm * theano_expressions.l2_norm(
                    tensors=[self.hashtag_embed.W, self.word_embed.W])**2

            else:
                pass
        except Exception:
            pass
        return cost
Esempio n. 37
0
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()

    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Compute the weight matrix for every word in the context and then compute
    # the average.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch)

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[
        #         [
        #             'cost_with_regularization'
        #         ]
        #     ])
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)

    main.run()

    pickle.dump(cg, open('%scg.pickle' % path, 'wb'))
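
The pickled graph can later be restored with the standard pickle module; a minimal sketch, reusing the path from the dump call above:

import pickle

with open('%scg.pickle' % path, 'rb') as f:
    cg_restored = pickle.load(f)
print([p.name for p in cg_restored.parameters])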
Esempio n. 38
0
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'

# Define a multi-layer neural network; the per-layer computations were defined above.
# The activations list gives the non-linearity of each layer: every MLP layer first applies
# a linear transformation and then passes the result through its non-linearity.
# x is the input of the MLP.
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)

# After defining the whole network, the initial values of the linear-transformation parameters must be set.

input_to_hidden.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = Constant(0)
hidden_to_output.weights_init = IsotropicGaussian(0.01)
hidden_to_output.biases_init = Constant(0)
# Run the initialization; this step is mandatory, otherwise the settings above have no effect
input_to_hidden.initialize()
hidden_to_output.initialize()

print W1.get_value()

# Now train the model. The built-in MNIST dataset is used here; to use a different dataset,
# preprocess it with fuel first.
mnist = MNIST(("train",))
# Define the iteration scheme: mini-batch training with 256 examples per batch, which yields data_stream
data_stream = Flatten(DataStream.default_stream(mnist, iteration_scheme=SequentialScheme(mnist.num_examples, batch_size=256)))

# Define how the cost is optimized; plain SGD (gradient descent with a fixed learning rate) is used here
#algorithm = GradientDescent(cost = cost, parameters = [cg.parameters], step_rule = Scale(learning_rate = 0.01))
algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.01))

print "------"
Esempio n. 39
0
class impatientLayer:
    # both visual and word feature are in the joint space
    # of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence to sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)

        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major batch_size x doc_length x feature_dim
    # query: row major batch_size x q x feature_dim
    # mask_: per-example index of the last query token (sentence length - 1),
    #        shape: batch_size
    def apply(self, doc, query, mask_, batch_size):
        # batch_size x doc_length x hidden_dim
        mask = mask_.flatten()
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of question
        #        batch_size x feature_dim
        # r_1: r_m_1
        #        batch_size x feature_dim
        # y_d: document
        #        batch_size x doc_length x feature_dim
        # y_d_m: d-to-m
        #        batch_size x doc_length x hidden_dim
        def one_step(y_q_i, r_1, y_d, y_d_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            # batch_size x hidden_dim
            att3 = self.word_embed.apply(y_q_i)
            att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x doc_length x hidden_dim
            m = T.tanh(att)
            # batch_size x doc_length x 1
            s = self.m_to_s.apply(m)
            # batch_size x doc_length
            s = s.reshape((s.shape[0], s.shape[1]))
            s = self.attention_dist.apply(s)
            y_d_s = y_d.swapaxes(1, 2)
            # return batch_size x feature_dim
            return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # query: batch_size x q x feature_dim
        # r: q x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[query.swapaxes(0,1)],
                                 outputs_info=T.zeros_like(doc[:, 0, :]),
                                 non_sequences=[doc, att1],
                                 n_steps=query.shape[1],
                                 name='impatient layer')

        # for the sequence encoder
        # q x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        # q x batch_size x output_dim
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim
        r_q = r[mask, T.arange(batch_size), :]
        seq_r_q = seq_r[mask, T.arange(batch_size), :]
        # batch_size x output_dim
        return r_q, seq_r_q
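
A minimal usage sketch for the class above; the variable names, dimensions and batch size are assumptions taken from the shape comments, not from the source:

doc = T.tensor3('doc')            # batch_size x doc_length x feature_dim
query = T.tensor3('query')        # batch_size x q x feature_dim
mask = T.imatrix('query_mask')    # index of the last query token per example
layer = impatientLayer(feature_dim=300, hidden_dim=256, output_dim=256)
r_q, seq_r_q = layer.apply(doc, query, mask, batch_size=32)
# r_q: batch_size x feature_dim, seq_r_q: batch_size x output_dim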
Esempio n. 40
0
def shroom_mlp(shrooms_train, shrooms_test, num_epochs, hidden_dims,
               activation_function):

    # These are theano variables
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # Construct the graph
    input_to_hidden = Linear(name='input_to_hidden', input_dim=117,
                             output_dim=hidden_dims)
    h = activation_function.apply(input_to_hidden.apply(x))
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_dims,
                              output_dim=2)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)
    # Add L2 regularization on the weight matrices:
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)

    # The data streams give us access to our corpus and allow us to perform a
    # mini-batch training.
    data_stream = Flatten(DataStream.default_stream(
        shrooms_train,
        iteration_scheme=SequentialScheme(shrooms_train.num_examples,
                                          batch_size=128)))

    test_data_stream = Flatten(DataStream.default_stream(
        shrooms_test,
        iteration_scheme=SequentialScheme(shrooms_test.num_examples,
                                          batch_size=1000)))

    extensions = [
        ProgressBar(),
        PlotWeights(after_epoch=True,
                    folder="results_logistic_40_interpolation",
                    computation_graph=cg,
                    folder_per_layer=True,
                    dpi=150),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(variables=[cost, error_rate],
                             data_stream=test_data_stream, prefix="test"),
        Printing()
    ]

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)

    main.run()

    return 0
Esempio n. 41
0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width, weight_noise,
          z_prob, z_prob_states, z_prob_cells, drop_prob_igates,
          ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop,
          rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay, **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)
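    # e.g. (illustrative): onehot(numpy.array([0, 2]), 3) -> [[1., 0., 0.], [0., 0., 1.]]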

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # the state masks are not actually used in this mode; leaving them as None makes accidental use fail loudly
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'


    # rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('init_width is only used with uniform initialization')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size,
                        alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
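    # prepend the initial state and drop the last one so the prediction at step t only depends on inputs before t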
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')

    bpc = (cost / np.log(2.0)).copy(name='bpc')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))
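    # norm-stabilizer-style penalty: discourage large changes in the norm of successive hidden states / cells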

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)
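    # the state-carrying updates make the last hidden/cell states of each batch the initial states of the next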

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if len(p.get_value().shape) == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if len(p.get_value().shape) == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # optionally warm-start the model from parameters saved in an .npz file
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Esempio n. 42
0
lookup_input = LookupTable(
    name='lookup_input',
    length=charset_size+1,
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(
    name='linear_input',
    input_dim=hidden_layer_dim,
    output_dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()
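
A minimal sketch of chaining the bricks defined above; the input variable and the reshape before the softmax are assumptions (theano.tensor imported as tensor), not part of the source:

x = tensor.lmatrix('inputs')      # (time, batch) matrix of character indices
h = rnn.apply(linear_input.apply(lookup_input.apply(x)))
presoft = linear_output.apply(h)  # (time, batch, charset_size)
time, batch, feat = presoft.shape
probs = Softmax().apply(presoft.reshape((time * batch, feat)))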
Esempio n. 43
0
def rf_lstm_experiment(data_name, exp_network, in_dim, out_dim, num_layers,
                       start_neurons, num_neurons, batch_size, num_epochs):
    """LSTM Experiment."""
    # load dataset
    train_set = IterableDataset(
        ds.transform_sequence(data_name, "train", batch_size))
    test_set = IterableDataset(
        ds.transform_sequence(data_name, "test", batch_size))
    stream_train = DataStream(dataset=train_set)
    stream_test = DataStream(dataset=test_set)
    methods = ['sgd', 'momentum', 'adagrad', 'rmsprop']

    for n_layers in xrange(1, num_layers + 1):
        for n_neurons in xrange(start_neurons, num_neurons + 5, 5):
            for method in methods:
                X = T.tensor3("features")
                y = T.matrix("targets")

                x_to_h = Linear(in_dim,
                                n_neurons * 4,
                                name='x_to_h',
                                weights_init=IsotropicGaussian(),
                                biases_init=Constant(0.0))
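                # the LSTM brick expects 4 * dim input features (one slice per gate plus the cell candidate)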
                lstm = LSTM(n_neurons,
                            name='lstm',
                            weights_init=IsotropicGaussian(),
                            biases_init=Constant(0.0))

                h_to_o = nc.setup_ff_network(n_neurons, out_dim, n_layers - 1,
                                             n_neurons)

                X_trans = x_to_h.apply(X)
                h, c = lstm.apply(X_trans)
                y_hat = h_to_o.apply(h[-1])
                cost, cg = nc.create_cg_and_cost(y, y_hat, "none")

                lstm.initialize()
                x_to_h.initialize()
                h_to_o.initialize()

                algorithm = nc.setup_algorithms(cost, cg, method, type="RNN")

                test_monitor = DataStreamMonitoring(variables=[cost],
                                                    data_stream=stream_test,
                                                    prefix="test")
                train_monitor = TrainingDataMonitoring(variables=[cost],
                                                       prefix="train",
                                                       after_epoch=True)

                main_loop = MainLoop(
                    algorithm=algorithm,
                    data_stream=stream_train,
                    extensions=[
                        test_monitor, train_monitor,
                        FinishAfter(after_n_epochs=num_epochs),
                        Printing(),
                        ProgressBar()
                    ])

                main_loop.run()

                # Saving results
                exp_id = ds.create_exp_id(exp_network, n_layers, n_neurons,
                                          batch_size, num_epochs, method,
                                          "none")

                # prepare related functions
                predict = theano.function([X], y_hat)

                # prepare related data
                train_features, train_targets = ds.get_iter_data(train_set)
                test_features, test_targets = ds.get_iter_data(test_set)

                # Prediction of result
                train_predicted = gen_prediction(predict, train_features)
                test_predicted = gen_prediction(predict, test_features)

                # Get cost
                cost = ds.get_cost_data(test_monitor, train_set.num_examples,
                                        num_epochs)

                # logging
                ds.save_experiment(train_targets, train_predicted,
                                   test_targets, test_predicted, cost,
                                   exp_network, n_layers, n_neurons,
                                   batch_size, num_epochs, method, "none",
                                   exp_id, "../results/")
Esempio n. 44
0
print "vocab size:", VOCAB_DIM
EMBEDDING_DIM = 100

Xs = tensor.imatrix("context")
y = tensor.ivector('center')

w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM)
w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM)

hidden = tensor.mean(w1.apply(Xs), axis=1)
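# CBOW-style: the context-word embeddings are averaged before being projected back to the vocabulary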
y_hat = Softmax().apply(w2.apply(hidden))

w1.weights_init = w2.weights_init = IsotropicGaussian(0.01)
w1.biases_init = w2.biases_init = Constant(0)
w1.initialize()
w2.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = "loss"


#
# the actual training of the model
#
main = MainLoop(data_stream = DataStream.default_stream(
                    dataset,
Esempio n. 45
0
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #rnn = SimpleRecurrent(
            #dim = 50,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)

    #rnn = GatedRecurrent(
            #dim = 50,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)

    embedding_size = 300
    #glove_version = "vectors.6B.100d.txt"
    glove_version = "glove.6B.300d.txt"
    #fork = Fork(weights_init=IsotropicGaussian(0.02),
            #biases_init=Constant(0.),
            #input_dim=embedding_size,
            #output_dims=[embedding_size]*3,
            #output_names=['inputs', 'reset_inputs', 'update_inputs']
            #)

    rnn = LSTM(
            dim = embedding_size,
            activation=Tanh(),
            weights_init = IsotropicGaussian(std=0.02),
        )
    rnn.initialize()

    #fork.initialize()
    wstd = 0.02

    score_layer = Linear(
            input_dim = 128,
            output_dim = 1,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.),
            name="linear2")
    score_layer.initialize()

    gloveMapping = Linear(
            input_dim = embedding_size,
            output_dim = embedding_size,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.0),
            name="gloveMapping"
            )
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="rectivfyglove").apply(o)

    forget_bias = np.zeros((embedding_size*4), dtype=theano.config.floatX)
    forget_bias[embedding_size:embedding_size*2] = 4.0
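    # the slice [dim:2*dim] corresponds to the forget gate in Blocks' LSTM; a large positive bias keeps the memory gate open early in training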
    toLSTM = Linear(
            input_dim = embedding_size,
            output_dim = embedding_size*4,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(forget_bias),
            #biases_init = Constant(0.0),
            name="ToLSTM"
            )
    toLSTM.initialize()


    rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m), mask=m)
    #inputs, reset_inputs, update_inputs = fork.apply(x)
    #rnn_states = rnn.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs, mask=m)

    #rnn_out = rnn_states[:, -1, :]
    rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) / m.sum(axis=1).dimshuffle(0, 'x')
    #rnn_out = (rnn_states).mean(axis=1)# / m.sum(axis=1)

    hidden = Linear(
        input_dim = embedding_size,
        output_dim = 128,
        weights_init = Uniform(std=0.01),
        biases_init = Constant(0.))
    hidden.initialize()

    o = hidden.apply(rnn_out)
    o = Rectifier().apply(o)
    hidden = Linear(
        input_dim = 128,
        output_dim = 128,
        weights_init = IsotropicGaussian(std=0.02),
        biases_init = Constant(0.),
        name="hiddenmap2")
    hidden.initialize()

    o = hidden.apply(o)
    o = Rectifier(name="rec2").apply(o)

    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval(
            #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()


    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
            cost = cost,
            params=params,
            step_rule = CompositeRule([
                StepClipping(threshold=10),
                AdaM(),
                #AdaDelta(),
                ])

            )


    # ========
    print "setting up data"

    train_dataset = IMDBText('train')
    test_dataset = IMDBText('test')
    batch_size = 16
    n_train = train_dataset.num_examples
    train_stream = DataStream(
            dataset=train_dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size)
            )
    glove = GloveTransformer(glove_version, data_stream=train_stream)

    train_padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
            #mask_sources=[]
            )


    test_stream = DataStream(
            dataset=test_dataset,
            iteration_scheme=ShuffledScheme(
                examples=test_dataset.num_examples,
                batch_size=batch_size)
            )
    glove = GloveTransformer(glove_version, data_stream=test_stream)
    test_padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
            #mask_sources=[]
            )
    print "setting up model"
    #import ipdb
    #ipdb.set_trace()

    lstm_norm = rnn.W_state.norm(2)
    lstm_norm.name = "lstm_norm"

    pre_norm= gloveMapping.W.norm(2)
    pre_norm.name = "pre_norm"

    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification, lstm_norm, pre_norm],
        prefix='train',
        after_epoch=True
        ))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded,
        prefix='test',
        after_epoch=True
        ))
    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    extensions.append(Plot("result", channels=[['train_cost', 'train_misclassification']], after_epoch=True))

    main_loop = MainLoop(
            model=model,
            data_stream=train_padded,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
Esempio n. 46
0
    def train(self):

        x = self.sharedBatch['x']
        x.name = 'x_myinput'
        xmini = self.sharedBatch['xmini']
        xmini.name = 'xmini_myinput'
        y = self.sharedBatch['y']
        y.name = 'y_myinput'

        # project the raw inputs into the dimensionality expected by the
        # recurrent layer below
        x_to_h = Linear(self.input_dimx,
                        self.dim,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))
        xmini_to_h = Linear(self.input_dimxmini,
                            self.mini_dim,
                            name='xmini_to_h',
                            weights_init=IsotropicGaussian(),
                            biases_init=Constant(0.0))

        rnnwmini = RNNwMini(dim=self.dim,
                            mini_dim=self.mini_dim,
                            summary_dim=self.summary_dim)

        h_to_o = Linear(self.summary_dim,
                        1,
                        name='h_to_o',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))

        x_transform = x_to_h.apply(x)
        xmini_transform = xmini_to_h.apply(xmini)

        h = rnnwmini.apply(x=x_transform, xmini=xmini_transform)

        # only values of hidden units of the last timeframe are used for
        # the classification
        y_hat = h_to_o.apply(h[-1])
        #y_hat = Logistic().apply(y_hat)

        cost = SquaredError().apply(y, y_hat)
        cost.name = 'cost'

        rnnwmini.initialize()
        x_to_h.initialize()
        xmini_to_h.initialize()
        h_to_o.initialize()

        self.f = theano.function(inputs=[], outputs=y_hat)

        #print("self.f === ")
        #print(self.f())
        #print(self.f().shape)
        #print("====")

        self.cg = ComputationGraph(cost)
        m = Model(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=self.cg.parameters,
                                    step_rule=RMSProp(learning_rate=0.01),
                                    on_unused_sources='ignore')
        valid_monitor = DataStreamMonitoringShared(
            variables=[cost],
            data_stream=self.stream_valid_int,
            prefix="valid",
            sharedBatch=self.sharedBatch,
            sharedData=self.sharedData)
        train_monitor = TrainingDataMonitoring(variables=[cost],
                                               prefix="train",
                                               after_epoch=True)

        sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                                  self.sharedData)
        tBest = self.track_best('valid_cost', self.cg)
        self.tracker = tBest[0]
        extensions = [sharedVarMonitor, valid_monitor] + tBest

        if self.debug:
            extensions.append(Printing())

        self.algorithm = algorithm
        self.extensions = extensions
        self.model = m
        self.mainloop = MainLoop(self.algorithm,
                                 self.stream_train_int,
                                 extensions=self.extensions,
                                 model=self.model)
        self.main_loop(True)
Esempio n. 47
0
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')

    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))

    n_voc = len(train_dataset.dict.keys())

    # count how often each word appears as the first token of a sequence
    init_probs = numpy.array(
        [sum(1 for s in train_dataset.indexables[
                 train_dataset.sources.index('features')]
             if s[0] == w) for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
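    # note: these LSTM-style biases are left unused here, since a SimpleRecurrent brick is created below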
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                            states=states,
                                            iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)

        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )

    sampling = theano.function([], samples.owner.inputs[0].T)

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Esempio n. 48
0
def main():
    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #x = x+m.mean()*0

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    #vaguely normalize
    x = x / 3.0 - .5

    #gloveMapping = Linear(
    #input_dim = embedding_size,
    #output_dim = 128,
    #weights_init = Orthogonal(),
    #biases_init = Constant(0.0),
    #name="gloveMapping"
    #)
    #gloveMapping.initialize()
    #o = gloveMapping.apply(x)
    #o = Rectifier(name="gloveRec").apply(o)
    o = x
    input_dim = 300

    gru = GatedRecurrentFull(
        hidden_dim=input_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=IsotropicGaussian(0.02),
        state_to_reset_init=IsotropicGaussian(0.02),
        state_to_update_init=IsotropicGaussian(0.02),
        input_to_state_transform=Linear(input_dim=input_dim,
                                        output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(input_dim=input_dim,
                                         output_dim=input_dim,
                                         weights_init=IsotropicGaussian(0.02),
                                         biases_init=Constant(0.0)),
        input_to_reset_transform=Linear(input_dim=input_dim,
                                        output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)))
    gru.initialize()
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]

    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)

    score_layer = Linear(input_dim=300,
                         output_dim=1,
                         weights_init=IsotropicGaussian(std=wstd),
                         biases_init=Constant(0.),
                         use_bias=True,
                         name="linear_score")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #})
    #print rnn_out.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    for p in params:
        p.name += "___" + p.tag.annotations[0].name

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    #algorithm.initialize()
    print params
    f = theano.function([x, y], algorithm.cost)
    ipdb.set_trace()

    print "making plots"
    #theano.printing.pydotprint(algorithm.cost, outfile='unopt.png')
    theano.printing.pydotprint(f, outfile='opt.png', scan_graphs=True)
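The cost above takes log(probs) directly, which overflows to -inf as soon as the sigmoid saturates at 0 or 1. A minimal sketch of a numerically safer variant that works from the pre-sigmoid score `o` returned by score_layer.apply (the same softplus identity the next example uses for its reconstruction term); `o` and `y` are the variables from the snippet above:

    # -log(sigmoid(o)) == softplus(-o) and -log(1 - sigmoid(o)) == softplus(o),
    # so the cross-entropy never needs to form log(probs) explicitly
    cost = (y * T.nnet.softplus(-o) + (1 - y) * T.nnet.softplus(o)).mean()
    cost.name = 'cost'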
Esempio n. 49
0
def main():
    nvis, nhid, nlat, learn_prior = 784, 200, 100, False
    theano_rng = MRG_RandomStreams(134663)

    # Initialize prior
    prior_mu = shared_floatx(numpy.zeros(nlat), name='prior_mu')
    prior_log_sigma = shared_floatx(numpy.zeros(nlat), name='prior_log_sigma')
    if learn_prior:
        add_role(prior_mu, PARAMETER)
        add_role(prior_log_sigma, PARAMETER)

    # Initialize encoding network
    encoding_network = MLP(activations=[Rectifier()],
                           dims=[nvis, nhid],
                           weights_init=IsotropicGaussian(std=0.001),
                           biases_init=Constant(0))
    encoding_network.initialize()
    encoding_parameter_mapping = Fork(
        output_names=['mu_phi', 'log_sigma_phi'], input_dim=nhid,
        output_dims=dict(mu_phi=nlat, log_sigma_phi=nlat), prototype=Linear(),
        weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0))
    encoding_parameter_mapping.initialize()

    # Initialize decoding network
    decoding_network = MLP(activations=[Rectifier()],
                           dims=[nlat, nhid],
                           weights_init=IsotropicGaussian(std=0.001),
                           biases_init=Constant(0))
    decoding_network.initialize()
    decoding_parameter_mapping = Linear(
        input_dim=nhid, output_dim=nvis, name='mu_theta',
        weights_init=IsotropicGaussian(std=0.001),
        biases_init=Constant(0))
    decoding_parameter_mapping.initialize()

    # Encode / decode
    x = tensor.matrix('features')
    h_phi = encoding_network.apply(x)
    mu_phi, log_sigma_phi = encoding_parameter_mapping.apply(h_phi)
    epsilon = theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
    epsilon.name = 'epsilon'
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    z.name = 'z'
    h_theta = decoding_network.apply(z)
    mu_theta = decoding_parameter_mapping.apply(h_theta)

    # Compute cost
    kl_term = (
        prior_log_sigma - log_sigma_phi
        + 0.5 * (
            tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2
        ) / tensor.exp(2 * prior_log_sigma)
        - 0.5
    ).sum(axis=1)
    kl_term.name = 'kl_term'
    kl_term_mean = kl_term.mean()
    kl_term_mean.name = 'avg_kl_term'
    reconstruction_term = - (
        x * tensor.nnet.softplus(-mu_theta)
        + (1 - x) * tensor.nnet.softplus(mu_theta)).sum(axis=1)
    reconstruction_term.name = 'reconstruction_term'
    reconstruction_term_mean = -reconstruction_term.mean()
    reconstruction_term_mean.name = 'avg_reconstruction_term'
    cost = -(reconstruction_term - kl_term).mean()
    cost.name = 'nll_upper_bound'

    # Datasets and data streams
    mnist_train = MNIST(
        'train', start=0, stop=50000, binary=True, sources=('features',))
    train_loop_stream = DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100))
    train_monitor_stream = DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500))
    mnist_valid = MNIST(
        'train', start=50000, stop=60000, binary=True, sources=('features',))
    valid_monitor_stream = DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500))
    mnist_test = MNIST('test', binary=True, sources=('features',))
    test_monitor_stream = DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500))

    # Get parameters
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    monitored_quantities = [cost, reconstruction_term_mean, kl_term_mean]
    main_loop = MainLoop(
        model=None, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(
                monitored_quantities, train_monitor_stream, prefix="train"),
            DataStreamMonitoring(
                monitored_quantities, valid_monitor_stream, prefix="valid"),
            DataStreamMonitoring(
                monitored_quantities, test_monitor_stream, prefix="test"),
            Printing()])
    main_loop.run()
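The kl_term above is the closed-form KL divergence between the diagonal-Gaussian posterior q(z|x) = N(mu_phi, exp(log_sigma_phi)^2) and the (optionally learned) prior N(prior_mu, exp(prior_log_sigma)^2), summed over the latent dimensions. Per dimension the code computes, in LaTeX form,

    \mathrm{KL}(q \,\|\, p) = \log\sigma_p - \log\sigma_\phi + \frac{\sigma_\phi^{2} + (\mu_\phi - \mu_p)^{2}}{2\sigma_p^{2}} - \frac{1}{2}

with \sigma_\phi = \exp(\mathrm{log\_sigma\_phi}) and \sigma_p = \exp(\mathrm{prior\_log\_sigma}); the reparameterised draw z = mu_phi + epsilon * exp(log_sigma_phi) is what keeps the bound differentiable with respect to the encoder parameters.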
Esempio n. 50
0
File: model.py Project: caomw/MLFun
    def __init__(self):
        srng = MRG_RandomStreams(seed=123)

        X = T.matrix('features')
        self.X = X

        #drop = Dropout(p_drop=0.5)
        #o = drop.apply(X)
        o = (X - 128) / 128.0
        self.scaled = o

        #n_hidden = 64
        n_hidden = 2048 * 2
        n_zs = 1024
        self.n_zs = n_zs

        self.n_hidden = n_hidden

        l = Linear(input_dim=32*32*3, output_dim=n_hidden,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
        l.initialize()
        o = l.apply(o)
        o = Rectifier().apply(o)


        l = Linear(input_dim=n_hidden, output_dim=n_hidden,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
        l.initialize()
        o = l.apply(o)
        o = Rectifier().apply(o)

        l = Linear(input_dim=n_hidden, output_dim=n_zs,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
        l.initialize()
        mu_encoder = l.apply(o)

        l = Linear(input_dim=n_hidden, output_dim=n_zs,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
        l.initialize()
        log_sigma_encoder = l.apply(o)

        eps = srng.normal(log_sigma_encoder.shape)

        z = eps * T.exp(log_sigma_encoder) + mu_encoder

        z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
        z_to_h1_decode.initialize()

        h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
        h1_decode_to_h_decode.initialize()


        h_decode_produce = Linear(input_dim=n_hidden, output_dim=32*32*3,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0),
                name="linear4")
        h_decode_produce.initialize()
        #o = h_decode_produce.apply(h_decoder)
        #self.produced = Sigmoid().apply(o)

        seq = Sequence([z_to_h1_decode.apply, Rectifier().apply, h1_decode_to_h_decode.apply, Rectifier().apply,
            h_decode_produce.apply, Sigmoid().apply])
        seq.initialize()

        self.produced = seq.apply(z)

        self.cost = T.mean(T.sqr(self.produced - self.scaled))
        #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, self.scaled)) #T.sum(T.sqr(self.produced - self.scaled))
        self.cost.name = "cost"

        self.variational_cost = - 0.5 * T.mean(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\
                - T.exp(2 * log_sigma_encoder)) + self.cost
        self.variational_cost.name = "variational_cost"

        self.Z = T.matrix('z')
        self.sampled = seq.apply(self.Z)


        cg = ComputationGraph([self.variational_cost])
        bricks = [get_brick(var) for var
                in cg.variables + cg.scan_variables if get_brick(var)]
        for i, b in enumerate(bricks):
            b.name = b.name + "_" + str(i)
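Because the decoder Sequence is also applied to the free matrix self.Z, the model can generate samples directly from noise. A minimal usage sketch, assuming the class above is named VAE (a hypothetical name; it is not shown in the snippet):

    import numpy as np
    import theano

    model = VAE()  # hypothetical name for the class defined above
    sample_fn = theano.function([model.Z], model.sampled)
    z = np.random.randn(16, model.n_zs).astype(theano.config.floatX)
    samples = sample_fn(z)  # shape (16, 32*32*3); decoder outputs lie in (0, 1)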
Esempio n. 51
0
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
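A minimal sketch of how the returned triple could be wired into a training loop, following the GradientDescent / MainLoop pattern used elsewhere in these examples; `args`, `train_stream`, the step rule and the monitoring choices are assumptions, not part of the snippet:

    cost, cross_entropy, updates = build_model_hard(vocab_size, args)
    cg = ComputationGraph([cost])
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=CompositeRule([StepClipping(1.0),
                                                         Adam()]))
    # carry the last hidden states of each batch over to the next one
    algorithm.add_updates(updates)
    main_loop = MainLoop(model=Model(cost), data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=[TrainingDataMonitoring(
                                         [cost, cross_entropy],
                                         prefix="train", after_epoch=True),
                                     Printing()])
    main_loop.run()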
Esempio n. 52
0
class EUTHM(UTHM):
    '''
    UTH model with extend information
    '''
    def __init__(self, config, dataset, *args, **kwargs):
        super(EUTHM, self).__init__(config, dataset)

    def _define_inputs(self, *args, **kwargs):
        super(EUTHM, self)._define_inputs()
        self.user_word = tensor.ivector('user_word')
        self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask',
                                                   dtype=theano.config.floatX)
        self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx')
        self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx')
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        # Build lookup tables
        super(EUTHM, self)._build_bricks()
        self.user2word = MLP(
            activations=[Tanh('user2word_tanh')],
            dims=[self.config.user_embed_dim, self.config.word_embed_dim],
            name='user2word_mlp')
        self.user2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.user2word.biases_init = Constant(0)
        self.user2word.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[
                self.config.user_embed_dim + self.config.word_embed_dim,
                self.config.word_embed_dim
            ],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.user2word_bias = Bias(dim=1, name='user2word_bias')
        self.user2word_bias.biases_init = Constant(0)
        self.user2word_bias.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        #Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                           self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _set_OV_value(self, *args, **kwargs):
        '''Train a <unk> representation'''
        # tensor.set_subtensor returns a new symbolic variable instead of
        # modifying the embedding in place, so write the zero vector back
        # into the shared lookup-table weights directly
        W = self.char_embed.W.get_value()
        W[self.dataset.char2index['<unk>']] = numpy.zeros(
            self.config.char_embed_dim, dtype=theano.config.floatX)
        self.char_embed.W.set_value(W)

    def _get_text_vec(self, *args, **kwargs):
        # Transpose text
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, user and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply user word, hashtag word and url
        text_vec = self._apply_user_word(text_vec)
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        return text_vec

    @abstractmethod
    def _apply_user_word(self, text_vec, *args, **kwargs):
        '''
        Replace @a with transformed author vector
        :param text_vec:
        :param args:
        :param kwargs:
        :return:
        '''
        user_word_vec = self.user2word.apply(self.user_embed.apply(self.user_word)) + \
                        self.user2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.user_word_right_idx, self.user_word_left_idx],
            text_vec[self.user_word_right_idx, self.user_word_left_idx] *
            (1 - self.user_word_sparse_mask[:, None]) +
            user_word_vec * self.user_word_sparse_mask[:, None])
        return text_vec

    @abstractmethod
    def _apply_hashtag_word(self, text_vec, *args, **kwargs):
        '''
        Replace #h with transformed hashtag vector
        :param text_vec:
        :param args:
        :param kwargs:
        :return:
        '''
        hashtag_word_vec = self.hashtag2word.apply(self.hashtag_embed.apply(self.hashtag_word)) +\
                           self.hashtag2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx],
            text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx] *
            (1 - self.hashtag_sparse_mask[:, None]) +
            hashtag_word_vec * self.hashtag_sparse_mask[:, None])
        return text_vec

    @abstractmethod
    def _apply_sparse_word(self, text_vec, *args, **kwargs):
        '''
        Replace sparse word encoding with character embedding. (maybe lstm)
        :param text_vec:
        :param args:
        :param kwargs:
        :return:
        '''
        sparse_word_vec = self.char_embed.apply(self.sparse_word)
        sparse_word_hiddens = self.rnn.apply(
            inputs=self.rnn_ins.apply(sparse_word_vec),
            mask=self.sparse_word_mask)
        tmp = sparse_word_hiddens[-1]
        text_vec = tensor.set_subtensor(
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx],
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx] *
            (1 - self.sparse_word_sparse_mask[:, None]) +
            tmp * self.sparse_word_sparse_mask[:, None])
        return text_vec
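The three _apply_* methods above share one update rule: the rows of text_vec addressed by (right_idx, left_idx) are interpolated towards a replacement vector under a 0/1 sparse mask. A small self-contained numpy sketch of that rule (all names here are illustrative, not taken from the model):

    import numpy as np

    text_vec = np.zeros((4, 5, 3))      # (time, batch, word_embed_dim)
    right_idx = np.array([0, 2])        # time positions to touch
    left_idx = np.array([1, 3])         # batch positions to touch
    mask = np.array([1.0, 0.0])         # 1 -> replace this position, 0 -> keep it
    replacement = np.ones((2, 3))       # e.g. transformed user/hashtag vectors

    old = text_vec[right_idx, left_idx]
    text_vec[right_idx, left_idx] = (old * (1 - mask[:, None])
                                     + replacement * mask[:, None])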
Esempio n. 53
0
def train(algorithm, learning_rate, clipping, momentum,
          layer_size, epochs, test_cost, experiment_path,
          initialization, init_width, weight_noise, z_prob,
          z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout,
          batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type,
          num_layers, norm_cost_coeff, penalty, testing, seq_len, 
          decrease_lr_after_epoch, lr_decay,
          **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)')
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

#    rng = np.random.RandomState(seed)



    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))


        stream = DataStream(IndexableDataset(indexables=OrderedDict([
            ('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(
          stream, z_prob_states, z_prob_cells, drop_prob_igates,
          layer_size, num_layers, False, stoch_depth, share_mask,
          gaussian_drop, alphabetsize)
        stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream,)
    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)


    ####################


    data = train_stream.get_epoch_iterator(as_dict=True).next()


    ####################


    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*4, name='in_to_hid%d'%l,
                           weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d'%l, ogates_zoneout=ogates_zoneout) for l in range(num_layers)]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*3, name='in_to_hid%d'%l,
                           weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d'%l) for l in range(num_layers)]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d'%l,
                           weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d'%l) for l in range(num_layers)]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out',
                        weights_init=weights_init, biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x #in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(rnn_embedding,
                                        zoneouts_states[:, :, l * layer_size : (l + 1) * layer_size],
                                        zoneouts_cells[:, :, l * layer_size : (l + 1) * layer_size],
                                        zoneouts_igates[:, :, l * layer_size : (l + 1) * layer_size],
                                        states_init,
                                        cells_init)
            init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(
        y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################


    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost')

    bpc = (cost/np.log(2.0)).copy(name='bpc')
    perp = T.exp(cost).copy(name='perp')


    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])


    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states'%l in [o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states'%l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy('cost_train') #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])


    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
                                # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)
    def rms(x):
        return (x*x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if len(v.shape) == 2:
            whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, v.shape)))))
        whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, v.shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if len(v.shape) == 2:
            whysplode_cond.append(cond_number(p).copy('par%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, v.shape)))))
        whysplode_rms.append(rms(p).copy('par%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, v.shape)))))

    observed_vars = [cost_train, cost, bpc, perp, learning_rate,
                     aggregation.mean(algorithm.total_gradient_norm).copy("gradient_norm_mean")] # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))
    
    train_monitor = TrainingDataMonitoring(
        variables=observed_vars,
        prefix="train", after_epoch=True
    )

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(
        variables=[dev_cost, dev_bpc, dev_perp],
        data_stream=valid_stream, prefix="dev",
        updates=dev_init_updates
    )

    # no one does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name
    
    extensions = []
    extensions.extend([FinishAfter(after_n_epochs=epochs),
                       train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates
        )
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(SaveParams('dev_cost', model, experiment_path,
                                 every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars
        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))
    extensions.append(RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0
        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value()/lr_decay)
    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError('Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())


    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Esempio n. 54
0
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1,
                    lstm_dim * 4,
                    name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim,
                name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim,
                    1,
                    name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm,
                         stream_train,
                         extensions=[
                             test_monitor, train_monitor,
                             FinishAfter(after_n_epochs=num_epochs),
                             Printing(),
                             ProgressBar()
                         ])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()

    return main_loop
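A possible way to invoke the function above (a sketch; the argument values, and the availability of generate_data and the other imports the snippet relies on, are assumptions):

    if __name__ == '__main__':
        main(max_seq_length=10, lstm_dim=2, batch_size=20,
             num_batches=50, num_epochs=5)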
Esempio n. 56
0
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')

    x_int = x.astype(dtype='int32').T - 2
    train_dataset = IMDB()
    idx_sort = numpy.argsort(
        [len(s) for s in
         train_dataset.indexables[
             train_dataset.sources.index('features')]]
    )
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 10
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
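    # NOTE: lstm_biases sets the n_h:2*n_h slice (the forget gate in Blocks'
    # LSTM gate ordering) to 4, but the array is never handed to the LSTM
    # brick below in this snippet.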
    rnn = Bidirectional(LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    ))
    rnn.initialize()
    score_layer = Linear(
        input_dim=2*n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
    rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0)
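    # rnn_out[0] holds the concatenated forward/backward hidden states with
    # shape (time, batch, 2 * n_h); averaging over the time axis yields a
    # fixed-size representation, which is why score_layer takes input_dim=2*n_h.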

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs)
              + (1 - y) * tensor.log(1 - probs)
              ).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)
                         ).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()
                        ]
        )
    )

    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100, 110),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(110, 120),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        test_data_stream,
        prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        valid_data_stream,
        prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()