Example #1
    def test_get_all_params(self):
        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
        l1 = InputLayer((10, 20))
        l2 = DenseLayer(l1, 30)
        l3 = DenseLayer(l2, 40)

        assert get_all_params(l3) == l2.get_params() + l3.get_params()
        assert (get_all_params(l3, regularizable=False) ==
                (l2.get_params(regularizable=False) +
                 l3.get_params(regularizable=False)))

        assert (get_all_params(l3, regularizable=True) ==
                (l2.get_params(regularizable=True) +
                 l3.get_params(regularizable=True)))
Example #2
    def test_get_all_params(self):
        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
        l1 = InputLayer((10, 20))
        l2 = DenseLayer(l1, 30)
        l3 = DenseLayer(l2, 40)

        assert get_all_params(l3) == l2.get_params() + l3.get_params()
        assert (get_all_params(
            l3, regularizable=False) == (l2.get_params(regularizable=False) +
                                         l3.get_params(regularizable=False)))

        assert (get_all_params(
            l3, regularizable=True) == (l2.get_params(regularizable=True) +
                                        l3.get_params(regularizable=True)))
Example #3
class PretrainedNetwork:
    def __init__(self, load=True):
        # Architecture
        net = {}
        net['input'] = InputLayer((None, 3, 224, 224))
        net['conv1'] = ConvLayer(net['input'],
                                 num_filters=96,
                                 filter_size=7,
                                 stride=2,
                                 flip_filters=False)
        net['norm1'] = NormLayer(
            net['conv1'], alpha=0.0001)  # caffe has alpha = alpha * pool_size
        net['pool1'] = PoolLayer(net['norm1'],
                                 pool_size=3,
                                 stride=3,
                                 ignore_border=False)
        net['conv2'] = ConvLayer(net['pool1'],
                                 num_filters=256,
                                 filter_size=5,
                                 flip_filters=False)
        net['pool2'] = PoolLayer(net['conv2'],
                                 pool_size=2,
                                 stride=2,
                                 ignore_border=False)
        net['conv3'] = ConvLayer(net['pool2'],
                                 num_filters=512,
                                 filter_size=3,
                                 pad=1,
                                 flip_filters=False)
        net['conv4'] = ConvLayer(net['conv3'],
                                 num_filters=512,
                                 filter_size=3,
                                 pad=1,
                                 flip_filters=False)
        net['conv5'] = ConvLayer(net['conv4'],
                                 num_filters=512,
                                 filter_size=3,
                                 pad=1,
                                 flip_filters=False)
        net['pool5'] = PoolLayer(net['conv5'],
                                 pool_size=3,
                                 stride=3,
                                 ignore_border=False)
        net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
        net['drop6'] = DropoutLayer(net['fc6'], p=0.5)
        net['fc7'] = DenseLayer(net['drop6'], num_units=4096)
        net['drop7'] = DropoutLayer(net['fc7'], p=0.5)
        net['fc8'] = DenseLayer(net['drop7'],
                                num_units=1000,
                                nonlinearity=lasagne.nonlinearities.softmax)
        self.output_layer = net['fc8']
        self.net = net

        if load:
            self.load_weights()

        # Compile
        self.predict_fn = None
        self.predict_fns = {}
        self.train_fn = {}
        self.lr = theano.shared(np.array(1e-2, dtype=np.float32))
        self.regularizer_amount = theano.shared(
            np.array(4e-5, dtype=np.float32))

    def get_output_fn(self, layer):
        input_var = self.net['input'].input_var
        out = lasagne.layers.get_output(layer, deterministic=True)
        return theano.function([input_var], out)

    def add_output_layer(self, num_units, after='drop7'):
        self.output_layer = DenseLayer(
            self.net[after],
            num_units=num_units,
            nonlinearity=lasagne.nonlinearities.softmax)
        self.predict_fn = None
        self.train_fn = {}

    def load_weights(self):
        # weights
        import pickle
        with open('/home/twanvl/test/vgg_cnn_s.pkl', 'rb') as file:
            model = pickle.load(file, encoding='latin1')
        self.classes = model['synset words']
        self.mean_image = model['mean image']
        lasagne.layers.set_all_param_values(self.output_layer, model['values'])

    def save_weights_np(self, filename):
        np.savez(filename,
                 *lasagne.layers.get_all_param_values(self.output_layer),
                 mean_image=self.mean_image)

    def load_weights_np(self, filename):
        params = lasagne.layers.get_all_params(self.output_layer)
        with np.load(filename) as f:
            param_values = [f['arr_%d' % i] for i in range(len(params))]
            self.mean_image = f['mean_image']
        lasagne.layers.set_all_param_values(self.output_layer, param_values)

    def preprocess_many(self, ims, **kwargs):
        # Preprocess a list of images
        return np.array([self.preprocess(x, many=True, **kwargs) for x in ims])

    def preprocess(self,
                   im,
                   many=False,
                   crop_h=0.5,
                   crop_w=0.5,
                   flip=False,
                   size=256,
                   smallest=True,
                   random=False):
        # Preprocess an image
        # Resize so the smallest (or largest, if smallest=False) dimension
        # equals `size`, preserving aspect ratio
        im = resize(im, size, smallest)
        # Crop to 224x224 (central by default, random when random=True)
        h, w, _ = im.shape
        if random:
            y0 = np.random.randint(h - 224)
            x0 = np.random.randint(w - 224)
            flip = np.random.randint(2)
        else:
            y0 = int((h - 224) * crop_h)
            x0 = int((w - 224) * crop_w)
        im = im[y0:y0 + 224, x0:x0 + 224]
        # Flip horizontally?
        if flip:
            im = im[:, ::-1]
        if not many:
            rawim = np.copy(im).astype('uint8')
        # Shuffle axes to c01
        im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1)
        # Convert to BGR
        im = im[::-1, :, :]
        # Subtract mean
        im = im - self.mean_image
        if many:
            return floatX(im)
        else:
            return rawim, floatX(im[np.newaxis])

    def classify(self, im, preprocess=False, **kwargs):
        if preprocess:
            im = self.preprocess_many(im, **kwargs)
        if self.predict_fn is None:
            self.predict_fn = self.get_output_fn(self.output_layer)
        prob = batch_predict(self.predict_fn, im)
        return np.array(np.argmax(prob, axis=1), dtype=np.int32)

    def classify_test(self, im, **kwargs):
        # Run the classifier on one image and plot the top-5 predictions with matplotlib
        rawim, im = self.preprocess(im, **kwargs)
        #prob = np.array(lasagne.layers.get_output(self.output_layer, im, deterministic=True).eval())
        if self.predict_fn is None:
            self.predict_fn = self.get_output_fn(self.output_layer)
        prob = np.array(self.predict_fn(im))
        top5 = np.argsort(prob[0])[-1:-6:-1]
        import matplotlib.pyplot as plt
        plt.figure()
        plt.imshow(rawim.astype('uint8'))
        plt.axis('off')
        for n, label in enumerate(top5):
            plt.text(250,
                     70 + n * 20,
                     '{}. {}'.format(n + 1, self.classes[label]),
                     fontsize=14)

    def get_features(self, im, layer, preprocess=False):
        if layer not in self.predict_fns:
            self.predict_fns[layer] = self.get_output_fn(self.net[layer])
        # apply
        if preprocess:
            rawim, im = self.preprocess(im)
        return batch_predict(self.predict_fns[layer], im)

    def get_train_fn(self, last_only=False):
        input_var = self.net['input'].input_var
        target_var = T.ivector('targets')
        prediction = lasagne.layers.get_output(self.output_layer)
        loss = categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        error = T.mean(T.neq(T.argmax(prediction, axis=1), target_var),
                       dtype=theano.config.floatX)
        regularization = self.regularizer_amount * regularize_network_params(
            self.output_layer, l2)
        if last_only:
            all_params = self.output_layer.get_params(trainable=True)
        else:
            all_params = lasagne.layers.get_all_params(self.output_layer,
                                                       trainable=True)
        updates = nesterov_momentum(loss + regularization,
                                    all_params,
                                    learning_rate=self.lr)
        return theano.function([input_var, target_var], (loss, error),
                               updates=updates)

    def train(self,
              x,
              y,
              num_epochs=50,
              learning_rate=1e-3,
              batchsize=128,
              regularizer_amount=5e-4,
              preprocess=False,
              last_only=False):
        if last_only not in self.train_fn:
            self.train_fn[last_only] = self.get_train_fn(last_only)
        train_fn = self.train_fn[last_only]
        self.regularizer_amount.set_value(np.float32(regularizer_amount))
        #augment = augment_data
        augment = None

        for epoch in range(num_epochs):
            if epoch < 0.8 * num_epochs:
                lr = learning_rate
            elif epoch < 0.9 * num_epochs:
                lr = learning_rate / 10
            else:
                lr = learning_rate / 100
            self.lr.set_value(np.float32(lr))

            loss = 0
            err = 0
            n = 0
            for batch_x, batch_y in iterate_minibatches(x,
                                                        y,
                                                        batchsize=batchsize,
                                                        shuffle=True,
                                                        augment=augment):
                if preprocess:
                    batch_x = self.preprocess_many(batch_x, random=True)
                l, e = train_fn(batch_x, batch_y)
                loss += l
                err += e
                n += 1
                print("  {:3} / {:3}:  loss={:6.3f}, error={:5.3f}  ".format(
                    epoch, num_epochs, loss / n, err / n),
                      end='\r')
            if epoch % 10 == 9:
                print()
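A hedged usage sketch for PretrainedNetwork (skimage and the image file are assumptions for illustration; the helpers resize and batch_predict must exist in the surrounding module, as in the class above):

import skimage.io  # assumption: images load as HxWx3 uint8 arrays

net = PretrainedNetwork(load=True)            # reads vgg_cnn_s.pkl as in load_weights()
im = skimage.io.imread('cat.jpg')             # hypothetical input image
rawim, batch = net.preprocess(im)             # 224x224 crop, BGR, mean-subtracted
label = net.classify(batch)[0]                # int32 class index
print(net.classes[label])                     # corresponding synset word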
class PRAE:
    def __init__(self,
                 num_batch,
                 max_len,
                 n_features,
                 hidden=[200, 200],
                 **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2**30))

        # params
        # initial_W = np.asarray(
        #     rng.uniform(
        #             low=1e-5,
        #             high=1,
        #             size=(self.hidden[1], self.n_features)
        #     ),
        #     dtype=theano.config.floatX
        # )
        #
        # self.W_y_theta = theano.shared(value=initial_W, name='W_y_theta', borrow=True)
        # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        # self.b_y_theta = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         borrow=True
        #     )
        # self.b_y_kappa = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         name='b',
        #         borrow=True
        #     )

        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(self.num_batch, self.max_len,
                                      self.n_features))
        self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
        first_hidden = LSTMLayer(self.l_in,
                                 mask_input=self.mask_input,
                                 num_units=hidden[0],
                                 nonlinearity=rectify)
        second_hidden = LSTMLayer(first_hidden,
                                  num_units=hidden[1],
                                  nonlinearity=rectify)
        # need some reshape voodoo
        l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
        # after the reshape I have batch*max_len X features
        self.model = DenseLayer(l_shp,
                                num_units=self.n_features,
                                nonlinearity=rectify)
        # Applying a dense layer to the reshaped output processes every time
        # step at once, which is what we want; the dimensions are restored
        # after get_output. For every Gaussian in the sum we would need 3
        # values plus one for the total scale. The output of this layer is
        # (num_batch * max_len, n_features).

    def get_output_shape_for(self):
        # The DenseLayer sits after the reshape, so its input shape is
        # (num_batch * max_len, hidden[1]); get_output_shape_for expects a
        # single shape tuple.
        return self.model.get_output_shape_for(
            (self.num_batch * self.max_len, self.hidden[1]))

    def get_output_y(self, output):
        # requires the W_y_theta / b_y_theta shared variables that are
        # commented out in __init__ above
        # (batch, time, hidden) x (hidden, features) + (features,) => (batch, time, features)
        theta_out = T.nnet.relu(T.dot(output, self.W_y_theta) + self.b_y_theta)
        #kappa_out = T.nnet.relu(T.dot(output, self.W_y_kappa) + self.b_y_kappa)
        return theta_out

    def get_log_x(self, x, theta_out):
        # DIM = (batch, time, hidden)
        # gamma-style log-likelihood; only the exponential special case
        # log(theta) - theta * x is implemented here, elementwise
        log_x = T.log(theta_out + 1e-8) - theta_out * x
        log_x = log_x.sum(axis=2, dtype=theano.config.floatX)
        return log_x

    def build_model(self, train_x, train_mask_x, train_mask_out, train_target,
                    test_x, test_mask_x, test_mask_out, test_target):
        self.train_x = train_x
        self.train_mask_x = train_mask_x
        self.train_mask_out = train_mask_out
        self.train_target = train_target
        self.test_x = test_x
        self.test_mask_x = test_mask_x
        self.test_mask_out = test_mask_out
        self.test_target = test_target
        self.index = T.iscalar('index')
        self.num_batch_test = T.iscalar('index')
        self.b_slice = slice(self.index * self.num_batch,
                             (self.index + 1) * self.num_batch)

        sym_x = T.dtensor3()
        sym_mask_x = T.dmatrix()
        sym_target = T.dtensor3()
        # sym_mask_out = T.dtensor3()  # probably not needed, since the output
        # is already zero where masked out -- TODO verify

        theta = lasagne.layers.get_output(self.model,
                                          inputs={
                                              self.l_in: sym_x,
                                              self.mask_input: sym_mask_x
                                          })
        theta = T.reshape(theta,
                          (self.num_batch, self.max_len, self.n_features))
        log_px = self.get_log_x(sym_target, theta)
        log_px_sum_time = log_px.sum(
            axis=1, dtype=theano.config.floatX)  # sum over time
        loss = -T.sum(log_px_sum_time) / self.num_batch  # average over batch
        ##
        theta_test = T.reshape(
            theta, (self.num_batch_test, self.max_len, self.n_features))
        log_px_test = self.get_log_x(sym_target, theta_test)
        log_px_sum_time_test = log_px_test.sum(
            axis=1, dtype=theano.config.floatX)  # sum over time
        loss_test = -T.sum(
            log_px_sum_time_test) / self.num_batch_test  # average over batch
        # loss = T.mean(lasagne.objectives.squared_error(mu, sym_target))
        # note: get_params() on the readout DenseLayer only returns its own
        # W and b; the LSTM parameters are not included in the updates
        all_params = self.model.get_params()
        print(len(all_params))
        all_grads_target = [
            T.clip(g, -10, 10) for g in T.grad(loss, all_params)
        ]
        all_grads_target = lasagne.updates.total_norm_constraint(
            all_grads_target, 10)
        updates_target = adam(all_grads_target, all_params)

        train_model = theano.function(
            [self.index], [loss, theta, log_px],
            givens={
                sym_x: self.train_x[self.b_slice],
                sym_mask_x: self.train_mask_x[self.b_slice],
                sym_target: self.train_target[self.b_slice]
            },
            updates=updates_target)
        test_model = theano.function(
            [self.num_batch_test], [loss_test, theta_test],
            givens={
                sym_x: self.test_x,
                sym_mask_x: self.test_mask_x,
                sym_target: self.test_target
            })

        return train_model, test_model
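A hedged sketch of how build_model above might be driven. The data are random placeholders; they are wrapped in theano.shared (as float64, matching the dtensor3/dmatrix symbols) because build_model slices them inside givens:

import numpy as np
import theano

num_batch, max_len, n_features = 16, 50, 8
prae = PRAE(num_batch, max_len, n_features)

def shared64(a):
    # float64 so the types match T.dtensor3() / T.dmatrix() in build_model
    return theano.shared(np.asarray(a, dtype='float64'))

train_x = shared64(np.random.rand(4 * num_batch, max_len, n_features))
train_mask = shared64(np.ones((4 * num_batch, max_len)))
test_x = shared64(np.random.rand(num_batch, max_len, n_features))
test_mask = shared64(np.ones((num_batch, max_len)))

train_fn, test_fn = prae.build_model(
    train_x, train_mask, train_mask, train_x,   # targets = inputs, autoencoder-style
    test_x, test_mask, test_mask, test_x)
loss, theta, log_px = train_fn(0)               # minibatch 0
test_loss, test_theta = test_fn(num_batch)      # num_batch_test passed as an int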
Example #5
class WriteHead(Head):
    r"""
    Write head. In addition to the weight vector, the write head
    also outputs an add vector :math:`a_{t}` and an erase vector
    :math:`e_{t}` defined by

    .. math ::
        \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\
        a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a})\\
        e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e})

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, eg. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    W_hid_to_erase: callable, Numpy array or Theano shared variable
    b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_erase: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`e_{t}`
    W_hid_to_add: callable, Numpy array or Theano shared variable
    b_hid_to_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`a_{t}`
    W_hid_to_sign_add: callable, Numpy array, Theano shared variable, or ``None``
    b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\delta_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1., high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(controller, num_shifts=num_shifts, memory_shape=memory_shape,
            W_hid_to_sign=W_hid_to_sign, b_hid_to_sign=b_hid_to_sign, nonlinearity_sign=nonlinearity_sign,
            W_hid_to_key=W_hid_to_key, b_hid_to_key=b_hid_to_key, nonlinearity_key=nonlinearity_key,
            W_hid_to_beta=W_hid_to_beta, b_hid_to_beta=b_hid_to_beta, nonlinearity_beta=nonlinearity_beta,
            W_hid_to_gate=W_hid_to_gate, b_hid_to_gate=b_hid_to_gate, nonlinearity_gate=nonlinearity_gate,
            W_hid_to_shift=W_hid_to_shift, b_hid_to_shift=b_hid_to_shift, nonlinearity_shift=nonlinearity_shift,
            W_hid_to_gamma=W_hid_to_gamma, b_hid_to_gamma=b_hid_to_gamma, nonlinearity_gamma=nonlinearity_gamma,
            weights_init=weights_init, learn_init=learn_init, **kwargs)
    
        self.erase = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_erase, b=b_hid_to_erase, nonlinearity=nonlinearity_erase,
            name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_add, b=b_hid_to_add, nonlinearity=nonlinearity_add,
            name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign_add, b=b_hid_to_sign_add, nonlinearity=nonlinearity_sign_add,
                name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None

    def get_params(self, **tags):
        params = super(WriteHead, self).get_params(**tags)
        params += self.erase.get_params(**tags)
        params += self.add.get_params(**tags)
        if self.sign_add is not None:
            params += self.sign_add.get_params(**tags)

        return params
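The erase and add vectors produced here are consumed by the NTM layer that owns the head; the memory update itself is not part of this class. As a reminder of what they are for, a small numpy sketch of the standard NTM write (Graves et al. 2014, section 3.2), with made-up shapes:

import numpy as np

memory_shape = (128, 20)
M_prev = np.random.rand(*memory_shape)                 # M_{t-1}
w = np.random.dirichlet(np.ones(memory_shape[0]))      # write weights, sum to 1
e = np.random.rand(memory_shape[1])                    # erase vector in [0, 1]
a = np.random.rand(memory_shape[1])                    # add vector

# erase then add: M_t(i) = M_{t-1}(i) * (1 - w_t(i) e_t) + w_t(i) a_t
M_t = M_prev * (1.0 - np.outer(w, e)) + np.outer(w, a)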
Example #6
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} &\propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, eg. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If
        ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign,
                name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key,
            name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b
        
        self.beta = DenseLayer(controller, num_units=1,
            W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta,
            name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller, num_units=1,
            W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate,
            name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller, num_units=num_shifts,
            W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift,
            name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller, num_units=1,
            W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma,
            name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(
            weights_init, (1, self.memory_shape[0]),
            name='weights_init', trainable=learn_init, regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(w_g_padded, conv_filter,
            input_shape=(self.input_shape[0], 1, 1, self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        w /= T.sum(w, axis=1, keepdims=True)  # normalize each weight vector separately

        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)

        return params
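get_output_for above walks through the four addressing steps from the docstring. A plain numpy sketch of the same pipeline for a single example (made-up sizes; the circular shift is done with np.roll instead of the padded convolution used in the Theano code):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

N, M = 8, 4                                   # memory rows, row width
memory = np.random.randn(N, M)                # M_t
w_prev = np.full(N, 1.0 / N)                  # w_{t-1}
k = np.random.randn(M)                        # key k_t
beta, g, gamma = 2.0, 0.7, 1.5                # beta_t, g_t, gamma_t
s = np.array([0.1, 0.8, 0.1])                 # shift distribution over [-1, 0, +1]

# content addressing: cosine similarity of the key against every memory row
cos = memory.dot(k) / (np.linalg.norm(memory, axis=1) * np.linalg.norm(k) + 1e-8)
w_c = softmax(beta * cos)
# interpolation with the previous weight vector
w_g = g * w_c + (1.0 - g) * w_prev
# convolutional (circular) shift
w_tilde = sum(s_i * np.roll(w_g, shift) for s_i, shift in zip(s, (-1, 0, 1)))
# sharpening and renormalization
w = (w_tilde + 1e-6) ** gamma
w /= w.sum()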
Example #7
def dae_0419(input=None,n_vis=784,n_hid=100, p_drop = 0.,
             encoder_nonlin=lasagne.nonlinearities.sigmoid,
             decoder_nonlin=lasagne.nonlinearities.sigmoid):
    """ Created denoising autoencoder with tied weights.

    http://benanne.github.io/2015/11/10/arbitrary-expressions-as-params.html

    >>> network, encoder_fn, decoder_fn, output_fn = dae_0419(...)

    Update 04/26 - allowed optional input to specify encoder and decoder
    nonlinearity

    Parameters
    ----------
    input : theano.tensor.TensorType (default=None)
        a symbolic description of the input.
        if ``None``, a symbolic input variable will be created internally
    n_vis : int
        number of visible units (input units)
    n_hid : int
        number of hidden units
    p_drop : float
        Probability of setting an input unit to zero ("masking" noise)
        (note: implemented via ``DropoutLayer``)

    Returns
    -------
    l_output : lasagne.layers.DenseLayer
        the output (reconstruction) layer of the autoencoder
    encoder_fn : theano function
        deterministic function mapping inputs to hidden codes
    decoder_fn : theano function
        deterministic function mapping hidden codes to reconstructions
    output_fn : theano function
        deterministic encode-then-decode function on the original input

    Dev
    ---
    - ``t_0419_decoder_func.py``
    - ``t_0419_decoder_func2.py``

    History
    -------
    Created 04/19/2016...difference from 0418 version: added decoder function
    as output
    """
    if input is None:
        input = T.matrix('input')

    # input layer
    l_input = InputLayer((None, n_vis),input_var=input,name='input')

    if p_drop != 0:
        # rescale is disabled: the DropoutLayer is used to apply "masking"
        # noise to the DAE input, not as dropout regularization
        l_input = DropoutLayer(l_input,p=p_drop,rescale=False,name='input_drop')

    # l_hidden and l_output share the same weight matrix!
    l_hidden = DenseLayer(l_input,  n_hid, name='hidden',
                          nonlinearity=encoder_nonlin)
    l_output = DenseLayer(l_hidden, n_vis, name='output',W=l_hidden.W.T,
                          nonlinearity=decoder_nonlin)

    # === get deterministic encoder function === #
    # Theano tensor for encoder function (deterministic=True to disable Dropout)
    encoder_tn = lasagne.layers.get_output(l_hidden,deterministic=True)
    encoder_fn = theano.function([input],encoder_tn)

    # === get decoder function (new in 04/19 version) === #
    # theano symvar for hidden unit representation
    hid = T.matrix('hid')
    W_out, b_out = l_output.get_params()
    decoder_tn = l_output.nonlinearity(hid.dot(W_out.T) + b_out)
    decoder_fn = theano.function([hid],decoder_tn)

    # === get output function (new in 04/19 version) ===#
    """Note: this outputs the same thing as decoder_fn, but takes as input
       the original feature (so a composition of encoding/decoding operation)"""
    output_tn = lasagne.layers.get_output(l_output,deterministic=True)
    output_fn = theano.function([input], output_tn)

    return l_output, encoder_fn, decoder_fn, output_fn
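A hedged usage sketch for dae_0419 (the squared-error objective, the Adam updates and the toy data are illustrative additions, not part of the function above):

import numpy as np
import theano
import theano.tensor as T
import lasagne

X = np.random.rand(500, 784).astype(theano.config.floatX)   # toy data in [0, 1]

input_var = T.matrix('input')
l_output, encoder_fn, decoder_fn, output_fn = dae_0419(
    input=input_var, n_vis=784, n_hid=100, p_drop=0.3)

recon = lasagne.layers.get_output(l_output)        # stochastic (masked) pass for training
loss = lasagne.objectives.squared_error(recon, input_var).mean()
params = lasagne.layers.get_all_params(l_output, trainable=True)
updates = lasagne.updates.adam(loss, params)
train_fn = theano.function([input_var], loss, updates=updates)

for epoch in range(10):
    print(train_fn(X))

codes = encoder_fn(X)          # (500, 100) hidden representation
recons = decoder_fn(codes)     # (500, 784) reconstruction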
Example #8
    smooth_train_loss = 0.95 * smooth_train_loss + 0.05 * batch_train_loss
    print 'iter: ', iter_n, "\t training loss:", smooth_train_loss
    if iter_n % 100 == 0:
        X_val, y_val = data_iter.fetch_validation()
        val_loss, val_acc = val_fn(X_val, y_val)
        print "====" * 20
        print "validation loss: \t", val_loss
        print "validation accuracy: \t", val_acc
        print "====" * 20

print "... training done"

print "... serializing model"

import cPickle

params = []
params.extend(dram.get_params())
params.extend(y_hat.get_params())

np_params = []
for param in params:
    np_params.append(param.get_value())

f = open('params.model', 'w')
cPickle.dump(np_params, f)
f.close()

print "... done serializing model"
print "... exiting ..."
Example #9
class WriteHead(Head):
    r"""
    Write head. In addition to the weight vector, the write head
    also outputs an add vector :math:`a_{t}` and an erase vector
    :math:`e_{t}` defined by

    .. math ::
        \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\
        a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a})\\
        e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e})

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, eg. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    W_hid_to_erase: callable, Numpy array or Theano shared variable
    b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_erase: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`e_{t}`
    W_hid_to_add: callable, Numpy array or Theano shared variable
    b_hid_to_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`a_{t}`
    W_hid_to_sign_add: callable, Numpy array, Theano shared variable, or ``None``
    b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\delta_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.
                 rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1.,
                                                                    high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(controller,
                                        num_shifts=num_shifts,
                                        memory_shape=memory_shape,
                                        W_hid_to_sign=W_hid_to_sign,
                                        b_hid_to_sign=b_hid_to_sign,
                                        nonlinearity_sign=nonlinearity_sign,
                                        W_hid_to_key=W_hid_to_key,
                                        b_hid_to_key=b_hid_to_key,
                                        nonlinearity_key=nonlinearity_key,
                                        W_hid_to_beta=W_hid_to_beta,
                                        b_hid_to_beta=b_hid_to_beta,
                                        nonlinearity_beta=nonlinearity_beta,
                                        W_hid_to_gate=W_hid_to_gate,
                                        b_hid_to_gate=b_hid_to_gate,
                                        nonlinearity_gate=nonlinearity_gate,
                                        W_hid_to_shift=W_hid_to_shift,
                                        b_hid_to_shift=b_hid_to_shift,
                                        nonlinearity_shift=nonlinearity_shift,
                                        W_hid_to_gamma=W_hid_to_gamma,
                                        b_hid_to_gamma=b_hid_to_gamma,
                                        nonlinearity_gamma=nonlinearity_gamma,
                                        weights_init=weights_init,
                                        learn_init=learn_init,
                                        **kwargs)

        self.erase = DenseLayer(controller,
                                num_units=self.memory_shape[1],
                                W=W_hid_to_erase,
                                b=b_hid_to_erase,
                                nonlinearity=nonlinearity_erase,
                                name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_add,
                              b=b_hid_to_add,
                              nonlinearity=nonlinearity_add,
                              name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller,
                                       num_units=self.memory_shape[1],
                                       W=W_hid_to_sign_add,
                                       b=b_hid_to_sign_add,
                                       nonlinearity=nonlinearity_sign_add,
                                       name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None

    def get_params(self, **tags):
        params = super(WriteHead, self).get_params(**tags)
        params += self.erase.get_params(**tags)
        params += self.add.get_params(**tags)
        if self.sign_add is not None:
            params += self.sign_add.get_params(**tags)

        return params
Example #10
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} &\propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, eg. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If
        ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.
                 rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller,
                                   num_units=self.memory_shape[1],
                                   W=W_hid_to_sign,
                                   b=b_hid_to_sign,
                                   nonlinearity=nonlinearity_sign,
                                   name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_key,
                              b=b_hid_to_key,
                              nonlinearity=nonlinearity_key,
                              name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b

        self.beta = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_beta,
                               b=b_hid_to_beta,
                               nonlinearity=nonlinearity_beta,
                               name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_gate,
                               b=b_hid_to_gate,
                               nonlinearity=nonlinearity_gate,
                               name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller,
                                num_units=num_shifts,
                                W=W_hid_to_shift,
                                b=b_hid_to_shift,
                                nonlinearity=nonlinearity_shift,
                                name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller,
                                num_units=1,
                                W=W_hid_to_gamma,
                                b=b_hid_to_gamma,
                                nonlinearity=nonlinearity_gamma,
                                name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(weights_init,
                                           (1, self.memory_shape[0]),
                                           name='weights_init',
                                           trainable=learn_init,
                                           regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(
            w_g_padded,
            conv_filter,
            input_shape=(self.input_shape[0], 1, 1,
                         self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        w /= T.sum(w, axis=1, keepdims=True)  # normalize each weight vector separately

        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)

        return params