def test_get_all_params(self):
    from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
    l1 = InputLayer((10, 20))
    l2 = DenseLayer(l1, 30)
    l3 = DenseLayer(l2, 40)

    assert get_all_params(l3) == l2.get_params() + l3.get_params()
    assert (get_all_params(l3, regularizable=False) ==
            (l2.get_params(regularizable=False) +
             l3.get_params(regularizable=False)))
    assert (get_all_params(l3, regularizable=True) ==
            (l2.get_params(regularizable=True) +
             l3.get_params(regularizable=True)))
import pickle

import numpy as np
import theano
import theano.tensor as T

import lasagne
# Layer aliases reconstructed as in the Lasagne model zoo's vgg_cnn_s recipe.
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import LocalResponseNormalization2DLayer as NormLayer
from lasagne.layers import MaxPool2DLayer as PoolLayer
from lasagne.objectives import categorical_crossentropy
from lasagne.regularization import regularize_network_params, l2
from lasagne.updates import nesterov_momentum
from lasagne.utils import floatX

# `resize`, `batch_predict` and `iterate_minibatches` (and the optional
# `augment_data`) are project helpers assumed to be defined elsewhere.


class PretrainedNetwork:
    def __init__(self, load=True):
        # Architecture (VGG CNN-S)
        net = {}
        net['input'] = InputLayer((None, 3, 224, 224))
        net['conv1'] = ConvLayer(net['input'], num_filters=96, filter_size=7,
                                 stride=2, flip_filters=False)
        # caffe has alpha = alpha * pool_size
        net['norm1'] = NormLayer(net['conv1'], alpha=0.0001)
        net['pool1'] = PoolLayer(net['norm1'], pool_size=3, stride=3,
                                 ignore_border=False)
        net['conv2'] = ConvLayer(net['pool1'], num_filters=256, filter_size=5,
                                 flip_filters=False)
        net['pool2'] = PoolLayer(net['conv2'], pool_size=2, stride=2,
                                 ignore_border=False)
        net['conv3'] = ConvLayer(net['pool2'], num_filters=512, filter_size=3,
                                 pad=1, flip_filters=False)
        net['conv4'] = ConvLayer(net['conv3'], num_filters=512, filter_size=3,
                                 pad=1, flip_filters=False)
        net['conv5'] = ConvLayer(net['conv4'], num_filters=512, filter_size=3,
                                 pad=1, flip_filters=False)
        net['pool5'] = PoolLayer(net['conv5'], pool_size=3, stride=3,
                                 ignore_border=False)
        net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
        net['drop6'] = DropoutLayer(net['fc6'], p=0.5)
        net['fc7'] = DenseLayer(net['drop6'], num_units=4096)
        net['drop7'] = DropoutLayer(net['fc7'], p=0.5)
        net['fc8'] = DenseLayer(net['drop7'], num_units=1000,
                                nonlinearity=lasagne.nonlinearities.softmax)
        self.output_layer = net['fc8']
        self.net = net
        if load:
            self.load_weights()
        # Compile lazily
        self.predict_fn = None
        self.predict_fns = {}
        self.train_fn = {}
        self.lr = theano.shared(np.array(1e-2, dtype=np.float32))
        self.regularizer_amount = theano.shared(
            np.array(4e-5, dtype=np.float32))

    def get_output_fn(self, layer):
        input_var = self.net['input'].input_var
        out = lasagne.layers.get_output(layer, deterministic=True)
        return theano.function([input_var], out)

    def add_output_layer(self, num_units, after='drop7'):
        self.output_layer = DenseLayer(
            self.net[after], num_units=num_units,
            nonlinearity=lasagne.nonlinearities.softmax)
        self.predict_fn = None
        self.train_fn = {}

    def load_weights(self):
        # weights
        with open('/home/twanvl/test/vgg_cnn_s.pkl', 'rb') as file:
            model = pickle.load(file, encoding='latin1')
        self.classes = model['synset words']
        self.mean_image = model['mean image']
        lasagne.layers.set_all_param_values(self.output_layer,
                                            model['values'])

    def save_weights_np(self, filename):
        np.savez(filename,
                 *lasagne.layers.get_all_param_values(self.output_layer),
                 mean_image=self.mean_image)

    def load_weights_np(self, filename):
        params = lasagne.layers.get_all_params(self.output_layer)
        with np.load(filename) as f:
            param_values = [f['arr_%d' % i] for i in range(len(params))]
            self.mean_image = f['mean_image']
        lasagne.layers.set_all_param_values(self.output_layer, param_values)

    def preprocess_many(self, ims, **kwargs):
        # Preprocess a list of images
        return np.array([self.preprocess(x, many=True, **kwargs)
                         for x in ims])

    def preprocess(self, im, many=False, crop_h=0.5, crop_w=0.5, flip=False,
                   size=256, smallest=True, random=False):
        # Preprocess an image:
        # resize so the smallest/largest dim = `size`, preserving aspect ratio
        im = resize(im, size, smallest)
        # Crop to 224x224 (central by default, random for augmentation)
        h, w, _ = im.shape
        if random:
            y0 = np.random.randint(h - 224)
            x0 = np.random.randint(w - 224)
            flip = np.random.randint(2)
        else:
            y0 = int((h - 224) * crop_h)
            x0 = int((w - 224) * crop_w)
        im = im[y0:y0 + 224, x0:x0 + 224]
        # Flip horizontally?
        if flip:
            im = im[:, ::-1]
        if not many:
            rawim = np.copy(im).astype('uint8')
        # Shuffle axes to c01
        im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1)
        # Convert RGB to BGR
        im = im[::-1, :, :]
        # Subtract the mean image
        im = im - self.mean_image
        if many:
            return floatX(im)
        else:
            return rawim, floatX(im[np.newaxis])

    def classify(self, im, preprocess=False, **kwargs):
        if preprocess:
            im = self.preprocess_many(im, **kwargs)
        if self.predict_fn is None:
            self.predict_fn = self.get_output_fn(self.output_layer)
        prob = batch_predict(self.predict_fn, im)
        return np.array(np.argmax(prob, axis=1), dtype=np.int32)

    def classify_test(self, im, **kwargs):
        # Run a test of the classifier, output a nice looking matplotlib figure
        rawim, im = self.preprocess(im, **kwargs)
        # prob = np.array(lasagne.layers.get_output(
        #     self.output_layer, im, deterministic=True).eval())
        if self.predict_fn is None:
            self.predict_fn = self.get_output_fn(self.output_layer)
        prob = np.array(self.predict_fn(im))
        top5 = np.argsort(prob[0])[-1:-6:-1]
        import matplotlib.pyplot as plt
        plt.figure()
        plt.imshow(rawim.astype('uint8'))
        plt.axis('off')
        for n, label in enumerate(top5):
            plt.text(250, 70 + n * 20,
                     '{}. {}'.format(n + 1, self.classes[label]), fontsize=14)

    def get_features(self, im, layer, preprocess=False):
        if layer not in self.predict_fns:
            self.predict_fns[layer] = self.get_output_fn(self.net[layer])
        # apply
        if preprocess:
            rawim, im = self.preprocess(im)
        return batch_predict(self.predict_fns[layer], im)

    def get_train_fn(self, last_only=False):
        input_var = self.net['input'].input_var
        target_var = T.ivector('targets')
        prediction = lasagne.layers.get_output(self.output_layer)
        loss = categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        error = T.mean(T.neq(T.argmax(prediction, axis=1), target_var),
                       dtype=theano.config.floatX)
        regularization = self.regularizer_amount * regularize_network_params(
            self.output_layer, l2)
        if last_only:
            all_params = self.output_layer.get_params(trainable=True)
        else:
            all_params = lasagne.layers.get_all_params(self.output_layer,
                                                       trainable=True)
        updates = nesterov_momentum(loss + regularization, all_params,
                                    learning_rate=self.lr)
        return theano.function([input_var, target_var], (loss, error),
                               updates=updates)

    def train(self, x, y, num_epochs=50, learning_rate=1e-3, batchsize=128,
              regularizer_amount=5e-4, preprocess=False, last_only=False):
        if last_only not in self.train_fn:
            self.train_fn[last_only] = self.get_train_fn(last_only)
        train_fn = self.train_fn[last_only]
        self.regularizer_amount.set_value(np.float32(regularizer_amount))
        augment = None  # alternatively: augment = augment_data
        for epoch in range(num_epochs):
            # Step-wise learning rate schedule
            if epoch < 0.8 * num_epochs:
                lr = learning_rate
            elif epoch < 0.9 * num_epochs:
                lr = learning_rate / 10
            else:
                lr = learning_rate / 100
            self.lr.set_value(np.float32(lr))
            loss = 0
            err = 0
            n = 0
            for batch_x, batch_y in iterate_minibatches(
                    x, y, batchsize=batchsize, shuffle=True, augment=augment):
                if preprocess:
                    batch_x = self.preprocess_many(batch_x, random=True)
                l, e = train_fn(batch_x, batch_y)
                loss += l
                err += e
                n += 1
            print("  {:3} / {:3}: loss={:6.3f}, error={:5.3f}  ".format(
                epoch, num_epochs, loss / n, err / n), end='\r')
            if epoch % 10 == 9:
                print()
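
# Usage sketch (illustrative, not from the original source): classify one
# image and fine-tune on a new task. `skimage.io` and the image file name are
# assumptions; any HxWx3 uint8 RGB array works as input.
def demo_pretrained_network():
    import skimage.io
    net = PretrainedNetwork(load=True)        # loads the vgg_cnn_s.pkl weights
    im = skimage.io.imread('example.jpg')     # hypothetical input image
    rawim, batch = net.preprocess(im)         # center crop -> (1, 3, 224, 224)
    labels = net.classify(batch)              # array of predicted class indices
    print(net.classes[labels[0]])
    # Fine-tuning: replace the 1000-way classifier and train only the new layer.
    # net.add_output_layer(num_units=10)
    # net.train(train_x, train_y, num_epochs=20, last_only=True, preprocess=True)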
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

import lasagne
from lasagne.layers import InputLayer, LSTMLayer, ReshapeLayer, DenseLayer
from lasagne.nonlinearities import rectify
from lasagne.updates import adam


class PRAE:
    def __init__(self, num_batch, max_len, n_features, hidden=[200, 200],
                 **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2 ** 30))

        # params (only needed by get_output_y; kept for reference)
        # initial_W = np.asarray(
        #     rng.uniform(low=1e-5, high=1,
        #                 size=(self.hidden[1], self.n_features)),
        #     dtype=theano.config.floatX)
        # self.W_y_theta = theano.shared(value=initial_W, name='W_y_theta',
        #                                borrow=True)
        # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa',
        #                                borrow=True)
        # self.b_y_theta = theano.shared(
        #     value=np.zeros(self.n_features, dtype=theano.config.floatX),
        #     borrow=True)
        # self.b_y_kappa = theano.shared(
        #     value=np.zeros(self.n_features, dtype=theano.config.floatX),
        #     name='b', borrow=True)

        # The model is created directly here since it is fixed.
        self.l_in = InputLayer(shape=(self.num_batch, self.max_len,
                                      self.n_features))
        self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
        first_hidden = LSTMLayer(self.l_in, mask_input=self.mask_input,
                                 num_units=hidden[0], nonlinearity=rectify)
        second_hidden = LSTMLayer(first_hidden, num_units=hidden[1],
                                  nonlinearity=rectify)
        # Reshape voodoo: after the reshape the shape is
        # (batch * max_len, features), so the DenseLayer below is applied at
        # every time step; the dimensions are restored in build_model.
        l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
        self.model = DenseLayer(l_shp, num_units=self.n_features,
                                nonlinearity=rectify)
        # For every gaussian in the sum we need 3 values plus a value for the
        # total scale; the output of this layer will be
        # (num_batch, num_units, max_len).  TODO check size

    def get_output_shape_for(self):
        # The DenseLayer output is reshaped back to 3D in build_model.
        # (The original returned self.hidden[2], which does not exist for the
        # default two-element `hidden` list.)
        return (self.num_batch, self.max_len, self.n_features)

    def get_output_y(self, output):
        # (batch, time, hidden) x (hidden, features) + (features,)
        #   => (batch, time, features)
        # Requires the commented-out W_y_theta/b_y_theta parameters above.
        theta_out = T.nnet.relu(T.dot(output, self.W_y_theta) + self.b_y_theta)
        # kappa_out = T.nnet.relu(T.dot(output, self.W_y_kappa) + self.b_y_kappa)
        return theta_out

    def get_log_x(self, x, theta_out):
        # DIM = (batch, time, features)
        # (kappa-1)*log(x) + x/theta - log(gamma(kappa)) - kappa*log(theta),
        # everything elementwise
        log_x = T.log(theta_out + 1e-8) - theta_out * x
        log_x = log_x.sum(axis=2, dtype=theano.config.floatX)
        return log_x

    def build_model(self, train_x, train_mask_x, train_mask_out, train_target,
                    test_x, test_mask_x, test_mask_out, test_target):
        self.train_x = train_x
        self.train_mask_x = train_mask_x
        self.train_mask_out = train_mask_out
        self.train_target = train_target
        self.test_x = test_x
        self.test_mask_x = test_mask_x
        self.test_mask_out = test_mask_out
        self.test_target = test_target

        self.index = T.iscalar('index')
        self.num_batch_test = T.iscalar('num_batch_test')
        self.b_slice = slice(self.index * self.num_batch,
                             (self.index + 1) * self.num_batch)

        sym_x = T.dtensor3()
        sym_mask_x = T.dmatrix()
        sym_target = T.dtensor3()
        # sym_mask_out = T.dtensor3() should not be needed since the output
        # is still zero there.  TODO think about whether this is true.

        theta = lasagne.layers.get_output(
            self.model, inputs={self.l_in: sym_x,
                                self.mask_input: sym_mask_x})
        theta = T.reshape(theta,
                          (self.num_batch, self.max_len, self.n_features))
        log_px = self.get_log_x(sym_target, theta)
        log_px_sum_time = log_px.sum(axis=1,
                                     dtype=theano.config.floatX)  # sum over time
        loss = -T.sum(log_px_sum_time) / self.num_batch  # average over batch

        theta_test = T.reshape(
            theta, (self.num_batch_test, self.max_len, self.n_features))
        log_px_test = self.get_log_x(sym_target, theta_test)
        log_px_sum_time_test = log_px_test.sum(
            axis=1, dtype=theano.config.floatX)  # sum over time
        loss_test = -T.sum(
            log_px_sum_time_test) / self.num_batch_test  # average over batch
        # loss = T.mean(lasagne.objectives.squared_error(mu, sym_target))

        all_params = self.model.get_params()
        print(len(all_params))
        all_grads_target = [T.clip(g, -10, 10)
                            for g in T.grad(loss, all_params)]
        all_grads_target = lasagne.updates.total_norm_constraint(
            all_grads_target, 10)
        updates_target = adam(all_grads_target, all_params)

        train_model = theano.function(
            [self.index], [loss, theta, log_px],
            givens={sym_x: self.train_x[self.b_slice],
                    sym_mask_x: self.train_mask_x[self.b_slice],
                    sym_target: self.train_target[self.b_slice]},
            updates=updates_target)

        test_model = theano.function(
            [self.num_batch_test], [loss_test, theta_test],
            givens={sym_x: self.test_x,
                    sym_mask_x: self.test_mask_x,
                    sym_target: self.test_target})

        return train_model, test_model
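
# Usage sketch (illustrative, not from the original source): build the PRAE on
# random data and run one training and one test step. All shapes, the use of
# all-ones masks, and the float64 dtype (to match dtensor3/dmatrix above) are
# assumptions for the sake of a self-contained example.
def demo_prae():
    num_batch, max_len, n_features = 4, 20, 5
    prae = PRAE(num_batch, max_len, n_features)
    rng = np.random.RandomState(0)

    def shared(a):
        return theano.shared(np.asarray(a, dtype='float64'))

    train_fn, test_fn = prae.build_model(
        shared(rng.rand(8, max_len, n_features)),  # train_x (two minibatches)
        shared(np.ones((8, max_len))),             # train_mask_x
        None,                                      # train_mask_out (unused here)
        shared(rng.rand(8, max_len, n_features)),  # train_target
        shared(rng.rand(4, max_len, n_features)),  # test_x (one batch)
        shared(np.ones((4, max_len))),             # test_mask_x
        None,                                      # test_mask_out (unused here)
        shared(rng.rand(4, max_len, n_features)))  # test_target
    loss, theta, log_px = train_fn(0)              # one update on minibatch 0
    test_loss, _ = test_fn(4)                      # num_batch_test = 4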
import numpy as np
import theano.tensor as T

import lasagne
from lasagne.layers import Layer, DenseLayer
from lasagne.theano_extensions import padding

# `nonlinearities`, `init` and `similarities` are the NTM project's own
# modules (providing ClippedLinear, hard_sigmoid, OneHot and
# cosine_similarity); they are assumed importable alongside this snippet.
import init
import nonlinearities
import similarities


# NOTE: WriteHead extends the base class Head, defined further below.
class WriteHead(Head):
    r"""
    Write head.

    In addition to the weight vector, the write head also outputs an add
    vector :math:`a_{t}` and an erase vector :math:`e_{t}` defined by

    .. math ::
        \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\
        a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a})\\
        e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e})

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory.
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`.
    W_hid_to_erase: callable, Numpy array or Theano shared variable
    b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_erase: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`e_{t}`.
    W_hid_to_add: callable, Numpy array or Theano shared variable
    b_hid_to_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`a_{t}`.
    W_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\delta_{t}`.
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """

    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1., high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(
            controller, num_shifts=num_shifts, memory_shape=memory_shape,
            W_hid_to_sign=W_hid_to_sign, b_hid_to_sign=b_hid_to_sign,
            nonlinearity_sign=nonlinearity_sign,
            W_hid_to_key=W_hid_to_key, b_hid_to_key=b_hid_to_key,
            nonlinearity_key=nonlinearity_key,
            W_hid_to_beta=W_hid_to_beta, b_hid_to_beta=b_hid_to_beta,
            nonlinearity_beta=nonlinearity_beta,
            W_hid_to_gate=W_hid_to_gate, b_hid_to_gate=b_hid_to_gate,
            nonlinearity_gate=nonlinearity_gate,
            W_hid_to_shift=W_hid_to_shift, b_hid_to_shift=b_hid_to_shift,
            nonlinearity_shift=nonlinearity_shift,
            W_hid_to_gamma=W_hid_to_gamma, b_hid_to_gamma=b_hid_to_gamma,
            nonlinearity_gamma=nonlinearity_gamma,
            weights_init=weights_init, learn_init=learn_init, **kwargs)

        self.erase = DenseLayer(controller,
                                num_units=self.memory_shape[1],
                                W=W_hid_to_erase, b=b_hid_to_erase,
                                nonlinearity=nonlinearity_erase,
                                name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_add, b=b_hid_to_add,
                              nonlinearity=nonlinearity_add,
                              name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller,
                                       num_units=self.memory_shape[1],
                                       W=W_hid_to_sign_add,
                                       b=b_hid_to_sign_add,
                                       nonlinearity=nonlinearity_sign_add,
                                       name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = \
                self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None

    def get_params(self, **tags):
        params = super(WriteHead, self).get_params(**tags)
        params += self.erase.get_params(**tags)
        params += self.add.get_params(**tags)
        if self.sign_add is not None:
            params += self.sign_add.get_params(**tags)
        return params
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the Neural
    Turing Machine. The heads are responsible for the read/write operations
    on the memory. An instance of :class:`Head` outputs a weight vector
    defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} &\propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory.
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix with
        shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`.
        If ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`.
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """

    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller,
                                   num_units=self.memory_shape[1],
                                   W=W_hid_to_sign, b=b_hid_to_sign,
                                   nonlinearity=nonlinearity_sign,
                                   name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_key, b=b_hid_to_key,
                              nonlinearity=nonlinearity_key,
                              name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b

        self.beta = DenseLayer(controller, num_units=1,
                               W=W_hid_to_beta, b=b_hid_to_beta,
                               nonlinearity=nonlinearity_beta,
                               name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller, num_units=1,
                               W=W_hid_to_gate, b=b_hid_to_gate,
                               nonlinearity=nonlinearity_gate,
                               name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller, num_units=num_shifts,
                                W=W_hid_to_shift, b=b_hid_to_shift,
                                nonlinearity=nonlinearity_shift,
                                name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller, num_units=1,
                                W=W_hid_to_gamma, b=b_hid_to_gamma,
                                nonlinearity=nonlinearity_gamma,
                                name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(
            weights_init, (1, self.memory_shape[0]),
            name='weights_init', trainable=learn_init, regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(
            w_g_padded, conv_filter,
            input_shape=(self.input_shape[0], 1, 1,
                         self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        w /= T.sum(w)
        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)
        return params
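
# Worked example (not from the original source): the addressing pipeline from
# the docstring, written out in numpy for a single batch element. The circular
# roll is a simplified stand-in for the zero-padded convolution above, and all
# parameter values are illustrative.
def demo_addressing():
    rng = np.random.RandomState(0)
    N, M = 8, 4                                   # memory_shape
    memory = rng.rand(N, M)
    k = rng.rand(M)                               # key k_t
    beta, g, gamma = 5., 0.9, 2.                  # focus, gate, sharpening
    w_prev = np.full(N, 1. / N)                   # previous weights w_{t-1}
    s = np.array([0.1, 0.8, 0.1])                 # shift weights for [-1, 0, 1]
    # Content addressing: w_c = softmax(beta * cosine_similarity(k, M))
    cos = memory.dot(k) / (np.linalg.norm(memory, axis=1)
                           * np.linalg.norm(k) + 1e-8)
    w_c = np.exp(beta * cos) / np.sum(np.exp(beta * cos))
    # Interpolation with the previous weights
    w_g = g * w_c + (1. - g) * w_prev
    # Convolutional shift
    w_tilde = sum(s[j] * np.roll(w_g, j - 1) for j in range(len(s)))
    # Sharpening and renormalization
    w = (w_tilde + 1e-6) ** gamma
    return w / w.sum()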
import numpy as np
import theano
import theano.tensor as T

import lasagne
from lasagne.layers import InputLayer, DropoutLayer, DenseLayer


def dae_0419(input=None, n_vis=784, n_hid=100, p_drop=0.,
             encoder_nonlin=lasagne.nonlinearities.sigmoid,
             decoder_nonlin=lasagne.nonlinearities.sigmoid):
    """Create a denoising autoencoder with tied weights.

    http://benanne.github.io/2015/11/10/arbitrary-expressions-as-params.html

    >>> network, encoder_fn, decoder_fn, output_fn = dae_0419(...)

    Update 04/26 - allowed optional input to specify the encoder and decoder
    nonlinearity.

    Parameters
    ----------
    input : theano.tensor.TensorType (default=None)
        A symbolic description of the input. If ``None``, a symbolic
        variable will be created internally.
    n_vis : int
        Number of visible units (input units).
    n_hid : int
        Number of hidden units.
    p_drop : float
        Probability of setting an input unit to zero ("masking" noise)
        (note: implemented via ``DropoutLayer``).

    Returns
    -------
    l_output : lasagne.layers.DenseLayer
        The output (reconstruction) layer.
    encoder_fn : theano.function
        Deterministic encoder function (input -> hidden representation).
    decoder_fn : theano.function
        Decoder function (hidden representation -> reconstruction).
    output_fn : theano.function
        Composition of the encoder and decoder (input -> reconstruction).

    Dev
    ---
    - ``t_0419_decoder_func.py``
    - ``t_0419_decoder_func2.py``

    History
    -------
    Created 04/19/2016. Difference from the 0418 version: added the decoder
    function as an output.
    """
    if input is None:
        input = T.matrix('input')

    # input layer
    l_input = InputLayer((None, n_vis), input_var=input, name='input')
    if p_drop != 0:
        # with rescale off this is not dropout, but "masking" noise for the DAE
        l_input = DropoutLayer(l_input, p=p_drop, rescale=False,
                               name='input_drop')

    # l_hidden and l_output share the same weight matrix!
    l_hidden = DenseLayer(l_input, n_hid, name='hidden',
                          nonlinearity=encoder_nonlin)
    l_output = DenseLayer(l_hidden, n_vis, name='output', W=l_hidden.W.T,
                          nonlinearity=decoder_nonlin)

    # === deterministic encoder function === #
    # (deterministic=True disables the Dropout/masking noise)
    encoder_tn = lasagne.layers.get_output(l_hidden, deterministic=True)
    encoder_fn = theano.function([input], encoder_tn)

    # === decoder function (new in the 04/19 version) === #
    # theano symvar for the hidden unit representation
    hid = T.matrix('hid')
    W_out, b_out = l_output.get_params()
    decoder_tn = l_output.nonlinearity(hid.dot(W_out.T) + b_out)
    decoder_fn = theano.function([hid], decoder_tn)

    # === output function (new in the 04/19 version) === #
    # Note: this outputs the same thing as decoder_fn, but takes the original
    # features as input (a composition of the encoding/decoding operations).
    output_tn = lasagne.layers.get_output(l_output, deterministic=True)
    output_fn = theano.function([input], output_tn)

    return l_output, encoder_fn, decoder_fn, output_fn
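
# Usage sketch (illustrative, not from the original source): a round trip
# through the autoencoder on random data; shapes follow the MNIST-sized
# defaults above.
def demo_dae():
    network, encoder_fn, decoder_fn, output_fn = dae_0419(n_vis=784,
                                                          n_hid=100,
                                                          p_drop=0.3)
    x = np.random.rand(5, 784).astype(theano.config.floatX)
    h = encoder_fn(x)                 # (5, 100) hidden representation
    x_rec = decoder_fn(h)             # (5, 784) reconstruction
    # decoder(encoder(x)) and output(x) compute the same deterministic graph
    print(np.abs(x_rec - output_fn(x)).max())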
# Tail of the training loop: `smooth_train_loss`, `batch_train_loss`,
# `iter_n`, `data_iter`, `val_fn`, `dram` and `y_hat` are defined by the
# surrounding training script.
smooth_train_loss = 0.95 * smooth_train_loss + 0.05 * batch_train_loss
print('iter: ', iter_n, '\t training loss:', smooth_train_loss)

if iter_n % 100 == 0:
    X_val, y_val = data_iter.fetch_validation()
    val_loss, val_acc = val_fn(X_val, y_val)
    print("====" * 20)
    print("validation loss: \t", val_loss)
    print("validation accuracy: \t", val_acc)
    print("====" * 20)

print("... training done")

print("... serializing model")
import pickle  # cPickle on Python 2

params = []
params.extend(dram.get_params())
params.extend(y_hat.get_params())
np_params = [param.get_value() for param in params]

with open('params.model', 'wb') as f:  # binary mode is required for pickle
    pickle.dump(np_params, f)
print("... done serializing model")

print("... exiting ...")
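
# Loading sketch (not from the original source): restore the serialized
# parameters. Assumes `dram` and `y_hat` have been rebuilt with the same
# architecture, so get_params() yields the parameters in the same order as
# when they were saved above.
def load_params(filename='params.model'):
    with open(filename, 'rb') as f:
        np_params = pickle.load(f)
    params = dram.get_params() + y_hat.get_params()
    for param, value in zip(params, np_params):
        param.set_value(value)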