def get_output_for(self, input, **kwargs):
    """Pool ``input``, emulating 'strictsamex' padding when requested.

    When ``self.pad`` is ``'strictsamex'`` axis 2 of ``input`` is padded on
    its leading side only, pooling then runs with no padding of its own, and
    the pooled result is cut back along axis 2 to at most the length that
    axis had before padding.  Any other ``self.pad`` value is handed to
    ``pool.pool_2d`` unchanged.

    Side effect: the 'strictsamex' path sets ``self.ignore_border = True``
    on the layer.
    """
    strict = self.pad == 'strictsamex'

    if strict:
        assert (self.stride[0] == 1)
        # Leading pad width: half of the pooling window, rounded up.
        lead = int(np.ceil(self.pool_size[0] / 2.))
        length = input.shape[2]
        self.ignore_border = True
        input = padding.pad(input, [(lead, 0)], batch_ndim=2)
        # The input is already padded, so pool_2d must not pad again.
        pool_padding = (0, 0)
    else:
        pool_padding = self.pad

    pooled = pool.pool_2d(
        input,
        ds=self.pool_size,
        st=self.stride,
        ignore_border=self.ignore_border,
        padding=pool_padding,
        mode=self.mode,
    )

    if strict:
        pooled = pooled[:, :, :length or None, :]
    return pooled
def get_output_for(self, input, input_shape=None, **kwargs):
    """Convolve ``input`` with ``self.W``, then apply bias and nonlinearity.

    ``input_shape`` may be supplied when the method is called directly with
    a shape other than ``self.input_shape``; it defaults to the latter.

    ``self.pad`` selects the border handling:

    * ``'full'``  -- full convolution, no explicit padding;
    * ``'same'``  -- with stride ``(1, 1)`` a full convolution is cropped,
      otherwise the input is padded by ``filter_size // 2`` on both sides
      of each spatial axis;
    * ``'strictsamex'`` -- only the first spatial axis is padded, by
      ``filter_size[0] - 1`` in total with the larger half in front;
    * anything else -- read as a pair of symmetric per-axis pad widths.
    """
    shape = self.input_shape if input_shape is None else input_shape

    if self.stride == (1, 1) and self.pad == 'same':
        # Unit stride: run a full convolution and trim the borders.
        full = self.convolution(input, self.W, subsample=self.stride,
                                input_shape=shape,
                                filter_shape=self.get_W_shape(),
                                border_mode='full')
        trim_0 = self.filter_size[0] // 2
        trim_1 = self.filter_size[1] // 2
        conved = full[:, :, trim_0:-trim_0 or None, trim_1:-trim_1 or None]
    else:
        # Either no padding is needed or the input is padded explicitly.
        border_mode = 'valid'
        if self.pad == 'full':
            border_mode = 'full'
            pad = [(0, 0), (0, 0)]
        elif self.pad == 'same':
            half_0 = self.filter_size[0] // 2
            half_1 = self.filter_size[1] // 2
            pad = [(half_0, half_0), (half_1, half_1)]
        elif self.pad == 'strictsamex':
            total = self.filter_size[0] - 1
            back = total // 2
            pad = [(total - back, back), (0, 0)]
        else:
            pad = [(self.pad[0], self.pad[0]),
                   (self.pad[1], self.pad[1])]

        if pad != [(0, 0), (0, 0)]:
            input = padding.pad(input, pad, batch_ndim=2)

            def grown(size, amount):
                # Unknown (None) sizes stay unknown after padding.
                if size is None:
                    return None
                return size + amount[0] + amount[1]

            shape = (shape[0], shape[1],
                     grown(shape[2], pad[0]),
                     grown(shape[3], pad[1]))

        conved = self.convolution(input, self.W, subsample=self.stride,
                                  input_shape=shape,
                                  filter_shape=self.get_W_shape(),
                                  border_mode=border_mode)

    if self.b is None:
        return self.nonlinearity(conved)

    # Untied biases have one value per output position, tied ones one
    # value per output channel.
    if self.untie_biases:
        bias = self.b.dimshuffle('x', 0, 1, 2)
    else:
        bias = self.b.dimshuffle('x', 0, 'x', 'x')
    return self.nonlinearity(conved + bias)
def get_output_for(self, input, **kwargs):
    """Run 2D pooling, with special handling for ``pad == 'strictsamex'``.

    For every other ``self.pad`` value the call is forwarded to
    ``pool.pool_2d`` with ``padding=self.pad``.  For ``'strictsamex'`` the
    input is first padded in front of axis 2, pooled without further
    padding, and the result is sliced along axis 2 to at most the original
    length of that axis.  That path also sets ``self.ignore_border`` to
    ``True`` on the layer.
    """
    if self.pad != 'strictsamex':
        return pool.pool_2d(
            input,
            ds=self.pool_size,
            st=self.stride,
            ignore_border=self.ignore_border,
            padding=self.pad,
            mode=self.mode,
        )

    assert (self.stride[0] == 1)
    window = self.pool_size[0]
    front = int(np.ceil(window / 2.))
    length = input.shape[2]
    self.ignore_border = True
    padded = padding.pad(input, [(front, 0)], batch_ndim=2)

    pooled = pool.pool_2d(
        padded,
        ds=self.pool_size,
        st=self.stride,
        ignore_border=self.ignore_border,
        padding=(0, 0),
        mode=self.mode,
    )
    return pooled[:, :, :length or None, :]
def test_pad(batch_ndim, val, width=3):
    """Compare the Theano ``pad`` with ``np.pad`` for one uniform width."""
    from lasagne.theano_extensions.padding import pad

    data = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
    symbolic = T.tensor4()
    padded_theano = pad(symbolic, width, val, batch_ndim).eval(
        {symbolic: data})

    # Axes in front of ``batch_ndim`` are expected to stay unpadded.
    np_widths = tuple((0, 0) if axis < batch_ndim else (width, width)
                      for axis in range(len(data.shape)))
    padded_np = np.pad(data, np_widths, mode="constant",
                       constant_values=val)

    assert (padded_theano == padded_np).all()
def test_pad(val, width=3, batch_ndim=2):
    """Check that ``pad`` agrees with ``np.pad`` on a 4D block of ones."""
    from lasagne.theano_extensions.padding import pad

    x_sym = T.tensor4()
    x_val = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
    result = pad(x_sym, width, val, batch_ndim).eval({x_sym: x_val})

    # Build the numpy pad spec: zero width on the first ``batch_ndim``
    # axes, ``width`` on both sides of all remaining axes.
    reference_widths = []
    for axis, _ in enumerate(x_val.shape):
        if axis >= batch_ndim:
            reference_widths.append((width, width))
        else:
            reference_widths.append((0, 0))
    reference = np.pad(x_val, tuple(reference_widths), mode='constant',
                       constant_values=val)

    assert (result == reference).all()
def step(input_n, cell_previous, hid_previous, *args):
    """Advance the recurrence by one step and return ``[cell, hid]``.

    The four gate pre-activations are the sum of an input term and a
    convolution of the previous hidden state with ``W_hid_stacked``.  When
    ``self.precompute_input`` is false the input term is computed here by
    convolving ``input_n`` with ``W_in_stacked`` and adding ``b_stacked``;
    otherwise ``input_n`` is used as given.  Extra positional ``args`` are
    accepted and ignored.
    """
    if self.precompute_input:
        input_term = input_n
    else:
        input_term = T.nnet.conv2d(input_n, W_in_stacked, None, None,
                                   subsample=(1, 1), border_mode='half',
                                   filter_flip=False) + b_stacked

    # The previous hidden state is padded in front before the 'valid'
    # convolution.  NOTE(review): presumably this makes the recurrent
    # convolution causal along that axis -- confirm against `pad` and
    # `conv1d_mc1`.
    hid_padded = pad(hid_previous, [(1, 0)], 0, 2)
    gates = input_term + conv1d_mc1(hid_padded, W_hid_stacked, None, None,
                                    subsample=(1, ), border_mode='valid',
                                    filter_flip=False)

    # Optional gradient clipping on the stacked pre-activations.
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(
            gates, -self.grad_clipping, self.grad_clipping)

    # Slices 0..3 hold, in order: input gate, forget gate, cell candidate
    # and output gate.
    ingate, forgetgate, cell_input, outgate = (
        slice_w(gates, index) for index in range(4))

    if self.peepholes:
        # Peephole connections from the previous cell state.
        ingate += cell_previous * self.W_cell_to_ingate
        forgetgate += cell_previous * self.W_cell_to_forgetgate

    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)

    # New cell state: retained old state plus gated candidate.
    cell = forgetgate * cell_previous + ingate * cell_input

    if self.peepholes:
        # The output-gate peephole looks at the *new* cell state.
        outgate += cell * self.W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    hid = outgate * self.nonlinearity(cell)
    return [cell, hid]
def get_output_for(self, input, input_shape=None, **kwargs):
    """Apply the 3D convolution, bias and nonlinearity to ``input``.

    ``input`` is treated as a 5D tensor whose axes 2, 3 and 4 are the
    convolved axes (they are the ones cropped and padded below).

    ``self.pad`` selects the border handling:

    * ``'same'`` with stride ``(1, 1, 1)`` -- a full convolution is computed
      and cropped back to the input extent;
    * ``'full'`` -- full convolution, no explicit padding;
    * ``'same'`` with any other stride -- the input is padded by
      ``filter_size // 2`` on both sides of each convolved axis;
    * anything else -- read as a triple of symmetric pad widths.

    ``input_shape`` is accepted for interface compatibility only; the
    convolution call used here takes no shape hints, so it is not read.

    Fixes relative to the previous revision:

    * Explicit padding used ``batch_ndim=3`` with three width pairs, which
      addresses axes 3, 4 and 5 of a 5D tensor; the padded axes are 2, 3
      and 4, so ``batch_ndim=2`` is used.
    * The padded path indexed ``input_shape`` although it defaults to
      ``None``, and the shape it computed was never used; that bookkeeping
      is removed.
    """
    if self.stride == (1, 1, 1) and self.pad == 'same':
        # Simulate 'same' by cropping a full convolution.
        conved = self.convolution(img=input, kerns=self.W,
                                  border_mode='full',
                                  subsample=self.stride,
                                  conv_mode='conv')
        shift_x = (self.filter_size[0] - 1) // 2
        shift_y = (self.filter_size[1] - 1) // 2
        shift_z = (self.filter_size[2] - 1) // 2
        conved = conved[:, :,
                        shift_x:input.shape[2] + shift_x,
                        shift_y:input.shape[3] + shift_y,
                        shift_z:input.shape[4] + shift_z]
    else:
        no_pad = [(0, 0), (0, 0), (0, 0)]
        border_mode = 'valid'
        if self.pad == 'full':
            border_mode = 'full'
            pad = no_pad
        elif self.pad == 'same':
            pad = [(size // 2, size // 2)
                   for size in (self.filter_size[0],
                                self.filter_size[1],
                                self.filter_size[2])]
        else:
            pad = [(self.pad[0], self.pad[0]),
                   (self.pad[1], self.pad[1]),
                   (self.pad[2], self.pad[2])]

        if pad != no_pad:
            # Two leading axes are left alone; the three width pairs apply
            # to axes 2, 3 and 4.
            input = padding.pad(input, pad, batch_ndim=2)

        conved = self.convolution(img=input, kerns=self.W,
                                  border_mode=border_mode,
                                  subsample=self.stride,
                                  conv_mode='conv')

    if self.b is None:
        activation = conved
    elif self.untie_biases:
        # One bias per output position.
        activation = conved + self.b.dimshuffle('x', 0, 1, 2, 3)
    else:
        # One bias per output channel, broadcast over all other axes.
        activation = conved + self.b.dimshuffle('x', 0, 'x', 'x', 'x')
    return self.nonlinearity(activation)
def test_pad_width_per_border(batch_ndim, val=0):
    """Compare ``pad`` with ``np.pad`` when every border has its own width."""
    from lasagne.theano_extensions.padding import pad

    per_axis = [(1, 2), (3, 4), (1, 2), (3, 4)]
    data = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
    symbolic = T.tensor4()

    # Only the widths of the non-batch axes are handed to ``pad``.
    theano_result = pad(symbolic, per_axis[batch_ndim:], val,
                        batch_ndim).eval({symbolic: data})

    # numpy needs a width for every axis; the batch axes get zero.
    np_widths = tuple((0, 0) if axis < batch_ndim else pair
                      for axis, pair in enumerate(per_axis))
    np_result = np.pad(data, np_widths, mode='constant',
                       constant_values=val)

    assert (theano_result == np_result).all()
def test_pad_width_per_border(batch_ndim, val=0):
    """``pad`` with explicit (before, after) widths must match ``np.pad``."""
    from lasagne.theano_extensions.padding import pad

    width = [(1, 2), (3, 4), (1, 2), (3, 4)]
    x_sym = T.tensor4()
    x_val = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
    got = pad(x_sym, width[batch_ndim:], val, batch_ndim).eval(
        {x_sym: x_val})

    # Reference spec: zero width on the batch axes, the listed width pair
    # on every other axis.
    reference_widths = []
    for axis, pair in enumerate(width):
        reference_widths.append(pair if axis >= batch_ndim else (0, 0))
    want = np.pad(x_val, tuple(reference_widths), mode="constant",
                  constant_values=val)

    assert (got == want).all()
def get_output_for(self, input, input_shape=None, **kwargs):
    """Apply the 1D convolution and nonlinearity to ``input``.

    ``input_shape`` may be supplied when the method is called directly with
    a shape other than ``self.input_shape``; it defaults to the latter.

    ``self.pad`` selects the border handling: ``'full'``, ``'same'``,
    ``'strictsame'`` (pads ``filter_size[0] - 1`` in total, larger half in
    front, and sets ``self.stride`` to ``(1,)`` on the layer), or a
    one-element sequence giving a symmetric pad width.

    No bias term is added in this method.
    """
    if input_shape is None:
        input_shape = self.input_shape

    if self.stride == (1, ) and self.pad == 'same':
        # Unit stride: run a full convolution and trim both ends.
        full = self.convolution(input, self.W, subsample=self.stride,
                                input_shape=input_shape,
                                filter_shape=self.get_W_shape(),
                                border_mode='full')
        trim = self.filter_size[0] // 2
        return self.nonlinearity(full[:, :, trim:-trim or None])

    # Either no padding is needed or the input is padded explicitly.
    border_mode = 'valid'
    if self.pad == 'full':
        border_mode = 'full'
        pad = (0, 0)
    elif self.pad == 'same':
        width = self.filter_size[0]
        pad = (width // 2, (width - 1) // 2)
    elif self.pad == 'strictsame':
        # This mode forces unit stride on the layer from here on.
        self.stride = (1, )
        total = self.filter_size[0] - 1
        back = total // 2
        pad = (total - back, back)
    else:
        pad = (self.pad[0], self.pad[0])

    if pad != (0, 0):
        input = padding.pad(input, [pad], batch_ndim=2)
        length = input_shape[2]
        if length is not None:
            length = length + pad[0] + pad[1]
        input_shape = (input_shape[0], input_shape[1], length)

    conved = self.convolution(input, self.W, subsample=self.stride,
                              input_shape=input_shape,
                              filter_shape=self.get_W_shape(),
                              border_mode=border_mode)
    return self.nonlinearity(conved)
def get_weights(self, h_t, w_tm1, M_t, **kwargs):
    """Compute the addressing weights of all heads for one time step.

    The stages are content addressing, interpolation with the previous
    weights ``w_tm1``, convolutional shift and sharpening; the numbers in
    the comments below refer to the corresponding paper sections.  The
    result is reshaped to ``(h_t.shape[0], num_heads, memory_shape[0])`` and
    normalised along its last axis.
    """
    # The static batch size (possibly None) is read from the first head.
    batch_size = self.heads[0].input_shape[0]
    num_heads = len(self.heads)

    def affine(weight, bias):
        return T.dot(h_t, weight) + bias

    k_t = self.nonlinearity_key(
        affine(self.W_hid_to_key, self.b_hid_to_key))
    beta_t = self.nonlinearity_beta(
        affine(self.W_hid_to_beta, self.b_hid_to_beta))
    g_t = self.nonlinearity_gate(
        affine(self.W_hid_to_gate, self.b_hid_to_gate))

    # A softmax nonlinearity only accepts 2D input; if the direct call
    # is rejected, flatten to one row per (sample, head) pair, apply it,
    # and restore the original shape.
    try:
        s_t = self.nonlinearity_shift(
            affine(self.W_hid_to_shift, self.b_hid_to_shift))
    except ValueError:
        shift_activation_t = affine(self.W_hid_to_shift,
                                    self.b_hid_to_shift)
        flat_shift = shift_activation_t.reshape(
            (h_t.shape[0] * num_heads, self.num_shifts))
        s_t = self.nonlinearity_shift(flat_shift).reshape(
            shift_activation_t.shape)

    gamma_t = self.nonlinearity_gamma(
        affine(self.W_hid_to_gamma, self.b_hid_to_gamma))

    # Content Addressing (3.3.1)
    beta_t = T.addbroadcast(beta_t, 2)
    betaK = beta_t * similarities.cosine_similarity(k_t, M_t)
    w_c = lasagne.nonlinearities.softmax(
        betaK.flatten(ndim=2)).reshape(betaK.shape)

    # Interpolation (3.3.2)
    g_t = T.addbroadcast(g_t, 2)
    w_g = g_t * w_c + (1. - g_t) * w_tm1

    # Convolutional Shift (3.3.2)
    # NOTE: a flat (zero-padded) convolution is used here instead of the
    # circular convolution from the original paper.
    rows = h_t.shape[0] * num_heads
    static_rows = None if batch_size is None else batch_size * num_heads
    memory_size = self.memory_shape[0]

    w_g_padded = w_g.reshape((rows, memory_size)).dimshuffle(
        0, 'x', 'x', 1)
    conv_filter = s_t.reshape((rows, self.num_shifts)).dimshuffle(
        0, 'x', 'x', 1)
    pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
    w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
    convolution = T.nnet.conv2d(
        w_g_padded, conv_filter,
        input_shape=(static_rows, 1, 1, memory_size + pad[0] + pad[1]),
        filter_shape=(static_rows, 1, 1, self.num_shifts),
        subsample=(1, 1),
        border_mode='valid')

    # conv2d pairs every row with every filter; keep only (i, i).
    diagonal = T.arange(rows)
    w_tilde = convolution[diagonal, diagonal, 0, :].reshape(
        (h_t.shape[0], num_heads, memory_size))

    # Sharpening (3.3.2)
    gamma_t = T.addbroadcast(gamma_t, 2)
    sharpened = T.pow(w_tilde + 1e-6, gamma_t)
    return sharpened / T.sum(sharpened, axis=2).dimshuffle(0, 1, 'x')
def get_output_for(self, input, input_shape=None, **kwargs):
    """Compute the 1D convolution of ``input`` followed by the nonlinearity.

    ``input_shape`` overrides ``self.input_shape`` when the method is called
    directly with a differently shaped input.

    Border handling follows ``self.pad``:

    * ``'same'`` with stride ``(1,)`` -- crop a full convolution;
    * ``'full'`` -- full convolution, no explicit padding;
    * ``'same'`` otherwise -- pad ``(k // 2, (k - 1) // 2)`` for filter
      length ``k``;
    * ``'strictsame'`` -- pad ``k - 1`` in total (larger half first) and
      overwrite ``self.stride`` with ``(1,)``;
    * anything else -- ``self.pad[0]`` on both sides.

    The result carries no bias term.
    """
    shape = input_shape
    if shape is None:
        shape = self.input_shape

    w_shape_of = self.get_W_shape

    if self.stride == (1,) and self.pad == 'same':
        # Simulate 'same' by cropping a full convolution.
        conved = self.convolution(input, self.W, subsample=self.stride,
                                  input_shape=shape,
                                  filter_shape=w_shape_of(),
                                  border_mode='full')
        crop = self.filter_size[0] // 2
        conved = conved[:, :, crop:-crop or None]
    else:
        if self.pad == 'full':
            border_mode, pad = 'full', (0, 0)
        elif self.pad == 'same':
            border_mode = 'valid'
            pad = (self.filter_size[0] // 2,
                   (self.filter_size[0] - 1) // 2)
        elif self.pad == 'strictsame':
            # Side effect: the layer's stride is reset to unit stride.
            self.stride = (1,)
            border_mode = 'valid'
            extra = self.filter_size[0] - 1
            right = extra // 2
            left = extra - right
            pad = (left, right)
        else:
            border_mode = 'valid'
            pad = (self.pad[0], self.pad[0])

        if pad != (0, 0):
            input = padding.pad(input, [pad], batch_ndim=2)
            # Keep the static shape in step with the padded tensor; an
            # unknown length stays unknown.
            if shape[2] is None:
                shape = (shape[0], shape[1], None)
            else:
                shape = (shape[0], shape[1], shape[2] + pad[0] + pad[1])

        conved = self.convolution(input, self.W, subsample=self.stride,
                                  input_shape=shape,
                                  filter_shape=w_shape_of(),
                                  border_mode=border_mode)

    return self.nonlinearity(conved)
def test_convolutional_shift():
    """Check the conv2d-based shift against a direct numpy computation."""
    batch, heads, memory, num_shifts = 16, 4, 128, 3
    rows = batch * heads

    weights_var, shift_var = T.tensor3s('weights', 'shift')

    # One row per (sample, head) pair, laid out as a 1 x memory "image".
    weights_4d = weights_var.reshape((rows, memory)).dimshuffle(
        0, 'x', 'x', 1)
    shift_4d = shift_var.reshape((rows, num_shifts)).dimshuffle(
        0, 'x', 'x', 1)

    pad = (num_shifts // 2, (num_shifts - 1) // 2)
    weights_4d = padding.pad(weights_4d, [pad], batch_ndim=3)
    convolution = T.nnet.conv2d(
        weights_4d, shift_4d,
        input_shape=(rows, 1, 1, memory + pad[0] + pad[1]),
        filter_shape=(rows, 1, 1, num_shifts),
        subsample=(1, 1),
        border_mode='valid')

    # Keep row i convolved with filter i only.
    diagonal = T.arange(rows)
    w_tilde = convolution[diagonal, diagonal, 0, :].reshape(
        (batch, heads, memory))
    convolutional_shift_fn = theano.function([weights_var, shift_var],
                                             w_tilde)

    weights = np.random.rand(batch, heads, memory)
    shift = np.random.rand(batch, heads, num_shifts)
    weight_tilde = convolutional_shift_fn(weights, shift)

    # Reference result.  Filters in T.nnet.conv2d are reversed, so tap 2
    # multiplies the left neighbour and tap 0 the right one; neighbours
    # outside the memory contribute nothing.  The three terms are
    # accumulated in the order left, centre, right.
    weight_tilde_manual = np.zeros_like(weight_tilde)
    weight_tilde_manual[:, :, 1:] += shift[:, :, 2:3] * weights[:, :, :-1]
    weight_tilde_manual += shift[:, :, 1:2] * weights
    weight_tilde_manual[:, :, :-1] += shift[:, :, 0:1] * weights[:, :, 1:]

    assert weight_tilde.shape == (batch, heads, memory)
    assert np.allclose(weight_tilde, weight_tilde_manual)
def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
    """Compute this head's addressing weights for one time step.

    The stages are content addressing, interpolation with the previous
    weights ``w_tm1``, convolutional shift and sharpening; the numbers in
    the comments below refer to the corresponding paper sections.  The
    result has one row of weights per sample, each row normalised to sum
    to one.

    Fixes relative to the previous revision (results are unchanged for a
    batch of one sample):

    * the shift step took ``convolution[:, 0, 0, :]``, i.e. it applied the
      shift filter of sample 0 to every sample; row ``i`` is now paired
      with filter ``i``;
    * sharpening divided by the sum over the whole batch; each row is now
      normalised by its own sum.
    """
    if self.sign is None:
        sign_t = 1.
    else:
        sign_t = self.sign.get_output_for(h_t, **kwargs)

    k_t = self.key.get_output_for(h_t, **kwargs)
    beta_t = self.beta.get_output_for(h_t, **kwargs)
    g_t = self.gate.get_output_for(h_t, **kwargs)
    s_t = self.shift.get_output_for(h_t, **kwargs)
    gamma_t = self.gamma.get_output_for(h_t, **kwargs)

    # Content Addressing (3.3.1)
    beta_t = T.addbroadcast(beta_t, 1)
    betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
    w_c = lasagne.nonlinearities.softmax(betaK)

    # Interpolation (3.3.2)
    g_t = T.addbroadcast(g_t, 1)
    w_g = g_t * w_c + (1. - g_t) * w_tm1

    # Convolutional Shift (3.3.2)
    w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
    conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
    pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
    w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
    convolution = T.nnet.conv2d(
        w_g_padded, conv_filter,
        input_shape=(self.input_shape[0], 1, 1,
                     self.memory_shape[0] + pad[0] + pad[1]),
        filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
        subsample=(1, 1),
        border_mode='valid')
    # conv2d convolves every sample with every filter; only the diagonal
    # (sample i with its own filter i) is wanted.
    rows = T.arange(convolution.shape[0])
    w_tilde = convolution[rows, rows, 0, :]

    # Sharpening (3.3.2)
    gamma_t = T.addbroadcast(gamma_t, 1)
    w = T.pow(w_tilde + 1e-6, gamma_t)
    # Normalise every sample separately.
    return w / T.sum(w, axis=1).dimshuffle(0, 'x')
def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
    """Return the head's addressing weights for the current time step.

    Pipeline: content addressing, interpolation with the previous weights
    ``w_tm1``, convolutional shift, sharpening (section numbers in the
    comments refer to the paper).  Each output row holds the weights of
    one sample and sums to one.

    Compared with the previous revision, two steps no longer assume a
    batch of exactly one sample (for such a batch the result is the same):

    * the convolutional shift used filter 0 for every sample
      (``convolution[:, 0, 0, :]``); it now uses filter ``i`` for
      sample ``i``;
    * the final normalisation divided by the sum over all samples; it now
      divides each row by its own sum.
    """
    # Optional sign applied to the key before the similarity lookup.
    sign_t = (self.sign.get_output_for(h_t, **kwargs)
              if self.sign is not None else 1.)
    k_t = self.key.get_output_for(h_t, **kwargs)
    beta_t = self.beta.get_output_for(h_t, **kwargs)
    g_t = self.gate.get_output_for(h_t, **kwargs)
    s_t = self.shift.get_output_for(h_t, **kwargs)
    gamma_t = self.gamma.get_output_for(h_t, **kwargs)

    # Content Addressing (3.3.1)
    beta_t = T.addbroadcast(beta_t, 1)
    betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
    w_c = lasagne.nonlinearities.softmax(betaK)

    # Interpolation (3.3.2)
    g_t = T.addbroadcast(g_t, 1)
    w_g = g_t * w_c + (1. - g_t) * w_tm1

    # Convolutional Shift (3.3.2)
    memory_size = self.memory_shape[0]
    pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
    w_g_padded = padding.pad(w_g.dimshuffle(0, 'x', 'x', 1), [pad],
                             batch_ndim=3)
    conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
    convolution = T.nnet.conv2d(
        w_g_padded, conv_filter,
        input_shape=(self.input_shape[0], 1, 1,
                     memory_size + pad[0] + pad[1]),
        filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
        subsample=(1, 1),
        border_mode='valid')
    # Output axis 0 indexes samples and axis 1 indexes filters; take the
    # matching pairs only.
    sample_index = T.arange(convolution.shape[0])
    w_tilde = convolution[sample_index, sample_index, 0, :]

    # Sharpening (3.3.2)
    gamma_t = T.addbroadcast(gamma_t, 1)
    w = T.pow(w_tilde + 1e-6, gamma_t)
    # Per-sample normalisation.
    row_sums = T.sum(w, axis=1).dimshuffle(0, 'x')
    return w / row_sums
def get_output_for(self, input, input_shape=None, **kwargs):
    """Convolve ``input`` with ``self.W`` and apply the nonlinearity.

    ``input_shape`` may be supplied when the method is called directly with
    a shape other than ``self.input_shape``; it defaults to the latter.

    ``self.pad`` selects the border handling:

    * ``'same'`` with stride ``(1, 1)`` -- a full convolution is computed
      and cropped back to the input extent;
    * ``'full'`` -- full convolution, no explicit padding;
    * ``'same'`` otherwise -- pad ``(k // 2, (k - 1) // 2)`` per spatial
      axis for filter size ``k``;
    * anything else -- read as a pair of symmetric per-axis pad widths.

    The bias ``self.b`` is NOT added to the output (see the note before the
    return statement).

    Changes relative to the previous revision: the bias branch, whose result
    was overwritten before use, and the commented-out quantisation
    experiments were removed.  The returned expression is unchanged.
    """
    if input_shape is None:
        input_shape = self.input_shape

    if self.stride == (1, 1) and self.pad == 'same':
        # Simulate 'same' by cropping a full convolution.
        conved = self.convolution(input, self.W, subsample=self.stride,
                                  image_shape=input_shape,
                                  filter_shape=self.get_W_shape(),
                                  border_mode='full')
        shift_x = (self.filter_size[0] - 1) // 2
        shift_y = (self.filter_size[1] - 1) // 2
        conved = conved[:, :,
                        shift_x:input.shape[2] + shift_x,
                        shift_y:input.shape[3] + shift_y]
    else:
        # Either no padding is needed or the input is padded explicitly.
        border_mode = 'valid'
        if self.pad == 'full':
            border_mode = 'full'
            pad = [(0, 0), (0, 0)]
        elif self.pad == 'same':
            pad = [(size // 2, (size - 1) // 2)
                   for size in (self.filter_size[0], self.filter_size[1])]
        else:
            pad = [(self.pad[0], self.pad[0]),
                   (self.pad[1], self.pad[1])]

        if pad != [(0, 0), (0, 0)]:
            input = padding.pad(input, pad, batch_ndim=2)
            # Keep the static shape in step with the padded tensor;
            # unknown (None) sizes stay unknown.
            input_shape = (
                input_shape[0],
                input_shape[1],
                None if input_shape[2] is None
                else input_shape[2] + pad[0][0] + pad[0][1],
                None if input_shape[3] is None
                else input_shape[3] + pad[1][0] + pad[1][1])

        conved = self.convolution(input, self.W, subsample=self.stride,
                                  image_shape=input_shape,
                                  filter_shape=self.get_W_shape(),
                                  border_mode=border_mode)

    # NOTE(review): the previous revision built ``conved + bias`` and then
    # unconditionally replaced it with ``conved``, so ``self.b`` never
    # reached the output.  That observable behaviour is kept here.  If the
    # bias was meant to be applied, add
    # ``self.b.dimshuffle('x', 0, 1, 2)`` (untied) or
    # ``self.b.dimshuffle('x', 0, 'x', 'x')`` (tied) to ``conved`` below.
    return self.nonlinearity(conved)
def get_output_for(self, input, **kwargs):
    """Pad ``input`` with the layer's configured settings.

    ``self.width``, ``self.val`` and ``self.batch_ndim`` are forwarded, in
    that order, to ``padding.pad``.  Extra keyword arguments are ignored.
    """
    pad_settings = (self.width, self.val, self.batch_ndim)
    return padding.pad(input, *pad_settings)
def get_weights(self, h_t, w_tm1, M_t, **kwargs):
    """Return the addressing weights of every head for one time step.

    The hidden state ``h_t`` is projected to the key, key strength
    (``beta``), interpolation gate, shift distribution and sharpening
    exponent (``gamma``).  These drive content addressing, interpolation
    with ``w_tm1``, a convolutional shift and sharpening (section numbers in
    the comments refer to the paper).  The output has shape
    ``(h_t.shape[0], num_heads, memory_shape[0])`` and sums to one along its
    last axis.
    """
    # Static batch size, taken from the first head; may be None.
    batch_size = self.heads[0].input_shape[0]
    num_heads = len(self.heads)

    key_pre = T.dot(h_t, self.W_hid_to_key) + self.b_hid_to_key
    k_t = self.nonlinearity_key(key_pre)
    beta_pre = T.dot(h_t, self.W_hid_to_beta) + self.b_hid_to_beta
    beta_t = self.nonlinearity_beta(beta_pre)
    gate_pre = T.dot(h_t, self.W_hid_to_gate) + self.b_hid_to_gate
    g_t = self.nonlinearity_gate(gate_pre)

    # The shift nonlinearity is usually a softmax, which only accepts 2D
    # input.  Try the direct call first; on ValueError fall back to a 2D
    # view with one row per (sample, head) and reshape the result back.
    try:
        s_t = self.nonlinearity_shift(
            T.dot(h_t, self.W_hid_to_shift) + self.b_hid_to_shift)
    except ValueError:
        shift_activation_t = (T.dot(h_t, self.W_hid_to_shift) +
                              self.b_hid_to_shift)
        s_t = self.nonlinearity_shift(shift_activation_t.reshape(
            (h_t.shape[0] * num_heads, self.num_shifts)))
        s_t = s_t.reshape(shift_activation_t.shape)

    gamma_pre = T.dot(h_t, self.W_hid_to_gamma) + self.b_hid_to_gamma
    gamma_t = self.nonlinearity_gamma(gamma_pre)

    # Content Addressing (3.3.1)
    beta_t = T.addbroadcast(beta_t, 2)
    betaK = beta_t * similarities.cosine_similarity(k_t, M_t)
    w_c = lasagne.nonlinearities.softmax(betaK.flatten(ndim=2))
    w_c = w_c.reshape(betaK.shape)

    # Interpolation (3.3.2)
    g_t = T.addbroadcast(g_t, 2)
    w_g = g_t * w_c + (1. - g_t) * w_tm1

    # Convolutional Shift (3.3.2)
    # NOTE: this is a flat (zero-padded) convolution, not the circular
    # convolution of the original paper.
    n_rows = h_t.shape[0] * num_heads
    if batch_size is None:
        n_rows_static = None
    else:
        n_rows_static = batch_size * num_heads

    w_g_padded = w_g.reshape((n_rows, self.memory_shape[0]))
    w_g_padded = w_g_padded.dimshuffle(0, 'x', 'x', 1)
    conv_filter = s_t.reshape((n_rows, self.num_shifts))
    conv_filter = conv_filter.dimshuffle(0, 'x', 'x', 1)

    pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
    w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
    padded_length = self.memory_shape[0] + pad[0] + pad[1]
    convolution = T.nnet.conv2d(
        w_g_padded, conv_filter,
        input_shape=(n_rows_static, 1, 1, padded_length),
        filter_shape=(n_rows_static, 1, 1, self.num_shifts),
        subsample=(1, 1),
        border_mode='valid')

    # Every row was convolved with every filter; select row i / filter i.
    pick = T.arange(n_rows)
    w_tilde = convolution[pick, pick, 0, :]
    w_tilde = w_tilde.reshape(
        (h_t.shape[0], num_heads, self.memory_shape[0]))

    # Sharpening (3.3.2)
    gamma_t = T.addbroadcast(gamma_t, 2)
    w = T.pow(w_tilde + 1e-6, gamma_t)
    normaliser = T.sum(w, axis=2).dimshuffle(0, 1, 'x')
    return w / normaliser