def keep_max(input, theta, k, sent_mask):
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    sent_mask = sent_mask.dimshuffle(0, 'x', 1, 'x')
    sig_input = sig_input * sent_mask
    #sig_input = T.dot(input, theta)
    if k == 0:
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
def fprop(self, x, mode='train'):
    if mode == 'test':
        # this is for use during test/validation time
        x_avg = self.params.getParameter('x_avg')
    elif mode == 'calculate':
        x_avg = x.mean(self.norm_axis, keepdims=True)
    elif mode == 'train':
        # otherwise calculate the batch mean and std
        x_avg = x.mean(self.norm_axis, keepdims=True)
        # the following trick is learned from the lasagne implementation
        running_mean = theano.clone(self.params.getParameter('x_avg'),
                                    share_inputs=False)
        running_mean_update = ((1 - self.alpha) * running_mean
                               + self.alpha * x_avg)
        # set a default update for it
        running_mean.default_update = running_mean_update
        x_avg += 0 * running_mean
    else:
        raise ValueError("mode can only take ['train', 'test', 'calculate']")

    self.x_avg = x_avg
    x_avg = T.addbroadcast(x_avg, *self.norm_axis)
    beta = T.addbroadcast(self.params.getParameter('beta'), *self.norm_axis)
    bn_x = x / (x_avg + 1e-18) * beta
    return bn_x if self.actFunc is None else self.actFunc(bn_x)
# End BatchExpNormLayer
#-------------------------------------------------------------------------------
def local_contrast_normalize(X, window, img_shape):
    """Return normalized X and the convolution transform
    """
    batchsize, channels, R, C = img_shape
    assert window.shape[0] == 1
    assert window.shape[1] == channels
    N = window.shape[2]
    assert window.shape[3] == N
    blur = tlinear.Conv2d(
        filters=sharedX(window, 'LCN_window'),
        img_shape=img_shape,
        border_mode='full')
    N2 = N // 2

    # remove global mean
    X = X - X.mean(axis=[1, 2, 3]).dimshuffle(0, 'x', 'x', 'x')

    # remove local mean
    blurred_x = tensor.addbroadcast(blur.lmul(X), 1)
    x2c = X - blurred_x[:, :, N2:R + N2, N2:C + N2]

    # standardize contrast
    blurred_x2c_sqr = tensor.addbroadcast(blur.lmul(x2c ** 2), 1)
    x2c_lcn = x2c / tensor.sqrt((10 + blurred_x2c_sqr[:, :, N2:R + N2, N2:C + N2]))

    return x2c_lcn, blur
def keep_max(input, theta, k):
    """
    :type input: theano.tensor.tensor4
    :param input: the input data

    :type theta: theano.tensor.matrix
    :param theta: the parameter for the sigmoid function

    :type k: int
    :param k: the number k used to define the top k sentences to keep
    """
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    if k == 0:  # using all the sentences
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()
    # construct masked data
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        # use stored mean and std
        mean = self.mean
        std = self.std
    else:
        # use this batch's mean and std
        mean = input.mean(self.axes, keepdims=True)
        std = input.std(self.axes, keepdims=True)
        # and update the stored mean and std:
        # we create (memory-aliased) clones of the stored mean and std
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = (1 - self.alpha) * running_mean + self.alpha * mean
        running_std.default_update = (1 - self.alpha) * running_std + self.alpha * std
        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        std += 0 * running_std
    std += self.epsilon
    mean = T.addbroadcast(mean, *self.axes)
    std = T.addbroadcast(std, *self.axes)
    beta = T.addbroadcast(self.beta, *self.axes)
    gamma = T.addbroadcast(self.gamma, *self.axes)
    normalized = (input - mean) * (gamma / std) + beta
    return self.nonlinearity(normalized)
def _ct(self, other):
    ''' Helper function to make tensors dimensions compatible'''
    if (other.var_set == self.var_set):
        return (self.pt_tensor, other.pt_tensor)
    union_var_set = other.scope.union(self.scope)
    vidx1 = frozenset(self.var_indices)
    vidx2 = frozenset(other.var_indices)
    union_indices = vidx1.union(vidx2)
    shape1 = []
    shape2 = []
    b1 = []
    b2 = []
    u1 = []
    u2 = []
    for i, vidx in enumerate(sorted(union_indices)):
        if (vidx in vidx1):
            shape1.append(self.discrete_pgm.cardinalities[vidx])
            u1.append(i)
        else:
            shape1.append(1)
            b1.append(i)
        if (vidx in vidx2):
            shape2.append(self.discrete_pgm.cardinalities[vidx])
            u2.append(i)
        else:
            shape2.append(1)
            b2.append(i)
    t1 = T.addbroadcast(T.unbroadcast(self.pt_tensor.reshape(shape1, len(shape1)), *u1), *b1)
    t2 = T.addbroadcast(T.unbroadcast(other.pt_tensor.reshape(shape2, len(shape2)), *u2), *b2)
    return (t1, t2)
def output(self, input):
    if self.unflatten_input is not None:
        input = T.reshape(input, self.unflatten_input)

    W_shuffled = self.W.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    conv_out = dnn.dnn_conv(img=input,
                            kerns=W_shuffled,
                            subsample=(self.convstride, self.convstride),
                            border_mode=self.padsize)

    conv_out = conv_out + self.b.val.dimshuffle('x', 0, 'x', 'x')

    if self.batch_norm:
        conv_out = (conv_out - T.mean(conv_out, axis=(0, 2, 3), keepdims=True)) / (1.0 + T.std(conv_out, axis=(0, 2, 3), keepdims=True))
        conv_out = conv_out * T.addbroadcast(self.bn_std, 0, 2, 3) + T.addbroadcast(self.bn_mean, 0, 2, 3)

    self.out_store = conv_out

    if self.activation == "relu":
        self.out = T.maximum(0.0, conv_out)
    elif self.activation == "tanh":
        self.out = T.tanh(conv_out)
    elif self.activation is None:
        self.out = conv_out

    #if self.residual:
    #    print "USING RESIDUAL"
    #    self.out += input

    return self.out
def output(self, input_raw):
    if self.flatten_input:
        input = input_raw.flatten(2)
    else:
        input = input_raw

    lin_output = T.dot(input, self.W) + self.b

    if self.batch_norm:
        lin_output = (lin_output - T.mean(lin_output, axis=0, keepdims=True)) / (1.0 + T.std(lin_output, axis=0, keepdims=True))
        lin_output = (lin_output * T.addbroadcast(self.bn_std, 0) + T.addbroadcast(self.bn_mean, 0))

    self.out_store = lin_output

    if self.activation is None:
        activation = lambda x: x
    elif self.activation == "relu":
        activation = lambda x: T.maximum(0.0, x)
    elif self.activation == "exp":
        activation = lambda x: T.exp(x)
    elif self.activation == "tanh":
        activation = lambda x: T.tanh(x)
    elif self.activation == 'softplus':
        activation = lambda x: T.nnet.softplus(x)
    else:
        raise Exception("Activation not found")

    out = activation(lin_output)

    #if self.residual:
    #    return out + input_raw
    #else:
    #    return out

    return out
def Softmax(x, temp=1):
    """ Softmax Units.

    Applies a row-wise softmax to the supplied input.

    Args:
        x: could be a ``theano.tensor`` or a ``theano.shared`` or ``numpy`` arrays
            or ``python lists``.
        temp: temperature of type ``float``. Mainly used during distillation; the
            standard softmax corresponds to ``T=1``.

    Notes:
        Refer [#] for details.

        .. [#] Hinton, Geoffrey, Oriol Vinyals, and Jeff Dean. "Distilling the
           knowledge in a neural network." arXiv preprint arXiv:1503.02531 (2015).

    Returns:
        same as input: returns a row-wise softmax output of the same shape as the input.
    """
    if temp != 1:
        expo = T.exp(x / float(temp))  # at this moment this is mini_batch_size X num_classes.
        normalizer = T.sum(expo, axis=1, keepdims=True)  # at this moment this is mini_batch_size X 1.
        normalizer = T.addbroadcast(normalizer, 1)
        return expo / normalizer
    else:
        return T.nnet.softmax(x)
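# Hedged usage sketch (added for illustration, not part of the original source):
# compile the temperature softmax defined above and check that every row still
# sums to one. Assumes Theano is installed and `Softmax` refers to the function above.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
softmax_T2 = theano.function([x], Softmax(x, temp=2.0))

data = np.random.randn(4, 10).astype(theano.config.floatX)
probs = softmax_T2(data)
print(probs.sum(axis=1))   # ~[1. 1. 1. 1.]; a higher temperature gives flatter rows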
def output(self, input):
    W_shuffled = self.W.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    print "input ndim", input.ndim

    conv_out = dnn.dnn_conv(img=input,
                            kerns=W_shuffled,
                            subsample=(self.stride, self.stride),
                            border_mode=self.padsize)

    conv_out = conv_out + self.b.dimshuffle('x', 0, 'x', 'x')

    if self.batch_norm:
        conv_out = (conv_out - T.mean(conv_out, axis=(0, 2, 3), keepdims=True)) / (1.0 + T.std(conv_out, axis=(0, 2, 3), keepdims=True))
        conv_out = conv_out * T.addbroadcast(self.bn_std, 0, 2, 3) + T.addbroadcast(self.bn_mean, 0, 2, 3)

    self.out_store = conv_out

    if self.activation == "relu":
        self.out = T.maximum(0.0, conv_out)
    elif self.activation == "tanh":
        self.out = T.tanh(conv_out)
    elif self.activation is None:
        self.out = conv_out

    return T.specify_shape(self.out, (self.batch_size, self.out_channels,
                                      self.in_length / self.stride,
                                      self.in_length / self.stride))
def output(self, input):
    if self.unflatten_input is not None:
        input = T.reshape(input, self.unflatten_input)

    conv_out = deconv(input, self.W, subsample=(2, 2), border_mode=(2, 2))
    conv_out = conv_out + self.b.dimshuffle('x', 0, 'x', 'x')

    if self.batch_norm:
        conv_out = (conv_out - conv_out.mean(axis=(0, 2, 3), keepdims=True)) / (1.0 + conv_out.std(axis=(0, 2, 3), keepdims=True))
        conv_out = conv_out * T.addbroadcast(self.bn_std, 0, 2, 3) + T.addbroadcast(self.bn_mean, 0, 2, 3)

    if self.activation == "relu":
        out = T.maximum(0.0, conv_out)
    elif self.activation == "tanh":
        out = T.tanh(conv_out)
    elif self.activation is None:
        out = conv_out
    else:
        raise Exception()

    self.params = {'W': self.W, 'b': self.b}

    if self.batch_norm:
        self.params["mu"] = self.bn_mean
        self.params["sigma"] = self.bn_std

    return out
def resample_step(self):
    idx = self.theano_rng.multinomial(pvals=T.reshape(self.weights_now, (1, self.npcl))).T
    s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
    h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)

    return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')
def get_state(self):
    st = super(LatentTypeWithTuningCurve, self).get_state()

    # The filters are non-identifiable as we can negate both the
    # temporal and the spatial filters and get the same net effect.
    # By convention, choose the sign that results in the most
    # positive temporal filter.
    sign = T.sgn(T.sum(self.stim_resp_t, axis=0))
    # T.addbroadcast returns a new variable, so this bare call (result discarded)
    # has no effect; the multiplications below rely on NumPy-style broadcasting.
    T.addbroadcast(sign, 0)

    # Similarly, we can trade a constant between the spatial and temporal
    # pieces. By convention, set the temporal filter to norm 1.
    Z = T.sqrt(T.sum(self.stim_resp_t**2, axis=0))
    T.addbroadcast(Z, 0)

    # Compute the normalized temporal response
    stim_resp_t = sign * (1.0 / Z) * self.stim_resp_t

    # Finally, reshape the spatial component as necessary
    if self.spatial_ndim == 2:
        stim_resp_x = sign * Z * self.stim_resp_x
        stim_resp_x = T.reshape(stim_resp_x, self.spatial_shape + (self.R,))
    else:
        stim_resp_x = sign * Z * self.stim_resp_x

    st.update({'stim_response_x': stim_resp_x,
               'stim_response_t': stim_resp_t})
    return st
def log_p(self, L):
    """
    Compute log prob of the given value under this prior
    Input: L ~ NxD
    """
    assert L.ndim == 2, "L must be 2d!"
    # Compute pairwise L2 norm
    L1 = L.dimshuffle(0, 'x', 1)   # Nx1xD
    L2 = L.dimshuffle('x', 0, 1)   # 1xNxD
    # The 'x' axes introduced by dimshuffle are already broadcastable, so these
    # bare calls (whose results are discarded) are effectively no-ops.
    T.addbroadcast(L1, 1)
    T.addbroadcast(L2, 0)

    # Compute pairwise distances
    D = ((L1 - L2)**2).sum(axis=2)

    # Compute the kernel
    K = T.exp(-D / self.sigma**2)

    # Log prob is the log determinant of the pairwise distances
    lp_det = T.log(self.d(K))

    # Also multiply by a spherical Gaussian with standard deviation of 'bound'
    # to prevent points from diverging to infinity
    lp_gauss = self.gaussian.log_p(L)

    return lp_det + lp_gauss
def embed(self, x, y, kth):
    hidden = self.hidden_k(x, self.superw, self.dicw, kth)
    size = y.ndim
    y = T.addbroadcast(y, size - 1)
    embedding = T.sum(hidden * y, 0) / T.addbroadcast(T.cast(T.sum(y, 0), 'int16'), size - 2)
    return embedding
def __Theano_build__(self):
    Td = T.tensor3('Td')
    Ty = T.ivector('Ty')
    Tlr = T.scalar('Tlr')
    #Talpha = T.TensorType(dtype='float32', broadcastable=(0, 1, 1))('alpha')
    A = theano.shared(np.ones((self.D.shape[0]))
                      .astype('float32').reshape(-1, 1, 1), 'A')
    Ttriple = [T.ivector('triple' + x) for x in ['i', 'j', 'l']]
    Tneighbor = [T.ivector('neighbor' + x) for x in ['i', 'j']]

    d = (Td * T.addbroadcast(A, 1, 2)).sum(0)

    pull_error, _ = theano.scan(
        fn=lambda i, j, d: d[i, j],
        sequences=[Tneighbor[0], Tneighbor[1]],
        outputs_info=None,
        non_sequences=[d])
    pull_error = pull_error.sum()

    push_error, _ = theano.scan(
        fn=lambda i, j, l, d: T.neq(Ty[i], Ty[l]) * T.maximum((d[i]-d[j]) - (d[i]-d[l]) + 1, 0),
        sequences=[Ttriple[0], Ttriple[1], Ttriple[2]],
        outputs_info=None,
        non_sequences=[d])

    # zerocount = T.eq(linalg.diag(mask*T.maximum(lossij - lossil + 1, 0)), 0).sum()

    error = pull_error.sum() + push_error.sum()
    grad = T.grad(error, A)
    newA = A - Tlr * grad  # T.maximum(A - Tlr*grad, 0)
    updates = [(A, newA / newA.sum())]

    self.Ttrain = theano.function(
        Ttriple + Tneighbor + [Tlr],
        Tlr * grad,
        givens={Td: self.D, Ty: self.y},
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='warn')
    self.Tloss = theano.function(
        Ttriple + Tneighbor,
        error,
        givens={Td: self.D, Ty: self.y},
        allow_input_downcast=True)
    # eig, eigv = linalg.eig((d+d.T)/2.0)
    # self.Tmineig = theano.function([],
    #                                T.min(eig),
    #                                givens={Td: self.D},
    #                                allow_input_downcast=True)
    self.Tmindist = theano.function(
        [],
        T.min(d),
        givens={Td: self.D},
        allow_input_downcast=True)
    self.Ttransform = theano.function(
        [Td],
        (Td * T.addbroadcast(A, 1, 2)).sum(0),
        allow_input_downcast=True)
    self.TA = A
def __init__(self, model):
    self.model = model
    self.imp_model = model['impulse']

    # Number of presynaptic neurons
    self.N = model['N']

    # Get parameters of the prior
    self.alpha = self.imp_model['alpha']

    # Create a basis for the impulse responses response
    self.basis = create_basis(self.imp_model['basis'])
    (_, self.B) = self.basis.shape
    # The basis is interpolated once the data is specified
    self.initialize_basis()

    # Initialize memory for the filtered spike train
    self.ir = theano.shared(name='ir', value=np.zeros((1, self.N, self.B)))

    # Define Dirichlet distributed weights by normalizing gammas
    # The variables are log-gamma distributed
    self.lng = T.dvector('w_lng')
    self.g = T.exp(self.lng)
    self.g2 = T.reshape(self.g, [self.N, self.B])
    self.g_sum = T.reshape(T.sum(self.g2, axis=1), [self.N, 1])

    # Normalize the gammas to get a Dirichlet draw
    T.addbroadcast(self.g_sum, 1)
    self.w_ir2 = self.g2 / self.g_sum
    self.w_ir2.name = 'w_ir'

    # Repeat them (in a differentiable manner) to create a 3-tensor
    self.w_ir3 = T.reshape(self.w_ir2, [1, self.N, self.B])

    # Make w_ir3 broadcastable in the 1st dim
    T.addbroadcast(self.w_ir3, 0)

    # Take the elementwise product of the filtered stimulus and
    # the repeated weights to get the weighted impulse current along each
    # impulse basis dimension. Then sum over bases to get the
    # total coupling current from each presynaptic neurons at
    # all time points
    self.I_imp = T.sum(self.ir * self.w_ir3, axis=2)

    # Log probability of a set of independent log-gamma r.v.'s
    # This is log p(log(g)) under the prior. Since we are taking the
    # log, we multiply by a factor of g to ensure normalization and
    # thus the \alpha-1 in the exponent becomes \alpha
    self.log_p = -self.B*self.N*scipy.special.gammaln(self.alpha) \
                 + T.sum(self.alpha*self.lng) \
                 - T.sum(self.g)

    # Define a helper variable for the impulse response
    # after projecting onto the basis
    self.impulse = T.dot(self.w_ir2, T.transpose(self.ibasis))
def __init__(self, model, latent):
    """ Initialize the stochastic block model for the adjacency matrix
    """
    self.model = model
    self.prms = model['network']['graph']
    self.N = model['N']
    self.N_dims = self.prms['N_dims']

    # Get the latent location
    self.location = latent[self.prms['locations']]
    self.Lm = self.location.Lm
    # self.location_prior = create_prior(self.prms['location_prior'])
    #
    # # Latent distance model has NxR matrix of locations L
    # self.L = T.dvector('L')
    # self.Lm = T.reshape(self.L, (self.N, self.N_dims))

    # Compute the distance between each pair of locations
    # Reshape L into a Nx1xD matrix and a 1xNxD matrix, then add the requisite
    # broadcasting in order to subtract the two matrices
    L1 = self.Lm.dimshuffle(0, 'x', 1)   # Nx1xD
    L2 = self.Lm.dimshuffle('x', 0, 1)   # 1xNxD
    T.addbroadcast(L1, 1)
    T.addbroadcast(L2, 0)
    #self.D = T.sqrt(T.sum((L1-L2)**2, axis=2))
    #self.D = T.sum((L1-L2)**2, axis=2)

    # It seems we need to use L1 norm for now because
    # Theano doesn't properly compute the gradients of the L2
    # norm. (It gives NaNs because it doesn't realize that some
    # terms will cancel out)
    # self.D = (L1-L2).norm(1, axis=2)
    self.D = T.pow(L1 - L2, 2).sum(axis=2)

    # There is a distance scale, \delta
    self.delta = T.dscalar(name='delta')

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # The probability of A is exponentially decreasing in delta
    # self.pA = T.exp(-1.0*self.D/self.delta)
    self.pA = T.exp(-0.5*self.D/self.delta**2)

    if 'rho_refractory' in self.prms:
        self.pA += T.eye(self.N) * (self.prms['rho_refractory'] - self.pA)
        # self.pA[np.diag_indices(self.N)] = self.prms['rho_refractory']

    # Allow for scaling the log likelihood of the graph so that we can do
    # Annealed importance sampling
    self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

    # Define log probability
    self.lkhd = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA))
    # self.log_p = self.lkhd_scale * self.lkhd + self.location_prior.log_p(self.Lm)
    self.log_p = self.lkhd_scale * self.lkhd
def get_t_weights(self, t):
    """
    Generate vector of weights allowing selection of current timestep.
    (if t is not an integer, the weights will linearly interpolate)
    """
    n_seg = self.trajectory_length
    t_compare = T.arange(n_seg, dtype=theano.config.floatX).reshape((1, n_seg))
    diff = abs(T.addbroadcast(t, 1) - T.addbroadcast(t_compare, 0))
    t_weights = T.max(T.join(1, (-diff + 1).reshape((n_seg, 1)), T.zeros((n_seg, 1))), axis=1)
    return t_weights.reshape((-1, 1))
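# Hedged standalone sketch (added for illustration, not part of the original source):
# the weights above are w_i = max(1 - |t - i|, 0), written here in a simplified but
# equivalent form, so an integer t selects one timestep and a fractional t blends
# two neighbouring timesteps. `n_seg` plays the role of self.trajectory_length.
import numpy as np
import theano
import theano.tensor as T

n_seg = 5
t = T.matrix('t')   # shape (1, 1), as in get_t_weights
t_compare = T.arange(n_seg, dtype=theano.config.floatX).reshape((1, n_seg))
diff = abs(T.addbroadcast(t, 1) - T.addbroadcast(t_compare, 0))
t_weights = T.maximum(1.0 - diff, 0.0).reshape((-1, 1))
f = theano.function([t], t_weights)

print(f(np.array([[2.0]], dtype=theano.config.floatX)).ravel())  # [0. 0. 1.  0.  0.]
print(f(np.array([[2.5]], dtype=theano.config.floatX)).ravel())  # [0. 0. 0.5 0.5 0.]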
def __init__(self, input, n_filt, n_in, n_out, y, hist_len, y_len):
    """ Initialize the parameters of the poisson regression

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
    #self.W = theano.shared(value=numpy.identity(n_in, dtype=theano.config.floatX), name='W', borrow=True)
    self.W = theano.shared(
        value=numpy.tile([-1, -1, -1, -1, 1, 1, 1, 1, 1] * numpy.ones((n_in,), dtype=theano.config.floatX) / n_filt,
                         (1, n_filt, 1)).astype(theano.config.floatX),
        name='W', borrow=True)
    #self.W = theano.shared(value=numpy.concatenate((-1*numpy.ones((4,)),numpy.ones((5,)),-1*numpy.ones((4,)),numpy.ones((5,))))*numpy.ones((n_in,), dtype=theano.config.floatX), name='W', borrow=True)
    #self.W = theano.shared(value=.001*numpy.ones((n_in,), dtype=theano.config.floatX), name='W', borrow=True)

    # initialize the biases b as a vector of n_out 0s
    self.b = theano.shared(value=-1*numpy.ones((n_out,), dtype=theano.config.floatX), name='b', borrow=True)
    self.h = theano.shared(value=numpy.zeros((hist_len, n_out), dtype=theano.config.floatX), name='h', borrow=True)

    # helper variables for adagrad
    self.b_helper = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                                  name='b_helper', borrow=True)
    self.W_helper = theano.shared(value=numpy.tile(numpy.zeros((n_in,), dtype=theano.config.floatX), (1, n_filt, 1)),
                                  name='W_helper', borrow=True)
    self.h_helper = theano.shared(value=numpy.zeros((hist_len, n_out), dtype=theano.config.floatX),
                                  name='h_helper', borrow=True)

    # helper variables for L1
    self.b_helper2 = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                                   name='b_helper2', borrow=True)
    self.W_helper2 = theano.shared(value=numpy.tile(numpy.zeros((n_in,), dtype=theano.config.floatX), (1, n_filt, 1)),
                                   name='W_helper2', borrow=True)
    self.h_helper2 = theano.shared(value=numpy.zeros((hist_len, n_out), dtype=theano.config.floatX),
                                   name='h_helper', borrow=True)

    # parameters of the model
    self.params = [self.W, self.b, self.h]
    self.params_helper = [self.W_helper, self.b_helper, self.h_helper]
    self.params_helper2 = [self.W_helper2, self.b_helper2, self.h_helper2]

    # history dependent input
    self.h_in = theano.shared(value=numpy.zeros((y_len, n_out), dtype=theano.config.floatX), borrow=True)
    for hi in xrange(hist_len):
        self.h_in = T.set_subtensor(
            self.h_in[(1 + hi):y_len],
            self.h_in[(1 + hi):y_len] + T.addbroadcast(T.shape_padleft(self.h[hi, :], n_ones=1), 0) * y[0:(y_len - (hi + 1))])

    # compute vector of expected values (for each output) in symbolic form
    self.E_y_given_x = T.log(1 + T.exp(T.sum(input * T.addbroadcast(self.W, 0), axis=1) + self.b + self.h_in))  # sums over multiple filters
    self.input_responses = T.sum(input * T.addbroadcast(self.W, 0), axis=1) + self.b  # sums over multiple filters
def mulclassloss(self, kth, x, y, label):
    # multi-label classification loss, using wikidata for pretraining
    hidden = self.hidden_k(x, self.w, self.dicw, kth)
    print "hidden type : " + str(hidden.type)
    size = y.ndim
    y = T.addbroadcast(y, size - 1)
    embedding = T.sum(hidden * y, 0) / T.addbroadcast(T.cast(T.sum(y, 0), 'int16'), size - 2)
    #embedding = T.sum(hidden*y,0)/ T.addbroadcast(T.sum(y,0), size-2)
    print "embedding type : " + str(embedding.type)
    logloss = (0. - T.sum(T.log(1. / (1. + T.exp(0. - (T.dot(embedding, self.w["mulw"]) + self.w["mulb"]) * label))))) / embedding.shape[0]
    return logloss
def node_update(self, s_, h_, m_):
    """ Update params in Attention.
    """
    preact = tensor.dot(h_, self.params[self._p(self.prefix, 'U')])
    preact += tensor.addbroadcast(
        tensor.dot(s_, self.params[self._p(self.prefix, 'W')]).dimshuffle('x', 0, 1), 0)
    preact = tensor.dot(tensor.tanh(preact), self.params[self._p(self.prefix, 'va')]) * m_

    alpha = tensor.nnet.softmax(preact.dimshuffle(1, 0)).dimshuffle(1, 0, 'x')
    c = (h_ * tensor.addbroadcast(alpha, 2)).sum(axis=0)  # c is (samples, 2*hidden)
    return c, alpha
def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, dy],
                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
                                grads + grads2, mode=mode_with_gpu)
            # run
            for data_shape in ((10, 20, 30, 40), (4, 3, 1, 1), (1, 1, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
                # compare gradients
                utt.assert_allclose(outputs[6], outputs[6 + 3])  # dx
                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=3e-3)  # dscale
                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
def multi_dim_softmax(X):
    """
    compute a softmax for a filter_map at each point
    X : a 4d tensor (batch, feature_map, x, y)
    returns a 4d tensor (batch, softmax, x, y)
    """
    maxs = X.max(axis=1, keepdims=True)
    maxs = tensor.addbroadcast(maxs, 1)
    exps = tensor.exp(X - maxs)
    sums = exps.sum(axis=1, keepdims=True)
    sums = tensor.addbroadcast(sums, 1)
    return exps / sums
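# Hedged usage sketch (added for illustration, not part of the original source):
# apply the channel-wise softmax above to a random 4D batch and verify that the
# values along the feature-map axis sum to one at every spatial location.
import numpy as np
import theano
from theano import tensor

X = tensor.tensor4('X')
f = theano.function([X], multi_dim_softmax(X))

batch = np.random.randn(2, 3, 4, 4).astype(theano.config.floatX)
out = f(batch)
print(out.shape)                          # (2, 3, 4, 4)
print(np.allclose(out.sum(axis=1), 1.0))  # True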
def objective(self, x):
    # first, reshape x into a set of parameters we need
    i, W, b = 0, [], []
    for shape in self.layer_shapes:
        l = np.prod(shape)
        W.append(x[:, i:i+l].reshape((n_batch,) + shape))
        i += l
        l = shape[1]
        b.append(x[:, i:i+l].reshape((n_batch, 1, l)))
        i += l  # advance past the bias block before the next layer's weights

    # calculate the cost
    z = T.tile(self.mini_batch.reshape((1, 50, 784)), (20, 1, 1))
    for wi, bi in zip(W, b):
        z = T.nnet.sigmoid(T.batched_dot(z, wi) + T.addbroadcast(bi, 1))

    return T.mean((z - T.addbroadcast(T.extra_ops.to_one_hot(self.classes, 10).reshape((1, 50, 10)), 0))**2, axis=2)
def get_output_for(self, input, deterministic=False, collect=False, **kwargs):
    if collect:
        # use this batch's mean and var
        if self.stat_indices is None:
            mean = input.mean(self.axes, keepdims=True)
            var = input.var(self.axes, keepdims=True)
        else:
            mean = input[self.stat_indices].mean(self.axes, keepdims=True)
            var = input[self.stat_indices].var(self.axes, keepdims=True)
        # and update the stored mean and var:
        # we create (memory-aliased) clones of the stored mean and var
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_var = theano.clone(self.var, share_inputs=False)

        # set a default update for them
        if self.alpha != 'single_pass':
            running_mean.default_update = (
                (1 - self.alpha) * running_mean + self.alpha * mean)
            running_var.default_update = (
                (1 - self.alpha) * running_var + self.alpha * var)
        else:
            print "Collecting using single pass..."
            # this is ugly -- figure out what can be safely removed...
            running_mean.default_update = (0 * running_mean + 1.0 * mean)
            running_var.default_update = (0 * running_var + 1.0 * var)

        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        var += 0 * running_var

    elif deterministic:
        # use stored mean and var
        mean = self.mean
        var = self.var
    else:
        # use this batch's mean and var
        mean = input.mean(self.axes, keepdims=True)
        var = input.var(self.axes, keepdims=True)

    mean = T.addbroadcast(mean, *self.axes)
    var = T.addbroadcast(var, *self.axes)
    normalized = (input - mean) / T.sqrt(var + self.epsilon)

    if self.return_stats:
        return [normalized, mean, var]
    else:
        return normalized
def get_weights(self, h_t, w_tm1, M_t, **kwargs):
    batch_size = self.heads[0].input_shape[0]  # QKFIX: Get the size of the batches from the 1st head
    num_heads = len(self.heads)
    k_t = self.nonlinearity_key(T.dot(h_t, self.W_hid_to_key) + self.b_hid_to_key)
    beta_t = self.nonlinearity_beta(T.dot(h_t, self.W_hid_to_beta) + self.b_hid_to_beta)
    g_t = self.nonlinearity_gate(T.dot(h_t, self.W_hid_to_gate) + self.b_hid_to_gate)
    # QKFIX: If the nonlinearity is softmax (which is usually the case), then the activations
    # need to be reshaped (T.nnet.softmax only accepts 2D inputs)
    try:
        s_t = self.nonlinearity_shift(T.dot(h_t, self.W_hid_to_shift) + self.b_hid_to_shift)
    except ValueError:
        shift_activation_t = T.dot(h_t, self.W_hid_to_shift) + self.b_hid_to_shift
        s_t = self.nonlinearity_shift(shift_activation_t.reshape((h_t.shape[0] * num_heads, self.num_shifts)))
        s_t = s_t.reshape(shift_activation_t.shape)
    gamma_t = self.nonlinearity_gamma(T.dot(h_t, self.W_hid_to_gamma) + self.b_hid_to_gamma)

    # Content Addressing (3.3.1)
    beta_t = T.addbroadcast(beta_t, 2)
    betaK = beta_t * similarities.cosine_similarity(k_t, M_t)
    w_c = lasagne.nonlinearities.softmax(betaK.flatten(ndim=2))
    w_c = w_c.reshape(betaK.shape)

    # Interpolation (3.3.2)
    g_t = T.addbroadcast(g_t, 2)
    w_g = g_t * w_c + (1. - g_t) * w_tm1

    # Convolutional Shift (3.3.2)
    # NOTE: This library is using a flat (zero-padded) convolution instead of the circular
    # convolution from the original paper. In practice, this change has a minimal impact.
    w_g_padded = w_g.reshape((h_t.shape[0] * num_heads, self.memory_shape[0])).dimshuffle(0, 'x', 'x', 1)
    conv_filter = s_t.reshape((h_t.shape[0] * num_heads, self.num_shifts)).dimshuffle(0, 'x', 'x', 1)
    pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
    w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
    convolution = T.nnet.conv2d(
        w_g_padded, conv_filter,
        input_shape=(None if batch_size is None else batch_size * num_heads,
                     1, 1, self.memory_shape[0] + pad[0] + pad[1]),
        filter_shape=(None if batch_size is None else batch_size * num_heads,
                      1, 1, self.num_shifts),
        subsample=(1, 1),
        border_mode='valid')
    w_tilde = convolution[T.arange(h_t.shape[0] * num_heads), T.arange(h_t.shape[0] * num_heads), 0, :]
    w_tilde = w_tilde.reshape((h_t.shape[0], num_heads, self.memory_shape[0]))

    # Sharpening (3.3.2)
    gamma_t = T.addbroadcast(gamma_t, 2)
    w = T.pow(w_tilde + 1e-6, gamma_t)
    w /= T.sum(w, axis=2).dimshuffle(0, 1, 'x')

    return w
def output(self, input):
    conv_out = conv.conv2d(
        input,
        self.conv_w,
        filter_shape=self.filter_shape,
        image_shape=self.image_shape
    )
    conv_b = T.addbroadcast(self.conv_b, 1, 2)
    conv_out = conv_out + conv_b  # add bias

    pool_out = downsample.max_pool_2d(
        conv_out,
        (self.poolsize, self.poolsize),
        ignore_border=True
    )
    pool_w = T.addbroadcast(self.pool_w, 1, 2)
    pool_b = T.addbroadcast(self.pool_b, 1, 2)
    pool_out = pool_out * pool_w + pool_b

    # use a float ratio so the 2/3 scaling is not lost to integer division
    return 1.7159 * T.tanh(2.0 / 3.0 * pool_out)
def sample_joint(self, sp):
    t2_samp = self.theano_rng.multinomial(pvals=T.reshape(self.weights_now, (1, self.npcl))).T
    s2_samp = T.cast(T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

    diffs = (s2_samp - sp)
    abs_term = T.sum(T.abs_(diffs) / self.b, axis=1)
    alpha = T.exp(-abs_term)
    probs_unnorm = self.weights_past * alpha
    probs = probs_unnorm / T.sum(probs_unnorm)

    t1_samp = self.theano_rng.multinomial(pvals=T.reshape(probs, (1, self.npcl))).T
    s1_samp = T.cast(T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

    return [s1_samp, s2_samp]
def get_uhs_operator(uhs, depth, n_hidden, rhos):
    """
    :param uhs:
    :param depth:
    :param n_hidden:
    :param rhos: can be shared variable or constant of shape (depth, )!!
    :return:
    """
    # Will use a Fourier matrix (will be O(n^2)...)
    # Doesn't seem to slow things down much though!
    exp_phases = [T.cos(uhs), T.sin(uhs)]
    neg_exp_phases = [T.cos(uhs[:, ::-1]), -T.sin(uhs[:, ::-1])]
    ones_ = [T.ones((depth, 1), dtype=theano.config.floatX),
             T.zeros((depth, 1), dtype=theano.config.floatX)]

    rhos_reshaped = T.reshape(rhos, (depth, 1), ndim=2)
    rhos_reshaped = T.addbroadcast(rhos_reshaped, 1)

    eigvals_re = rhos_reshaped * T.concatenate((ones_[0], exp_phases[0], -ones_[0], neg_exp_phases[0]), axis=1)
    eigvals_im = rhos_reshaped * T.concatenate((ones_[1], exp_phases[1], -ones_[1], neg_exp_phases[1]), axis=1)
    phase_array = -2 * np.pi * np.outer(np.arange(n_hidden), np.arange(n_hidden)) / n_hidden
    f_array_re_val = np.cos(phase_array) / n_hidden
    f_array_im_val = np.sin(phase_array) / n_hidden
    f_array_re = theano.shared(f_array_re_val.astype(theano.config.floatX), name="f_arr_re")
    f_array_im = theano.shared(f_array_im_val.astype(theano.config.floatX), name="f_arr_im")

    a_k = T.dot(eigvals_re, f_array_re) + T.dot(eigvals_im, f_array_im)
    uhs_op = rep_vec(a_k, n_hidden, n_hidden)  # shape (depth, 2 * n_hidden - 1)

    return uhs_op
def create_gradients(self, loss, deterministic=False):
    # load networks
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network

    # load params
    p_params = lasagne.layers.get_all_params(
        [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True)
    qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True)
    qz_params = lasagne.layers.get_all_params(l_qz, trainable=True)
    cv_params = lasagne.layers.get_all_params(l_cv, trainable=True)

    # load neural net outputs (probabilities have been precomputed)
    log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz
    log_qza_given_x = self.log_qza_given_x
    log_qz_given_x = self.log_qz_given_x
    log_qz_given_x_dgz = self.log_qz_given_x_dgz
    cv = T.addbroadcast(lasagne.layers.get_output(l_cv), 1)

    # compute learning signals
    l0 = log_px_given_z + log_pz - log_qz_given_x - cv  # NOTE: this didn't have q(a)
    l_avg, l_var = l0.mean(), l0.var()
    c_new = 0.8*c + 0.2*l_avg
    v_new = 0.8*v + 0.2*l_var
    l = (l0 - c_new) / T.maximum(1, T.sqrt(v_new))
    l_target = (l0 - c_new) / T.maximum(1, T.sqrt(v_new))

    # compute grad wrt p
    p_grads = T.grad(-log_pxz.mean(), p_params)

    # compute grad wrt q_a
    elbo = T.mean(log_pxz - log_qza_given_x)
    qa_grads = T.grad(-elbo, qa_params)

    # compute grad wrt q_z
    qz_target = T.mean(dg(l_target) * log_qz_given_x_dgz)
    qz_grads = T.grad(-0.2*qz_target, qz_params)  # 5x slower rate for q

    # compute grad of cv net
    cv_target = T.mean(l0**2)
    cv_grads = [0.2*g for g in T.grad(cv_target, cv_params)]

    # combine and clip gradients
    clip_grad = 1
    max_norm = 5
    grads = p_grads + qa_grads + qz_grads + cv_grads
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]

    return cgrads
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, w_hidden_hidden, b_act,
                   ln_s1, ln_b1, ln_s2, ln_b2):
    pre_w_sig = T.dot(pre_out_sig, w_hidden_hidden)
    inner_act = self.activation

    pre_w_sig_ln = self.ln(pre_w_sig, ln_b1, ln_s1)
    cur_w_in_sig_ln = self.ln(cur_w_in_sig, ln_b2, ln_s2)

    out_sig = inner_act(T.add(cur_w_in_sig_ln, pre_w_sig_ln, b_act))

    mask = T.addbroadcast(mask, 1)
    out_sig_m = mask * out_sig + (1. - mask) * pre_out_sig
    return [out_sig_m]
def sample_joint(self, sp):
    t2_samp = self.theano_rng.multinomial(
        pvals=T.reshape(self.weights_now, (1, self.npcl))).T
    s2_samp = T.cast(
        T.sum(self.s_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')
    h2_samp = T.cast(
        T.sum(self.h_now * T.addbroadcast(t2_samp, 1), axis=0), 'float32')

    diffs = self.b * (s2_samp - sp)
    sqr_term = T.sum(diffs**2, axis=1)
    alpha = T.exp(-sqr_term)
    probs_unnorm = self.weights_past * alpha
    probs = probs_unnorm / T.sum(probs_unnorm)

    t1_samp = self.theano_rng.multinomial(
        pvals=T.reshape(probs, (1, self.npcl))).T
    s1_samp = T.cast(
        T.sum(self.s_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')
    h1_samp = T.cast(
        T.sum(self.h_past * T.addbroadcast(t1_samp, 1), axis=0), 'float32')

    return [s1_samp, h1_samp, s2_samp, h2_samp]
def setup(self, bottom, top):
    attention = T.tensor4("attention")
    input = T.tensor4("input")
    v = T.matrix("v")

    attention_bc = T.addbroadcast(attention, 1)
    attended = T.mul(input, attention_bc)
    result = T.sum(attended, axis=(2, 3))
    result_g_attention, result_g_input = T.Lop(result, [attention, input], v)

    self.f = theano.function([attention, input], result)
    self.b_attention = theano.function([attention, input, v], result_g_attention)
    # return the gradient w.r.t. the input here, not the attention gradient
    self.b_input = theano.function([attention, input, v], result_g_input)
def keep_max(input, theta, k):
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    #sig_input = T.dot(input, theta)
    if k == 0:
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
def __call__(self, input, input_lm=None, h0=None, c0=None):
    # take the batch size from `input` so the default states also work when no mask is given
    batch_size = input.shape[0]
    if h0 is None:
        h0 = T.alloc(np.asarray(0., dtype=theano.config.floatX), batch_size, self.n_hidden)
    if c0 is None:
        c0 = T.alloc(np.asarray(0., dtype=theano.config.floatX), batch_size, self.n_hidden)

    if input_lm is None:
        def step(x_t, h_tm_prev, c_tm_prev):
            x_i = T.dot(x_t, self.W_i) + self.b_i
            x_f = T.dot(x_t, self.W_f) + self.b_f
            x_c = T.dot(x_t, self.W_c) + self.b_c
            x_o = T.dot(x_t, self.W_o) + self.b_o

            i_t = self.inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
            f_t = self.inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
            c_t = f_t * c_tm_prev + i_t * self.activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
            o_t = self.inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
            h_t = o_t * self.activation(c_t)  # actual hidden state
            return [h_t, c_t]

        self.h_1, _ = theano.scan(step,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=[h0, c0])
    else:
        def step(x_t, mask, h_tm_prev, c_tm_prev):
            x_i = T.dot(x_t, self.W_i) + self.b_i
            x_f = T.dot(x_t, self.W_f) + self.b_f
            x_c = T.dot(x_t, self.W_c) + self.b_c
            x_o = T.dot(x_t, self.W_o) + self.b_o

            i_t = self.inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
            f_t = self.inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
            c_t = f_t * c_tm_prev + i_t * self.activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
            o_t = self.inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
            h_t = o_t * self.activation(c_t)  # actual hidden state

            h_t = mask * h_t + (1 - mask) * h_tm_prev
            c_t = mask * c_t + (1 - mask) * c_tm_prev
            return [h_t, c_t]

        self.h_1, _ = theano.scan(step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=[h0, c0])

    self.h_1 = self.h_1[0].dimshuffle(1, 0, 2)
    return self.h_1[:, -1, :]
def get_padded_shuffled_mask(self, mask, X, pad=0):
    # mask is (nb_samples, time)
    if mask is None:
        mask = T.ones((X.shape[0], X.shape[1]))

    mask = T.shape_padright(mask)    # (nb_samples, time, 1)
    mask = T.addbroadcast(mask, -1)  # the new dimension (the '1') is made broadcastable
    mask = mask.dimshuffle(1, 0, 2)  # (time, nb_samples, 1)

    if pad > 0:
        # left-pad in time with 0
        padding = alloc_zeros_matrix(pad, mask.shape[1], 1)
        mask = T.concatenate([padding, mask], axis=0)
    return mask.astype('int8')
def __init__(self, attended_layer, attended_layer_mask, condition_layer,
             gate_covariance=False, covariance_decay=None, name=None):
    MergeLayer.__init__(self,
                        [attended_layer, attended_layer_mask, condition_layer],
                        name=name)
    self.gate_covariance = gate_covariance
    self.covariance_decay = covariance_decay
    if gate_covariance:
        n_units = attended_layer.output_shape[-1]
        self.w_gate = self.add_param(init.Constant(0.0), (n_units,), name="gate")
        self.b_gate = self.add_param(init.Constant(1.0), (1,), name="gate")
        self.b_gate = T.addbroadcast(self.b_gate, 0)
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    dot_product = T.dot(output, self.t_w_out)
    linear_o = T.add(dot_product, self.t_b_out)

    mask = T.addbroadcast(mask, 2)  # TODO: is this necessary?
    output = T.mul(mask, linear_o) + T.mul((1. - mask), 1e-6)

    return output  # result
def apply(self, char_seq, sample_matrix, char_aux):
    # Time as first dimension
    embeddings = self.lookup.apply(char_seq)
    gru_out = self.dgru.apply(**merge(
        self.gru_fork.apply(embeddings, as_dict=True),
        {'mask': char_aux}))
    wgru_out = tensor.exp(
        self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

    if self.dgru_depth > 1:
        gru_out = gru_out[-1]

    gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
    sampled_representation = tensor.tanh(
        tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
    return sampled_representation.dimshuffle([1, 0, 2]), wgru_out
def get_padded_shuffled_mask(self, train, X, pad=0):
    mask = self.get_input_mask(train)
    if mask is None:
        mask = T.ones_like(X.sum(axis=-1))  # is there a better way to do this without a sum?

    # mask is (nb_samples, time)
    mask = T.shape_padright(mask)    # (nb_samples, time, 1)
    mask = T.addbroadcast(mask, -1)  # the new dimension (the '1') is made broadcastable
    mask = mask.dimshuffle(1, 0, 2)  # (time, nb_samples, 1)

    if pad > 0:
        # left-pad in time with 0
        padding = alloc_zeros_matrix(pad, mask.shape[1], 1)
        mask = T.concatenate([padding, mask], axis=0)
    return mask.astype('int8')
def output(self, input_scalars):
    """
    Computes the n_output output scalars

    @param input_scalars: the layer's input
    @return: n_output scalars
    """
    z = T.dot(input_scalars, self.W) + T.addbroadcast(self.b, 0)
    if self.activation == 'linear':
        return z
    elif self.activation == 'rectified':
        return T.maximum(z, 0)
    elif self.activation == 'tanh':
        return T.tanh(z)
    else:
        raise ValueError("Invalid activation %s" % self.activation)
def get_output_for(self, inputs, deterministic=False, **kwargs):
    event = inputs[0]          # (None, 1000, embed)
    feature_idx = inputs[1]    # (None, 1000, feature_num, embed)
    feature_b = inputs[2]      # (None, 1000, feature_num, 1)
    feature_trans = inputs[3]  # (None, 1000, feature_num, 1)
    feature_value = inputs[4]  # (None, 1000, feature_num)

    value_up = T.shape_padright(feature_value, 1)  # (None, 1000, feature_num, 1)
    bias_value = feature_trans * (value_up + feature_b)
    bias_value_broad = T.addbroadcast(bias_value, 3)  # make the last axis broadcastable
    v_idx = T.sum(feature_idx * lasagne.nonlinearities.tanh(bias_value_broad), axis=2)  # (None, 1000, embed)
    return v_idx + event
def RBM_Free_Energy(x, y):
    # make input data binary
    data = makeBinary(x)

    # determine initial params
    (W_init, b_v_init, b_h_init) = initParams(x.shape[0], NUM_HID)
    W = theano.shared(W_init, name='W')
    b_v = theano.shared(b_v_init.reshape(b_v_init.shape[0], 1), name='b_v')
    b_h = theano.shared(b_h_init.reshape(b_h_init.shape[0], 1), name='b_h')

    # compute free energy
    v = T.matrix('v')
    F = -T.dot(T.flatten(b_v, 1), v)\
        - T.sum(T.log(1.0 + T.exp(T.addbroadcast(b_h, 1) + T.dot(W, v))), axis=0)
    free_energy = theano.function([v], F.sum())
    value = free_energy(data)
    print '          Total Free Energy =', value

    # approximate expected free energy
    # using k=1 contrastive divergence
    rng = RandomStreams(RANDOM_SEED)
    h_0_mean = 1.0 / (1.0 + T.exp(-T.addbroadcast(b_h, 1) - T.dot(W, v)))
    h_0 = rng.binomial(size=h_0_mean.shape, n=1, p=h_0_mean)
    v_0_mean = 1.0 / (1.0 + T.exp(-T.addbroadcast(b_v, 1) - T.dot(W.T, h_0)))
    v_0 = rng.binomial(size=v_0_mean.shape, n=1, p=v_0_mean)
    F_exp = -T.dot(T.flatten(b_v, 1), v_0)\
            - T.sum(T.log(1.0 + T.exp(T.addbroadcast(b_h, 1) + T.dot(W, v_0))), axis=0)
    exp_free_energy = theano.function([v], F_exp.sum())
    value = exp_free_energy(data)
    print 'Estimated Expected Free Energy =', value

    # compute param deltas
    dParams = T.grad(F.sum() - F_exp.sum(), [W, b_v, b_h], consider_constant=[v_0])
    dParams_func = theano.function([v], dParams)
    value = dParams_func(data)
def __init__(self, incoming, n_codewords=24, V=lasagne.init.Normal(0.1),
             gamma=lasagne.init.Constant(0.1), eps=0.00001, input_var=None,
             initializers=None, spatial_level=1, **kwargs):
    """
    Creates a BoF layer

    :param incoming:
    :param n_codewords: number of codewords
    :param V: initializer used for the codebook
    :param gamma: initializer used for the scaling factors
    :param eps: epsilon used to ensure numerical stability
    :param input_var: input_var of the model (used to compile a function that
        extracts the features fed to this layer)
    :param initializers:
    :param spatial_level: 0 (no spatial segmentation), 1 (first spatial level)
    :param pooling_type: either 'mean' or 'max'
    :param kwargs:
    """
    super(CBoF_Layer, self).__init__(incoming, **kwargs)

    self.n_codewords = n_codewords
    self.spatial_level = spatial_level
    n_filters = self.input_shape[1]
    self.eps = eps

    # Create parameters
    self.V = self.add_param(V, (n_codewords, n_filters, 1, 1), name='V')
    self.gamma = self.add_param(gamma, (1, n_codewords, 1, 1), name='gamma')

    # Make gammas broadcastable
    self.gamma = T.addbroadcast(self.gamma, 0, 2, 3)

    # Compile function used for feature extraction
    if input_var is not None:
        self.features_fn = theano.function([input_var],
                                           lasagne.layers.get_output(incoming, deterministic=True))

    if initializers is not None:
        initializers.append(self.initialize_layer)
def get_padded_shuffled_mask(self, train, X, pad=0):
    mask = self.get_input_mask(train)
    if mask is None:
        mask = T.ones_like(X.sum(axis=-1))  # is there a better way to do this without a sum?

    # mask is (nb_samples, time)
    mask = T.shape_padright(mask)    # (nb_samples, time, 1)
    mask = T.addbroadcast(mask, -1)  # the new dimension (the '1') is made broadcastable
    # see http://deeplearning.net/software/theano/library/tensor/basic.html#broadcasting-in-theano-vs-numpy
    mask = mask.dimshuffle(1, 0, 2)  # (time, nb_samples, 1)

    if pad > 0:
        # left-pad in time with 0
        padding = alloc_zeros_matrix(pad, mask.shape[1], 1)
        mask = T.concatenate([padding, mask], axis=0)
    return mask.astype('int8')
def permute_dimensions(x, pattern):
    '''Transpose dimensions.

    pattern should be a tuple or list of
    dimension indices, e.g. [0, 2, 1].
    '''
    if len(pattern) < x.ndim:  # [DV] handle the case that one dimension is to be dropped
        bcaxis = []
        for i in range(x.ndim):
            if i not in pattern:
                bcaxis.append(i)
        x = T.addbroadcast(x, *bcaxis)
    pattern = tuple(pattern)
    return x.dimshuffle(pattern)
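# Hedged usage sketch (added for illustration, not part of the original source):
# dropping an axis with the permute_dimensions helper above. Theano's dimshuffle
# can only drop broadcastable axes, which is why the helper calls T.addbroadcast
# on every axis missing from the pattern (that axis must have length 1 at runtime).
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')                  # e.g. (batch, 1, features)
y = permute_dimensions(x, (0, 2))   # drop the middle axis -> (batch, features)
f = theano.function([x], y)
print(f(np.zeros((5, 1, 7), dtype=theano.config.floatX)).shape)   # (5, 7)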
def __call__(self, input, input_lm=None, return_list=False):
    # activation function
    if input_lm is None:
        self.h_l, _ = theano.scan(self.step2,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=theano.shared(
                                      value=np.zeros((self.batch_size, self.n_hidden),
                                                     dtype=theano.config.floatX),
                                      borrow=True))
    else:
        self.h_l, _ = theano.scan(self.step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=theano.shared(
                                      value=np.zeros((self.batch_size, self.n_hidden),
                                                     dtype=theano.config.floatX),
                                      borrow=True))
    self.h_l = self.h_l.dimshuffle(1, 0, 2)
    if return_list:
        return self.h_l
    return self.h_l[:, -1, :]
def get_attention(Wg, bg, M, w):
    g_t = T.nnet.sigmoid(T.dot(x_t, Wg) + bg)  # [instances, mem]

    # eqn 11
    k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

    # eqn 13
    beta = T.dot(h_tm1, self.Wb) + self.bb
    beta = T.log(1 + T.exp(beta))
    beta = T.addbroadcast(beta, 1)  # [instances, 1]

    # eqn 12
    w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

    # eqn 14
    return (1 - g_t) * w + g_t * w_hat  # [instances, mem]
def get_output_for(self, inputs, **kwargs):
    inputs = autocrop(inputs, self.cropping)
    # modify broadcasting pattern.
    if self.broadcastable is not None:
        for n, broadcasting_dim in enumerate(self.broadcastable):
            for dim, broadcasting in enumerate(broadcasting_dim):
                if broadcasting:
                    inputs[n] = T.addbroadcast(inputs[n], dim)

    output = None
    for input in inputs:
        if output is not None:
            output = self.merge_function(output, input)
        else:
            output = input
    return output
def __call__(self, input, input_lm=None, h0=None):
    batch_size = input.shape[0]
    if h0 is None:
        h0 = T.alloc(np.asarray(0., dtype=theano.config.floatX), batch_size, self.n_hidden)

    if input_lm is None:
        def step(x_t, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W_h) + self.b_h
            z_t = self.inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = self.inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
            hh_t = self.activation(x_h + T.dot(r_t * h_tm_prev, self.U_h))
            h_t = (1 - z_t) * hh_t + z_t * h_tm_prev
            h_t = T.cast(h_t, dtype=theano.config.floatX)
            return h_t

        self.output, _ = theano.scan(step,
                                     sequences=input.dimshuffle(1, 0, 2),
                                     outputs_info=h0)
    else:
        def step(x_t, mask, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W_h) + self.b_h
            z_t = self.inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = self.inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
            hh = self.activation(x_h + T.dot(r_t * h_tm_prev, self.U_h))
            h_t = z_t * h_tm_prev + (1 - z_t) * hh
            h_t = mask * h_t + (1 - mask) * h_tm_prev
            h_t = T.cast(h_t, dtype=theano.config.floatX)
            return h_t

        self.output, _ = theano.scan(
            step,
            sequences=[input.dimshuffle(1, 0, 2),
                       T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
            outputs_info=h0)

    h = self.output  # [max_length, batch_size, hidden_size]
    h = h.dimshuffle(1, 0, 2)
    return h, h[:, -1, :]
def output(self, input_vectors, input_scalars):
    """
    Calculate the n_output transformed vectors for this layer

    @param input_scalars: n_input x n_output scalar vector
    @param input_vectors: n_input vectors (actual shape should be
        (n_batch, n_input, n_dimension))
    """
    mat = input_scalars.reshape((n_batch, self.n_input, self.n_output))
    z = T.batched_tensordot(input_vectors, mat, [[1], [1]]).swapaxes(1, 2) + T.addbroadcast(self.b, 0, 2)
    if self.activation == 'linear':
        return z
    elif self.activation == 'rectified':
        return T.maximum(z, 0)
    elif self.activation == 'tanh':
        return T.tanh(z)
    else:
        raise ValueError("Unknown activation, %s" % self.activation)
def make_network_multiscale(network_type, loss_function, lr, n_scales,
                            net_options, do_clip=True, make_pr_func=False):
    target_var = T.matrix('targets')
    lr_var = theano.shared(np.array(lr, dtype=floatX))

    print("Building model and compiling functions...")
    if n_scales >= 1:
        input_var_list = [T.tensor4('inputs{}'.format(i)) for i in range(n_scales)]
        network = getattr(network_design, network_type)(input_var_list, **net_options)
    else:
        # if the network requires input_var not a list, set n_sources=-1
        input_var_list = [T.addbroadcast(T.tensor4('inputs{}'.format(i))) for i in range(1)]
        network = getattr(network_design, network_type)(input_var_list[0], **net_options)

    # Compute loss
    prediction = lasagne.layers.get_output(network)
    if do_clip:
        prediction = T.clip(prediction, epsilon, one - epsilon)
    loss = loss_function(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adagrad(loss, params, learning_rate=lr_var)

    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    if do_clip:
        test_prediction = T.clip(test_prediction, epsilon, one - epsilon)
    test_loss = loss_function(test_prediction, target_var)
    test_loss = test_loss.mean()

    train_func = theano.function(input_var_list + [target_var], loss, updates=updates)
    val_func = theano.function(input_var_list + [target_var], [test_prediction, test_loss])

    if make_pr_func:
        pr_func = theano.function(input_var_list, test_prediction)
        return network, input_var_list, lr_var, train_func, val_func, pr_func
    else:
        return network, input_var_list, lr_var, train_func, val_func
def do_apply(self, input_):
    X = input_
    naxes = self.naxes
    broadcast_n = T.addbroadcast(self.n, 0)

    if naxes == 4:  # CNN
        if self.use_population:
            u = self.u / broadcast_n
        else:
            u = T.mean(X, axis=[0, 2, 3])
        b_u = u.dimshuffle('x', 0, 'x', 'x')
        if self.use_population:
            s = self.s / broadcast_n
        else:
            s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3])
        X = (X - b_u) / T.sqrt(s.dimshuffle('x', 0, 'x', 'x') + self.e)
        X = self.g.dimshuffle('x', 0, 'x', 'x') * X +\
            self.b.dimshuffle('x', 0, 'x', 'x')
    elif naxes == 3:  # RNN
        if self.use_population:
            u = self.u / broadcast_n
        else:
            u = T.mean(X, axis=[0, 1])
        b_u = u.dimshuffle('x', 'x', 0)
        if self.use_population:
            s = self.s / broadcast_n
        else:
            s = T.mean(T.sqr(X - b_u), axis=[0, 1])
        X = (X - b_u) / T.sqrt(s.dimshuffle('x', 'x', 0) + self.e)
        X = self.g.dimshuffle('x', 'x', 0) * X +\
            self.b.dimshuffle('x', 'x', 0)
    elif naxes == 2:  # FC
        if self.use_population:
            u = self.u / broadcast_n
        else:
            u = T.mean(X, axis=0)
        if self.use_population:
            s = self.s / broadcast_n
        else:
            s = T.mean(T.sqr(X - u), axis=0)
        X = (X - u) / T.sqrt(s + self.e)
        X = self.g * X + self.b
    else:
        raise NotImplementedError
    return X, u, s
def output(self, input_raw):
    input = input_raw

    lin_output = T.dot(input, self.W) + self.b

    if self.batch_norm:
        lin_output = (lin_output - T.mean(lin_output, axis=0, keepdims=True)) / (
            1.0 + T.std(lin_output, axis=0, keepdims=True))
        lin_output = (lin_output * T.addbroadcast(self.bn_std, 0)
                      + T.addbroadcast(self.bn_mean, 0))

    if self.layer_norm:
        lin_output = (lin_output - T.mean(lin_output, axis=1, keepdims=True)) / (
            1.0 + T.std(lin_output, axis=1, keepdims=True))
        lin_output = (lin_output * T.addbroadcast(self.bn_std, 0)
                      + T.addbroadcast(self.bn_mean, 0))

    if self.norm_prop:
        lin_output = lin_output / T.sqrt(T.mean(T.sqr(lin_output), axis=0))
        lin_output = (lin_output * T.addbroadcast(self.bn_std, 0)
                      + T.addbroadcast(self.bn_mean, 0))

    clip_preactive = True
    if clip_preactive:
        lin_output = theano.tensor.clip(lin_output, -10, 10)

    self.out_store = lin_output

    if self.activation is None:
        activation = lambda x: x
    elif self.activation == "relu":
        activation = lambda x: T.maximum(0.0, x)
    elif self.activation == "lrelu":
        activation = lambda x: T.nnet.relu(x, alpha=0.02)
    elif self.activation == "exp":
        activation = lambda x: T.exp(x)
    elif self.activation == "tanh":
        activation = lambda x: T.tanh(x)
    elif self.activation == 'softplus':
        activation = lambda x: T.nnet.softplus(x)
    elif self.activation == 'sigmoid':
        activation = lambda x: T.nnet.sigmoid(x)
    else:
        raise Exception("Activation not found")

    out = activation(lin_output)

    return out
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    dot_product = T.dot(output, self.t_w_out)
    net_o = T.add(dot_product, self.t_b_out)

    ex_net = T.exp(net_o)
    sum_net = T.sum(ex_net, axis=2, keepdims=True)
    softmax_o = ex_net / sum_net

    mask = T.addbroadcast(mask, 2)  # TODO: is this necessary?
    output = T.mul(mask, softmax_o) + T.mul((1. - mask), 1e-6)

    return output  # result
def get_inverse_for(self, input, **kwargs):
    C_fft = T.addbroadcast(theano.tensor.fft.rfft(self.C_pad), 0)
    C_fft_norm2 = C_fft[:, :, 0]**2 + C_fft[:, :, 1]**2
    C_fft_inv = C_fft
    C_fft_inv = T.set_subtensor(C_fft_inv[:, :, 0], C_fft[:, :, 0] / C_fft_norm2)
    C_fft_inv = T.set_subtensor(C_fft_inv[:, :, 1], -C_fft[:, :, 1] / C_fft_norm2)

    z_fft = theano.tensor.fft.rfft(input)
    Cz_fft = z_fft
    Cz_fft = T.set_subtensor(
        Cz_fft[:, :, 0],
        z_fft[:, :, 0] * C_fft_inv[:, :, 0] - z_fft[:, :, 1] * C_fft_inv[:, :, 1])
    Cz_fft = T.set_subtensor(
        Cz_fft[:, :, 1],
        z_fft[:, :, 0] * C_fft_inv[:, :, 1] + z_fft[:, :, 1] * C_fft_inv[:, :, 0])
    rlt = theano.tensor.fft.irfft(Cz_fft)
    return rlt
def g_deconv(self, z_ver, in_dims, out_dims, weight_name, fspec):
    """ Inverse operation for each type of f used in convnets """
    f_type, f_dims = fspec
    assert z_ver is not None
    num_channels = in_dims[0] if in_dims is not None else None
    num_filters, width, height = out_dims[:3]

    if f_type in ['globalmeanpool']:
        u = T.addbroadcast(z_ver, 2, 3)
        assert in_dims[1] == 1 and in_dims[2] == 1, \
            "global pooling needs in_dims (1,1): %s" % str(in_dims)

    elif f_type in ['maxpool']:
        sh, str, size = z_ver.shape, f_dims[0], f_dims[1]
        assert str == size, "depooling requires stride == size"
        u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), dtype=z_ver.dtype)
        for x in xrange(str):
            for y in xrange(str):
                u = T.set_subtensor(u[:, :, x::str, y::str], z_ver)
        u = u[:, :, :width, :height]

    elif f_type in ['convv', 'convf']:
        filter_size, str = (f_dims[1], f_dims[1]), f_dims[2]
        W_shape = (num_filters, num_channels) + filter_size
        W = self.weight(self.rand_init_conv(W_shape), weight_name)
        if str > 1:
            # upsample if strided version
            sh = z_ver.shape
            u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), dtype=z_ver.dtype)
            u = T.set_subtensor(u[:, :, ::str, ::str], z_ver)
        else:
            u = z_ver  # no strides, only deconv
        u = conv2d(u, W, filter_shape=W_shape,
                   border_mode='valid' if 'convf' in f_type else 'full')
        u = u[:, :, :width, :height]
    else:
        raise NotImplementedError('Layer %s has no convolutional decoder' % f_type)
    return u
def outputs(self, groundtruth, groundtruth_mask, **inputs):
    # Copy-pasted from all_outputs, because Theano does not support ellipsis
    outputs = self.merge(**dict_subset(inputs, self.merge_names))
    if self.value_softmax:
        logger.debug('Applying value softmax')
        outputs = (tensor.addbroadcast(outputs[:, :1], 1)
                   + self.softmax.apply(outputs[:, 1:]))
    if self.same_value_for_wrong:
        logger.debug('Same value for apriori wrong actions')
        wrong_output = outputs[:, 0]
        outputs = outputs[:, 1:]
        indices = tensor.repeat(
            tensor.arange(groundtruth.shape[1]), groundtruth.shape[0])
        wrong_mask = tensor.ones_like(outputs)
        wrong_mask = tensor.set_subtensor(
            wrong_mask[indices, groundtruth.T.flatten()], 0)
        outputs = (outputs * (1 - wrong_mask)
                   + wrong_output[:, None] * wrong_mask)
    return outputs
def __init__(self, incoming, kernel_shape=[5, 5], input_shape=[50, 50],
             C=lasagne.init.Normal(0.01), **kwargs):
    super(CircMatLayerSparse2D, self).__init__(incoming, **kwargs)
    num_inputs = self.input_shape[1]
    self.kernel_shape = kernel_shape
    self.input_shape = input_shape

    self.C = self.add_param(C, (1, kernel_shape[0] * kernel_shape[1]), name='C')
    self.C = T.addbroadcast(self.C, 0)

    #self.C_pad = self.C.reshape(kernel_shape, ndim=2)
    self.C_pad = T.zeros(input_shape)
    self.C_pad = T.set_subtensor(
        self.C_pad[:kernel_shape[0], :kernel_shape[1]],
        self.C.reshape(kernel_shape, ndim=2))
    self.C_pad = self.C_pad.reshape([1, input_shape[0] * input_shape[1]])