def output(self, x, index_selection_func=None):
    if self.n_out > 1:
        iWin = self.k
        if self.n_in == 1:
            iWin = 1

        rnd_proj = T.dot(
            x.reshape((x.shape[0], x.shape[1] * x.shape[2])),
            self.rand_proj_mat
        )

        if index_selection_func is not None:
            self.out_idxs = index_selection_func(rnd_proj)
        else:
            self.out_idxs = T.argsort(rnd_proj)
        self.out_idxs = T.sort(self.out_idxs[:, -self.k:])
        # self.out_idxs.set_value(
        #     np.random.randint(0, self.n_out, (self.batch_size, self.k))
        # )

    sparse = sparse_block_dot_SS(
        self.W,
        x,
        self.in_idxs,
        self.b,
        self.out_idxs
    )

    return (sparse if self.activation is None
            else self.activation(sparse))
def _pooling_function(self, inputs, pool_size, strides, border_mode, dim_ordering):
    if pool_size[0] < -1:
        # k-max pooling
        input_layer = T.transpose(inputs, axes=(0, 1, 3, 2))
        sorted_values = T.argsort(input_layer, axis=3)
        topmax_indexes = sorted_values[:, :, :, -self.k:]
        # sort indexes so that we keep the correct order within the sentence
        topmax_indexes_sorted = T.sort(topmax_indexes)

        # given that topmax only gives the index of the third dimension,
        # we need to generate the other 3 dimensions
        dim0 = T.arange(0, input_layer.shape[0]).repeat(
            input_layer.shape[1] * input_layer.shape[2] * self.k)
        dim1 = T.arange(0, input_layer.shape[1]).repeat(
            self.k * input_layer.shape[2]).reshape((1, -1)).repeat(
            input_layer.shape[0], axis=0).flatten()
        dim2 = T.arange(0, input_layer.shape[2]).repeat(self.k).reshape((1, -1)).repeat(
            input_layer.shape[0] * input_layer.shape[1], axis=0).flatten()
        dim3 = topmax_indexes_sorted.flatten()

        x = T.transpose(
            input_layer[dim0, dim1, dim2, dim3].reshape(
                (input_layer.shape[0], input_layer.shape[1],
                 input_layer.shape[2], self.k)),
            axes=(0, 1, 3, 2))
        return x
    else:
        return super(MaxPooling2DWrapper, self)._pooling_function(
            inputs, pool_size, strides, border_mode, dim_ordering)
def keep_max(input, theta, k, sent_mask):
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    sent_mask = sent_mask.dimshuffle(0, 'x', 1, 'x')
    sig_input = sig_input * sent_mask
    # sig_input = T.dot(input, theta)
    if k == 0:  # use all the sentences
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()

    # build a mask that keeps only the top-k sentence scores
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
def compute_probabilistic_matrix(self, X, y, num_cases, k=5):
    z = T.dot(X, self.A)  # transform x into z space
    dists = T.sqr(dist2hy(z, z))
    dists = T.extra_ops.fill_diagonal(dists, T.max(dists) + 1)
    nv = T.min(dists, axis=1)  # value of nearest neighbour
    dists = (dists.T - nv).T
    d = T.extra_ops.fill_diagonal(dists, 0)

    # take only the k nearest
    num = T.zeros((num_cases, self.num_classes))
    denom = T.zeros((num_cases,))
    for c_i in xrange(self.num_classes):
        # mask for class i
        mask_i = T.eq(T.outer(T.ones_like(y), y), c_i)
        # k nearest neighbours within class i
        dim_ci = T.sum(mask_i[0])
        d_c_i = T.reshape(d[mask_i.nonzero()], (num_cases, dim_ci))
        k_indice = T.argsort(d_c_i, axis=1)[:, 0:k]
        kd = T.zeros((num_cases, k))
        for it in xrange(k):
            kd = T.set_subtensor(kd[:, it], d_c_i[T.arange(num_cases), k_indice[:, it]])
        # numerator
        value = T.exp(-T.mean(kd, axis=1))
        num = T.set_subtensor(num[:, c_i], value)
        denom += value

    p = num / denom.dimshuffle(0, 'x')  # prob that point i will be correctly classified
    return p
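# In equation form (a hedged reading of the code above): with dbar[i, c] the mean of
# the k smallest shifted squared distances from point i to points of class c,
#     p[i, c] = exp(-dbar[i, c]) / sum_c' exp(-dbar[i, c'])
# Tiny NumPy illustration of that normalisation step (values below are made up):
import numpy as np

dbar = np.array([[0.2, 1.5, 3.0]])           # mean k-NN distance of one point to 3 classes
num = np.exp(-dbar)
p = num / num.sum(axis=1, keepdims=True)
print p                                      # the closest class gets the highest probability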
def kmaxpooling_output(input):
    '''
    Implements k-max pooling:
      1. sort the values along the sentence axis
      2. keep the top-k values, in their original order
    :param k: the k highest values to keep
    :type k: int
    :return: the k-max pooled tensor
    '''
    input = T.transpose(input, axes=(0, 1, 3, 2))
    sorted_values = T.argsort(input, axis=3)
    topmax_indexes = sorted_values[:, :, :, -k:]
    # sort indexes so that we keep the correct order within the sentence
    topmax_indexes_sorted = T.sort(topmax_indexes)

    # given that topmax only gives the index of the third dimension,
    # we need to generate the other 3 dimensions
    dim0 = T.arange(0, input.shape[0]).repeat(input.shape[1] * input.shape[2] * k)
    dim1 = T.arange(0, input.shape[1]).repeat(k * input.shape[2]).reshape((1, -1)).repeat(
        input.shape[0], axis=0).flatten()
    dim2 = T.arange(0, input.shape[2]).repeat(k).reshape((1, -1)).repeat(
        input.shape[0] * input.shape[1], axis=0).flatten()
    dim3 = topmax_indexes_sorted.flatten()
    return T.transpose(
        input[dim0, dim1, dim2, dim3].reshape(
            (input.shape[0], input.shape[1], input.shape[2], k)),
        axes=(0, 1, 3, 2))
def top_k_pooling(matrix, sentlength_1, sentlength_2, Np):
    # tensor: (1, feature maps, 66, 66)
    # sentlength_1 = dim - left1 - right1
    # sentlength_2 = dim - left2 - right2
    # core = tensor[:, :, left1:(dim-right1), left2:(dim-right2)]
    '''
    repeat_row = Np / sentlength_1
    extra_row = Np % sentlength_1
    repeat_col = Np / sentlength_2
    extra_col = Np % sentlength_2
    '''
    # repeat core
    matrix_1 = repeat_whole_tensor(matrix, 5, True)
    matrix_2 = repeat_whole_tensor(matrix_1, 5, False)

    list_values = matrix_2.flatten()
    neighborsArgSorted = T.argsort(list_values)
    kNeighborsArg = neighborsArgSorted[-(Np**2):]
    top_k_values = list_values[kNeighborsArg]

    all_max_value = top_k_values.reshape((1, Np**2))
    return all_max_value
def get_best_sense(self, word, curr_sense, context_vector, W_s):
    scores_all_senses = T.dot(context_vector, W_s[word].T)
    sorted_senses = T.argsort(scores_all_senses)
    score_best = scores_all_senses[sorted_senses[-1]]
    score_second_best = scores_all_senses[sorted_senses[-2]]
    # only switch to the best-scoring sense if it beats the runner-up by epsilon
    new_sense = T.switch(T.gt(score_best - score_second_best, epsilon),
                         sorted_senses[-1],
                         curr_sense)
    return new_sense
def dynamic_kmaxPooling(self, curConv_out, k):
    neighborsForPooling = TSN.images2neibs(ten4=curConv_out,
                                           neib_shape=(1, curConv_out.shape[3]),
                                           mode='ignore_borders')
    self.neighbors = neighborsForPooling

    neighborsArgSorted = T.argsort(neighborsForPooling, axis=1)
    kNeighborsArg = neighborsArgSorted[:, -k:]
    # self.bestK = kNeighborsArg
    kNeighborsArgSorted = T.sort(kNeighborsArg, axis=1)

    ii = T.repeat(T.arange(neighborsForPooling.shape[0]), k)
    jj = kNeighborsArgSorted.flatten()
    pooledkmaxTmp = neighborsForPooling[ii, jj]
    new_shape = T.cast(T.join(0,
                              T.as_tensor([neighborsForPooling.shape[0]]),
                              T.as_tensor([k])),
                       'int64')
    pooledkmax_matrix = T.reshape(pooledkmaxTmp, new_shape, ndim=2)

    rightWidth = self.unifiedWidth - k
    right_padding = T.zeros((neighborsForPooling.shape[0], rightWidth),
                            dtype=theano.config.floatX)
    matrix_padded = T.concatenate([pooledkmax_matrix, right_padding], axis=1)

    # recover tensor form
    new_shape = T.cast(T.join(0,
                              curConv_out.shape[:-2],
                              T.as_tensor([curConv_out.shape[2]]),
                              T.as_tensor([self.unifiedWidth])),
                       'int64')
    curPooled_out = T.reshape(matrix_padded, new_shape, ndim=4)
    return curPooled_out
def link(self, input):
    self.input = input

    # select the lines where we apply k-max pooling
    neighbors_for_pooling = TSN.images2neibs(
        ten4=self.input,
        neib_shape=(self.input.shape[2], 1),  # we look at the max on every dimension
        mode='valid'  # 'ignore_borders'
    )

    neighbors_arg_sorted = T.argsort(neighbors_for_pooling, axis=1)
    k_neighbors_arg = neighbors_arg_sorted[:, -self.k_max:]
    k_neighbors_arg_sorted = T.sort(k_neighbors_arg, axis=1)

    ii = T.repeat(T.arange(neighbors_for_pooling.shape[0]), self.k_max)
    jj = k_neighbors_arg_sorted.flatten()
    flattened_pooled_out = neighbors_for_pooling[ii, jj]

    pooled_out_pre_shape = T.join(
        0,
        self.input.shape[:-2],
        [self.input.shape[3]],
        [self.k_max]
    )
    self.output = flattened_pooled_out.reshape(
        pooled_out_pre_shape,
        ndim=self.input.ndim
    ).dimshuffle(0, 1, 3, 2)
    return self.output
def keep_max(input, theta, k):
    """
    :type input: theano.tensor.tensor4
    :param input: the input data

    :type theta: theano.tensor.matrix
    :param theta: the parameter for the sigmoid function

    :type k: int
    :param k: the number of top sentences to keep
    """
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    if k == 0:  # using all the sentences
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:, :, -k:, :]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1 * dim2 * dim3)
    mapids = T.repeat(T.arange(dim1), dim2 * dim3).reshape((1, dim2 * dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0 * dim1 * dim2, axis=0).flatten()

    # construct masked data
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)

    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
def link(self, input):
    self.input = input.dimshuffle(0, 1, 3, 2)

    # get the indexes that give the max on every line and sort them
    ind = T.argsort(self.input, axis=3)
    sorted_ind = T.sort(ind[:, :, :, -self.k_max:], axis=3)
    dim0, dim1, dim2, dim3 = sorted_ind.shape

    # prepare indices for selection
    indices_dim0 = T.arange(dim0)\
        .repeat(dim1 * dim2 * dim3)
    indices_dim1 = T.arange(dim1)\
        .repeat(dim2 * dim3)\
        .reshape((dim1 * dim2 * dim3, 1))\
        .repeat(dim0, axis=1)\
        .T\
        .flatten()
    indices_dim2 = T.arange(dim2)\
        .repeat(dim3)\
        .reshape((dim2 * dim3, 1))\
        .repeat(dim0 * dim1, axis=1)\
        .T\
        .flatten()

    # output
    self.output = self.input[
        indices_dim0,
        indices_dim1,
        indices_dim2,
        sorted_ind.flatten()
    ].reshape(sorted_ind.shape).dimshuffle(0, 1, 3, 2)
    return self.output
def k_max_pool(self, x, k):
    """
    perform k-max pooling on the input along the rows

    input: theano.tensor.tensor4
    k: theano.tensor.iscalar
        the k parameter

    Returns: 4D tensor
    """
    x = T.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2] * x.shape[3]))
    ind = T.argsort(x, axis=3)
    sorted_ind = T.sort(ind[:, :, :, -k:], axis=3)
    dim0, dim1, dim2, dim3 = sorted_ind.shape

    indices_dim0 = T.arange(dim0).repeat(dim1 * dim2 * dim3)
    indices_dim1 = (
        T.arange(dim1).repeat(dim2 * dim3).reshape((dim1 * dim2 * dim3, 1)).repeat(dim0, axis=1).T.flatten()
    )
    indices_dim2 = T.arange(dim2).repeat(dim3).reshape((dim2 * dim3, 1)).repeat(dim0 * dim1, axis=1).T.flatten()

    result = x[indices_dim0, indices_dim1, indices_dim2, sorted_ind.flatten()].reshape(sorted_ind.shape)
    shape = (result.shape[0], result.shape[1], result.shape[2] * result.shape[3], 1)
    result = T.reshape(result, shape)
    return result
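# Hypothetical usage sketch for k_max_pool above ("layer" stands for an instance of
# the class that defines it; the class itself is not shown here). It keeps the k
# largest values of each feature map, in their original order along the flattened
# spatial axis.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor4('x')
k = T.iscalar('k')
pooled = layer.k_max_pool(x, k)
pool_fn = theano.function([x, k], pooled)

batch = np.random.randn(2, 1, 2, 3).astype(theano.config.floatX)
print pool_fn(batch, 2).shape   # (2, 1, 2, 1): two values kept per feature map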
def _step(x, k, max_seq_len):
    # keep the k largest rows (in their original order) and pad back to max_seq_len with zeros
    tmp = x[
        T.arange(x.shape[0])[:, np.newaxis, np.newaxis],
        T.sort(T.argsort(x, axis=1)[:, -k:, :], axis=1),
        T.arange(x.shape[2])[np.newaxis, np.newaxis, :],
    ]
    return T.concatenate([tmp, T.zeros([x.shape[0], max_seq_len - k, x.shape[2]])], axis=1)
def __call__(self, X):
    ind = T.argsort(X, axis=3)
    sorted_ind = T.sort(ind[:, :, :, -self.poolsize:], axis=3)
    dim0, dim1, dim2, dim3 = sorted_ind.shape

    indices_dim0 = T.arange(dim0).repeat(dim1 * dim2 * dim3)
    indices_dim1 = T.arange(dim1).repeat(dim2 * dim3).reshape((dim1 * dim2 * dim3, 1)).repeat(dim0, axis=1).T.flatten()
    indices_dim2 = T.arange(dim2).repeat(dim3).reshape((dim2 * dim3, 1)).repeat(dim0 * dim1, axis=1).T.flatten()

    return X[indices_dim0, indices_dim1, indices_dim2, sorted_ind.flatten()].reshape(sorted_ind.shape)
def argtop_k(x, k=1):
    # top-k accuracy
    top = T.argsort(x, axis=-1)
    # (Theano cannot index with [..., -top_k:], we need to simulate that)
    top = top[[slice(None) for _ in range(top.ndim - 1)] + [slice(-k, None)]]
    top = top[(slice(None),) * (top.ndim - 1) + (slice(None, None, -1),)]
    return top
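# Hedged usage sketch: top-3 accuracy from argtop_k on a batch of class scores.
# The variable names below are illustrative only.
import numpy as np
import theano
import theano.tensor as T

scores = T.matrix('scores')
labels = T.ivector('labels')
top3 = argtop_k(scores, k=3)                               # (batch, 3), best class first
hit = T.any(T.eq(top3, labels.dimshuffle(0, 'x')), axis=-1)
top3_accuracy = theano.function([scores, labels], T.mean(T.cast(hit, 'float32')))

s = np.array([[0.1, 0.5, 0.2, 0.2],
              [0.7, 0.1, 0.1, 0.1]], dtype='float32')
print top3_accuracy(s, np.array([2, 1], dtype='int32'))    # 0.5: first row hits, second misses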
def _FindB_best(lPLcl, lPprev, dVLcl):
    srtLcl = tensor.argsort(-lPLcl)
    srtLcl = srtLcl[:beam_size]
    deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
    deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
    lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                       lPLcl[srtLcl] + lPprev, deltaVec)
    xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                       srtLcl, tensor.zeros_like(srtLcl))
    return lProbBest, xWIdxBest
def fix_k_max(self, k, masked_data):
    # @ref: https://github.com/fchollet/keras/issues/373
    result = masked_data[
        T.arange(masked_data.shape[0]).dimshuffle(0, "x", "x"),
        T.sort(T.argsort(masked_data, axis=1)[:, -k:, :], axis=1),
        T.arange(masked_data.shape[2]).dimshuffle("x", "x", 0)
    ]
    return result
def _k_max_pooling(input, kmax):
    pool = input.dimshuffle(0, 2, 1, 3).flatten(ndim=3).dimshuffle(1, 0, 2).flatten(ndim=2).dimshuffle(1, 0)
    neighborsArgSorted = T.argsort(pool, axis=1)
    yy = T.sort(neighborsArgSorted[:, -kmax:], axis=1).flatten()
    xx = T.repeat(T.arange(neighborsArgSorted.shape[0]), kmax)
    pool_kmax = pool[xx, yy]
    pool_kmax_shape = T.join(0, T.as_tensor([input.shape[0], input.shape[1], input.shape[3], kmax]))
    pooled_out = pool_kmax.reshape(pool_kmax_shape, ndim=4).dimshuffle(0, 1, 3, 2)
    return pooled_out
def inv_fprop(self, output_):
    shape = output_.shape[1]
    index = self.dim
    state_below = output_
    state_below = state_below[:, T.argsort(self.permutation)]
    coupling_out = -self.function(state_below[:, :index])
    state_below = T.inc_subtensor(state_below[:, index:], coupling_out)
    return state_below
def arg_sort():
    a, b, c = 2, 4, 4
    input = np.arange(a * b * c).reshape([a, b, c]).astype('float32')
    print input
    print

    x = T.tensor3()
    z = T.argsort(x, axis=2)[:, :, :2].astype('int64')
    z = x[z[0].flatten()]
    # z = x[T.arange(x.shape[0], dtype='int32'), T.arange(x.shape[1], dtype='int32'), z]
    f = theano.function(inputs=[x], outputs=z)
    print f(input)
def kmaxPool(self, conv_out, pool_shape, k):
    '''
    Perform k-max Pooling.
    '''
    n0, n1, d, size = pool_shape
    imgs = images2neibs(conv_out, T.as_tensor_variable((1, size)))

    indices = T.argsort(T.mul(imgs, -1))
    k_max_indices = T.sort(indices[:, :k])

    S = T.arange(d * n1 * n0).reshape((d * n1 * n0, 1))
    return imgs[S, k_max_indices].reshape((n0, n1, d, k))
def l2C(curr_word, i, curr_senses, context_vector):
    # theano vector of size (num_senses,)
    scores_all_senses = T.dot(context_vector, W_s[curr_word].T)
    sorted_senses = T.argsort(scores_all_senses)
    score_best = scores_all_senses[sorted_senses[-1]]
    score_second_best = scores_all_senses[sorted_senses[-2]]

    prev_sense = curr_senses[i]
    # only update the context vector if the best sense beats the runner-up by epsilon
    context_vector = T.switch(T.gt(score_best - score_second_best, epsilon),
                              change_context_vec(context_vector, sorted_senses[-1],
                                                 prev_sense, curr_word),
                              context_vector)
    new_senses = T.set_subtensor(curr_senses[i], sorted_senses[-1])
    return [new_senses, context_vector]
def get_hard_examples(self, _, x, y, batch_size, transformed_x=identity):
    '''
    Returns the set of training cases with above-average reconstruction error.
    :param _:
    :param x:
    :param y:
    :param batch_size:
    :return:
    '''
    # sort the cases by cost and keep the top half (above-average error)
    indexes = T.argsort(self.cost_vector)[(self.cost_vector.shape[0] // 2):]
    return self.make_func(x=x, y=y, batch_size=batch_size,
                          output=[self._x[indexes], self._y[indexes]],
                          update=None, transformed_x=transformed_x)
def __init__(self, dnodex,inputdim,dim): X=T.ivector() Y=T.ivector() Z=T.lscalar() NP=T.ivector() lambd = T.scalar() eta = T.scalar() temperature=T.scalar() num_input = inputdim self.umatrix=theano.shared(floatX(np.random.rand(dnodex.nuser,inputdim, inputdim))) self.pmatrix=theano.shared(floatX(np.random.rand(dnodex.npoi,inputdim))) self.p_l2_norm=(self.pmatrix**2).sum() self.u_l2_norm=(self.umatrix**2).sum() num_hidden = dim num_output = inputdim inputs = InputPLayer(self.pmatrix[X,:], self.umatrix[Z,:,:], name="inputs") lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1") #lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2") #lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3") softmax = SoftmaxPLayer(num_hidden, num_output, self.umatrix[Z,:,:], input_layer=lstm1, name="yhat", temperature=temperature) Y_hat = softmax.output() self.layers = inputs, lstm1,softmax params = get_params(self.layers) #caches = make_caches(params) tmp_u=T.mean(T.dot(self.pmatrix[X,:],self.umatrix[Z,:,:]),axis=0) tr=T.dot(tmp_u,(self.pmatrix[X,:]-self.pmatrix[NP,:]).transpose()) pfp_loss1=sigmoid(tr) pfp_loss=pfp_loss1*(T.ones_like(pfp_loss1)-pfp_loss1) tmp_u1=T.reshape(T.repeat(tmp_u,X.shape[0]),(inputdim,X.shape[0])).T pfp_lossv=T.reshape(T.repeat(pfp_loss,inputdim),(inputdim,X.shape[0])).T cost = lambd*10*T.mean(T.nnet.categorical_crossentropy(Y_hat, T.dot(self.pmatrix[Y,:],self.umatrix[Z,:,:])))+lambd*self.p_l2_norm+lambd*self.u_l2_norm # updates = PerSGD(cost,params,eta,X,Z,dnodex)#momentum(cost, params, caches, eta) updates = [] grads = T.grad(cost=cost, wrt=params) updates.append([self.pmatrix,T.set_subtensor(self.pmatrix[X,:],self.pmatrix[X,:]-eta*grads[0])]) updates.append([self.umatrix,T.set_subtensor(self.umatrix[Z,:,:],self.umatrix[Z,:,:]-eta*grads[1])]) for p,g in zip(params[2:], grads[2:]): updates.append([p, p - eta * g]) rlist=T.argsort(T.dot(tmp_u,self.pmatrix.T))[::-1] n_updates=[(self.pmatrix, T.set_subtensor(self.pmatrix[NP,:],self.pmatrix[NP,:]-eta*pfp_lossv*tmp_u1-eta*lambd*self.pmatrix[NP,:]))] p_updates=[(self.pmatrix, T.set_subtensor(self.pmatrix[X,:],self.pmatrix[X,:]+eta*pfp_lossv*tmp_u1-eta*lambd*self.pmatrix[X,:])),(self.umatrix, T.set_subtensor(self.umatrix[Z,:,:],self.umatrix[Z,:,:]+eta*T.mean(pfp_loss)*(T.reshape(tmp_u,(tmp_u.shape[0],1))*T.mean(self.pmatrix[X,:]-self.pmatrix[NP,:],axis=0)))-eta*lambd*self.umatrix[Z,:,:])] self.train = theano.function([X,Y,Z, eta, lambd, temperature], cost, updates=updates, allow_input_downcast=True) self.trainpos=theano.function([X,NP,Z,eta, lambd],tmp_u, updates=p_updates,allow_input_downcast=True) self.trainneg=theano.function([X,NP,Z,eta, lambd],T.mean(pfp_loss), updates=n_updates,allow_input_downcast=True) self.predict_pfp = theano.function([X,Z], rlist, allow_input_downcast=True)
def __init__(self, conv_out, k=1):
    """
    Allocate a LeNetConvPoolLayer with shared variable internal parameters.

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.dtensor4
    :param input: symbolic image tensor, of shape image_shape

    :type filter_shape: tuple or list of length 4
    :param filter_shape: (number of filters, num input feature maps,
                          filter height, filter width)

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)

    :type poolsize: tuple or list of length 2
    :param poolsize: the downsampling (pooling) factor (#rows, #cols)
    """
    # images2neibs produces a 2D matrix
    neighborsForPooling = TSN.images2neibs(ten4=conv_out,
                                           neib_shape=(conv_out.shape[2], 1),
                                           mode='ignore_borders')
    # k = poolsize[1]
    neighborsArgSorted = T.argsort(neighborsForPooling, axis=1)
    kNeighborsArg = neighborsArgSorted[:, -k:]
    kNeighborsArgSorted = T.sort(kNeighborsArg, axis=1)

    ii = T.repeat(T.arange(neighborsForPooling.shape[0]), k)
    jj = kNeighborsArgSorted.flatten()
    pooledkmaxTmp = neighborsForPooling[ii, jj]

    # reshape pooledkmaxTmp
    new_shape = T.cast(T.join(0,
                              conv_out.shape[:-2],
                              T.as_tensor([conv_out.shape[3]]),
                              T.as_tensor([k])),
                       'int32')
    pooled_out = T.reshape(pooledkmaxTmp, new_shape, ndim=4)

    # downsample each feature map individually, using maxpooling
    '''
    pooled_out = downsample.max_pool_2d(input=conv_out,
                                        ds=poolsize,
                                        ignore_border=True)
    '''
    # add the bias term. Since the bias is a vector (1D array), we first
    # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
    # thus be broadcasted across mini-batches and feature map width & height
    self.output = T.tanh(pooled_out)
def errors_top_x(self, y, num_top=5):
    if y.ndim != self.y_pred.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))

    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        y_pred_top_x = T.argsort(self.p_y_given_x, axis=1)[:, -num_top:]
        y_top_x = y.reshape((y.shape[0], 1)).repeat(num_top, axis=1)
        return T.mean(T.min(T.neq(y_pred_top_x, y_top_x), axis=1))
    else:
        raise NotImplementedError()
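# Tiny NumPy illustration (not part of the class) of the top-k error expression:
# a row counts as an error only if the true label is missing from its top num_top
# predictions. Here num_top=2 and the second example is the only mistake.
import numpy as np

p_y_given_x = np.array([[0.1, 0.6, 0.3],
                        [0.5, 0.4, 0.1]])
y = np.array([2, 2])
y_pred_top_2 = np.argsort(p_y_given_x, axis=1)[:, -2:]   # [[2, 1], [1, 0]]
errors = np.min(y_pred_top_2 != y[:, None], axis=1)      # [False, True]
print errors.mean()                                      # 0.5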
def kmaxpooling(self, input, k):
    sorted_values = T.argsort(input, axis=3)
    topmax_indexes = sorted_values[:, :, :, -k:]
    # sort indexes so that we keep the correct order within the sentence
    topmax_indexes_sorted = T.sort(topmax_indexes)

    # given that topmax only gives the index of the third dimension,
    # we need to generate the other 3 dimensions
    dim0 = T.arange(0, self.input_shape[0]).repeat(self.input_shape[1] * self.input_shape[2] * k)
    dim1 = T.arange(0, self.input_shape[1]).repeat(k * self.input_shape[2]).reshape((1, -1)).repeat(
        self.input_shape[0], axis=0).flatten()
    dim2 = T.arange(0, self.input_shape[2]).repeat(k).reshape((1, -1)).repeat(
        self.input_shape[0] * self.input_shape[1], axis=0).flatten()
    dim3 = topmax_indexes_sorted.flatten()
    return input[dim0, dim1, dim2, dim3].reshape((self.input_shape[0],
                                                  self.input_shape[1],
                                                  self.input_shape[2],
                                                  k))
def errors_top_x(self, p_y_given_x, y, num_top=5):
    if num_top != 5:
        print 'val errors from top %d' % num_top

    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        y_pred_top_x = T.argsort(p_y_given_x, axis=1)[:, -num_top:]
        y_top_x = y.reshape((y.shape[0], 1)).repeat(num_top, axis=1)
        return T.mean(T.min(T.neq(y_pred_top_x, y_top_x), axis=1))
    else:
        raise NotImplementedError()
def process(self):
    data = self.data - self.data.mean(axis=0)
    cov = T.dot(data.T, data.conj()) / (data.shape[0] - 1)
    evals, evecs = T.nlinalg.eig(cov)
    if self.components is None and \
            self.threshold is not None:
        self.components = T.gt(evals, self.threshold).sum()
    # keep the eigenvectors belonging to the largest eigenvalues
    key = T.argsort(evals)[::-1][:self.components]
    self.evals, self.evecs = evals[key], evecs[:, key]
    self.pca = T.dot(self.evecs.T, data.T).T
    return self.pca
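# Quick check of the component-selection step above (illustrative values only):
# argsort()[::-1][:n] picks the indices of the n largest eigenvalues, so the
# retained eigenvectors correspond to the leading principal components.
import numpy as np

evals = np.array([0.5, 3.2, 0.1, 1.7])
n_components = 2
key = np.argsort(evals)[::-1][:n_components]
print key           # [1 3]
print evals[key]    # [3.2 1.7]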
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    preact = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    i = tensor.nnet.sigmoid(sliceT(preact, 0, options['hidden_size']))
    f = tensor.nnet.sigmoid(sliceT(preact, 1, options['hidden_size']))
    o = tensor.nnet.sigmoid(sliceT(preact, 2, options['hidden_size']))
    c = tensor.tanh(sliceT(preact, 3, options['hidden_size']))

    c = f * c_ + i * c
    h = o * tensor.tanh(c)

    p = tensor.dot(h, tparams['Wd']) + tparams['bd']
    p = tensor.nnet.softmax(p)
    lProb = tensor.log(p + 1e-20)

    def _FindB_best(lPLcl, lPprev, dVLcl):
        srtLcl = tensor.argsort(-lPLcl)
        srtLcl = srtLcl[:beam_size]
        deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
        deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
        lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                           lPLcl[srtLcl] + lPprev, deltaVec)
        xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                           srtLcl, tensor.zeros_like(srtLcl))
        return lProbBest, xWIdxBest

    rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                      sequences=[lProb, lP_, dV_],
                                      name=_p(prefix, 'FindBest'),
                                      n_steps=x_.shape[0])
    xWIdxBest = rvalLcl[1]
    lProbBest = rvalLcl[0]

    xWIdxBest = xWIdxBest.flatten()
    lProb = lProbBest.flatten()

    # Now sort and find the best among these best extensions for the current beams
    srtIdx = tensor.argsort(-lProb)
    srtIdx = srtIdx[:beam_size]
    xWlogProb = lProb[srtIdx]

    xWIdx = xWIdxBest[srtIdx]
    xCandIdx = srtIdx // beam_size  # floor division

    xW = tparams['Wemb'][xWIdx.flatten()]
    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    h = h.take(xCandIdx.flatten(), axis=0)
    c = c.take(xCandIdx.flatten(), axis=0)

    return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
def test_big_and_little_train_both(rng, batch_size=1, learning_rate=0.01, n_epochs=1000, L1_reg=0.0, L2_reg=0.0001): l_learning_rate = learning_rate b_learning_rate = 10 * learning_rate index = T.lscalar('index') l_x = T.matrix('l_x', dtype=config.floatX) b_x = T.tensor3('b_x', dtype=config.floatX) y = T.ivector('y') print "Loading Data" dataset = 'mnist.pkl.gz' datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size print "Building models" print "... Building layers" # Create network structure x_size = train_set_x.shape[1].eval() n_in = x_size n_units_per = 32 n_out = 500 l_layers = [] b_layers = [] l_layers.append( HiddenLayer( n_in, n_out, batch_size, #k=0.05, k=1, activation=T.tanh, name='l_layer_' + str(len(l_layers)))) in_idxs_0 = shared(np.zeros((batch_size, 1), dtype='int64'), name='in_idxs_0') b_layers.append( HiddenBlockLayer((1, x_size), (n_out, n_units_per), in_idxs_0, l_layers[-1].top_active, batch_size, activation=T.tanh, name='b_layer_' + str(len(b_layers)))) #n_in = n_out #n_out = 100 #k_activations = 0.12 #l_layers.append( # HiddenLayer( # n_in, # n_out, # k=k_activations, # name='l_layer_' + str(len(l_layers)) # ) #) #b_layers.append(HiddenBlockLayer(n_in, n_out, batch_size)) n_in = n_out n_out = 10 l_layers.append( HiddenLayer(n_in, n_out, batch_size, k=1, activation=T.nnet.softmax, name='l_layer_' + str(len(l_layers)))) l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value()) # T.nnet.softmax takes a matrix not a tensor so just calculate the linear # component in the layer and apply the softmax later #out_idxs_n = shared( # np.repeat( # np.arange(n_out, dtype='int64').reshape(1, n_out), # batch_size, # axis=0 # ), # name='out_idxs_' + str(len(l_layers)) #) b_layers.append( HiddenBlockLayer( (n_in, n_units_per), (n_out, n_units_per), l_layers[-2].top_active, l_layers[-1].top_active, #out_idxs_n, batch_size, None, name='b_layer_' + str(len(b_layers)))) #b_layers[-1].W.set_value(0*b_layers[-1].W.get_value()) print "... Building top active updates" top_active = [] l_activation = l_x b_activation = b_x b_activations = [b_activation] for i in range(len(l_layers)): l_activation = l_layers[i].output(l_activation) b_activation = b_layers[i].output(b_activation) b_activations.append(b_activation) top_active.append((l_layers[i].top_active, T.argsort(T.abs_(l_activation))[:, :l_layers[i].k])) print "... Building costs and errors" l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y), L1_reg, L2_reg) l_error = l_layers[-1].error(l_activation, y) # T.nnet.softmax takes a matrix not a tensor so we only calculate the # linear component at the last layer and here we reshape and then # apply the softmax #b_activation = T.nnet.softmax(((b_activation*b_activation)**2).sum(axis=2)) #b_activation = relu_softmax(((b_activation*b_activation)**2).sum(axis=2)) b_activation = T.nnet.softmax(T.mean(b_activation, axis=2)) #b_activation = relu_softmax(T.mean(b_activation, axis=2)) #b_activation = T.nnet.softmax(T.max(b_activation, axis=2)) #b_activation = relu_softmax(T.max(b_activation, axis=2)) b_activations.append(b_activation) b_cost = add_regularization(b_layers, b_layers[-1].cost(b_activation, y), L1_reg, L2_reg) b_error = b_layers[-1].error(b_activation, y) print "... 
Building parameter updates" l_grads = [] l_param_updates = [] b_grads = [] b_param_updates = [] for i in range(len(l_layers)): for param in l_layers[i].params: gparam = T.grad(l_cost, param) l_grads.append(gparam) l_param_updates.append((param, param - l_learning_rate * gparam)) for param in b_layers[i].params: gparam = T.grad( b_cost, param, consider_constant=[b_layers[i].in_idxs, b_layers[i].out_idxs]) b_grads.append(gparam) b_param_updates.append((param, param - b_learning_rate * gparam)) print "... Compiling little net train function" l_train_model = function( [index], [l_cost, l_x, y], updates=top_active + l_param_updates, givens={ l_x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling big net train function" temp = train_set_x.get_value(borrow=True, return_internal_type=True) train_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])), borrow=True, name='train_set_x_b') b_train_model = function( [index], [b_cost], updates=b_param_updates, givens={ b_x: train_set_x_b[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) #theano.printing.debugprint(b_train_model) #ipdb.set_trace() # verify_layers(batch_size, b_layers, train_set_x_b, train_set_y) # temp = verify_cost( # b_cost, # b_layers, # b_x, # y, # batch_size, # train_set_x_b, # train_set_y # ) # T.verify_grad( # temp, # [b_layers[0].W.get_value(), b_layers[1].W.get_value()], # rng=rng # ) print "... Compiling little net test function" l_test_model = function( [index], l_error, givens={ l_x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling big net test function" temp = test_set_x.get_value(borrow=True, return_internal_type=True) test_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])), borrow=True, name='test_set_x_b') b_test_model = function( [index], b_error, givens={ b_x: test_set_x_b[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling little net validate function" l_validate_model = function( [index], l_error, givens={ l_x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling big net validate function" temp = valid_set_x.get_value(borrow=True, return_internal_type=True) valid_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])), borrow=True, name='valid_set_x_b') b_validate_model = function( [index], b_error, givens={ b_x: valid_set_x_b[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) print "Training" # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 100 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False accum = 0 accum_b = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = l_train_model(minibatch_index) minibatch_avg_cost_b = b_train_model(minibatch_index, learning_rate.rate) #print "minibatch_avg_cost: " + str(minibatch_avg_cost) + " minibatch_avg_cost_b: " + str(minibatch_avg_cost_b) #print l_layers[0].W.get_value().sum(), l_layers[1].W.get_value().sum(), b_layers[0].W.get_value().sum(), b_layers[1].W.get_value().sum() #print "A: ", np.max(np.abs(b_layers[0].W.get_value())), np.max(np.abs(b_layers[0].b.get_value())), np.max(np.abs(b_layers[1].W.get_value())), np.max(np.abs(b_layers[1].b.get_value())) #print "B: ", np.abs(b_layers[0].W.get_value()).sum(), np.abs(b_layers[0].b.get_value()).sum(), np.abs(b_layers[1].W.get_value()).sum(), np.abs(b_layers[1].b.get_value()).sum() #print "C: ", np.abs(np.array(minibatch_avg_cost_b[1])).sum(), np.abs(np.array(minibatch_avg_cost_b[2])).sum(), np.abs(np.array(minibatch_avg_cost_b[3])).sum(), np.abs(np.array(minibatch_avg_cost_b[4])).sum() minibatch_avg_cost = minibatch_avg_cost[0] minibatch_avg_cost_b = minibatch_avg_cost_b[0] accum = accum + minibatch_avg_cost accum_b = accum_b + minibatch_avg_cost_b # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: accum = accum / validation_frequency accum_b = accum_b / validation_frequency print "minibatch_avg_cost: ", accum, \ "minibatch_avg_cost_b: ", accum_b accum = 0 accum_b = 0 # compute zero-one loss on validation set validation_losses = [ l_validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = np.mean(validation_losses) validation_losses_b = [ b_validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss_b = np.mean(validation_losses_b) #this_validation_loss_b = 0 print( 'epoch %i, minibatch %i/%i, validation error %f %% ' '(%f %%)' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100., this_validation_loss_b * 100.)) #ipdb.set_trace() # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ l_test_model(i) for i in xrange(n_test_batches) ] test_score = np.mean(test_losses) test_losses_b = [ b_test_model(i) for i in xrange(n_test_batches) ] test_score_b = np.mean(test_losses_b) #test_score_b = 0 print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %% (%f %%)') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100., test_score_b * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def build_model(self, tparams, options, xI=None, prior_inp_list = []): trng = RandomStreams() rng = np.random.RandomState() # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) xWi = T.matrix('xW', dtype='int64') # Now input is transposed compared to the generator!! xW = xWi.T n_samples = xW.shape[0] n_words= xW.shape[1] Words = T.concatenate([tparams['Wemb'], T.alloc(numpy_floatX(0.),1,self.word_encoding_size)],axis=0) embW = Words[xW.flatten()].reshape([options['batch_size'], 1, n_words, self.word_encoding_size]) if options.get('use_dropout',0): embW = dropout_layer(embW, use_noise, trng, options['drop_prob_encoder'], shp = embW.shape) sent_emb, cnn_out , tparams = self.sent_conv_layer(tparams, options, embW, options['batch_size'], use_noise, trng) if xI == None: xI = T.matrix('xI', dtype=config.floatX) xI_is_inp = True else: xI_is_inp = False if options.get('mode','batchtrain') != 'batchtrain': posSamp = T.ivector('posSamp') if xI_is_inp: embImg = T.dot(xI, tparams['WIemb']) + tparams['b_Img'] else: embImg = xI + tparams['b_Img'] if options.get('use_dropout',0): embImg = dropout_layer(embImg, use_noise, trng, options['drop_prob_encoder'], shp = embImg.shape) #-------------------------------------------------------------------------------------------------------------# # Curr prob is computed by applying softmax over (I0,c0), (I0,c1),... (I0,cn-1) pairs # It could also be computed with (I0,c0), (I1,c0),... (In,c0) pairs, but will lead to different discrimination # Maybe even sum of the two could be used #-------------------------------------------------------------------------------------------------------------# probMatchImg, sim_score = multimodal_cosine_sim_softmax(embImg, sent_emb, tparams, options.get('sim_smooth_factor',1.0)) inp_list = [xWi] if xI_is_inp: inp_list.append(xI) if options.get('en_aux_inp',0): xAux = T.matrix('xAux', dtype=config.floatX) embAux = T.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux'] xAuxEmb = dropout_layer(embAux, use_noise, trng, options['drop_prob_aux'], shp = embAux.shape) inp_list.append(xAux) probMatchAux, sim_scoreAux = multimodal_cosine_sim_softmax(embAux, sent_emb, tparams, options.get('sim_smooth_factor',1.0)) else: probMatchAux = T.alloc(numpy_floatX(0.),1,1) probMatch = (probMatchImg + probMatchAux) / 2. sortedProb = T.argsort(probMatch,axis=1) batch_idces = T.arange(probMatch.shape[0]) opponents = T.switch(T.eq(sortedProb[:,-1], batch_idces), sortedProb[:,-2], sortedProb[:,-1]) violator_mask = (probMatch.diagonal() - probMatch[batch_idces,opponents]) < (options.get('cost_margin',0.02)) n_violators = violator_mask.sum() if options.get('mode','batchtrain') == 'batchtrain': cost = [-((T.log(probMatch.diagonal())* (1+2.0*violator_mask)).sum())/probMatch.shape[0]] else: cost = [-(T.log(probMatch[0,posSamp]).sum())/posSamp.shape[0]] cost.append(n_violators) cost.append((probMatch.diagonal() - probMatch[batch_idces,opponents])) f_pred_sim_prob = theano.function(prior_inp_list + inp_list, [probMatchImg, probMatchAux, probMatch, opponents], name='f_pred_sim_prob') f_pred_sim_scr = theano.function(prior_inp_list + inp_list[:2], sim_score, name='f_pred_sim_scr') f_sent_emb = theano.function(inp_list[:1], cnn_out, name='f_sent_emb') if options.get('mode','batchtrain') != 'batchtrain': inp_list.append(posSamp) return use_noise, inp_list, [f_pred_sim_prob, f_pred_sim_scr, f_sent_emb], cost, sim_score, tparams
def _stepP(*in_list): x_inp = [] h_inp = [] c_inp = [] for i in xrange(nmodels): x_inp.append(in_list[i]) h_inp.append(in_list[nmodels + i]) c_inp.append(in_list[2 * nmodels + i]) lP_ = in_list[3 * nmodels] dV_ = in_list[3 * nmodels + 1] p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size']) cf = [] h = [] xW = [] for i in xrange(nmodels): preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')]) preact += ( tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) + tparams[i][_p(prefix, 'b')]) if options[i].get('en_aux_inp', 0): preact += tensor.dot(aux_input2[i], tparams[i][_p(prefix, 'W_aux')]) inp = tensor.nnet.sigmoid( sliceT(preact, 0, options[i]['hidden_size'])) f = tensor.nnet.sigmoid( sliceT(preact, 1, options[i]['hidden_size'])) o = tensor.nnet.sigmoid( sliceT(preact, 2, options[i]['hidden_size'])) c = tensor.tanh(sliceT(preact, 3, options[i]['hidden_size'])) cf.append(f * c_inp[i] + inp * c) h.append(o * tensor.tanh(cf[i])) p = tensor.dot(h[i], tparams[i]['Wd']) + tparams[i]['bd'] if i == 0: p_comb = tparams[i]['comb_weight'] * tensor.nnet.softmax(p) else: p_comb += tparams[i]['comb_weight'] * tensor.nnet.softmax( p) lProb = tensor.log(p_comb + 1e-20) def _FindB_best(lPLcl, lPprev, dVLcl): srtLcl = tensor.argsort(-lPLcl) srtLcl = srtLcl[:beam_size] deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.)) deltaVec = tensor.set_subtensor(deltaVec[0], lPprev) lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec) xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) return lProbBest, xWIdxBest rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences=[lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_inp[0].shape[0]) xWIdxBest = rvalLcl[1] lProbBest = rvalLcl[0] xWIdxBest = xWIdxBest.flatten() lProb = lProbBest.flatten() # Now sort and find the best among these best extensions for the current beams srtIdx = tensor.argsort(-lProb) srtIdx = srtIdx[:beam_size] xWlogProb = lProb[srtIdx] xWIdx = xWIdxBest[srtIdx] xCandIdx = srtIdx // beam_size # Floor division doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx)) x_out = [] h_out = [] c_out = [] for i in xrange(nmodels): x_out.append(tparams[i]['Wemb'][xWIdx.flatten()]) h_out.append(h[i].take(xCandIdx.flatten(), axis=0)) c_out.append(cf[i].take(xCandIdx.flatten(), axis=0)) out_list = [] out_list.extend(x_out) out_list.extend(h_out) out_list.extend(c_out) out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx]) return out_list, theano.scan_module.until(doneVec.all())
def test_big_and_little_train_big(rng, batch_size, learning_rate, momentum_rate, n_epochs=1000, L1_reg=0.0, L2_reg=0.0001, restore_parameters=False, select_top_active=False, mult_small_net_params=False, zero_last_layer_params=False, train_little_net=False, train_big_net=True): def summarize_rates(): print "Learning rate: ", learning_rate.rate, \ "Momentum: ", momentum.get_value() assert (train_big_net or train_little_net) l_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX), name='learning_rate') b_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX), name='learning_rate') momentum = shared(np.array(momentum_rate.rate, dtype=config.floatX), name='momentum') index = T.lscalar('index') l_x = T.matrix('l_x', dtype=config.floatX) b_x = T.tensor3('b_x', dtype=config.floatX) y = T.ivector('y') print "Loading Data" print "... MNIST" dataset = 'mnist.pkl.gz' datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size print "Building models" print "... Building layers" # Create network structure x_size = train_set_x.shape[1].eval() y_size = train_set_y.shape[0].eval() n_in = x_size n_units_per = 1 n_out = 5000 l_layers = [] b_layers = [] l_params = None # Shared variable used for always activating one block in a layer as in the # input and output layer one_block_idxs = shared(np.zeros((batch_size, 1), dtype='int64'), name='one_block_idxs') l_layers.append( HiddenLayer(n_in, n_out, batch_size, k=0.1, activation=T.tanh, name='l_layer_' + str(len(l_layers)))) if mult_small_net_params: l_params = l_layers[-1].params b_layers.append( HiddenBlockLayer((1, x_size), (n_out, n_units_per), one_block_idxs, l_layers[-1].top_active, batch_size, activation=T.tanh, name='b_layer_' + str(len(b_layers)), l_params=l_params, l_param_map=[('x', 1, 0, 'x'), (0, 'x')])) n_in = n_out l_layers.append( HiddenLayer(n_in, n_out, batch_size, k=0.1, activation=T.tanh, name='l_layer_' + str(len(l_layers)))) if mult_small_net_params: l_params = l_layers[-1].params b_layers.append( HiddenBlockLayer( (n_in, n_units_per), (n_out, n_units_per), l_layers[-2].top_active, l_layers[-1].top_active, #out_idxs_n, batch_size, activation=T.tanh, name='b_layer_' + str(len(b_layers)), l_params=l_params, l_param_map=[(0, 1, 'x', 'x'), (0, 'x')])) n_out = 10 l_layers.append( HiddenLayer(n_in, n_out, batch_size, k=1, activation=T.nnet.softmax, name='l_layer_' + str(len(l_layers)))) if zero_last_layer_params: l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value()) l_layers[-1].b.set_value(0 * l_layers[-1].b.get_value()) if mult_small_net_params: l_params = l_layers[-1].params b_layers.append( HiddenBlockLayer((n_in, n_units_per), (1, n_out), l_layers[-2].top_active, one_block_idxs, batch_size, None, name='b_layer_' + str(len(b_layers)), l_params=l_params, l_param_map=[(0, 'x', 'x', 1), ('x', 0)])) if zero_last_layer_params: b_layers[-1].W.set_value(0 * b_layers[-1].W.get_value()) b_layers[-1].b.set_value(0 * b_layers[-1].b.get_value()) if train_little_net or select_top_active: for layer in l_layers: print "\t%s" % layer if train_big_net: for layer in b_layers: print layer if restore_parameters: print "... 
Restoring weights of little model" restore_parameters('parameters_20_20_l1_0.0001_l2_0.0001.pkl', l_layers) #for l_layer in l_layers: # for param in l_layer.params: # param.set_value(np.ones_like(param.get_value())) print "... Building top active updates" top_active = [] l_activation = l_x b_activation = b_x b_activations = [b_activation] for i in range(len(l_layers)): l_activation = l_layers[i].output(l_activation) b_activation = b_layers[i].output(b_activation) b_activations.append(b_activation) top_active.append((l_layers[i].top_active, T.argsort(T.abs_(l_activation))[:, :l_layers[i].k])) print "... Building costs and errors" l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y), L1_reg, L2_reg) l_error = l_layers[-1].error(l_activation, y) # T.nnet.softmax takes a matrix not a tensor so we only calculate the # linear component at the last layer and here we reshape and then # apply the softmax #b_activation = T.nnet.softmax(((b_activation*b_activation)**2).sum(axis=2)) #b_activation = relu_softmax(((b_activation*b_activation)**2).sum(axis=2)) #b_activation = T.nnet.softmax(T.mean(b_activation, axis=2)) #b_activation = relu_softmax(T.mean(b_activation, axis=2)) #b_activation = T.nnet.softmax(T.max(b_activation, axis=2)) #b_activation = relu_softmax(T.max(b_activation, axis=2)) b_shp = b_activation.shape #b_activation = relu_softmax(b_activation.reshape((b_shp[0], b_shp[2]))) b_activation = T.nnet.softmax(b_activation.reshape((b_shp[0], b_shp[2]))) b_activations.append(b_activation) b_cost = add_regularization(b_layers, b_layers[-1].cost(b_activation, y), L1_reg, L2_reg) b_error = b_layers[-1].error(b_activation, y) print "... Building parameter updates" l_grads = [] l_param_updates = [] b_grads = [] b_param_updates = [] for i in range(len(l_layers)): for param in l_layers[i].params: gparam = T.grad(l_cost, param) l_grads.append(gparam) l_param_updates.append((param, param - l_learning_rate * gparam)) for param in b_layers[i].params: b_gparam = T.grad( b_cost, param, #consider_constant=[b_layers[i].in_idxs, b_layers[i].out_idxs] ) b_velocity = shared( np.zeros_like(param.get_value(), dtype=theano.config.floatX), param.name + '_velocity') b_param_updates.append( (b_velocity, momentum * b_velocity - b_learning_rate * b_gparam)) b_grads.append(b_gparam) b_param_updates.append((param, param + b_velocity)) #if b_layers[i].l_params is not None: #for param in b_layers[i].l_params: #l_gparam = T.grad( # b_cost, # param #) #l_velocity = shared( # np.zeros_like(param.get_value()), # param.name + '_velocity' #) #b_param_updates.append(( # l_velocity, momentum*l_velocity - b_learning_rate*l_gparam #)) #l_grads.append(l_gparam) #b_param_updates.append((param, param + l_velocity)) #b_param_updates.append(( # param, param - 0.0001*l_gparam #)) print "... Compiling little net train function" l_updates = [] if select_top_active: l_updates = l_updates + top_active if train_little_net: l_updates = l_updates + l_param_updates l_train_model = function( [index], [l_cost, l_x, y], updates=l_updates, givens={ l_x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) print "... 
Compiling big net train function" temp = train_set_x.get_value(borrow=True, return_internal_type=True) train_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])), borrow=True, name='train_set_x_b') b_updates = [] if train_big_net: b_updates = b_updates + b_param_updates b_train_model = function( [index], [b_cost], updates=b_updates, givens={ b_x: train_set_x_b[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) #theano.printing.debugprint(b_train_model) #ipdb.set_trace() # verify_layers(batch_size, b_layers, train_set_x_b, train_set_y) # temp = verify_cost( # b_cost, # b_layers, # b_x, # y, # batch_size, # train_set_x_b, # train_set_y # ) # T.verify_grad( # temp, # [b_layers[0].W.get_value(), b_layers[1].W.get_value()], # rng=rng # ) print "... Compiling little net test function" l_test_model = function( [index], l_error, givens={ l_x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling big net test function" temp = test_set_x.get_value(borrow=True, return_internal_type=True) test_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])), borrow=True, name='test_set_x_b') b_test_model = function( [index], b_error, givens={ b_x: test_set_x_b[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling little net validate function" l_validate_model = function( [index], l_error, givens={ l_x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) print "... Compiling big net validate function" temp = valid_set_x.get_value(borrow=True, return_internal_type=True) valid_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])), borrow=True, name='valid_set_x_b') b_validate_model = function( [index], b_error, givens={ b_x: valid_set_x_b[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) print "Training" # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 10 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None this_validation_loss = 0 this_validation_loss_l = 0 this_validation_loss_b = 0 best_validation_loss = np.inf best_validation_loss_l = best_validation_loss best_validation_loss_b = best_validation_loss best_iter = 0 test_score = 0. test_score_l = 0. test_score_b = 0. 
accum_l = 0 accum_b = 0 epoch = 0 train_time_accum_l = 0 train_time_accum_b = 0 done_looping = False timers = ['train', 'valid', 'train'] ts = TS(['epoch', 'valid']) ts_l = TS(timers) ts_b = TS(timers) summarize_rates() ts.start() while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 ts.start('epoch') for minibatch_index in xrange(n_train_batches): if train_little_net or select_top_active: ts_l.start('train') minibatch_avg_cost_l = l_train_model(minibatch_index) ts_l.end('train') minibatch_avg_cost_l = minibatch_avg_cost_l[0] if np.isnan(minibatch_avg_cost_l): print "minibatch_avg_cost_l: %f" % minibatch_avg_cost_l ipdb.set_trace() accum_l = accum_l + minibatch_avg_cost_l if train_big_net: ts_b.start('train') minibatch_avg_cost_b = b_train_model(minibatch_index) ts_b.end('train') minibatch_avg_cost_b = minibatch_avg_cost_b[0] accum_b = accum_b + minibatch_avg_cost_b #print "minibatch_avg_cost: " + str(minibatch_avg_cost) + " minibatch_avg_cost_b: " + str(minibatch_avg_cost_b) #print l_layers[0].W.get_value().sum(), l_layers[1].W.get_value().sum(), b_layers[0].W.get_value().sum(), b_layers[1].W.get_value().sum() #print "A: ", np.max(np.abs(b_layers[0].W.get_value())), np.max(np.abs(b_layers[0].b.get_value())), np.max(np.abs(b_layers[1].W.get_value())), np.max(np.abs(b_layers[1].b.get_value())) #print "B: ", np.abs(b_layers[0].W.get_value()).sum(), np.abs(b_layers[0].b.get_value()).sum(), np.abs(b_layers[1].W.get_value()).sum(), np.abs(b_layers[1].b.get_value()).sum() #print "C: ", np.abs(np.array(minibatch_avg_cost_b[1])).sum(), np.abs(np.array(minibatch_avg_cost_b[2])).sum(), np.abs(np.array(minibatch_avg_cost_b[3])).sum(), np.abs(np.array(minibatch_avg_cost_b[4])).sum() # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: ts.end('epoch') ts.reset('epoch') l_summary = "" if train_little_net or select_top_active: ts_l.reset('train') accum_l = accum_l / validation_frequency l_summary = ("minibatch_avg_cost_l: %f, time: %f" % (accum_l, ts_l.accumed['train'][-1][1])) accum_l = 0 train_time_accum_l = 0 b_summary = "" if train_big_net: ts_b.reset('train') accum_b = accum_b / validation_frequency b_summary = ("minibatch_avg_cost_b: %f, time: %f" % (accum_b, ts_b.accumed['train'][-1][1])) accum_b = 0 print "%s %s" % (l_summary, b_summary) # compute zero-one loss on validation set summary = ('epoch %i, minibatch %i/%i' % (epoch, minibatch_index + 1, n_train_batches)) l_summary = "" if train_little_net or select_top_active: validation_losses_l = [ l_validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss_l = np.mean(validation_losses_l) l_summary = ('little validation error %f %% ' % (this_validation_loss_l * 100.)) b_summary = "" if train_big_net: validation_losses_b = [ b_validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss_b = np.mean(validation_losses_b) #this_validation_loss_b = 0 b_summary = ('big validation error %f %% ' % (this_validation_loss_b * 100.)) print("%s %s %s" % (summary, l_summary, b_summary)) #ipdb.set_trace() # if we got the best validation score until now if train_big_net: this_validation_loss = this_validation_loss_b elif train_little_net: this_validation_loss = this_validation_loss_l if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss_l = this_validation_loss_l 
best_validation_loss_b = this_validation_loss_b if train_big_net: best_validation_loss = best_validation_loss_b elif train_little_net: best_validation_loss = best_validation_loss_l best_iter = iter # test it on the test set l_summary = "" if train_little_net: test_losses_l = [ l_test_model(i) for i in xrange(n_test_batches) ] test_score_l = np.mean(test_losses_l) l_summary = 'little: %f' % (test_score_l * 100.) b_summary = "" if train_big_net: test_losses_b = [ b_test_model(i) for i in xrange(n_test_batches) ] test_score_b = np.mean(test_losses_b) #test_score_b = 0 b_summary = 'big: %f' % (test_score_b * 100.) print( ' epoch %i, minibatch %i/%i,' ' test error of best model %s %s' % (epoch, minibatch_index + 1, n_train_batches, l_summary, b_summary)) learning_rate.update() if train_little_net: l_learning_rate.set_value(learning_rate.rate) if train_big_net: b_learning_rate.set_value(learning_rate.rate) momentum_rate.update() momentum.set_value(momentum_rate.rate) summarize_rates() if patience <= iter: done_looping = True break ts.end() print( 'Optimization complete. Best validation score of %f %% (%f %%) ' 'obtained at iteration %i, with test performance %f %% (%f %%)' % (best_validation_loss_l * 100., best_validation_loss_b * 100., best_iter + 1, test_score_l * 100., test_score_b * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %s' % ts)
def get_scores_all(mdl, fm1, fm2, X1, X2, X_new1, X_new2, num_select=10): X11 = [] X21 = [] # x = X[i], we add new values to the end of x for j in range(len(X_new1)): for k in range(len(X_new1[j])): xx = [xxx for xxx in X1[j]] xx.append(X_new1[j][k]) X11.append(xx) for j in range(len(X_new2)): for k in range(len(X_new2[j])): xx = [xxx for xxx in X2[j]] xx.append(X_new2[j][k]) X21.append(xx) print(X11) print(X21) X1 = [[fm1.f.map[fm1.f.getFeatureValue(x)] + 1 for x in XX] for XX in X11] X2 = [[fm2.f.map[fm2.f.getFeatureValue(x)] + 1 for x in XX] for XX in X21] x1, x_mask1 = preprare_seq_seq_data(X1) x1, _, mask_x1, _, _, _, _, _ = mdl.standardize_data( x1, None, x_mask1, None, None, None, None, None) x2, x_mask2 = preprare_seq_seq_data(X2) x2, _, mask_x2, _, _, _, _, _ = mdl.standardize_data( x2, None, x_mask2, None, None, None, None, None) score_pos = mdl.get_output_layer(-1, x1, x2, mask_x1) score_pos = score_pos.swapaxes(0, 1) score_pos = score_pos[:, -1] x = T.matrix("score") sort_f = th.function([x], T.argsort(x)) sorted_values = sort_f(score_pos) sorted_values = sorted_values print(sorted_values) rs1 = [] rs2 = [] rs_scores = [] my_scores = [] for i in range(sorted_values.shape[0]): #f.write(to_string(X1[i]) + " ") ss = [] for j in range(1, sorted_values.shape[1]): val = sorted_values[i][sorted_values.shape[1] - j] #val_map = fm.fY.map_inversed[val-1] score = score_pos[i][val] #f.write(str(val) + ":" + str(score) + " ") ss.append((val, score)) #f.write("\n") my_scores.append(("_", ss)) vals = [] c = 0 for t in range(sorted_values.shape[1] - 1, -1, -1): if c == num_select: break v = sorted_values[i][t] if fm1.fY.map_inversed[v - 1] != "EOS": vals.append(v) c += 1 #vals = sorted_values[i][sorted_values.shape[1]-num_select:sorted_values.shape[1]] vals1 = [] vals2 = [] #val_maps = [fm1.fY.map_inversed[v-1].split("_") for v in list(vals) ]#if fm.fY.map_inversed[v-1]!="EOS" ] scores = [score_pos[i][v] for v in list(vals)] # if fm.fY.map_inversed[v-1]!="EOS"] for v in list(vals): tm = fm1.fY.map_inversed[v - 1].split("_") vals1.append(tm[0]) vals2.append(tm[1]) rs1.append(vals1) rs2.append(vals2) rs_scores.append(scores) return (rs1, rs2), rs_scores, X11, X21, my_scores
""" k-max pooling example. """ import numpy as np import theano from theano import tensor as T from theano.sandbox import neighbours k = 3 # instantiate 4D tensor for input input = T.tensor4(name='input') neighborsForPooling = neighbours.images2neibs(input, (1, 5), mode='valid') neighborsArgSorted = T.argsort(neighborsForPooling, axis=1) kNeighborsArg = neighborsArgSorted[:, -k:] kNeighborsArgSorted = T.sort(kNeighborsArg, axis=1) ii = T.repeat(T.arange(neighborsForPooling.shape[0]), k) jj = kNeighborsArgSorted.flatten() k_pooled_2D = neighborsForPooling[ii, jj].reshape((3, k)) k_pooled = neighbours.neibs2images(k_pooled_2D, (1, 3), (1, 3, 1, 3)) k_max = theano.function([input], k_pooled) input = np.array([[2, 4, 1, 6, 8], [12, 3, 5, 7, 1], [-8, 6, -12, 4, 1]], dtype=np.float32) input = input.reshape(1, 3, 1, 5) print "input shape: ", input.shape print "input: ", input output = k_max(input) print "output shape: ", output.shape
def __init__(self, n_in, layers, hidden_dropout=0.5, max_col_norm=1.7236, rho=0.96, rmsprop=False, center_grads=False, use_nesterov=False, mean_pooling=False, normalize_acts=False, layer_dropout=True, no_final_dropout=False, loss_based_pooling=False, topN_pooling=1, adadelta=False, response_normalize=True, enable_standardization=False, l2=None, seed=1985, **kwargs): x = T.matrix('x', dtype=theano.config.floatX) y = T.lvector('y') type_map = { 'L': LogisticLayer, 'R': RectifierLayer, 'S': SoftmaxLayer, 'Sp': SoftplusLayer, 'T': TanhLayer, 'Li': LinearLayer, 'Sq': SquaredLayer, } #EPS = 1e-18 self.max_col_norm = max_col_norm self.rng = RandomStreams(seed) alpha = 0.02 beta = 0.75 k = 1.5 self.layers = [] constants = [] n_layers = len(layers) # Create hidden layers for i, layer in enumerate(layers): layer_type = layer[0] layer_size = layer[1] if i == 0: layer_input = x layer_n_in = n_in #elif i == n_layers - 1: # layer_input = self.layers[-1].output # layer_n_in = self.layers[-1].n_in else: layer_input = self.layers[-1].output layer_n_in = self.layers[-1].n_out if i == n_layers - 1: if normalize_acts: layer_input = layer_input / T.sqrt( T.sum(layer_input**2, axis=1, keepdims=True) + EPS) elif response_normalize: layer_input = (layer_input - T.min( layer_input, axis=1, keepdims=True)) / T.maximum( T.max(layer_input, axis=1, keepdims=True) - T.min(layer_input, axis=1, keepdims=True), EPS) """ layer_input = layer_input / (k + alpha * T.sum(layer_input**2, axis=1, keepdims=True))**beta """ if enable_standardization: from utils import stddev_bias std_val = stddev_bias(layer_input, EPS) mu = T.mean(layer_input, axis=0) z_val = (layer_input - mu) / std_val layer_input = z_val if loss_based_pooling: pass elif topN_pooling == 1: print "Using topN_pooling for training" max1_indx = T.argmax(layer_input, axis=0) layer_input1 = T.max(layer_input, axis=0) t1 = T.arange(layer_input.shape[1]) masked_in = layer_input * T.neq(layer_input, layer_input[max1_indx, t1]) layer_input2 = T.max(masked_in, axis=0) layer_input = (1.4 * layer_input1 + 0.6 * layer_input2) / 2 elif mean_pooling: layer_input = T.mean(layer_input, axis=0) else: layer_input = T.max(layer_input, axis=0) if layer_dropout and not no_final_dropout: layer_input = layer_input * self.rng.binomial( n=1, p=0.6, dtype=theano.config.floatX) / 0.6 elif not no_final_dropout: assert hidden_dropout != 1. layer_input = layer_input * self.rng.binomial( n=1, p=1 - hidden_dropout, dtype=theano.config.floatX, size=layer_input.shape) / 1 - hidden_dropout if hidden_dropout != 1. 
and i != n_layers - 1: layer_input = layer_input * self.rng.binomial( n=1, p=1 - hidden_dropout, dtype=theano.config.floatX, size=layer_input.shape) / 1 - hidden_dropout xargs = {} if layer_type == 'R' and layer == layers[-1]: xargs['mask'] = False layer = type_map[layer_type](layer_input, layer_n_in, layer_size, seed=seed, rng=self.rng, **xargs) self.layers.append(layer) self.clean_layers = [] for i, layer in enumerate(layers): layer_type = layer[0] layer_size = layer[1] if i == 0: layer_input = x layer_n_in = n_in else: layer_input = self.clean_layers[-1].output layer_n_in = self.clean_layers[-1].n_out if i == n_layers - 1: if normalize_acts: layer_input = layer_input / T.sqrt( T.sum(layer_input**2, axis=1, keepdims=True) + EPS) elif response_normalize: layer_input = (layer_input - T.min( layer_input, axis=1, keepdims=True)) / T.maximum( T.max(layer_input, axis=1, keepdims=True) - T.min(layer_input, axis=1, keepdims=True), EPS) #layer_input = T.nnet.sigmoid(layer_input) """ layer_input = layer_input / (k + alpha * T.sum(layer_input**2, axis=1, keepdims=True))**beta """ if enable_standardization: from utils import stddev_bias std_val = stddev_bias(layer_input, EPS) mu = T.mean(layer_input, axis=0) z_val = (layer_input - mu) / std_val #T.maximum(std_val, EPS) layer_input = z_val feature_out = layer_input #Perform the temporal max-pooling: if topN_pooling == 1: print "Using topN_pooling for testing." collapsed_val = T.sum(layer_input, axis=0) top_ids = T.argsort(layer_input, axis=0)[-3:][::-1] top_vals = layer_input[top_ids, T.arange(layer_input.shape[1])] #top_mean = (1.2 * top_vals[0] + 1.0 * top_vals[1] + 0.8 * top_vals[2]) / 3 top_mean = (1.4 * top_vals[0] + 0.6 * top_vals[1]) / 2 layer_input = top_mean elif mean_pooling: layer_input = T.mean(layer_input, axis=0) else: layer_input = T.max(layer_input, axis=0) xargs = {} pooled_output_features = layer_input if layer_type == 'R' and layer == layers[-1]: xargs['mask'] = False layer = type_map[layer_type](layer_input, layer_n_in, layer_size, seed=seed, W=self.layers[i].W, b=self.layers[i].b, **xargs) self.clean_layers.append(layer) self._output = theano.function([x], T.argmax(self.clean_layers[-1].output, axis=1)) self._feature_output = theano.function([x], feature_out) self._pooled_output_features = theano.function([x], pooled_output_features) self.transform = theano.function([x], T.mean(self.clean_layers[-2].output, axis=0)) loss = -T.mean(T.log(self.layers[-1].output)[T.arange(y.shape[0]), y]) pooling_loss = -T.log(self.layers[-1].output)[T.arange(y.shape[0]), y] if l2 != None: loss += l2 * sum([(l.W**2).sum(dtype=theano.config.floatX) for l in self.layers]) self.trainer = NeuralNetworkTrainer( [x, y], loss, self.layers, self.max_col_norm, rmsprop=rmsprop, rho=rho, center_grads=center_grads, use_nesterov=use_nesterov, loss_based_pooling=loss_based_pooling, adadelta=adadelta, pooling_loss=pooling_loss, constants=constants, rng=self.rng, **kwargs)
def theano_compiler(model): """Take a triflow model and return optimized theano routines. Parameters ---------- model: triflow.Model: Model to compile Returns ------- (theano function, theano_function): Optimized routine that compute the evolution equations and their jacobian matrix. """ from theano import tensor as T from theano.ifelse import ifelse import theano.sparse as ths from theano import function def th_Min(a, b): if isinstance(a, T.TensorVariable) or isinstance(b, T.TensorVariable): return T.where(a < b, a, b) return min(a, b) def th_Max(a, b): if isinstance(a, T.TensorVariable) or isinstance(b, T.TensorVariable): return T.where(a < b, b, a) return max(a, b) def th_Heaviside(a): if isinstance(a, T.TensorVariable): return T.where(a < 0, 1, 1) return 0 if a < 0 else 1 mapargs = { arg: T.vector(arg) for arg, sarg in zip(model._args, model._symbolic_args) } to_feed = mapargs.copy() x_th = mapargs['x'] N = x_th.size L = x_th[-1] - x_th[0] dx = L / (N - 1) to_feed['dx'] = dx periodic = T.scalar("periodic", dtype="int32") middle_point = int((model._window_range - 1) / 2) th_args = [ mapargs[key] for key in [ *model._indep_vars, *model._dep_vars, *model._help_funcs, *model._pars ] ] + [periodic] map_extended = {} for (varname, discretisation_tree) in \ model._symb_vars_with_spatial_diff_order.items(): pad_left, pad_right = model._bounds th_arg = mapargs[varname] per_extended_var = T.concatenate( [th_arg[pad_left:], th_arg, th_arg[:pad_right]]) edge_extended_var = T.concatenate([[th_arg[0]] * middle_point, th_arg, [th_arg[-1]] * middle_point]) extended_var = ifelse(periodic, per_extended_var, edge_extended_var) map_extended[varname] = extended_var for order in range(pad_left, pad_right + 1): if order != 0: var = ("{}_{}{}").format(varname, 'm' if order < 0 else 'p', np.abs(order)) else: var = varname new_var = extended_var[order - pad_left:extended_var.size + order - pad_right] to_feed[var] = new_var F = lambdify( (model._symbolic_args), expr=model.F_array.tolist(), modules=[T, { "Max": th_Max, "Min": th_Min, "Heaviside": th_Heaviside }])(*[to_feed[key] for key in model._args]) F = T.concatenate(F, axis=0).reshape((model._nvar, N)).T F = T.stack(F).flatten() J = lambdify( (model._symbolic_args), expr=model.J_array.tolist(), modules=[T, { "Max": th_Max, "Min": th_Min, "Heaviside": th_Heaviside }])(*[to_feed[key] for key in model._args]) J = [j if j != 0 else T.constant(0.) 
for j in J] J = [j if not isinstance(j, (int, float)) else T.constant(j) for j in J] J = T.stack([T.repeat(j, N) if j.ndim == 0 else j for j in J]) J = J[model._sparse_indices[0]].T.squeeze() i = T.arange(N).dimshuffle([0, 'x']) idx = T.arange(N * model._nvar).reshape((N, model._nvar)).T edge_extended_idx = T.concatenate([ T.repeat(idx[:, :1], middle_point, axis=1), idx, T.repeat(idx[:, -1:], middle_point, axis=1) ], axis=1).T.flatten() per_extended_idx = T.concatenate( [idx[:, -middle_point:], idx, idx[:, :middle_point]], axis=1).T.flatten() extended_idx = ifelse(periodic, per_extended_idx, edge_extended_idx) rows = T.tile(T.arange(model._nvar), model._window_range * model._nvar) + i * model._nvar cols = T.repeat(T.arange(model._window_range * model._nvar), model._nvar) + i * model._nvar rows = rows[:, model._sparse_indices].reshape(J.shape).flatten() cols = extended_idx[cols][:, model._sparse_indices] \ .reshape(J.shape).flatten() permutation = T.argsort(cols) J = J.flatten()[permutation] rows = rows[permutation] cols = cols[permutation] count = T.zeros((N * model._nvar + 1, ), dtype=int) uq, cnt = T.extra_ops.Unique(False, False, True)(cols) count = T.set_subtensor(count[uq + 1], cnt) indptr = T.cumsum(count) shape = T.stack([N * model._nvar, N * model._nvar]) sparse_J = ths.CSC(J, rows, indptr, shape) F_theano_function = function(inputs=th_args, outputs=F, on_unused_input='ignore', allow_input_downcast=True) J_theano_function = function(inputs=th_args, outputs=sparse_J, on_unused_input='ignore', allow_input_downcast=True) return F_theano_function, J_theano_function
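For intuition, the argsort/Unique/cumsum steps above are a symbolic COO-to-CSC conversion; numerically the same assembly can be sanity-checked with SciPy. This is only an illustrative sketch and the argument names are placeholders.

# Illustrative sketch: the (J, rows, cols) triplets built above form a CSC Jacobian;
# SciPy performs the same column sort and indptr construction internally.
import scipy.sparse as sps

def assemble_csc(values, rows, cols, n):
    return sps.coo_matrix((values, (rows, cols)), shape=(n, n)).tocsc()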
r_Wa_aht_st, r_ba_aht, \ r_Wa_atmu_aht, r_ba_atmu, \ r_Wa_atsig_aht, r_ba_atsig] ) FE_mean = FEt_th.mean() KL_st_mean = KL_st_th.mean() ot_mean = p_ot_th.mean() oht_mean = p_oht_th.mean() oat_mean = p_oat_th.mean() FE_mean_perturbations = FEt_th.mean(axis=0).mean(axis=1) FE_std_perturbations = FEt_th.mean(axis=0).std(axis=1) FE_mean_perturbations_std = FE_mean_perturbations.std(axis=0) FE_rank = n_perturbations - T.argsort(T.argsort(FE_mean_perturbations)) FE_rank_score = T.clip( numpy.log(0.5 * n_perturbations + 1) - T.log(FE_rank), 0.0, 10000.0).astype(dtype=theano.config.floatX) FE_rank_score_normalized = FE_rank_score / FE_rank_score.sum( ) - 1.0 / n_perturbations run_agent_scan = theano.function(inputs=[], outputs=[ states_th, oat_th, ot_th, oht_th, FEt_th, KL_st_th, hst_th, hst2_th, stmu_th, stsig_th, force_th, pos_th ], allow_input_downcast=True,
def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
    self.optimizer = optimizers.get(optimizer)
    self.loss = objectives.get(loss)
    weighted_loss = weighted_objective(objectives.get(loss))

    # model inputs and outputs
    self.X_train = self.get_input(train=True)
    self.X_test = self.get_input(train=False)
    self.y_train = self.get_output(train=True)
    self.y_test = self.get_output(train=False)

    # targets and sample weights
    self.y = T.zeros_like(self.y_train)
    self.weights = T.ones_like(self.y_train)

    if hasattr(self.layers[-1], "get_output_mask"):
        mask = self.layers[-1].get_output_mask()
    else:
        mask = None
    train_loss = weighted_loss(self.y, self.y_train, self.weights, mask)
    test_loss = weighted_loss(self.y, self.y_test, self.weights, mask)

    train_loss.name = 'train_loss'
    test_loss.name = 'test_loss'
    self.y.name = 'y'

    if class_mode == "categorical":
        train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                     T.argmax(self.y_train, axis=-1)))
        test_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                    T.argmax(self.y_test, axis=-1)))
    elif class_mode == "binary":
        train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
        test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))
    else:
        raise Exception("Invalid class mode:" + str(class_mode))
    self.class_mode = class_mode
    self.theano_mode = theano_mode

    for r in self.regularizers:
        train_loss = r(train_loss)
    updates = self.optimizer.get_updates(self.params, self.constraints, train_loss)
    updates += self.updates

    if type(self.X_train) == list:
        train_ins = self.X_train + [self.y, self.weights]
        test_ins = self.X_test + [self.y, self.weights]
        predict_ins = self.X_test
    else:
        train_ins = [self.X_train, self.y, self.weights]
        test_ins = [self.X_test, self.y, self.weights]
        predict_ins = [self.X_test]

    self._train = theano.function(train_ins, train_loss,
                                  updates=updates, allow_input_downcast=True,
                                  mode=theano_mode)
    self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy],
                                           updates=updates, allow_input_downcast=True,
                                           mode=theano_mode)
    self._predict = theano.function(predict_ins, self.y_test,
                                    allow_input_downcast=True, mode=theano_mode)
    # evaluation functions must not apply the training updates
    self._test = theano.function(test_ins, test_loss,
                                 allow_input_downcast=True, mode=theano_mode)
    self._test_with_acc = theano.function(test_ins, [test_loss, test_accuracy],
                                          allow_input_downcast=True, mode=theano_mode)
def get_word_idxs(relevant_sentence_idxs, support_, mask_): rel_support = support_[relevant_sentence_idxs, :] rel_mask = mask_[relevant_sentence_idxs, :] return rel_support[rel_mask.nonzero()].ravel() if attention: # estimate relevance of each sentence relevance_probs = attention_model.get_relevance_probs( support, mask, question_idxs) # By default, the attention model retrieves any sentence with prob > 0.5 under the model # If no sentence exists, it returns the top two sentences in chronological order max_idxs = T.sort(T.argsort(relevance_probs[:, 1])[-2:]) prob_idxs = T.arange( relevance_probs.shape[0])[T.nonzero(relevance_probs[:, 1] > 0.5)] est_idxs = ifelse(T.lt(T.sum(relevance_probs[:, 1] > 0.5), 1), max_idxs, prob_idxs) else: est_idxs = T.arange(support.shape[0]) # joint training of question model + attention model # if no attention, train on all of the sentences est_rel_facts = get_word_idxs(est_idxs, support, mask) answer_probs = qa_model.get_answer_probs(est_rel_facts, question_idxs) # train the qa-model using the hints # true_rel_facts = get_word_idxs(hints.nonzero(), support, mask) # answer_probs = qa_model.get_answer_probs(true_rel_facts, question_idxs)
def call(self, x, mask=None): output = x[T.arange(x.shape[0]).dimshuffle(0, "x", "x"), T.sort(T.argsort(x, axis=1)[:, -self.ktop:, :], axis=1), T.arange(x.shape[2]).dimshuffle("x", "x", 0)] return output
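The fancy indexing in call() above needs index grids for axes 0 and 2 while axis 1 carries the k-top positions kept in their original order. A hedged NumPy sketch of the same selection (the name ktop_axis1 is invented):

import numpy as np

def ktop_axis1(x, ktop):
    # positions of the ktop largest entries along axis 1, kept in original order
    idx = np.sort(np.argsort(x, axis=1)[:, -ktop:, :], axis=1)
    i0 = np.arange(x.shape[0])[:, None, None]
    i2 = np.arange(x.shape[2])[None, None, :]
    return x[i0, idx, i2]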
def neighbourhood(X): D = distance_tensor(X) N = T.argsort(D, axis=0) mask = T.cast(T.lt(N, nc), 'float32') return N[1:nc + 1], mask
def __init__(self, numTruncate=20, numHidden=500, inputsSize=[576], outputsSize=[1, 4]): #################################### # Create model # #################################### # Create tensor variables to store input / output data FeaturesXGt = T.matrix('FeaturesXGt', dtype='float32') FeaturesX = T.tensor3('FeaturesX') TargetY = T.tensor3('PredY') BboxY = T.tensor3('BboxY') C = T.vector('C', dtype='float32') S = T.vector('S', dtype='float32') BoxsVariances = T.matrix('BoxsVariances') RatioPosNeg = T.scalar('RatioPosNeg') # Create shared variable for input net = LSTMNet() net.NetName = 'LSTMTrackingNet' # Input # net.Layer['input'] = InputLayer(net, X) net.LayerOpts['lstm_num_truncate'] = numTruncate # net.LayerOpts['reshape_new_shape'] = (net.LayerOpts['lstm_num_truncate'], 576) # TODO: Need to set this size later # net.Layer['input_2d'] = ReshapeLayer(net, net.Layer['input'].Output) # Setting LSTM architecture net.LayerOpts['lstm_num_hidden'] = numHidden net.LayerOpts['lstm_inputs_size'] = inputsSize net.LayerOpts['lstm_outputs_size'] = outputsSize # Truncate lstm model currentC = C currentS = S preds = [] bboxs = [] predictLayers = [] for truncId in range(net.LayerOpts['lstm_num_truncate']): # Create LSTM layer currentInput = FeaturesXGt[truncId] net.Layer['lstm_truncid_%d' % (truncId)] = LSTMLayer( net, currentInput, currentC, currentS) net.LayerOpts['lstm_params'] = net.Layer['lstm_truncid_%d' % (truncId)].Params # Predict next position based on current state currentInput = FeaturesX[truncId] tempLayer = LSTMLayer(net, currentInput, currentC, currentS) predictLayers.append(tempLayer) pred = SigmoidLayer(tempLayer.Output[0]).Output bbox = tempLayer.Output[1] preds.append(pred) bboxs.append(bbox) # Update stateS and stateC currentC = net.Layer['lstm_truncid_%d' % (truncId)].C currentS = net.Layer['lstm_truncid_%d' % (truncId)].S lastS = currentS lastC = currentC self.Net = net # Calculate cost function # Confidence loss cost = 0 costPos = 0 costLoc = 0 costNeg = 0 k0 = None k1 = None k2 = None k3 = None k4 = None for truncId in range(net.LayerOpts['lstm_num_truncate']): pred = preds[truncId] bbox = bboxs[truncId] target = TargetY[truncId] bboxgt = BboxY[truncId] numFeaturesPerIm = pred.shape[0] numAnchorBoxPerLoc = pred.shape[1] pred = pred.reshape((numFeaturesPerIm * numAnchorBoxPerLoc, 1)) target = target.reshape((numFeaturesPerIm * numAnchorBoxPerLoc, 1)) bbox = bbox.reshape((numFeaturesPerIm * numAnchorBoxPerLoc, 4)) bbox = bbox / BoxsVariances bboxgt = bboxgt.reshape((numFeaturesPerIm * numAnchorBoxPerLoc, 4)) allLocCost = T.sum(T.abs_(bbox - bboxgt), axis=1, keepdims=True) * target allConfPosCost = -target * T.log(pred) allConfNegCost = -(1 - target) * T.log(1 - pred) allPosCost = allConfPosCost + allLocCost * 0 allNegCost = allConfNegCost allPosCostSum = T.sum(allPosCost, axis=1) allNegCostSum = T.sum(allNegCost, axis=1) sortedPosCostIdx = T.argsort(allPosCostSum, axis=0) sortedNegCostIdx = T.argsort(allNegCostSum, axis=0) sortedPosCost = allPosCostSum[sortedPosCostIdx] sortedNegCost = allNegCostSum[sortedNegCostIdx] if k0 == None: k0 = target if k1 == None: k1 = allLocCost if k2 == None: k2 = pred if k3 == None: k3 = sortedPosCostIdx if k4 == None: k4 = sortedNegCostIdx numMax = T.sum(T.neq(sortedPosCost, 0)) # numNegMax = T.cast(T.floor(T.minimum(T.maximum(numMax * RatioPosNeg, 2), 300)), dtype = 'int32') numNegMax = T.cast(T.floor(numMax * RatioPosNeg), dtype='int32') top2PosCost = sortedPosCost[-numMax:] top6NegCost = sortedNegCost[-numNegMax:] layerCost = 
(T.sum(top2PosCost) + T.sum(top6NegCost)) / numMax cost = cost + layerCost costPos = costPos + pred[sortedPosCostIdx[-numMax:]].mean() costLoc = costLoc + allLocCost.sum() / numMax costNeg = costNeg + pred[sortedNegCostIdx[-numNegMax:]].mean() cost = cost / net.LayerOpts['lstm_num_truncate'] costPos = costPos / net.LayerOpts['lstm_num_truncate'] costLoc = costLoc / net.LayerOpts['lstm_num_truncate'] costNeg = costNeg / net.LayerOpts['lstm_num_truncate'] # Create update function params = self.Net.Layer['lstm_truncid_0'].Params grads = T.grad(cost, params) updates = AdamGDUpdate(net, params=params, grads=grads).Updates # Train function self.TrainFunc = theano.function(inputs=[ FeaturesXGt, FeaturesX, TargetY, BboxY, S, C, BoxsVariances, RatioPosNeg ], updates=updates, outputs=[ cost, lastS, lastC, costPos, costLoc, costNeg, k0, k1, k2, k3, k4 ]) self.PredFunc = theano.function(inputs=[FeaturesX, S, C], outputs=[preds[0], bboxs[0]]) nextS = self.Net.Layer['lstm_truncid_0'].S nextC = self.Net.Layer['lstm_truncid_0'].C self.NextState = theano.function(inputs=[FeaturesXGt, S, C], outputs=[nextS, nextC])
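The cost above is an online hard-example-mining scheme: per-anchor losses are argsorted and only the non-zero positives plus a ratio-limited set of the largest negatives contribute. A small NumPy sketch of that selection (the function name is invented):

import numpy as np

def hard_example_cost(pos_loss, neg_loss, ratio_pos_neg=3.0):
    num_pos = int(np.sum(pos_loss != 0))
    num_neg = int(np.floor(num_pos * ratio_pos_neg))
    hard_pos = np.sort(pos_loss)[-num_pos:]   # all non-zero positive losses
    hard_neg = np.sort(neg_loss)[-num_neg:]   # hardest negatives only
    return (hard_pos.sum() + hard_neg.sum()) / max(num_pos, 1)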
energy_T = network.compute_energy(disc_score_T, state)

# generated samples
samples = gen_model.forward(noise)
feat_F = enc_model.forward(samples)
disc_score_F = disc_model.forward(feat_F)
energy_F = network.compute_energy(disc_score_F, state)

# sample gradient: pairwise distances between generated samples
sample_sqr = T.sum(samples**2, axis=1)
dist_mat = T.sqrt(
    sample_sqr.reshape((-1, 1)) + sample_sqr.reshape((1, -1)) -
    2 * T.dot(samples, samples.T))
# 21 nearest neighbours per sample (column 0 is the sample itself)
neighbor_ids = T.argsort(dist_mat, axis=1)[:, :21]
neighbor_mean = T.mean(samples[neighbor_ids[:, 1:]], axis=1)
neighbor_var = T.var(samples[neighbor_ids], axis=1)
# row indices matching the 21 neighbour columns
indices = T.repeat(T.arange(dist_mat.shape[0]).reshape((-1, 1)), 21, axis=1)
neighbor_dist = T.mean(dist_mat[indices, neighbor_ids], axis=1, keepdims=True)
sample_gradient = (neighbor_mean - samples)
sample_gradient /= neighbor_var
# sample_gradient /= T.sqrt(T.sum(sample_gradient ** 2, axis=1, keepdims=True))
sample_gradient = theano.gradient.disconnected_grad(sample_gradient *
                                                    state['knn_scale'])
# grid = theano.shared(data)
# grid_sqr = theano.shared(data_sqr, broadcastable=(False, True))
def loop(l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i): l_input_tensor = debug_print( Matrix_Bit_Shift(l_matrix[:, l_left:-l_right]), 'l_input_tensor') r_input_tensor = debug_print( Matrix_Bit_Shift(r_matrix[:, r_left:-r_right]), 'r_input_tensor') addition_l = T.sum(l_matrix[:, l_left:-l_right], axis=1) addition_r = T.sum(r_matrix[:, r_left:-r_right], axis=1) cosine_addition = cosine(addition_l, addition_r) eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r)) #25.2% layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1) layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1) cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep) eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)) #25.2% attention_matrix = compute_simi_feature_matrix_with_matrix( layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength * (maxSentLength + 1) / 2) l_max_attention = T.max(attention_matrix, axis=1) neighborsArgSorted = T.argsort(l_max_attention) kNeighborsArg = neighborsArgSorted[:3] #only average the min 3 vectors ll = T.sort(kNeighborsArg).flatten() # make y indices in acending lie r_max_attention = T.max(attention_matrix, axis=0) neighborsArgSorted_r = T.argsort(r_max_attention) kNeighborsArg_r = neighborsArgSorted_r[: 3] #only average the min 3 vectors rr = T.sort( kNeighborsArg_r).flatten() # make y indices in acending lie l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll], 'l_max_min_attention') r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr], 'r_max_min_attention') layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1], U=U1, W=W1, b=b1, bptt_truncate=-1) layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1], U=U1, W=W1, b=b1, bptt_truncate=-1) vec_l = debug_print( layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l') vec_r = debug_print( layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r') # sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) # aver_uni_l=sum_uni_l/layer0_l_input.shape[3] # norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) # sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) # aver_uni_r=sum_uni_r/layer0_r_input.shape[3] # norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) # uni_cosine = cosine(vec_l, vec_r) # aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) # uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') # ''' # linear=Linear(sum_uni_l, sum_uni_r) # poly=Poly(sum_uni_l, sum_uni_r) # sigmoid=Sigmoid(sum_uni_l, sum_uni_r) # rbf=RBF(sum_uni_l, sum_uni_r) # gesd=GESD(sum_uni_l, sum_uni_r) # ''' eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r)) #25.2% # #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) # len_l = norm_length_l_i.reshape((1, 1)) len_r = norm_length_r_i.reshape((1, 1)) # # ''' # len_l=length_l.reshape((1,1)) # len_r=length_r.reshape((1,1)) # ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts # layer3_input_nn=T.concatenate([vec_l, vec_r, # cosine_addition, eucli_addition, # # cosine_sent, eucli_sent, # uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1) output_i = T.concatenate( [ vec_l, vec_r, cosine_addition, 
eucli_addition, # cosine_sent, eucli_sent, uni_cosine, eucli_1, mts_i.reshape((1, 14)), len_l, len_r, extra_i.reshape((1, 9)) ], axis=1) #, layer2.output, layer1.output_cosine], axis=1) return output_i
def __init__(self, inputs, cost, layers, max_col_norm=None, loss_based_pooling=False, pooling_loss=None, learning_rate=0.01, momentum=None, rmsprop=True, adadelta=False, center_grads=False, rho=0.96, epsilon=1e-8, use_nesterov=True, seed=None, rng=None, constants=None, **kw): self.loss_based_pooling = loss_based_pooling self.rng = rng params = [layer.W for layer in layers] + [layer.b for layer in layers] self.learning_rate = theano.shared( numpy.asarray(learning_rate, dtype=theano.config.floatX)) self.layers = layers self.max_col_norm = max_col_norm #Initialize parameters for rmsprop: accumulators = OrderedDict({}) accumulators_mgrad = OrderedDict({}) exp_sqr_grads = OrderedDict({}) exp_sqr_ups = OrderedDict({}) e0s = OrderedDict({}) learn_rates = [] from utils import as_floatX self.max_col_norm = max_col_norm gparams = [] for param in params: eps_p = numpy.zeros_like(param.get_value()) accumulators[param] = theano.shared(value=as_floatX(eps_p), name="acc_%s" % param.name) accumulators_mgrad[param] = theano.shared(value=as_floatX(eps_p), name="acc_mgrad%s" % param.name) exp_sqr_grads[param] = theano.shared(value=as_floatX(eps_p), name="exp_grad_%s" % param.name) exp_sqr_ups[param] = theano.shared(value=as_floatX(eps_p), name="exp_grad_%s" % param.name) e0s[param] = as_floatX(learning_rate) gparam = T.grad(cost, param, consider_constant=constants) gparams.append(gparam) updates = OrderedDict({}) i = 0 for param, gparam in zip(params, gparams): if rmsprop: acc = accumulators[param] rms_grad = rho * acc + (1 - rho) * T.sqr(gparam) updates[acc] = rms_grad val = T.maximum(T.sqrt(T.sum(rms_grad, axis=0)), epsilon) learn_rates.append(e0s[param] / val) if center_grads: acc_mg = accumulators_mgrad[param] mean_grad = rho * acc_mg + (1 - rho) * gparam gparam = gparam - mean_grad updates[acc_mg] = mean_grad if momentum and not use_nesterov: memory = theano.shared(param.get_value() * 0.) updates[param] = param - memory updates[ memory] = momentum * memory + learn_rates[i] * gparam elif use_nesterov: memory = theano.shared(param.get_value() * 0.) new_memo = momentum * memory - e0s[param] * gparam #new_memo = momentum * memory - learn_rates[i] * gparam updates[memory] = new_memo updates[param] = param + (momentum * new_memo - e0s[param] * gparam) / val else: updates[param] = param - learn_rates[i] * gparam i += 1 elif adadelta: exp_sg = exp_sqr_grads[param] exp_su = exp_sqr_ups[param] up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gparam) updates[exp_sg] = up_exp_sg step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gparam updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step) updates[param] = param + step else: if momentum and not use_nesterov: memory = theano.shared(param.get_value() * 0.) updates[param] = param - memory updates[ memory] = momentum * memory + learning_rate * gparam elif use_nesterov: memory = theano.shared(param.get_value() * 0.) new_memo = momentum * memory - learning_rate * gparam updates[memory] = new_memo updates[ param] = param + momentum * new_memo - learning_rate * gparam else: updates[param] = param - learning_rate * gparam if max_col_norm is not None: updates = self.constrain_weights(layers, updates, max_col_norm) self.updates = updates self._train = theano.function(inputs, outputs=cost, updates=updates) self._constrain_inputs = theano.function(inputs, outputs=T.argsort( pooling_loss, axis=0))
test_layer0_output = conv_layer.predict(test_layer0_input, test_size) test_pred_layers.append(test_layer0_output.flatten(2)) test_layer1_input = T.concatenate(test_pred_layers, 1) test_y_pred = classifier.predict(test_layer1_input) test_error = T.mean(T.neq(test_y_pred, y)) test_model_all = theano.function([x, y], test_error, allow_input_downcast=True) test_predict = theano.function([x], test_y_pred, allow_input_downcast=True) #test_probs = theano.function([x], test_y_pred_p_reduce, allow_input_downcast=True) #gradient-based update dinput = T.grad(dropout_cost, layer0_input) din_onehot = dinput.dot(W.transpose()) all_din1_indextemp = T.max(din_onehot, axis=3) all_din1_index = T.argsort(all_din1_indextemp, axis=2) Fall_din1_index = theano.function( [index], all_din1_index, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) all_din2_index = T.argsort(din_onehot, axis=3) Fall_din2_index = theano.function( [index], all_din2_index, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size],
def step(x_t, M_tm1, c_tm1, h_tm1, r_tm1, wr_tm1, wu_tm1): # Feed Forward controller # h_t = lasagne.nonlinearities.tanh(T.dot(x_t, W_h) + b_h) # LSTM controller # p.3: "This memory is used by the controller as the input to a classifier, # such as a softmax output layer, and as an additional # input for the next controller state." -> T.dot(r_tm1, W_rh) preactivations = T.dot(x_t, W_xh) + T.dot(r_tm1, W_rh) + T.dot( h_tm1, W_hh) + b_h gf_, gi_, go_, u_ = slice_equally(preactivations, controller_size, 4) gf = lasagne.nonlinearities.sigmoid(gf_) gi = lasagne.nonlinearities.sigmoid(gi_) go = lasagne.nonlinearities.sigmoid(go_) u = lasagne.nonlinearities.tanh(u_) c_t = gf * c_tm1 + gi * u h_t = go * lasagne.nonlinearities.tanh(c_t) # (batch_size, num_units) k_t = lasagne.nonlinearities.tanh( T.dot(h_t, W_key) + b_key) # (batch_size, nb_reads, memory_size[1]) a_t = lasagne.nonlinearities.tanh( T.dot(h_t, W_add) + b_add) # (batch_size, nb_reads, memory_size[1]) sigma_t = lasagne.nonlinearities.sigmoid( T.dot(h_t, W_sigma) + b_sigma) # (batch_size, nb_reads, 1) sigma_t = T.addbroadcast(sigma_t, 2) wlu_tm1 = T.argsort(wu_tm1, axis=1)[:, :nb_reads] # (batch_size, nb_reads) # ww_t = sigma_t * wr_tm1 + (1. - sigma_t) * wlu_tm1 ww_t = (sigma_t * wr_tm1).reshape( (batch_size * nb_reads, memory_shape[0])) ww_t = T.inc_subtensor( ww_t[T.arange(batch_size * nb_reads), wlu_tm1.flatten()], 1. - sigma_t.flatten()) # (batch_size * nb_reads, memory_size[0]) ww_t = ww_t.reshape( (batch_size, nb_reads, memory_shape[0])) # (batch_size, nb_reads, memory_size[0]) # p.4: "Prior to writing to memory, the least used memory location is # computed from wu_tm1 and is set to zero" M_t = T.set_subtensor(M_tm1[T.arange(batch_size), wlu_tm1[:, 0]], 0.) M_t = M_t + T.batched_dot(ww_t.dimshuffle( 0, 2, 1), a_t) # (batch_size, memory_size[0], memory_size[1]) K_t = cosine_similarity(k_t, M_t) # (batch_size, nb_reads, memory_size[0]) wr_t = lasagne.nonlinearities.softmax( K_t.reshape((batch_size * nb_reads, memory_shape[0]))) wr_t = wr_t.reshape( (batch_size, nb_reads, memory_shape[0])) # (batch_size, nb_reads, memory_size[0]) if batch_size == 1: wr_t = T.unbroadcast(wr_t, 0) wu_t = gamma * wu_tm1 + T.sum(wr_t, axis=1) + T.sum( ww_t, axis=1) # (batch_size, memory_size[0]) r_t = T.batched_dot(wr_t, M_t).flatten( ndim=2) # (batch_size, nb_reads * memory_size[1]) return (M_t, c_t, h_t, r_t, wr_t, wu_t)
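In the step above, the least-used write locations come straight from an ascending argsort of the usage weights; a small standalone NumPy illustration (not from the model):

import numpy as np

wu = np.array([[0.7, 0.1, 0.9, 0.05]])      # usage per memory slot, one batch row
nb_reads = 2
wlu = np.argsort(wu, axis=1)[:, :nb_reads]  # least-used slots first
# wlu -> [[3, 1]]: slots 3 and 1 have the lowest usage and are written next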
def merge_inc_func(self, learning_rate, batch_size, prealloc_x, prealloc_y): ''' Return a function that can merge/increment the model ''' # matrix for scoring merges m = T.matrix('m') m_dists, _ = theano.map(lambda v: T.sqrt(T.dot(v, v.T)), m) m_cosine = (T.dot(m, m.T) / m_dists) / m_dists.dimshuffle(0, 'x') m_ranks = T.argsort( (m_cosine - T.tri(m.shape[0]) * np.finfo(theano.config.floatX).max ).flatten())[(m.shape[0] * (m.shape[0] + 1)) // 2:] score_merges = theano.function([m], m_ranks) # greedy layerwise training layer_greedy = [ ae.indexed_train_func( 0, learning_rate, prealloc_x, batch_size, lambda x, j=i: chained_output(self.layers[:j], x)) for i, ae in enumerate(self._layered_autoencoders) ] finetune = self._autoencoder.train_func(0, learning_rate, prealloc_x, prealloc_y, batch_size) combined_objective_tune = self._combined_objective.train_func( 0, learning_rate, prealloc_x, prealloc_y, batch_size) # set up layered merge-increment - build a cost function mi_cost = self._softmax.cost + self.lam * self._autoencoder.cost mi_updates = [] for i, nnlayer in enumerate(self._autoencoder.layers): if i == 0: mi_updates += [ (nnlayer.W, T.inc_subtensor( nnlayer.W[:, nnlayer.idx], -learning_rate * T.grad(mi_cost, nnlayer.W)[:, nnlayer.idx].T)) ] mi_updates += [(nnlayer.b, T.inc_subtensor( nnlayer.b[nnlayer.idx], -learning_rate * T.grad(mi_cost, nnlayer.b)[nnlayer.idx]))] else: mi_updates += [ (nnlayer.W, nnlayer.W - learning_rate * T.grad(mi_cost, nnlayer.W)) ] mi_updates += [ (nnlayer.b, nnlayer.b - learning_rate * T.grad(mi_cost, nnlayer.b)) ] mi_updates += [(nnlayer.b_prime, -learning_rate * T.grad(mi_cost, nnlayer.b_prime))] softmax_theta = [self.layers[-1].W, self.layers[-1].b] mi_updates += [(param, param - learning_rate * grad) for param, grad in zip(softmax_theta, T.grad(mi_cost, softmax_theta))] idx = T.iscalar('idx') given = { self._x: prealloc_x[idx * batch_size:(idx + 1) * batch_size], self._y: prealloc_y[idx * batch_size:(idx + 1) * batch_size] } mi_train = theano.function([idx, self.layers[0].idx], None, updates=mi_updates, givens=given) def merge_model(pool_indexes, merge_percentage, inc_percentage): ''' Merge/increment the model using the given batch ''' prev_map = {} prev_dimensions = self.layers[0].initial_size[0] # first layer used = set() empty_slots = [] layer_weights = self.layers[0].W.get_value().T.copy() layer_bias = self.layers[0].b.get_value().copy() init = 4 * np.sqrt(6.0 / (sum(layer_weights.shape))) merge_count = int(merge_percentage * layer_weights.shape[0]) inc_count = int(inc_percentage * layer_weights.shape[0]) if merge_count == 0 and inc_count == 0: return for index in score_merges(layer_weights): if len(empty_slots) == merge_count: break x_i, y_i = index % layer_weights.shape[ 0], index // layer_weights.shape[0] if x_i not in used and y_i not in used: # merge x_i with y_i layer_weights[x_i] = (layer_weights[x_i] + layer_weights[y_i]) / 2 layer_bias[x_i] = (layer_bias[x_i] + layer_bias[y_i]) / 2 used.update([x_i, y_i]) empty_slots.append(y_i) new_size = layer_weights.shape[0] + inc_count - len(empty_slots) current_size = layer_weights.shape[0] # compact weights array if neccessary if new_size < current_size: non_empty_slots = sorted( list(set(range(0, current_size)) - set(empty_slots)), reverse=True)[:len(empty_slots)] prev_map = dict(zip(empty_slots, non_empty_slots)) # compact the layer weights by removing the empty slots for dest, src in prev_map.items(): layer_weights[dest] = layer_weights[src] layer_weights[src] = np.asarray(self.rng.uniform( low=-init, 
high=init, size=layer_weights.shape[1]), dtype=theano.config.floatX) empty_slots = [] else: prev_map = {} # will need to add more space for new features new_layer_weights = np.zeros((new_size, prev_dimensions), dtype=theano.config.floatX) new_layer_weights[:layer_weights.shape[0], :layer_weights.shape[ 1]] = layer_weights[:new_layer_weights. shape[0], :new_layer_weights.shape[1]] # randomly initalise new neurons empty_slots = [slot for slot in empty_slots if slot < new_size ] + list(range(layer_weights.shape[0], new_size)) new_layer_weights[empty_slots] = np.asarray( self.rng.uniform(low=-init, high=init, size=(len(empty_slots), prev_dimensions)), dtype=theano.config.floatX) layer_bias.resize(new_size) layer_bias_prime = self.layers[0].b_prime.get_value().copy() layer_bias_prime.resize(prev_dimensions) prev_dimensions = new_layer_weights.shape[0] # set the new data self.layers[0].W.set_value(new_layer_weights.T) self.layers[0].b.set_value(layer_bias) self.layers[0].b_prime.set_value(layer_bias_prime) #if empty_slots: ## train this layer #for _ in range(self.iterations): #for i in pool_indexes: #layer_greedy[0](i, empty_slots) # update the last layer's weight matrix size last_layer_weights = self.layers[1].W.get_value().copy() # apply mapping to last layer for dest, src in prev_map.items(): last_layer_weights[dest] = last_layer_weights[src] last_layer_weights[src] = np.zeros(last_layer_weights.shape[1]) # fix sizes last_layer_weights.resize( (prev_dimensions, self.layers[1].initial_size[1])) last_layer_prime = self.layers[1].b_prime.get_value().copy() last_layer_prime.resize(prev_dimensions) self.layers[1].W.set_value(last_layer_weights) self.layers[1].b_prime.set_value(last_layer_prime) # finetune with the deep autoencoder for _ in range(self.iterations): for i in pool_indexes: finetune(i) # finetune with supervised if empty_slots: for _ in range(self.iterations): for i in pool_indexes: mi_train(i, empty_slots) else: for i in pool_indexes: combined_objective_tune(i) return merge_model
def preparePooling(self, conv_out):
    # Flatten each feature-map row into a neighbourhood so that k-max pooling
    # can be done with a single argsort along axis 1.
    neighborsForPooling = TSN.images2neibs(ten4=conv_out,
                                           neib_shape=(1, conv_out.shape[3]),
                                           mode='ignore_borders')
    self.neighbors = neighborsForPooling
    neighborsArgSorted = T.argsort(neighborsForPooling, axis=1)
    return neighborsForPooling, neighborsArgSorted

def get_scores_all(mdl, fm, X, X_new, num_select=10): #f = open(output, "w") X1 = [] # x = X[i], we add new values to the end of x for j in range(len(X_new)): for k in range(len(X_new[j])): xx = [xxx for xxx in X[j]] xx.append(X_new[j][k][0] + "_" + X_new[j][k][1]) X1.append(xx) X = [[fm.f.map[fm.f.getFeatureValue(x)] + 1 for x in XX] for XX in X1] x, x_mask = preprare_seq_seq_data(X) x, _, mask_x, _, _, _, _, _ = mdl.standardize_data(x, None, x_mask, None, None, None, None, None) score_pos = mdl.get_output_layer(-1, x, mask_x) score_pos = score_pos.swapaxes(0, 1) score_pos = score_pos[:, -1] print(score_pos) x = T.matrix("score") sort_f = th.function([x], T.argsort(x)) sorted_values = sort_f(score_pos) sorted_values = sorted_values print(sorted_values) rs = [] rs_scores = [] my_scores = [] for i in range(sorted_values.shape[0]): #f.write(to_string(X1[i]) + " ") ss = [] for j in range(1, sorted_values.shape[1] + 1): val = sorted_values[i][sorted_values.shape[1] - j] #val_map = fm.fY.map_inversed[val-1] score = score_pos[i][val] #f.write(str(val) + ":" + str(score) + " ") ss.append((val, score)) #f.write("\n") my_scores.append((to_string(X1[i]), ss)) vals = [] c = 0 for t in range(sorted_values.shape[1] - 1, -1, -1): if c == num_select: break v = sorted_values[i][t] if fm.fY.map_inversed[v - 1] != "EOS": vals.append(v) c += 1 #vals = sorted_values[i][sorted_values.shape[1]-num_select:sorted_values.shape[1]] val_maps = [fm.fY.map_inversed[v - 1].split("_") for v in list(vals)] #if fm.fY.map_inversed[v-1]!="EOS" ] scores = [score_pos[i][v] for v in list(vals)] # if fm.fY.map_inversed[v-1]!="EOS"] rs.append(val_maps) rs_scores.append(scores) return rs, rs_scores, X1, my_scores
def all_same(idxs): first_row = idxs[0, :].reshape((1, idxs.shape[1])) first_row = T.argsort(first_row) return first_row.repeat(idxs.shape[0], axis=0)
def dnc_step( s_x_, s_lstm_cell_, s_lstm_hid_, s_usage_, s_preced_, s_link_, s_mem_, s_read_val_, s_read_wgt_, s_write_wgt_): s_states_li_ = [ s_lstm_cell_, s_lstm_hid_, s_usage_, s_preced_, s_link_, s_mem_, s_read_val_, s_read_wgt_, s_write_wgt_] s_inp = T.join(-1, s_x_, s_read_val_.flatten()) s_lstm_cell_tp1, s_lstm_hid_tp1 = lyr.lyr_lstm( 'ctrl', s_inp, s_lstm_cell_, s_lstm_hid_, ctrl_inp_size, ctrl_wm_size ) s_out, s_itrface = T.split( lyr.lyr_linear( 'ctrl_out', s_lstm_hid_tp1, ctrl_wm_size, ctrl_wm_size, bias_=None), [OUT_DIMS,itrface_size],2, axis=-1) splits_len = [ N_READS*CELL_SIZE, N_READS, CELL_SIZE, 1, CELL_SIZE, CELL_SIZE, N_READS, 1, 1, 3*N_READS ] s_keyr, s_strr, s_keyw, s_strw, \ s_ers, s_write, s_freeg, s_allocg, s_writeg, s_rmode = \ T.split(s_itrface, splits_len, 10, axis=-1) s_keyr = T.reshape(s_keyr, (CELL_SIZE,N_READS)) s_strr = 1.+T.nnet.softplus(s_strr) s_strw = 1.+T.nnet.softplus(s_strw[0]) s_ers = T.nnet.sigmoid(s_ers) s_freeg = T.nnet.sigmoid(s_freeg) s_allocg = T.nnet.sigmoid(s_allocg[0]) s_writeg = T.nnet.sigmoid(s_writeg[0]) s_rmode = T.nnet.softmax(T.reshape(s_rmode,(N_READS,3))).dimshuffle(1,0,'x') s_mem_retention = T.prod( 1.-s_freeg.dimshuffle(0,'x')*s_read_wgt_, axis=0) s_usage_tp1 = s_mem_retention*( s_usage_+s_write_wgt_-s_usage_*s_write_wgt_) s_usage_order = T.argsort(s_usage_tp1) s_usage_order_inv = T.inverse_permutation(s_usage_order) s_usage_tp1_sorted = s_usage_tp1[s_usage_order] s_alloc_wgt = ((1.-s_usage_tp1_sorted)*( T.join( 0,np.array([1.],dtype=th.config.floatX), op_cumprod_hack(s_usage_tp1_sorted[:-1]) )))[s_usage_order_inv] s_content_wgt_w = T.nnet.softmax( s_strw*T.dot(s_mem_, s_keyw)/( T.sqrt( EPS+T.sum(T.sqr(s_mem_),axis=-1)*T.sum(T.sqr(s_keyw)))) ).flatten() s_write_wgt_tp1 = s_writeg*( s_allocg*s_alloc_wgt+(1.-s_allocg)*s_content_wgt_w) s_mem_tp1 = s_mem_*( 1.-T.outer(s_write_wgt_tp1,s_ers))+T.outer(s_write_wgt_tp1,s_write) s_preced_tp1 = (1.-T.sum(s_write_wgt_))*s_preced_ + s_write_wgt_tp1 s_link_tp1 = ( 1.-s_write_wgt_tp1-s_write_wgt_tp1.dimshuffle(0,'x') )*s_link_ + T.outer(s_write_wgt_tp1,s_preced_) s_link_tp1 = s_link_tp1 * (1.-T.identity_like(s_link_tp1))#X s_fwd = T.dot(s_read_wgt_, s_link_tp1.transpose())#X s_bwd = T.dot(s_read_wgt_, s_link_tp1)#X s_content_wgt_r= T.nnet.softmax(T.dot(s_mem_tp1, s_keyr)/(T.sqrt( EPS+T.outer( T.sum(T.sqr(s_mem_tp1),axis=-1),T.sum(T.sqr(s_keyr),axis=0) )))).transpose() s_read_wgt_tp1 = s_bwd*s_rmode[0]+s_content_wgt_r*s_rmode[1]+s_fwd*s_rmode[2] s_read_val_tp1 = T.dot(s_read_wgt_tp1, s_mem_tp1) s_y = s_out + lyr.lyr_linear( 'read_out', s_read_val_tp1.flatten(), CELL_SIZE*N_READS,OUT_DIMS, bias_=None) return [ s_y, s_lstm_cell_tp1, s_lstm_hid_tp1, s_usage_tp1, s_preced_tp1, s_link_tp1, s_mem_tp1, s_read_val_tp1, s_read_wgt_tp1, s_write_wgt_tp1]
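The allocation weighting in dnc_step follows the DNC rule: sort slots by usage, give each slot (1 - usage) scaled by the product of the usages of all less-used slots, then undo the sort. A hedged NumPy sketch of that formula:

import numpy as np

def allocation_weighting(usage):
    order = np.argsort(usage)                       # ascending usage
    u_sorted = usage[order]
    cumprod = np.concatenate(([1.0], np.cumprod(u_sorted)[:-1]))
    alloc_sorted = (1.0 - u_sorted) * cumprod
    alloc = np.empty_like(alloc_sorted)
    alloc[order] = alloc_sorted                     # undo the sort (inverse permutation)
    return alloc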
resultOrigAndWarp = T.set_subtensor(r2[(r2 < 0.0001).nonzero()], 999999) resultOrigAndWarp = T.concatenate((c[:, 0:2], resultOrigAndWarp), axis=1) _imageWarp2GPU = theano.function([pix, KRT, KRinv, KR2, KRT2, adjust], resultOrigAndWarp, on_unused_input='ignore') #theano.printing.debugprint(_imageWarp2GPU) r3 = r2 rd = r3[:, 2] rxf = r3[:, 0] / rd ryf = r3[:, 1] / rd rx = T.cast(rxf, 'int32') ry = T.cast(ryf, 'int32') #rxy = T.transpose(T.as_tensor_variable([rx,ry])) i = T.argsort(-rd) rx = rx[i] ry = ry[i] rd = rd[i] c = c[i] p = p[i] r2 = r2[i] dest_img2 = T.set_subtensor(dest_img[rx, ry], p[:, 0]) dest_img = T.set_subtensor(dest_img[rx, ry], rd) _imageWarp2GPUFilled2 = theano.function([pix, KRT, KRinv, KR2, KRT2, adjust], T.transpose(dest_img2)) _imageWarp2GPUFilled = theano.function([pix, KRT, KRinv, KR2, KRT2, adjust], T.transpose(dest_img)) # interpolate each pixel with the color values at (x,y),(x,y+1),(x+1,y),(x+1,y+1)
def kmax(masked_data): result = masked_data[ T.arange(masked_data.shape[0]).dimshuffle(0, "x", "x"), T.sort(T.argsort(masked_data, axis=1)[:, -pooling_size:, :], axis=1), T.arange(masked_data.shape[2]).dimshuffle("x", "x", 0)] return result
def get_output(self, input_, label, mask=None):
    """
    This function overrides the parents' one.
    Computes the accuracy of the model prediction against the real label.

    Parameters
    ----------
    input_: TensorVariable
        an array of (batch size, prediction).
        for an accuracy task, "input_" is a 2D matrix.
    label: TensorVariable
        an array of (batch size, answer), or (batch size,) if label is a list of class labels.
        for classification, the second form is highly recommended;
        label should be an integer in that case.
    mask: TensorVariable
        an array of (batch size,) that only contains 0 and 1.
        accuracies are summed or averaged only where the mask is 1.

    Returns
    -------
    TensorVariable
        a symbolic tensor variable which is scalar.
    """
    # do
    if mask is None:
        if self.top_k == 1:
            if label.ndim == 1:
                return T.mean(T.eq(T.argmax(input_, axis=-1), label))
            elif label.ndim == 2:
                return T.mean(T.eq(T.argmax(input_, axis=-1), T.argmax(label, axis=-1)))
            else:
                raise ValueError()
        else:
            # TODO: not yet tested
            top_k_input_ = T.argsort(input_)[:, -self.top_k:]  # sort by values and keep top k indices
            if label.ndim == 1:
                return T.mean(T.any(T.eq(top_k_input_, label.dimshuffle(0, 'x')), axis=-1))
            elif label.ndim == 2:
                return T.mean(T.any(T.eq(top_k_input_,
                                         T.argmax(label, axis=-1, keepdims=True)), axis=-1))
            raise ValueError()
    else:
        if self.top_k == 1:
            if label.ndim == 1:
                return T.sum(T.eq(T.argmax(input_, axis=-1), label) * mask) / T.sum(mask)
            elif label.ndim == 2:
                return T.sum(T.eq(T.argmax(input_, axis=-1),
                                  T.argmax(label, axis=-1)) * mask) / T.sum(mask)
            else:
                raise ValueError()
        else:
            # TODO: not yet tested
            top_k_input_ = T.argsort(input_)[:, -self.top_k:]  # sort by values and keep top k indices
            if label.ndim == 1:
                return T.sum(T.any(T.eq(top_k_input_, label.dimshuffle(0, 'x')),
                                   axis=-1) * mask) / T.sum(mask)
            elif label.ndim == 2:
                return T.sum(T.any(T.eq(top_k_input_,
                                        T.argmax(label, axis=-1, keepdims=True)),
                                   axis=-1) * mask) / T.sum(mask)
            raise ValueError()
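A plain NumPy rendering of the top-k accuracy computed above, assuming integer class labels (the helper name is invented):

import numpy as np

def topk_accuracy(pred, label, k=1, mask=None):
    topk = np.argsort(pred, axis=-1)[:, -k:]         # indices of the k largest scores
    hit = np.any(topk == label[:, None], axis=-1)    # 1 if the true class is among them
    if mask is None:
        return hit.mean()
    return (hit * mask).sum() / float(mask.sum())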
def getKmaxIndices(self, weights, k): maxIndices = T.argsort(weights, axis=2)[:, :, -k:] maxIndicesSorted = T.sort(maxIndices, axis=2) ii = T.repeat(T.arange(self.batchsize), k) jj = maxIndicesSorted.flatten() return ii, jj
def _stepP(x_, h_, c_, lP_, dV_, xAux): preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')]) preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) + tparams[_p(prefix, 'b')]) if options.get('en_aux_inp', 0): preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')]) hL = [[]] * h_depth cL = [[]] * h_depth outp = [[]] * h_depth for di in xrange(h_depth): i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz)) f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz)) o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz)) cL[di] = tensor.tanh(sliceT(preact, 3, h_sz)) cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di] hL[di] = o * tensor.tanh(cL[di]) outp[di] = hL[di] if options.get('en_residual_conn', 1): if (di > 0): outp[di] += outp[di - 1] print "Connecting residual at %d" % (di) if di < (h_depth - 1): preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \ tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))]) c = tensor.concatenate(cL, axis=1) h = tensor.concatenate(hL, axis=1) if options.get('class_out_factoring', 0) == 1: if options.get('cls_diff_layer', 0) == 1: pC = tensor.dot(hL[-2], tparams['WdCls']) + tparams['bdCls'] else: pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls'] pCSft = tensor.nnet.softmax(pC) xCIdx = tensor.argmax(pCSft, axis=-1) #pW = tensor.dot(outp[-1],tparams['Wd'][:,xCIdx,:]) + tparams['bd'][:,xCIdx,:] #smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f') #pWSft = tensor.nnet.softmax(pW*smooth_factor) #lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0,xCIdx] + 1e-20) ######################################################### # pW is now of size (beam_size, n_classes, class_size) if options.get('cls_zmean', 0): pW = tensor.dot( (outp[-1] - tparams['WdCls'][:, xCIdx].T), tparams['Wd'].swapaxes(0, 1)) + tparams['bd'][0, :, :] else: pW = tensor.dot((outp[-1]), tparams['Wd'].swapaxes( 0, 1)) + tparams['bd'][0, :, :] #smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f') pWSft = tensor.nnet.softmax( pW.reshape([pW.shape[0] * pW.shape[1], pW.shape[2]])).reshape( [pW.shape[0], pW.shape[1] * pW.shape[2]]) ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo) lProb = tensor.log(pWSft[:,ixtoclsinfo_t[:,0]*tparams['Wd'].shape[2]+ixtoclsinfo_t[:,3]] + 1e-20) + \ tensor.log(pCSft[0,ixtoclsinfo_t[:,0]] + 1e-20) else: p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd'] smooth_factor = tensor.as_tensor_variable(numpy_floatX( options.get('softmax_smooth_factor', 1.0)), name='sm_f') p = tensor.nnet.softmax(p * smooth_factor) lProb = tensor.log(p + 1e-20) if per_word_logweight is not None: log_w = theano.shared( per_word_logweight) #, dtype= theano.config.floatX) lProb = log_w + lProb if beam_size > 1: def _FindB_best(lPLcl, lPprev, dVLcl): srtLcl = tensor.argsort(-lPLcl) srtLcl = srtLcl[:beam_size] deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.)) deltaVec = tensor.set_subtensor(deltaVec[0], lPprev) lProbBest = ifelse( tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec) xWIdxBest = ifelse( tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) return lProbBest, xWIdxBest rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences=[lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0]) xWIdxBest = rvalLcl[1] lProbBest = rvalLcl[0] xWIdxBest = xWIdxBest.flatten() lProb = lProbBest.flatten() # Now sort and find the best among these best 
extensions for the current beams srtIdx = tensor.argsort(-lProb) srtIdx = srtIdx[:beam_size] xCandIdx = srtIdx // beam_size # Floor division h = h.take(xCandIdx.flatten(), axis=0) c = c.take(xCandIdx.flatten(), axis=0) xWlogProb = lProb[srtIdx] xWIdx = xWIdxBest[srtIdx] if options.get('class_out_factoring', 0) == 1: clsoffset = tensor.as_tensor_variable(self.clsOffset) else: xCandIdx = tensor.as_tensor_variable([0]) lProb = lProb.flatten() xWIdx = tensor.argmax(lProb, keepdims=True) xWlogProb = lProb[xWIdx] + lP_ #if options.get('class_out_factoring',0) == 1: # clsoffset = tensor.as_tensor_variable(self.clsOffset) # xWIdx += clsoffset[xCIdx] h = h.take(xCandIdx.flatten(), axis=0) c = c.take(xCandIdx.flatten(), axis=0) if options.get('softmax_propogate', 0) == 0: xW = tparams['Wemb'][xWIdx.flatten()] else: xW = p.dot(tparams['Wemb']) doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx)) return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
# Minimal demo: argsort along the last axis of a 4D tensor.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor4()
yinds = T.argsort(x, axis=3)   # indices that would sort each row of the last axis
func = theano.function([x], yinds)

X = np.random.random((2, 2, 3, 4))
print X
print func(X)