def gibbs(self, sample, countStep, function_mode, h_lid_type = 0):
    """Run ``self.bm.gibbs`` along the leading axis of ``sample`` with state-dependent biases.

    For every scan step the visible and hidden biases are rebuilt from the
    current ``h_lid`` state (``vBiasbase + h_lid . W1^T`` and
    ``hBiasbase + h_lid . W2^T``), ``self.bm.gibbs`` is called with them, and
    a new ``h_lid`` is computed with a sigmoid of ``x . W + hBiasbase``.

    :param sample: symbolic input; a 2-d variable is scanned row by row,
        anything else is first transposed with axes ``(1, 0, 2)``
        (presumably batch x time x features -- TODO confirm with callers).
    :param countStep: forwarded unchanged to ``self.bm.gibbs``.
    :param function_mode: forwarded unchanged to ``self.bm.gibbs``.
    :param h_lid_type: ``0`` computes the next ``h_lid`` from the step input,
        any other value computes it from the Gibbs result of that step.
    :return: ``(res, hLids, updates, vBiases, hBiases)``; note that
        ``updates`` is the third element, not the last.
    """
    # Helpers that derive the next h_lid and the dynamic biases from the previous h_lid.
    # NOTE: h_lid_old is currently unused (the recurrent term is commented out).
    calc_h_lid = lambda h_lid_old, sample: T.nnet.sigmoid(T.dot(sample, self.W) + self.hBiasbase) #+ T.dot(h_lid_old, self.W2.T)
    calc_hBiases = lambda h_lid: self.hBiasbase + T.dot(h_lid, self.W2.T)
    calc_vBiases = lambda h_lid: self.vBiasbase + T.dot(h_lid, self.W1.T)
    # Scan over the leading axis of `sample`, threading h_lid through the steps.
    def gibbsSamplingForAllTime(sample, start_h_lid):
        def gibbsSamplingForOneStepTime(sample, h_lid):
            vBias = calc_vBiases(h_lid)
            hBias = calc_hBiases(h_lid)
            res, updates = self.bm.gibbs(sample, self.W, vBias, hBias, countStep, function_mode)
            #res = res[-1]
            if h_lid_type == 0:
                # next hidden state is driven by the step input
                return [res, calc_h_lid(h_lid, sample), vBias, hBias], updates
            else:
                # next hidden state is driven by the Gibbs result
                return [res, calc_h_lid(h_lid, res), vBias, hBias], updates
        # Only the second output (h_lid) is recurrent; the others are collected per step.
        [sample_res, hLids, vBiases, hBiases], updates = theano.scan(gibbsSamplingForOneStepTime, sequences=sample, outputs_info=[None, start_h_lid, None, None])
        return sample_res, hLids, vBiases, hBiases, updates
    # usual gibbs-sampling
    if len(sample.broadcastable) == 2:
        # A matrix: treated as a single object.
        res, hLids, vBiases, hBiases, updates = gibbsSamplingForAllTime([sample], self.h_lid_0)
        # Shift the hidden states by one step: prepend the initial state, drop the last one.
        hLids = T.concatenate([[self.h_lid_0], hLids[0:-1]])
        return res, hLids, updates, vBiases, hBiases
    else:
        new_dim = T.cast(sample.shape[0], 'int32');
        # Put the scanned axis first.
        my_sample = T.transpose(sample, (1, 0, 2))
        # Replicate the initial hidden state once per entry of the leading axis of `sample`.
        h_lids_start = T.reshape(T.repeat(self.h_lid_0, new_dim), (self.hidden, new_dim)).T
        res, hLids, vBiases, hBiases, updates = gibbsSamplingForAllTime(my_sample, h_lids_start)
        res = T.transpose(res, (1, 0, 2))
        # Same one-step shift as in the matrix case.
        hLids = T.concatenate([[h_lids_start], hLids[0:-1]])
        # Restore the original axis order for every returned tensor.
        hLids = T.transpose(hLids, (1, 0, 2))
        vBiases = T.transpose(vBiases, (1, 0, 2))
        hBiases = T.transpose(hBiases, (1, 0, 2))
        return res, hLids, updates, vBiases, hBiases
def sgru3(X, h, W, U, b, t):
    """Build the symbolic output of one "sgru3" step.

    Two element-wise products of an input term and a state term are
    projected through the transposes of ``U[2]`` and ``U[3]``, summed and
    passed through ``tanh``.

    NOTE(review): the ``t`` argument is overwritten with ``0`` before use,
    so only ``W[0..1]``, ``U[0..3]`` and ``b[0..3]`` are ever read.  Confirm
    with callers whether ignoring ``t`` is intended.
    """
    t = 0  # the caller-supplied value is discarded, exactly as before
    first = 2 * t

    # Branch 1: tanh-squashed input term times a linear state term.
    input_squashed = T.tanh(T.dot(X, W[first]) + b[first])
    state_linear = T.dot(h, U[first]) + b[first + 1]
    branch_one = T.dot(input_squashed * state_linear, T.transpose(U[first + 2]))

    # Branch 2: linear input term times a tanh-squashed state term.
    input_linear = T.dot(X, W[first + 1]) + b[first + 2]
    state_squashed = T.tanh(T.dot(h, U[first + 1]) + b[first + 3])
    branch_two = T.dot(input_linear * state_squashed, T.transpose(U[first + 3]))

    return T.tanh(branch_one + branch_two)
def full(self, X, Xs=None):
    """Return the dot-product matrix of the centred inputs.

    ``self._common`` supplies the centred ``X`` (``Xc``).  When ``Xs`` is
    given it is centred here by subtracting ``self.c``; otherwise ``Xc`` is
    paired with itself.
    """
    X, Xc, Xs = self._common(X, Xs)
    right = Xc if Xs is None else tt.sub(Xs, self.c)
    return tt.dot(Xc, tt.transpose(right))
def forward_batch_step(x_t, H_mask, H_tm1):
    """One recurrent step: new hidden state plus masked negative log2 outputs.

    ``W_rec``, ``W_in`` and ``W_out`` come from the enclosing scope.

    :param x_t: index (or indices) selecting columns of ``W_in``.
    :param H_mask: vector placed on a diagonal to scale the output columns.
    :param H_tm1: hidden state of the previous step.
    :return: ``[H_t, Y_t]`` -- the new hidden state and the masked
        ``-log2(softmax(...))`` matrix.
    """
    pre_activation = TT.dot(W_rec, H_tm1) + W_in[:, x_t]
    H_t = TT.nnet.sigmoid(pre_activation)

    # softmax works row-wise, hence the transpose before and after.
    probabilities = TT.nnet.softmax(TT.transpose(TT.dot(W_out, H_t)))
    neg_log2 = -TT.log2(probabilities)
    masked = TT.dot(TT.transpose(neg_log2), TT.diag(H_mask))
    return [H_t, masked]
def bbprop(self):
    """Build the second-order ("bbprop") terms for this layer.

    Stores ``p * (1 - p) / batch_size`` in ``self.lin_bbprop`` and fills
    ``self.dict_bbprop`` with the terms keyed by ``self.b_upmask`` and
    ``self.W_upmask``.

    :return: tuple of the term propagated through ``W * W`` and
        ``self.dict_bbprop``.
    """
    probs = self.p_y_given_x
    curvature = probs - probs * probs
    curvature /= T.shape(probs)[0]
    self.lin_bbprop = curvature

    squared_inputs = T.transpose(self.inp * self.inp)
    self.dict_bbprop = {
        self.b_upmask: T.sum(curvature, 0),
        self.W_upmask: T.dot(squared_inputs, curvature),
    }

    squared_weights = T.transpose(self.W * self.W)
    return T.dot(curvature, squared_weights), self.dict_bbprop
def T_l2_cost_conv(x,a,A,imshp,kshp,mask=True):
    """Half the summed squared error of a convolutional reconstruction.

    Layout notes carried over from the original docstring:
        xsz*ysz*nchannels, nimages = x.shape
        xsz*ysz*nfeat, nimages = a.shape
        xsz*ysz*nchannels, nfeat = A.shape

    :param imshp: (num images, channels, szy, szx)
    :param kshp: (features, channels, szy, szx)
    :param mask: when true, only the central region of the error
        (``kshp - 1`` pixels in from each border) contributes to the cost.
    """
    out_rows = imshp[2] - kshp[2] + 1
    out_cols = imshp[3] - kshp[3] + 1
    featshp = (imshp[0], kshp[0], out_rows, out_cols)  # num images, features, szy, szx

    image = T.reshape(T.transpose(x), imshp)
    kernel = T.reshape(T.transpose(A), kshp)
    features = T.reshape(T.transpose(a), featshp)

    # Swap the first two kernel axes and flip both spatial axes (correlation).
    kernel_rotated = T.transpose(kernel[:, :, ::-1, ::-1], axes=[1, 0, 2, 3])
    image_estimate = conv2d(features, kernel_rotated, border_mode='full')

    residual = image - image_estimate
    if mask:
        # Copy only the central window into an all-zero tensor.
        window = (
            slice(None),
            slice(None),
            slice(kshp[2] - 1, out_rows),
            slice(kshp[3] - 1, out_cols),
        )
        image_error = T.set_subtensor(T.zeros_like(residual)[window], residual[window])
    else:
        image_error = residual
    return .5 * T.sum(image_error ** 2)
def nin(X, param):
    """Apply two scanned meta-operations (``metaOp1`` then ``metaOp2``) to ``X``.

    ``param`` unpacks to ``(w1, w2, w3, b1, b2, b3)``.  Broadcast axes are
    inserted with ``dimshuffle``, ``metaOp1`` is scanned over every
    ``(i, j)`` pair of the first two axes of ``w1``, the result is regrouped,
    ``metaOp2`` is scanned over the first axis, and the two leading feature
    axes are finally merged.

    The shape comments below are the original author's example sizes and
    are not checked by the code.
    """
    w1, w2, w3, b1, b2, b3 = param
    X = X.dimshuffle(0, 1, 'x', 2, 3)  # (n,32,1,r,c)
    w1 = w1.dimshuffle(0, 1, 2, 'x', 3, 4)  # (64,32,16,1,3,3)
    w2 = w2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    w3 = w3.dimshuffle(0, 1, 2, 'x', 'x')  # (64,2,32,1,1)
    b1 = b1.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    b2 = b2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,1,1,1)
    b3 = b3.dimshuffle(0, 'x', 1, 'x', 'x')  # (64,1,2,1,1)
    # Build the flattened (i, j) index pairs: i is repeated, j is tiled,
    # so the scan visits every combination in row-major order.
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    indexi = T.repeat(indexi, w1.shape[1], axis=0)
    indexj = T.arange(w1.shape[1], dtype='int32')  # (0:32), per the w1 shape noted above
    indexj = T.tile(indexj, w1.shape[0])
    results, updates = scan(fn=metaOp1, sequences=[indexi, indexj], outputs_info=None, non_sequences=[X, w1, w2, b1, b2], strict=True)  # (64*32,n,1,r,c)
    # Drop the singleton axis and split the flattened pair axis back into two.
    metaShape1 = results.shape[-4], results.shape[-2], results.shape[-1]
    reshaped1 = results.reshape((w1.shape[0], w1.shape[1]) + metaShape1)  # (64,32,n,r,c)
    permuted1 = T.transpose(reshaped1, axes=(0, 2, 1, 3, 4))  # (64,n,32,r,c)
    # The second stage scans over the first axis only.
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    results, updates = scan(fn=metaOp2, sequences=[indexi], outputs_info=None, non_sequences=[permuted1, w3, b3], strict=True)  # (64,n,2,r,c)
    permuted2 = T.transpose(results, axes=(1, 0, 2, 3, 4))  # (n,64,2,r,c)
    # Merge the two feature axes into one.
    metaShape2 = permuted2.shape[-2], permuted2.shape[-1]
    reshaped2 = permuted2.reshape((permuted2.shape[0], -1) + metaShape2)  # (n,128,r,c)
    return reshaped2
def theano_kernel_derivative(imshp,kshp,featshp,stride=1):
    """Compile a function returning the kernel gradient of the reconstruction error.

    The compiled function takes ``(image, features, kernel)`` as 4-d tensors.
    The image is reconstructed with a 'full' convolution of the features with
    the rotated kernel; the negated convolution of the rotated error with the
    rotated features is then transposed back into kernel layout.

    :param imshp: image shape, 4-tuple.
    :param kshp: kernel shape, 4-tuple.
    :param featshp: feature shape, 4-tuple.
    :param stride: scales the logical spatial sizes of the features.
    """
    swap_leading = [1, 0, 2, 3]

    features = T.tensor4(dtype=theano.config.floatX)
    kernel = T.tensor4(dtype=theano.config.floatX)
    image = T.tensor4(dtype=theano.config.floatX)

    # Forward pass: swap the first two kernel axes and flip the spatial
    # axes (correlation), then reconstruct the image.
    kernel_rotated = T.transpose(kernel[:, :, ::-1, ::-1], axes=[1, 0, 2, 3])
    strided_rows = featshp[2] * stride
    strided_cols = featshp[3] * stride
    image_estimate = conv2d(
        features,
        kernel_rotated,
        border_mode='full',
        image_shape=featshp,
        filter_shape=(kshp[1], kshp[0], kshp[2], kshp[3]),
        imshp_logical=(featshp[1], strided_rows, strided_cols),
        kshp_logical=kshp[2:],
    )

    # Backward pass on axis-swapped tensors.
    image_error = image - image_estimate
    image_error_rot = T.transpose(image_error, swap_leading)[:, :, ::-1, ::-1]
    features_rot = T.transpose(features, swap_leading)
    kernel_grad_rot = -conv2d(
        image_error_rot,
        features_rot,
        image_shape=(imshp[1], imshp[0], imshp[2], imshp[3]),
        filter_shape=(featshp[1], featshp[0], featshp[2], featshp[3]),
        imshp_logical=(imshp[0], imshp[2], imshp[3]),
        kshp_logical=(strided_rows, strided_cols),
    )

    kernel_grad = T.transpose(kernel_grad_rot, swap_leading)
    return function(inputs=[image, features, kernel], outputs=kernel_grad)
def __init__(self, rng, input, n_feature_maps, n_in, n_out, b_size=5, read_file=False, W=None, b=None):
    """Apply one shared tanh projection to every feature map of ``input``.

    Original author's notes: the input should be
    ``batch_size x n_feature_maps x 504`` and ``n_in`` / ``n_out`` should be
    504 and 40 respectively.

    :param rng: random generator providing ``uniform``; used only when the
        weights are created here.
    :param b_size: batch size baked into the output reshape.
    :param read_file: when ``False`` fresh ``W`` / ``b`` are created,
        otherwise the supplied ``W`` and ``b`` are used as-is.
    """
    # Feature-map axis first, so that each map can be indexed directly.
    maps_first = T.transpose(input, (1, 0, 2))

    if read_file == False:  # noqa: E712 -- equality test kept exactly as before
        bound = np.sqrt(6./(n_in+n_out))
        W = theano.shared(
            value=np.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=theano.config.floatX,
            ),
            name='W',
            borrow=True,
        )
        b = theano.shared(
            value=np.zeros((n_out,), dtype=theano.config.floatX),
            name='b',
            borrow=True,
        )
    self.W = W
    self.b = b

    # Same projection for every feature map, stacked along axis 0.
    per_map = [
        T.tanh(T.dot(maps_first[i], self.W) + self.b)
        for i in range(n_feature_maps)
    ]
    stacked = T.reshape(
        T.concatenate(per_map, axis=0),
        (n_feature_maps, b_size, n_out),
    )

    self.params = [self.W, self.b]
    # Expose both tensors with the batch axis first again.
    self.input = T.transpose(maps_first, (1, 0, 2))
    self.output = T.transpose(stacked, (1, 0, 2))
def categorical_crossentropy_segm(prediction_proba, targets):
    """Class-weighted categorical cross-entropy for segmentation maps.

    The prediction is flattened from ``(batch, classes, rows, cols)`` to
    ``(batch*rows*cols, classes)``, the cross-entropy is computed per pixel,
    reshaped back to ``(batch, rows, cols)``, weighted per class and summed
    over the two spatial axes.

    Fix: ``T.set_subtensor`` returns a new variable rather than modifying
    its argument, so the weighted results must be re-bound.  Previously the
    return values were discarded and the class weights had no effect.

    NOTE(review): the weighting indexes the 3-d ``results`` with the
    non-zero positions of ``targets``, so it presumably expects ``targets``
    to be a ``(batch, rows, cols)`` label map -- confirm against callers.

    :param prediction_proba: 4-d class probabilities.
    :param targets: either one-hot with the same rank as the prediction,
        or integer labels.
    :return: per-sample weighted loss.
    """
    shape = T.shape(prediction_proba)
    pred_mod = T.reshape(
        T.transpose(prediction_proba, (0, 2, 3, 1)),
        (-1, shape[1]),
    )
    if prediction_proba.ndim == targets.ndim:
        targ_mod = T.reshape(
            T.transpose(targets, (0, 2, 3, 1)),
            (-1, shape[1]),
        )
    else:
        targ_mod = T.reshape(targets, (-1,))

    results = categorical_crossentropy(pred_mod, targ_mod)
    results = T.reshape(results, (shape[0], shape[2], shape[3]))

    # QUICK IMPLEMENTATION FOR TWO SPECIFIC CLASSES. NEEDS GENERALIZATION
    # Weights depending on class occurrence: (background, cars).
    weights = (1.02275, 44.9647)
    cars_indx = T.nonzero(targets)
    not_cars_indx = T.nonzero(T.eq(targets, 0))

    # The two index sets are disjoint, so the updates can be chained.
    results = T.set_subtensor(
        results[cars_indx],
        results[cars_indx] * float32(weights[1]),
    )
    results = T.set_subtensor(
        results[not_cars_indx],
        results[not_cars_indx] * float32(weights[0]),
    )
    return T.sum(results, axis=(1, 2))
def nn_param(params,input):
    """Push ``input`` through a stack of tied-weight sigmoid autoencoder layers.

    Each entry of ``params`` is read as ``layer[0][1]`` (weights),
    ``layer[1][1]`` (hidden bias) and ``layer[2][1]`` (output bias).  With a
    single layer the reconstruction is returned directly; otherwise every
    layer's reconstruction is plotted, fed into the next layer, and the
    last one is returned after the plot is shown.
    """
    from theano import tensor as T
    from matplotlib import pyplot as plt

    def _reconstruct(layer, data):
        # Encode with W, decode with W^T, and evaluate to a concrete value.
        weights = layer[0][1]
        hidden = T.nnet.sigmoid(T.dot(data, weights) + layer[1][1])
        decoded = T.nnet.sigmoid(T.dot(hidden, T.transpose(weights)) + layer[2][1])
        return decoded.eval()

    layers = len(params)
    if layers == 1:
        return _reconstruct(params[0], input)

    for lnum in range(layers):
        if lnum == 0:
            # Only the first curve carries a legend label.
            yval = _reconstruct(params[lnum], input)
            plt.plot(yval, label='%d' % lnum)
        else:
            yval = _reconstruct(params[lnum], yval)
            plt.plot(yval)
    plt.legend()
    plt.show()
    return yval
def __init():
    """Compile the affinity function and publish it as a module global.

    For a matrix of row vectors the compiled function returns a one-element
    list holding ``(cosine_similarity + 1) / 2`` for every pair of rows.
    It is stored under the globals key ``'__affinty_fun'``.
    """
    dataset = T.matrix("dataset", dtype=config.globalFloatType())

    # Pairwise inner products between rows.
    inner = T.dot(dataset, T.transpose(dataset))

    # Outer product of the inverse row norms: entry (i, j) is 1 / (|i| * |j|).
    row_norm = T.sqrt(T.sum(T.square(dataset), axis=1))
    inv_as_row = T.inv(row_norm).dimshuffle(['x', 0])
    inv_as_col = T.transpose(inv_as_row)
    inv_outer = T.dot(inv_as_col, inv_as_row)

    # Shift the cosine from [-1, 1] to [0, 1].
    affinity = (T.mul(inner, inv_outer) + 1) / 2

    globals()['__affinty_fun'] = theano.function(
        [dataset],
        [affinity],
        allow_input_downcast=True,
    )
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total):
    """Build the predictive mean and (co)variance at ``Xnew`` from inducing points ``Xu``.

    :param pred_noise: when true, ``sigma**2`` is added to the returned
        variance (or to the diagonal of the covariance).
    :param diag: when true a variance vector is returned, otherwise a
        stabilised full covariance matrix.
    :param cov_total: covariance callable used for the training-side terms.
    :param mean_total: mean callable used for the training residual.
    :return: ``(mu, var)`` if ``diag`` else ``(mu, cov)``.
    """
    noise_var = tt.square(sigma)

    Kuu = cov_total(Xu)
    Kuf = cov_total(Xu, X)
    chol_uu = cholesky(stabilize(Kuu))
    A = solve_lower(chol_uu, Kuf)
    q_diag = tt.sum(A * A, 0)

    # Per-point noise term; only FITC corrects it with the diagonal residual.
    if self.approx == "FITC":
        k_diag = cov_total(X, diag=True)
        lam = tt.clip(k_diag - q_diag, 0.0, np.inf) + noise_var
    else:  # VFE or DTC
        lam = tt.ones_like(q_diag) * noise_var

    A_scaled = A / lam
    chol_b = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_scaled, tt.transpose(A)))
    residual = y - mean_total(X)
    residual_scaled = residual / lam
    c = solve_lower(chol_b, tt.dot(A, residual_scaled))

    Kus = self.cov_func(Xu, Xnew)
    As = solve_lower(chol_uu, Kus)
    mu = self.mean_func(Xnew) + tt.dot(
        tt.transpose(As),
        solve_upper(tt.transpose(chol_b), c),
    )
    C = solve_lower(chol_b, As)

    if diag:
        Kss = self.cov_func(Xnew, diag=True)
        var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
        if pred_noise:
            var = var + noise_var
        return mu, var

    cov = (
        self.cov_func(Xnew)
        - tt.dot(tt.transpose(As), As)
        + tt.dot(tt.transpose(C), C)
    )
    if pred_noise:
        cov = cov + noise_var * tt.identity_like(cov)
    return mu, stabilize(cov)
def kmaxpooling_output(input):
    '''
    k-max pooling: keep the ``k`` largest values along axis 2 of a 4-d
    tensor, preserving their original order.

    1. argsort along the pooled axis,
    2. take the last ``k`` indices and sort them again to restore order.

    ``k`` is read from the enclosing scope.

    :return: tensor whose axis 2 has length ``k``
    '''
    # Move the pooled axis to the end for the duration of the computation.
    swapped = T.transpose(input, axes=(0, 1, 3, 2))
    n0 = swapped.shape[0]
    n1 = swapped.shape[1]
    n2 = swapped.shape[2]

    order = T.argsort(swapped, axis=3)
    top_k = order[:, :, :, -k:]
    # Sort the kept indices so that values keep their order within the sentence.
    top_k_in_order = T.sort(top_k)

    # top_k_in_order only indexes the last axis; build matching flat index
    # vectors for the three leading axes.
    idx0 = T.arange(0, n0).repeat(n1 * n2 * k)
    idx1 = T.arange(0, n1).repeat(k * n2).reshape((1, -1)).repeat(n0, axis=0).flatten()
    idx2 = T.arange(0, n2).repeat(k).reshape((1, -1)).repeat(n0 * n1, axis=0).flatten()
    idx3 = top_k_in_order.flatten()

    picked = swapped[idx0, idx1, idx2, idx3].reshape((n0, n1, n2, k))
    return T.transpose(picked, axes=(0, 1, 3, 2))
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp,kshp,featshp,stride=(1,1),small_value=.001):
    """Compose a "slow" amplitude shrinkage with a subspace-L1 shrinkage.

    Feature channels are treated as (even, odd) pairs whose joint amplitude
    is shrunk in two stages: first by a second-difference term along the
    leading axis (weight ``lam_slow / L``), then by an L1 threshold on the
    resulting amplitude (weight ``lam_sparse / L``).

    :param a: 2-d coefficients, transposed and reshaped to
        ``(imshp[0], kshp[0], featshp[2], featshp[3])``.
    :param stride: accepted for signature compatibility; not used here.
    :param small_value: added under the square root of the amplitude.
    :return: shrunk coefficients in the same 2-d layout as ``a``.
    """
    featshp = (imshp[0], kshp[0], featshp[2], featshp[3])  # num images, features, szy, szx
    features = T.reshape(T.transpose(a), featshp, ndim=4)
    even = features[:, ::2, :, :]
    odd = features[:, 1::2, :, :]
    amp = T.sqrt(even ** 2 + odd ** 2 + small_value)

    # --- slow shrinkage -------------------------------------------------
    # Negative second difference of the amplitude along the leading axis,
    # with one-sided first differences at the two ends.
    first_diff = amp[1:, :, :, :] - amp[:-1, :, :, :]
    second_diff = first_diff[1:, :, :, :] - first_diff[:-1, :, :, :]
    div = T.zeros_like(amp)
    div = T.set_subtensor(div[1:-1, :, :, :], -second_diff)
    div = T.set_subtensor(div[0, :, :, :], -first_diff[0, :, :, :])
    div = T.set_subtensor(div[-1, :, :, :], first_diff[-1, :, :, :])

    slow_factor = 1 - (lam_slow / L) * (div / amp)
    slow_factor = T.switch(T.gt(slow_factor, 0), slow_factor, 0)
    slow_even = slow_factor * features[:, ::2, :, :]
    slow_odd = slow_factor * features[:, 1::2, :, :]

    # --- subspace l1 shrinkage ------------------------------------------
    slow_amp = T.sqrt(slow_even ** 2 + slow_odd ** 2)
    l1_factor = 1. - (lam_sparse / L) / slow_amp
    l1_factor = T.switch(T.gt(l1_factor, 0.), l1_factor, 0.)

    prox = T.zeros_like(features)
    prox = T.set_subtensor(prox[:, ::2, :, :], l1_factor * slow_even)
    prox = T.set_subtensor(prox[:, 1::2, :, :], l1_factor * slow_odd)

    flat_shape = (featshp[0], featshp[1] * featshp[2] * featshp[3])
    return T.transpose(T.reshape(prox, flat_shape, ndim=2))
def _pooling_function(self, inputs, pool_size, strides, border_mode, dim_ordering):
    """Pooling hook that adds k-max pooling on top of the parent behaviour.

    A first pool size below ``-1`` selects k-max pooling with ``self.k``
    (the ``k`` largest values along axis 2, kept in their original order).
    Every other value is delegated to the parent class.
    """
    if not pool_size[0] < -1:
        return super(MaxPooling2DWrapper, self)._pooling_function(
            inputs, pool_size, strides, border_mode, dim_ordering)

    # k-max pooling: work with the pooled axis in last position.
    k = self.k
    swapped = T.transpose(inputs, axes=(0, 1, 3, 2))
    n0 = swapped.shape[0]
    n1 = swapped.shape[1]
    n2 = swapped.shape[2]

    ranking = T.argsort(swapped, axis=3)
    top_k = ranking[:, :, :, -k:]
    # Sort the kept indices so that values keep their order within the sentence.
    top_k_in_order = T.sort(top_k)

    # The sorted indices only address the last axis; generate matching
    # flat index vectors for the other three.
    idx0 = T.arange(0, n0).repeat(n1 * n2 * k)
    idx1 = T.arange(0, n1).repeat(k * n2).reshape((1, -1)).repeat(n0, axis=0).flatten()
    idx2 = T.arange(0, n2).repeat(k).reshape((1, -1)).repeat(n0 * n1, axis=0).flatten()
    idx3 = top_k_in_order.flatten()

    gathered = swapped[idx0, idx1, idx2, idx3].reshape((n0, n1, n2, k))
    return T.transpose(gathered, axes=(0, 1, 3, 2))
def T_l2_cost_conv_dA(x,a,A,imshp,kshp,featshp,stride=(1,1),mask=True):
    """Gradient of the convolutional L2 cost with respect to the basis ``A``.

    The image error and reshaped features come from
    ``helper_T_l2_cost_conv``.  For unit stride the gradient is the negated
    convolution of the rotated error with the rotated features; for any
    other stride ``MyConv_view`` is used.  In both cases the 4-d gradient is
    flattened to ``(kshp[0], kshp[1]*kshp[2]*kshp[3])`` and transposed.
    """
    image_error, kernel, features = helper_T_l2_cost_conv(
        x=x, a=a, A=A, imshp=imshp, kshp=kshp,
        featshp=featshp, stride=stride, mask=mask)

    if stride == (1,1):
        swap_leading = [1, 0, 2, 3]
        error_rot = T.transpose(image_error, swap_leading)[:, :, ::-1, ::-1]
        feats_rot = T.transpose(features, swap_leading)
        grad_rot = -1. * conv2d(
            error_rot,
            feats_rot,
            image_shape=(imshp[1], imshp[0], imshp[2], imshp[3]),
            filter_shape=(featshp[1], featshp[0], featshp[2], featshp[3]),
            imshp_logical=(imshp[0], imshp[2], imshp[3]),
            kshp_logical=(imshp[2] - kshp[2] + 1, imshp[3] - kshp[3] + 1),
        )
        kernel_grad = T.transpose(grad_rot, swap_leading)
    else:
        strided_conv = MyConv_view(strides=stride, kshp=kshp)
        kernel_grad = strided_conv(image_error, features)

    flat_shape = (kshp[0], kshp[1] * kshp[2] * kshp[3])
    return T.transpose(T.reshape(kernel_grad, flat_shape, ndim=2))
def get_output_for(self, input, **kwargs):
    """Apply ``self.transform`` to every sample of ``input`` with a scan.

    Per the original docstring this computes a 2D FFT and the input layer
    must have dimension ``[n, 2, nx, ny]``.

    When ``self.is_3d`` is set, ``self.data_shape`` is unpacked as
    ``(n, nc, nx, ny, nt)``: the last axis is folded into the leading one
    before the scan and unfolded again afterwards.
    """
    if not self.is_3d:
        transformed, _ = theano.scan(self.transform, sequences=input)
        return transformed

    _, nc, nx, ny, nt = self.data_shape
    # Bring the last axis next to the batch axis and merge the two.
    folded = T.transpose(input, axes=(0, 4, 1, 2, 3)).reshape((-1, nc, nx, ny))
    transformed, _ = theano.scan(self.transform, sequences=folded)
    # Split the merged axis again and restore the original axis order.
    unfolded = transformed.reshape((-1, nt, nc, nx, ny))
    return T.transpose(unfolded, axes=(0, 2, 3, 4, 1))
def full(self, X, Z=None):
    """Return the matrix of dot products between centred inputs.

    ``self._common`` provides ``Xc``, the centred ``X``.  Without ``Z`` the
    result is ``Xc . Xc^T``; with ``Z`` the right-hand factor is ``Z``
    centred by ``self.c``.
    """
    X, Xc, Z = self._common(X, Z)
    if Z is not None:
        return tt.dot(Xc, tt.transpose(tt.sub(Z, self.c)))
    return tt.dot(Xc, tt.transpose(Xc))
def kmeans(train_set_x):
    """Build the symbolic dictionary learned by spherical K-means.

    Steps: normalise every column of the data, ZCA-whiten it, draw ``K=500``
    unit-norm random centroids (fixed seed), then run 15 damped update
    iterations.

    Fix: the normalisation is now ``(x - mean) / sqrt(var + eps)``.  The
    original expression lacked the parentheses, so only the mean was divided
    by the standard deviation and the data itself was never rescaled.

    :param train_set_x: symbolic matrix with one sample per column; ``None``
        creates a fresh ``T.matrix``.
    :return: symbolic ``(dimension, K)`` matrix of unit-norm centroids.
    """
    if train_set_x is None:
        train_set_x = T.matrix('train_set_x')

    ########################
    # Normalize the inputs #
    ########################
    epsilon_norm = 10     # added to the variance to avoid division by zero
    epsilon_zca = 0.015   # small constant regularising the whitening
    K = 500               # number of centroids

    column_mean = T.mean(train_set_x, axis=0)
    column_std = T.sqrt(T.var(train_set_x, axis=0) + epsilon_norm)
    train_set_x = (train_set_x - column_mean) / column_std

    #####################
    # Whiten the inputs #
    #####################
    # ZCA whitening.  For contrast-normalised data epsilon_zca around 0.01
    # (16x16 patches) or 0.1 (8x8 patches) is a good starting point.
    cov = T.dot(train_set_x, T.transpose(train_set_x)) / train_set_x.shape[1]
    U, singular_values, _ = linalg.svd(cov)
    whitener = T.dot(U, T.diag(1 / T.sqrt(singular_values + epsilon_zca)))
    whitener = T.dot(whitener, T.transpose(U))
    whitened_x = T.dot(whitener, train_set_x)

    ######################
    # Training the Model #
    ######################
    dimension_size = whitened_x.shape[0]
    num_samples = whitened_x.shape[1]

    srng = RandomStreams(seed=234)
    D = srng.normal(size=(dimension_size, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    # typically 10 iterations is enough
    num_iteration = 15
    sample_idx = T.arange(num_samples)
    for _ in range(num_iteration):
        # Assign each sample to the centroid with the largest projection.
        dx = T.dot(D.T, whitened_x)
        arg_max_dx = T.argmax(dx, axis=0)
        best = dx[arg_max_dx, sample_idx]
        codes = T.zeros((K, num_samples))
        codes = T.set_subtensor(codes[arg_max_dx, sample_idx], best)
        # Damped update (the previous centroids are added back), then
        # re-normalise every centroid to unit length.
        D = T.dot(whitened_x, T.transpose(codes)) + D
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))
    return D
def train(self, n_epochs=100, mini_batch_size=1, learning_rate=0.1):
    """Train the tied-weight autoencoder with a masked hidden layer and plain SGD.

    Builds two reconstruction costs (with and without the mask returned by
    ``self.get_mask``), compiles a training function on ``self.X`` and a
    validation function on ``self.Y``, then loops over epochs and appends
    the per-epoch average costs to the module-level lists
    ``global_train_cost`` and ``global_valid_cost``.

    :param n_epochs: number of passes over the training data.
    :param mini_batch_size: number of rows per compiled-function call.
    :param learning_rate: SGD step size.
    """
    index = T.lscalar()
    x=T.matrix('x')
    # NOTE(review): is_dropout is created but never used below.
    is_dropout = T.dscalar('is_dropout')
    params = [self.W, self.b1, self.b2]
    hidden = self.activation_function(T.dot(x, self.W)+self.b1)
    # Mask applied to the hidden units (and, further down, to the gradients).
    arr_n = self.get_mask(self.b1,0.5)
    hidden_tilde = hidden
    hidden_tilde = arr_n * hidden
    # Reconstruction from the unmasked hidden layer (used for validation).
    output_without_drop = T.dot(hidden,T.transpose(self.W))+self.b2
    output_without_drop = self.output_function(output_without_drop)
    # Reconstruction from the masked hidden layer (used for training).
    output_dropout = T.dot(hidden_tilde,T.transpose(self.W))+self.b2
    output_dropout = self.output_function(output_dropout)
    #x_printed = theano.printing.Print('this is a very important value')(arr_n)
    # Use cross-entropy loss.
    L = -T.sum(x*T.log(output_dropout) + (1-x)*T.log(1-output_dropout), axis=1)
    cost=L.mean()
    L_without_drops = -T.sum(x*T.log(output_without_drop) + (1-x)*T.log(1-output_without_drop), axis=1)
    cost2 = L_without_drops.mean()
    updates=[]
    # Gradients with respect to W, b1, b2.
    gparams = T.grad(cost,params)
    # NOTE(review): gparams_shared is never read afterwards.
    gparams_shared = theano.shared(gparams,'gparams_shared')
    # Mask the gradients of W and b1 with the same mask as the hidden units.
    gparams[0] = gparams[0] * arr_n
    gparams[1] = gparams[1] * arr_n
    #x_printed1 = theano.printing.Print('this is a very important value')(gparams[0])
    #x_printed2 = theano.printing.Print('this is a very important value')(gparams[1])
    #gparams_shared = gparams_shared*mask
    # Create the list of (parameter, new value) pairs for the SGD step.
    for param, gparam in zip(params, gparams):
        updates.append((param, param-learning_rate*gparam))
    # Train given a mini-batch of the data.
    train = th.function(inputs=[index], outputs=[cost], updates=updates, givens={x:self.X[index:index+mini_batch_size,:]})
    # Validation uses the mask-free cost and applies no updates.
    valid = th.function(inputs=[index], outputs=[cost2], givens={x:self.Y[index:index+mini_batch_size,:]})
    import time
    start_time = time.clock()
    for epoch in xrange(n_epochs):
        print "Epoch:",epoch
        cost_train = 0
        cost_valid = 0
        for row in xrange(0,self.m, mini_batch_size):
            cost_train= cost_train + train(row)[0]
        for row in xrange(0,self.Y_m, mini_batch_size):
            cost_valid = cost_valid + valid(row)[0]
        # Record the per-epoch averages in the module-level history lists.
        global_valid_cost.append((cost_valid/self.Y_m))
        global_train_cost.append((cost_train/self.m))
    end_time = time.clock()
    print "Average time per epoch=", (end_time-start_time)/n_epochs
def weighted_binary_cross_entropy_4(pred, target, class_normalization):
    """Element-wise binary cross-entropy, normalised per row and per class.

    The positive term is scaled by ``class_normalization`` and divided by
    the number of "on" targets in its row (plus one); the negative term is
    divided by the number of "off" targets in its row (plus one).

    :return: matrix of losses with the same shape as ``pred``.
    """
    n_cols = pred.shape[1]
    n_rows = pred.shape[0]

    def _row_count_plus_one(matrix):
        # Row sums replicated across all columns, plus one to avoid dividing by zero.
        return T.transpose(T.tile(matrix.sum(axis=1), (n_cols, 1))) + 1

    on_count = _row_count_plus_one(target)
    off_count = _row_count_plus_one(1 - target)
    class_weight = T.tile(class_normalization, (n_rows, 1))

    positive_term = class_weight * target * T.log(pred) / on_count
    negative_term = (1.0 - target) * T.log(1.0 - pred) / off_count
    return -(positive_term + negative_term)
def Kmeans(X_train=None, K=300, epsilon_whitening=0.015):
    """Build the symbolic dictionary learned by spherical K-means.

    Steps: normalise every training sample (column), ZCA-whiten the data,
    draw ``K`` unit-norm random centroids (fixed seed), then run 30 update
    iterations.

    Fix: the normalisation is now ``(x - mean) / sqrt(var + eps)``.  The
    original expression lacked the parentheses, so only the mean was divided
    by the standard deviation and the data itself was never rescaled.

    :param X_train: symbolic matrix with one sample per column; ``None``
        creates a fresh ``T.matrix``.
    :param K: number of centroids.
    :param epsilon_whitening: constant added to the singular values before
        whitening.
    :return: symbolic ``(dimensions, K)`` matrix of unit-norm centroids.
    """
    if X_train is None:
        X_train = T.matrix("X_train")

    ########################
    # Normalize the inputs #
    ########################
    # A constant added to the variance to avoid division by zero
    epsilon_norm = 10

    # Subtract each sample's mean and divide by its (regularised) standard
    # deviation; every column of X_train is one sample.
    sample_mean = T.mean(X_train, axis=0)
    sample_std = T.sqrt(T.var(X_train, axis=0) + epsilon_norm)
    X_train = (X_train - sample_mean) / sample_std

    #####################
    # Whiten the inputs #
    #####################
    sigma = T.dot(X_train, T.transpose(X_train)) / X_train.shape[1]
    U, s, _ = linalg.svd(sigma, full_matrices=False)
    whitener = T.dot(U, T.diag(1 / T.sqrt(s + epsilon_whitening)))
    whitener = T.dot(whitener, T.transpose(U))
    X_Whitened = T.dot(whitener, X_train)

    ######################
    # Training the Model #
    ######################
    dimensions = X_Whitened.shape[0]
    samples = X_Whitened.shape[1]

    srng = RandomStreams(seed=234)

    # Centroids are sampled from a normal distribution and normalised to
    # unit length.  D \in R^{n \times k}
    D = srng.normal(size=(dimensions, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    iterations = 30
    sample_idx = T.arange(samples)
    for _ in range(iterations):
        # Fresh point representations for every pass: each sample keeps only
        # its projection on the best-matching centroid.
        projections = T.dot(D.T, X_Whitened)
        best_centroid = T.argmax(projections, axis=0)
        max_values = projections[best_centroid, sample_idx]
        S = T.zeros((K, samples))
        S = T.set_subtensor(S[best_centroid, sample_idx], max_values)

        # Recompute the centroids and renormalise them.
        D = T.dot(X_Whitened, T.transpose(S))
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))
    return D
def forward_prop_step(x_t, h_t_prev, c_t_prev):
    """One step of a stack-augmented LSTM.

    The input symbol is compared with ``self.PUSH`` and ``self.POP``; on a
    push the previous hidden state is written above the stack top, on a pop
    the hidden state fed to the gates is recomputed from the previous hidden
    state and the stack top.  ``self.stack`` and ``self.ptr_to_top`` are
    rebound to new symbolic expressions as a side effect.

    Weight matrices (``W_x_*``, ``W_h_*``, ``W_hy``, ...) come from the
    enclosing scope.

    :return: ``[o_t, h_t, c_t]`` -- output distribution, hidden state, cell state.
    """
    # Test values for Theano's compute_test_value debugging; (300, 1) is hard-coded.
    h_t_prev.tag.test_value = np.random.uniform(0,1, (300,1)).astype('float64')
    c_t_prev.tag.test_value = np.random.uniform(0,1, (300,1)).astype('float64')
    # Classify the input as push / pop / other by comparing argmax positions.
    argm_xt = T.argmax(x_t, axis=0)[0]
    argm_push = T.argmax(self.PUSH, axis=0)[0]
    argm_pop = T.argmax(self.POP, axis=0)[0]
    is_push = T.eq(argm_xt, argm_push)
    is_pop = T.eq(argm_xt, argm_pop)
    #candidate_to_push = W_h_push.dot(h_t_prev)
    candidate_to_push = h_t_prev
    # Both candidates are built from the stack state *before* it is rebound below.
    pushed_stack = T.set_subtensor(self.stack[:,:,self.ptr_to_top+1], candidate_to_push)
    top_of_stack = self.stack[:,:,self.ptr_to_top]
    candidate_to_pop = T.tanh( W_h_prev_pop.dot(h_t_prev) + W_h_stack_pop.dot(top_of_stack) )
    # The stack contents only change on a push; a pop just moves the pointer.
    self.stack = ifelse( is_push, pushed_stack, ifelse( is_pop, self.stack, self.stack ) )
    self.ptr_to_top = ifelse( is_push, self.ptr_to_top+1, ifelse( is_pop, self.ptr_to_top-1, self.ptr_to_top ) )
    # Hidden state fed to the gates: replaced only on a pop.
    h_prime = ifelse( is_push, h_t_prev, ifelse( is_pop, candidate_to_pop, h_t_prev ) )
    # LSTM gates: input, output, forget, candidate.
    i = T.nnet.hard_sigmoid( W_x_i.dot(x_t) + W_h_i.dot(h_prime) )
    o = T.nnet.hard_sigmoid( W_x_o.dot(x_t) + W_h_o.dot(h_prime) )
    f = T.nnet.hard_sigmoid( W_x_f.dot(x_t) + W_h_f.dot(h_prime) )
    g = T.tanh( W_x_g.dot(x_t) + W_h_g.dot(h_prime) )
    c_t = f*c_t_prev + i*g
    h_t = o*T.tanh(c_t)
    # softmax works row-wise, hence the transposes around it.
    o_t = T.transpose( T.nnet.softmax( T.transpose(W_hy.dot(h_t)) ) )
    #theano.printing.debugprint(o_t)
    return [o_t, h_t, c_t]
def forward_prop_step(x_t, h_t_prev, h_t_2_prev, c_t_2_prev, c_t_prev):
    """One step of a two-layer stack-augmented LSTM.

    Each layer owns a stack (``self.stack`` / ``self.stack_2``) and a stack
    pointer.  The input symbol decides push / pop / neither for both layers
    at once.  Layer 2 takes the hidden state of layer 1 as its input.  The
    stacks and pointers on ``self`` are rebound to new symbolic expressions
    as a side effect.

    Note the argument order: ``c_t_2_prev`` comes before ``c_t_prev``,
    while the returned list is ``[o_t, h_t, h_t_2, c_t, c_t_2]``.
    """
    # h_t_prev.tag.test_value = np.random.uniform(0,1, (self.hidden_dim,self.minibatch_size)).astype('float64')
    # c_t_prev.tag.test_value = np.random.uniform(0,1, (self.hidden_dim,self.minibatch_size)).astype('float64')
    # Map input to {push,pop,internal}
    argm_xt = T.argmax(x_t, axis=0)[0]
    argm_push = T.argmax(self.PUSH, axis=0)[0]
    argm_pop = T.argmax(self.POP, axis=0)[0]
    is_push = T.eq(argm_xt, argm_push)
    is_pop = T.eq(argm_xt, argm_pop)
    # Layer 1
    candidate_to_push = h_t_prev
    # Both candidates are built from the stack state *before* it is rebound below.
    pushed_stack = T.set_subtensor(self.stack[:,:,self.ptr_to_top+1], candidate_to_push)
    top_of_stack = self.stack[:,:,self.ptr_to_top]
    candidate_to_pop = T.tanh( self.W_h_prev_pop.dot(h_t_prev) + self.W_h_stack_pop.dot(top_of_stack) )
    # The stack contents only change on a push; a pop just moves the pointer.
    self.stack = ifelse(is_push, pushed_stack, ifelse( is_pop,self.stack,self.stack))
    self.ptr_to_top = ifelse(is_push, self.ptr_to_top+1, ifelse( is_pop, self.ptr_to_top-1, self.ptr_to_top))
    # Hidden state fed to the gates: replaced only on a pop.
    h_prime = ifelse(is_push,h_t_prev, ifelse( is_pop, candidate_to_pop, h_t_prev))
    # LSTM gates of layer 1: input, output, forget, candidate.
    i = T.nnet.hard_sigmoid( self.W_x_i.dot(x_t) + self.W_h_i.dot(h_prime) )
    o = T.nnet.hard_sigmoid( self.W_x_o.dot(x_t) + self.W_h_o.dot(h_prime) )
    f = T.nnet.hard_sigmoid( self.W_x_f.dot(x_t) + self.W_h_f.dot(h_prime) )
    g = T.tanh( self.W_x_g.dot(x_t) + self.W_h_g.dot(h_prime) )
    c_t = f*c_t_prev + i*g
    h_t = o*T.tanh(c_t)
    # Layer 2 (same structure, fed with h_t instead of x_t)
    candidate_to_push_2 = h_t_2_prev
    pushed_stack_2 = T.set_subtensor(self.stack_2[:,:,self.ptr_to_top_2+1], candidate_to_push_2)
    top_of_stack_2 = self.stack_2[:,:,self.ptr_to_top_2]
    candidate_to_pop_2 = T.tanh( self.W_h_prev_pop_2.dot(h_t_2_prev) + self.W_h_stack_pop_2.dot(top_of_stack_2) )
    self.stack_2 = ifelse(is_push, pushed_stack_2, ifelse(is_pop, self.stack_2, self.stack_2))
    self.ptr_to_top_2 = ifelse(is_push, self.ptr_to_top_2+1, ifelse(is_pop, self.ptr_to_top_2-1, self.ptr_to_top_2))
    h_prime_2 = ifelse(is_push, h_t_2_prev, ifelse(is_pop, candidate_to_pop_2, h_t_2_prev))
    i_2 = T.nnet.hard_sigmoid( self.W_x_i_2.dot(h_t) + self.W_h_i_2.dot(h_prime_2) )
    o_2 = T.nnet.hard_sigmoid( self.W_x_o_2.dot(h_t) + self.W_h_o_2.dot(h_prime_2) )
    f_2 = T.nnet.hard_sigmoid( self.W_x_f_2.dot(h_t) + self.W_h_f_2.dot(h_prime_2) )
    g_2 = T.tanh( self.W_x_g_2.dot(h_t) + self.W_h_g_2.dot(h_prime_2) )
    c_t_2 = f_2*c_t_2_prev + i_2*g_2
    h_t_2 = o_2*T.tanh(c_t_2)
    # Output: softmax works row-wise, hence the transposes around it.
    o_t = T.transpose( T.nnet.softmax( T.transpose(self.W_hy.dot(h_t_2)) ) )
    return [o_t, h_t, h_t_2, c_t, c_t_2]
def _build_conditional(self, Xnew, X, f, cov_total, mean_total):
    """Build the conditional mean and covariance at ``Xnew`` given values ``f`` at ``X``.

    :param cov_total: covariance callable evaluated on the training inputs.
    :param mean_total: mean callable evaluated on the training inputs.
    :return: ``(mu, cov)`` as symbolic expressions.
    """
    train_cov = cov_total(X)
    cross_cov = self.cov_func(X, Xnew)

    # Solve against the Cholesky factor instead of inverting the covariance.
    chol = cholesky(stabilize(train_cov))
    whitened_cross = solve_lower(chol, cross_cov)
    whitened_resid = solve_lower(chol, f - mean_total(X))

    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(whitened_cross), whitened_resid)
    new_cov = self.cov_func(Xnew)
    cov = new_cov - tt.dot(tt.transpose(whitened_cross), whitened_cross)
    return mu, cov
def square_dist(self, X, Xs):
    """Return the matrix of squared Euclidean distances between rows.

    Uses the expansion ``|x - z|^2 = |x|^2 - 2 x.z + |z|^2`` and clips small
    negative round-off values to zero.

    Fix: when ``Xs`` is given, the row term must be the squared norms of
    ``X`` (a column) and only the column term those of ``Xs`` (a row).  The
    original added the ``Xs`` norms twice, which gives wrong distances and
    fails to broadcast whenever ``X`` and ``Xs`` have different row counts.

    :param X: symbolic matrix whose rows are points.
    :param Xs: second set of points, or ``None`` to compare ``X`` with itself.
    :return: symbolic ``(rows of X, rows of Xs)`` matrix of non-negative
        squared distances.
    """
    X2 = tt.sum(tt.square(X), 1)
    if Xs is None:
        cross = tt.dot(X, tt.transpose(X))
        other_sq = X2
    else:
        cross = tt.dot(X, tt.transpose(Xs))
        other_sq = tt.sum(tt.square(Xs), 1)
    sqd = -2.0 * cross + (
        tt.reshape(X2, (-1, 1)) + tt.reshape(other_sq, (1, -1))
    )
    return tt.clip(sqd, 0.0, np.inf)
def __init__(self, input, iNeuronNum, oNeuronNum, activateType, train):
    """Create a fully connected layer with randomly initialised parameters.

    The weight matrix has shape ``(oNeuronNum, iNeuronNum)`` and is drawn
    from a standard normal scaled by ``1 / sqrt(iNeuronNum)``; the bias is
    drawn from a standard normal.  ``self._output`` holds the affine map of
    the input; no activation is applied here (``activateType`` is only
    stored).
    """
    # Draw the weights before the bias so the random stream is consumed in a fixed order.
    weight_init = np.random.randn(oNeuronNum, iNeuronNum) / (iNeuronNum ** 0.5)
    bias_init = np.random.randn(oNeuronNum)

    self._input = input
    self._weight = theano.shared(weight_init)
    self._bias = theano.shared(bias_init)
    self._activateType = activateType

    # Compute W . x^T + b with the bias broadcast over columns, then
    # transpose the result back.
    projected = T.dot(self._weight, T.transpose(self._input))
    self._output = T.transpose(projected + self._bias.dimshuffle(0, "x"))

    self._parameter = [self._weight, self._bias]
    self._iNeuronNum = iNeuronNum
    self._oNeuronNum = oNeuronNum
    self._train = train
def softmax_segm(x):
    '''
    Softmax over axis 1 of a 4-d tensor.

    The tensor is moved to channels-last, flattened to 2-d so that the
    row-wise ``softmax`` can be applied, and then restored to its original
    layout.
    '''
    dims = T.shape(x)
    n_batch, n_classes, n_rows, n_cols = dims[0], dims[1], dims[2], dims[3]

    channels_last = T.transpose(x, (0, 2, 3, 1))
    flat = T.reshape(channels_last, (-1, n_classes))
    probs = T.reshape(softmax(flat), (n_batch, n_rows, n_cols, n_classes))
    return T.transpose(probs, (0, 3, 1, 2))
def my_siamese_loss(y_true, y_pred):
    """Contrastive-style loss over interleaved pairs in a batch.

    Rows ``0, 2, 4, ...`` and ``1, 3, 5, ...`` of ``y_pred`` form the two
    members of each pair; the label of a pair is taken from the
    even-indexed row of ``y_true``.  ``margin`` is read from the enclosing
    module.
    """
    even_embed = y_pred[0::2]
    odd_embed = y_pred[1::2]
    pair_label = y_true[0::2]

    # Squared Euclidean distance between the two members of each pair.
    sq_dist = T.sum(T.square(even_embed - odd_embed), axis=1)

    labelled_term = T.transpose(pair_label) * sq_dist
    margin_term = T.transpose(1 - pair_label) * T.maximum(margin - sq_dist, 0)
    return T.mean(labelled_term + margin_term)
#=================================================================================== #==========================Theano Function definitions============================== #=================================================================================== ATemp = T.matrix('ATemp') BTemp = T.tensor3('BTemp') UTemp = T.matrix('UTemp') E1Temp = T.vector('E1Temp') E2Temp = T.vector('E2Temp') E1E2Temp = T.vector('E1E2Temp') ECTemp = T.vector('ECTemp') E1ECTemp = T.vector('E1ECTemp') #Calculate scoring function temp1 = E1Temp.dot(BTemp).dot(T.transpose(E2Temp)) temp2 = ATemp.dot(E1E2Temp) temp3 = temp1 + temp2 temp4 = T.tanh(temp3) score = UTemp.dot(temp4) scoringFunction = theano.function( [ATemp, BTemp, UTemp, E1Temp, E2Temp, E1E2Temp], score) #======================================================================= #Function Name : loadEntityVectors #Input : relation name #Output : return NN params (A,B,U) for input relation #Functionality : Returns NN parameters for specific relation # Function reads parameters from text files dumped while training #=======================================================================
def transpose(x):
    """Return the transpose of ``x``.

    Thin backend wrapper: delegates to ``T.transpose`` with its default
    axis order.
    """
    transposed = T.transpose(x)
    return transposed
def __init__(self, cooccurrence, z_k, opt, initializer, pz_weight_regularizer=None, pz_regularizer=None, initial_pz=None, initial_b=None, eps=1e-8):
    """Build the co-occurrence bucket model and compile its Theano functions.

    The co-occurrence matrix is normalised to sum to one and baked into the
    graph as a constant.  Three parameter tensors are created
    (``pz_weight``, ``w``, ``b``) and three functions are compiled:
    ``val_fun``, ``encodings_fun`` and ``train_fun``.

    :param cooccurrence: square numpy array of co-occurrence counts.
    :param z_k: number of buckets.
    :param opt: optimizer exposing ``get_updates`` and ``weights``.
    :param initializer: callable mapping a shape to initial values.
    :param pz_weight_regularizer: optional callable applied to ``pz_weight``.
    :param pz_regularizer: optional callable applied to ``softmax(pz_weight)``.
    :param initial_pz: optional initial value for ``pz_weight``.
    :param initial_b: optional initial value for ``b``.
    :param eps: small constant guarding logs and divisions.
    """
    cooccurrence = cooccurrence.astype(np.float32)
    x_k = cooccurrence.shape[0]

    self.cooccurrence = cooccurrence
    self.z_k = z_k
    self.opt = opt
    self.x_k = x_k
    self.pz_weight_regularizer = pz_weight_regularizer
    self.pz_regularizer = pz_regularizer

    # --- co-occurrence statistics (plain numpy) -------------------------
    total = np.sum(cooccurrence, axis=None)
    joint = cooccurrence / total
    co = T.constant(joint, name="co")  # (x_k, x_k)
    marginal = np.sum(joint, axis=1, keepdims=True)
    co_m = T.constant(marginal, name="co_m")  # (x_k, 1)
    conditional = joint / (eps + marginal)
    row_entropy = np.sum(joint * -np.log(eps + conditional), axis=1, keepdims=True)  # (x_k, 1)
    print("COh: {}".format(np.sum(row_entropy)))
    co_h = T.constant(row_entropy, name="co_h")

    # --- parameters -----------------------------------------------------
    # P(z|x); the initializer is called in the order pz, w, b.
    if initial_pz is None:
        initial_pz = initializer((x_k, z_k))
    pz_weight = K.variable(initial_pz, name="pz_weight")  # (x_k, z_k)
    w = K.variable(initializer((z_k, x_k)), name="w")
    if initial_b is None:
        initial_b = initializer((x_k,))
    b = K.variable(initial_b, name="b")
    params = [pz_weight, w, b]

    # --- training loss --------------------------------------------------
    p_z = softmax_nd(pz_weight)  # (x_k, z_k)
    bucket_probs = softmax_nd(w + b)  # (z_k, x_k)
    bucket_nll = -T.log(eps + bucket_probs)  # (z_k, x_k)
    loss_parts = T.dot(co, T.transpose(bucket_nll, (1, 0)))  # (x_k, z_k)
    nll = T.sum(p_z * loss_parts)

    # --- validation loss with hard assignments --------------------------
    enc = T.argmax(pz_weight, axis=1)
    one_hot = tensor_one_hot(enc, k=z_k)  # (x_k, z_k)
    p_b = T.dot(T.transpose(one_hot, (1, 0)), co)  # (z_k, x_k)
    bucket_marginal = T.sum(p_b, axis=1, keepdims=True)  # (z_k, 1)
    bucket_cond = p_b / (bucket_marginal + eps)  # (z_k, x_k)
    val_nll = T.sum(p_b * -T.log(eps + bucket_cond), axis=None)  # scalar

    # Number of buckets that received at least one assignment.
    utilization = T.sum(T.gt(T.sum(one_hot, axis=0), 0), axis=0)  # scalar

    # --- regularisation -------------------------------------------------
    reg_loss = T.constant(0.)
    self.regularize = False
    for regularizer, operand in (
        (pz_weight_regularizer, pz_weight),
        (pz_regularizer, p_z),
    ):
        if regularizer:
            reg_loss = reg_loss + regularizer(operand)
            self.regularize = True
    total_loss = nll + reg_loss

    # --- compiled functions ---------------------------------------------
    self.val_fun = theano.function([], [nll, reg_loss, total_loss, val_nll, utilization])
    self.encodings_fun = theano.function([], enc)
    updates = opt.get_updates(params=params, loss=total_loss)
    self.train_fun = theano.function([], [nll, reg_loss, total_loss], updates=updates)
    self.weights = params + opt.weights
def __init__(self, context, V, K, num_sub_tags, feature_matrix_values, context_sz, rng):
    """
    Initialize the parameters of the language model and build its outputs.

    Shared variables created: ``R`` and ``Q`` (V x K word embeddings, first
    two columns zeroed), ``C`` (context_sz x K x K), ``Tag``
    (num_sub_tags x K), ``b`` (V) and ``feature_matrix``.

    Symbolic outputs: ``p_w_given_h`` (softmax over target words) and
    ``p_t_given_w`` (softmax of the tag scores).
    """
    float_type = theano.config.floatX

    def _as_shared(values, name):
        return theano.shared(value=values, name=name, borrow=True)

    def _embedding_values():
        # Uniform initialisation with the first two columns forced to zero.
        values = np.asarray(rng.uniform(-0.01, 0.01, size=(V, K)), dtype=float_type)
        values[:, 0:2] = np.zeros((V, 2))
        return values

    # training contexts
    self.context = context

    # TODO: parameterize initialization
    # The random draws below happen in the order R, Q, C, Tag, b.
    self.R = _as_shared(_embedding_values(), 'R')  # context word embeddings
    self.Q = _as_shared(_embedding_values(), 'Q')  # target word embeddings

    noise_std = math.sqrt(0.1)
    self.C = _as_shared(
        np.asarray(rng.normal(0, noise_std, size=(context_sz, K, K)), dtype=float_type),
        'C',
    )
    # NOTE(review): rng.normal(-0.01, 0.01) means mean -0.01 and std 0.01;
    # confirm this was not meant to be a uniform range like R and Q.
    self.Tag = _as_shared(
        np.asarray(rng.normal(-0.01, 0.01, size=(num_sub_tags, K)), dtype=float_type),
        'Tag',
    )
    self.b = _as_shared(
        np.asarray(rng.normal(0, noise_std, size=(V,)), dtype=float_type),
        'b',
    )

    # context word representations
    self.r_w = self.R[context]
    # predicted word representation for target word
    self.q_hat = T.tensordot(self.C, self.r_w, axes=[[0, 1], [1, 2]])
    # similarity score between predicted word and all target words
    word_scores = T.dot(self.Q, self.q_hat) + T.reshape(self.b, (V, 1))
    self.s = T.transpose(word_scores)
    # softmax activation function
    self.p_w_given_h = T.nnet.softmax(self.s)

    self.feature_matrix = theano.shared(
        value=feature_matrix_values, name="feature_matrix", borrow=True)

    # Tag scores (shapes as noted by the original author):
    #   feature_matrix : Tag Size x Sub Tag Size
    #   Tag            : Sub Tag Size x K
    #   Q.T            : K x V
    #   s_tag          : Tag Size x V
    tag_embeddings = T.dot(self.feature_matrix, self.Tag)
    self.s_tag = T.dot(tag_embeddings, T.transpose(self.Q))
    # softmax activation function tag given word distribution
    self.p_t_given_w = T.nnet.softmax(self.s_tag)

    # parameters of the model
    self.params = [self.R, self.Q, self.C, self.b, self.Tag]
def __init__(self, n_in, n_hidden, x=T.tensor3("x"), xc=T.tensor3("xc"), mask=T.matrix("mask"), maskc=T.matrix("maskx"), prefix=""):
    """Run one shared recurrent cell over two masked batches and subtract the results.

    Args:
        n_in: input feature size passed to ``init_weight`` for the
            input-to-hidden weights.
        n_hidden: hidden size of the recurrent cell.
        x, xc: symbolic 3-D inputs; a fresh ``T.tensor3`` is created when
            ``None`` is passed.
        mask, maskc: symbolic 2-D masks; a fresh ``T.matrix`` is created
            when ``None`` is passed.
        prefix: string prepended to the parameter names.

    Attributes set:
        params: the nine weights/biases handed to ``self.recurrent_fn``.
        all_hiddenx / all_hiddenc: scan outputs with the first two axes
            swapped back.
        nn_outx / nn_outc: last scan step for each input.
        nn_out: ``nn_outx - nn_outc``.

    Fix: the initial hidden states are now sized from ``self.x`` /
    ``self.xc``.  They used to read the raw arguments, so passing
    ``x=None`` or ``xc=None`` failed with an AttributeError even though
    the fallback variables had just been created.
    """
    self.params = []

    # NOTE(review): the defaults in the signature are built once at
    # definition time, so instances relying on them share the same
    # symbolic variables (and the default maskc is named "maskx").
    if x is not None:
        self.x = x
    else:
        self.x = T.tensor3("x")
    if xc is not None:
        self.xc = xc
    else:
        self.xc = T.tensor3("xc")
    if mask is not None:
        self.mask = mask
    else:
        self.mask = T.matrix("mask")
    if maskc is not None:
        self.maskc = maskc
    else:
        self.maskc = T.matrix("maskc")

    #### swap the first two axes so that scan iterates over axis 1 ###
    nmask = T.transpose(self.mask, axes=(1, 0))
    nx = T.transpose(self.x, axes=(1, 0, 2))
    nmaskc = T.transpose(self.maskc, axes=(1, 0))
    nxc = T.transpose(self.xc, axes=(1, 0, 2))

    # Input-to-hidden weights and biases (parameter names keep the
    # historical "lstm_f/i/c" prefixes).
    wz_x, bz = init_weight(n_in, n_hidden, pre="%s_lstm_f_x_" % prefix)
    self.params += [wz_x, bz]

    wr_x, br = init_weight(n_in, n_hidden, pre="%s_lstm_i_x_" % prefix)
    self.params += [wr_x, br]

    wc_x, bc = init_weight(n_in, n_hidden, pre="%s_lstm_c_x_" % prefix)
    self.params += [wc_x, bc]

    # Hidden-to-hidden weights; the bias returned with them is discarded.
    wz_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_f_h_" % prefix)
    self.params += [wz_h]

    wr_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_i_h_" % prefix)
    self.params += [wr_h]

    wc_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_c_h_" % prefix)
    self.params += [wc_h]

    # Zero initial states, one row per entry along axis 0 of each input.
    h_t_0 = T.alloc(0., self.x.shape[0], n_hidden)
    h_t_0_c = T.alloc(0., self.xc.shape[0], n_hidden)

    shared_weights = [wz_x, wz_h, bz, wr_x, wr_h, br, wc_x, wc_h, bc]

    h, r = theano.scan(
        self.recurrent_fn,
        sequences=[nx, nmask],
        outputs_info=[h_t_0],
        non_sequences=shared_weights)

    hc, rc = theano.scan(
        self.recurrent_fn,
        sequences=[nxc, nmaskc],
        outputs_info=[h_t_0_c],
        non_sequences=shared_weights)

    self.all_hiddenx = T.transpose(h, axes=(1, 0, 2))
    self.nn_outx = h[-1]

    self.all_hiddenc = T.transpose(hc, axes=(1, 0, 2))
    self.nn_outc = hc[-1]

    self.nn_out = h[-1] - hc[-1]
def __init__(self, n_hidden, embedding_dimention=50):
    """Build the ZP/NP attention network with one extra "hop" and compile its functions.

    Args:
        n_hidden: hidden size handed to the GRU / RNN sub-networks.
        embedding_dimention: input size handed to the sub-networks.

    Compiled functions stored on ``self``: get_zp_out, get_np_head, get_np,
    get_np_out, get_attention, get_new_zp, get_attention_hop_1, get_out and
    train_step.  ``self.params`` collects every trainable parameter in the
    order the sub-networks are created.
    """
    ## n_in: input dimension of the sequence LSTM
    ## n_hidden: hidden-layer dimension of the LSTMs for the candidates and the zp
    ## n_hidden_sequence: hidden dimension of the sequence LSTM; because it is
    ##   dotted with the combined zp representation it must be twice n_hidden,
    ##   i.e. n_hidden_sequence = 2 * n_hidden

    self.params = []

    self.zp_x_pre = T.matrix("zp_x_pre")
    self.zp_x_post = T.matrix("zp_x_post")

    #self.zp_x_pre_dropout = _dropout_from_layer(self.zp_x_pre)
    #self.zp_x_post_dropout = _dropout_from_layer(self.zp_x_post)

    zp_nn_pre = GRU(embedding_dimention, n_hidden, self.zp_x_pre)
    #zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre_dropout)
    self.params += zp_nn_pre.params

    zp_nn_post = GRU(embedding_dimention, n_hidden, self.zp_x_post)
    #zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post_dropout)
    self.params += zp_nn_post.params

    # ZP representation: both GRU outputs concatenated, then one Layer.
    # NOTE(review): ZP_layer.params is never added to self.params here --
    # confirm whether that layer is meant to be trained.
    self.zp_out = T.concatenate((zp_nn_pre.nn_out, zp_nn_post.nn_out))

    self.ZP_layer = Layer(n_hidden * 2, n_hidden * 2, self.zp_out, ReLU)

    self.zp_out_output = self.ZP_layer.output

    #self.zp_out_dropout = _dropout_from_layer(T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out)))

    self.get_zp_out = theano.function(
        inputs=[self.zp_x_pre, self.zp_x_post],
        outputs=[self.ZP_layer.output])

    ### get sequence output for NP ###
    # The three tensors (and the three masks) are distinct variables that
    # merely share the same debug name.
    self.np_x = T.tensor3("np_x")
    self.np_x_post = T.tensor3("np_x")
    self.np_x_pre = T.tensor3("np_x")

    #self.np_x_dropout = _dropout_from_layer(self.np_x)

    self.mask = T.matrix("mask")
    self.mask_pre = T.matrix("mask")
    self.mask_post = T.matrix("mask")

    self.np_nn_x = RNN_batch(embedding_dimention, n_hidden, self.np_x, self.mask)
    self.params += self.np_nn_x.params

    self.np_nn_pre = GRU_batch(embedding_dimention, n_hidden, self.np_x_pre, self.mask_pre)
    self.params += self.np_nn_pre.params

    self.np_nn_post = GRU_batch(embedding_dimention, n_hidden, self.np_x_post, self.mask_post)
    self.params += self.np_nn_post.params

    #self.np_nn_out = LSTM_batch(embedding_dimention,n_hidden*2,self.np_x,self.mask)
    #self.np_nn_out = LSTM_batch(embedding_dimention,n_hidden*2,self.np_x_dropout,self.mask)
    #self.params += self.np_nn_out.params
    #self.np_out = self.np_nn.nn_out

    # NP representation: mean over axis 1 of the RNN states, concatenated
    # with the post/pre GRU outputs, then one Layer.
    # NOTE(review): NP_layer.params is not added to self.params either.
    self.np_nn_x_output = (self.np_nn_x.all_hidden).mean(axis=1)
    self.np_nn_post_output = self.np_nn_post.nn_out
    self.np_nn_pre_output = self.np_nn_pre.nn_out

    self.np_out = T.concatenate(
        (self.np_nn_x_output, self.np_nn_post_output, self.np_nn_pre_output), axis=1)

    self.NP_layer = Layer(n_hidden * 3, n_hidden * 2, self.np_out, ReLU)
    self.np_out_output = self.NP_layer.output

    # Last slice of np_x along its second axis.
    self.np_x_head = T.transpose(self.np_x, axes=(1, 0, 2))[-1]

    self.get_np_head = theano.function(inputs=[self.np_x], outputs=[self.np_x_head])

    self.get_np = theano.function(inputs=[
        self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre,
        self.mask_post
    ], outputs=[self.np_out])

    self.get_np_out = theano.function(inputs=[
        self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre,
        self.mask_post
    ], outputs=[self.np_out_output])

    # Attention weights.  The name prefixes do not match the variables
    # (w_attention_zp is created as "attention_hidden", w_attention_np as
    # "attention_zp"); they are only labels passed to init_weight.
    w_attention_zp, b_attention = init_weight(n_hidden * 2, 1, pre="attention_hidden", ones=False)
    self.params += [w_attention_zp, b_attention]

    w_attention_np, b_u = init_weight(n_hidden * 2, 1, pre="attention_zp", ones=False)
    self.params += [w_attention_np]

    self.calcu_attention = tanh(
        T.dot(self.np_out_output, w_attention_np) +
        T.dot(self.zp_out_output, w_attention_zp) + b_attention)

    self.attention = softmax(T.transpose(self.calcu_attention, axes=(1, 0)))[0]

    self.get_attention = theano.function(inputs=[
        self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
        self.np_x_post, self.mask, self.mask_pre, self.mask_post
    ], outputs=[self.attention])

    # Attention-weighted sum of the NP head slices.
    new_zp = T.sum(self.attention[:, None] * self.np_x_head, axis=0)

    self.get_new_zp = theano.function(inputs=[
        self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
        self.np_x_post, self.mask, self.mask_pre, self.mask_post
    ], outputs=[new_zp])

    #### *** HOP *** ####
    self.w_hop_zp, self.b_hop_zp = init_weight(n_hidden * 2 + embedding_dimention, n_hidden * 2, pre="hop_")
    self.params += [self.w_hop_zp, self.b_hop_zp]

    ## hop 1 ##
    # Re-encode the ZP from the raw GRU outputs plus new_zp, then recompute
    # the attention with the same attention weights.
    self.zp_hop_1_init = T.concatenate(
        (zp_nn_pre.nn_out, zp_nn_post.nn_out, new_zp))
    self.zp_hop_1 = ReLU(
        T.dot(self.zp_hop_1_init, self.w_hop_zp) + self.b_hop_zp)

    self.calcu_attention_hop_1 = tanh(
        T.dot(self.np_out_output, w_attention_np) +
        T.dot(self.zp_hop_1, w_attention_zp) + b_attention)
    self.attention_hop_1 = softmax(
        T.transpose(self.calcu_attention_hop_1, axes=(1, 0)))[0]

    self.get_attention_hop_1 = theano.function(
        inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
            self.np_x_post, self.mask, self.mask_pre, self.mask_post
        ],
        outputs=[self.attention_hop_1])

    self.out = self.attention_hop_1

    self.get_out = theano.function(inputs=[
        self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
        self.np_x_post, self.mask, self.mask_pre, self.mask_post
    ], outputs=[self.out])

    # NOTE(review): the next four locals are never used in the cost below,
    # and the names are swapped: "l1_norm_squared" sums squares while
    # "l2_norm_squared" sums absolute values.
    l1_norm_squared = sum([(w**2).sum() for w in self.params])
    l2_norm_squared = sum([(abs(w)).sum() for w in self.params])

    lmbda_l1 = 0.0
    #lmbda_l2 = 0.001
    lmbda_l2 = 0.0

    t = T.bvector()
    # Negative log of the attention mass selected by t.
    cost = -(T.log((self.out * t).sum()))
    #cost = -(T.log((self.out_dropout*t).sum()))
    #cost = 1-((self.out*t).sum())

    lr = T.scalar()
    #grads = T.grad(cost, self.params)
    #updates = [(param, param-lr*grad)
    #    for param, grad in zip(self.params, grads)]

    #updates = lasagne.updates.sgd(cost, self.params, lr)
    # lr stays in the input list of train_step but is not passed to
    # adadelta; on_unused_input='warn' keeps compilation from failing.
    updates = lasagne.updates.adadelta(cost, self.params)

    self.train_step = theano.function(inputs=[
        self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
        self.np_x_post, self.mask, self.mask_pre, self.mask_post, t, lr
    ], outputs=[cost], on_unused_input='warn', updates=updates)
def __init__(self, n_hidden, embedding_dimention=50, feature_dimention=61):
    """Build the ZP/NP attention network with an extra feature input and compile it.

    Args:
        n_hidden: hidden size handed to the LSTM sub-networks.
        embedding_dimention: input size handed to the sub-networks.
        feature_dimention: input size of the feature Layer.

    Compiled functions stored on ``self``: get_out and train_step.
    ``self.params`` collects the trainable parameters in creation order.
    """
    ## n_in: input dimension of the sequence LSTM
    ## n_hidden: hidden-layer dimension of the LSTMs for the candidates and the zp

    #repre_active = ReLU
    repre_active = linear

    self.params = []

    self.zp_x_pre = T.matrix("zp_x_pre")
    self.zp_x_post = T.matrix("zp_x_post")

    zp_nn_pre = LSTM(embedding_dimention, n_hidden, self.zp_x_pre)
    self.params += zp_nn_pre.params

    zp_nn_post = LSTM(embedding_dimention, n_hidden, self.zp_x_post)
    self.params += zp_nn_post.params

    # Cross attention: the final output of one side scores every hidden
    # state of the other side.
    attention_pre_on_post = softmax(
        (zp_nn_pre.nn_out * zp_nn_post.all_hidden).sum(axis=1))[0]
    attention_post_on_pre = softmax(
        (zp_nn_post.nn_out * zp_nn_pre.all_hidden).sum(axis=1))[0]

    zp_post = T.sum(attention_pre_on_post[:, None] * zp_nn_post.all_hidden, axis=0)
    zp_pre = T.sum(attention_post_on_pre[:, None] * zp_nn_pre.all_hidden, axis=0)

    #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out))
    self.zp_out = T.concatenate((zp_post, zp_pre))
    self.zp_out_output = self.zp_out

    ### get sequence output for NP ###
    # Distinct variables that merely share the debug names "np_x" / "mask".
    self.np_x_post = T.tensor3("np_x")
    self.np_x_postc = T.tensor3("np_x")

    self.np_x_pre = T.tensor3("np_x")
    self.np_x_prec = T.tensor3("np_x")

    self.mask_pre = T.matrix("mask")
    self.mask_prec = T.matrix("mask")

    self.mask_post = T.matrix("mask")
    self.mask_postc = T.matrix("mask")

    self.np_nn_pre = sub_LSTM_batch(embedding_dimention, n_hidden, self.np_x_pre, self.np_x_prec, self.mask_pre, self.mask_prec)
    self.params += self.np_nn_pre.params
    self.np_nn_post = sub_LSTM_batch(embedding_dimention, n_hidden, self.np_x_post, self.np_x_postc, self.mask_post, self.mask_postc)
    self.params += self.np_nn_post.params

    self.np_nn_post_output = self.np_nn_post.nn_out
    self.np_nn_pre_output = self.np_nn_pre.nn_out

    self.np_out = T.concatenate(
        (self.np_nn_post_output, self.np_nn_pre_output), axis=1)

    #np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out)
    #self.params += np_nn_f.params
    #np_nn_b = LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1])
    #self.params += np_nn_b.params

    #self.bi_np_out = T.concatenate((np_nn_f.all_hidden,np_nn_b.all_hidden[::-1]),axis=1)

    #self.np_out_output = self.bi_np_out
    #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output])

    self.feature = T.matrix("feature")
    self.feature_layer = Layer(feature_dimention, n_hidden, self.feature, repre_active)
    self.params += self.feature_layer.params

    # One attention weight per information source; only the bias created
    # with w_attention_zp is kept, the other returned biases are discarded.
    w_attention_zp, b_attention = init_weight(n_hidden * 2, 1, pre="attention_zp", ones=False)
    self.params += [w_attention_zp, b_attention]

    w_attention_np, b_u = init_weight(n_hidden * 2, 1, pre="attention_np", ones=False)
    self.params += [w_attention_np]

    #w_attention_np_rnn,b_u = init_weight(n_hidden*4,1,pre="attention_np_rnn",ones=False)
    #self.params += [w_attention_np_rnn]

    w_attention_feature, b_u = init_weight(n_hidden, 1, pre="attention_feature", ones=False)
    self.params += [w_attention_feature]

    self.calcu_attention = tanh(
        T.dot(self.zp_out_output, w_attention_zp) +
        T.dot(self.np_out, w_attention_np) +
        T.dot(self.feature_layer.output, w_attention_feature) + b_attention)
    #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
    #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention)

    self.attention = softmax(T.transpose(self.calcu_attention, axes=(1, 0)))[0]

    self.out = self.attention

    self.get_out = theano.function(inputs=[
        self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec,
        self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec,
        self.mask_post, self.mask_postc, self.feature
    ], outputs=[self.out], on_unused_input='warn')

    # NOTE(review): the next four locals are never used in the cost below,
    # and the names are swapped: "l1_norm_squared" sums squares while
    # "l2_norm_squared" sums absolute values.
    l1_norm_squared = sum([(w**2).sum() for w in self.params])
    l2_norm_squared = sum([(abs(w)).sum() for w in self.params])

    lmbda_l1 = 0.0
    #lmbda_l2 = 0.001
    lmbda_l2 = 0.0

    t = T.bvector()
    # Negative log of the attention mass selected by t.
    cost = -(T.log((self.out * t).sum()))

    lr = T.scalar()

    updates = lasagne.updates.sgd(cost, self.params, lr)
    #updates = lasagne.updates.adadelta(cost, self.params)

    self.train_step = theano.function(inputs=[
        self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec,
        self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec,
        self.mask_post, self.mask_postc, self.feature, t, lr
    ], outputs=[cost], on_unused_input='warn', updates=updates)
def logNormalPDF(X, Mu, XChol):
    """Return the symbolic Gaussian log-density of X around Mu.

    The precision matrix is the inverse of ``XChol . XChol^T``.  The
    result is the sum of the quadratic term, half the log-determinant of
    the precision and the ``log(2*pi)`` constant scaled by ``X.shape[0]``.
    """
    covariance = T.dot(XChol, T.transpose(XChol))
    precision = Tla.matrix_inverse(covariance)
    centered = X - Mu

    quadratic = T.dot(centered, T.dot(precision, T.transpose(centered)))
    log_det_precision = T.log(Tla.det(precision))

    return (-0.5 * quadratic
            + 0.5 * log_det_precision
            - 0.5 * np.log(2 * np.pi) * X.shape[0])
def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None):
    """Set the agent hyper-parameters, build both Q-networks and compile the Theano functions.

    Args:
        n_actions: number of actions; also the size of the one-hot lookup.
        replay_memory: stored as-is on ``self.replay_memory``.
        build_network: callable invoked with ``n_actions``, ``input_var``
            and ``screen_size``; called once per network.
        updates: callable invoked as ``updates(loss, params)`` to produce
            the training updates.
        screen_size: forwarded to ``build_network``.
        initial_weights_file: optional path given to ``np.load``; its
            ``arr_<i>`` entries are loaded into ``self.network``.
    """
    self.screen_size = screen_size
    self.mood_q = None
    self.last_q = 0
    self.n_parameter_updates = 0
    self.alpha = 0.00025
    # update frequency ?
    # gradient momentum ? 0.95
    # squared gradient momentum ? 0.95
    # min squared gradient ? 0.01
    self.save_every_n_frames = 100000  # ~ once per hour
    self.final_exploration_frame = 1000000
    self.replay_start_size = 50000
    self.i_action = 0

    self.state = None

    self.initial_epsilon = 1
    self.final_epsilon = 0.1
    self.epsilon = self.initial_epsilon
    self.gamma = 0.99
    self.replay_memory = replay_memory

    self.log_frequency = 1

    self.minibatch_size = 32
    # self.replay_memory_size = 1000000

    self.target_network_update_frequency = 10000

    # Symbolic inputs: states before/after, actions, rewards and the
    # future-reward indicator.
    s0_var = T.tensor4("s0", dtype=theano.config.floatX)
    a0_var = T.bmatrix("a0")
    r0_var = T.wcol("r0")
    s1_var = T.tensor4("s1", dtype=theano.config.floatX)
    future_reward_indicator_var = T.bcol("future_reward_indicator")

    self.n_actions = n_actions
    # Identity matrix: row i is the int8 one-hot vector for action i.
    self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

    # Online network reads s0 divided by 256.
    self.network = build_network(n_actions=self.n_actions,
                                 input_var=T.cast(s0_var, 'float32') / np.float32(256),
                                 screen_size=self.screen_size)
    print("Compiling forward.")
    self.forward = theano.function([s0_var],
                                   lasagne.layers.get_output(
                                       self.network, deterministic=True))

    # Second ("stale") network reads s1 with the same scaling.
    self.network_stale = build_network(
        n_actions=self.n_actions,
        input_var=T.cast(s1_var, 'float32') / np.float32(256),
        screen_size=self.screen_size)
    print("Compiling forward_stale.")
    self.forward_stale = theano.function([s1_var],
                                         lasagne.layers.get_output(
                                             self.network_stale,
                                             deterministic=True))

    if initial_weights_file is not None:
        with np.load(initial_weights_file) as initial_weights:
            param_values = [
                initial_weights['arr_%d' % i]
                for i in range(len(initial_weights.files))
            ]
        lasagne.layers.set_all_param_values(self.network, param_values)
        # NOTE(review): indentation was lost in this view of the file; this
        # statement is assumed to belong to the `if` body -- confirm.
        self.i_action -= self.replay_start_size

    self._update_network_stale()

    # Non-deterministic outputs are used for the training graph.
    out = lasagne.layers.get_output(self.network)
    out_stale = lasagne.layers.get_output(self.network_stale)
    self.loss, self.err, __y, __q = build_loss(
        out=out,
        out_stale=out_stale,
        a0_var=a0_var,
        r0_var=r0_var,
        future_reward_indicator_var=future_reward_indicator_var,
        gamma=self.gamma)

    # Only the online network's trainable parameters are updated.
    params = lasagne.layers.get_all_params(self.network, trainable=True)

    print("Compiling train_fn.")
    self.train_fn = theano.function(
        [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
        [
            self.loss, self.err,
            T.transpose(__y),
            T.transpose(__q), out, out_stale
        ],
        updates=updates(self.loss, params))
    print("Compiling loss_fn.")
    self.loss_fn = theano.function(
        [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
        self.loss)
    self.test_mode = False
def __init__(
    self,
    numpy_rng,
    theano_rng=None,
    input1_v=None,
    input2_v=None,
    input3_v=None,
    input1_c=None,
    input2_c=None,
    input3_c=None,
    n_visible1_v=4096,
    n_visible2_v=4096,
    n_visible1_c=3529,
    n_visible2_c=3529,
    n_hidden_v=None,
    n_hidden_c=None,
    W1_c=None,
    bhid1_c=None,
    bvis1_c=None,
    W2_c=None,
    bhid2_c=None,
    bvis2_c=None,
    W1_v=None,
    bhid1_v=None,
    bvis1_v=None,
    W2_v=None,
    bhid2_v=None,
    bvis2_v=None,
    lamda=None,
    mu=None,
    beta=None,
    theta=None,
    momentum=0.9
):
    """Wire up two pairs of tied-weight auto-encoders ("_v" and "_c").

    Args:
        numpy_rng: used only to seed ``RandomStreams`` when ``theano_rng``
            is not supplied.
        theano_rng: optional Theano random stream.
        input1_v .. input3_c: symbolic inputs.  When ``input1_v`` is None
            all six inputs are replaced by fresh float32 matrices.
        n_visible*/n_hidden*: stored on ``self``; not otherwise used here.
        W*/bhid*/bvis*: weights, hidden biases and visible biases.  They
            are used directly in symbolic expressions, so in practice they
            must be supplied (the ``None`` defaults cannot be transposed).
        lamda, mu, beta, theta, momentum: stored on ``self``.

    Fix: the fallback inputs were created with
    ``T.dmatrix(name=..., dtype='float32')``.  ``dmatrix`` is the float64
    matrix type and does not accept ``dtype``, so that branch raised a
    TypeError.  ``T.matrix(name=..., dtype='float32')`` builds the float32
    matrix the arguments ask for.
    """
    self.n_visible1_v = n_visible1_v
    self.n_visible2_v = n_visible2_v
    self.n_hidden_v = n_hidden_v
    self.n_visible1_c = n_visible1_c
    self.n_visible2_c = n_visible2_c
    self.n_hidden_c = n_hidden_c
    self.lamda = lamda
    self.mu = mu
    self.beta = beta
    self.theta = theta
    self.momentum = momentum

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    self.W1_v = W1_v
    self.W2_v = W2_v
    self.W1_c = W1_c
    self.W2_c = W2_c

    self.b1_v = bhid1_v
    self.b2_v = bhid2_v
    self.b1_c = bhid1_c
    self.b2_c = bhid2_c

    self.b1_prime_v = bvis1_v
    self.b2_prime_v = bvis2_v
    self.b1_prime_c = bvis1_c
    self.b2_prime_c = bvis2_c

    # Tied weights: each decoder matrix is the transpose of its encoder.
    self.W1_prime_v = T.transpose(self.W1_v)
    self.W2_prime_v = T.transpose(self.W2_v)
    self.W1_prime_c = T.transpose(self.W1_c)
    self.W2_prime_c = T.transpose(self.W2_c)
    self.theano_rng = theano_rng

    # Sum of the mean squared value of every weight and bias.
    self.L2_sqr = (
        (self.W1_v ** 2).mean() + (self.W2_v ** 2).mean()
        + (self.W1_c ** 2).mean() + (self.W2_c ** 2).mean()
        + (self.b1_v ** 2).mean() + (self.b2_v ** 2).mean()
        + (self.b1_c ** 2).mean() + (self.b2_c ** 2).mean()
        + (self.b1_prime_v ** 2).mean() + (self.b2_prime_v ** 2).mean()
        + (self.b1_prime_c ** 2).mean() + (self.b2_prime_c ** 2).mean()
    )

    # if no input is given, generate a variable representing the input
    if input1_v is None:
        # we use a matrix because we expect a minibatch of several
        # examples, each example being a row
        self.x1_v = T.matrix(name='input1_v', dtype='float32')
        self.x2_v = T.matrix(name='input2_v', dtype='float32')
        self.x3_v = T.matrix(name='input3_v', dtype='float32')
        self.x1_c = T.matrix(name='input1_c', dtype='float32')
        self.x2_c = T.matrix(name='input2_c', dtype='float32')
        self.x3_c = T.matrix(name='input3_c', dtype='float32')
    else:
        # NOTE(review): only input1_v is checked; the other five inputs
        # are taken as given even if they are None.
        self.x1_v = input1_v
        self.x2_v = input2_v
        self.x3_v = input3_v
        self.x1_c = input1_c
        self.x2_c = input2_c
        self.x3_c = input3_c

    self.params = [self.W1_v, self.b1_v, self.b1_prime_v,
                   self.W2_v, self.b2_v, self.b2_prime_v,
                   self.W1_c, self.b1_c, self.b1_prime_c,
                   self.W2_c, self.b2_c, self.b2_prime_c
                   ]
    # end-snippet-1

    # Encoders.  The third input of each modality reuses the second
    # encoder (W2/b2).
    self.output1_v = T.nnet.hard_sigmoid(T.dot(self.x1_v, self.W1_v) + self.b1_v)
    self.output2_v = T.nnet.hard_sigmoid(T.dot(self.x2_v, self.W2_v) + self.b2_v)
    self.output3_v = T.nnet.hard_sigmoid(T.dot(self.x3_v, self.W2_v) + self.b2_v)

    self.output1_c = T.nnet.hard_sigmoid(T.dot(self.x1_c, self.W1_c) + self.b1_c)
    self.output2_c = T.nnet.hard_sigmoid(T.dot(self.x2_c, self.W2_c) + self.b2_c)
    self.output3_c = T.nnet.hard_sigmoid(T.dot(self.x3_c, self.W2_c) + self.b2_c)

    # Transposed hidden codes.
    self.output1t_v = T.transpose(self.output1_v)
    self.output2t_v = T.transpose(self.output2_v)
    self.output3t_v = T.transpose(self.output3_v)

    self.output1t_c = T.transpose(self.output1_c)
    self.output2t_c = T.transpose(self.output2_c)
    self.output3t_c = T.transpose(self.output3_c)

    # Decoders (reconstructions) using the tied, transposed weights.
    self.rec1_v = T.nnet.hard_sigmoid(T.dot(self.output1_v, self.W1_prime_v) + self.b1_prime_v)
    self.rec2_v = T.nnet.hard_sigmoid(T.dot(self.output2_v, self.W2_prime_v) + self.b2_prime_v)
    self.rec3_v = T.nnet.hard_sigmoid(T.dot(self.output3_v, self.W2_prime_v) + self.b2_prime_v)

    self.rec1_c = T.nnet.hard_sigmoid(T.dot(self.output1_c, self.W1_prime_c) + self.b1_prime_c)
    self.rec2_c = T.nnet.hard_sigmoid(T.dot(self.output2_c, self.W2_prime_c) + self.b2_prime_c)
    self.rec3_c = T.nnet.hard_sigmoid(T.dot(self.output3_c, self.W2_prime_c) + self.b2_prime_c)
weights(np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1), np.random.rand(1)) #W1real=T.set_subtensor(W[0:3],W[0:3]) #W2real=T.set_subtensor(W[3:6],W[3:6]) #W3real=T.set_subtensor(W[6:9],W[6:9]) #W1imag=T.set_subtensor(W[9:12],W[9:12]) #W2imag=T.set_subtensor(W[12:15],W[12:15]) #W3imag=T.set_subtensor(W[15:18],W[15:18]) cost = T.sqr( T.sum(T.transpose(W[0:3]) * xreal) - T.sum(T.transpose(W[9:12]) * ximag)) + T.sqr( T.sum(T.transpose(W[0:3]) * ximag) + T.sum(T.transpose(W[9:12]) * xreal)) + T.sqr( T.sum(T.transpose(W[3:6]) * xreal) - T.sum(T.transpose(W[12:15]) * ximag)) + T.sqr( T.sum(T.transpose(W[3:6]) * ximag) + T.sum(T.transpose(W[12:15]) * xreal)) + T.sqr( T.sum(T.transpose(W[6:9]) * ximag) + T.sum(T.transpose(W[15:18]) * xreal)) + l * T.sqr( T.sqr( T.sum(T.transpose(W[6:9]) * xreal) - T.sum(T.transpose(W[15:18]) * ximag)) - 1) loss = [] gradients = theano.tensor.grad(cost, [W])
def errors(ypred, ytrue):
    """Return the symbolic mean misclassification rate.

    ``ypred`` is a 4-D tensor that is reshaped to
    ``(shape[1], shape[0] * shape[2] * shape[3])`` and transposed, so each
    row holds ``shape[1]`` scores.  The arg-max of every row is compared
    with ``ytrue`` and the fraction of mismatches is returned.

    Fix: the arg-max used the undefined name ``ryped`` (a typo for
    ``rypred``), so every call raised a NameError.
    """
    shp = ypred.shape
    # NOTE(review): reshape does not move axes; if axis 0 is a batch axis
    # larger than 1 this may interleave samples -- confirm the layout.
    rypred = T.transpose(ypred.reshape((shp[1], shp[0] * shp[2] * shp[3])))
    preds = T.argmax(rypred, axis=1)
    return T.mean(T.neq(preds, ytrue))
def log_loss(self, y):
    """Return the per-row negative log-likelihood as a symbolic vector.

    ``y`` is a matrix with the same shape as ``self.p_y_given_x``.  Entry
    ``i`` of the result is ``-sum_c log(p[i, c]) * y[i, c]``.

    The earlier version formed the full ``N x N`` product
    ``log(p) . y^T`` and kept only its diagonal.  The diagonal is exactly
    this row-wise sum, so the result is the same without the quadratic
    cost in the number of rows.
    """
    return -T.sum(T.log(self.p_y_given_x) * y, axis=1)
def __init__(self, n_in, n_hidden, x=T.tensor3("x"), xc=T.tensor3("xc"), mask=T.matrix("mask"), maskc=T.matrix("maskx"), prefix=""):
    """Run one shared LSTM over two masked batches and subtract the results.

    Args:
        n_in: input feature size passed to ``init_weight`` for the
            input-to-hidden weights.
        n_hidden: hidden size of the LSTM.
        x, xc: symbolic 3-D inputs; a fresh ``T.tensor3`` is created when
            ``None`` is passed.
        mask, maskc: symbolic 2-D masks; a fresh ``T.matrix`` is created
            when ``None`` is passed.
        prefix: string prepended to the parameter names.

    Attributes set:
        params: the twelve weights/biases handed to
            ``self.lstm_recurrent_fn``.
        all_hiddenx / all_hiddenc: hidden states with the first two axes
            swapped back.
        nn_outx / nn_outc: last hidden state for each input.
        nn_out: ``nn_outx - nn_outc``.

    Fix: the initial hidden and cell states are now sized from ``self.x``
    / ``self.xc``.  They used to read the raw arguments, so passing
    ``x=None`` or ``xc=None`` failed with an AttributeError even though
    the fallback variables had just been created.
    """
    self.params = []

    # NOTE(review): the defaults in the signature are built once at
    # definition time, so instances relying on them share the same
    # symbolic variables (and the default maskc is named "maskx").
    if x is not None:
        self.x = x
    else:
        self.x = T.tensor3("x")
    if xc is not None:
        self.xc = xc
    else:
        self.xc = T.tensor3("xc")
    if mask is not None:
        self.mask = mask
    else:
        self.mask = T.matrix("mask")
    if maskc is not None:
        self.maskc = maskc
    else:
        self.maskc = T.matrix("maskc")

    #### swap the first two axes so that scan iterates over axis 1 ###
    nmask = T.transpose(self.mask, axes=(1, 0))
    nx = T.transpose(self.x, axes=(1, 0, 2))
    nmaskc = T.transpose(self.maskc, axes=(1, 0))
    nxc = T.transpose(self.xc, axes=(1, 0, 2))

    # Input-to-hidden weights and biases for the f / i / c / o gates.
    wf_x, bf = init_weight(n_in, n_hidden, pre="%s_lstm_f_x_" % prefix)
    self.params += [wf_x, bf]

    wi_x, bi = init_weight(n_in, n_hidden, pre="%s_lstm_i_x_" % prefix)
    self.params += [wi_x, bi]

    wc_x, bc = init_weight(n_in, n_hidden, pre="%s_lstm_c_x_" % prefix)
    self.params += [wc_x, bc]

    wo_x, bo = init_weight(n_in, n_hidden, pre="%s_lstm_o_x_" % prefix)
    self.params += [wo_x, bo]

    # Hidden-to-hidden weights; the bias returned with them is discarded.
    wf_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_f_h_" % prefix)
    self.params += [wf_h]

    wi_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_i_h_" % prefix)
    self.params += [wi_h]

    wc_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_c_h_" % prefix)
    self.params += [wc_h]

    wo_h, b_h = init_weight(n_hidden, n_hidden, pre="%s_lstm_o_h_" % prefix)
    self.params += [wo_h]

    # Zero initial hidden/cell states, one row per entry along axis 0.
    h_t_0 = T.alloc(0., self.x.shape[0], n_hidden)
    c_t_0 = T.alloc(0., self.x.shape[0], n_hidden)

    h_t_0_c = T.alloc(0., self.xc.shape[0], n_hidden)
    c_t_0_c = T.alloc(0., self.xc.shape[0], n_hidden)

    # NOTE(review): wc_h precedes wc_x here, unlike the (x, h) order of the
    # other gates; kept as-is because it must match the parameter order of
    # lstm_recurrent_fn, which is not visible here -- confirm.
    shared_weights = [
        wf_x, wf_h, bf, wi_x, wi_h, bi, wc_h, wc_x, bc, wo_x, wo_h, bo
    ]

    [h, c], r = theano.scan(self.lstm_recurrent_fn,
                            sequences=[nx, nmask],
                            outputs_info=[h_t_0, c_t_0],
                            non_sequences=shared_weights)

    [hc, cc], rc = theano.scan(self.lstm_recurrent_fn,
                               sequences=[nxc, nmaskc],
                               outputs_info=[h_t_0_c, c_t_0_c],
                               non_sequences=shared_weights)

    self.all_hiddenx = T.transpose(h, axes=(1, 0, 2))
    self.nn_outx = h[-1]

    self.all_hiddenc = T.transpose(hc, axes=(1, 0, 2))
    self.nn_outc = hc[-1]

    self.nn_out = h[-1] - hc[-1]
def nll(ypred, ytrue):
    """Return the symbolic mean negative log-likelihood of ``ytrue``.

    ``ypred`` is a 4-D tensor that is reshaped to
    ``(shape[1], shape[0] * shape[2] * shape[3])`` and transposed.  For
    every row the log of the entry selected by ``ytrue`` is taken, and the
    negated mean of those values is returned.
    """
    dims = ypred.shape
    flattened = ypred.reshape((dims[1], dims[0] * dims[2] * dims[3]))
    per_row = T.transpose(flattened)
    row_ids = T.arange(per_row.shape[0])
    picked_log_probs = T.log(per_row)[row_ids, ytrue]
    return -T.mean(picked_log_probs)
def __init__(self, n_hidden, embedding_dimention=50, feature_dimention=61):
    """Build the index-based ZP/NP attention network and compile it.

    Inputs are integer index tensors that are looked up in a trainable
    embedding matrix loaded through ``init_weight_file``.  The ZP side uses
    two LSTMs followed by an 8-row self-attention; the NP side uses two
    ``sub_LSTM_batch`` networks followed by a forward and a backward LSTM.

    Args:
        n_hidden: hidden size handed to the LSTM sub-networks.
        embedding_dimention: input size handed to the sub-networks.  The
            embedding matrix and the reshapes use the module-level
            ``args.embedding_dimention`` instead, so the two presumably
            have to agree -- TODO confirm.
        feature_dimention: not used in this constructor.

    Compiled functions stored on ``self``: get_out and train_step.
    """
    ## n_in: input dimension of the sequence LSTM
    ## n_hidden: hidden-layer dimension of the LSTMs for the candidates and the zp

    self.params = []

    # Trainable embedding matrix, read from the path in args.embedding.
    self.w_embedding = init_weight_file(args.embedding, args.embedding_dimention)
    self.params.append(self.w_embedding)

    self.zp_x_pre_index = T.imatrix("zp_x_pre")
    self.zp_x_post_index = T.imatrix("zp_x_post")

    # Look up the embeddings of the flattened indices and reshape them to
    # (rows of the index matrix, embedding size).
    # NOTE(review): this reshape only preserves the element count if the
    # index matrix has a single column -- confirm against the caller.
    zp_x_pre_newshape = (T.shape(self.zp_x_pre_index)[0], args.embedding_dimention)
    self.embedding_sub_zp_pre = self.w_embedding[
        self.zp_x_pre_index.flatten()]
    self.zp_x_pre = T.reshape(self.embedding_sub_zp_pre, zp_x_pre_newshape)

    zp_x_post_newshape = (T.shape(self.zp_x_post_index)[0], args.embedding_dimention)
    self.embedding_sub_zp_post = self.w_embedding[
        self.zp_x_post_index.flatten()]
    self.zp_x_post = T.reshape(self.embedding_sub_zp_post, zp_x_post_newshape)

    zp_nn_pre = LSTM(embedding_dimention, n_hidden, self.zp_x_pre)
    self.params += zp_nn_pre.params

    zp_nn_post = LSTM(embedding_dimention, n_hidden, self.zp_x_post)
    self.params += zp_nn_post.params

    # 8 x 8 identity matrix, subtracted from A.A^T in the penalty below.
    danwei = theano.shared(np.eye(8, dtype=theano.config.floatX))

    H_pre = zp_nn_pre.all_hidden
    H_post = zp_nn_post.all_hidden

    # Self-attention over the "pre" hidden states; `heihei` receives the
    # unused bias returned by init_weight.
    Ws1_pre, heihei = init_weight(n_hidden, n_hidden, pre="Ws1_pre_zp", ones=False)
    Ws2_pre, heihei = init_weight(8, n_hidden, pre="Ws2_pre_zp", ones=False)
    self.params += [Ws1_pre, Ws2_pre]

    A_pre = softmax(T.dot(Ws2_pre, T.dot(Ws1_pre, T.transpose(H_pre))))

    # Penalty term: sum of squared entries of (A.A^T - I).
    P_pre = T.dot(A_pre, T.transpose(A_pre)) - danwei
    #norm_pre, _ = theano.scan(lambda i, tmp: T.dot(P_pre[i], P_pre[i]) + tmp,
    #    sequences = T.arange(P_pre.shape[0]),
    #    outputs_info = np.asarray(0., dtype=theano.config.floatX))
    #f_norm_pre = T.sum(norm_pre[-1])
    f_norm_pre = (P_pre**2).sum()

    # Average of the 8 attention-weighted summaries.
    zp_out_pre = T.mean(T.dot(A_pre, H_pre), axis=0)

    # Same construction for the "post" hidden states.
    Ws1_post, heihei = init_weight(n_hidden, n_hidden, pre="Ws1_post_zp", ones=False)
    Ws2_post, heihei = init_weight(8, n_hidden, pre="Ws2_post_zp", ones=False)
    self.params += [Ws1_post, Ws2_post]
    A_post = softmax(T.dot(Ws2_post, T.dot(Ws1_post, T.transpose(H_post))))

    P_post = T.dot(A_post, T.transpose(A_post)) - danwei
    #norm_post, _ = theano.scan(lambda i, tmp: T.dot(P_post[i], P_post[i]) + tmp,
    #    sequences = T.arange(P_post.shape[0]),
    #    outputs_info = np.asarray(0., dtype=theano.config.floatX))
    #f_norm_post = T.sum(norm_post[-1])
    f_norm_post = (P_post**2).sum()

    zp_out_post = T.mean(T.dot(A_post, H_post), axis=0)

    f_norm = f_norm_pre + f_norm_post

    #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out))
    self.zp_out = T.concatenate((zp_out_pre, zp_out_post))

    self.zp_out_output = self.zp_out

    ### get sequence output for NP ###
    # Distinct index tensors that merely share the debug name "np_x".
    self.np_x_post_index = T.itensor3("np_x")
    self.np_x_postc_index = T.itensor3("np_x")
    self.np_x_pre_index = T.itensor3("np_x")
    self.np_x_prec_index = T.itensor3("np_x")

    # Embedding lookup for each NP index tensor, reshaped to
    # (axis 0, axis 1, embedding size) of the index tensor.
    np_x_post_newshape = (T.shape(self.np_x_post_index)[0], T.shape(self.np_x_post_index)[1], args.embedding_dimention)
    self.embedding_sub_np_x_post = self.w_embedding[
        self.np_x_post_index.flatten()]
    self.np_x_post = T.reshape(self.embedding_sub_np_x_post, np_x_post_newshape)

    np_x_postc_newshape = (T.shape(self.np_x_postc_index)[0], T.shape(self.np_x_postc_index)[1], args.embedding_dimention)
    self.embedding_sub_np_x_postc = self.w_embedding[
        self.np_x_postc_index.flatten()]
    self.np_x_postc = T.reshape(self.embedding_sub_np_x_postc, np_x_postc_newshape)

    np_x_pre_newshape = (T.shape(self.np_x_pre_index)[0], T.shape(self.np_x_pre_index)[1], args.embedding_dimention)
    self.embedding_sub_np_x_pre = self.w_embedding[
        self.np_x_pre_index.flatten()]
    self.np_x_pre = T.reshape(self.embedding_sub_np_x_pre, np_x_pre_newshape)

    np_x_prec_newshape = (T.shape(self.np_x_prec_index)[0], T.shape(self.np_x_prec_index)[1], args.embedding_dimention)
    self.embedding_sub_np_x_prec = self.w_embedding[
        self.np_x_prec_index.flatten()]
    self.np_x_prec = T.reshape(self.embedding_sub_np_x_prec, np_x_prec_newshape)

    self.mask_pre = T.matrix("mask")
    self.mask_prec = T.matrix("mask")

    self.mask_post = T.matrix("mask")
    self.mask_postc = T.matrix("mask")

    self.np_nn_pre = sub_LSTM_batch(embedding_dimention, n_hidden, self.np_x_pre, self.np_x_prec, self.mask_pre, self.mask_prec)
    self.params += self.np_nn_pre.params
    self.np_nn_post = sub_LSTM_batch(embedding_dimention, n_hidden, self.np_x_post, self.np_x_postc, self.mask_post, self.mask_postc)
    self.params += self.np_nn_post.params

    self.np_nn_post_output = self.np_nn_post.nn_out
    self.np_nn_pre_output = self.np_nn_pre.nn_out

    self.np_out = T.concatenate(
        (self.np_nn_post_output, self.np_nn_pre_output), axis=1)

    # Forward and backward LSTMs over the NP representations; the backward
    # states are reversed again before concatenation.
    np_nn_f = LSTM(n_hidden * 2, n_hidden * 2, self.np_out)
    self.params += np_nn_f.params
    np_nn_b = LSTM(n_hidden * 2, n_hidden * 2, self.np_out[::-1])
    self.params += np_nn_b.params

    self.bi_np_out = T.concatenate(
        (np_nn_f.all_hidden, np_nn_b.all_hidden[::-1]), axis=1)

    self.np_out_output = self.bi_np_out
    #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output])

    #self.feature = T.matrix("feature")
    #self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active)
    #self.params += self.feature_layer.params

    w_attention_zp, b_attention = init_weight(n_hidden * 2, 1, pre="attention_zp", ones=False)
    self.params += [w_attention_zp, b_attention]

    # w_attention_np is created but neither trained nor used in the active
    # attention expression below.
    w_attention_np, b_u = init_weight(n_hidden * 2, 1, pre="attention_np", ones=False)
    #self.params += [w_attention_np]

    w_attention_np_rnn, b_u = init_weight(n_hidden * 4, 1, pre="attention_np_rnn", ones=False)
    self.params += [w_attention_np_rnn]

    #np_out_dropout = _dropout_from_layer(self.np_out_output)
    #zp_out_dropout = _dropout_from_layer(self.zp_out_output)
    #np_dropout = _dropout_from_layer(self.np_out)

    #self.calcu_attention_dropout = tanh(T.dot(np_out_dropout,w_attention_np_rnn) + T.dot(zp_out_dropout,w_attention_zp) + T.dot(np_dropout,w_attention_np) + b_attention)

    #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention)
    self.calcu_attention = tanh(
        T.dot(self.np_out_output, w_attention_np_rnn) +
        T.dot(self.zp_out_output, w_attention_zp) + b_attention)

    self.attention = softmax(T.transpose(self.calcu_attention, axes=(1, 0)))[0]
    #self.attention_dropout = softmax(T.transpose(self.calcu_attention_dropout,axes=(1,0)))[0]

    self.out = self.attention
    #self.out_dropout = self.attention_dropout

    self.get_out = theano.function(inputs=[
        self.zp_x_pre_index, self.zp_x_post_index, self.np_x_pre_index,
        self.np_x_prec_index, self.np_x_post_index, self.np_x_postc_index,
        self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc
    ], outputs=[self.out], on_unused_input='warn')

    # NOTE(review): the next four locals are never used in the cost below,
    # and the names are swapped: "l1_norm_squared" sums squares while
    # "l2_norm_squared" sums absolute values.
    l1_norm_squared = sum([(w**2).sum() for w in self.params])
    l2_norm_squared = sum([(abs(w)).sum() for w in self.params])

    lmbda_l1 = 0.0
    #lmbda_l2 = 0.001
    lmbda_l2 = 0.0

    t = T.bvector()
    # Negative log of the attention mass selected by t, plus the
    # self-attention penalty.
    cost = -(T.log((self.out * t).sum())) + f_norm
    #cost = -(T.log((self.out_dropout*t).sum()))

    lr = T.scalar()

    updates = lasagne.updates.sgd(cost, self.params, lr)
    #updates = lasagne.updates.adadelta(cost, self.params)

    self.train_step = theano.function(inputs=[
        self.zp_x_pre_index, self.zp_x_post_index, self.np_x_pre_index,
        self.np_x_prec_index, self.np_x_post_index, self.np_x_postc_index,
        self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc, t, lr
    ], outputs=[cost], on_unused_input='warn', updates=updates)
#==========================Theano Function definitions==============================
#===================================================================================

# Symbolic placeholders for the model parameters (A, B, U) and for the
# entity vectors of one training triplet and its corrupted counterpart.
ATemp = T.matrix('ATemp')
BTemp = T.tensor3('BTemp')
UTemp = T.matrix('UTemp')
E1Temp = T.vector('E1Temp')
E2Temp = T.vector('E2Temp')
E1E2Temp = T.vector('E1E2Temp')
ECTemp = T.vector('ECTemp')
E1ECTemp = T.vector('E1ECTemp')

# Score of the observed triplet: U . tanh(e1.B.e2^T + A.[e1;e2])
score = UTemp.dot(
    T.tanh(
        E1Temp.dot(BTemp).dot(T.transpose(E2Temp)) + ATemp.dot(E1E2Temp)))

scoringFunction = theano.function(
    [ATemp, BTemp, UTemp, E1Temp, E2Temp, E1E2Temp], score)

# Score of the corrupted triplet: the same expression with e2 replaced by
# the corrupting entity.
scoreCorrupted = UTemp.dot(
    T.tanh(
        E1Temp.dot(BTemp).dot(T.transpose(ECTemp)) + ATemp.dot(E1ECTemp)))

# Loss: max(0, 1 - score + scoreCorrupted + penalty), where the penalty is
# regparam times the summed squares of A, B and U, divided by 3.  The two
# scores are reused here; they are the same expressions spelled out above.
squared_weights = T.sum(ATemp**2) + T.sum(BTemp**2) + T.sum(UTemp**2)
loss = T.largest(
    0, (1 - (score) + (scoreCorrupted) + regparam * (squared_weights) / 3))
def __init__(self, phase, config, vocabulary_size=1295, hidden_ndim=512):
    """Assemble the Lasagne network and the outputs needed for ``phase``.

    The network dictionary is always built in full and stored on
    ``self.net``; ``phase`` only decides which losses, updates or predict
    functions are created afterwards.

    Args:
        phase: One of 'pretrain', 'ctc', 'extract_feature',
            'get_prediction' or 'top_k_prediction'. Any other value only
            builds the network.
        config: Object whose ``items`` mapping provides 'lr' (and 'top_k'
            for the 'top_k_prediction' phase).
        vocabulary_size: Number of tokens; the output layer has
            ``vocabulary_size + 1`` units.
        hidden_ndim: Number of units of every LSTM layer.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    # need to be same voca_size and hidde_ndim so as to load same shape params
    # self.log_self()
    size = 101

    # model paras
    self.config = config
    learning_rate = self.config.items['lr']  # plain value, used by 'pretrain' only
    self.alpha = np.array(1e-3, dtype=np.float32)
    self.eps = np.array(1e-6, dtype=np.float32)
    # shared copy of the learning rate, used by the 'ctc' phase
    self.learning_rate = theano.shared(np.float32(config.items['lr']))
    self.nClasses = vocabulary_size + 1
    self.vocabulary_size = vocabulary_size

    # variables
    image = T.tensor4(
        'image')  # (2*nb*len, 3, 101, 101) or (2*nb*3, 3, 101, 101)
    mask = T.matrix('mask')  # (nb, max_hlen)
    token = T.imatrix('token')  # (nb, max_vlen)
    self.nb = mask.shape[0]
    # NOTE(review): `/` on a symbolic shape is integer division only under
    # Python 2 semantics -- confirm the interpreter this file targets.
    self.max_xlen = image.shape[0] / 2 / self.nb
    self.max_hlen = mask.shape[1]

    net = {}
    # RGB modal
    net['image'] = InputLayer(shape=(None, 3, size,
                                     size))  # (2*nb*len, 3, 101, 101)

    # both hand
    net['conv1'] = Conv2DLayer(incoming=net['image'],
                               num_filters=96,
                               filter_size=7,
                               stride=2)
    net['norm1'] = LocalResponseNormalization2DLayer(incoming=net['conv1'])
    net['pool1'] = MaxPool2DLayer(incoming=net['norm1'], pool_size=3)
    net['conv2'] = Conv2DLayer(incoming=net['pool1'],
                               num_filters=256,
                               filter_size=5)
    net['pool2'] = MaxPool2DLayer(incoming=net['conv2'], pool_size=2)
    net['conv3'] = Conv2DLayer(incoming=net['pool2'],
                               num_filters=512,
                               filter_size=3,
                               pad=1)
    net['conv4'] = Conv2DLayer(incoming=net['conv3'],
                               num_filters=512,
                               filter_size=3,
                               pad=1)
    net['conv5'] = Conv2DLayer(incoming=net['conv4'],
                               num_filters=512,
                               filter_size=3,
                               pad=1)

    # modal fusion
    net['pool5'] = MaxPool2DLayer(incoming=net['conv5'],
                                  pool_size=3)  # (2nb*len, 512, 2, 2)
    net['fc6'] = DenseLayer(
        incoming=net['pool5'],
        num_units=1024)  # (2nb*len, 1024) or (nb*3, 1024)

    # dropout should be shared among timestep, or triplets
    # (the dropout itself is commented out; 'drop6' is currently just a reshape)
    net['drop6'] = ReshapeLayer(incoming=net['fc6'],
                                shape=(2 * self.nb, -1, 1024))
    # net['drop6'] = DropoutLayer(incoming=net['pre_drop6'], p=0.2, shared_axes=(1,))
    net['fc7'] = DenseLayer(ReshapeLayer(net['drop6'], shape=(-1, 1024)),
                            num_units=256,
                            nonlinearity=identity)  # (3*nb, 256)

    # encoding network for image features
    net['mask'] = InputLayer(shape=(None, None),
                             name='mask')  # (nb, max_hlen)
    net['pre_conv1d'] = DimshuffleLayer(net['drop6'],
                                        (0, 2, 1))  # (nb, 1024, max_xlen)
    net['conv1d_1'] = Conv1DLayer(net['pre_conv1d'],
                                  num_filters=1024,
                                  filter_size=3,
                                  pad='same')
    net['pool1d_1'] = MaxPool1DLayer(net['conv1d_1'],
                                     pool_size=2)  #(nb, 1024, max_xlen/2)
    net['drop1d_1'] = DropoutLayer(net['pool1d_1'],
                                   p=0.1,
                                   shared_axes=(2, ))
    net['conv1d_2'] = Conv1DLayer(net['drop1d_1'],
                                  num_filters=1024,
                                  filter_size=3,
                                  pad='same')
    net['pool1d_2'] = MaxPool1DLayer(net['conv1d_2'],
                                     pool_size=2)  #(nb, 1024, max_hlen)
    net['drop1d_2'] = DropoutLayer(net['pool1d_2'],
                                   p=0.1,
                                   shared_axes=(2, ))

    # LSTM, input shape=(nb, max_hlen, 1024)
    # two LSTM, one for fusion, one for right hand
    net['lstm_input'] = DimshuffleLayer(
        net['drop1d_2'], (0, 2, 1))  # (2*nb, max_hlen, 1024)

    # right hand lstm: keeps only the first half of the batch axis
    net['lstm_input_right'] = ExpressionLayer(
        net['lstm_input'],
        function=lambda x: x[:x.shape[0] / 2],
        output_shape='auto')
    net['lstm_frw_right'] = LSTMLayer(
        incoming=net['lstm_input_right'],
        mask_input=net['mask'],
        forgetgate=Gate(b=lasagne.init.Constant(1.0)),
        num_units=hidden_ndim)  # (nb, max_hlen, hidden_ndim)
    net['lstm_bck_right'] = LSTMLayer(
        incoming=net['lstm_input_right'],
        mask_input=net['mask'],
        forgetgate=Gate(b=lasagne.init.Constant(1.0)),
        num_units=hidden_ndim,
        backwards=True)
    net['lstm_shp_right'] = ReshapeLayer(
        ConcatLayer((net['lstm_frw_right'], net['lstm_bck_right']), axis=2),
        shape=(-1, 2 * hidden_ndim))  # (nb*max_hlen, 2*hidden_ndim)

    # fusion lstm: concatenates both batch halves on the feature axis and
    # divides the result by 2
    net['lstm_input_fusion'] = ExpressionLayer(
        net['lstm_input'],
        function=lambda x: T.concatenate(
            [x[:x.shape[0] / 2], x[x.shape[0] / 2:]], axis=2) / 2.0,
        output_shape='auto')
    net['lstm_frw_fusion'] = LSTMLayer(
        incoming=net['lstm_input_fusion'],
        mask_input=net['mask'],
        forgetgate=Gate(b=lasagne.init.Constant(1.0)),
        num_units=hidden_ndim)  # (nb, max_hlen, hidden_ndim)
    net['lstm_bck_fusion'] = LSTMLayer(
        incoming=net['lstm_input_fusion'],
        mask_input=net['mask'],
        forgetgate=Gate(b=lasagne.init.Constant(1.0)),
        num_units=hidden_ndim,
        backwards=True)
    net['lstm_shp_fusion'] = ReshapeLayer(
        ConcatLayer((net['lstm_frw_fusion'], net['lstm_bck_fusion']), axis=2),
        shape=(-1, 2 * hidden_ndim))  # (nb*max_hlen, 2*hidden_ndim)

    net['lstm_shp'] = ConcatLayer(
        [net['lstm_shp_right'], net['lstm_shp_fusion']], axis=1)
    # net['lstm_shp'] = net['lstm_shp_right']
    net['out'] = DenseLayer(
        net['lstm_shp'], self.nClasses,
        nonlinearity=identity)  # (nb*max_hlen, nClasses)
    net['out_lin'] = ReshapeLayer(net['out'],
                                  shape=(self.nb, -1, self.nClasses))
    self.net = net

    # try save load model
    # (round-trips the freshly built parameters through a scratch file in the
    # current working directory, then deletes it with a shell command)
    dummy_save_file = 'dummy.pkl'
    glog.info('try save load dummy model to: %s...' % dummy_save_file)
    self.save_model(dummy_save_file)
    self.load_model(dummy_save_file)
    os.system('rm -rf dummy.pkl')
    glog.info(
        'dummy save load success, remove it and start calculate outputs...'
    )

    if phase == 'pretrain':
        # for triplet pretrain use
        # NOTE(review): `opflow` is not defined anywhere in this method;
        # unless it is a module-level name this branch raises NameError.
        self.params_feat = get_all_params(net['fc7'])
        regular_feat = lasagne.regularization.apply_penalty(
            self.params_feat, lasagne.regularization.l2) * np.array(
                5e-4 / 2, dtype=np.float32)
        ## triplet train loss
        triplet_loss_train = self.get_triplet_loss(image,
                                                   opflow,
                                                   deterministic=False)
        loss_train_feat = triplet_loss_train + regular_feat
        ## triplet valid loss
        triplet_loss_valid = self.get_triplet_loss(image,
                                                   opflow,
                                                   deterministic=True)
        loss_valid_feat = triplet_loss_valid + regular_feat
        self.updates = lasagne.updates.momentum(
            loss_train_feat,
            self.params_feat,
            learning_rate=learning_rate,
            momentum=0.9)
        self.inputs = [image, opflow]
        self.train_outputs = [loss_train_feat, triplet_loss_train]
        self.valid_outputs = [loss_valid_feat, triplet_loss_valid]
    elif phase == 'ctc':
        # for ctc loss
        self.params_full = lasagne.layers.get_all_params(
            self.net['out_lin'], trainable=True)
        self.regular_params = lasagne.layers.get_all_params(
            self.net['out_lin'], regularizable=True)
        regular_full = lasagne.regularization.apply_penalty(
            self.regular_params, lasagne.regularization.l2) * np.array(
                5e-4 / 2, dtype=np.float32)
        # full train loss
        # NOTE(review): the keyword is spelled `deteministic` here but
        # `deterministic` elsewhere -- confirm against get_ctc_loss's signature.
        ctc_loss_train, pred_train = self.get_ctc_loss(image,
                                                       mask,
                                                       token,
                                                       deteministic=False)
        loss_train_full = ctc_loss_train + regular_full
        # full valid loss
        ctc_loss_valid, pred_valid = self.get_ctc_loss(image,
                                                       mask,
                                                       token,
                                                       deteministic=True)
        loss_valid_full = ctc_loss_valid + regular_full
        self.updates = lasagne.updates.adam(
            loss_train_full,
            self.params_full,
            learning_rate=self.learning_rate)
        self.inputs = [image, mask, token]
        self.train_outputs = [loss_train_full, ctc_loss_train, pred_train]
        self.valid_outputs = [loss_valid_full, ctc_loss_valid, pred_valid]
    elif phase == 'extract_feature':
        pass
        # # for feature extraction
        # fc6 = get_output(self.net['fc6'], data, deterministic = True)
        # self.feature_func = theano.function(inputs=[data], outputs=fc6)
    elif phase == 'get_prediction':
        # NOTE(review): 'fusion_2', 'opflow' and 'coord' are not keys added to
        # `net` above, and `opflow`, `coord` and `data` are not defined in this
        # method -- this branch looks stale; confirm before relying on it.
        embeding = get_output(self.net['fusion_2'], {
            self.net['image']: image,
            self.net['opflow']: opflow,
            self.net['coord']: coord
        },
                              deterministic=True)  # (nb, 1280, len_m)
        output_lin = get_output(
            self.net['out_lin'], {
                self.net['lstm_input']: T.transpose(embeding, (0, 2, 1)),
                self.net['mask']: mask
            },
            deterministic=True)
        output_softmax = Softmax(output_lin)  # (nb, max_hlen, nClasses)
        output_trans = T.transpose(output_softmax,
                                   (1, 0, 2))  # (max_hlen, nb, nClasses)
        best_path_loss, best_path = best_right_path_cost(
            output_trans, mask, token)
        ctc_loss = ctc_cost(output_trans, T.sum(mask, axis=1, dtype='int32'),
                            token)
        # (nb, max_hlen, voca_size+1)
        self.predict_func = theano.function(
            inputs=[data, mask, token],
            outputs=[best_path_loss, best_path, ctc_loss])
    elif phase == 'top_k_prediction':
        # NOTE(review): same undefined names as the 'get_prediction' branch
        # ('fusion_2', `opflow`, `coord`, `data`).
        embeding = get_output(self.net['fusion_2'], {
            self.net['image']: image,
            self.net['opflow']: opflow,
            self.net['coord']: coord
        },
                              deterministic=True)  # (nb, 1280, len_m)
        output_lin = get_output(
            self.net['out_lin'], {
                self.net['lstm_input']: T.transpose(embeding, (0, 2, 1)),
                self.net['mask']: mask
            },
            deterministic=True)
        output_softmax = Softmax(output_lin)  # (nb, max_hlen, nClasses)
        output_trans = T.transpose(output_softmax,
                                   (1, 0, 2))  # (max_hlen, nb, nClasses)
        top_k_path_loss, top_k_path = top_k_right_path_cost(
            output_trans, mask, token, k=config.items['top_k'])
        ctc_loss = ctc_cost(output_trans, T.sum(mask, axis=1, dtype='int32'),
                            token)
        # (nb, max_hlen, voca_size+1)
        self.predict_func = theano.function(
            inputs=[data, mask, token],
            outputs=[output_lin, top_k_path_loss, top_k_path, ctc_loss])

    glog.info('Model built, phase = %s' % phase)
margin = 0.10 lambda_ = 1.0 norm_in = T.sqrt(T.sum(prediction * prediction, axis=1)) norm_tar = T.sqrt(T.sum(target_var * target_var, axis=1)) norm_neg = T.sqrt(T.sum(neg_var * neg_var, axis=1)) #norm_in=input_var.sum(axis=1).reshape((input_var.shape[0], 1)) prod_xy_unnorm = (prediction * target_var) prod_xneg_unnorm = (prediction * neg_var) prod_xy_unnorm = prod_xy_unnorm.sum(axis=1) prod_xneg_unnorm = prod_xneg_unnorm.sum(axis=1) norm = norm_in * norm_tar + eps norm_xneg = norm_in * norm_neg + eps prod_xy = prod_xy_unnorm / (T.transpose(norm)) prod_xneg = prod_xneg_unnorm / (T.transpose(norm_xneg)) rank_loss = margin - prod_xy + prod_xneg rank_loss = T.maximum(rank_loss, 0) rank_loss_m = T.mean(rank_loss, axis=0) dist = 1 - prod_xy dist_m = T.mean(dist, axis=0) (lr, mtm) = (0.01, 0.9) #regularize all layers below dense params = lasagne.layers.get_all_params(network, trainable=True) loss = dist_m + lambda_ * rank_loss_m updates = lasagne.updates.adagrad(loss, params, learning_rate=lr) train_fn = theano.function([input_var, target_var, neg_var], [loss, prediction],
def cosine_similarity(A, B):
    """Return the cosine of the angle between ``A`` and ``B``.

    Computes ``A.B / (|A| * |B|)``. The previous version divided by
    ``(A.A) * (B.B)``, i.e. by the product of the *squared* norms, so the
    result was only a cosine for unit-length inputs.

    Args:
        A: First operand. Assumed to be a 1-D vector -- TODO confirm with
            callers; the formula is not meaningful for general matrices.
        B: Second operand, same assumption as ``A``.

    Returns:
        ``T.dot(A, B^T)`` divided by the product of the two norms. No
        epsilon is added, so a zero-norm input still divides by zero, as
        before.
    """
    dot_ab = T.dot(A, T.transpose(B))
    # T.dot(x, x^T) is the squared norm; take the root of each factor
    # separately so the product of squares cannot overflow first.
    norm_a = T.sqrt(T.dot(A, T.transpose(A)))
    norm_b = T.sqrt(T.dot(B, T.transpose(B)))
    return dot_ab / (norm_a * norm_b)
def get_upds(self, inp):
    """Return the ``(w_update, h_update, v_update)`` statistics for ``inp``.

    Args:
        inp: Input batch; it is transposed and multiplied with
            ``self.fprop(inp)``, and averaged over axis 0.

    Returns:
        Tuple of the input/output product scaled by ``1 / self.MB_size``,
        the axis-0 mean of ``self.fprop(inp)``, and the axis-0 mean of
        ``inp``.
    """
    # fprop is deliberately called twice, in the original order, in case it
    # is stochastic or stateful.
    correlation = T.dot(T.transpose(inp), self.fprop(inp))
    scale = 1.0 / self.MB_size
    hidden_mean = T.mean(self.fprop(inp), axis=0)
    visible_mean = T.mean(inp, axis=0)
    return correlation * scale, hidden_mean, visible_mean
def get_light_curve(self,
                    orbit=None,
                    r=None,
                    t=None,
                    texp=None,
                    return_num_eval=False,
                    light_delay=False,
                    **kwargs):
    """Evaluate the light curve of ``orbit`` at the times ``t``.

    Args:
        orbit: Object exposing ``get_relative_position``, which takes a
            tensor of times and returns the Cartesian coordinates of the
            bodies relative to the central source, as three tensors (one
            per coordinate) shaped ``append(t.shape, r.shape)``, or
            ``append(t.shape, oversample, r.shape)`` when ``texp`` is
            given. The first two coordinates lie in the plane of the sky;
            the third is the line of sight, positive *away* from the
            observer. See :class:`orbits.KeplerianOrbit` for an example.
        r (tensor): Radius of the transiting body, in the units of
            ``r_star``. Its shape must be consistent with the coordinates
            returned by ``orbit`` -- typically a scalar or one entry per
            body. It is flattened to 1-D here.
        t (tensor): Times at which to evaluate the light curve.
        texp (Optional[tensor]): Exposure time of each observation, a
            scalar or a tensor shaped like ``t``. When given, ``t`` is
            taken as the timestamp at the *middle* of each exposure and
            the exposure-integrated op is used.
        return_num_eval (bool): Only read on the ``texp`` path; when true
            the second output of the integrated op is returned as well.
        light_delay (bool): Forwarded to ``orbit.get_relative_position``;
            only read when ``texp`` is ``None``.
        **kwargs: Passed to ``IntegratedLimbDarkOp``. ``circular`` is
            always overwritten and ``Nc`` defaults to ``self.num_cl``.

    Returns:
        The first output of the limb-darkening op, or a
        ``(first, second)`` pair when ``texp`` is given and
        ``return_num_eval`` is true.

    Raises:
        ValueError: If ``orbit``, ``r`` or ``t`` is ``None``.
    """
    if orbit is None:
        raise ValueError("missing required argument 'orbit'")
    if r is None:
        raise ValueError("missing required argument 'r'")
    if t is None:
        raise ValueError("missing required argument 't'")

    rgrid = tt.as_tensor_variable(r)
    rgrid = tt.reshape(rgrid, (rgrid.size, ))
    t = tt.as_tensor_variable(t)

    if texp is None:
        # Instantaneous flux: evaluate the op at the sky-projected separation.
        position = orbit.get_relative_position(t, light_delay=light_delay)
        separation = tt.sqrt(position[0]**2 + position[1]**2)
        separation = tt.reshape(separation, rgrid.shape)
        line_of_sight = tt.reshape(position[2], rgrid.shape)
        flux = limbdark(self.c_norm, separation / orbit.r_star,
                        rgrid / orbit.r_star, line_of_sight)
        return flux[0]

    # Exposure-integrated flux: gather the orbital elements the op expects.
    n = orbit.n
    sini = orbit.sin_incl
    cosi = orbit.cos_incl
    circular = orbit.ecc is None
    if circular:
        aome2 = -orbit.a
        e = sinw = cosw = 0.0
    else:
        aome2 = -orbit.a * (1 - orbit.ecc**2)
        e = orbit.ecc
        sinw = orbit.sin_omega
        cosw = orbit.cos_omega
    kwargs["circular"] = circular

    # Apply the time integrated op
    tgrid = tt.transpose(orbit._warp_times(t) - orbit.tref)
    # Adding zeros_like(tgrid) broadcasts a scalar texp to tgrid's shape.
    texp = tt.as_tensor_variable(texp) + tt.zeros_like(tgrid)
    kwargs.setdefault("Nc", self.num_cl)
    integrated = IntegratedLimbDarkOp(**kwargs)
    res = integrated(self.c_norm, texp, tgrid, rgrid / orbit.r_star, n,
                     aome2, sini, cosi, e, sinw, cosw)
    return (res[0], res[1]) if return_num_eval else res[0]
def apply_global_transform(pose_params, positions):
    """Rotate, scale and translate ``positions``.

    Args:
        pose_params: Indexable with three entries: ``[0]`` is handed to
            ``angle_axis_to_rotation_matrix``, ``[1]`` is a per-column
            scale applied to the rotation matrix, ``[2]`` is added to the
            transformed positions.
        positions: Points to transform; assumed to hold one point per
            row -- TODO confirm with callers.

    Returns:
        ``(R_scaled . positions^T)^T + pose_params[2]``.
    """
    rotation = angle_axis_to_rotation_matrix(pose_params[0])
    scale = pose_params[1]
    # Broadcasting over a new leading axis scales column j by scale[j].
    rotation *= scale[np.newaxis, :]
    translation = pose_params[2]
    rotated = T.dot(rotation, T.transpose(positions))
    return T.transpose(rotated) + translation
def compute_OD(idx, zS, zD, zAA, zBB):
    """Return the product of ``zS`` read from the end and ``zD`` from the start.

    Args:
        idx: Position; ``zD`` is read at ``idx`` and ``zS`` at
            ``-idx - 1`` (the ``idx``-th entry counted from the end).
        zS: Indexable sequence read in reverse order.
        zD: Indexable sequence read in forward order.
        zAA: Unused; kept so the signature stays unchanged.
        zBB: Unused; kept so the signature stays unchanged.

    Returns:
        ``T.dot(zS[-idx - 1]^T, zD[idx])``.
    """
    from_end = zS[-idx - 1]
    from_start = zD[idx]
    return T.dot(T.transpose(from_end), from_start)
def build_decoder(self, query_tokens, query_token_embed, query_token_embed_mask):
    """Compile the two Theano functions used for step-by-step decoding.

    Sets ``self.decoder_func_init`` (maps ``query_tokens`` to the encoded
    query and its mask) and ``self.decoder_func_next_step`` (advances the
    decoder LSTM by one step and returns the next state, next cell and the
    rule / gen-action / vocab / copy probabilities).

    Args:
        query_tokens: Symbolic input of ``decoder_func_init``.
        query_token_embed: Embedded query tokens fed to the encoder LSTM.
        query_token_embed_mask: Mask passed to the encoder, the decoder
            context and the pointer network.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    # logging.info('building decoder ...')

    # (batch_size, decoder_state_dim)
    decoder_prev_state = ndim_tensor(2, name='decoder_prev_state')

    # (batch_size, decoder_state_dim)
    decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell')

    # (batch_size, n_timestep, decoder_state_dim)
    hist_h = ndim_tensor(3, name='hist_h')

    # (batch_size, decoder_state_dim)
    prev_action_embed = ndim_tensor(2, name='prev_action_embed')

    # (batch_size)
    node_id = T.ivector(name='node_id')

    # (batch_size, node_embed_dim)
    node_embed = self.node_embedding[node_id]

    # (batch_size)
    par_rule_id = T.ivector(name='par_rule_id')

    # (batch_size, decoder_state_dim)
    # negative parent-rule ids select an all-zero embedding
    par_rule_embed = T.switch(par_rule_id[:, None] < 0,
                              T.alloc(0., 1, config.rule_embed_dim),
                              self.rule_embedding_W[par_rule_id])

    # ([time_step])
    time_steps = T.ivector(name='time_steps')

    # (batch_size)
    parent_t = T.ivector(name='parent_t')

    # (batch_size, 1)
    parent_t_reshaped = T.shape_padright(parent_t)

    # encoder is run with train=False here
    query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask,
                                          dropout=config.dropout, train=False)

    # (batch_size, 1, decoder_state_dim)
    prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1))

    # (batch_size, 1, node_embed_dim)
    node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1))

    # (batch_size, 1, node_embed_dim)
    par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1))

    # config switches: zero out an input instead of dropping it, so the
    # concatenated width stays the same
    if not config.frontier_node_type_feed:
        node_embed_reshaped *= 0.

    if not config.parent_action_feed:
        par_rule_embed_reshaped *= 0.

    decoder_input = T.concatenate([prev_action_embed_reshaped, node_embed_reshaped, par_rule_embed_reshaped], axis=-1)

    # (batch_size, 1, decoder_state_dim)
    # (batch_size, 1, decoder_state_dim)
    # (batch_size, 1, field_token_encode_dim)
    decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm(decoder_input,
                                                                                     init_state=decoder_prev_state,
                                                                                     init_cell=decoder_prev_cell,
                                                                                     hist_h=hist_h,
                                                                                     context=query_embed,
                                                                                     context_mask=query_token_embed_mask,
                                                                                     parent_t_seq=parent_t_reshaped,
                                                                                     dropout=config.dropout,
                                                                                     train=False,
                                                                                     time_steps=time_steps)

    # flatten(2) collapses the single-step axis into a 2-D result
    decoder_next_state = decoder_next_state_dim3.flatten(2)
    # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT)

    decoder_next_cell = decoder_next_cell_dim3.flatten(2)

    decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_next_state)
    decoder_next_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_next_state, ctx_vectors.flatten(2)], axis=-1))

    # the transposed embedding matrices serve as output projections
    rule_prob = softmax(T.dot(decoder_next_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

    gen_action_prob = self.terminal_gen_softmax(decoder_next_state)

    vocab_prob = softmax(T.dot(decoder_next_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)

    ptr_net_decoder_state = T.concatenate([decoder_next_state_dim3, ctx_vectors], axis=-1)

    copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state)

    copy_prob = copy_prob.flatten(2)

    inputs = [query_tokens]
    outputs = [query_embed, query_token_embed_mask]

    self.decoder_func_init = theano.function(inputs, outputs)

    # query_embed and query_token_embed_mask are listed as *inputs* below, so
    # the step function takes the outputs of decoder_func_init as arguments.
    inputs = [time_steps, decoder_prev_state, decoder_prev_cell, hist_h, prev_action_embed,
              node_id, par_rule_id, parent_t,
              query_embed, query_token_embed_mask]

    outputs = [decoder_next_state, decoder_next_cell,
               rule_prob, gen_action_prob, vocab_prob, copy_prob]

    self.decoder_func_next_step = theano.function(inputs, outputs)
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    """Train, test or deploy the document-embedding classifier.

    Python 2 code (print statements, ``file``, integer ``/``).

    Args:
        mode: "train", "test" or "deploy". Any other value only builds the
            model and loads the parameters.
        data_name: Dataset folder under ``data/`` used for the model file
            and the training corpus.
        test_dataname: Dataset folder under ``data/`` used for the
            test/validation corpus.
        pooling_mode: Forwarded to ``DocEmbeddingNN``; also part of the
            model file name.

    Returns:
        In "test" mode the tuple ``(errorNum, roc_auc, tpr, fpr, ar)``
        taken at the ROC threshold equal to 1; otherwise ``None``.

    NOTE(review): the nesting below (in particular which statements sit
    inside the batch loop versus the epoch loop) was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    rng = numpy.random.RandomState(23455)  # fixed seed
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    # for list-type data
    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, \
                            wordEmbeddingDim=249, \
                            sentenceLayerNodesNum=50, \
                            sentenceLayerNodesSize=[5, 249], \
                            docLayerNodesNum=10, \
                            docLayerNodesSize=[3, 50],
                            pooling_mode=pooling_mode)

    layer1 = HiddenLayer(rng,
                         input=layer0.output,
                         n_in=layer0.outputDimension,
                         n_out=10,
                         activation=T.tanh)

    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Load the parameters last time, optionally.

    # data_name = "car"

    para_path = "data/" + data_name + "/model/multi_input_mergeinput" + pooling_mode + ".model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"

    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        learning_rate = 0.1
        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)

        grads = T.grad(cost, params)

        # plain SGD step for every parameter
        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        print "Loading test data."
        cr_test = CorpusReader(minDocSentenceNum=5,
                               minSentenceWordNum=5,
                               dataset=testtext,
                               labelset=testlabel)
        validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, validPosList = cr_test.getCorpus(
            [0, 1000])

        # print "Right answer: "
        # print zip(validIds, validLabels)

        # append the position list as extra column(s) of the document matrix
        validDocMatrixes = numpy.column_stack((validDocMatrixes, validPosList))
        validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
        # validPosList = transToTensor(validPosList, theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        print "Data loaded."

        # row 1 of the transposed class-probability matrix: per-document
        # probability of class index 1
        valid_model = theano.function(
            [],
            [
                cost, error, layer2.y_pred, docLabel,
                T.transpose(layer2.p_y_given_x)[1]
            ],
            givens={
                corpus: validDocMatrixes,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            },
            allow_input_downcast=True)

        # ####Validate the model####
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        # print "Valid Pred: ", pred_label
        # print "pred_prob: ", pred_prob

        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "test_dataname: ", test_dataname
        print "ROC: ", roc_auc

        # second ROC over the hard labels; list.index(1) raises ValueError when
        # no threshold equals 1
        fpr, tpr, threshold = roc_curve(real_label, pred_label)

        index_of_one = list(threshold).index(1)
        ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
        print "TPR: ", tpr[index_of_one]
        print "FPR: ", fpr[index_of_one]
        print "AR: ", ar
        print "threshold: ", threshold[index_of_one]
        if mode == "test":
            valid_model.free()
            return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar

        print "Loading train data."
        cr_train = CorpusReader(minDocSentenceNum=5,
                                minSentenceWordNum=5,
                                dataset=traintext,
                                labelset=trainlabel)
        docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, posList = cr_train.getCorpus(
            [0, 100000])

        # print "Right answer: "
        # print zip(ids, labels)

        docMatrixes = numpy.column_stack((docMatrixes, posList))
        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        # posList = transToTensor(posList, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)

        # valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
        print
        index = T.lscalar("index")
        batchSize = 10
        # relies on Python 2 integer division
        n_batches = (len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1
        print
        print "Train set size is ", len(docMatrixes.get_value())
        print "Validating set size is ", len(validDocMatrixes.get_value())
        print "Batch size is ", batchSize
        print "Number of training batches is ", n_batches

        print "Compiling computing graph."

        # for list-type data
        # the docSentenceCount slice is one element longer than the label
        # slice -- presumably cumulative sentence offsets; verify against
        # CorpusReader
        train_model = theano.function(
            [index], [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus:
                docMatrixes,
                docSentenceCount:
                docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                sentenceWordCount:
                sentenceWordNums,
                docLabel:
                labels[index * batchSize:(index + 1) * batchSize],
            },
            allow_input_downcast=True)

        print "Compiled."
        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(n_batches):
                # for list-type data
                print ".",
                costNum, errorNum, pred_label, real_label = train_model(i)
                print ".",
                ite = ite + 1
                # for padding data
                # costNum, errorNum = train_model(docMatrixes, labels)
                # del docMatrixes, docSentenceNums, sentenceWordNums, labels
                # print ".",
                if (ite % 10 == 0):
                    print
                    print "@iter: ", ite
                    print "Cost: ", costNum
                    print "Error: ", errorNum

            # Validate the model
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model(
            )
            print "Valid current model:"
            print "Cost: ", costNum
            print "Error: ", errorNum
            # print "pred_prob: ", pred_prob
            # print "Valid Pred: ", pred_label

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "test_dataname: ", test_dataname
            print "ROC: ", roc_auc

            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
        valid_model.free()
        train_model.free()
    elif (mode == "deploy"):
        print "Compiling computing graph."
        output_model = theano.function(
            [corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print "Compiled."
        cr = CorpusReader(minDocSentenceNum=5,
                          minSentenceWordNum=5,
                          dataset="data/train_valid/split")
        count = 21000
        # count starts at the loop bound and grows by 100, so the body runs
        # exactly once
        while (count <= 21000):
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus(
                [count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print "start to predict."
            pred_y = output_model(docMatrixes, docSentenceNums,
                                  sentenceWordNums)
            print "End predicting."
            print "Writing resfile."
            # print zip(ids, pred_y[0])
            # NOTE(review): the file is not closed if write() raises.
            f = file("data/test/res/res" + str(count), "w")
            f.write(str(zip(ids, pred_y[0])))
            f.close()
            print "Written." + str(count)
            count += 100
def build(self):
    """Build the training graph and compile ``self.train_func``.

    Declares the symbolic inputs, runs the query encoder and the decoder
    LSTM, forms the per-action target probability from the rule, vocab and
    copy distributions, and compiles a function that returns the mean
    negative log-likelihood while applying the optimizer updates. Finishes
    by calling ``self.build_decoder`` with the same query variables.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    # (batch_size, max_example_action_num, action_type)
    tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')

    # (batch_size, max_example_action_num, action_type)
    tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')

    # (batch_size, max_example_action_num)
    tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')

    # (batch_size, max_example_action_num)
    tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')

    # (batch_size, max_example_action_num)
    tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')

    # (batch_size, max_example_action_num, symbol_embed_dim)
    # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
    tgt_node_embed = self.node_embedding[tgt_node_seq]

    # (batch_size, max_query_length)
    query_tokens = ndim_itensor(2, 'query_tokens')

    # (batch_size, max_query_length, query_token_embed_dim)
    # (batch_size, max_query_length)
    query_token_embed, query_token_embed_mask = self.query_embedding(query_tokens, mask_zero=True)

    # if WORD_DROPOUT > 0:
    #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
    #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

    batch_size = tgt_action_seq.shape[0]
    max_example_action_num = tgt_action_seq.shape[1]

    # previous action embeddings
    # (batch_size, max_example_action_num, action_embed_dim)
    # channel 0 > 0 selects the rule embedding, otherwise the vocab embedding
    # indexed by channel 1
    tgt_action_seq_embed = T.switch(T.shape_padright(tgt_action_seq[:, :, 0] > 0),
                                    self.rule_embedding_W[tgt_action_seq[:, :, 0]],
                                    self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

    tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

    # parent rule application embeddings
    # negative parent-rule ids select an all-zero embedding
    tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[tgt_par_rule_seq])

    # config switches: zero out an input instead of dropping it, so the
    # concatenated width stays the same
    if not config.frontier_node_type_feed:
        tgt_node_embed *= 0.

    if not config.parent_action_feed:
        tgt_par_rule_embed *= 0.

    # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
    decoder_input = T.concatenate([tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1)

    # (batch_size, max_query_length, query_embed_dim)
    query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask,
                                          dropout=config.dropout, srng=self.srng)

    # (batch_size, max_example_action_num)
    # a step is valid when any of its action-type flags is set
    tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

    # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
    # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
    decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(decoder_input,
                                                              context=query_embed,
                                                              context_mask=query_token_embed_mask,
                                                              mask=tgt_action_seq_mask,
                                                              parent_t_seq=tgt_par_t_seq,
                                                              dropout=config.dropout,
                                                              srng=self.srng)

    # if DECODER_DROPOUT > 0:
    #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
    #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

    # ====================================================
    # apply additional non-linearity transformation before
    # predicting actions
    # ====================================================

    decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_hidden_states)
    decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

    # (batch_size, max_example_action_num, rule_num)
    rule_predict = softmax(T.dot(decoder_hidden_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

    # (batch_size, max_example_action_num, 2)
    terminal_gen_action_prob = self.terminal_gen_softmax(decoder_hidden_states)

    # (batch_size, max_example_action_num, target_vocab_size)
    vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)

    # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
    ptr_net_decoder_state = T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1)

    # (batch_size, max_example_action_num, max_query_length)
    copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state)

    # The three gathers below use broadcast row/column index grids to pick,
    # for every (example, step), the probability of the target id.

    # (batch_size, max_example_action_num)
    rule_tgt_prob = rule_predict[T.shape_padright(T.arange(batch_size)),
                                 T.shape_padleft(T.arange(max_example_action_num)),
                                 tgt_action_seq[:, :, 0]]

    # (batch_size, max_example_action_num)
    vocab_tgt_prob = vocab_predict[T.shape_padright(T.arange(batch_size)),
                                   T.shape_padleft(T.arange(max_example_action_num)),
                                   tgt_action_seq[:, :, 1]]

    # (batch_size, max_example_action_num)
    copy_tgt_prob = copy_prob[T.shape_padright(T.arange(batch_size)),
                              T.shape_padleft(T.arange(max_example_action_num)),
                              tgt_action_seq[:, :, 2]]

    # (batch_size, max_example_action_num)
    # mixture weighted by the action-type flags; terminal_gen_action_prob
    # column 0 gates generation and column 1 gates copying
    tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
               tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
               tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

    # the 1e-7 is added only on masked-out steps, keeping log() finite there;
    # those steps are multiplied by a zero mask in the loss below
    likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
    loss = - (likelihood * tgt_action_seq_mask).sum(axis=-1)  # / tgt_action_seq_mask.sum(axis=-1)
    loss = T.mean(loss)

    # let's build the function!
    train_inputs = [query_tokens, tgt_action_seq, tgt_action_seq_type,
                    tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq]
    optimizer = optimizers.get(config.optimizer)
    optimizer.clip_grad = config.clip_grad
    updates, grads = optimizer.get_updates(self.params, loss)
    self.train_func = theano.function(train_inputs, [loss],
                                      # [loss, tgt_action_seq_type, tgt_action_seq,
                                      #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
                                      #  copy_prob, terminal_gen_action_prob],
                                      updates=updates)

    # if WORD_DROPOUT > 0:
    #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
    # else:
    #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

    self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)
def transpose(x):
    """Return ``T.transpose(x)``; a thin alias with default axes."""
    transposed = T.transpose(x)
    return transposed
# Build and run the descriptor-mapping graph.
f, phi_m = inp.input_var, lb_op.input_var

# f - inputs, phi_m - basis
# f.shape = Nxl, phi_m.shape = Nxn
# (each Print op below logs its value whenever the compiled function runs)
f = T.printing.Print('f')(f)
phi_m = T.printing.Print('phi_m')(phi_m)

# compute A - the input coefficients matrix
A = utils_lasagne.desc_coeff(f, phi_m[:, 0:neigen])
A = T.printing.Print('A')(A)

# compute B - the reference coefficients matrix
# B, At, AtA, AtAi, AtB = ldiv(phi_n[:, 0: neigen], f)
B = utils_lasagne.desc_coeff(f, phi_n[:, 0:neigen])
B = T.printing.Print('B')(B)

# compute C using least-squares: argmin_X( ||X*A - B||^2 )
# (solved in transposed form, A^T * X^T = B^T, then transposed back)
C = T.transpose(utils_lasagne.ldiv(T.transpose(A), T.transpose(B)))
C = T.printing.Print('C')(C)

# apply mapping C*A
Br = T.dot(C, A)
Br = T.printing.Print('Br')(Br)

# compute smoothed mapped functions g
output = T.dot(phi_n[:, 0:neigen], Br)

funcs = dict()
funcs['predict'] = theano.function(
    [inp.input_var, lb_op.input_var],
    [output, A, B, C, Br],  #, At, AtA, AtAi, AtB],
    on_unused_input='warn')

# output_, A_, B_, C_, Br_, gr_, At_, AtA_, AtAi_, AtB_ = funcs['predict'](*x_)
# The function has exactly five outputs; unpacking into six names (the old
# trailing `gr_`) raised ValueError on every call.
output_, A_, B_, C_, Br_ = funcs['predict'](*x_)
import theano
import theano.tensor as T
from theano import pp
from theano import function
import numpy as np
from ipdb import set_trace

# Symbolic float32 inputs: a 4-D feature map, a 3-D similarity map and a
# 4-D upstream gradient.
conv5 = T.ftensor4()
sim_map = T.ftensor3()
top_diff = T.ftensor4()

batch_size, c, h, w = conv5.shape
# Flatten the two spatial axes and move channels last: (batch, h*w, c).
value = T.reshape(conv5, newshape=(batch_size, c, h * w))
value = T.transpose(value, axes=(0, 2, 1))
# Per-sample matrix product. The reshape back to (batch, c, h, w) below
# implies sim_map is (batch, h*w, h*w) -- TODO confirm with the caller.
context = T.batched_dot(sim_map, value)
context = T.transpose(context, axes=(0, 2, 1))
context = T.reshape(context, newshape=(batch_size, c, h, w))
# Residual connection: add the aggregated context to the input features.
fuse = context + conv5
# Scalar whose gradient w.r.t. conv5 is the back-propagated top_diff.
fuse_sum = T.sum(fuse * top_diff)

forward_theano = theano.function([conv5, sim_map], fuse)
backward_theano = theano.function([conv5, sim_map, top_diff],
                                  T.grad(fuse_sum, conv5))

# Test data: four stacked 3x3 planes filled with 1, 2, 3 and 4, giving a
# float32 array of shape (4, 3, 3).
one = np.ones(shape=(3, 3))
np_conv5 = np.stack([one, one + 1, one + 2, one + 3],
                    axis=0).astype(np.float32)