def compute_sym_blk_tridiag_inv_b(S, D, b):
    '''
    Symbolically solve Cx = b for x, where C is assumed to be a *symmetric* block tridiagonal matrix.

    Input:
    D - (T x n x n) diagonal blocks of the inverse
    S - (T-1 x n x n) intermediary matrix computation returned by
        the function compute_sym_blk_tridiag

    Output:
    x - (T x n) solution of Cx = b

    From:
    Jain et al, 2006
    "Numerically Stable Algorithms for Inversion of Block Tridiagonal and Banded Matrices"

    (c) Evan Archer, 2015
    '''
    nT = T.shape(b)[0]
    d = T.shape(b)[1]

    initp = T.zeros([d], dtype=theano.config.floatX)
    inity = T.zeros([d], dtype=theano.config.floatX)
    initq = T.zeros([d], dtype=theano.config.floatX)

    def compute_p(idx, pp, b, S):
        pm = ifelse(T.eq(idx, nT - 1),
                    b[-1],
                    b[idx] + T.dot(S[T.max([-idx - 1, -nT + 1])], pp))
        return pm

    p, updates = theano.scan(compute_p,
                             sequences=[T.arange(nT - 1, -1, -1)],
                             outputs_info=initp,
                             non_sequences=[b, S])

    def compute_q(idx, qm, b, S, D):
        qp = ifelse(T.eq(idx, 0),
                    T.dot(T.dot(T.transpose(S[-1]), D[0]), b[0]),
                    T.dot(T.transpose(S[-idx - 1]), qm + T.dot(D[idx], b[idx])))
        return qp

    q, updates_q = theano.scan(compute_q,
                               sequences=[T.arange(nT - 1)],
                               outputs_info=p[0],
                               non_sequences=[b, S, D])

    def compute_y(idx, p, q, S, D):
        yi = ifelse(T.eq(idx, 0),
                    T.dot(D[0], p[-1]),
                    ifelse(T.eq(idx, nT - 1),
                           T.dot(D[-1], p[0]) + q[-1],
                           T.dot(D[idx], p[-idx - 1]) + q[idx - 1]))
        return yi

    y, updates_y = theano.scan(compute_y,
                               sequences=[T.arange(nT)],
                               outputs_info=None,
                               non_sequences=[p, q, S, D])
    # return [y, updates_q + updates + y]
    return y
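# Usage sketch (added, not part of the original source): how the solver above might be
# compiled. It assumes the companion routine compute_sym_blk_tridiag mentioned in the
# docstring returns (D, S) given the diagonal and off-diagonal blocks of C; that name,
# argument order and return order are assumptions and should be checked against the
# actual module before use.
#
# import theano
# import theano.tensor as T
#
# AA = T.tensor3('AA')   # (T x n x n) diagonal blocks of C
# BB = T.tensor3('BB')   # (T-1 x n x n) off-diagonal blocks of C
# b = T.matrix('b')      # (T x n) right-hand side
# D, S = compute_sym_blk_tridiag(AA, BB)
# x = compute_sym_blk_tridiag_inv_b(S, D, b)
# solve_b = theano.function([AA, BB, b], x)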
def RecurrentPredictor(X, drop_prob, mask=None):
    batch_size = T.shape(X)[0]
    seq_len = T.shape(X)[1]
    emb_phons = lib.ops.dropout(lib.ops.Embedding(
        'DurationPredictor.Embedding_Phonemes',
        V,
        256,
        X
    ), drop_prob)

    gru = lib.ops.dropout(lib.ops.BiGRU(
        'DurationPredictor.BiGRU',
        emb_phons,
        256,
        256,
        mask=mask
    ), drop_prob)

    out = T.nnet.relu(lib.ops.Linear(
        'DurationPredictor.FC',
        gru,
        512,
        1
    ))[:, :, 0]
    return out
def regularizer_cost(self, L_t):
    """
    Penalize if the expected set size is far from the target value.

    :type L_t: T.matrix
    :param L_t: the ensemble (L-ensemble) matrix

    :return: the scaled regularizer cost and the expected set size
    """
    num_sents = T.shape(L_t)[0]

    # Calculate the expected set size.
    # eigenvalues, _ = T.nlinalg.eigh(L_t)
    # expected_set_size = T.dot(eigenvalues, 1 / (eigenvalues + 1))
    K = T.eye(num_sents) - T.nlinalg.matrix_inverse(L_t + T.eye(num_sents))
    expected_set_size = T.nlinalg.trace(K)

    # We need to scale 'expected_set_size' because we remove empty sets while sampling.
    L_I = L_t + T.eye(num_sents)
    det_L_I = Det()(L_I)
    factor = det_L_I / (det_L_I - 1.0)
    expected_set_size *= factor

    return 2 * (expected_set_size - self.args.num_target_sentences) * self.args.regularizer, \
        expected_set_size
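# Quick numeric check (added, not in the original source): for an L-ensemble DPP with
# kernel L, the expected set size is tr(I - (L + I)^-1), which equals
# sum_i lambda_i / (1 + lambda_i) over the eigenvalues of L -- the same quantity the
# commented-out eigendecomposition above would compute.
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
L = A.dot(A.T)  # symmetric PSD kernel
n = L.shape[0]

via_trace = np.trace(np.eye(n) - np.linalg.inv(L + np.eye(n)))
eigvals = np.linalg.eigvalsh(L)
via_eigs = np.sum(eigvals / (1.0 + eigvals))
assert np.allclose(via_trace, via_eigs)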
def neg_log_normal_mixture_likelihood(true, parameters):
    NT = T.shape(true)[0]
    D = T.shape(true)[1]
    M = T.shape(parameters)[1] // (D + D**2 + 1)
    means, sigmas, weights = mapping(true, parameters)
    two_pi = 2 * np.pi
    log2pi = np.log(two_pi)

    def log_single_data_point(i, means, sigmas, weights, true):
        mu = means[i, :, :]
        P = sigmas[i, :, :, :]
        al = weights[i, :]
        tr = true[i, :]

        def log_single_component(c, mu, P, al, tr):
            L = T.tril(P[c, :, :], k=-1) + T.diag(T.exp(T.diagonal(P[c, :, :])))
            z = T.exp(-0.5 * T.sum(T.dot(T.transpose(L), (tr - mu[c, :]))**2)
                      + T.log(al[c])
                      + T.log(T.nlinalg.det(L))
                      - D * log2pi / 2.)
            return z

        z, _ = theano.scan(fn=log_single_component,
                           sequences=T.arange(M),
                           non_sequences=[mu, P, al, tr])
        return T.log(T.sum(z) + 1e-44)

    Z, _ = theano.scan(fn=log_single_data_point,
                       sequences=T.arange(NT),
                       non_sequences=[means, sigmas, weights, true])
    return -T.mean(Z)
def sample_h_given_v(self, v0_sample):
    ''' This function infers state of hidden units given visible units '''
    # compute the activation of the hidden units given a sample of
    # the visibles
    pre_sigmoid_h1, h1_mean = self.propup(v0_sample)

    # LARGER mu IS MORE SPARSE.
    mu = 0.000001  # mu = 0.01 is probably too small.
    # LOOKED AT THE CODE HERE: http://lrn2cre8.ofai.at/lrn2/doc/_modules/lrn2/models/srbm_goh.html#SRBM_Goh

    ## DAN ADDED: #########################
    rank_0 = ((h1_mean.argsort(axis=0)).argsort(axis=0).astype(
        theano.config.floatX) + 1.) / T.shape(h1_mean)[0].astype(
            theano.config.floatX)
    rank_1 = ((h1_mean.argsort(axis=1)).argsort(axis=1).astype(
        theano.config.floatX) + 1.) / T.shape(h1_mean)[1].astype(
            theano.config.floatX)
    h1_mean = (1. - 0.5) * (rank_0**((1. / mu) - 1.)) + 0.5 * (rank_1**(
        (1. / mu) - 1.))
    # pre_sigmoid_h1_bin = T.log(h1_mean) - T.log(1. - h1_mean)
    # pre_sigmoid_h1 = pre_sigmoid_h1_bin
    #######################################

    # get a sample of the hiddens given their activation
    # Note that theano_rng.binomial returns a symbolic sample of dtype
    # int64 by default. If we want to keep our computations in floatX
    # for the GPU we need to specify to return the dtype floatX
    h1_sample = self.theano_rng.binomial(size=h1_mean.shape,
                                         n=1,
                                         p=h1_mean,
                                         dtype=theano.config.floatX)
    return [pre_sigmoid_h1, h1_mean, h1_sample]
def get_sensi_speci(y_hat, y):
    # y_hat = T.concatenate(T.sum(input=y_hat[:, 0:2], axis=1), T.sum(input=y_hat[:, 2:], axis=1))
    y_hat = T.stacklists([y_hat[:, 0] + y_hat[:, 1],
                          y_hat[:, 2] + y_hat[:, 3] + y_hat[:, 4]]).T
    y_hat = T.argmax(y_hat, axis=1)
    tag = 10 * y_hat + y

    tneg = T.cast((T.shape(tag[(T.eq(tag, 0.)).nonzero()]))[0], config.floatX)
    fneg = T.cast((T.shape(tag[(T.eq(tag, 1.)).nonzero()]))[0], config.floatX)
    fpos = T.cast((T.shape(tag[(T.eq(tag, 10.)).nonzero()]))[0], config.floatX)
    tpos = T.cast((T.shape(tag[(T.eq(tag, 11.)).nonzero()]))[0], config.floatX)
    # assert tneg + fneg + fpos + tpos == 1380
    # tneg.astype(config.floatX)
    # fneg.astype(config.floatX)
    # fpos.astype(config.floatX)
    # tpos.astype(config.floatX)

    speci = ifelse(T.eq((tneg + fpos), 0), np.float64(float('inf')), tneg / (tneg + fpos))
    sensi = ifelse(T.eq((tpos + fneg), 0), np.float64(float('inf')), tpos / (tpos + fneg))

    # gotcha!!! a plain Python `if` does not work on symbolic scalars, hence ifelse above
    # if T.eq((tneg + fpos), 0):
    #     speci = float('inf')
    # else:
    #     speci = tneg // (tneg + fpos)
    # if T.eq((tpos + fneg), 0.):
    #     sensi = float('inf')
    # else:
    #     sensi = tpos // (tpos + fneg)
    # speci.astype(config.floatX)
    # sensi.astype(config.floatX)
    return [sensi, speci]
def activation(self, conv_out):
    conv_out = T.reshape(
        conv_out,
        (T.shape(conv_out)[0],
         T.shape(conv_out)[1] // self.n_pieces,
         self.n_pieces,
         T.shape(conv_out)[2],
         T.shape(conv_out)[3]))
    return T.max(conv_out, axis=2)
def bias_h(self, v_in):
    """
    Calculate latent activation biases, combined for sparsity and selectivity.
    """
    h_act = self.activation_h(v_in)
    h_act = h_act.dimshuffle(1, 0, 2, 3)
    shape_before = h_act.shape
    h_act = h_act.reshape((h_act.shape[0], -1))

    rank_0 = ((h_act.argsort(axis=0)).argsort(axis=0).astype(fx) + 1.) \
        / T.shape(h_act)[0].astype(fx)
    rank_1 = ((h_act.argsort(axis=1)).argsort(axis=1).astype(fx) + 1.) \
        / T.shape(h_act)[1].astype(fx)

    # Interpolate towards the average of the sparsity and selectivity bias
    # matrices.
    lat_act = (1. - self.interp) * (rank_0 ** ((1. / self.mu) - 1.)) \
        + self.interp * (rank_1 ** ((1. / self.mu) - 1.))

    lat_act = lat_act.reshape(shape_before)
    lat_act = lat_act.dimshuffle(1, 0, 2, 3)

    # inverse of sigmoid
    lat_act_logit = T.log(lat_act) - T.log(1. - lat_act)
    return lat_act_logit
def call(self, x, mask=None):
    R = T.reshape(x, (T.shape(x)[0], T.shape(x)[1] // self.OneOnX, self.OneOnX))
    M = K.max(R, axis=2, keepdims=True)
    R = K.switch(K.equal(R, M), R, 0.)
    R = T.reshape(R, (T.shape(x)[0], T.shape(x)[1]))
    return R
def DeepVoice(X, drop_prob):
    batch_size = T.shape(X)[0]
    seq_len = T.shape(X)[1]
    emb_phons = T.extra_ops.to_one_hot(X.flatten(), V).reshape((batch_size, -1, V))

    out = T.nnet.relu(lib.ops.Linear(
        'DurationPredictor.FC.1',
        emb_phons,
        V,
        256
    ))
    out = lib.ops.dropout(T.nnet.relu(lib.ops.Linear(
        'DurationPredictor.FC.2',
        out,
        256,
        256
    )), drop_prob)
    out = lib.ops.dropout(lib.ops.RNN(
        'GRU',
        'DurationPredictor.GRU',
        out,
        256,
        128,
        n_layers=2,
        residual=False
    )[:, :, -1], drop_prob)
    out = lib.ops.Linear(
        'DurationPredictor.FC.3',
        out,
        128,
        1
    )[:, :, 0]
    return out
def get_output_for(self, upscaled, **kwargs):
    a, b = self.scale_factor
    # get output for pooling and pre-pooling layer
    inp, out = \
        lasagne.layers.get_output([self.pool2d_layer_in, self.pool2d_layer])
    # upscale the input feature map by scale_factor
    if b > 1:
        upscaled = T.extra_ops.repeat(upscaled, b, 3)
    if a > 1:
        upscaled = T.extra_ops.repeat(upscaled, a, 2)
    # get the shapes for pre-pooling layer and upscaled layer
    sh_pool2d_in = T.shape(inp)
    sh_upscaled = T.shape(upscaled)
    # in case the shape is different left-bottom-pad with zero
    tmp = T.zeros(sh_pool2d_in)
    indx = (slice(None),
            slice(None),
            slice(0, sh_upscaled[2]),
            slice(0, sh_upscaled[3]))
    upscaled = T.set_subtensor(tmp[indx], upscaled)
    # get max pool indices
    indices_pool = T.grad(None, wrt=inp,
                          known_grads={out: T.ones_like(out)})
    # mask values using indices_pool
    f = indices_pool * upscaled
    return f
def get_output_for(self, inputs, **kwargs):
    # input = (batch, channels, 14, 14)
    # boxes = (batch, num_boxes, 5)
    # out = (batch, channels, num_boxes)
    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    input = inputs[0]
    boxes = inputs[1]
    # assert(input.shape[0] == boxes.shape[0])

    batch = T.shape(input)[0]
    channels = T.shape(input)[1]
    height = T.shape(input)[2]
    width = T.shape(input)[3]
    num_boxes = T.shape(boxes)[1]  # /batch

    _boxes = boxes.dimshuffle((2, 1, 0)).reshape(
        (5, num_boxes * batch)).dimshuffle((1, 0))
    # for bt in range(batch):
    #     _boxes[bt*num_boxes:(bt+1)*num_boxes, 0] = bt

    # output = T.zeros((batch * num_boxes, channels, self.num_features))
    op = ROIPoolingOp(pooled_h=self.pool_dims,
                      pooled_w=self.pool_dims,
                      spatial_scale=self.sp_scale)
    output = op(input, _boxes)

    # num_boxes*batch,channels,height,width --> batch,channels*height*width,num_boxes
    # output = output[0].reshape((batch, num_boxes, channels*self.pool_dims*self.pool_dims)).dimshuffle((0, 2, 1))
    # output = output[0].reshape((batch*num_boxes, channels*self.pool_dims*self.pool_dims))
    # output = output.dimshuffle((1, 0)).reshape((channels*self.pool_dims*self.pool_dims, num_boxes, batch)).dimshuffle((2, 0, 1))
    return output[0]
def binarize_weights_a(self, W, eval):
    if self.binary_training == True:
        if self.stochastic_training == True:
            p = self.hard_sigm(W / self.W0_a)
            srng = theano.sandbox.rng_mrg.MRG_RandomStreams(
                self.rng.randint(999998))
            p_mask = T.cast(srng.binomial(n=1, p=p, size=T.shape(W)),
                            theano.config.floatX)
            Wb = T.switch(p_mask, self.W0_a, -self.W0_a)
        else:
            Wb = T.switch(T.ge(W, 0), self.W0_a, -self.W0_a)
    elif self.ternary_training == True:
        if self.stochastic_training == True:
            w_sign = T.gt(W, 0) * 2 - 1
            p = self.clipped_v(W / self.W0_a)
            srng = theano.sandbox.rng_mrg.MRG_RandomStreams(
                self.rng.randint(999998))
            Wb = self.W0_a * w_sign * T.cast(
                srng.binomial(n=1, p=p, size=T.shape(W)),
                theano.config.floatX)
        else:
            larger_than_neg_0_5 = T.gt(W, -self.W0_a / 3)
            larger_than_pos_0_5 = T.gt(W, self.W0_a / 3)
            W_val = larger_than_neg_0_5 * 1 + larger_than_pos_0_5 * 1 - 1
            Wb = W_val * self.W0_a
    else:
        Wb = W
    return Wb
def sample_h_given_v(self, v0_sample):
    ''' This function infers state of hidden units given visible units '''
    # compute the activation of the hidden units given a sample of
    # the visibles
    pre_sigmoid_h1, h1_mean = self.propup(v0_sample)

    ##################################################################
    ## Sparsity: #####################################################
    ##################################################################
    rank_0 = ((h1_mean.argsort(axis=0)).argsort(axis=0).astype(
        theano.config.floatX) + 1.) / T.shape(h1_mean)[0].astype(
            theano.config.floatX)
    rank_1 = ((h1_mean.argsort(axis=1)).argsort(axis=1).astype(
        theano.config.floatX) + 1.) / T.shape(h1_mean)[1].astype(
            theano.config.floatX)
    h1_mean = (1. - 0.9) * (rank_0**((1. / 0.99) - 1.)) + 0.9 * (rank_1**(
        (1. / 0.99) - 1.))

    # get a sample of the hiddens given their activation
    # Note that theano_rng.binomial returns a symbolic sample of dtype
    # int64 by default. If we want to keep our computations in floatX
    # for the GPU we need to specify to return the dtype floatX
    h1_sample = self.theano_rng.binomial(size=h1_mean.shape,
                                         n=1,
                                         p=h1_mean,
                                         dtype=theano.config.floatX)
    return [pre_sigmoid_h1, h1_mean, h1_sample]
def discrete_grads(loss, network, LR):
    # th is a parameter that controls the nonlinearity of the state transfer probability
    global update_type, best_params, H, N, th

    W_params = lasagne.layers.get_all_params(network, discrete=True)  # get all the weight parameters
    layers = lasagne.layers.get_all_layers(network)

    W_grads = []
    for layer in layers:
        params = layer.get_params(discrete=True)
        if params:
            W_grads.append(theano.grad(loss, wrt=layer.W))  # here layer.W = weight_tune(param)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W_params, learning_rate=LR)

    for param, parambest in izip(W_params, best_params):

        L = 2 * H / pow(2, N)  # state step length in Z_N

        a = random.random()  # c is a random variable with binary value
        if a < 0.85:
            c = 1
        else:
            c = 0

        b = random.random()
        state_rand = T.round(b * pow(2, N)) * L - H  # state_rand is a random state in the discrete weight space Z_N

        # parambest would transfer to state_rand with probability a, or stay unmoved with probability 1-a
        delta_W1 = c * (state_rand - parambest)
        delta_W1_direction = T.cast(T.sgn(delta_W1), theano.config.floatX)
        dis1 = T.abs_(delta_W1)                       # the absolute distance
        k1 = delta_W1_direction * T.floor(dis1 / L)   # the integer part
        v1 = delta_W1 - k1 * L                        # the decimal part
        Prob1 = T.abs_(v1 / L)                        # the transfer probability
        Prob1 = T.tanh(th * Prob1)                    # the nonlinear tanh() function accelerates the state transfer

        delta_W2 = updates[param] - param
        delta_W2_direction = T.cast(T.sgn(delta_W2), theano.config.floatX)
        dis2 = T.abs_(delta_W2)                       # the absolute distance
        k2 = delta_W2_direction * T.floor(dis2 / L)   # the integer part
        v2 = delta_W2 - k2 * L                        # the decimal part
        Prob2 = T.abs_(v2 / L)                        # the transfer probability
        Prob2 = T.tanh(th * Prob2)                    # the nonlinear tanh() function accelerates the state transfer

        srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
        # Gate1 is a binary variable that is 1 with probability Prob1
        Gate1 = T.cast(srng.binomial(n=1, p=Prob1, size=T.shape(Prob1)), theano.config.floatX)
        # Gate2 is a binary variable that is 1 with probability Prob2
        Gate2 = T.cast(srng.binomial(n=1, p=Prob2, size=T.shape(Prob2)), theano.config.floatX)

        delta_W1_new = (k1 + delta_W1_direction * Gate1) * L  # delta_W1_new = k*L where k is an integer
        updates_param1 = T.clip(parambest + delta_W1_new, -H, H)
        # fine tuning to guarantee each element stays strictly inside the discrete space
        updates_param1 = weight_tune(updates_param1, -H, H)

        delta_W2_new = (k2 + delta_W2_direction * Gate2) * L  # delta_W2_new = k*L where k is an integer
        updates_param2 = T.clip(param + delta_W2_new, -H, H)
        # fine tuning to guarantee each element stays strictly inside the discrete space
        updates_param2 = weight_tune(updates_param2, -H, H)

        # if update_type < 100, the weight probabilistically transfers from parambest to state_rand,
        # which helps the search for the global minimum; otherwise it probabilistically transfers
        # from param to the state nearest to updates[param]
        updates[param] = T.switch(T.lt(update_type, 100), updates_param1, updates_param2)

    return updates
def RecurrentMapper(ctx):
    emb_ctx = lib.ops.Embedding('Mapper.Generator.Embedding_Context', V, ENC_DIM, ctx)
    batch_size = T.shape(ctx)[0]
    seq_len = T.shape(ctx)[1]

    out = lib.ops.BiGRU('Mapper.Generator.BiGRU', emb_ctx, ENC_DIM, 256)
    readout = lib.ops.Linear('Mapper.Generator.FC', out, 512, EMB_DIM)
    return readout
def batched_thompson_sampling(self, bb_alpha_con, q, lower, upper, bb_alpha_samples):
    '''
    q = number of samples
    lower = lowest x value in the training data
    upper = highest x value in the training data (specifying the range?)
    '''
    grid_size = 10000
    grid = casting(lower + np.random.rand(grid_size, len(lower)) * (upper - lower))

    def sigmoid(x):
        return 1.0 / (1.0 + T.exp(-x))

    x = T.matrix('x', dtype=theano.config.floatX)

    # 2-D array of size (n_samples, 2) where column 1 gives the probability of the
    # constraint being unsatisfied and column 2 the probability of it being satisfied.
    prediction_probs = T.exp(
        LogSumExp(bb_alpha_con.network.output(self.x), 0)
        + T.log(1.0 / bb_alpha_samples))**30

    # prediction_lg is the logistic function applied to the NN output.
    prediction_lg = sigmoid(4.0 * (self.network.output(self.x) - self.y_max)
                            / (self.y_min - self.y_max) - 2.0)
    predict_lg = theano.function([self.x], prediction_lg)

    function_grid_lg = theano.function(
        [self.x],
        -prediction_lg[0, :, 0] * T.reshape(
            prediction_probs[:, :, 1], [T.shape(self.x)[0], 1])[:, 0])
    function_scalar_lg = theano.function(
        [self.x],
        -prediction_lg[0, 0, 0] * T.reshape(
            prediction_probs[:, :, 1], [T.shape(self.x)[0], 1])[0, 0])
    function_scalar_gradient_lg = theano.function(
        [self.x],
        T.grad(
            -prediction_lg[0, 0, 0] * T.reshape(
                prediction_probs[:, :, 1], [T.shape(self.x)[0], 1])[0, 0],
            self.x))

    self.network.update_randomness(grid_size)
    X_numpy = global_optimization(grid, lower, upper, function_grid_lg,
                                  function_scalar_lg, function_scalar_gradient_lg)[0]

    for i in range(1, q):
        self.network.update_randomness(grid_size)
        new_point = global_optimization(grid, lower, upper, function_grid_lg,
                                        function_scalar_lg,
                                        function_scalar_gradient_lg)[0]  # new_point.shape = (1, 2)
        X_numpy = casting(np.concatenate([X_numpy, new_point], 0))
        print(i, X_numpy)

    samples = self.predict(X_numpy)
    print("Predictive mean at selected points:\n", np.mean(samples, 0)[:, 0])

    return X_numpy
def compileActivation(self, net, layerNum):
    variable = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]

    # Calc shapes for the reshape function on-the-fly. Assume we have square images as input.
    sX = T.cast(T.sqrt(T.shape(variable)[0] / self.kernel_shape[1]), 'int16')

    # Converts input from 2 to 4 dimensions
    Xr = T.reshape(variable.T, (T.shape(variable)[1], self.kernel_shape[1], sX, sX))

    if self.optimized:
        out_size = T.cast(
            T.ceil((T.shape(Xr)[-1] -
                    T.shape(net.varWeights[layerNum]['w'])[-1] + 1) /
                   np.float32(self.stride)), 'int32')

        conv_op = FilterActs(stride=self.stride)
        input_shuffled = Xr.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_shuffled = net.varWeights[layerNum]['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_flipped = filters_shuffled[:, ::-1, ::-1, :]  # flip rows and columns
        contiguous_input = gpu_contiguous(input_shuffled)
        contiguous_filters = gpu_contiguous(
            filters_flipped * (net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x')
                               if self.dropout else 1.0))
        a = conv_op(contiguous_input, contiguous_filters)
        a = a[:, :out_size, :out_size, :]

        # Add bias
        a = a + net.varWeights[layerNum]['b'].dimshuffle(0, 'x', 'x', 'x')
    else:
        a = T.nnet.conv2d(Xr,
                          net.varWeights[layerNum]['w'] *
                          (net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1)
                           if self.dropout else 1.0),
                          border_mode='valid',
                          subsample=(self.stride, self.stride))

        # Add bias
        a = a + net.varWeights[layerNum]['b'].dimshuffle('x', 0, 'x', 'x')

    if self.pooling:
        if self.optimized:
            # Pooling
            # ds - side of the square pool window
            # stride - Defines the stride size between successive pooling squares.
            # Setting this parameter smaller than sizeX produces overlapping pools.
            # Setting it equal to sizeX gives the usual, non-overlapping pools.
            # Values greater than sizeX are not allowed.
            pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)
            contiguous_input = gpu_contiguous(a)
            a = pool_op(contiguous_input)
            a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01
        else:
            # a = downsample.max_pool_2d(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
            a = pool.max_pool2D(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
    else:
        if self.optimized:
            a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    a = T.flatten(a, outdim=2).T

    # Sigmoid
    a = self.activation(a, self.pool_size)

    net.varArrayA.append(a)
def infer_shape(self, node, in_shapes):
    data_shape = T.shape(node.inputs[0])
    rois_shape = T.shape(node.inputs[1])
    batch_size = rois_shape[0]
    num_maps = data_shape[1]
    h = self.pooled_h
    w = self.pooled_w
    out_shape = [batch_size, num_maps, h, w]
    return [out_shape, out_shape]
def compileActivation(self, net, layerNum):
    variable = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]

    # Calc shapes for the reshape function on-the-fly. Assume we have square images as input.
    sX = T.cast(T.sqrt(T.shape(variable)[0] / self.kernel_shape[1]), 'int16')

    # Converts input from 2 to 4 dimensions
    Xr = T.reshape(variable.T, (T.shape(variable)[1], self.kernel_shape[1], sX, sX))

    if self.optimized:
        out_size = T.cast(
            T.ceil((T.shape(Xr)[-1] -
                    T.shape(net.varWeights[layerNum]['w'])[-1] + 1) /
                   np.float32(self.stride)), 'int32')

        conv_op = FilterActs(stride=self.stride)
        input_shuffled = Xr.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_shuffled = net.varWeights[layerNum]['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_flipped = filters_shuffled[:, ::-1, ::-1, :]  # flip rows and columns
        contiguous_input = gpu_contiguous(input_shuffled)
        contiguous_filters = gpu_contiguous(
            filters_flipped * (net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x')
                               if self.dropout else 1.0))
        a = conv_op(contiguous_input, contiguous_filters)
        a = a[:, :out_size, :out_size, :]

        # Add bias
        a = a + net.varWeights[layerNum]['b'].dimshuffle(0, 'x', 'x', 'x')
    else:
        a = T.nnet.conv2d(Xr,
                          net.varWeights[layerNum]['w'] *
                          (net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1)
                           if self.dropout else 1.0),
                          border_mode='valid',
                          subsample=(self.stride, self.stride))

        # Add bias
        a = a + net.varWeights[layerNum]['b'].dimshuffle('x', 0, 'x', 'x')

    if self.pooling:
        if self.optimized:
            # Pooling
            # ds - side of the square pool window
            # stride - Defines the stride size between successive pooling squares.
            # Setting this parameter smaller than sizeX produces overlapping pools.
            # Setting it equal to sizeX gives the usual, non-overlapping pools.
            # Values greater than sizeX are not allowed.
            pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)
            contiguous_input = gpu_contiguous(a)
            a = pool_op(contiguous_input)
            a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01
        else:
            a = downsample.max_pool_2d(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
    else:
        if self.optimized:
            a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    a = T.flatten(a, outdim=2).T

    # Sigmoid
    a = self.activation(a, self.pool_size)

    net.varArrayA.append(a)
def __init__(self, p, *args, **kwargs):
    super().__init__(*args, **kwargs)
    try:
        self.k = tt.shape(p)[-1].tag.test_value
    except AttributeError:
        self.k = tt.shape(p)[-1]
    p = tt.as_tensor_variable(floatX(p))
    self.p = (p.T / tt.sum(p, -1)).T
    self.mode = tt.argmax(p)
def __init__(self, p, *args, **kwargs):
    super(BinaryCrossEntropyLikelihood, self).__init__(*args, **kwargs)
    self.loss_func = categorical_hinge
    try:
        self.k = tt.shape(p)[-1].tag.test_value
    except AttributeError:
        self.k = tt.shape(p)[-1]
    self.p = tt.as_tensor_variable(p)
    self.mode = tt.argmax(p)
def __init__(self, p, *args, **kwargs):
    super(Categorical, self).__init__(*args, **kwargs)
    try:
        self.k = tt.shape(p)[-1].tag.test_value
    except AttributeError:
        self.k = tt.shape(p)[-1]
    self.p = p = tt.as_tensor_variable(p)
    self.p = (p.T / tt.sum(p, -1)).T
    self.mode = tt.argmax(p)
def activation(self, z):
    y = T.reshape(z, (T.shape(z)[0], self.n_units // self.n_pieces, self.n_pieces))
    y = T.max(y, axis=2)
    y = T.reshape(y, (T.shape(z)[0], self.n_units // self.n_pieces))
    return y
def conv2D_keep_shape(x, w, image_shape, filter_shape, subsample=(1, 1)):
    # crop the output to the same size as the input
    fs = T.shape(w)[2] - 1   # filter size minus 1
    ims = T.shape(x)[2]      # image size
    return theano.sandbox.cuda.dnn.dnn_conv(img=x,
                                            kerns=w,
                                            border_mode='full',
                                            subsample=subsample,
                                            )[:, :, fs // 2:ims + fs // 2, fs // 2:ims + fs // 2]
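# Small check (added, not in the original source): with border_mode='full' the output
# length is ims + fs - 1, so cropping fs//2 samples from each side recovers the input
# size, i.e. a 'same' convolution. Illustrated in 1-D with numpy:
import numpy as np

ims, fs = 5, 3
full = np.convolve(np.ones(ims), np.ones(fs), mode='full')   # length ims + fs - 1 = 7
same = full[fs // 2: ims + fs // 2]                           # length ims = 5
assert same.shape[0] == ims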
def get_train(U_Ot, U_R, lenW, n_facts):
    def phi_x1(x_t, L):
        return T.concatenate([L[x_t].reshape((-1,)), zeros((2 * lenW,)), zeros((3,))], axis=0)

    def phi_x2(x_t, L):
        return T.concatenate([zeros((lenW,)), L[x_t].reshape((-1,)), zeros((lenW,)), zeros((3,))], axis=0)

    def phi_y(x_t, L):
        return T.concatenate([zeros((2 * lenW,)), L[x_t].reshape((-1,)), zeros((3,))], axis=0)

    def phi_t(x_t, y_t, yp_t, L):
        return T.concatenate([zeros(3 * lenW,),
                              T.stack(T.switch(T.lt(x_t, y_t), 1, 0),
                                      T.switch(T.lt(x_t, yp_t), 1, 0),
                                      T.switch(T.lt(y_t, yp_t), 1, 0))], axis=0)

    def s_Ot(xs, y_t, yp_t, L):
        result, updates = theano.scan(
            lambda x_t, t: T.dot(
                T.dot(T.switch(T.eq(t, 0),
                               phi_x1(x_t, L).reshape((1, -1)),
                               phi_x2(x_t, L).reshape((1, -1))),
                      U_Ot.T),
                T.dot(U_Ot, (phi_y(y_t, L) - phi_y(yp_t, L) + phi_t(x_t, y_t, yp_t, L)))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()

    def sR(xs, y_t, L, V):
        result, updates = theano.scan(
            lambda x_t, t: T.dot(
                T.dot(T.switch(T.eq(t, 0),
                               phi_x1(x_t, L).reshape((1, -1)),
                               phi_x2(x_t, L).reshape((1, -1))),
                      U_R.T),
                T.dot(U_R, phi_y(y_t, V))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()

    x_t = T.iscalar('x_t')
    m = [x_t] + [T.iscalar('m_o%d' % i) for i in xrange(n_facts)]
    f = [T.iscalar('f%d_t' % i) for i in xrange(n_facts)]
    r_t = T.iscalar('r_t')
    gamma = T.scalar('gamma')
    L = T.fmatrix('L')  # list of messages
    V = T.fmatrix('V')  # vocab
    r_args = T.stack(*m)

    cost_arr = [0] * 2 * (len(m) - 1)
    updates_arr = [0] * 2 * (len(m) - 1)
    for i in xrange(len(m) - 1):
        cost_arr[2 * i], updates_arr[2 * i] = theano.scan(
            lambda f_bar, t: T.switch(
                T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L) - 1)),
                0,
                T.largest(gamma - s_Ot(T.stack(*m[:i + 1]), f[i], t, L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])
        cost_arr[2 * i + 1], updates_arr[2 * i + 1] = theano.scan(
            lambda f_bar, t: T.switch(
                T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L) - 1)),
                0,
                T.largest(gamma + s_Ot(T.stack(*m[:i + 1]), t, f[i], L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])

    cost1, u1 = theano.scan(
        lambda r_bar, t: T.switch(
            T.eq(r_t, t),
            0,
            T.largest(gamma - sR(r_args, r_t, L, V) + sR(r_args, t, L, V), 0)),
        sequences=[V, T.arange(T.shape(V)[0])])
    cost = cost1.sum()
    for c in cost_arr:
        cost += c.sum()

    g_uo, g_ur = T.grad(cost, [U_Ot, U_R])

    train = theano.function(
        inputs=[r_t, gamma, L, V] + m + f,
        outputs=[cost],
        updates=[(U_Ot, U_Ot - alpha * g_uo), (U_R, U_R - alpha * g_ur)])
    return train
def mapping(true, parameters):
    NT = T.shape(true)[0]
    D = T.shape(true)[1]
    M = T.shape(parameters)[1] // (D + D**2 + 1)
    means = parameters[:, :D * M].reshape((NT, M, D))
    sigmas = parameters[:, D * M:D * M + M * D * D].reshape((NT, M, D, D))
    weights = T.nnet.softmax(parameters[:, D * M + M * D * D:])
    return means, sigmas, weights
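# Layout check (added, not in the original source): for D-dimensional targets and M
# mixture components, each row of `parameters` packs D*M means, M*D*D covariance-factor
# entries and M mixture logits, i.e. M * (D + D**2 + 1) values in total -- which is what
# the integer division above inverts. A numpy mirror of the slicing:
import numpy as np

NT, D, M = 4, 2, 3
params = np.arange(NT * M * (D + D**2 + 1), dtype=float).reshape(NT, -1)

means = params[:, :D * M].reshape(NT, M, D)
sigmas = params[:, D * M:D * M + M * D * D].reshape(NT, M, D, D)
logits = params[:, D * M + M * D * D:]
assert means.shape == (NT, M, D) and sigmas.shape == (NT, M, D, D) and logits.shape == (NT, M)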
def GRU(i, U, W, b, x_0, s_prev):
    b1 = T.specify_shape((coversion_ones * b[i * 3, :]).T, T.shape(x_0))
    b2 = T.specify_shape((coversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
    b3 = T.specify_shape((coversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))
    z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
    r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
    c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)
    return (T.ones_like(z) - z) * c + z * s_prev
def __init__(self, input1, input2):
    x1_sub = input1[:, :, 2:-2, 2:-2]
    x1_flatten = T.flatten(x1_sub)
    x1 = T.extra_ops.repeat(x1_flatten, 25)
    x1 = T.reshape(x1, [T.shape(x1_flatten)[0], 25])
    x2 = neighbours.images2neibs(input2, neib_shape=(5, 5), neib_step=(1, 1))
    diff = x1 - x2
    new_shape = T.shape(x1_sub) * [1, 1, 5, 5]
    diff_img = neighbours.neibs2images(diff,
                                       neib_shape=(5, 5),
                                       original_shape=[1, 25, 25 * 5, 5 * 5])
    self.output = T.nnet.relu(diff_img)
def activation(self, z):
    y = T.reshape(z, (T.shape(z)[0], self.n_units, self.n_pieces))  # maxout
    y = T.max(y, axis=2)
    y = T.reshape(y, (T.shape(z)[0], self.n_units))
    return y
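# Worked example (added, not in the original source): the maxout trick above reshapes
# the pre-activations into (batch, n_units, n_pieces) and takes the max over the last
# axis. The same computation in numpy:
import numpy as np

z = np.array([[1., 5., 2., 4., 3., 0.]])   # batch of 1, n_units = 3, n_pieces = 2
y = z.reshape(z.shape[0], 3, 2).max(axis=2)
assert np.allclose(y, [[5., 4., 3.]])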
def conv2D_keep_shape(x, w, image_shape, filter_shape, subsample=(1, 1)):
    # crop the output to the same size as the input
    fs = T.shape(w)[2] - 1   # filter size minus 1
    ims = T.shape(x)[2]      # image size
    # return theano.sandbox.cuda.dnn.dnn_conv(img=x, kerns=w,
    return theano.tensor.nnet.conv2d(x, w,
                                     image_shape=image_shape,
                                     filter_shape=filter_shape,
                                     border_mode='full',
                                     subsample=subsample,
                                     )[:, :, fs // 2:ims + fs // 2, fs // 2:ims + fs // 2]
def get_cost_updates(self):
    self.lr_D = T.scalar('lrD')
    self.lr_G = T.scalar('lrG')

    self.discriminator_cost = self.discriminator.logRegressionLayer.negative_log_likelihood()
    discriminator_params = [self.W, self.b] + self.discriminator.params
    g_D = [self.lr_D * T.grad(self.discriminator_cost, param)
           for param in discriminator_params]

    self.classification_error = T.mean(
        T.neq(self.discriminator.logRegressionLayer.y_pred,
              self.y_all_input)[:T.shape(self.X_input)[0]])
    self.gen_classification_error = T.mean(
        T.neq(self.discriminator.logRegressionLayer.y_pred,
              self.y_all_input)[T.shape(self.X_input)[0]:])
    self.discrimination_error = T.mean(
        T.neq(self.discriminator.logRegressionLayer.y_pred // self.num_of_corpus,
              self.y_all_input // self.num_of_corpus))

    self.cost_per_gen, updates = theano.scan(
        fn=lambda p, y: T.log(p[y] / (p[y] + p[y - self.num_of_corpus])),
        outputs_info=None,
        sequences=[self.discriminator.logRegressionLayer.p_y_given_x[T.shape(self.X_input)[0]:],
                   self.g_label_input],
        non_sequences=None)
    self.generator_cost = T.mean(self.cost_per_gen)
    g_G = [self.lr_G * T.grad(self.generator_cost, param)
           for param in self.generator.params]

    params = discriminator_params + self.generator.params
    gparams = g_D + g_G
    self.updates = updates + [(param, T.cast(param - gparam, 'float32'))
                              for param, gparam in zip(params, gparams)]

    self.lr_C = T.scalar('lrC')
    self.classifier_cost = self.classifier.logRegressionLayer.negative_log_likelihood()
    self.classifier_error = self.classifier.logRegressionLayer.errors()
    classifier_params = [self.W, self.b] + self.classifier.params
    g_C = [self.lr_C * T.grad(self.classifier.logRegressionLayer.negative_log_likelihood(), param)
           for param in classifier_params]
    self.c_updates = [(param, T.cast(param - gparam, 'float32'))
                      for param, gparam in zip(classifier_params, g_C)]
def get_output_for(self, inputs, **kwargs):
    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    input = inputs[0]
    boxes = inputs[1]

    batch = T.shape(input)[0]
    channels = T.shape(input)[1]
    height = T.shape(input)[2]
    width = T.shape(input)[3]
    num_boxes = T.shape(boxes)[0]

    output = T.zeros((batch * num_boxes, channels, self.num_features))

    for idbb, bb in enumerate(range(num_boxes)):
        batch_ind = bb[0]

        pool_list = []
        # for pool_dim in self.pool_dims:
        start_w = T.clip(T.floor(bb[1] * self.sp_scale), 0, width)
        start_h = T.clip(T.floor(bb[2] * self.sp_scale), 0, height)
        end_w = T.clip(T.ceil(bb[3] * self.sp_scale), 0, width)
        end_h = T.clip(T.ceil(bb[4] * self.sp_scale), 0, height)

        w = T.max(end_w - start_w + 1, 1)
        h = T.max(end_h - start_h + 1, 1)

        start_samples_y, start_sample_x = T.floor(
            _meshgrid(start_h, end_h, pool_dims + 1, start_w, end_w, pool_dims + 1))
        end_samples_y, end_sample_x = T.ceil(
            _meshgrid(start_h, end_h, pool_dims + 1, start_w, end_w, pool_dims + 1))

        input[batch_ind, :,
              np.floor(py):np.ceil(samples_y[idy + 1]),
              np.floor(px):np.ceil(samples_x[idx + 1])]

        # T.max()
        # for idx, px in enumerate(samples_x[:-1]):
        #     for idy, py in enumerate(samples_y[:-1]):
        #         (pool.dnn_pool(input[batch_ind, :, np.floor(py):np.ceil(samples_y[idy+1]),
        #                        np.floor(px):np.ceil(samples_x[idx+1])],
        #                        (0, 0), (None, None), 'max', (0, 0))).flatten(2)

        # sz_w = (w - 1) // pool_dim
        # sz_h = (h - 1) // pool_dim
        # str_h = w // pool_dim
        # str_w = h // pool_dim
        # pool = dnn.dnn_pool(input[bb[0], :, start_h:end_h+1, start_w:end_w+1],
        #                     (sz_h, sz_w), (str_h, str_w), 'max', (0, 0)).flatten(2)

        pool_list.append(pool)
        # not efficient, but for the moment it is ok!
        # if everything is correct this vector should be ordered as in fast RCNN
        output[idbb] = T.transpose(T.concatenate(pool_list, axis=1))

    return output
def down_sampleT(self, x, y, _sample_rate):
    length = tensor.cast(tensor.shape(y)[0] * _sample_rate, 'int32')
    id_max = tensor.cast(tensor.shape(y)[0] - 1, 'int32')

    def get_sub(i, x, y):
        idd = self.srng.random_integers(low=0, high=id_max)
        return [x[idd], y[idd]]

    ([dx, dy], updates) = theano.scan(fn=get_sub,
                                      outputs_info=None,
                                      sequences=tensor.arange(length),
                                      non_sequences=[x, y])
    return dx, dy, length
def GRU(i, U, W, b, x_0, s_previous):
    U_copy, W_copy = U, W
    b1 = T.specify_shape((coversion_ones * b[i * 3, :]).T, T.shape(x_0))
    b2 = T.specify_shape((coversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
    b3 = T.specify_shape((coversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))
    z = T.nnet.hard_sigmoid(U_copy[i * 3 + 0].dot(x_0) + W_copy[i * 3 + 0].dot(s_previous) + b1)
    r = T.nnet.hard_sigmoid(U_copy[i * 3 + 1].dot(x_0) + W_copy[i * 3 + 1].dot(s_previous) + b2)
    s_candidate = T.tanh(U_copy[i * 3 + 2].dot(x_0) + W_copy[i * 3 + 2].dot(s_previous * r) + b3)
    return (T.ones_like(z) - z) * s_candidate + z * s_previous
def __init__(self, loss_func, p, *args, **kwargs):
    super(LogLikelihood, self).__init__(*args, **kwargs)
    if loss_func is None:
        loss_func = categorical_crossentropy
    self.loss_func = loss_func
    try:
        self.k = tt.shape(p)[-1].tag.test_value
    except AttributeError:
        self.k = tt.shape(p)[-1]
    self.p = tt.as_tensor_variable(p)
    self.mode = tt.argmax(p)
def apt_maf_loss_atomic_proposal(net, svi=False, combined_loss=False):
    """Define loss function for training with an atomic proposal.

    Assumes a uniform proposal distribution over each sample parameter and an
    externally provided set of alternatives.

    net: MAF-based conditional density net
    svi : bool
        Whether to use the SVI version of the mdn or not
    """
    assert net.density == 'maf'
    assert not svi, 'SVI not supported for MAFs'

    # define symbolic variables to hold params that will be inferred
    # params     : n_batch x n_outputs
    # all_thetas : (n_batch * (n_atoms + 1)) x n_outputs
    # lprs       : (n_atoms + 1) x n_batch
    # stats      : n_batch x n_inputs
    # x_nl       : (n_batch * (n_atoms + 1)) x n_inputs
    theta_all = tensorN(2, name='params_nl', dtype=dtype)
    x_nl = tensorN(2, name='stats_nl', dtype=dtype)
    lprs = tensorN(2, name='lprs', dtype=dtype)  # log tilde_p / p

    n_batch = tt.shape(lprs)[1]
    n_atoms = tt.shape(lprs)[0] - 1

    # compute MAF log-densities for the true and the other atoms
    lprobs = theano.clone(output=net.lprobs,
                          replace={net.params: theta_all, net.stats: x_nl},
                          share_inputs=True)
    lprobs = tt.reshape(lprobs, newshape=(n_atoms + 1, n_batch), ndim=2)

    # compute non-normalized log posterior probabilities
    atomic_ppZ = lprobs - lprs

    # compute the posterior probability of the true params in the atomic task
    atomic_pp = atomic_ppZ[0, :].squeeze() - \
        MyLogSumExp(atomic_ppZ, axis=0).squeeze()

    # collect the extra input variables that have to be provided for each
    # training data point, and calculate the loss by averaging over samples
    trn_inputs = [theta_all, x_nl, lprs]

    if combined_loss:
        # add prior loss on prior samples
        l_ml = lprobs[0, :].squeeze()  # direct posterior evaluation
        is_prior_sample = tensorN(1, name='prop_mask', dtype=dtype)
        trn_inputs.append(is_prior_sample)
        loss = -tt.mean(atomic_pp + is_prior_sample * l_ml)
    else:
        loss = -tt.mean(atomic_pp)

    return loss, trn_inputs
def get_output_for(self, inputs, **kwargs):
    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    input = inputs[0]
    boxes = inputs[1]

    batch = T.shape(input)[0]
    channels = T.shape(input)[1]
    height = T.shape(input)[2]
    width = T.shape(input)[3]
    num_boxes = T.shape(boxes)[0]

    # output = T.zeros((batch * num_boxes, channels, self.num_features))
    op = ROIPoolingOp(pooled_h=self.pool_dims,
                      pooled_w=self.pool_dims,
                      spatial_scale=self.sp_scale)
    output = op(input, boxes)
    return output[0]
def __init__(self, p, *args, **kwargs):
    super().__init__(*args, **kwargs)
    try:
        self.k = tt.shape(p)[-1].tag.test_value
    except AttributeError:
        self.k = tt.shape(p)[-1]
    p = tt.as_tensor_variable(floatX(p))

    # From #2082, it may be dangerous to automatically rescale p at this
    # point without checking for positiveness
    self.p = p
    self.mode = tt.argmax(p, axis=-1)
    if self.mode.ndim == 1:
        self.mode = tt.squeeze(self.mode)
def grad(self, inputs, cost_grad):
    """
    Notes:
    1. The gradient is computed under the assumption that perturbations
       of the input array respect triangularity, i.e. partial derivatives
       wrt the triangular region are zero.
    2. In contrast with the usual mathematical presentation, in order to
       apply theano's 'reshape' function which implements row-order
       (i.e. C order), the differential expressions below have been
       derived based on the row-vectorizations of inputs 'a' and 'b'.

    See The Matrix Reference Manual,
    Copyright 1998-2011 Mike Brookes, Imperial College, London, UK
    """
    a, b = inputs
    ingrad = cost_grad
    ingrad = tensor.as_tensor_variable(ingrad)
    shp_a = (tensor.shape(inputs[0])[1], tensor.shape(inputs[0])[1])
    I_M = tensor.eye(*shp_a)
    if self.lower:
        inv_a = solve_triangular(a, I_M, lower=True)
        tri_M = tril(tensor.ones(shp_a))
    else:
        inv_a = solve_triangular(a, I_M, lower=False)
        tri_M = triu(tensor.ones(shp_a))
    if b.ndim == 1:
        prod_a_b = tensor.tensordot(-b.T, inv_a.T, axes=1)
        prod_a_b = tensor.shape_padleft(prod_a_b)
        jac_veca = kron(inv_a, prod_a_b)
        jac_b = inv_a
        outgrad_veca = tensor.tensordot(ingrad, jac_veca, axes=1)
        outgrad_a = tensor.reshape(outgrad_veca,
                                   (inputs[0].shape[0], inputs[0].shape[0])) * tri_M
        outgrad_b = tensor.tensordot(ingrad, jac_b, axes=1).flatten(ndim=1)
    else:
        ingrad_vec = ingrad.flatten(ndim=1)
        prod_a_b = tensor.tensordot(-b.T, inv_a.T, axes=1)
        jac_veca = kron(inv_a, prod_a_b)
        I_N = tensor.eye(tensor.shape(inputs[1])[1],
                         tensor.shape(inputs[1])[1])
        jac_vecb = kron(inv_a, I_N)
        outgrad_veca = tensor.tensordot(ingrad_vec, jac_veca, axes=1)
        outgrad_a = tensor.reshape(outgrad_veca,
                                   (inputs[0].shape[0], inputs[0].shape[0])) * tri_M
        outgrad_vecb = tensor.tensordot(ingrad_vec, jac_vecb, axes=1)
        outgrad_b = tensor.reshape(outgrad_vecb,
                                   (inputs[1].shape[0], inputs[1].shape[1]))
    return [outgrad_a, outgrad_b]
def one_hot_crossentropy(y_true, y_pred):
    # use Keras's code to prevent nan, inf
    if theano.config.floatX == "float64":
        epsilon = 1.0e-9
    else:
        epsilon = 1.0e-7

    # clip the values to (epsilon, 1 - epsilon)
    # (softmax already makes the values 0 ~ 1, so this is not strictly needed;
    #  it is kept to guard against unexpected nan / inf)
    y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
    # scale preds so that the class probas of each sample sum to 1
    y_pred /= y_pred.sum(axis=-1, keepdims=True)

    # vocabulary size
    voca_size = T.shape(y_pred)[-1]

    # convert to 1D arrays for indexing
    y_pred = y_pred.flatten()
    y_true = y_true.flatten().astype("int32")

    # shift y_true's word indices so they address the flattened 1D array
    ix = T.arange(y_true.size) * voca_size + y_true

    # indexing instead of summation
    cce = -T.log(y_pred[ix])
    return cce
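# Numeric check (added, not in the original source) of the flat-indexing trick used
# above: ix = arange(N) * voca_size + y_true picks y_pred[n, y_true[n]] out of the
# flattened prediction matrix, so no one-hot target matrix is ever materialised.
import numpy as np

y_pred = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.8, 0.1]])
y_true = np.array([0, 1])
voca_size = y_pred.shape[-1]
ix = np.arange(y_true.size) * voca_size + y_true
assert np.allclose(y_pred.flatten()[ix], [0.7, 0.8])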
def build_model():
    # 1. Input layer
    l_in = lasagne.layers.InputLayer(shape=(None, seq_len, n_inputs))
    batchsize, _, _ = T.shape(l_in.input_var)

    # 2. First dense layer
    l_reshape_a = lasagne.layers.ReshapeLayer(
        l_in, (batchsize * seq_len, n_inputs))
    l_1_batchnorm = batchnormlayer(l=l_reshape_a,
                                   num_units=N_L1,
                                   nonlinearity=lasagne.nonlinearities.rectify)
    l_reshape_b = lasagne.layers.ReshapeLayer(
        l_1_batchnorm, (batchsize, seq_len, N_L1))

    # 3. LSTM layers
    l_forward = lasagne.layers.LSTMLayer(l_reshape_b, N_LSTM_F)
    l_backward = lasagne.layers.LSTMLayer(l_reshape_b, N_LSTM_B, backwards=True)
    # Concat layer
    l_sum = lasagne.layers.ConcatLayer(incomings=[l_forward, l_backward], axis=2)

    # 4. Second dense layer
    l_reshape_c = lasagne.layers.ReshapeLayer(
        l_sum, (batchsize * seq_len, N_LSTM_F + N_LSTM_B))
    l_2_batchnorm = batchnormlayer(l=l_reshape_c,
                                   num_units=N_L2,
                                   nonlinearity=lasagne.nonlinearities.rectify)

    # 5. Output layer
    l_recurrent_out = lasagne.layers.DenseLayer(
        l_2_batchnorm,
        num_units=num_classes,
        nonlinearity=lasagne.nonlinearities.softmax)
    # Now, reshape the output back to the RNN format
    l_out = lasagne.layers.ReshapeLayer(
        l_recurrent_out, (batchsize, seq_len, num_classes))

    return l_in, l_out
def dropout_fprop(self, input):
    # we reduce the precision of the parameters for the computations
    self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
    self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)

    # create the dropout mask
    # The cast is important because int * float32 = float64,
    # which pulls things off the gpu
    srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
    self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)),
                       theano.config.floatX)

    # apply the mask
    self.fixed_x = input * self.mask

    # weighted sum
    self.z = T.dot(self.fixed_x, self.fixed_W) + self.fixed_b
    self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)

    # activation
    self.y = self.activation(self.fixed_z)
    self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)

    # return the output
    return self.fixed_y
def logit_softmax_fn(logits):
    axis_last = logits.dimshuffle(
        range(axis) + range(axis + 1, logits.ndim) + [axis])
    logits_flattened = T.reshape(axis_last, (-1, T.shape(axis_last)[-1]))
    logits_shifted = logits_flattened - logits_flattened.max(axis=1, keepdims=True)
    logits_normalized = (logits_shifted -
                         T.log(T.sum(T.exp(logits_shifted), axis=1, keepdims=True)))
    return T.reshape(logits_normalized, logits.shape)
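# Stability check (added, not in the original source): subtracting the row max before
# exponentiating, as done above, leaves the log-softmax unchanged but avoids overflow.
# A numpy mirror of the computation:
import numpy as np

logits = np.array([[1000., 1001., 1002.]])
shifted = logits - logits.max(axis=1, keepdims=True)
log_softmax = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
assert np.allclose(np.exp(log_softmax).sum(axis=1), 1.0)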
def timestep(predictions, label, len_example, total_len_example):
    label_binary = T.gt(label[0:len_example - 1], 0)
    oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
    a = total_len_example
    return T.sum(T.log(1. / predictions[T.arange(len_example - 1),
                                        label[0:len_example - 1]]) * label_binary), oov_count
def MaxOut(z, *args):
    # z = T.dot(W, X) + B.dimshuffle(0, 'x')
    d = T.shape(z)
    n_elem = args[0]
    z = z.reshape((d[0] // n_elem, n_elem, d[1]))
    a = T.max(z, axis=1)
    return a
def add_normal(model, name, m, v):
    new_var = model['stream'].normal(avg=m, std=T.sqrt(v))
    new_factors = [require(T.gt(v, 0)),
                   T.log(2 * np.pi),
                   -T.prod(T.shape(new_var)) * T.log(v) / 2,
                   -(new_var - m)**2 / 2 / v]
    return add_stochastic(model, name, new_var, new_factors)
def cross_entropy_cost(target, output, output_act, in_sided, out_sided,
                       in_bounded, out_bounded, act):
    assert in_bounded
    # assert out_bounded

    scale_bb = 1.
    if in_bounded != 1.:
        target = target / in_bounded
    # if out_bounded != 1.:
    #     output = output / out_bounded
    #     scale_bb = 1. / out_bounded
    if not in_sided:
        target = (target + 1) / 2.0
    if not out_sided:
        output = (output + 1) / 2.0
        scale_bb = scale_bb / 2.

    ddXE = target * scale_bb * 1. / (output * output) \
        + (1 - target) * scale_bb * 1. / ((1 - output) * (1 - output))
    ddXE /= T.shape(ddXE)[0]
    ddXE = T.cast(ddXE, dtype=theano.config.floatX)

    if act in ['sigmoid', 'tanh', 'tanhnorm', 'abstanh', 'abstanhnorm']:
        if act == 'sigmoid':
            return sigmoid_cross_entropy(target, output_act, ddXE)
        if act == 'tanh':
            return tanh_cross_entropy(target, output_act, ddXE)
        if act == 'tanhnorm':
            return tanhnorm_cross_entropy(target, output_act, ddXE)
        if act == 'abstanh':
            return abstanh_cross_entropy(target, output_act, ddXE)
        if act == 'abstanhnorm':
            return abstanhnorm_cross_entropy(target, output_act, ddXE)
    else:
        XE = target * T.log(output) + (1 - target) * T.log(1 - output)
        return [[-T.mean(T.sum(XE, axis=1), axis=0)], ddXE]
def init_param_updates(self, layer, parameter):
    epoch = self.variables.epoch
    step = self.variables.step
    beta1 = self.beta1
    beta2 = self.beta2

    parameter_shape = T.shape(parameter).eval()
    prev_first_moment = theano.shared(
        name="{}/prev-first-moment".format(parameter.name),
        value=asfloat(np.zeros(parameter_shape)),
    )
    prev_weighted_inf_norm = theano.shared(
        name="{}/prev-weighted-inf-norm".format(parameter.name),
        value=asfloat(np.zeros(parameter_shape)),
    )

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
    weighted_inf_norm = T.maximum(beta2 * prev_weighted_inf_norm, T.abs_(gradient))

    parameter_delta = (
        (1 / (1 - beta1 ** epoch)) *
        (first_moment / (weighted_inf_norm + self.epsilon))
    )

    return [
        (prev_first_moment, first_moment),
        (prev_weighted_inf_norm, weighted_inf_norm),
        (parameter, parameter - step * parameter_delta),
    ]
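# Reference step (added, not in the original source): a numpy mirror of one Adamax-style
# update as built symbolically above, useful for sanity-checking the graph. The constants
# chosen here are illustrative only.
import numpy as np

beta1, beta2, step, epsilon, epoch = 0.9, 0.999, 0.001, 1e-8, 1
gradient = np.array([0.1, -0.2])
prev_first_moment = np.zeros(2)
prev_weighted_inf_norm = np.zeros(2)

first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
weighted_inf_norm = np.maximum(beta2 * prev_weighted_inf_norm, np.abs(gradient))
parameter_delta = (1 / (1 - beta1 ** epoch)) * (first_moment / (weighted_inf_norm + epsilon))
# parameter_new = parameter - step * parameter_delta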
def binarization(W, H, binary=True, deterministic=False, stochastic=False, srng=None):
    # (deterministic == True) <-> test-time <-> inference-time
    if not binary or (deterministic and stochastic):
        # print("not binary")
        Wb = W
    else:
        # [-1,1] -> [0,1]
        Wb = hard_sigmoid(W / H)
        # Wb = T.clip(W/H, -1, 1)

        # Stochastic BinaryConnect
        if stochastic:
            # print("stoch")
            Wb = T.cast(srng.binomial(n=1, p=Wb, size=T.shape(Wb)),
                        theano.config.floatX)
        # Deterministic BinaryConnect (round to nearest)
        else:
            # print("det")
            Wb = T.round(Wb)

        # 0 or 1 -> -1 or 1
        Wb = T.cast(T.switch(Wb, H, -H), theano.config.floatX)

    return Wb
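# Deterministic-branch check (added, not in the original source): assuming
# hard_sigmoid(x) = clip((x + 1) / 2, 0, 1) as in BinaryConnect (an assumption --
# hard_sigmoid is defined elsewhere in the source), the deterministic path reduces to a
# sign function scaled by H. A numpy mirror:
import numpy as np

H = 1.0
W = np.array([-0.7, -0.1, 0.3, 2.5])
Wb = np.clip((W / H + 1.) / 2., 0., 1.)   # hard_sigmoid
Wb = np.round(Wb)                          # deterministic rounding
Wb = np.where(Wb, H, -H)                   # 0/1 -> -H/+H
assert np.allclose(Wb, [-1., -1., 1., 1.])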
def predict(self, input):
    # input is an array of vectors (2D np.array)
    self.input = input
    padw = int(self.window / 2)
    if padw > 0:
        padding = np.asarray(
            [np.zeros((self.dim_in,), dtype=theano.config.floatX)] * padw)
        inp = T.concatenate((padding, input, padding), axis=0)
    else:
        inp = self.input

    seq = T.arange(T.shape(inp)[0] - self.window + 1)
    self.input, _ = theano.scan(
        lambda v: inp[v:v + self.window].flatten(), sequences=seq)

    # initialize the gates
    out = theano.shared(numpy.zeros((self.dim_out,), dtype=theano.config.floatX))

    # gate computations
    def rnn_step(x, h_prev):
        if self.use_bias:
            out = T.nnet.sigmoid(T.dot(x, self.Wx) + T.dot(h_prev, self.Wh) + self.b)
        else:
            out = T.nnet.sigmoid(T.dot(x, self.Wx) + T.dot(h_prev, self.Wh))
        return out

    self.output, _ = theano.scan(fn=rnn_step,
                                 sequences=dict(input=self.input, taps=[0]),
                                 outputs_info=[out])

    if self.use_last_output:
        self.output = self.output[-1]
    if self.pooling is not None:
        self.output = self.pooling(self.output)
    return self.output
def get_cost_updates(self, corruption_level, learning_rate, cost_function_name):
    """ This function computes the cost and the updates for one training step of the dA """
    # print str(self.activation)
    tilde_x = self.get_corrupted_input(self.x, corruption_level)
    # print self.activation
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)

    cost = None
    if cost_function_name == 'cross_entropy':
        # print 'cross_entropy..'
        L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        cost = T.mean(L)  # sum over all units, then average over the batch
    if cost_function_name == 'sqr_error':
        L = (T.sum(T.square(T.abs_(self.x - z)) / 2., axis=0)) \
            / T.cast(T.shape(self.x)[0], 'float32')
        # theano.printing.debugprint(obj=cost, print_type=True)
        # printdebug.debugprint(cost)
        cost = T.mean(L)
        T.cast(cost, 'float32')
    # print cost

    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # generate the list of updates
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(self.params, gparams)
    ]

    return (cost, updates)
def bbprop(self):
    self.lin_bbprop = self.p_y_given_x - self.p_y_given_x * self.p_y_given_x
    self.lin_bbprop /= T.shape(self.p_y_given_x)[0]
    self.dict_bbprop = {}
    self.dict_bbprop.update({self.b_upmask: T.sum(self.lin_bbprop, 0)})
    self.dict_bbprop.update({self.W_upmask: T.dot(T.transpose(self.inp * self.inp),
                                                  self.lin_bbprop)})
    return T.dot(self.lin_bbprop, T.transpose(self.W * self.W)), self.dict_bbprop
def hessian(objective, argument):
    """
    Compute the directional derivative of the gradient
    (which is equal to the hessian multiplied by the direction).
    """
    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same dimensionality)
    # as argument.
    A = argument.type()

    try:
        # First attempt the efficient 'R-op': this directly calculates the
        # directional derivative of the gradient, rather than explicitly
        # calculating the hessian and then multiplying.
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        shp = T.shape(argument)
        H = T.jacobian(g.flatten(), argument).reshape(
            T.concatenate([shp, shp]), 2 * A.ndim)
        R = T.tensordot(H, A, A.ndim)

    try:
        hess = theano.function([argument, A], R, on_unused_input='raise')
    except theano.compile.UnusedInputError:
        warn('Theano detected unused input - suggests hessian may be zero or '
             'constant.')
        hess = theano.function([argument, A], R, on_unused_input='ignore')

    return hess
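# Usage sketch (added, not in the original source): for f(x) = sum(x**2) the Hessian is
# 2*I, so the directional derivative of the gradient along v is 2*v. This assumes a
# working Theano installation and exercises the R-op branch above.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
f = T.sum(x ** 2)
hess_vec = hessian(f, x)
x0 = np.array([1., 2., 3.], dtype=theano.config.floatX)
v0 = np.array([1., 0., 0.], dtype=theano.config.floatX)
assert np.allclose(hess_vec(x0, v0), 2 * v0)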
def jacobian_mul_vector_l_flat(y, x, W, v, x_val, W_val, v_val):
    J = theano.gradient.jacobian(y, x)
    J_flat = T.flatten(J, J.ndim - 1)  # the jacobian result on the flattened matrix x
    VJ = v.dot(J_flat)
    VJ_reshape = T.reshape(VJ, T.shape(x))
    f_VJ = theano.function([x, W, v], VJ_reshape)
    return f_VJ(x_val, W_val, v_val)
def forward(self, ec_H, ec_C, mask):
    (sens_size, batch_size) = T.shape(mask)

    def step(m, prev_Y, prev_H, prev_C):
        """Forward a time step of the decoder."""
        # LSTM forward time step
        (H, C) = self.lstm.step(prev_Y, m, prev_H, prev_C)
        # LSTM output
        O = self.lstm_output.forward(H)
        # Apply softmax to LSTM output
        P = self.softmax.forward(O)
        # Make prediction
        one_hot_Y = T.argmax(P, axis=1)
        # Feed the output to the next time step
        Y = self.embedding.forward(one_hot_Y)
        # FIXME: Deal with differing lengths?
        return (P, Y, H, C)

    results, updates = theano.scan(
        fn=step,
        sequences=[mask],
        outputs_info=[
            None,
            dict(initial=T.zeros((batch_size, self.embedding_size)), taps=[-1]),
            dict(initial=ec_H, taps=[-1]),
            dict(initial=ec_C, taps=[-1])
        ]
    )

    # return np.swapaxes(results[0], 0, 1)
    # returns the softmax probabilities
    return results[0]
def init_prev_delta(self, parameter):
    parameter_shape = T.shape(parameter).eval()
    self.prev_delta = theano.shared(
        name="{}/prev-delta".format(parameter.name),
        value=asfloat(np.zeros(parameter_shape)),
    )
    return self.prev_delta
def get_ranks(self):
    p_ent1 = self.params['TransE_E'][self.px[:, 0]]
    p_ent2 = self.params['TransE_E'][self.px[:, 1]]

    def get_rank(i):
        ent1 = self.params['TransE_E'][self.px[i][0]]
        ent2 = self.params['TransE_E'][self.px[i][1]]
        rel = self.params['TransE_R'][self.py[i]]

        # loss against all relations
        all_trans_disvec = ent1 + self.params['TransE_R'] - ent2
        all_transE_loss = T.batched_dot(all_trans_disvec, all_trans_disvec)
        # all_men_disvec = self.params['TransE_R'] - self.cnn_output[i]
        all_relcnn_loss = -T.dot(self.params['TransE_R'], self.cnn_output[i])
        all_loss = self.trans * all_transE_loss + self.theta * all_relcnn_loss

        # ground-truth relation loss
        gt_trans_disvec = ent1 + rel - ent2
        gt_transE_loss = T.dot(gt_trans_disvec, gt_trans_disvec)
        # gt_men_disvec = rel - self.cnn_output[i]
        gt_relcnn_loss = -T.dot(rel, self.cnn_output[i])
        gt_loss = self.trans * gt_transE_loss + self.theta * gt_relcnn_loss

        return T.sum(all_loss < gt_loss)

    res, _ = theano.scan(
        fn=get_rank,
        outputs_info=None,
        sequences=[T.arange(T.shape(self.py)[0])]
    )
    return T.mean(res)