def _get_targets(y, log_y_hat, y_mask, y_hat_mask):
    '''
    Returns the target values according to the CTC cost with respect to y_hat.
    Note that this is part of the gradient with respect to the softmax
    output and not with respect to the input of the original softmax function.
    All computations are done in log scale
    '''
    num_classes = log_y_hat.shape[2] - 1
    blanked_y, blanked_y_mask = _add_blanks(
        y=y,
        blank_symbol=num_classes,
        y_mask=y_mask)

    log_alpha, log_beta = _log_forward_backward(blanked_y, log_y_hat,
                                                blanked_y_mask, y_hat_mask,
                                                num_classes)
    # explicitly not using a mask to prevent inf - inf
    y_prob = _class_batch_to_labeling_batch(blanked_y, log_y_hat,
                                            y_hat_mask=None)
    marginals = log_alpha + log_beta - y_prob
    max_marg = marginals.max(2)
    max_marg = T.switch(T.le(max_marg, -np.inf), 0, max_marg)
    log_Z = T.log(T.exp(marginals - max_marg[:, :, None]).sum(2))
    log_Z = log_Z + max_marg
    log_Z = T.switch(T.le(log_Z, -np.inf), 0, log_Z)
    targets = _labeling_batch_to_class_batch(blanked_y,
                                             T.exp(marginals - log_Z[:, :, None]),
                                             num_classes + 1)
    return targets

def get_rain_level(vals):
    return TT.switch(
        TT.le(vals, 0.1), 0,
        TT.switch(
            TT.le(vals, 2.5), 1,
            TT.switch(TT.le(vals, 8.0), 2,
                      TT.switch(TT.le(vals, 16.0), 3, 4))))

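# A minimal usage sketch (not part of the original source): it assumes the snippet
# above is in scope with `theano.tensor` aliased as TT, and simply compiles the
# nested-switch thresholding into a callable function.
import numpy as np
import theano
import theano.tensor as TT

vals = TT.fvector('rain_mm')
level_fn = theano.function([vals], get_rain_level(vals))
print(level_fn(np.array([0.0, 1.0, 5.0, 12.0, 30.0], dtype=np.float32)))
# expected levels: 0, 1, 2, 3, 4
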
def get_output_for(self, input, **kwargs):
    if self.tied_feamap:
        return input * T.gt(input, 0) + input * T.le(input, 0) \
            * T.shape_padleft(T.shape_padright(self.W[seg], n_ones=len(input_dim) - 2))
    else:
        return input * T.gt(input, 0) + input * T.le(input, 0) \
            * T.shape_padleft(self.W)

def depth(r, b):
    # depth = 1 - s0 / pi; where s0 is from Agol+
    b = tt.abs_(b)
    r = tt.abs_(r)
    b2 = b ** 2
    r2 = r ** 2
    opr = 1 + r
    omr = 1 - r
    rmo = r - 1

    # Case 2
    a = kite_area(r, b)
    twoa = 2 * a
    k0 = tt.arctan2(twoa, rmo * opr + b2)
    k1 = tt.arctan2(twoa, omr * opr + b2)
    case2 = (k1 + r2 * k0 - a) / np.pi

    return tt.switch(
        tt.le(opr, b),
        tt.zeros_like(r),
        tt.switch(
            tt.and_(tt.lt(tt.abs_(omr), b), tt.lt(b, opr)),
            case2,
            tt.switch(tt.le(b, omr), r2, tt.ones_like(r)),
        ),
    )

def sticky_ALB(o, t, o2, v, alpha_0, beta_0, alpha_1, beta_1, d, tau_p, tau_n,
               unchosen_p, b, stickiness):

    b = 1. / b  # Convert inverse temperature to temperature

    # Implements choice stickiness
    unchosen_0 = T.switch(T.le(v, 0.5), 1, unchosen_p)
    unchosen_1 = T.switch(T.gt(v, 0.5), 1, unchosen_p)

    # Only update if outcome isn't missing
    alpha_0 = T.switch(T.ge(o, 0), (1 - d) * alpha_0 + (o * tau_p * unchosen_0), alpha_0)
    beta_0 = T.switch(T.ge(o, 0), (1 - d) * beta_0 + ((1 - o) * tau_n * unchosen_0), beta_0)
    alpha_1 = T.switch(T.ge(o2, 0), (1 - d) * alpha_1 + (o2 * tau_p * unchosen_1), alpha_1)
    beta_1 = T.switch(T.ge(o2, 0), (1 - d) * beta_1 + ((1 - o2) * tau_n * unchosen_1), beta_1)

    value_0 = alpha_0 / (alpha_0 + beta_0)
    value_1 = alpha_1 / (alpha_1 + beta_1)

    value_0 = T.switch(T.le(v, 0.5), T.pow(value_0, stickiness), value_0)
    value_1 = T.switch(T.gt(v, 0.5), T.pow(value_1, stickiness), value_1)

    value = ((value_0 - value_1) + 1) / 2.

    var_0 = (alpha_0 * beta_0) / (T.pow(alpha_0 + beta_0, 2) * (alpha_0 + beta_0 + 1))
    var_1 = (alpha_1 * beta_1) / (T.pow(alpha_1 + beta_1, 2) * (alpha_1 + beta_1 + 1))

    value = np.exp(b * value) / (np.exp(b * value) + np.exp(b * (1 - value)))

    return (value, alpha_0, beta_0, alpha_1, beta_1, var_0, var_1, value_0, value_1, o, o2)

def prepareTraining(self):
    '''
    Prepares the relevant functions
    (details on neural_net_creator's prepareTraining)
    '''
    # loss objective to minimize
    self.prediction = lasagne.layers.get_output(self.network)
    self.prediction = self.prediction[:, 0]
    # self.loss = lasagne.objectives.categorical_crossentropy(self.prediction, self.target_var)
    # the loss is now the squared error in the output
    self.loss = lasagne.objectives.squared_error(self.prediction, self.target_var)
    self.loss = self.loss.mean()

    self.params = lasagne.layers.get_all_params(self.network, trainable=True)
    self.updates = lasagne.updates.nesterov_momentum(
        self.loss, self.params, learning_rate=0.01, momentum=0.9)

    self.test_prediction = lasagne.layers.get_output(self.network, deterministic=True)
    self.test_prediction = self.test_prediction[:, 0]
    self.test_loss = lasagne.objectives.squared_error(self.test_prediction, self.target_var)
    self.test_loss = self.test_loss.mean()
    # the accuracy is now the number of samples that achieve a 0.01 precision (can be changed)
    self.test_acc = T.mean(T.le(T.abs_(T.sub(self.test_prediction, self.target_var)), 0.01),
                           dtype=theano.config.floatX)
    self.test_acc2 = T.mean(T.le(T.abs_(T.sub(self.test_prediction, self.target_var)), 0.05),
                            dtype=theano.config.floatX)
    self.test_acc3 = T.mean(T.le(T.abs_(T.sub(self.test_prediction, self.target_var)), 0.1),
                            dtype=theano.config.floatX)

    self.train_fn = theano.function([self.input_var, self.target_var], self.loss,
                                    updates=self.updates)

    self.val_fn = theano.function([self.input_var, self.target_var],
                                  [self.test_loss, self.test_acc, self.test_acc2, self.test_acc3])

    self.use = theano.function([self.input_var], [self.test_prediction])

def init(self, input, input_shape, posslope, negslope):
    self.input = input
    # assert(posslope[-1] == "Constant")  # learning slope not yet implemented
    # assert(negslope[-1] == "Constant")
    if negslope[-1] == "Constant":
        self.output = T.ge(input, 0.) * input * posslope[2] + T.lt(
            input, 0.) * input * negslope[2]
        self.input_shape = input_shape
        self.output_shape = input_shape
        self.params = []
    elif negslope[-1] == "Learn":
        self.pos_slope_sym = theano.shared(
            posslope[2] * np.ones([input_shape[0], 1], dtype=theano.config.floatX),
            borrow=True,
            broadcastable=(False, True))
        self.neg_slope_sym = theano.shared(
            negslope[2] * np.ones([input_shape[0], 1], dtype=theano.config.floatX),
            borrow=True,
            broadcastable=(False, True))
        self.output = T.ge(input, 0) * input * T.ge(
            self.pos_slope_sym, posslope[0]) * T.le(
                self.pos_slope_sym, posslope[1]
            ) * self.pos_slope_sym + T.lt(input, 0) * input * T.ge(
                self.neg_slope_sym, negslope[0]) * T.le(
                    self.neg_slope_sym, negslope[1]) * self.neg_slope_sym
        self.params = [self.pos_slope_sym, self.neg_slope_sym]
        self.input_shape = input_shape
        self.output_shape = input_shape
    else:
        assert (False)

def get_targets(y, log_y_hat, y_mask, y_hat_mask):
    """
    Returns the target values according to the CTC cost with respect to y_hat.
    Note that this is part of the gradient with respect to the softmax
    output and not with respect to the input of the original softmax function.
    All computations are done in log scale
    """
    # log_y_hat is input_seq_len x num_batch x num_classes + 1
    num_classes = log_y_hat.shape[2] - 1
    blanked_y, blanked_y_mask = _add_blanks(y=y,
                                            blank_symbol=num_classes,
                                            y_mask=y_mask)

    log_alpha, log_beta = _log_forward_backward(blanked_y, log_y_hat,
                                                blanked_y_mask, y_hat_mask,
                                                num_classes)
    # explicitly not using a mask to prevent inf - inf
    y_prob = _class_batch_to_labeling_batch(blanked_y, log_y_hat,
                                            y_hat_mask=None)
    marginals = log_alpha + log_beta - y_prob
    max_marg = marginals.max(2)
    max_marg = T.switch(T.le(max_marg, -numpy.inf), 0, max_marg)
    log_Z = T.log(T.exp(marginals - max_marg[:, :, None]).sum(2))
    log_Z = log_Z + max_marg
    log_Z = T.switch(T.le(log_Z, -numpy.inf), 0, log_Z)
    targets = _labeling_batch_to_class_batch(
        blanked_y,
        T.exp(marginals - log_Z[:, :, None]),
        num_classes + 1)
    return targets

def call(self, X):
    if type(X) is not list or len(X) != 2:
        raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))

    frame, position = X[0], X[1]

    # Reshaping the input to exclude the time dimension
    frameShape = K.shape(frame)
    positionShape = K.shape(position)
    (chans, height, width) = frameShape[-3:]
    targetDim = positionShape[-1]
    frame = K.reshape(frame, (-1, chans, height, width))
    position = K.reshape(position, (-1, ) + (targetDim, ))

    # Applying the attention
    hw = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
    hh = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0

    position = THT.maximum(THT.set_subtensor(position[:, 0], position[:, 0] - hw), -1.0)
    position = THT.minimum(THT.set_subtensor(position[:, 2], position[:, 2] + hw), 1.0)
    position = THT.maximum(THT.set_subtensor(position[:, 1], position[:, 1] - hh), -1.0)
    position = THT.minimum(THT.set_subtensor(position[:, 3], position[:, 3] + hh), 1.0)

    rX = Data.linspace(-1.0, 1.0, width)
    rY = Data.linspace(-1.0, 1.0, height)

    FX = THT.gt(rX, position[:, 0].dimshuffle(0, 'x')) * THT.le(rX, position[:, 2].dimshuffle(0, 'x'))
    FY = THT.gt(rY, position[:, 1].dimshuffle(0, 'x')) * THT.le(rY, position[:, 3].dimshuffle(0, 'x'))

    m = FY.dimshuffle(0, 1, 'x') * FX.dimshuffle(0, 'x', 1)
    m = m + self.alpha - THT.gt(m, 0.) * self.alpha

    frame = frame * m.dimshuffle(0, 'x', 1, 2)

    # Reshaping the frame to include time dimension
    output = K.reshape(frame, frameShape)

    return output

def depth_grad(r, b):
    # depth = 1 - s0 / pi; where s0 is from Agol+
    b = tt.abs_(b)
    r = tt.abs_(r)
    b2 = b**2
    opr = 1 + r
    omr = 1 - r
    rmo = r - 1

    # Case 2
    a = kite_area(r, b)
    twor = 2 * r
    twoa = 2 * a
    k0 = tt.arctan2(twoa, rmo * opr + b2)
    dr = twor * k0 / np.pi
    db = -twoa / (b * np.pi)

    zero = tt.zeros_like(r)
    return (
        tt.switch(
            tt.le(opr, b),
            zero,
            tt.switch(
                tt.and_(tt.lt(tt.abs_(omr), b), tt.lt(b, opr)),
                dr,
                tt.switch(tt.le(b, omr), twor, zero),
            ),
        ),
        tt.switch(
            tt.le(opr, b),
            zero,
            tt.switch(tt.and_(tt.lt(tt.abs_(omr), b), tt.lt(b, opr)), db, zero),
        ),
    )

def inner(x, A, b, u, l, L, eps):
    gradient = T.dot(A, x) + b
    x2 = x - gradient / L
    x2 = T.switch(T.le(x2, l), l, x2)
    x2 = T.switch(T.ge(x2, u), u, x2)
    d = T.max(T.abs_(x - x2))
    condition = T.le(T.max(d), eps)
    return (x2, d), theano.scan_module.until(condition)

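# A hedged usage sketch (not from the original source): the step above looks like one
# iteration of projected gradient descent for a box-constrained quadratic
# 0.5*x'Ax + b'x with bounds [l, u] and step size 1/L, so it can be driven by
# theano.scan until the `until` condition fires.  The variable names and n_steps
# below are illustrative assumptions only.
import numpy as np
import theano
import theano.tensor as T

A = T.fmatrix('A')
b = T.fvector('b')
u, l, L, eps = T.fscalar('u'), T.fscalar('l'), T.fscalar('L'), T.fscalar('eps')
x0 = T.fvector('x0')

(xs, ds), _ = theano.scan(inner,
                          outputs_info=[x0, None],
                          non_sequences=[A, b, u, l, L, eps],
                          n_steps=500)
solve = theano.function([x0, A, b, u, l, L, eps], xs[-1])
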
def OneStep(alpha, b):
    # minimize alpha
    alpha_new = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
    # minimize b
    tmp_new = T.clip(W / alpha_new, -1., 1.)
    b_new = T.switch(
        T.ge(tmp_new, pow(2, -n)),
        T.pow(2, round3(T.log2(tmp_new) - 0.0849625)),
        T.switch(
            T.le(tmp_new, -pow(2, -n)),
            -T.pow(2, round3(T.log2(-tmp_new) - 0.0849625)),
            0.))
    b_new = T.switch(
        T.ge(b_new, pow(2, -(n - 1))), b_new,
        T.switch(T.le(b_new, -pow(2, -(n - 1))), b_new,
                 T.sgn(b_new) * pow(2, -(n - 1))))
    delta = T.abs_(alpha_new - alpha)
    condition = T.lt(delta, 1e-6)
    return [alpha_new, b_new], theano.scan_module.until(condition)

def apply_mean_stress_theory(m_s_th, sm, rng, sn_0, r_m, r_y):
    rng = ifelse(
        tt.eq(1, m_s_th),
        ifelse(tt.lt(0, sm), rng / (1 - (sm / r_m)),
               ifelse(tt.le(r_m, tt.abs_(sm)), 1.01 * sn_0, rng)),
        ifelse(
            tt.eq(2, m_s_th),
            ifelse(tt.lt(tt.abs_(sm), r_m), rng / (1 - (sm / r_m)**2),
                   ifelse(tt.le(r_m, sm), 1.01 * sn_0, rng)),
            ifelse(
                tt.eq(3, m_s_th),
                ifelse(
                    tt.lt(0, sm) & tt.lt(sm, r_y), rng / (1 - (sm / r_y)),
                    ifelse(tt.le(r_y, tt.abs_(sm)), 1.01 * sn_0, rng)),
                rng)))
    return rng

def calc_time_gate(time_input_n):
    # Broadcast the time across all units
    t_broadcast = time_input_n.dimshuffle([0, 'x'])
    # Get the time within the period
    in_cycle_time = T.mod(t_broadcast + shift_broadcast, period_broadcast)
    # Find the phase
    is_up_phase = T.le(in_cycle_time, on_mid_broadcast)
    is_down_phase = T.gt(in_cycle_time, on_mid_broadcast) * T.le(in_cycle_time, on_end_broadcast)
    # Set the mask
    sleep_wake_mask = T.switch(is_up_phase, in_cycle_time / on_mid_broadcast,
                               T.switch(is_down_phase,
                                        (on_end_broadcast - in_cycle_time) / on_mid_broadcast,
                                        off_slope * (in_cycle_time / period_broadcast)))
    return sleep_wake_mask

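# A hedged, self-contained sketch (not from the original source) of the same time-gate
# idea used in Phased-LSTM-style gating: each unit has a period, a phase shift, and an
# "open" fraction; the gate ramps up, ramps down, and otherwise leaks with a small
# slope.  The closure variables of the function above (period_broadcast, etc.) are
# assumed to hold exactly these quantities; the helper name `time_gate_demo` and the
# concrete values below are illustrative assumptions only.
import numpy as np
import theano
import theano.tensor as T

def time_gate_demo(t, period, shift, on_frac, off_slope=1e-3):
    # t: (batch,) timestamps; period/shift/on_frac: (num_units,) per-unit parameters
    t_b = t.dimshuffle([0, 'x'])
    in_cycle = T.mod(t_b + shift, period)
    on_mid = 0.5 * on_frac * period
    on_end = on_frac * period
    up = T.le(in_cycle, on_mid)
    down = T.gt(in_cycle, on_mid) * T.le(in_cycle, on_end)
    return T.switch(up, in_cycle / on_mid,
                    T.switch(down, (on_end - in_cycle) / on_mid,
                             off_slope * (in_cycle / period)))

t = T.fvector('t')
gate = theano.function([t], time_gate_demo(
    t,
    period=np.array([10.0, 20.0], dtype=np.float32),
    shift=np.array([0.0, 5.0], dtype=np.float32),
    on_frac=np.array([0.2, 0.2], dtype=np.float32)))
print(gate(np.arange(5, dtype=np.float32)))
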
def gate_layer(tparams, X_word, X_char, options, prefix, pretrain_mode, activ='lambda x: x', **kwargs):
    """
    compute the forward pass for a gate layer

    Parameters
    ----------
    tparams       : OrderedDict of theano shared variables, {parameter name: value}
    X_word        : theano 3d tensor, word input, dimensions: (num of time steps, batch size, dim of vector)
    X_char        : theano 3d tensor, char input, dimensions: (num of time steps, batch size, dim of vector)
    options       : dictionary, {hyperparameter: value}
    prefix        : string, layer name
    pretrain_mode : theano shared scalar, 0. = word only, 1. = char only, 2. = word & char
    activ         : string, activation function: 'linear', 'tanh', or 'rectifier'

    Returns
    -------
    X             : theano 3d tensor, final vector, dimensions: (num of time steps, batch size, dim of vector)
    """
    # compute gating values, Eq.(3)
    G = tensor.nnet.sigmoid(
        tensor.dot(X_word, tparams[p_name(prefix, 'v')]) + tparams[p_name(prefix, 'b')][0])
    X = ifelse(
        tensor.le(pretrain_mode, numpy.float32(1.)),
        ifelse(tensor.eq(pretrain_mode, numpy.float32(0.)), X_word, X_char),
        G[:, :, None] * X_char + (1. - G)[:, :, None] * X_word)
    return eval(activ)(X)

def concat_layer(tparams, X_word, X_char, options, prefix, pretrain_mode, activ='lambda x: x', **kwargs):
    """
    compute the forward pass for a concat layer

    Parameters
    ----------
    tparams       : OrderedDict of theano shared variables, {parameter name: value}
    X_word        : theano 3d tensor, word input, dimensions: (num of time steps, batch size, dim of vector)
    X_char        : theano 3d tensor, char input, dimensions: (num of time steps, batch size, dim of vector)
    options       : dictionary, {hyperparameter: value}
    prefix        : string, layer name
    pretrain_mode : theano shared scalar, 0. = word only, 1. = char only, 2. = word & char
    activ         : string, activation function: 'linear', 'tanh', or 'rectifier'

    Returns
    -------
    X             : theano 3d tensor, final vector, dimensions: (num of time steps, batch size, dim of vector)
    """
    X = ifelse(
        tensor.le(pretrain_mode, numpy.float32(1.)),
        ifelse(tensor.eq(pretrain_mode, numpy.float32(0.)), X_word, X_char),
        tensor.dot(tensor.concatenate([X_word, X_char], axis=2),
                   tparams[p_name(prefix, 'W')]) + tparams[p_name(prefix, 'b')])
    return eval(activ)(X)

def __init__(self, input, sigma=20.0, window_radius=60):
    self.input = input
    self.sigma = theano.shared(value=np.array(sigma, dtype=theano.config.floatX), name='sigma')

    apply_blur = T.gt(self.sigma, 0.0)
    no_blur = T.le(self.sigma, 0.0)

    self.output = ifelse(no_blur, input,
                         gaussian_filter(input.dimshuffle('x', 0, 1), self.sigma,
                                         window_radius)[0, :, :])

    self.params = [self.sigma]

def logp_loss3(self, x, y, fake_label, neg_label, pos_ratio=0.5):
    # adopt maxout for negative
    # pos_ratio means pos examples weight (0.5 means equal 1:1)
    print "adopt positives weight ............. " + str(pos_ratio)
    y = y.dimshuffle((1, 0))
    inx = x.dimshuffle((1, 0))
    fake_mask = T.neq(y, fake_label)
    y = y * fake_mask
    pos_mask = T.and_(fake_mask, T.le(y, neg_label - 1)) * pos_ratio
    neg_mask = T.ge(y, neg_label) * (1 - pos_ratio)

    pos_score, neg_score = self.structure2(inx, False)
    maxneg = T.max(neg_score, axis=-1)

    scores = T.concatenate((pos_score, maxneg.dimshuffle((0, 1, 'x'))), axis=2)
    d3shape = scores.shape

    # seq * batch, label
    scores = scores.reshape((d3shape[0] * d3shape[1], d3shape[2]))
    pro = T.nnet.softmax(scores)

    _logp = T.nnet.categorical_crossentropy(pro, y.flatten())
    _logp = _logp.reshape(fake_mask.shape)

    loss = (T.sum(_logp * pos_mask) + T.sum(_logp * neg_mask)) / (T.sum(pos_mask) + T.sum(neg_mask))
    pos_loss = T.sum(_logp * pos_mask)
    neg_loss = T.sum(_logp * neg_mask)

    return loss, pos_loss, neg_loss

def tied_neighbours(preds, n_sample_preds, n_classes):
    eps = 1e-8
    # preds = T.clip(preds, eps, 1-eps)
    preds_per_trial_row = preds.reshape((-1, n_sample_preds, n_classes))
    earlier_neighbours = preds_per_trial_row[:, :-1]
    later_neighbours = preds_per_trial_row[:, 1:]
    # Have to now ensure first values are larger zero
    # for numerical stability :/
    # Example of problem otherwise:
    """
    a = T.fmatrix()
    b = T.fmatrix()
    soft_out_a = softmax(a)
    soft_out_b = softmax(b)
    loss = categorical_crossentropy(soft_out_a[:, 1:], soft_out_b[:, :-1])
    neigh_fn = theano.function([a, b], loss)
    neigh_fn(np.array([[0, 1000, 0]], dtype=np.float32),
             np.array([[0.1, 0.9, 0.3]], dtype=np.float32))
    -> inf
    """
    # renormalize(?)
    earlier_neighbours = (T.gt(earlier_neighbours, eps) * earlier_neighbours +
                          T.le(earlier_neighbours, eps) * earlier_neighbours + eps)
    loss = categorical_crossentropy(earlier_neighbours, later_neighbours)
    return loss

def compile(self):
    # 1D: n_words, 2D: batch * n_cands
    self.x = T.imatrix()
    self.y = T.fvector()

    self.train_inputs = [self.x, self.y]
    self.pred_inputs = [self.x]

    self.activation = self.args.activation
    self.n_d = self.args.hidden_dim
    self.n_e = self.emb_layers[0].n_d
    self.pad_id = self.emb_layers[0].vocab_map[PAD]
    self.dropout = theano.shared(
        np.float32(self.args.dropout).astype(theano.config.floatX))

    self._set_layers(args=self.args, n_d=self.n_d, n_e=self.n_e)

    ###########
    # Network #
    ###########
    h_in = self._input_layer(x=self.x)
    h = self._mid_layer(h_prev=h_in, x=self.x, pad_id=self.pad_id)
    y_scores = self._output_layer(h=h)
    self.y_pred = T.le(0.5, y_scores)

    #########################
    # Set an objective func #
    #########################
    self.set_params(layers=self.layers)
    self.loss = self.set_loss(self.y, y_scores)
    self.cost = self.set_cost(args=self.args, params=self.params, loss=self.loss)

def each_loss(outpt, inpt):
    # y is the answer sequence after padding with blanks
    blank = 26
    y_nblank = T.neq(inpt, blank)
    n = T.dot(y_nblank, y_nblank)  # true label length
    N = 2 * n + 1  # label length after padding, excluding the extra trailing padding
    labels = inpt[:N]
    labels2 = T.concatenate((labels, [blank, blank]))
    sec_diag = T.neq(labels2[:-2], labels2[2:]) * T.eq(labels2[1:-1], blank)
    recurrence_relation = \
        T.eye(N) + \
        T.eye(N, k=1) + \
        T.eye(N, k=2) * sec_diag.dimshuffle((0, 'x'))

    pred_y = outpt[:, labels]

    fwd_pbblts, _ = theano.scan(
        lambda curr, accum: T.switch(T.eq(curr * T.dot(accum, recurrence_relation), 0.0),
                                     T.dot(accum, recurrence_relation),
                                     curr * T.dot(accum, recurrence_relation)),
        sequences=[pred_y],
        outputs_info=[T.eye(N)[0]]
    )
    # return fwd_pbblts
    # liklihood = fwd_pbblts[0, 0]
    liklihood = fwd_pbblts[-1, -1] + fwd_pbblts[-1, -2]
    # liklihood = T.switch(T.lt(liklihood, 1e-35), 1e-35, liklihood)
    # loss = -T.log(T.cast(liklihood, "float32"))
    # loss = 10 * (liklihood - 1) * (liklihood - 100)
    loss = (T.le(liklihood, 1.0) * (10 * (liklihood - 1) * (liklihood - 100))) + \
           (T.gt(liklihood, 1.0) * (-T.log(T.cast(liklihood, "float32"))))
    return loss

def compare_max(l2_norm, coding_dist):
    result, updates = theano.scan(
        lambda i, x: T.switch(T.le(x[i], T.constant(1e-12)), T.constant(1e-12), x[i]),
        outputs_info=None,
        sequences=T.arange(coding_dist.shape[0]),
        non_sequences=[l2_norm])
    return result

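# A hedged aside (not from the original source): the scan above just floors each of the
# first coding_dist.shape[0] entries of `l2_norm` at 1e-12.  The sketch below builds the
# same clamp with a single elementwise op, avoiding the scan overhead; it assumes
# `l2_norm` and `coding_dist` share their leading dimension.
import numpy as np
import theano
import theano.tensor as T

l2_norm = T.fvector('l2_norm')
clipped = T.maximum(l2_norm, np.float32(1e-12))
clip_fn = theano.function([l2_norm], clipped)
print(clip_fn(np.array([0.5, 0.0, 1e-20], dtype=np.float32)))  # -> [0.5, 1e-12, 1e-12]
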
def ALB_softmax_health_weighting(o, t, o2, health, v, alpha_0, beta_0, alpha_1, beta_1,
                                 d, tau_p, tau_n, unchosen_p, b, tau_p_w, tau_n_w, decay_w):
    # Without variance weighting
    b = 1. / b  # Convert inverse temperature to temperature

    unchosen_0 = T.switch(T.le(v, 0.5), 1, unchosen_p)
    unchosen_1 = T.switch(T.gt(v, 0.5), 1, unchosen_p)

    health = T.switch(T.lt(health, 0), 0, health)

    tau_p = T.switch(T.ge(tau_p, 0), tau_p * (1 - tau_p_w * health), tau_p * (1 - (1 + tau_p_w * health)))
    tau_n = T.switch(T.ge(tau_n, 0), tau_n * (1 - tau_n_w * health), tau_n * (1 - (1 + tau_n_w * health)))
    d = T.switch(T.ge(tau_p, 0), d * (1 - decay_w * health), d * (1 - (1 + decay_w * health)))

    # Only update if outcome isn't missing
    alpha_0 = T.switch(T.ge(o, 0), (1 - d) * alpha_0 + (o * tau_p * unchosen_0), alpha_0)
    beta_0 = T.switch(T.ge(o, 0), (1 - d) * beta_0 + ((1 - o) * tau_n * unchosen_0), beta_0)
    alpha_1 = T.switch(T.ge(o2, 0), (1 - d) * alpha_1 + (o2 * tau_p * unchosen_1), alpha_1)
    beta_1 = T.switch(T.ge(o2, 0), (1 - d) * beta_1 + ((1 - o2) * tau_n * unchosen_1), beta_1)

    value_0 = alpha_0 / (alpha_0 + beta_0)
    value_1 = alpha_1 / (alpha_1 + beta_1)

    value = ((value_0 - value_1) + 1) / 2.

    var_0 = (alpha_0 * beta_0) / (T.pow(alpha_0 + beta_0, 2) * (alpha_0 + beta_0 + 1))
    var_1 = (alpha_1 * beta_1) / (T.pow(alpha_1 + beta_1, 2) * (alpha_1 + beta_1 + 1))

    value = np.exp(b * value) / (np.exp(b * value) + np.exp(b * (1 - value)))

    return (value, alpha_0, beta_0, alpha_1, beta_1, var_0, var_1, value_0, value_1,
            o, o2, unchosen_0, unchosen_1)

def ALB_softmax(o, t, o2, v, alpha_0, beta_0, alpha_1, beta_1, d, tau_p, tau_n, unchosen_p, b):
    # Without variance weighting
    b = 1. / b  # Convert inverse temperature to temperature

    unchosen_0 = T.switch(T.le(v, 0.5), 1, unchosen_p)
    unchosen_1 = T.switch(T.gt(v, 0.5), 1, unchosen_p)

    # Only update if outcome isn't missing
    alpha_0 = T.switch(T.ge(o, 0), (1 - d) * alpha_0 + (o * tau_p * unchosen_0), alpha_0)
    beta_0 = T.switch(T.ge(o, 0), (1 - d) * beta_0 + ((1 - o) * tau_n * unchosen_0), beta_0)
    alpha_1 = T.switch(T.ge(o2, 0), (1 - d) * alpha_1 + (o2 * tau_p * unchosen_1), alpha_1)
    beta_1 = T.switch(T.ge(o2, 0), (1 - d) * beta_1 + ((1 - o2) * tau_n * unchosen_1), beta_1)

    value_0 = alpha_0 / (alpha_0 + beta_0)
    value_1 = alpha_1 / (alpha_1 + beta_1)

    value = ((value_0 - value_1) + 1) / 2.

    var_0 = (alpha_0 * beta_0) / (T.pow(alpha_0 + beta_0, 2) * (alpha_0 + beta_0 + 1))
    var_1 = (alpha_1 * beta_1) / (T.pow(alpha_1 + beta_1, 2) * (alpha_1 + beta_1 + 1))

    value = np.exp(b * value) / (np.exp(b * value) + np.exp(b * (1 - value)))

    return (value, alpha_0, beta_0, alpha_1, beta_1, var_0, var_1, value_0, value_1,
            o, o2, unchosen_0, unchosen_1)

def get_value(self, tau0):
    dt = self.delta
    ar, cr, a, b, c, d = self.term.coefficients

    # Format the lags correctly
    tau0 = tt.abs_(tau0)
    tau = tau0[..., None]

    # Precompute some factors
    dpt = dt + tau
    dmt = dt - tau

    # Real parts:
    # tau > Delta
    crd = cr * dt
    cosh = tt.cosh(crd)
    norm = 2 * ar / crd ** 2
    K_large = tt.sum(norm * (cosh - 1) * tt.exp(-cr * tau), axis=-1)

    # tau < Delta
    crdmt = cr * dmt
    K_small = K_large + tt.sum(norm * (crdmt - tt.sinh(crdmt)), axis=-1)

    # Complex part
    cd = c * dt
    dd = d * dt
    c2 = c ** 2
    d2 = d ** 2
    c2pd2 = c2 + d2
    C1 = a * (c2 - d2) + 2 * b * c * d
    C2 = b * (c2 - d2) - 2 * a * c * d
    norm = 1.0 / (dt * c2pd2) ** 2
    k0 = tt.exp(-c * tau)
    cdt = tt.cos(d * tau)
    sdt = tt.sin(d * tau)

    # For tau > Delta
    cos_term = 2 * (tt.cosh(cd) * tt.cos(dd) - 1)
    sin_term = 2 * (tt.sinh(cd) * tt.sin(dd))
    factor = k0 * norm
    K_large += tt.sum(
        (C1 * cos_term - C2 * sin_term) * factor * cdt, axis=-1
    )
    K_large += tt.sum(
        (C2 * cos_term + C1 * sin_term) * factor * sdt, axis=-1
    )

    # tau < Delta
    edmt = tt.exp(-c * dmt)
    edpt = tt.exp(-c * dpt)
    cos_term = (
        edmt * tt.cos(d * dmt) + edpt * tt.cos(d * dpt) - 2 * k0 * cdt
    )
    sin_term = (
        edmt * tt.sin(d * dmt) + edpt * tt.sin(d * dpt) - 2 * k0 * sdt
    )
    K_small += tt.sum(2 * (a * c + b * d) * c2pd2 * dmt * norm, axis=-1)
    K_small += tt.sum((C1 * cos_term + C2 * sin_term) * norm, axis=-1)

    return tt.switch(tt.le(tau0, dt), K_small, K_large)

def errors(self):
    """Return a float representing the number of errors in the minibatch
    over the total number of examples of the minibatch; zero one
    loss over the size of the minibatch

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    """
    # check if y has same dimension of y_pred
    if self.correct_output.ndim != self.predicted_output.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('correct_output', self.correct_output.type,
                         'predicted_output', self.predicted_output.type))
    # check if y is of the correct datatype
    if self.correct_output.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        return T.mean(T.neq(self.predicted_output, self.correct_output))
    elif self.correct_output.dtype.startswith('float'):
        # First compare the equality of the data the same way numpy.allclose
        # does, then count the failures.
        return T.sum(
            T.neq(
                0,
                T.sum(T.le(self.predicted_output - self.correct_output,
                           1e-05 + 1e-08 * T.abs_(self.correct_output)),
                      axis=1)))
    else:
        raise NotImplementedError()

def RMSprop_v1(tparams, cost, inps, lr, rho=0.9, epsilon=1e-6, cutoff=1e10):
    """ default: lr=0.001
        This is the implementation of the RMSprop algorithm used in
        http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf.
    """
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # rescale gradients when the global norm exceeds 5; the comparison must be
    # symbolic (tensor.switch), since a plain Python `if` cannot branch on a Theano tensor
    grads = [tensor.switch(tensor.ge(norm, 5), g * 5 / norm, g) for g in grads]

    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g**2
        updates.append((acc, acc_new))

        updated_p = p - lr * (g / tensor.sqrt(acc_new + epsilon))
        updated_p = tensor.switch(tensor.ge(updated_p, cutoff), cutoff, updated_p)
        updated_p = tensor.switch(tensor.le(updated_p, -cutoff), -cutoff, updated_p)
        updates.append((p, updated_p))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update

def logp(self, value):
    p_ = self.p
    k = self.k

    # Clip values before using them for indexing
    value_clip = tt.clip(value, 0, k - 1)

    # We must only check that the values sum to 1 if p comes from a
    # tensor variable, i.e. when p is a step_method proposal. In the other
    # cases we normalize ourselves
    if not isinstance(p_, (numbers.Number,
                           np.ndarray,
                           tt.TensorConstant,
                           tt.sharedvar.SharedVariable)):
        sumto1 = theano.gradient.zero_grad(
            tt.le(abs(tt.sum(p_, axis=-1) - 1), 1e-5))
        p = p_
    else:
        p = p_ / tt.sum(p_, axis=-1, keepdims=True)
        sumto1 = True

    if p.ndim > 1:
        a = tt.log(np.moveaxis(p, -1, 0)[value_clip])
    else:
        a = tt.log(p[value_clip])

    return bound(a, value >= 0, value <= (k - 1), sumto1,
                 tt.all(p_ > 0, axis=-1), tt.all(p <= 1, axis=-1))

def ALB_var(o, t, o2, v, alpha_0, beta_0, alpha_1, beta_1, d, tau_p, tau_n,
            unchosen_p, b, var_weight):

    unchosen_0 = T.switch(T.le(v, 0.5), 1, unchosen_p)
    unchosen_1 = T.switch(T.gt(v, 0.5), 1, unchosen_p)

    # Only update if outcome isn't missing
    alpha_0 = T.switch(T.ge(o, 0), (1 - d) * alpha_0 + (o * tau_p * unchosen_0), alpha_0)
    beta_0 = T.switch(T.ge(o, 0), (1 - d) * beta_0 + ((1 - o) * tau_n * unchosen_0), beta_0)
    alpha_1 = T.switch(T.ge(o2, 0), (1 - d) * alpha_1 + (o2 * tau_p * unchosen_1), alpha_1)
    beta_1 = T.switch(T.ge(o2, 0), (1 - d) * beta_1 + ((1 - o2) * tau_n * unchosen_1), beta_1)

    value_0 = alpha_0 / (alpha_0 + beta_0)
    value_1 = alpha_1 / (alpha_1 + beta_1)

    var_0 = (alpha_0 * beta_0) / (T.pow(alpha_0 + beta_0, 2) * (alpha_0 + beta_0 + 1))
    var_1 = (alpha_1 * beta_1) / (T.pow(alpha_1 + beta_1, 2) * (alpha_1 + beta_1 + 1))

    # Weighting by variance
    variance_bias = var_0 / (var_0 + var_1)
    w_value_0 = value_0 * (1 - (variance_bias * var_weight))
    w_value_1 = value_1 * (1 - (1 - variance_bias) * var_weight)

    value = ((w_value_0 - w_value_1) + 1) / 2.

    # Softmax
    value = np.exp(b * value) / (np.exp(b * value) + np.exp(b * (1 - value)))

    return (value, alpha_0, beta_0, alpha_1, beta_1, o, o2, var_0, var_1, value_0, value_1,
            unchosen_0, unchosen_1, variance_bias, w_value_0, w_value_1)

def value(self, tau0):
    dt = self.delta
    ar, cr, a, b, c, d = self.term.coefficients

    # Format the lags correctly
    tau0 = tt.abs_(tau0)
    tau = tt.reshape(tau0, tt.concatenate([tau0.shape, [1]]), ndim=tau0.ndim + 1)

    # Precompute some factors
    dpt = dt + tau
    dmt = dt - tau

    # Real parts:
    # tau > Delta
    crd = cr * dt
    norm = 1.0 / (crd)**2
    factor = (tt.exp(crd) + tt.exp(-crd) - 2) * norm
    K_large = tt.sum(ar * tt.exp(-cr * tau) * factor, axis=-1)

    # tau < Delta
    K_small = tt.sum((2 * cr * (dmt) + tt.exp(-cr * dmt) + tt.exp(-cr * dpt) -
                      2 * tt.exp(-cr * tau)) * norm, axis=-1)

    # Complex part
    cd = c * dt
    dd = d * dt
    c2 = c**2
    d2 = d**2
    c2pd2 = c2 + d2
    C1 = a * (c2 - d2) + 2 * b * c * d
    C2 = b * (c2 - d2) - 2 * a * c * d
    norm = 1.0 / (dt * c2pd2)**2
    k0 = tt.exp(-c * tau)
    cdt = tt.cos(d * tau)
    sdt = tt.sin(d * tau)

    # For tau > Delta
    cos_term = 2 * (tt.cosh(cd) * tt.cos(dd) - 1)
    sin_term = 2 * (tt.sinh(cd) * tt.sin(dd))
    factor = k0 * norm
    K_large += tt.sum((C1 * cos_term - C2 * sin_term) * factor * cdt, axis=-1)
    K_large += tt.sum((C2 * cos_term + C1 * sin_term) * factor * sdt, axis=-1)

    # tau < Delta
    edmt = tt.exp(-c * dmt)
    edpt = tt.exp(-c * dpt)
    cos_term = edmt * tt.cos(d * dmt) + edpt * tt.cos(d * dpt) - 2 * k0 * cdt
    sin_term = edmt * tt.sin(d * dmt) + edpt * tt.sin(d * dpt) - 2 * k0 * sdt
    K_small += tt.sum(2 * (a * c + b * d) * c2pd2 * dmt * norm, axis=-1)
    K_small += tt.sum((C1 * cos_term + C2 * sin_term) * norm, axis=-1)

    return tt.switch(tt.le(tau0, dt), K_small, K_large)

def logp(self, value):
    p = self.p
    k = self.k

    sumto1 = theano.gradient.zero_grad(
        T.le(abs(T.sum(p) - 1), 1e-5))
    return bound(T.log(p[value]), value >= 0, value <= (k - 1), sumto1)

def huber_loss(y_true, y_pred):
    max_delta = 1.0
    error = y_true - y_pred
    abs_error = np.abs(error)
    loss1 = 0.5 * T.square(error)
    loss2 = max_delta * abs_error - 0.5 * T.square(max_delta)
    loss = T.switch(T.le(abs_error, max_delta), loss1, loss2)
    return T.mean(loss)

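# A small usage sketch (not from the original source): it compiles the Huber loss above
# and checks the two regimes on concrete numbers -- quadratic for residuals below
# max_delta = 1.0, linear above it.
import numpy as np
import theano
import theano.tensor as T

y_true = T.fvector('y_true')
y_pred = T.fvector('y_pred')
loss_fn = theano.function([y_true, y_pred], huber_loss(y_true, y_pred))

small = loss_fn(np.array([0.0], dtype=np.float32), np.array([0.5], dtype=np.float32))
large = loss_fn(np.array([0.0], dtype=np.float32), np.array([3.0], dtype=np.float32))
print(small)  # 0.5 * 0.5**2 = 0.125  (quadratic branch)
print(large)  # 1.0 * 3.0 - 0.5 = 2.5 (linear branch)
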
def upwind(dip_ind, str_ind, StartTimes, slownesses, patch_size):
    [n_patch_dip, n_patch_str] = slownesses.shape
    zero = theano.shared(0)
    s1 = str_ind - 1
    d1 = dip_ind - 1
    s2 = str_ind + 1
    d2 = dip_ind + 1

    # if a < b return b
    checked_s1 = ifelse(tt.lt(s1, zero), zero, s1)
    checked_d1 = ifelse(tt.lt(d1, zero), zero, d1)

    # if a =< b return a - 1
    checked_s2 = ifelse(tt.le(n_patch_str, s2), n_patch_str - 1, s2)
    checked_d2 = ifelse(tt.le(n_patch_dip, d2), n_patch_dip - 1, d2)

    ST_xmin = tt.min(
        (StartTimes[checked_d1, str_ind], StartTimes[checked_d2, str_ind]))
    ST_ymin = tt.min(
        (StartTimes[dip_ind, checked_s1], StartTimes[dip_ind, checked_s2]))

    ### Eikonal equation solver ###
    # The unique solution to the equation
    # [(x - a)^+]^2 + [(x - b)^+]^2 = f^2 * h^2
    # where a = u_xmin, b = u_ymin, is
    #
    #        | min(a, b) + f*h,                                 |a - b| >= f*h
    # xnew = |
    #        | 0.5 * [ a + b + sqrt(2*f^2*h^2 - (a - b)^2) ],   |a - b| <  f*h
    start_new = ifelse(
        tt.le(slownesses[dip_ind, str_ind] * patch_size,
              tt.abs_(ST_xmin - ST_ymin)),
        tt.min((ST_xmin, ST_ymin)) + slownesses[dip_ind, str_ind] * patch_size,
        (ST_xmin + ST_ymin +
         tt.sqrt(2 * tt.pow(slownesses[dip_ind, str_ind], 2) *
                 tt.pow(patch_size, 2) -
                 tt.pow((ST_xmin - ST_ymin), 2))) / 2
    )

    # if a < b return a
    output = ifelse(tt.lt(start_new, StartTimes[dip_ind, str_ind]),
                    start_new, StartTimes[dip_ind, str_ind])

    return tt.set_subtensor(
        StartTimes[dip_ind:dip_ind + 1, str_ind:str_ind + 1], output)

def adapt_step(dt, accept_prob, pos, mom, energy, energy_grad, k_energy):
    dt = tt.switch(tt.gt(accept_prob**sign, 2.**(-sign)), (2.**sign) * dt, dt)
    accept_prob = leapfrog_accept_prob(dt, pos, mom, energy, energy_grad, k_energy)
    return (dt, accept_prob), th.scan_module.until(
        tt.all(tt.le(accept_prob**sign, 2.**(-sign))))

def objective(y_true, y_pred, P, Q, alpha=0., beta=0.15, dbeta=0., gamma=0.01,
              gamma1=-1., poos=0.23, eps=1e-6):
    '''Expects a binary class matrix instead of a vector of scalar classes.
    '''
    beta = np.float32(beta)
    dbeta = np.float32(dbeta)
    gamma = np.float32(gamma)
    poos = np.float32(poos)
    eps = np.float32(eps)

    # scale preds so that the class probas of each sample sum to 1
    y_pred += eps
    y_pred /= y_pred.sum(axis=-1, keepdims=True)

    y_true = T.cast(y_true.flatten(), 'int64')
    y1 = T.and_(T.gt(y_true, 0), T.le(y_true, Q))  # in-set
    y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))   # out-of-set or unlabeled
    y0sum = y0.sum() + eps  # number of oos
    y1sum = y1.sum() + eps  # number of in-set

    # we want to reduce cross entropy of labeled data
    # convert all oos/unlabeled to label=0
    cost0 = T.nnet.categorical_crossentropy(y_pred, T.switch(y_true <= Q, y_true, 0))
    cost0 = T.dot(y1, cost0) / y1sum  # average cost per labeled example

    if alpha:
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per unlabeled example
        cost0 += alpha * cost1

    # we want to increase the average entropy in each batch
    # average over batch
    if beta:
        y_pred_avg0 = T.dot(y0, y_pred) / y0sum
        y_pred_avg0 = T.clip(y_pred_avg0, eps, np.float32(1) - eps)
        y_pred_avg0 /= y_pred_avg0.sum(axis=-1, keepdims=True)
        cost2 = T.nnet.categorical_crossentropy(y_pred_avg0.reshape((1, -1)), P - dbeta)[0]  # [None,:]
        cost2 = T.switch(y0sum > 0.5, cost2, 0.)  # ignore cost2 if no samples
        cost0 += beta * cost2

    # binary classifier score
    if gamma:
        y_pred0 = T.clip(y_pred[:, 0], eps, np.float32(1) - eps)
        if gamma1 < 0.:
            cost3 = - T.dot(poos * y0, T.log(y_pred0)) \
                    - T.dot(np.float32(1) - poos * y0.T, T.log(np.float32(1) - y_pred0))
            cost3 /= y_pred.shape[0]
            cost0 += gamma * cost3
        elif gamma1 > 0.:
            cost3 = - T.dot(poos * y0, T.log(y_pred0)) \
                    - T.dot((np.float32(1) - poos) * y0, T.log(np.float32(1) - y_pred0))
            cost3 /= y0sum
            cost31 = - T.dot(y1, T.log(np.float32(1) - y_pred0))
            cost31 /= y1sum
            cost0 += gamma * cost3 + gamma1 * cost31
        else:  # gamma1 == 0.
            cost3 = - T.dot(poos * y0, T.log(y_pred0)) \
                    - T.dot((np.float32(1) - poos) * y0, T.log(np.float32(1) - y_pred0))
            cost3 /= y0sum
            cost0 += gamma * cost3
    return cost0

def apply(self, y, y_hat):
    epsilon = 1e-5  # to avoid nan
    mask = (tensor.le(srng.uniform(size=y[:, -1:].shape, dtype=config.floatX), .0005)) * 1.
    cost = 0
    for i in range(15):
        cost += tensor.nnet.binary_crossentropy(y_hat[:, i, :, :], y[:, i, :, :]).mean()
    cost += (tensor.nnet.binary_crossentropy(y_hat[:, 15, :, :],
                                             tensor.eq(y[:, 15, :, :], 1) * 1.) * mask).mean()
    return cost

def cost(self, Y, Y_hat):
    w = T.fscalar()
    r = self.r
    w = 0.05
    i = T.le(Y, w)
    j = T.eq(i, 0)
    z = T.join(0, Y[i] / r, Y[j])
    z_hat = T.join(0, Y_hat[i] / r, Y_hat[j])
    return super(linear_mlp_bayesian_cost, self).cost(z, z_hat)

def huber_loss(y_true, y_pred, delta=1., axis=None):
    a = y_true - y_pred
    squared_loss = 0.5 * T.sqr(a)
    absolute_loss = (delta * abs(a) - 0.5 * T.sqr(delta))
    cost = T.switch(T.le(abs(a), delta), squared_loss, absolute_loss)
    return cost.mean(axis=axis)

def clip_gradients(gparams, threshold=5.):
    clipped_gparams = []
    for gparam in gparams:
        norm_gparam = T.sqrt(T.sqr(gparam).sum())
        clipped_gparams.append(T.switch(T.le(norm_gparam, threshold),
                                        gparam,
                                        (gparam / norm_gparam) * threshold))
    return clipped_gparams

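# A hedged usage sketch (not from the original source): per-parameter norm clipping as
# above is typically applied to the gradients of a cost with respect to shared
# parameters before building the update rules.  The tiny quadratic "model" below is an
# illustrative assumption, not code from the surrounding project.
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.ones((3,), dtype=theano.config.floatX), name='W')
x = T.vector('x')
cost = T.sum(T.dot(W, x) ** 2)

gparams = T.grad(cost, [W])
clipped = clip_gradients(gparams, threshold=5.)
lr = np.asarray(0.1, dtype=theano.config.floatX)
updates = [(W, W - lr * g) for g in clipped]
train_step = theano.function([x], cost, updates=updates)
print(train_step(np.array([1.0, 2.0, 3.0], dtype=theano.config.floatX)))
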
def theano_symbolic_dtw(x1, x2, x1_lengths, x2_lengths, distance_function=cosine,
                        normalize=True, debug_level=None, eps=None):
    """
    A symbolic implementation of DTW that supports batches of sequence pairs.

    Returns a scalar if ndim == 2 and a vector of size x1.shape[1] if ndim == 3

    This is slow! About 90 times slower than the Cython implementation using the parameters below.

    :param x1: A tensor containing the first side of the sequence pairs to be aligned.
    :param x2: A tensor containing the second side of the sequence pairs to be aligned.
    :param x1_lengths: An integer vector identifying the lengths of the sequences in x1
    :param x2_lengths: An integer vector identifying the lengths of the sequences in x2
    :param distance_function: The symbolic distance function to use (e.g. a reference to a function in distance).
    :param normalize: Whether the DTW distances should be sequence length normalized.
    :param debug_level: The debug level to use (see above for explanation).
    :param eps: The minimum value to use inside the distance function. Set to the machine epsilon if None.
    :return: The DTW distances for every sequence pair in the batch.
    """
    if eps is None:
        eps = numpy.dtype(theano.config.floatX).type(numpy.finfo(float).eps)

    assert 0 <= x1_lengths.ndim == x2_lengths.ndim <= 1
    assert isinstance(normalize, bool)
    ndim = x1.ndim
    assert 2 <= ndim == x2.ndim <= 3

    # Ensure x2 is the shorter input to minimize the number of scan iterations
    x1_shorter_than_x2 = tt.le(x1.shape[0], x2.shape[0])
    x1, x2 = _swap(x1_shorter_than_x2, x1, x2, 'x1', 'x2', debug_level)
    x1_lengths, x2_lengths = _swap(x1_shorter_than_x2, x1_lengths, x2_lengths,
                                   'x1_lengths', 'x2_lengths', debug_level)

    # Compute distances between x1 sequences and paired x2 sequences
    d = distance_function(x1, x2, eps)

    # Iterate over the temporal slices of x2. See dtw_outer_step for an explanation of
    # the other inputs to this scan operation
    x1_indexes = tt.arange(x1.shape[0], dtype=DTYPE_INT64)
    results, _ = theano.scan(_create_dtw_outer_step(distance_function, debug_level),
                             sequences=[x1_indexes, d],
                             outputs_info=[tt.zeros_like(x2[:, :, 0] if x2.ndim == 3 else x2[:, 0],
                                                         dtype=theano.config.floatX)],
                             non_sequences=[x1_lengths, x2_lengths])

    result = results[x1_lengths - 1, x2_lengths - 1, tt.arange(x1.shape[1])] if x2.ndim == 3 \
        else results[x1_lengths - 1, x2_lengths - 1]
    result = _debug(result, 'theano_symbolic_dtw.result', debug_level)
    assert result.ndim == x1_lengths.ndim

    # Length normalize the distances if requested to do so
    if normalize:
        result = _debug(result / tt.cast(x1_lengths + x2_lengths, dtype=utility.get_standard_dtype()),
                        'theano_symbolic_dtw.norm_result', debug_level)

    return result

def output_index(self):
    from theano.ifelse import ifelse
    index = self.index
    if self.sources:
        # In some cases, e.g. forwarding, the target index (for "classes") might have shape[0]==0.
        # Or shape[0]==1 with index[0]==0. See Dataset.shapes_for_batches().
        # Use source index in that case.
        have_zero = T.le(index.shape[0], 1) * T.eq(T.sum(index[0]), 0)
        index = ifelse(have_zero,
                       T.cast(self.sources[0].index, 'int8'),
                       T.cast(index, 'int8'))
    return index

def logp(self, x):
    n = self.n
    p = self.p

    X = x[self.tri_index]
    X = t.fill_diagonal(X, 1)

    result = self._normalizing_constant(n, p)
    result += (n - 1.0) * log(det(X))

    return bound(result,
                 n > 0,
                 all(le(X, 1)),
                 all(ge(X, -1)))

def logp(self, value):
    p = self.p
    k = self.k

    sumto1 = theano.gradient.zero_grad(
        tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5))

    if p.ndim > 1:
        a = tt.log(p[tt.arange(p.shape[0]), value])
    else:
        a = tt.log(p[value])

    return bound(a, value >= 0, value <= (k - 1), sumto1)

def innerL_(sS, i):
    Ei = calcEk_(sS, i)

    # use "+" instead of "or" and "*" instead of "and"
    checkUselessAlpha1 = T.ge(sS.labels[i] * Ei, -sS.tol) + T.ge(sS.alphas[i], sS.C)
    checkUselessAlpha2 = T.le(sS.labels[i] * Ei, sS.tol) + T.lt(sS.alphas[i], 0)
    isUselessAlpha = toTheanoBool(checkUselessAlpha1 * checkUselessAlpha2)

    updateL = innerL_alphaInRange_(sS, i, Ei)
    earlyret = sS.retlist(0)
    return ifelse(isUselessAlpha, earlyret, updateL)

def loss_confident_bootstrapping(self, y, factor=1):
    # Customized categorical cross entropy.
    # Based on the multibox impl. More tuned to paper. More strict.
    p = self.output
    # Only confident predictions are included. Everything between 0.2 and 0.8 is disregarded: 60% of the range.
    hardUpper = T.gt(p, 0.8)
    hardLower = T.le(p, 0.2)

    loss = (
        - T.sum(((factor * y) + ((1.0 - factor) * hardUpper)) * T.log(p))
        - T.sum(((factor * (1.0 - y)) + ((1.0 - factor) * hardLower)) * T.log(1.0 - p))
    )
    return loss / self.size

def normalizeAngle(theta):
    if T.gt(theta, -np.pi) and T.lt(theta, np.pi):
        return theta
    else:
        twopi = 2 * np.pi
        mult = np.floor(theta / twopi)
        theta -= mult * twopi
        if T.ge(theta, np.pi):
            theta -= twopi
        elif T.le(theta, -np.pi):
            theta += twopi
        return theta

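# A hedged aside (not from the original source): a Python `if`/`and` cannot branch on
# symbolic Theano tensors, so the function above only behaves as intended when `theta`
# is an ordinary number.  The sketch below is an assumed fully-symbolic variant that
# wraps any angle into [-pi, pi) with modular arithmetic instead of control flow.
import numpy as np
import theano
import theano.tensor as T

def normalize_angle_symbolic(theta):
    # wrap into [-pi, pi) entirely with tensor ops, so it also works elementwise
    return T.mod(theta + np.pi, 2 * np.pi) - np.pi

theta = T.dvector('theta')
wrap = theano.function([theta], normalize_angle_symbolic(theta))
print(wrap(np.array([0.0, 4.0, -4.0, 7.0])))  # values wrapped into [-pi, pi)
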
def cubicBSpline(self, L):
    b = T.zeros_like(L)

    idx4 = T.ge(L, 0) * T.lt(L, 1)
    idx3 = T.ge(L, 1) * T.lt(L, 2)
    idx2 = T.ge(L, 2) * T.lt(L, 3)
    idx1 = T.ge(L, 3) * T.le(L, 4)

    b = T.switch(T.eq(idx4, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.eq(idx3, 1), (-3 * T.pow(L - 1, 3) + 3 * T.pow(L - 1, 2) + 3 * (L - 1) + 1) / 6, b)
    b = T.switch(T.eq(idx2, 1), (3 * T.pow(L - 2, 3) - 6 * T.pow(L - 2, 2) + 4) / 6, b)
    b = T.switch(T.eq(idx1, 1), (-T.pow(L - 3, 3) + 3 * T.pow(L - 3, 2) - 3 * (L - 3) + 1) / 6, b)

    return b.T  # b is K x K' and thus, as we multiply from the right with

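# A hedged usage sketch (not from the original source): evaluated on a grid over its
# support [0, 4], the piecewise expression above traces the bell-shaped cubic B-spline
# basis function (peak value 2/3 at L = 2).  The standalone wrapper below is an
# illustrative assumption so the method body can be exercised outside its class.
import numpy as np
import theano
import theano.tensor as T

def cubic_bspline_demo(L):
    b = T.zeros_like(L)
    b = T.switch(T.ge(L, 0) * T.lt(L, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.ge(L, 1) * T.lt(L, 2), (-3 * T.pow(L - 1, 3) + 3 * T.pow(L - 1, 2) + 3 * (L - 1) + 1) / 6, b)
    b = T.switch(T.ge(L, 2) * T.lt(L, 3), (3 * T.pow(L - 2, 3) - 6 * T.pow(L - 2, 2) + 4) / 6, b)
    b = T.switch(T.ge(L, 3) * T.le(L, 4), (-T.pow(L - 3, 3) + 3 * T.pow(L - 3, 2) - 3 * (L - 3) + 1) / 6, b)
    return b

L_var = T.dvector('L')
spline = theano.function([L_var], cubic_bspline_demo(L_var))
print(spline(np.linspace(0.0, 4.0, 9)))  # symmetric, peaking at 2/3 for L = 2
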
def get_train_func(self, learning_rate, nce=True, em=False):
    print >>sys.stderr, "Training type: EM = %s, NCE = %s" % (em, nce)
    # TODO: Implement AdaGrad
    x, y_s = T.ivector("x"), T.imatrix("y_s")
    if em:
        cost = -self.get_sym_nc_complete_expectation(x, y_s) if nce \
            else -self.get_sym_complete_expectation(x, y_s)
    else:
        cost = -T.log(self.get_sym_nc_direct_prob(x, y_s)) if nce \
            else -T.log(self.get_sym_direct_prob(x, y_s))
    params = self.repr_params + self.enc_params + self.rec_params
    g_params = T.grad(cost, params)
    # Updating the parameters only if the norm of the gradient is less than 100.
    # Important: This check also takes care of any element in the gradients being nan.
    # The conditional returns False even in that case.
    updates = [(p, ifelse(T.le(T.nlinalg.norm(g, None),
                               T.constant(100.0, dtype='float64')),
                          p - learning_rate * g,
                          p))
               for p, g in zip(params, g_params)]
    train_func = theano.function([x, y_s], cost, updates=updates)
    return train_func

def alpha_huber(y_true, y_pred):
    """ sets the epsilon in huber loss equal to a percentile of the residuals
    """
    # abs_r = T.abs_(y_pred - y_true)
    # loss = 0.5 * T.sqr(abs_r)
    # epsilon = np.percentile(loss, alpha * 100)
    # idx = abs_r <= epsilon
    # loss[idx] = epsilon * abs_r[idx] - 0.5 * T.sqr(epsilon)

    # switch(cond, ift, iff)
    alpha = 0.95
    abs_r = T.abs_(y_pred - y_true)
    epsilon = np.percentile(0.5 * T.sqr(abs_r), alpha * 100)
    loss = T.switch(T.le(abs_r, epsilon),
                    epsilon * abs_r - 0.5 * T.sqr(epsilon),
                    0.5 * T.sqr(abs_r))
    return loss

def get_gradients(self, model, data, **kwargs):
    space, sources = self.get_data_specs(model)
    space.validate(data)
    assert isinstance(model, CompressAdversaryPair)
    g = model.compressor
    d = model.discriminator

    # get raw gradients for d and g objectives...
    d_obj, g_obj = self.get_objectives(model, data)
    g_params = g.get_params()
    d_params = d.get_params()
    for param in g_params:
        assert param not in d_params
    for param in d_params:
        assert param not in g_params
    d_grads = T.grad(d_obj, d_params)
    g_grads = T.grad(g_obj, g_params)

    # if self.scale_grads:
    #     S_grad = T.grad(g_obj, S)
    #     scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum()))
    #     g_grads = [g_grad * scale for g_grad in g_grads]

    # adjust raw gradients with control signals
    rval = OrderedDict()
    zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32'))

    if self.ever_train_discriminator:
        rval.update(OrderedDict(safe_zip(d_params,
                                         [self.now_train_discriminator * dg for dg in d_grads])))
    else:
        rval.update(OrderedDict(zip(d_params, zeros)))

    if self.ever_train_compressor:
        rval.update(OrderedDict(safe_zip(g_params,
                                         [self.now_train_compressor * gg for gg in g_grads])))
    else:
        rval.update(OrderedDict(zip(g_params, zeros)))

    # update control signals using the updates return functionality
    updates = OrderedDict()

    # first, the clock
    self.future_train_clock = T.switch(
        T.ge(self.train_clock, self.discriminator_steps + self.joint_steps + self.compressor_steps),
        1.,
        self.train_clock + 1.)
    updates[self.train_clock] = self.future_train_clock
    # then the control signals
    updates[self.now_train_discriminator] = T.switch(
        T.le(self.future_train_clock, self.discriminator_steps + self.joint_steps), 1., 0.)
    updates[self.now_train_compressor] = T.switch(
        T.gt(self.future_train_clock, self.discriminator_steps), 1., 0.)

    return rval, updates

def frank(u, v, d, cut=25):
    '''
    Frank Copula
    '''
    d = (TT.nnet.sigmoid(d) - 0.5) * cut

    U = TT.exp(-d * u) - 1
    V = TT.exp(-d * v) - 1
    D = TT.exp(-d) - 1
    C = 1 + U * V / D

    idx = TT.le(C, 0).nonzero()
    C = TT.set_subtensor(C[idx], 0)

    C = -1 / (d) * TT.log(C)
    return C

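# A hedged usage sketch (not from the original source): the Frank copula above maps the
# unconstrained parameter `d` through a scaled sigmoid before applying
# C(u, v) = -1/d * log(1 + (exp(-d*u) - 1)(exp(-d*v) - 1) / (exp(-d) - 1)).
# The compiled check below just evaluates it on a few (u, v) pairs; the input variable
# names are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as TT

u = TT.dvector('u')
v = TT.dvector('v')
d = TT.dscalar('d')
copula = theano.function([u, v, d], frank(u, v, d))
print(copula(np.array([0.2, 0.5, 0.9]), np.array([0.3, 0.5, 0.8]), 1.0))
# each output lies in [0, 1] and is <= min(u, v), as a copula CDF should be
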
def logp(self, value):
    p = self.p
    k = self.k

    # Clip values before using them for indexing
    value_clip = tt.clip(value, 0, k - 1)

    sumto1 = theano.gradient.zero_grad(
        tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5))

    if p.ndim > 1:
        a = tt.log(p[tt.arange(p.shape[0]), value_clip])
    else:
        a = tt.log(p[value_clip])

    return bound(a, value >= 0, value <= (k - 1), sumto1)

def initialize(self):
    assert self.loss in ('ctc', 'ce_ctc', 'hmm', 'ctc2', 'sprint', 'viterbi', 'fast_bw', 'warp_ctc'), \
        'invalid loss: ' + self.loss
    self.y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
    if not self.attrs.get("apply_softmax", True):
        self.p_y_given_x_flat = self.y_m
        self.p_y_given_x = self.z
        self.z = T.log(self.z)
        self.y_m = T.log(self.y_m)
    elif self.attrs.get("gauss_outputs", False):
        self.y_m = -T.sqr(self.y_m)
        self.p_y_given_x_flat = T.exp(self.y_m)
        self.p_y_given_x = T.reshape(self.p_y_given_x_flat, self.z.shape)
    else:  # standard case
        self.p_y_given_x_flat = T.nnet.softmax(self.y_m)
        self.p_y_given_x = T.reshape(T.nnet.softmax(self.y_m), self.z.shape)
    self.y_pred = T.argmax(self.p_y_given_x_flat, axis=-1)
    self.output = self.p_y_given_x
    if self.attrs.get('compute_priors', False):
        exp_average = self.attrs.get("compute_priors_exp_average", 0)
        custom = T.mean(self.p_y_given_x_flat[self.i], axis=0)
        custom_init = numpy.ones((self.attrs['n_out'],), 'float32') / numpy.float32(self.attrs['n_out'])
        if self.attrs.get('use_label_priors', 0) > 0:  # use labels to compute priors in first epoch
            custom_0 = T.mean(theano.tensor.extra_ops.to_one_hot(self.y_data_flat[self.i],
                                                                 self.attrs['n_out'], 'float32'),
                              axis=0)
            custom = T.switch(T.le(self.network.epoch, self.attrs.get('use_label_priors', 0)),
                              custom_0, custom)
            custom_init = numpy.zeros((self.attrs['n_out'],), 'float32')
        self.priors = self.add_param(theano.shared(custom_init, 'priors'), 'priors',
                                     custom_update=custom,
                                     custom_update_normalized=not exp_average,
                                     custom_update_exp_average=exp_average)
        self.log_prior = T.log(T.maximum(self.priors, numpy.float32(1e-20)))
        self._maybe_substract_prior_from_output()
    if self.attrs.get('compute_distortions', False):
        p = self.p_y_given_x_flat[self.i]
        momentum = p[:-1] * p[1:]
        momentum = T.sum(momentum, axis=-1)
        loop = T.mean(momentum)
        forward = numpy.float32(1) - loop
        self.distortions = {
            'loop': self.add_param(theano.shared(numpy.ones(1,) * numpy.float32(0.5), 'loop'),
                                   'loop', custom_update=loop, custom_update_normalized=True),
            'forward': self.add_param(theano.shared(numpy.ones(1,) * numpy.float32(0.5), 'forward'),
                                      'forward', custom_update=forward, custom_update_normalized=True)
        }

def step(ord):
    i = ord[0]
    xx1 = T.maximum(x1[i], x1[ord[1:]])
    yy1 = T.maximum(y1[i], y1[ord[1:]])
    xx2 = T.minimum(x2[i], x2[ord[1:]])
    yy2 = T.minimum(y2[i], y2[ord[1:]])

    w = T.maximum(0.0, xx2 - xx1 + 1)
    h = T.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[ord[1:]] - inter)

    inds = T.le(ovr, thresh).nonzero()[0]
    ord = ord[inds + 1]
    return (i, ord), until(order.size > 0)
