def irprop_plus_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """Build the shared state and update rules of the batch IRPROP+ trainer.

    For details see
    http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428

    :param x: forwarded unchanged to ``loss(x, y, w)``
    :param y: forwarded unchanged to ``loss(x, y, w)``
    :param w: forwarded unchanged to ``loss(x, y, w)``
    :param parameters: mapping whose values are the shared variables to train
    :param loss: callable returning the symbolic loss
    :param random_stream: not used by this trainer
    :param positive_step: factor the per-weight step is multiplied by when the
        new and the stored derivative have the same sign
    :param negative_step: factor the per-weight step is multiplied by in every
        other case (sign change or zero product)
    :param min_step: lower clipping bound of the per-weight step
    :param max_step: upper clipping bound of the per-weight step
    :return: ``(shareds, updates)`` - the created shared variables and a list
        of ``[shared_variable, new_expression]`` pairs
    """
    current_loss = loss(x, y, w)
    prev_loss_value = theano.shared(1e10)
    shareds = [prev_loss_value]
    updates = []
    loss_got_worse = current_loss > prev_loss_value

    for _name, param in parameters.items():
        zeros = param.get_value() * 0.
        old_derivative = theano.shared(zeros)
        delta = theano.shared(zeros + 1e-3)
        shareds.extend([old_derivative, delta])

        new_derivative = T.grad(current_loss, param)
        agreement = new_derivative * old_derivative
        sign_flipped = agreement < 0

        # Weights whose derivative changed sign get the previous step undone,
        # but only when the loss increased since the last iteration.
        undo_last_step = T.where(sign_flipped, delta * T.sgn(old_derivative), 0)
        shift = ifelse(loss_got_worse, undo_last_step, 0. * param)
        # The shift cannot be applied as a separate in-place update of param,
        # so it is folded into the single param update below.

        resized = T.where(agreement > 0, delta * positive_step, delta * negative_step)
        new_delta = T.clip(resized, min_step, max_step)

        updates.append([param, param + shift - new_delta * T.sgn(new_derivative)])
        updates.append([delta, new_delta])
        # A flipped derivative is stored as 0 so the next product is zero.
        updates.append([old_derivative, T.where(sign_flipped, 0, new_derivative)])

    updates.append([prev_loss_value, current_loss])
    return shareds, updates
def generateRpropUpdates(params, error, init_size=1, verbose=False):
    """Build Rprop update rules for ``params`` with respect to ``error``.

    For every parameter two shared variables are created: the previously
    applied signed step (initially zeros) and the per-weight step size
    (initially ``init_size``).

    :param params: iterable of shared variables to optimise
    :param error: symbolic scalar to differentiate
    :param init_size: initial per-weight step size
    :param verbose: print gradient-construction progress when true
    :return: ``(storage, updates)`` where ``storage`` is the list of all
        previous-step variables followed by all step-size variables, and
        ``updates`` is a list of ``(shared_variable, new_expression)`` pairs
    """
    prevw = []
    deltaw = []
    for p in params:
        shape = p.shape.eval()
        prevw.append(theano.shared(np.zeros(shape).astype(config.floatX)))
        deltaw.append(theano.shared(init_size * np.ones(shape).astype(config.floatX)))

    updates = []
    built = 0
    for p, dw, pw in zip(params, deltaw, prevw):
        if verbose:
            print("\rGradient {} out of {}".format(built + 1, len(params)), end="")
        try:
            grad = T.grad(error, p)
        except Exception:
            # Deliberate best-effort: a parameter the error cannot be
            # differentiated against is reported and left without updates.
            print('Unused input')
            continue
        built += 1

        # Compare signs rather than the raw product so tiny values cannot
        # underflow to zero.
        sign_product = T.sgn(pw) * T.sgn(grad)
        # Previous step and gradient point the same way (both non-zero).
        simW = T.gt(sign_product, 0)
        # Previous step and gradient point in opposite directions.
        diffW = T.lt(sign_product, 0)

        # No step is taken for weights whose gradient just changed sign.
        step = T.sgn(grad) * dw * T.eq(diffW, 0)
        updates.append((p, p - step))
        updates.append((dw, T.switch(diffW, dw * 0.5, T.switch(simW, dw * 1.2, dw))))
        updates.append((pw, step))

    storage = prevw + deltaw
    if verbose:
        print("\nDone with updates")
    return (storage, updates)
def irprop_plus_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP+ trainer, see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332

    :param x: forwarded unchanged to ``loss(x, y, w)``
    :param y: forwarded unchanged to ``loss(x, y, w)``
    :param w: forwarded unchanged to ``loss(x, y, w)``
    :param parameters: mapping whose values are the shared variables to train
    :param loss: callable returning the symbolic loss
    :param random_stream: not used by this trainer
    :param positive_step: step multiplier when the derivative keeps its sign
    :param negative_step: step multiplier in every other case
    :param max_step: upper clipping bound of the per-weight step
    :param min_step: lower clipping bound of the per-weight step
    :return: ``(shareds, updates)``; ``shareds`` starts with the stored
        previous loss, followed by ``old_derivative, delta`` per parameter
    """
    loss_value = loss(x, y, w)
    prev_loss_value = theano.shared(1e10)
    # The previous-loss variable is registered exactly once, not once per
    # parameter.
    shareds = [prev_loss_value]
    updates = []
    # .items() works on both Python 2 and Python 3 mappings.
    for _name, param in parameters.items():
        old_derivative = theano.shared(param.get_value() * 0.)
        delta = theano.shared(param.get_value() * 0. + 1e-3)
        new_derivative = T.grad(loss_value, param)

        sign_flipped = new_derivative * old_derivative < 0
        shift_if_bad_step = T.where(sign_flipped, delta * T.sgn(old_derivative), 0)
        # NOTE(review): the original author marked this revert step as
        # "THIS doesn't work!" - left as is, to be confirmed.
        shift = ifelse(loss_value > prev_loss_value, shift_if_bad_step, 0. * param)
        # The shift cannot be applied as a separate update of param, so it is
        # folded into the single param update below.

        new_delta = T.where(new_derivative * old_derivative > 0,
                            delta * positive_step, delta * negative_step)
        new_delta = T.clip(new_delta, min_step, max_step)

        updates.append([param, param + shift - new_delta * T.sgn(new_derivative)])
        updates.append([delta, new_delta])
        updates.append([old_derivative, T.where(sign_flipped, 0, new_derivative)])
        shareds.extend([old_derivative, delta])

    updates.append([prev_loss_value, loss_value])
    return shareds, updates
def irprop_minus_updates(params, grads):
    """Build IRPROP- update rules for ``params`` given their gradients.

    Each parameter gets two shared variables holding state between training
    steps: the per-weight step size (initially 0.1) and the previously seen
    gradient (initially zeros). The sign comparison is done symbolically with
    ``T.switch``; a Python ``if`` on a symbolic expression does not depend on
    the runtime values.

    :param params: shared variables to optimise (``get_value`` is used to
        size the state variables)
    :param grads: symbolic gradients, one per parameter
    :return: list of ``(shared_variable, new_expression)`` pairs covering the
        parameters and the optimiser state
    """
    positive_step = 1.2
    negative_step = 0.5
    max_step = 1.
    # NOTE(review): exp(-6) is about 2.5e-3; 1e-6 may have been intended -
    # value kept from the original, confirm.
    min_step = math.exp(-6)
    initial_step = 0.1

    updates = []
    for param, gparam in zip(params, grads):
        zeros = param.get_value() * 0.
        delta = theano.shared(zeros + initial_step)
        last_gparam = theano.shared(zeros)

        change = gparam * last_gparam
        same_sign = T.gt(change, 0)
        sign_flipped = T.lt(change, 0)

        new_delta = T.switch(
            same_sign,
            T.minimum(delta * positive_step, max_step),
            T.switch(sign_flipped,
                     T.maximum(delta * negative_step, min_step),
                     delta))
        # Keep the state dtype stable regardless of constant upcasting.
        new_delta = T.cast(new_delta, delta.dtype)

        # update the weights
        updates.append((param, param - T.sgn(gparam) * new_delta))
        updates.append((delta, new_delta))
        # Store 0 after a sign flip so the next step leaves the step size
        # untouched; otherwise remember the current gradient.
        updates.append((last_gparam,
                        T.switch(sign_flipped, T.zeros_like(gparam), gparam)))
    return updates
def irprop_minus_updates(params, grads):
    """Build IRPROP- update rules for ``params`` given their gradients.

    The step size and the previous gradient of every parameter are kept in
    shared variables so they persist between training steps, and the sign
    test is expressed with ``T.switch`` so it is evaluated on the runtime
    values. State variables take the dtype of the parameter they belong to,
    which replaces the hard-coded ``float32`` cast.

    :param params: shared variables to optimise (``get_value`` is used to
        size the state variables)
    :param grads: symbolic gradients, one per parameter
    :return: list of ``(shared_variable, new_expression)`` pairs covering the
        parameters and the optimiser state
    """
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50  # an earlier version of this constant was 1.
    # NOTE(review): exp(-6) is about 2.5e-3; 1e-6 may have been intended -
    # value kept from the original, confirm.
    minStep = math.exp(-6)

    updates = []
    for param, gparam in zip(params, grads):
        template = param.get_value() * 0.
        delta = theano.shared(template + 0.1)
        last_gparam = theano.shared(template)

        # calculate change
        change = gparam * last_gparam
        grew = T.minimum(delta * positiveStep, maxStep)
        shrank = T.maximum(delta * negativeStep, minStep)
        flipped = T.lt(change, 0)
        new_delta = T.switch(T.gt(change, 0), grew,
                             T.switch(flipped, shrank, delta))
        new_delta = T.cast(new_delta, delta.dtype)

        # update the weights
        updates.append((param, param - T.sgn(gparam) * new_delta))
        # store the state for the next step; a flipped gradient is stored
        # as 0 so the following step does not resize again
        updates.append((delta, new_delta))
        updates.append((last_gparam,
                        T.switch(flipped, T.zeros_like(gparam), gparam)))
    return updates
def mean_field_fancy_step(self, V, P, Mu):
    """Compute one mean-field step together with a per-unit damping factor.

    :param V: symbolic input combined with ``self.b``, ``self.beta`` and ``self.W``
    :param P: symbolic factor multiplied elementwise with ``Mu``
    :param Mu: symbolic current value; its next value ``Mu1`` is returned
    :return: ``(Q, Mu1, fancy_damp)`` where ``Q = self.Q_from_A(full_A)``,
        ``Mu1 = full_A / self.gamma`` and ``fancy_damp`` is the damping factor
    """
    weighted = P * Mu
    interaction = T.dot(T.dot(weighted, self.W.T * self.beta), self.W)
    scaled_input = self.beta * (V - self.b)
    input_term = T.dot(scaled_input, self.W)

    hidden_only = self.w * P * Mu - interaction
    total = hidden_only + input_term + self.a

    Q = self.Q_from_A(total)
    Mu1 = total / self.gamma
    hidden_only_mu = hidden_only / self.gamma

    gap = Mu - hidden_only_mu
    gap_sign = T.sgn(gap)
    gap_size = 1e-10 + abs(gap)  # small constant keeps the division finite

    # Non-positive discriminant: amplifying, so the default damping is used.
    # Positive discriminant: flipping, so max(0, lambda(tau)) is used.
    discriminant = gap_sign * Mu / gap_size
    Lambda = self.tau * discriminant - gap_sign * hidden_only_mu / gap_size

    amplifying = discriminant <= 0
    fancy_damp = (amplifying * self.s_default_damping_factor
                  + (1. - amplifying) * T.maximum(0., Lambda))
    return Q, Mu1, fancy_damp
def NTanhPInp(x, p, use_noise=1, c=0.25, half_normal=False, alpha=1.1):
    """
    Noisy Tanh units where the noise is injected to the input:
    NANI with learning p.
    This function works well with discrete switching functions.
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        p: theano shared variable, a vector of parameters for p.
        use_noise: int, whether to add noise or not to the activations,
            this is in particular useful for the test time, in order to
            disable the noise injection.
        c: float, factor multiplying the softplus term that scales the
            noise (the noise itself is drawn with std 1.0).
        half_normal: bool, whether the noise should be sampled from
            half-normal or normal distribution.
        alpha: float, only compared against 1.0: with ``half_normal`` set
            and ``alpha > 1.0`` the sign of ``c`` is flipped.
    Returns:
        HardTanh applied to the input plus the scaled noise.
    """
    logger.info("c: %f" % c)
    delta = HardTanh(x) - x
    noise = global_trng.normal(size=x.shape, avg=0, std=1.0, dtype=floatX)

    # Deterministic stand-in blended in when use_noise is 0.
    noise_det = 0.
    if half_normal:
        if alpha > 1.0:
            c *= -1
        noise_det = 0.797
        noise = abs(noise)
    elif not use_noise:
        noise = 0.

    noise = use_noise * noise + (1. - use_noise) * noise_det
    # 1e-10 guards the division when the noise is exactly zero.
    scale = c * T.nnet.softplus(p * abs(delta) / (abs(noise) + 1e-10))
    return HardTanh(x + scale * noise)
def get_updates(self, v):
    """Return the update dictionary for one contrastive-divergence step.

    :param v: symbolic input used as the start of the chain
    :return: OrderedDict holding the updates returned by ``self.CD``, those
        returned by ``self.learning_rate`` and one entry per model parameter
    """
    # Contrastive divergence
    chain_end, cd_updates = self.CD(self, chain_start=v, cdk=self.CDk)

    # [Expected] negative log-likelihood
    data_term = T.mean(self.free_energy(v), axis=0)
    model_term = T.mean(self.free_energy(chain_end), axis=0)
    cost = data_term - model_term

    # L2 Regularization
    if isinstance(self.regularize, L2Regularization):
        cost = cost + self.regularization

    # Automatic differentiation; chain_end is held constant so no gradient
    # flows through the Gibbs sampling.
    gparams = T.grad(cost, self.parameters, consider_constant=[chain_end])
    gradients = dict(zip(self.parameters, gparams))

    # Learning rates for all params given their gradient.
    lr, lr_updates = self.learning_rate(gradients)

    updates = OrderedDict()
    updates.update(cd_updates)
    updates.update(lr_updates)

    # Parameter updates
    for param, gparam in gradients.items():
        updates[param] = param - lr[param] * gparam

    # With L1 regularisation, b and W are soft-thresholded after the step.
    if isinstance(self.regularize, L1Regularization):
        for target in (self.b, self.W):
            stepped = updates[target]
            threshold = lr[target] * self.regularize.decay
            updates[target] = T.sgn(stepped) * T.maximum(abs(stepped) - threshold, 0)

    return updates
def old_rprop(param, learning_rate, gparam, mask, updates, current_cost, previous_cost,
              eta_plus=1.5, eta_minus=0.5, max_delta=50, min_delta=10e-8):
    """Append Rprop state updates to ``updates`` and return the stepped parameter.

    :param param: parameter whose evaluated shape sizes the state variables
    :param learning_rate: initial per-weight step size
    :param gparam: symbolic gradient of the parameter
    :param mask: multiplied elementwise with the increment
    :param updates: list extended in place with three ``(shared, expr)`` pairs
    :param current_cost: not used
    :param previous_cost: not used
    :param eta_plus: step multiplier when the gradient keeps its sign
    :param eta_minus: step multiplier when the gradient changes sign
    :param max_delta: upper clipping bound of the step size
    :param min_delta: lower clipping bound of the step size
    :return: symbolic expression ``param + inc * mask``
    """
    shape = param.shape.eval()
    previous_grad = sharedX(numpy.ones(shape), borrow=True)
    delta = sharedX(learning_rate * numpy.ones(shape), borrow=True)
    previous_inc = sharedX(numpy.zeros(shape), borrow=True)

    zero = T.zeros_like(param)
    change = previous_grad * gparam
    same_sign = T.gt(change, 0.)
    sign_flip = T.lt(change, 0.)

    resized = T.switch(same_sign, delta * eta_plus,
                       T.switch(sign_flip, delta * eta_minus, delta))
    new_delta = T.clip(resized, min_delta, max_delta)

    # After a sign flip no step is taken and a zero gradient is remembered;
    # in every other case the step follows the gradient sign.
    new_previous_grad = T.switch(sign_flip, zero, gparam)
    inc = T.switch(sign_flip, zero, -T.sgn(gparam) * new_delta)

    updates.append((previous_grad, new_previous_grad))
    updates.append((delta, new_delta))
    updates.append((previous_inc, inc))

    return param + inc * mask
def discrete_grads(loss,network,LR):
    """Build update rules that keep the discrete weights of ``network`` on a grid.

    Adam updates are computed first; each parameter's update is then replaced
    by a switch between two stochastically rounded candidates, selected by the
    global ``update_type``.

    Reads the module globals ``update_type``, ``best_params``, ``H``, ``N``
    and ``th``. The host-side ``random`` draws happen once, when this function
    builds the graph.

    :param loss: symbolic loss to differentiate
    :param network: lasagne network (output layer)
    :param LR: learning rate passed to adam
    :return: the updates mapping returned by ``lasagne.updates.adam`` with the
        entries of the discrete parameters replaced
    """
    global update_type,best_params,H,N,th
    # th controls the nonlinearity of the state transfer probability
    W_params = lasagne.layers.get_all_params(network, discrete=True) # all parameters tagged discrete
    layers = lasagne.layers.get_all_layers(network)
    W_grads = []
    for layer in layers:
        params = layer.get_params(discrete=True)
        if params:
            W_grads.append(theano.grad(loss, wrt=layer.W)) # here layer.W = weight_tune(param)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,params=W_params,learning_rate=LR)
    for param, parambest in izip(W_params, best_params) :
        L = 2*H/pow(2,N) # state step length in Z_N
        a=random.random() # host-side draw deciding the binary variable c
        if a<0.85:
            c = 1
        else:
            c = 0
        b=random.random()
        state_rand = T.round(b*pow(2,N))*L-H # a random state in the discrete weight space Z_N
        delta_W1 =c*(state_rand-parambest) # displacement towards state_rand when c == 1 (a < 0.85), zero otherwise
        delta_W1_direction = T.cast(T.sgn(delta_W1),theano.config.floatX)
        dis1=T.abs_(delta_W1) # the absolute distance
        k1=delta_W1_direction*T.floor(dis1/L) # the integer part
        v1=delta_W1-k1*L # the decimal part
        Prob1= T.abs_(v1/L) # the transfer probability
        Prob1 = T.tanh(th*Prob1) # the nonlinear tanh() function accelerates the state transfer
        delta_W2 = updates[param] - param
        delta_W2_direction = T.cast(T.sgn(delta_W2),theano.config.floatX)
        dis2=T.abs_(delta_W2) # the absolute distance
        k2=delta_W2_direction*T.floor(dis2/L) # the integer part
        v2=delta_W2-k2*L # the decimal part
        Prob2= T.abs_(v2/L) # the transfer probability
        Prob2 = T.tanh(th*Prob2) # the nonlinear tanh() function accelerates the state transfer
        # A fresh random stream is created for every parameter, seeded from lasagne's RNG.
        srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
        Gate1 = T.cast(srng.binomial(n=1, p=Prob1, size=T.shape(Prob1)), theano.config.floatX) # Gate1 is a binary variable with probability of Prob1 to be 1
        Gate2 = T.cast(srng.binomial(n=1, p=Prob2, size=T.shape(Prob2)),
                       theano.config.floatX) # Gate2 is a binary variable with probability of Prob2 to be 1
        delta_W1_new=(k1+delta_W1_direction*Gate1)*L # delta_W1_new = k*L where k is an integer
        updates_param1 = T.clip(parambest + delta_W1_new,-H,H)
        updates_param1 = weight_tune(updates_param1,-H,H) # fine tuning for guaranteeing each element strictly constrained in the discrete space
        delta_W2_new=(k2+delta_W2_direction*Gate2)*L # delta_W2_new = k*L where k is an integer
        updates_param2 = T.clip(param + delta_W2_new,-H,H)
        updates_param2 = weight_tune(updates_param2,-H,H) # fine tuning for guaranteeing each element strictly constrained in the discrete space
        # if update_type < 100, the weight probabilistically transfers from parambest to state_rand, which helps to search the global minimum
        # else it probabilistically transfers from param to a state nearest to updates[param]
        updates[param]= T.switch(T.lt(update_type,100), updates_param1, updates_param2)
    return updates
def __init__(self, model, e, a=0.5, verbose=2, iterator='linear'):
    """Compile training and inference functions with sign-gradient adversarial inputs.

    :param model: layer sequence; wrapped with ``init`` and stored as ``self.model``
    :param e: factor multiplying the sign of the input gradient
    :param a: weight of the clean cost; ``1 - a`` weighs the adversarial cost
    :param verbose: stored as ``self.verbose``
    :param iterator: iterator name, looked up in ``iterators`` first and in
        ``async_iterators`` if that lookup fails
    """
    self.verbose = verbose
    self.model = init(model)
    try:
        self.iterator = instantiate(iterators, iterator)
    except Exception:
        # Fall back to the asynchronous iterators; unlike a bare except this
        # lets KeyboardInterrupt and SystemExit propagate.
        self.iterator = instantiate(async_iterators, iterator)

    y_tr = self.model[-1].op({'dropout': True, 'bn_active': True, 'infer': False})
    y_te = self.model[-1].op({'dropout': False, 'bn_active': False, 'infer': False})
    y_inf = self.model[-1].op({'dropout': False, 'bn_active': True, 'infer': True})

    self.X = self.model[0].X
    self.Y = T.TensorType(theano.config.floatX, (False, ) * (len(model[-1].out_shape)))()

    cost = T.nnet.categorical_crossentropy(y_tr, self.Y).mean()
    X_adv = self.X + e * T.sgn(T.grad(cost, self.X))

    # The first layer's input is replaced by the adversarial expression before
    # the training output is rebuilt; statement order matters here.
    self.model[0].X = X_adv
    y_tr_adv = self.model[-1].op({'dropout': True, 'bn_active': True, 'infer': False})

    cost_adv = a * cost + (1. - a) * T.nnet.categorical_crossentropy(
        y_tr_adv, self.Y).mean()

    te_cost = T.nnet.categorical_crossentropy(y_te, self.Y).mean()
    X_te_adv = self.X + e * T.sgn(T.grad(te_cost, self.X))

    self.updates = collect_updates(self.model, cost_adv)
    self.infer_updates = collect_infer_updates(self.model)
    self.reset_updates = collect_reset_updates(self.model)

    self._train = theano.function([self.X, self.Y], cost_adv, updates=self.updates)
    self._predict = theano.function([self.X], y_te)
    self._fast_sign = theano.function([self.X, self.Y], X_te_adv)
    self._infer = theano.function([self.X], y_inf, updates=self.infer_updates)
    self._reset = theano.function([], updates=self.reset_updates)
def rprop(param,learning_rate,gparam,mask,updates,current_cost,previous_cost,
          eta_plus=1.2,eta_minus=0.5,max_delta=50, min_delta=10e-6):
    """Append Rprop state updates to ``updates`` and return the stepped parameter.

    Entries whose gradient is zero keep their step size, and entries whose
    masked gradient is zero keep their stored gradient and receive no
    increment.

    :param param: parameter whose evaluated shape sizes the state variables
    :param learning_rate: initial per-weight step size
    :param gparam: symbolic gradient of the parameter
    :param mask: multiplied with the gradient in the zero tests and with the
        increment
    :param updates: list extended in place with three ``(shared, expr)`` pairs
    :param current_cost: not used
    :param previous_cost: not used
    :param eta_plus: step multiplier when the gradient keeps its sign
    :param eta_minus: step multiplier when the gradient changes sign
    :param max_delta: upper clipping bound of the step size
    :param min_delta: lower clipping bound of the step size
    :return: symbolic expression ``param + inc * mask``
    """
    shape = param.shape.eval()
    previous_grad = sharedX(numpy.ones(shape),borrow=True)
    delta = sharedX(learning_rate * numpy.ones(shape),borrow=True)
    previous_inc = sharedX(numpy.zeros(shape),borrow=True)

    zero = T.zeros_like(param)
    change = previous_grad * gparam

    grad_is_zero = T.eq(gparam,0.)
    masked_grad_is_zero = T.eq(mask * gparam,0.)
    same_sign = T.gt(change,0.)
    sign_flip = T.lt(change,0.)

    resized = T.switch(same_sign, delta*eta_plus,
                       T.switch(sign_flip, delta*eta_minus, delta))
    new_delta = T.clip(T.switch(grad_is_zero, delta, resized),
                       min_delta, max_delta)

    # A sign flip stores a zero gradient and takes no step.
    new_previous_grad = T.switch(masked_grad_is_zero, previous_grad,
                                 T.switch(sign_flip, zero, gparam))
    inc = T.switch(masked_grad_is_zero, zero,
                   T.switch(sign_flip, zero, - T.sgn(gparam) * new_delta))

    updates.append((previous_grad,new_previous_grad))
    updates.append((delta,new_delta))
    updates.append((previous_inc,inc))

    return param + inc * mask
def Update(params, gradients, velocities):
    """Build momentum updates with sign-and-clipped-magnitude gradients.

    The caller's ``velocities`` list is left untouched; the stepped velocity
    expressions are kept in a local copy instead of being written back into
    the argument.

    Side effect: the module global ``LEARNING_RATE`` is multiplied by
    ``LEARNING_RATE_DECAY`` once per call.

    :param params: shared variables to update
    :param gradients: symbolic gradients, one per velocity
    :param velocities: shared velocity variables
    :return: list of ``(shared_variable, new_expression)`` pairs - first the
        velocity updates, then the parameter updates
    """
    global MOMENTUM
    global LEARNING_RATE
    global LEARNING_RATE_DECAY

    param_updates = [
        (v, v * MOMENTUM - LEARNING_RATE * T.sgn(g) * T.clip(T.abs_(g), 0.0001, 9.8))
        for g, v in zip(gradients, velocities)
    ]

    # NOTE(review): the velocity applied to the parameters clips the gradient
    # magnitude at 0.5 while the stored velocity above clips at 0.0001; both
    # bounds are kept as found - confirm which one is intended.
    stepped = list(velocities)
    for i, g in enumerate(gradients):
        stepped[i] = stepped[i] * MOMENTUM - LEARNING_RATE * T.sgn(g) * T.clip(T.abs_(g), 0.5, 9.8)

    param_updates.extend((p, p + v) for p, v in zip(params, stepped))

    LEARNING_RATE *= LEARNING_RATE_DECAY
    return param_updates
def logp(self, value):
    """
    Compute logp.

    :param value: evaluation point
    :return: log probability at evaluation point
    """
    side = tt.sgn(value)
    # Normalising constant built from the scale and symmetry attributes.
    log_norm = tt.log(self.scale / (self.symmetry + (self.symmetry**-1)))
    # value * sgn(value) is the magnitude; the symmetry exponent follows the sign.
    exponent = -value * self.scale * side * (self.symmetry**side)
    return log_norm + exponent
def earth_mover_distance_asym(y_pred, y_true, y_mask, axis_order='xy'):
    """Approximate earth mover distance needed to turn ``y_true`` into ``y_pred``.

    Both inputs are masked and normalised to sum to one, mass is first moved
    along axis 3 and then along axis 2, and the absolute amounts moved are
    summed.

    :param y_pred: prediction, reshaped to the shape of ``y_true``
    :param y_true: target; indexed as a 4-d tensor (assumes symbolic Theano
        input - TODO confirm, the augmented assignments below would modify
        NumPy inputs in place)
    :param y_mask: multiplied elementwise with both inputs
    :param axis_order: ``'yx'`` swaps the last two axes before processing
    :return: symbolic scalar with the summed amount of moved mass
    """
    y_pred = T.reshape(y_pred, y_true.shape)
    y_pred *= y_mask
    y_true *= y_mask
    # normalise both to sum to one
    y_true = y_true / y_true.sum()
    y_pred = y_pred / y_pred.sum()
    if axis_order == 'yx':
        y_true = y_true.dimshuffle([0, 1, 3, 2])
        y_pred = y_pred.dimshuffle([0, 1, 3, 2])
    # calculate approximate earth mover distance to transform probability
    # distribution y_true into y_pred
    emd = 0.0
    # calculate how much probability mass has to be moved along rows, in x direction
    diff = y_pred - y_true
    move_x = diff.sum(axis=2, keepdims=True).cumsum(axis=3)[..., :, :-1]
    # calculate from which cells to take the probability mass
    move_x_weights = diff.cumsum(axis=3)[..., :, :-1]
    # use only positions where the sign matches the sign of the total move
    move_x_weights = T.set_subtensor(
        move_x_weights[T.neq(T.sgn(move_x_weights), T.sgn(move_x)).nonzero()], 0)
    # normalize weightings to one
    # set weights uniformly to one, if all are zero
    move_x_weights = T.set_subtensor(
        move_x_weights[T.eq(move_x_weights.sum(axis=2, keepdims=True), 0).nonzero()], 1)
    move_x_weights /= move_x_weights.sum(axis=2, keepdims=True)
    # apply weighting
    move_x = move_x * move_x_weights
    emd += np.abs(move_x).sum()
    # NOTE(review): each of the next two statements adds/subtracts a full copy
    # of y_true_trans with one slice replaced, not only the moved mass -
    # confirm this is the intended transport step.
    y_true_trans = y_true
    y_true_trans += T.set_subtensor(y_true_trans[..., :, :-1], move_x)
    y_true_trans -= T.set_subtensor(y_true_trans[..., :, 1:], move_x)
    # move mass along columns, in y direction
    diff = y_pred - y_true_trans
    move_y = diff.cumsum(axis=2)[..., :-1, :]
    emd += np.abs(move_y).sum()
    # check if we get y_pred (the result is computed but not returned)
    y_true_trans2 = y_true_trans
    y_true_trans2 += T.set_subtensor(y_true_trans2[..., :-1, :], move_y)
    y_true_trans2 -= T.set_subtensor(y_true_trans2[..., 1:, :], move_y)
    return emd
def get_updates(self, params, cost):
    """Build RProp update rules for ``params`` with respect to ``cost``.

    Each parameter gets a gradient-history variable (initially ones) and a
    per-weight rate variable (initially ``self.lr``). The rate is multiplied
    by ``self.plus`` where the sign of ``history * gradient`` equals 1 and by
    ``self.minus`` everywhere else, including where that sign is 0.

    :param params: shared variables to optimise
    :param cost: symbolic scalar to differentiate
    :return: list of ``(shared_variable, new_expression)`` pairs - parameter
        updates, then gradient-history updates, then rate updates
    """
    grads = T.grad(cost, params)

    grads_history = []
    grads_rprop = []
    grads_rprop_new = []
    for param, grad in zip(params, grads):
        # Gradient history for RProp.
        grad_hist = theano.shared(param.get_value() * 0.0 + 1.0,
                                  name="rpop_hist_%s" % param)
        grads_history.append(grad_hist)

        # Variable where the rprop rates are stored.
        grad_rprop = theano.shared(param.get_value() * 0.0 + self.lr,
                                   name="rprop_%s" % param)
        grads_rprop.append(grad_rprop)

        # New RProp coefficients.
        rprop_sign = T.sgn(grad_hist * grad)
        grad_rprop_new = grad_rprop * (
            T.eq(rprop_sign, 1) * self.plus + T.neq(rprop_sign, 1) * self.minus
        )
        grads_rprop_new.append(grad_rprop_new)

    # Parameters step against the gradient sign with the new rates.
    updates = [(p, p - rg * T.sgn(g))
               for p, g, rg in zip(params, grads, grads_rprop_new)]
    # Current gradients are remembered for the next step.
    updates += [(hg, g) for hg, g in zip(grads_history, grads)]
    # The new rates are stored.
    updates += [(rg, rg_new) for rg, rg_new in zip(grads_rprop, grads_rprop_new)]
    return updates
def symGivens2(a, b):
    """
    Stable Symmetric Givens rotation plus reflection

    Parameters

        a: (theano scalar) first element of a two-vector  [a; b]
        b: (theano scalar) second element of a two-vector [a; b]
    Returns

        c  cosine(theta), where theta is the implicit angle of
           rotation (counter-clockwise) in a plane-rotation
        s  sine(theta)
        d  two-norm of [a; b]

    Description:
        This method gives c and s such that
            [ c  s ][a] = [d],
            [ s -c ][b]   [0]
        where d = two norm of vector [a, b],
            c = a / sqrt(a^2 + b^2) = a / d,
            s = b / sqrt(a^2 + b^2) = b / d.
        The implementation guards against overflow in computing
        sqrt(a^2 + b^2).

    SEE ALSO:
        (1) Algorithm 4.9, stable *unsymmetric* Givens
        rotations in Golub and van Loan's book Matrix
        Computations, 3rd edition.
        (2) MATLAB's function PLANEROT.

    Observations:
        Implementing this function as a single op in C might improve speed
        considerably ..
    """
    zero = constantX(0)
    one = constantX(1)

    a_is_zero = T.eq(a, zero)
    b_is_zero = T.eq(b, zero)
    b_dominates = T.gt(abs(b), abs(a))

    sign_a = T.sgn(a)
    sign_b = T.sgn(b)
    # Ratios are always formed with the dominating entry as the denominator
    # in the branch that is finally selected.
    norm_when_b_larger = T.sqrt(one + (a / b) ** 2)
    norm_when_a_larger = T.sqrt(one + (b / a) ** 2)

    # cosine
    c_b_zero = T.switch(a_is_zero, one, sign_a)
    c_b_larger = (a / b) * sign_b / norm_when_b_larger
    c_a_larger = sign_a / norm_when_a_larger
    c_general = T.switch(a_is_zero, zero,
                         T.switch(b_dominates, c_b_larger, c_a_larger))
    c = T.switch(b_is_zero, c_b_zero, c_general)

    # sine
    s_b_larger = sign_b / norm_when_b_larger
    s_a_larger = (b / a) * sign_a / norm_when_a_larger
    s = T.switch(b_is_zero, zero,
                 T.switch(a_is_zero, sign_b,
                          T.switch(b_dominates, s_b_larger, s_a_larger)))

    # two-norm
    d_b_larger = b / (sign_b / norm_when_b_larger)
    d_a_larger = a / (sign_a / norm_when_a_larger)
    d = T.switch(b_is_zero, abs(a),
                 T.switch(a_is_zero, abs(b),
                          T.switch(b_dominates, d_b_larger, d_a_larger)))
    return c, s, d
def model_predict(train_set_x, test_set_x, gallery_set_y, query_set_y):
    """Evaluate hash-code retrieval quality of the saved network and print it.

    The output of layer ``'A6'`` is binarised with the sign function for the
    gallery and the query set; precision/recall within Hamming radius 2 and
    the Hamming-ranking MAP / top-K precision are printed.

    Messages are emitted with single-argument ``print(...)`` calls, which
    produce the same text under Python 2 and Python 3. The label arrays are
    reshaped into column views instead of having their ``shape`` attribute
    reassigned, so the caller's arrays are not altered.

    :param train_set_x: gallery inputs fed to the network
    :param test_set_x: query inputs fed to the network
    :param gallery_set_y: gallery labels (array with one label per sample)
    :param query_set_y: query labels (array with one label per sample)
    :return: None; returns early when ``WEIGHTS_FILE_NAME`` is empty
    """
    global WEIGHTS_SAVE_PATH, WEIGHTS_FILE_NAME
    if not WEIGHTS_FILE_NAME:
        print('no weights_file, please add weights file!')
        return

    print('predict start time: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    model = build_DDN_net(HASH_NUM, SPLIT_NUM, REGULARIZER_PARAMS)
    model.load_weights(WEIGHTS_SAVE_PATH + WEIGHTS_FILE_NAME)
    Deepid_output = Model(input=model.get_layer('main_input').input,
                          output=model.get_layer('A6').output)

    gallery_set_x = Deepid_output.predict(train_set_x)
    query_set_x = Deepid_output.predict(test_set_x)

    # Binary codes are the signs of the layer outputs.
    train_binary_x = T.sgn(gallery_set_x).eval()
    test_binary_x = T.sgn(query_set_x).eval()

    # Column-vector views of the labels.
    train_data_y = np.reshape(gallery_set_y, (gallery_set_y.shape[0], 1))
    test_data_y = np.reshape(query_set_y, (query_set_y.shape[0], 1))

    # cateTrainTest[i, j] is true when gallery item i and query j share a label.
    train_y_rep = repmat(train_data_y, 1, test_data_y.shape[0])
    test_y_rep = repmat(test_data_y.T, train_data_y.shape[0], 1)
    cateTrainTest = (train_y_rep == test_y_rep)

    train_data_y = np.asarray(train_data_y + 1, dtype=int)
    test_data_y = np.asarray(test_data_y + 1, dtype=int)

    B = compactbit(train_binary_x)
    tB = compactbit(test_binary_x)

    hammRadius = 2
    hammTrainTest = hammingDist(tB, B).T

    Ret = (hammTrainTest <= hammRadius + 0.000001)
    [Pre, Rec] = evaluate_macro(cateTrainTest, Ret)
    # The doubled space reproduces the separator of the former print statement.
    print('Precision with Hamming radius_2 =  %s' % (Pre,))
    print('Recall with Hamming radius_2 =  %s' % (Rec,))

    HammingRank = np.argsort(hammTrainTest, axis=0)
    [MAP, p_topN] = cat_apcal(train_data_y, test_data_y, HammingRank, TOP_K)
    print('MAP with Hamming Ranking =  %s' % (MAP,))
    print('Precision of top %d returned = %f ' % (TOP_K, p_topN))
    print('predict finish time: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def symGivens2(a, b):
    """
    Stable Symmetric Givens rotation plus reflection

    Parameters

        a: (theano scalar) first element of a two-vector  [a; b]
        b: (theano scalar) second element of a two-vector [a; b]
    Returns

        c  cosine(theta), where theta is the implicit angle of
           rotation (counter-clockwise) in a plane-rotation
        s  sine(theta)
        d  two-norm of [a; b]

    Description:
        This method gives c and s such that
            [ c  s ][a] = [d],
            [ s -c ][b]   [0]
        where d = two norm of vector [a, b],
            c = a / sqrt(a^2 + b^2) = a / d,
            s = b / sqrt(a^2 + b^2) = b / d.
        The implementation guards against overflow in computing
        sqrt(a^2 + b^2).

    SEE ALSO:
        (1) Algorithm 4.9, stable *unsymmetric* Givens
        rotations in Golub and van Loan's book Matrix
        Computations, 3rd edition.
        (2) MATLAB's function PLANEROT.

    Observations:
        Implementing this function as a single op in C might improve speed
        considerably ..
    """
    def pick(when_b_zero, when_a_zero, when_b_larger, when_a_larger):
        # Shared four-way selection: b == 0, then a == 0, then |b| > |a|.
        return T.switch(
            T.eq(b, constantX(0)),
            when_b_zero,
            T.switch(T.eq(a, constantX(0)),
                     when_a_zero,
                     T.switch(T.gt(abs(b), abs(a)), when_b_larger, when_a_larger)),
        )

    a_over_b = a / b
    b_over_a = b / a
    root_ab = T.sqrt(constantX(1) + a_over_b ** 2)
    root_ba = T.sqrt(constantX(1) + b_over_a ** 2)

    c = pick(
        T.switch(T.eq(a, constantX(0)), constantX(1), T.sgn(a)),
        constantX(0),
        a_over_b * T.sgn(b) / root_ab,
        T.sgn(a) / root_ba,
    )
    s = pick(
        constantX(0),
        T.sgn(b),
        T.sgn(b) / root_ab,
        b_over_a * T.sgn(a) / root_ba,
    )
    d = pick(
        abs(a),
        abs(b),
        b / (T.sgn(b) / root_ab),
        a / (T.sgn(a) / root_ba),
    )
    return c, s, d
def __init__(self, x, N, D):
    """ Initialize the cost function and gradient for logistic regression

    :type x: theano.tensor.vector
    :param x: symbolic variables that describes the input

    :type N: int
    :param N: total number of train instances (not used here)

    :type D: int
    :param D: dimensionality of the feature space
    """
    # One dimensional tensor (i.e. a vector) for the weight vector.
    # borrow=True does not perform a deep copy of the variable and is faster.
    self.w = theano.shared(value=numpy.zeros(D, dtype=theano.config.floatX),
                           name='w', borrow=True)

    # Bias; the builtin float replaces the numpy.float alias, which newer
    # NumPy releases no longer provide.
    self.b = theano.shared(value=float(0), name='b')

    # Symbolic definition of the logistic sigmoid function
    self.p_y_given_x = T.nnet.sigmoid(T.dot(x, self.w) + self.b)

    # Predicted class: sign of (p - 0.5) mapped from {-1, 0, 1} to {0, 0.5, 1}
    self.y_pred = (T.sgn(self.p_y_given_x - 0.5) + 1) / 2

    # Parameters of the model
    self.params = [self.w, self.b]
def get_state(self):
    """Return the parent state extended with normalised stimulus responses.

    Adds ``'stim_response_x'`` and ``'stim_response_t'`` to the dictionary
    returned by the parent class.
    """
    state = super(LatentTypeWithTuningCurve, self).get_state()

    # Negating both the temporal and the spatial filter gives the same net
    # effect, so the filters are non-identifiable. By convention the sign
    # that makes the temporal filter most positive is chosen.
    orientation = T.sgn(T.sum(self.stim_resp_t, axis=0))
    # NOTE(review): the value returned by addbroadcast is discarded here and
    # below, so these calls do not change the variable - confirm intent.
    T.addbroadcast(orientation, 0)

    # A constant can likewise be traded between the spatial and temporal
    # pieces. By convention the temporal filter gets norm 1.
    temporal_norm = T.sqrt(T.sum(self.stim_resp_t**2, axis=0))
    T.addbroadcast(temporal_norm, 0)

    # Normalised temporal response
    temporal = orientation*(1.0/temporal_norm)*self.stim_resp_t

    # Spatial component, reshaped only in the two-dimensional case
    spatial = orientation*temporal_norm*self.stim_resp_x
    if self.spatial_ndim == 2:
        spatial = T.reshape(spatial, self.spatial_shape + (self.R,))

    state.update({'stim_response_x' : spatial,
                  'stim_response_t' : temporal})
    return state
def relevance_conv_z(out_relevances, inputs, weights, bias=None):
    """Propagate relevances backwards through a convolution.

    :param out_relevances: relevances at the convolution output
    :param inputs: input of the convolution
    :param weights: convolution filters
    :param bias: optional bias, broadcast over the second axis
    :return: relevances at the convolution input
    """
    def transposed_flipped(filters):
        # Swap the first two axes and flip both trailing axes.
        return filters.dimshuffle(1,0,2,3)[:,:,::-1,::-1]

    has_bias = bias is not None

    denominators = conv2d(inputs, weights)
    if has_bias:
        denominators = denominators + bias.dimshuffle('x',0,'x','x')

    # Stabilise: prevent division by 0 and division by small numbers.
    eps = 1e-3
    denominators = denominators + (T.sgn(denominators) * eps)
    # Exact zeros have sign 0 and are still zero at this point.
    denominators = denominators + (T.eq(denominators, 0) * eps)
    normed_relevances = out_relevances / denominators

    # upconv
    in_relevances = conv2d(normed_relevances, transposed_flipped(weights),
                           border_mode='full')
    in_relevances_proper = in_relevances * inputs

    if has_bias:
        bias_relevance = bias.dimshuffle('x',0,'x','x') * normed_relevances
        # Divide bias by weight size before convolving back
        # (mean across channel, 0, 1 dims - marked as uncertain by the
        # original author).
        weight_size = T.prod(weights.shape[1:]).astype(theano.config.floatX)
        fraction_bias = bias_relevance / weight_size
        bias_rel_in = conv2d(fraction_bias,
                             transposed_flipped(T.ones_like(weights)),
                             border_mode='full')
        in_relevances_proper = in_relevances_proper + bias_rel_in

    return in_relevances_proper
def evaluate_net(*states):
    """Compute one activation per entry of ``weights`` from the given states.

    Returns the new activations, an empty updates dictionary and a scan
    ``until`` condition that holds when every activation equals its state.
    Relies on ``weights`` and the activator constants from the enclosing scope.
    """
    activations = T.fvectors(len(weights))
    for idx, (neurons, activator, isInput, isOutput, weightFrame) in enumerate(weights):
        incoming = [T.dot(states[srcIdx], w.transpose()) for srcIdx, w in weightFrame]
        if not incoming:
            # Without incoming weights the activation is all zeros.
            activations[idx] = T.zeros_like(states[idx])
            continue

        activity = T.sum(T.stack(*incoming), axis=0)
        if activator == TIDENTITY:
            activation = activity
        elif activator == TLOGISTIC:
            activation = 1. / (1. + T.exp(-activity))
        elif activator == THYPERBOLIC:
            activation = T.tanh(activity)
        elif activator == TTHRESHOLD:
            activation = T.sgn(activity)
        elif activator == TBIAS:
            activation = T.ones_like(activity, dtype='float32')
        elif activator == TRADIAL:
            activation = T.exp(-activity*activity/2.0)
        else:
            # NOTE(review): `layer` is not defined in this function and the
            # "{0}" placeholder is never formatted - confirm where it comes from.
            raise Exception("Unknown activation function for layer {0}" + layer.id)
        activations[idx] = activation

    unchanged = [T.all(T.eq(a,s)) for a,s in zip(activations, states)]
    condition = T.all(T.as_tensor_variable(unchanged))
    return activations, {}, theano.scan_module.until(condition)
def irprop_star_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP* trainer (own experimental modification, not recommended for usage).

    For a parameter with ``n`` entries, two ``n x n`` step matrices are kept:
    one for the pairwise sums and one for the pairwise differences of the
    flattened derivative.

    :param x: forwarded unchanged to ``loss(x, y, w)``
    :param y: forwarded unchanged to ``loss(x, y, w)``
    :param w: forwarded unchanged to ``loss(x, y, w)``
    :param parameters: mapping whose values are the shared variables to train
    :param loss: callable returning the symbolic loss
    :param random_stream: not used by this trainer
    :param positive_step: step multiplier when the derivative keeps its sign
    :param negative_step: step multiplier in every other case
    :param max_step: upper clipping bound of the step
    :param min_step: lower clipping bound of the step
    :return: ``(shareds, updates)``
    """
    shareds, updates = [], []
    loss_value = loss(x, y, w)

    for _name, param in parameters.items():
        n = numpy.prod(param.get_value().shape).astype(int)
        flat_derivative = T.grad(loss_value, param).flatten()
        as_column = flat_derivative.reshape([n, 1])
        as_row = flat_derivative.reshape([1, n])

        new_param = param
        # First the pairwise sums, then the pairwise differences.
        for new_derivative in (as_column + as_row, as_column - as_row):
            delta = theano.shared(numpy.zeros([n, n], dtype=floatX) + 1e-3)
            old_derivative = theano.shared(numpy.zeros([n, n], dtype=floatX))
            agreement = new_derivative * old_derivative

            resized = T.where(agreement > 0, delta * positive_step, delta * negative_step)
            new_delta = T.clip(resized, min_step, max_step)
            updates.append([delta, new_delta])
            updates.append([old_derivative, T.where(agreement < 0, 0, new_derivative)])

            # Row sums of the signed steps, reshaped back to the parameter.
            signed_steps = new_delta * T.sgn(new_derivative)
            new_param = new_param - signed_steps.sum(axis=1).reshape(param.shape)
            shareds.extend([old_derivative, delta])

        updates.append([param, new_param])

    return shareds, updates
def Update(params, gradients, velocities):
    """Return momentum update pairs for the velocities and the parameters.

    Gradients enter as ``sign(g) * clip(|g|, low, 9.8)``. The function no
    longer overwrites entries of the ``velocities`` argument with symbolic
    expressions; a local copy holds the stepped velocities.

    Side effect: multiplies the module global ``LEARNING_RATE`` by
    ``LEARNING_RATE_DECAY`` each time it is called.

    :param params: shared variables to update
    :param gradients: symbolic gradients, one per velocity
    :param velocities: shared velocity variables
    :return: list of ``(shared_variable, new_expression)`` pairs, velocity
        updates first and parameter updates after them
    """
    global MOMENTUM
    global LEARNING_RATE
    global LEARNING_RATE_DECAY

    def clipped_step(g, low):
        # Signed gradient with its magnitude clipped to [low, 9.8].
        return LEARNING_RATE * T.sgn(g) * T.clip(T.abs_(g), low, 9.8)

    param_updates = [(v, v * MOMENTUM - clipped_step(g, 0.0001))
                     for g, v in zip(gradients, velocities)]

    # NOTE(review): the lower clip bound differs between the stored velocity
    # (0.0001) and the velocity added to the parameters (0.5); kept as found,
    # confirm which is intended.
    next_velocities = list(velocities)
    for i, g in enumerate(gradients):
        next_velocities[i] = next_velocities[i] * MOMENTUM - clipped_step(g, 0.5)

    param_updates.extend([(p, p + v) for p, v in zip(params, next_velocities)])

    LEARNING_RATE *= LEARNING_RATE_DECAY
    return param_updates
def fd3(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    """Build the T1 and T2 updates of the finite-difference step.

    :param mlp: model exposing ``classError1``, ``penalty``, ``paramsT1`` and
        ``paramsT2``
    :param fdm: object exposing ``updateC1T1``, ``updateC2T1`` and ``gradC1T2``
    :param params: settings object; ``opt2`` and ``T2onlySGN`` are read here
    :param globalLR1: multiplies the gradient proxy; also passed to ``update_fun``
    :param globalLR2: passed to ``update_fun``
    :param momentParam1: passed to ``update_fun``
    :param momentParam2: passed to ``update_fun``
    :return: ``(updateT1 + updateT2, debugs)``
    """
    cost1 = mlp.classError1 + mlp.penalty
    gradT1reg = T.grad(cost1, mlp.paramsT2)

    # take opt from Adam?
    opt2 = adam() if params.opt2 in ['adam'] else None

    # update W - (1) + (3)
    updateT1 = [(param, param + uC1 - uC2)
                for param, uC1, uC2 in zip(mlp.paramsT1, fdm.updateC1T1, fdm.updateC2T1)]

    # compute grad T2 of C1, update T2 - [(4) - (2) ] / lr1
    updateT2 = []
    onlyT2param = []
    for param, grad, gT2 in zip(mlp.paramsT2, gradT1reg, fdm.gradC1T2):
        # `step` is not defined in this function; it is looked up from the
        # enclosing scope.
        grad_proxi = (grad - gT2) / step * globalLR1
        if params.T2onlySGN:
            grad_proxi = T.sgn(grad_proxi)

        tempUp, tempPair, _ = update_fun(param, T.reshape(grad_proxi, param.shape), None,
                                         'T2', {}, opt2, params,
                                         globalLR1, globalLR2, momentParam1, momentParam2)
        updateT2 += tempUp
        onlyT2param += tempPair

    debugs = [check for (_, check) in onlyT2param]
    return updateT1 + updateT2, debugs
def irprop_minus_trainer(x, y, w, parameters, loss, random_stream,
                         positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP- is batch trainer, for details see
    http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428 .
    This is default trainer, very stable for classification.

    :param x: forwarded unchanged to ``loss(x, y, w)``
    :param y: forwarded unchanged to ``loss(x, y, w)``
    :param w: forwarded unchanged to ``loss(x, y, w)``
    :param parameters: mapping whose values are the shared variables to train
    :param loss: callable returning the symbolic loss
    :param random_stream: not used by this trainer
    :param positive_step: factor the step is multiplied by when the new and
        the stored derivative have the same sign
    :param negative_step: factor the step is multiplied by in every other case
    :param min_step: minimal change of weight during iteration
    :param max_step: maximal change of weight during iteration
    :return: ``(shareds, updates)`` - the created shared variables and a list
        of ``[shared_variable, new_expression]`` pairs
    """
    shareds, updates = [], []
    loss_value = loss(x, y, w)

    for _name, param in parameters.items():
        zeros = param.get_value() * 0.
        old_derivative = theano.shared(zeros)
        delta = theano.shared(zeros + 1e-3)
        shareds.extend([old_derivative, delta])

        new_derivative = T.grad(loss_value, param)
        agreement = new_derivative * old_derivative

        resized = T.where(agreement > 0, delta * positive_step, delta * negative_step)
        new_delta = T.clip(resized, min_step, max_step)

        updates.append([param, param - new_delta * T.sgn(new_derivative)])
        updates.append([delta, new_delta])
        # A derivative that changed sign is stored as 0.
        updates.append([old_derivative, T.where(agreement < 0, 0, new_derivative)])

    return shareds, updates
def model_mod_act(nx=4, nh=100, ny=1, p=None, tau = 10., seed = 0):
    """Build the symbolic graph of a recurrent network with a multiplicative modulator.

    :param nx: number of input units
    :param nh: number of hidden units
    :param ny: number of output units
    :param p: optional list ``[Wx, Wh, Wy, bh, by]`` of existing shared
        parameters; fresh ones are created when it is ``None``
    :param tau: divisor applied to the hidden-state increment
    :param seed: seed passed to ``np.random.seed`` before initialisation
    :return: ``(p, [x, rho_h, mod, t], y_T, [loss, acc], h, ha, y)``
    """
    np.random.seed(seed)

    # Identity test: ``p == None`` would be evaluated element-wise if an
    # array were passed.
    if p is None:
        Wx = theano.shared(np.random.normal(0., 0.5, (nx, nh)).astype(theano.config.floatX))
        Wh = theano.shared(np.random.normal(0., 1./nh, (nh, nh)).astype(theano.config.floatX))
        Wy = theano.shared(np.zeros((nh, ny), dtype=theano.config.floatX))
        bh = theano.shared(np.zeros(nh, dtype=theano.config.floatX))
        by = theano.shared(np.zeros(ny, dtype=theano.config.floatX))
        p = [Wx, Wh, Wy, bh, by]
    else:
        Wx, Wh, Wy, bh, by = p[0], p[1], p[2], p[3], p[4]

    h0 = theano.shared(np.zeros(nh, dtype=theano.config.floatX))

    x = T.matrix('input_x')
    rho_h = T.matrix('rho_h')
    t = T.scalar('teachSig')
    mod = T.matrix('modulator')

    def recurrence(x_t, rho_h_t, mod_t, h_tm1):
        # The drive (input, recurrence and rho_h) is gated by the modulator
        # and added to a leak towards the bias, all scaled by 1 / tau.
        dh = (-h_tm1 + bh + mod_t * (T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + rho_h_t)) / tau
        ha_t = h_tm1 + dh
        h_t = T.tanh(ha_t)
        s_t = T.dot(h_t, Wy) + by
        return [ha_t, h_t, s_t]

    # Only the second output (h) is recurrent.  The scanned h and the scan
    # updates are not used: h is recomputed from ha below.
    ([ha, _, y], _) = theano.scan(fn=recurrence,
                                  sequences=[x, rho_h, mod],
                                  outputs_info=[dict(), h0, dict()])
    h = T.tanh(ha)

    y_0 = y[0, 0]
    y_T = y[-1, 0]

    # The first output is pulled towards 0 and the last one towards t.
    loss = (((0. - y_0) ** 2.) + ((t - y_T) ** 2.)) / 2.
    # Non-zero when the sign of the last output differs from t.
    acc = T.neq(T.sgn(y_T), t)

    return p, [x, rho_h, mod, t], y_T, [loss, acc], h, ha, y
def __abs__(self, other=None):
    """Return a copy of this layer whose output is the absolute value of ``out``.

    If the layer carries ``grads``, each of them is multiplied by the sign of
    the original (pre-``abs``) output.

    :param other: ignored.  It now defaults to ``None`` because the builtin
        ``abs(layer)`` calls ``__abs__`` without arguments; existing callers
        that pass a value keep working.
    :return: the modified copy; ``self`` is left untouched
    """
    assert hasattr(self, 'out'), 'all layers need a default output'
    new_obj = utils.copy(self)
    # Keep the signed output: the sign has to be taken before abs() is
    # applied, otherwise it can never be negative.
    signed_out = new_obj.out
    new_obj.out = abs(signed_out)
    if hasattr(new_obj, 'grads'):
        out_sign = TT.sgn(signed_out)
        new_obj.grads = [out_sign * x for x in new_obj.grads]
    return new_obj
def adExample(self, X, y, weights, network, input_var):
    """Compute adversarial examples ``X + eps * sign(d loss / d input)``.

    :param X: input images, written into a
        ``(self.num_images, 3, 224, 224)`` buffer
    :param y: target labels passed to the compiled function
    :param weights: parameter values loaded into ``network``
    :param network: lasagne network whose output is attacked
    :param input_var: symbolic input variable of ``network``
    :return: the perturbed examples as computed by the compiled function
    :raises ValueError: if ``self.loss`` is neither ``'softmax'`` nor ``'svm'``
    """
    target_var = T.ivector('targets')
    prediction = lasagne.layers.get_output(network)

    # Compare strings by value; identity (``is``) only works by accident of
    # string interning.
    if self.loss == 'softmax':
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    elif self.loss == 'svm':
        loss = lasagne.objectives.multiclass_hinge_loss(prediction, target_var)
    else:
        raise ValueError('Unsupported loss: %r' % (self.loss,))
    loss = loss.mean()

    lasagne.layers.set_all_param_values(network, weights)

    Xnew = np.zeros((self.num_images, 3, 224, 224))
    Xnew[:, :, :, :] = X

    grad = T.grad(loss, input_var)
    final_examples = X + self.eps * T.sgn(grad)

    func1 = theano.function([input_var, target_var], final_examples, allow_input_downcast=True)
    return func1(Xnew, y)
def get_function(self, func_name):
    """Return the activation callable registered under ``func_name``.

    Returns ``None`` (after logging an error) for ``'fastsigmoid'`` and for
    unknown names.
    """
    def _hardtanh():
        L.warning('Current hardTanh implementation is slow!')
        return lambda x: ((abs(x) <= 1) * x) + ((1 < abs(x)) * T.sgn(x))

    def _fastsigmoid():
        L.error('T.nnet.ultra_fast_sigmoid function has some problems')
        return None

    # Each entry is a zero-argument factory so that nothing is looked up on
    # ``T`` (and nothing is logged) unless that entry is actually requested.
    factories = {
        'tanh': lambda: T.tanh,
        'hardtanh': _hardtanh,
        'xtanh': lambda: (lambda x: T.tanh(x) + 0.1 * x),
        'sigmoid': lambda: T.nnet.sigmoid,
        'fastsigmoid': _fastsigmoid,
        'hardsigmoid': lambda: T.nnet.hard_sigmoid,
        'xsigmoid': lambda: (lambda x: T.nnet.sigmoid(x) + 0.1 * x),
        'softplus': lambda: T.nnet.softplus,
        'relu': lambda: (lambda x: x * (x > 0)),
        'leakyrelu': lambda: (lambda x: T.maximum(x, 0.01 * x)),
        'cappedrelu': lambda: (lambda x: T.minimum(x * (x > 0), 6)),
        'softmax': lambda: T.nnet.softmax,
        'norm1': lambda: (lambda x: x / T.nlinalg.norm(x, 1)),
        'norm2': lambda: (lambda x: x / T.dot(x, x)**0.5),
    }

    factory = factories.get(func_name)
    if factory is None:
        L.error('Invalid function name given: ' + func_name)
        return None
    return factory()
def rprop_core(params, gradients, rprop_increase=1.01, rprop_decrease=0.99, rprop_min_step=0,
               rprop_max_step=100, learning_rate=0.01):
    """Yield Rprop ``(variable, new_expression)`` update pairs.

    See http://sci2s.ugr.es/keel/pdf/algorithm/articulo/2003-Neuro-Igel-IRprop+.pdf.

    For every parameter three pairs are produced, in this order: the
    parameter itself, the stored previous gradient and the stored step size.
    """
    for weight, raw_grad in zip(params, gradients):
        prev_grad = theano.shared(np.zeros_like(weight.get_value()),
                                  name=weight.name + '_grad')
        prev_step = theano.shared(np.zeros_like(weight.get_value()) + learning_rate,
                                  name=weight.name + '_step')

        product = raw_grad * prev_grad
        agree = T.gt(product, 0)
        flipped = T.lt(product, 0)

        # Factor is 1 where the product is zero, otherwise the increase or
        # decrease rate.
        factor = T.eq(product, 0) + agree * rprop_increase + flipped * rprop_decrease
        new_step = T.minimum(rprop_max_step, T.maximum(rprop_min_step, prev_step * factor))

        # Gradient entries whose sign flipped are zeroed before being used
        # and stored.
        kept_grad = raw_grad - flipped * raw_grad

        yield weight, weight - T.sgn(kept_grad) * new_step
        yield prev_grad, kept_grad
        yield prev_step, new_step
def fd3(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    """Assemble parameter updates for the T1 group and the T2 group.

    * T1: every parameter is shifted by ``uC1 - uC2`` (terms read from ``fdm``).
    * T2: the proxy gradient ``(grad - gT2) / step * globalLR1`` (or only its
      sign if ``params.T2onlySGN``) is reshaped to the parameter's shape and
      passed to ``update_fun``.

    NOTE(review): ``step`` is a free name here; presumably defined at module
    level -- confirm.

    :return: tuple ``(T1 updates + T2 updates, debug values)``
    """
    cost_with_penalty = mlp.classError1 + mlp.penalty
    grads_wrt_t2 = T.grad(cost_with_penalty, mlp.paramsT2)

    # take opt from Adam?
    second_opt = None
    if params.opt2 in ['adam']:
        second_opt = adam()

    # update W - (1) + (3)
    updates_t1 = []
    for w, plus_term, minus_term in zip(mlp.paramsT1, fdm.updateC1T1, fdm.updateC2T1):
        updates_t1.append((w, w + plus_term - minus_term))

    # compute grad T2 of C1, update T2 - [(4) - (2) ] / lr1
    updates_t2 = []
    tracked_pairs = []
    for h, g_new, g_old in zip(mlp.paramsT2, grads_wrt_t2, fdm.gradC1T2):
        if params.T2onlySGN:
            approx = T.sgn((g_new - g_old) / step * globalLR1)
        else:
            approx = (g_new - g_old) / step * globalLR1
        step_updates, step_pairs, _ = update_fun(
            h, T.reshape(approx, h.shape), None, 'T2', {}, second_opt, params,
            globalLR1, globalLR2, momentParam1, momentParam2)
        updates_t2 += step_updates
        tracked_pairs += step_pairs

    debug_values = [pair[1] for pair in tracked_pairs]
    return updates_t1 + updates_t2, debug_values
def relevance_conv_z(out_relevances, inputs, weights, bias=None):
    """Propagate relevances backwards through a convolution.

    The output relevances are divided by the stabilised forward activations,
    convolved back with the flipped, channel-swapped weights and multiplied
    by the inputs.  When a bias is given, its share of the relevance is
    spread back through an all-ones kernel and added.
    """
    has_bias = bias is not None

    denominators = conv2d(inputs, weights)
    if has_bias:
        denominators = denominators + bias.dimshuffle("x", 0, "x", "x")

    # Stabilise: push values away from zero by eps in the direction of their
    # sign, then move exact zeros (sign 0) to eps as well.
    eps = 1e-4
    denominators = denominators + T.sgn(denominators) * eps
    denominators = denominators + T.eq(denominators, 0) * eps
    scaled = out_relevances / denominators

    # "Up-convolution": swap in/out channels and flip both spatial axes.
    flipped_weights = weights.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    in_relevances = conv2d(scaled, flipped_weights, border_mode="full") * inputs
    if not has_bias:
        return in_relevances

    bias_share = bias.dimshuffle("x", 0, "x", "x") * scaled
    # Divide bias by weight size before convolving back
    # mean across channel, 0, 1 dims (hope this is correct?)
    weights_per_filter = T.prod(weights.shape[1:]).astype(theano.config.floatX)
    ones_kernel = T.ones_like(weights).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    bias_back = conv2d(bias_share / weights_per_filter, ones_kernel, border_mode="full")
    return in_relevances + bias_back
def learning_updates(self):
    """Yield Rprop ``(variable, new_expression)`` update pairs.

    Creates one stored-gradient and one step-size shared variable per
    parameter (kept in ``self.grads`` and ``self.steps``), then yields, for
    every parameter, the update of the parameter, of its stored gradient and
    of its step size.
    """
    initial_step = self.learning_rate
    self.grads = []
    self.steps = []
    for p in self.params:
        value = p.get_value()
        label = p.name
        self.grads.append(theano.shared(np.zeros_like(value), name=label + '_grad'))
        self.steps.append(theano.shared(np.zeros_like(value) + initial_step, name=label + '_step'))

    for p, prev_step, prev_grad in zip(self.params, self.steps, self.grads):
        raw_grad = TT.grad(self.J, p)
        product = raw_grad * prev_grad
        agree = TT.gt(product, 0)
        flipped = TT.lt(product, 0)

        # Factor is 1 where the product is zero, otherwise the increase or
        # decrease rate.
        factor = TT.eq(product, 0) + agree * self.step_increase + flipped * self.step_decrease
        new_step = TT.minimum(self.max_step, TT.maximum(self.min_step, prev_step * factor))

        # Entries whose sign flipped are zeroed before use and storage.
        kept_grad = raw_grad - flipped * raw_grad

        yield p, p - TT.sgn(kept_grad) * new_step
        yield prev_grad, kept_grad
        yield prev_step, new_step
def __init__(self,input,response):
    """Build a hinge-style loss between ``input.output`` and ``response.resp``.

    The summed loss is stored in ``self.hengeloss``; ``self.output`` and
    ``self.output_shape`` mirror the response.
    """
    resp = response.resp
    sign_mask = T.sgn(resp)
    # Positive targets are used as they are, all others are shifted up by one.
    shifted_targets = T.switch(T.gt(resp, 0), resp, 1 + resp)
    margins = sign_mask * (shifted_targets - input.output)
    # Only positive margins contribute.
    self.hengeloss = T.sum(margins.clip(0, 1e10))
    self.output = response.resp
    self.output_shape = response.resp_shape
def rprop_updates(cost, params):
    """Return the list of Rprop ``(variable, new_expression)`` updates.

    Each parameter gets a per-element rate (initially 0.005) and a stored
    gradient sign (initially 0).  Where the sign differs from the stored one
    the parameter is left unchanged and the rate is halved; elsewhere the
    parameter moves by ``rate * sign`` and the rate grows by 1.2.  Rates are
    clipped to ``[1e-6, 50]``.
    """
    start_rate = 0.005
    rate_floor, rate_ceiling = 1e-6, 50
    shrink, grow = 0.5, 1.2

    def _state(param, scale=None):
        # Array of ones (scaled) or zeros with the parameter's shape.
        shape = param.get_value(borrow=True).shape
        if scale is None:
            return shared(numpy.zeros(shape, dtype=theano.config.floatX))
        return shared(scale * numpy.ones(shape, dtype=theano.config.floatX))

    rates = [_state(p, start_rate) for p in params]
    stored_signs = [_state(p) for p in params]

    updates = []
    for param, rate, old_sign in zip(params, rates, stored_signs):
        new_sign = T.sgn(T.grad(cost, param))
        flipped = T.neq(old_sign, new_sign)
        next_rate = T.clip(T.switch(flipped, shrink * rate, grow * rate),
                           rate_floor, rate_ceiling)
        updates += [
            (param, T.switch(flipped, param, param - rate * new_sign)),
            (rate, next_rate),
            (old_sign, new_sign),
        ]
    return updates
def __init__(self, input, n_in, unit_input_step, threshold, initVmem=0.5, batchsize=1000):
    """Set up the membrane-potential graph of a signed spiking input layer.

    :param input: input real-valued matrix, shape = (batchsize, n_in)
    :param n_in: number of input units
    :param unit_input_step: factor turning ``|input|`` into a potential increment
    :param threshold: potential at which a unit emits a spike
    :param initVmem: initial potential as a fraction of ``threshold``
    :param batchsize: number of rows of the stored potential
    """
    # Plain configuration.
    self.input = input
    self.n_in = n_in
    self.unit_input_step = unit_input_step
    self.threshold = threshold
    self.initVmem = initVmem
    self.batchsize = batchsize

    # Sign and magnitude are handled separately: the magnitude charges the
    # potential, the sign is re-attached to the emitted spike.
    self.signs = T.sgn(input)
    self.input_abs = abs(input)
    self.incremental_step = self.input_abs * unit_input_step

    resting_level = np.float32(initVmem * threshold)
    self.Vmem_init = np.zeros((batchsize, n_in), dtype=theano.config.floatX) + resting_level
    self.Vmem = theano.shared(self.Vmem_init, name='Vmem', borrow=True)

    charged = self.Vmem + self.incremental_step
    self.Vmem_update_prespike = charged
    self.output = T.ge(charged, threshold) * self.signs
    # A spike removes one threshold worth of potential.
    self.Vmem_update_postspike = T.cast(
        charged - abs(self.output) * threshold, theano.config.floatX)
def discrete_grads(loss,network,LR): global update_type,best_params,H,N,th # th is a parameter that controls the nonlinearity of state transfer probability W_params = lasagne.layers.get_all_params(network, discrete=True) #Get all the weight parameters layers = lasagne.layers.get_all_layers(network) W_grads = [] for layer in layers: params = layer.get_params(discrete=True) if params: W_grads.append(theano.grad(loss, wrt=layer.W)) #Here layer.W = weight_tune(param) updates = lasagne.updates.adam(loss_or_grads=W_grads,params=W_params,learning_rate=LR) for param, parambest in izip(W_params, best_params) : L = 2*H/pow(2,N) #state step length in Z_N a=random.random() #c is a random variable with binary value if a<0.85: c = 1 else: c = 0 b=random.random() state_rand = T.round(b*pow(2,N))*L-H #state_rand is a random state in the discrete weight space Z_N delta_W1 =c*(state_rand-parambest)#parambest would transfer to state_rand with probability of a, or keep unmoved with probability of 1-a delta_W1_direction = T.cast(T.sgn(delta_W1),theano.config.floatX) dis1=T.abs_(delta_W1) #the absolute distance k1=delta_W1_direction*T.floor(dis1/L) #the integer part v1=delta_W1-k1*L #the decimal part Prob1= T.abs_(v1/L) #the transfer probability Prob1 = T.tanh(th*Prob1) #the nonlinear tanh() function accelerates the state transfer
def discretized_laplace(mean, logscale, binsize, sample=None):
    """Discretised Laplace random variable with bins of width ``binsize``.

    When no ``sample`` is supplied one is drawn by inverse-CDF sampling and
    floored onto the bin grid.  The log-probability is the log of the
    probability mass of the bin of half-width ``binsize / 2`` around each
    sample, summed over all but the first axis.

    NOTE(review): the discretisation step is assumed to apply only to freshly
    drawn samples (i.e. to live inside the ``sample is None`` branch) --
    confirm against the original layout.

    :return: ``RandomVariable(sample, logp, None, mean=mean, scale=scale)``
    """
    scale = .5 * T.exp(logscale)
    if sample is None:
        u = G.rng_curand.uniform(size=mean.shape) - .5
        continuous = mean - scale * T.sgn(u) * T.log(1 - 2 * abs(u))
        sample = T.floor(continuous / binsize) * binsize  # discretize the sample

    half_bin = .5 * binsize

    def _cdf(value):
        centred = value - mean
        return .5 + .5 * T.sgn(centred) * (1. - T.exp(-abs(centred) / scale))

    distance = abs(sample - mean)

    # General bin mass; numerically unstable for large |x-mean|/scale.
    mass_general = T.log(_cdf(sample + half_bin) - _cdf(sample - half_bin) + 1e-7)
    # Closed form, only valid for |x-mean| >= half_bin.
    mass_tail = (-distance / scale
                 + T.log(T.exp(half_bin / scale) - T.exp(-half_bin / scale))
                 - np.log(2.).astype(G.floatX))

    near_mean = (distance < half_bin)
    log_mass = near_mean * mass_general + (1 - near_mean) * mass_tail

    logp = log_mass.flatten(2).sum(axis=1)
    entr = None  # entropy is not computed for the discretised variable
    return RandomVariable(sample, logp, entr, mean=mean, scale=scale)
def iRpropPlus( cost, params, c, positiveStep=np.float32(1.2), negativeStep=np.float32(0.5), maxStep=np.float32(50), minStep=np.float32(1e-6), ): updates = [] for layerParams in params: for param in layerParams: lastParamGradSign = theano.shared(param.get_value(borrow=True) * 0.0) lastParamDelta = theano.shared(param.get_value(borrow=True) * 0.1) lastCost = theano.shared(np.float32(np.inf)) gradient = T.grad(cost=cost, wrt=param, disconnected_inputs="raise") change = T.sgn(lastParamGradSign * gradient) changePos = T.gt(change, 0.0).astype(theano.config.floatX) changeNeg = T.lt(change, 0.0).astype(theano.config.floatX) changeZero = T.eq(change, 0.0).astype(theano.config.floatX) costInc = T.gt(cost, lastCost).astype(theano.config.floatX) newParam = ( param - changePos * T.sgn(gradient) * T.minimum(lastParamDelta * positiveStep, maxStep) + changeNeg * costInc * lastParamGradSign * lastParamDelta - changeZero * T.sgn(gradient) * lastParamDelta ) # max-norm regularization newParam = maxNormReg(newParam, c, epsilon) newLastParamDelta = ( changePos * T.minimum(lastParamDelta * positiveStep, maxStep).astype(theano.config.floatX) + changeNeg * T.maximum(lastParamDelta * negativeStep, minStep).astype(theano.config.floatX) + changeZero * lastParamDelta.astype(theano.config.floatX) ) newLastParamGradSign = ( changePos * T.sgn(gradient).astype(theano.config.floatX) + changeNeg * 0 + changeZero * T.sgn(gradient).astype(theano.config.floatX) ) updates.append((param, newParam)) updates.append((lastParamDelta, newLastParamDelta)) updates.append((lastParamGradSign, newLastParamGradSign)) updates.append((lastCost, cost)) return updates
def forward_theano(self, x):
    """Symbolic forward pass: ``erf(x / sqrt(2))`` inside ``|x| < c``, a
    signed square-root branch built from ``alpha``, ``beta`` and ``gamma``
    outside."""
    magnitude = tt.abs_(x)
    inner = tt.erf(x / 2.**0.5)
    discriminant = self.beta**2 - 4 * self.alpha * (self.gamma - magnitude)
    outer = ((discriminant**0.5 - self.beta) / (2 * self.alpha)) * tt.sgn(x)
    return tt.switch(magnitude < self.c, inner, outer)
def laplace_diag(mean, logscale, sample=None):
    """Element-wise Laplace random variable.

    Draws a sample by inverse-CDF sampling when none is given; ``logp`` and
    ``entr`` are summed over all but the first axis.

    :return: ``RandomVariable(sample, logp, entr, mean=mean, scale=scale)``
    """
    def _sum_per_row(tensor):
        return tensor.flatten(2).sum(axis=1)

    scale = .5 * T.exp(logscale)
    if sample is None:
        centred_uniform = G.rng_curand.uniform(size=mean.shape) - .5
        sample = mean - scale * T.sgn(centred_uniform) * T.log(1 - 2 * abs(centred_uniform))

    logp = _sum_per_row(- logscale - abs(sample - mean) / scale)
    entr = _sum_per_row(1 + logscale)
    return RandomVariable(sample, logp, entr, mean=mean, scale=scale)
def _laplace(trng, p, size=None):
    """Build a Laplace-style sample expression from packed parameters ``p``.

    The last axis of ``p`` is split in two halves with ``_slice``: the first
    is used as the location, the second as the log of the scale.
    """
    half = p.shape[p.ndim-1] // 2
    location = _slice(p, 0, half)
    log_scale = _slice(p, 1, half)
    if size is None:
        size = location.shape
    # Uniform noise shifted to [-0.5, 0.5).
    centred = trng.uniform(size=size, dtype=floatX) - 0.5
    return location + T.exp(log_scale) * T.sgn(centred) * T.log(1.0 - 2 * abs(centred))
def NSigmoidP(x, p, use_noise=1, alpha=1.1, c=0.15, noise=None, half_normal=True):
    """
    Noisy Sigmoid Tanh Units: NAN with learning p
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        p: theano shared variable, a vector of parameters for p.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        alpha: float, the leakage rate from the linearized function to the nonlinear one.
        c: float, standard deviation of the noise
        noise: optional noise tensor; standard normal noise is sampled when
        it is not given.
        half_normal: bool, whether the noise should be sampled from half-normal or
        normal distribution.
    """
    linearised = 0.25 * x + 0.5
    logger.info("c: %f" % c)

    gap = HardSigmoid(x) - linearised
    scale = c * (T.nnet.sigmoid(p * gap) - 0.5)**2

    if not noise:
        noise = global_trng.normal(size=x.shape, avg=0, std=1.0, dtype=floatX)

    # Value that replaces the noise when noise is switched off.
    deterministic = 0.
    if half_normal:
        if alpha > 1.0:
            scale *= -1.
        if use_noise:
            noise = abs(noise)
        else:
            deterministic = numpy.float32(0.797)

    blended = use_noise * noise + (1. - use_noise) * deterministic
    return (alpha * HardSigmoid(x)
            + (1. - alpha) * linearised
            - T.sgn(x) * scale * blended)
def get_perturbation(self, dir, epsilon):
    """Scale the direction ``dir`` according to ``self.norm_constraint``.

    * ``'max'``: ``epsilon * sign(dir)``.
    * ``'L2'``: ``dir`` is normalised with ``self.get_normalized_vector`` and
      scaled by ``epsilon * sqrt(self.layer_sizes[0])``.

    :return: the perturbation, or ``None`` for any other constraint
    """
    # print() with a single string behaves the same on Python 2 and 3.
    if self.norm_constraint == 'max':
        print('perturb:max')
        return epsilon * T.sgn(dir)
    if self.norm_constraint == 'L2':
        print('perturb:L2')
        dir = self.get_normalized_vector(dir)
        # The builtin float replaces the removed ``numpy.float`` alias.
        dir = epsilon * float(numpy.sqrt(self.layer_sizes[0])) * dir
        return dir
    return None
def rprop_plus_updates(params, grads):
    """Return RPROP+ (Rprop with weight backtracking) updates.

    The optimiser state (previous gradient, step size, previous weight
    change) is kept in shared variables and the three cases of the algorithm
    are selected element-wise with ``T.switch``.  A Python ``if`` on a
    symbolic comparison cannot branch per element, and state held in NumPy
    constants is never carried over to the next call of the compiled
    function.

    :param params: list of shared variables to optimise
    :param grads: list of symbolic gradients, aligned with ``params``
    :return: list of ``(shared_variable, new_expression)`` pairs covering the
        parameters and the optimiser state
    """
    # Local import: this block did not reference ``theano`` directly before.
    import theano

    # RPROP+ parameters
    initialDelta = 0.1
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50.
    # NOTE(review): exp(-6) is about 2.5e-3; 1e-6 may have been intended -- confirm.
    minStep = math.exp(-6)

    updates = []
    for param, gparam in zip(params, grads):
        value = param.get_value(borrow=True)
        last_gparam = theano.shared(numpy.zeros_like(value))
        delta = theano.shared(numpy.zeros_like(value) + initialDelta)
        last_weight_change = theano.shared(numpy.zeros_like(value))

        change = T.sgn(gparam * last_gparam)
        same_sign = T.gt(change, 0)
        sign_flip = T.lt(change, 0)

        # Grow the step while the sign is stable, shrink it after a flip,
        # keep it when either gradient is zero.
        new_delta = T.switch(
            same_sign,
            T.minimum(delta * positiveStep, maxStep),
            T.switch(sign_flip, T.maximum(delta * negativeStep, minStep), delta))

        # After a flip the previous step is undone (backtracking); otherwise
        # move against the gradient sign by the current step.
        weight_change = T.switch(sign_flip,
                                 -last_weight_change,
                                 T.sgn(gparam) * new_delta)

        # A zeroed stored gradient prevents a second adaptation in the next
        # iteration.
        new_last_gparam = T.switch(sign_flip, T.zeros_like(gparam), gparam)

        # Casts keep every update in the dtype of the variable it replaces.
        updates.append((param, T.cast(param - weight_change, param.dtype)))
        updates.append((delta, T.cast(new_delta, delta.dtype)))
        updates.append((last_gparam, T.cast(new_last_gparam, last_gparam.dtype)))
        updates.append((last_weight_change,
                        T.cast(weight_change, last_weight_change.dtype)))

    return updates
def NSigmoid(x, use_noise=1, alpha=1.15, c=0.25, threshold=2.0,half_normal=False):
    """
    Noisy Hard Sigmoid Units: NAN without learning p
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        alpha: the leaking rate from the linearized function to the nonlinear one.
        c: float, standard deviation of the noise
        threshold: float, magnitude of x from which the noisy branch is used
        instead of HardSigmoid.
        half_normal: bool, whether the absolute value of the noise is used.
    """
    logger.info("c: %f" % c)

    overshoot = abs(x) - threshold
    scale = c * (T.nnet.sigmoid(overshoot**2) - 0.5)**2
    noise = global_trng.normal(size=x.shape, avg=0, std=1.0, dtype=floatX)

    if half_normal:
        if alpha > 1.0:
            scale *= -1
        # With noise disabled a fixed value stands in for the noise.
        noise = abs(noise) if use_noise else 0.797
    elif not use_noise:
        noise = 0.

    shrink = scale * noise + alpha * overshoot
    noisy_branch = x - T.sgn(x) * shrink

    beyond = T.cast(T.ge(abs(x), threshold), floatX)
    return beyond * noisy_branch + (1. - beyond) * HardSigmoid(x)
def get_xelm_predict_function(f_name):
    """Compile ``sign(metric(X, W) . beta)`` for the metric named ``f_name``.

    :return: a theano function taking ``(X, W, beta)`` as double matrices
    """
    symbols = [T.dmatrix('X'), T.dmatrix('W'), T.dmatrix('beta')]
    x_sym, w_sym, beta_sym = symbols
    hidden = metric_theano[f_name](x_sym, w_sym)
    prediction = T.sgn(T.dot(hidden, beta_sym))
    return theano.function(symbols, prediction)
def __init__(self, input, inputLabels, nrLayers, initialWeights, initialBiases,
             activationFunction, classificationActivationFunction,
             visibleDropout, hiddenDropout,
             adversarial_training, adversarial_epsilon, adversarial_coefficient,
             training_options):
    """Create the shared parameters and the symbolic forward pass.

    ``nrLayers - 1`` weight/bias pairs are wrapped in shared variables and
    handed to the superclass.  When ``adversarial_training`` is set, a second
    forward pass is built on the input perturbed by
    ``adversarial_epsilon * sign(d error / d input)``.
    """
    self.input = input
    self.inputLabels = inputLabels

    # If we should use adversarial training or not
    self.adversarial_training = adversarial_training
    self.adversarial_coefficient = adversarial_coefficient
    self.adversarial_epsilon = adversarial_epsilon

    self.visibleDropout = visibleDropout
    self.hiddenDropout = hiddenDropout
    self.activationFunction = activationFunction
    self.classificationActivationFunction = classificationActivationFunction

    self.training_options = training_options

    # Let's initialize the fields
    # The weights and biases, make them shared variables
    nrWeights = nrLayers - 1
    self.nrWeights = nrWeights

    biases = []
    weights = []
    # range() works on both Python 2 and 3; xrange exists only on Python 2.
    for i in range(nrWeights):
        w = theano.shared(value=np.asarray(initialWeights[i], dtype=theanoFloat),
                          name='W')
        weights.append(w)

        b = theano.shared(value=np.asarray(initialBiases[i], dtype=theanoFloat),
                          name='b')
        biases.append(b)

    # Set the parameters of the object
    # Do not set more than this, these will be used for differentiation in the
    # gradient
    params = weights + biases
    self.biases = biases

    # Initialize the super class
    super(MiniBatchTrainer, self).__init__(params, weights, training_options)

    # Create a theano random number generator required to sample units for dropout
    self.theanoRng = RandomStreams(seed=np.random.randint(1, 1000))
    self.output = self.forwardPass(self.input)

    if self.adversarial_training:
        # TODO(mihaela): move this to the BatchTrainer superclass?
        # This would require moving the forward functionality there
        error = T.sum(self.costFun(self.output, self.inputLabels))
        grad_error = T.grad(error, self.input)
        adversarial_input = self.input + self.adversarial_epsilon * T.sgn(grad_error)
        self.adversarial_output = self.forwardPass(adversarial_input)
def get_perturbation(dir, epsilon,norm_constraint):
    """Scale the direction ``dir`` according to ``norm_constraint``.

    * ``'max'``: ``epsilon * sign(dir)``.
    * ``'L2'``: ``epsilon`` times ``dir`` normalised with
      ``get_normalized_vector``.

    :raises NotImplementedError: for any other constraint
    """
    # print() with a single string behaves the same on Python 2 and 3.
    if norm_constraint == 'max':
        print('perturb:max')
        return epsilon * T.sgn(dir)
    elif norm_constraint == 'L2':
        print('perturb:L2')
        dir = get_normalized_vector(dir)
        dir = epsilon * dir
        return dir
    else:
        raise NotImplementedError()
def get_rbfnet_predict_function(metric_name):
    """Compile ``sign(exp(-b * metric(X, W)**2) . beta)`` for the named metric.

    :param metric_name: key into ``metric_theano``
    :return: a theano function taking ``(X, W, beta, b)``
    """
    X_matrix = T.dmatrix('X')
    W_matrix = T.dmatrix('W')
    beta = T.dvector('beta')
    b = T.scalar('b')
    H_matrix = metric_theano[metric_name](X_matrix, W_matrix)
    # Use the symbolic exponential: a NumPy ufunc is not a graph operation
    # and should not be applied to a symbolic tensor.
    H_rbf = T.exp(T.power(H_matrix, 2) * (-b))
    s = T.sgn(T.dot(H_rbf, beta))

    rbfnet_predict_function = theano.function([X_matrix, W_matrix, beta, b], s)
    return rbfnet_predict_function
def sample_proposal_s(self):
    """Return a proposal for ``s``: the prediction from ``self.s_now`` plus
    Laplace-style noise scaled by ``self.b``."""
    # s is npcl-by-ns
    centred_uniform = self.theano_rng.uniform(size=T.shape(self.s_now)) - 0.5
    predicted = self.get_prediction(self.s_now)
    # Inverse-CDF transform of the centred uniform noise.
    laplace_noise = T.sgn(centred_uniform) * T.log(1.0 - 2.0 * T.abs_(centred_uniform)) * self.b
    return predicted - laplace_noise
def NTanhPInp(x, p, use_noise=1, c=0.25, half_normal=False,alpha=1.1):
    """
    Noisy Tanh units where the noise is injected to the input: NANI with learning p.
    This function works well with discrete switching functions.
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        p: theano shared variable, a vector of parameters for p.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        c: float, standard deviation of the noise
        half_normal: bool, whether the noise should be sampled from half-normal or
        normal distribution.
        alpha: float; with half_normal, a value above 1 flips the sign of c.
    """
    logger.info("c: %f" % c)

    gap = HardTanh(x) - x
    noise = global_trng.normal(size=x.shape, avg=0, std=1.0, dtype=floatX)

    # Value that replaces the noise when noise is switched off.
    deterministic = 0.
    if half_normal:
        if alpha > 1.0:
            c *= -1
        deterministic = 0.797
        noise = abs(noise)
    elif not use_noise:
        noise = 0.

    blended = use_noise * noise + (1. - use_noise) * deterministic
    # The scale grows with the gap between HardTanh(x) and x, relative to the
    # noise magnitude.
    scale = c * T.nnet.softplus(p * abs(gap) / (abs(blended) + 1e-10))
    return HardTanh(x + scale * blended)