Exemple #1
0
def irprop_plus_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """Build the shared state and update rules of an IRPROP+-style batch trainer.

    For details on the algorithm see
    http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428

    :param x: forwarded to ``loss`` as first argument
    :param y: forwarded to ``loss`` as second argument
    :param w: forwarded to ``loss`` as third argument
    :param parameters: mapping whose values are the shared variables to train
    :param loss: callable ``loss(x, y, w)`` returning the symbolic loss
    :param random_stream: not used by this trainer
    :param positive_step: factor applied to the step while the derivative keeps its sign
    :param negative_step: factor applied to the step otherwise (this includes the
        case where the product of old and new derivative is exactly zero)
    :param min_step: lower clipping bound of the per-weight step
    :param max_step: upper clipping bound of the per-weight step
    :return: ``(shareds, updates)``; ``shareds`` lists the trainer's own shared
        variables, ``updates`` is a list of ``[variable, new_expression]`` pairs
    """
    current_loss = loss(x, y, w)
    previous_loss = theano.shared(1e10)
    state = [previous_loss]
    rules = []

    for param in parameters.values():
        zeros = param.get_value() * 0.
        last_grad = theano.shared(zeros)
        step = theano.shared(zeros + 1e-3)
        grad = T.grad(current_loss, param)
        sign_product = grad * last_grad

        # If the loss got worse, add back the previous step on the weights whose
        # derivative changed sign.  It has to be folded into the update
        # expression: an in-place ``param += ...`` is not possible here.
        undo_step = T.where(sign_product < 0, step * T.sgn(last_grad), 0)
        backtrack = ifelse(current_loss > previous_loss, undo_step, 0. * param)

        next_step = T.clip(
            T.where(sign_product > 0, step * positive_step, step * negative_step),
            min_step, max_step)

        rules.append([param, param + backtrack - next_step * T.sgn(grad)])
        rules.append([step, next_step])
        # After a sign change the stored derivative is reset to zero.
        rules.append([last_grad, T.where(sign_product < 0, 0, grad)])
        state.extend([last_grad, step])

    rules.append([previous_loss, current_loss])
    return state, rules
Exemple #2
0
def generateRpropUpdates(params, error, init_size=1, verbose=False):
    """Build Rprop state variables and update rules for ``params``.

    :param params: iterable of shared variables to optimise
    :param error: symbolic scalar that is differentiated w.r.t. each parameter
    :param init_size: initial per-weight step size
    :param verbose: if true, print progress while the gradients are built
    :return: ``(storage, updates)`` where ``storage`` is the list of
        previous-step variables followed by the step-size variables, and
        ``updates`` is a list of ``(variable, new_expression)`` tuples
    """
    prevw = []
    deltaw = []
    updates = []
    # Initialise one "previous step" and one "step size" variable per parameter.
    for p in params:
        shape = p.shape.eval()
        prevw.append(theano.shared(np.zeros(shape).astype(config.floatX)))
        deltaw.append(theano.shared(init_size * np.ones(shape).astype(config.floatX)))

    for index, (p, dw, pw) in enumerate(zip(params, deltaw, prevw)):
        if verbose:
            print("\rGradient {} out of {}".format(index + 1, len(params)), end="")
        try:
            grad = T.grad(error, p)
        except Exception:
            # Deliberate best effort: a parameter that cannot be differentiated
            # (e.g. one that does not take part in ``error``) is skipped.
            print('Unused input')
            continue

        both_nonzero = T.neq(pw, 0) * T.neq(grad, 0)
        # True where the previous step and the gradient have the same sign.
        # (The former expression was true only when exactly one of the two was
        # zero, so the step size never grew while the sign stayed stable.)
        simW = T.eq((pw > 0), (grad > 0)) * both_nonzero
        # True where the previous step and the gradient have opposite signs.
        diffW = ((pw > 0) ^ (grad > 0)) * both_nonzero

        # No move is made on weights whose sign just flipped.
        step = T.sgn(grad) * dw * T.eq(diffW, 0)
        updates.append((p, p - step))
        updates.append((dw, T.switch(diffW, dw * 0.5, T.switch(simW, dw * 1.2, dw))))
        updates.append((pw, step))

    storage = prevw + deltaw
    if verbose:
        print("\nDone with updates")

    return (storage, updates)
Exemple #3
0
def irprop_plus_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """Create the state variables and updates of an IRPROP+-style batch trainer.

    Reference: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428

    :param x: first argument handed to ``loss``
    :param y: second argument handed to ``loss``
    :param w: third argument handed to ``loss``
    :param parameters: mapping whose values are the shared variables being trained
    :param loss: callable ``loss(x, y, w)`` producing the symbolic loss
    :param random_stream: ignored by this trainer
    :param positive_step: multiplier of the step while the derivative keeps its sign
    :param negative_step: multiplier of the step in every other case (a zero
        product of old and new derivative also lands here)
    :param min_step: smallest step allowed after clipping
    :param max_step: largest step allowed after clipping
    :return: ``(shareds, updates)`` -- the shared variables created here and a
        list of ``[variable, new_expression]`` pairs
    """
    loss_now = loss(x, y, w)
    loss_before = theano.shared(1e10)
    shared_state = [loss_before]
    update_pairs = []

    for weights in parameters.values():
        blank = weights.get_value() * 0.
        grad_before = theano.shared(blank)
        step_size = theano.shared(blank + 1e-3)
        grad_now = T.grad(loss_now, weights)

        agreement = grad_now * grad_before
        sign_flipped = agreement < 0

        # Revert the last move on sign-flipped weights, but only when the loss
        # went up.  This is part of the update expression because the
        # parameter cannot be modified in place.
        revert = T.where(sign_flipped, step_size * T.sgn(grad_before), 0)
        correction = ifelse(loss_now > loss_before, revert, 0. * weights)

        rescaled = T.where(agreement > 0,
                           step_size * positive_step,
                           step_size * negative_step)
        step_next = T.clip(rescaled, min_step, max_step)

        update_pairs.append([weights, weights + correction - step_next * T.sgn(grad_now)])
        update_pairs.append([step_size, step_next])
        # The remembered derivative is zeroed wherever the sign flipped.
        update_pairs.append([grad_before, T.where(agreement < 0, 0, grad_now)])
        shared_state.extend([grad_before, step_size])

    update_pairs.append([loss_before, loss_now])
    return shared_state, update_pairs
Exemple #4
0
def irprop_plus_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP+ trainer, see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332

    :param x: forwarded to ``loss`` as first argument
    :param y: forwarded to ``loss`` as second argument
    :param w: forwarded to ``loss`` as third argument
    :param parameters: mapping whose values are the shared variables to train
    :param loss: callable ``loss(x, y, w)`` returning the symbolic loss
    :param random_stream: not used by this trainer
    :param positive_step: factor applied to the step while the derivative keeps its sign
    :param negative_step: factor applied to the step otherwise
    :param min_step: lower clipping bound of the step
    :param max_step: upper clipping bound of the step
    :return: ``(shareds, updates)``; ``shareds`` holds, for every parameter, the
        stored derivative, the step and the previous-loss variable (the latter
        is therefore repeated once per parameter); ``updates`` is a list of
        ``[variable, new_expression]`` pairs
    """
    loss_value = loss(x, y, w)
    prev_loss_value = theano.shared(1e10)
    shareds = []
    updates = []
    # ``items()`` instead of the Python-2-only ``iteritems()``: iteration is the
    # same on Python 2 and the function no longer fails on Python 3.
    for name, param in parameters.items():
        old_derivative = theano.shared(param.get_value() * 0.)
        delta = theano.shared(param.get_value() * 0. + 1e-3)
        new_derivative = T.grad(loss_value, param)

        # Undo the previous step on weights whose derivative changed sign ...
        shift_if_bad_step = T.where(new_derivative * old_derivative < 0, delta * T.sgn(old_derivative), 0)
        # ... but only if the loss increased.
        # NOTE(review): the original author marked this line "THIS doesn't work!"
        # without saying why -- confirm before relying on the backtracking.
        shift = ifelse(loss_value > prev_loss_value, shift_if_bad_step, 0. * param)
        # unfortunately we can't do it this way: param += shift

        new_delta = T.where(new_derivative * old_derivative > 0, delta * positive_step, delta * negative_step)
        new_delta = T.clip(new_delta, min_step, max_step)

        updates.append([param, param + shift - new_delta * T.sgn(new_derivative)])
        updates.append([delta, new_delta])

        # The stored derivative is reset to zero where the sign changed.
        new_old_derivative = T.where(new_derivative * old_derivative < 0, 0, new_derivative)
        updates.append([old_derivative, new_old_derivative])
        shareds.extend([old_derivative, delta, prev_loss_value])

    updates.append([prev_loss_value, loss_value])
    return shareds, updates
Exemple #5
0
def irprop_minus_updates(params, grads):
    """Build IRPROP- style update rules for ``params``.

    The step size and the previous gradient of every weight are kept in shared
    variables so that they persist between calls of the compiled function, and
    the sign test is expressed with ``T.switch`` so it is evaluated element-wise
    at run time.  (A Python ``if`` on a symbolic expression is resolved once,
    while the graph is built, so it cannot adapt the step.)

    :param params: shared variables to update
    :param grads: symbolic gradients, one per entry of ``params``
    :return: list of ``(variable, new_expression)`` tuples; besides one entry
        per parameter it contains the updates of the internal step-size and
        previous-gradient variables
    """
    import theano  # local import: this block did not reference ``theano`` before

    # IRPROP- parameters
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 1.
    # NOTE(review): math.exp(-6) is about 2.5e-3; 1e-6 may have been intended.
    minStep = math.exp(-6)

    updates = []
    for param, gparam in zip(params, grads):
        value = param.get_value()
        delta = theano.shared(numpy.full_like(value, 0.1))
        last_gparam = theano.shared(numpy.zeros_like(value))

        # > 0: same direction as last time, < 0: direction flipped, 0: no history
        change = T.sgn(gparam * last_gparam)
        grown = T.minimum(delta * positiveStep, maxStep)
        shrunk = T.maximum(delta * negativeStep, minStep)
        new_delta = T.switch(T.gt(change, 0), grown,
                             T.switch(T.lt(change, 0), shrunk, delta))
        # After a sign flip the stored gradient is zeroed, so the following
        # iteration neither grows nor shrinks the step.
        new_last_gparam = T.switch(T.lt(change, 0), T.zeros_like(gparam), gparam)

        # update the weights
        updates.append((param, T.cast(param - T.sgn(gparam) * new_delta, param.dtype)))
        # persist the adapted step and the gradient for the next call
        updates.append((delta, T.cast(new_delta, delta.dtype)))
        updates.append((last_gparam, T.cast(new_last_gparam, last_gparam.dtype)))

    return updates
Exemple #6
0
def irprop_minus_updates(params, grads):
    """Build IRPROP- style update rules for ``params``.

    Step sizes and previous gradients live in shared variables (dtype
    ``theano.config.floatX``) so they persist between calls of the compiled
    function, and the sign test uses ``T.switch`` so it is evaluated
    element-wise at run time.  (A Python ``if`` on a symbolic expression is
    resolved once, while the graph is built, so it cannot adapt the step.)

    :param params: shared variables to update
    :param grads: symbolic gradients, one per entry of ``params``
    :return: list of ``(variable, new_expression)`` tuples; besides one entry
        per parameter it contains the updates of the internal step-size and
        previous-gradient variables
    """
    # IRPROP- parameters
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50  #1.
    # NOTE(review): math.exp(-6) is about 2.5e-3; 1e-6 may have been intended.
    minStep = math.exp(-6)
    floatX = theano.config.floatX

    updates = []
    for param, gparam in zip(params, grads):
        shape = param.get_value(borrow=True).shape
        delta = theano.shared(numpy.full(shape, 0.1, dtype=floatX))
        last_gparam = theano.shared(numpy.zeros(shape, dtype=floatX))

        # > 0: same direction as last time, < 0: direction flipped, 0: no history
        change = T.sgn(gparam * last_gparam)
        grown = T.minimum(delta * positiveStep, maxStep)
        shrunk = T.maximum(delta * negativeStep, minStep)
        new_delta = T.switch(T.gt(change, 0), grown,
                             T.switch(T.lt(change, 0), shrunk, delta))
        # After a sign flip the stored gradient is zeroed, so the following
        # iteration neither grows nor shrinks the step.
        new_last_gparam = T.switch(T.lt(change, 0), T.zeros_like(gparam), gparam)

        # update the weights; the casts keep every update in the dtype of the
        # variable it is written to
        updates.append((param, T.cast(param - T.sgn(gparam) * new_delta, param.dtype)))
        updates.append((delta, T.cast(new_delta, floatX)))
        updates.append((last_gparam, T.cast(new_last_gparam, floatX)))

    return updates
Exemple #7
0
    def mean_field_fancy_step(self, V, P, Mu):
        """Compute one mean-field step together with a per-element damping factor.

        :param V: symbolic input combined with ``self.b``, ``self.beta`` and ``self.W``
        :param P: symbolic tensor multiplied element-wise with ``Mu``
        :param Mu: symbolic tensor, current value of the quantity being updated
        :return: ``(Q, Mu1, fancy_damp)`` where ``Q = self.Q_from_A(full_A)``,
            ``Mu1 = full_A / self.gamma`` and ``fancy_damp`` is the damping
            factor chosen below
        """
        # Contribution that depends on the current P and Mu.
        interaction = T.dot(T.dot(P*Mu, self.W.T*self.beta), self.W)
        iA = self.w * P*Mu - interaction

        # Contribution of the input V.
        drive = T.dot(self.beta * (V-self.b), self.W)

        full_A = iA + drive+self.a
        Q = self.Q_from_A(full_A)
        Mu1 = full_A / self.gamma
        iMu = iA / self.gamma

        gap = Mu-iMu
        direction = T.sgn(gap)
        magnitude = 1e-10+abs(gap)  # small constant guards against division by zero

        # If this is negative we are amplifying, so the default damping is used;
        # if it is positive we are flipping, so max(0, Lambda) is used.
        discriminant = direction * Mu/magnitude
        Lambda = self.tau * discriminant - direction * iMu/magnitude

        amplifying = discriminant <= 0
        fancy_damp = (amplifying*self.s_default_damping_factor
                      + (1.-amplifying)*T.maximum(0., Lambda))

        return Q, Mu1, fancy_damp
Exemple #8
0
def NTanhPInp(x, p, use_noise=1, c=0.25, half_normal=False, alpha=1.1):
    """
    Noisy Tanh units where the noise is injected to the input: NANI with learning p.
    This function works well with discrete switching functions.
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        p: theano shared variable, a vector of parameters for p.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        c: float, coefficient in front of the softplus term that scales the
        noise (described by the original author as the standard deviation of
        the noise).
        half_normal: bool, whether the noise should be sampled from half-normal or
        normal distribution.
        alpha: float, only consulted when half_normal is set: if it is larger
        than 1.0 the sign of c is flipped.
    Returns:
        HardTanh applied to the input plus the scaled noise.
    """

    logger.info("c: %f" % c)
    # Gap between the saturating non-linearity and the identity.
    delta = HardTanh(x) - x
    noise = global_trng.normal(size=x.shape, avg=0, std=1.0, dtype=floatX)
    # Deterministic value that replaces the sample when use_noise is 0.
    noise_det = 0.
    if half_normal:
        if alpha > 1.0:
            c *= -1
        noise_det = 0.797
        noise = abs(noise)
    elif not use_noise:
        noise = 0.

    noise = use_noise * noise + (1. - use_noise) * noise_det
    # The small constant keeps the division finite when the noise is zero.
    scale = c * T.nnet.softplus(p * abs(delta) / (abs(noise) + 1e-10))
    res = HardTanh(x + scale * noise)
    return res
Exemple #9
0
    def get_updates(self, v):
        """Assemble the updates for one contrastive-divergence training step.

        :param v: symbolic input used as the start of the sampling chain
        :return: ``OrderedDict`` with the updates returned by ``self.CD``, those
            returned by ``self.learning_rate`` and one gradient step per
            entry of ``self.parameters``
        """
        # Contrastive divergence
        chain_end, updates_CD = self.CD(self, chain_start=v, cdk=self.CDk)

        # [Expected] negative log-likelihood
        data_term = T.mean(self.free_energy(v), axis=0)
        model_term = T.mean(self.free_energy(chain_end), axis=0)
        cost = data_term - model_term

        # L2 regularization is added to the cost itself
        if isinstance(self.regularize, L2Regularization):
            cost += self.regularization

        # The gradient must not flow through the Gibbs sampling, hence the
        # chain end is treated as a constant.
        gparams = T.grad(cost, self.parameters, consider_constant=[chain_end])
        gradients = dict(zip(self.parameters, gparams))

        # Learning rates for all parameters given their gradient.
        lr, updates_lr = self.learning_rate(gradients)

        updates = OrderedDict()
        for extra in (updates_CD, updates_lr):
            updates.update(extra)

        # Plain gradient step on every parameter
        for param, gparam in gradients.items():
            updates[param] = param - lr[param] * gparam

        # L1 regularization is applied as soft-thresholding of the new values
        if isinstance(self.regularize, L1Regularization):
            for target in (self.b, self.W):
                proposed = updates[target]
                threshold = lr[target]*self.regularize.decay
                updates[target] = T.sgn(proposed) * T.maximum(abs(proposed) - threshold, 0)

        return updates
Exemple #10
0
def old_rprop(param,
              learning_rate,
              gparam,
              mask,
              updates,
              current_cost,
              previous_cost,
              eta_plus=1.5,
              eta_minus=0.5,
              max_delta=50,
              min_delta=10e-8):
    """Return the Rprop-updated expression for ``param`` and register state updates.

    :param param: shared variable being optimised
    :param learning_rate: initial value of the per-weight step
    :param gparam: symbolic gradient with respect to ``param``
    :param mask: multiplied element-wise with the increment before it is applied
    :param updates: list that receives the updates of the internal state
        (previous gradient, step, previous increment); modified in place
    :param current_cost: not used
    :param previous_cost: not used
    :param eta_plus: step multiplier when the gradient keeps its sign
    :param eta_minus: step multiplier when the gradient changes sign
    :param max_delta: upper clipping bound of the step
    :param min_delta: lower clipping bound of the step
    :return: symbolic expression ``param + inc * mask``
    """
    shape = param.shape.eval()
    previous_grad = sharedX(numpy.ones(shape), borrow=True)
    delta = sharedX(learning_rate * numpy.ones(shape), borrow=True)
    previous_inc = sharedX(numpy.zeros(shape), borrow=True)
    zero = T.zeros_like(param)

    change = previous_grad * gparam
    same_direction = T.gt(change, 0.)
    reversed_direction = T.lt(change, 0.)

    # Grow the step on agreement, shrink it on a reversal, keep it otherwise.
    candidate = T.switch(same_direction, delta * eta_plus,
                         T.switch(reversed_direction, delta * eta_minus, delta))
    new_delta = T.clip(candidate, min_delta, max_delta)

    # On a reversal the remembered gradient is zeroed and no move is made.
    new_previous_grad = T.switch(same_direction, gparam,
                                 T.switch(reversed_direction, zero, gparam))
    descent = -T.sgn(gparam) * new_delta
    inc = T.switch(same_direction, descent,
                   T.switch(reversed_direction, zero, descent))

    for variable, expression in ((previous_grad, new_previous_grad),
                                 (delta, new_delta),
                                 (previous_inc, inc)):
        updates.append((variable, expression))
    return param + inc * mask
Exemple #11
0
def discrete_grads(loss, network, LR):
    """Build Adam-based updates projected onto a discrete weight space.

    The globals ``H`` and ``N`` define the discrete states (spacing
    ``L = 2*H/2**N``), ``th`` controls the non-linearity of the state transfer
    probability, ``best_params`` holds one reference value per discrete
    parameter and ``update_type`` selects which of the two candidate updates
    is used.

    :param loss: symbolic loss
    :param network: lasagne network whose parameters tagged ``discrete`` are updated
    :param LR: learning rate handed to ``lasagne.updates.adam``
    :return: the update dictionary returned by ``lasagne.updates.adam`` with the
        entries of the discrete parameters replaced
    """
    global update_type, best_params, H, N, th

    W_params = lasagne.layers.get_all_params(network, discrete=True)  # all the weight parameters
    layers = lasagne.layers.get_all_layers(network)

    # One gradient per layer that owns discrete parameters.
    # Here layer.W = weight_tune(param)
    W_grads = [theano.grad(loss, wrt=layer.W)
               for layer in layers
               if layer.get_params(discrete=True)]
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W_params, learning_rate=LR)

    L = 2*H/pow(2, N)  # state step length in Z_N (does not change inside the loop)

    # Built-in zip instead of the Python-2-only itertools.izip.
    for param, parambest in zip(W_params, best_params):

        # NOTE: a and b are drawn with Python's random module while the graph is
        # built, so they are fixed once the updates are compiled.
        a = random.random()
        c = 1 if a < 0.85 else 0  # c is 1 with probability 0.85, else 0

        b = random.random()
        state_rand = T.round(b*pow(2, N))*L-H  # a random state in the discrete weight space Z_N

        # Candidate 1: move from parambest towards state_rand (only when c == 1).
        delta_W1 = c*(state_rand-parambest)
        delta_W1_direction = T.cast(T.sgn(delta_W1), theano.config.floatX)
        dis1 = T.abs_(delta_W1)  # the absolute distance
        k1 = delta_W1_direction*T.floor(dis1/L)  # the integer part
        v1 = delta_W1-k1*L  # the decimal part
        Prob1 = T.abs_(v1/L)  # the transfer probability
        Prob1 = T.tanh(th*Prob1)  # the nonlinear tanh() function accelerates the state transfer

        # Candidate 2: move from param towards the Adam proposal.
        delta_W2 = updates[param] - param
        delta_W2_direction = T.cast(T.sgn(delta_W2), theano.config.floatX)
        dis2 = T.abs_(delta_W2)  # the absolute distance
        k2 = delta_W2_direction*T.floor(dis2/L)  # the integer part
        v2 = delta_W2-k2*L  # the decimal part
        Prob2 = T.abs_(v2/L)  # the transfer probability
        Prob2 = T.tanh(th*Prob2)  # the nonlinear tanh() function accelerates the state transfer

        srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
        # Gate1 / Gate2 are binary variables that are 1 with probability Prob1 / Prob2
        Gate1 = T.cast(srng.binomial(n=1, p=Prob1, size=T.shape(Prob1)), theano.config.floatX)
        Gate2 = T.cast(srng.binomial(n=1, p=Prob2, size=T.shape(Prob2)), theano.config.floatX)

        delta_W1_new = (k1+delta_W1_direction*Gate1)*L  # delta_W1_new = k*L where k is an integer
        updates_param1 = T.clip(parambest + delta_W1_new, -H, H)
        # fine tuning for guaranteeing each element strictly constrained in the discrete space
        updates_param1 = weight_tune(updates_param1, -H, H)

        delta_W2_new = (k2+delta_W2_direction*Gate2)*L  # delta_W2_new = k*L where k is an integer
        updates_param2 = T.clip(param + delta_W2_new, -H, H)
        # fine tuning for guaranteeing each element strictly constrained in the discrete space
        updates_param2 = weight_tune(updates_param2, -H, H)

        # if update_type < 100, the weight probabilistically transfers from
        # parambest to state_rand, which helps to search the global minimum;
        # else it probabilistically transfers from param to a state nearest to
        # updates[param]
        updates[param] = T.switch(T.lt(update_type, 100), updates_param1, updates_param2)

    return updates
Exemple #12
0
    def __init__(self, model, e, a=0.5, verbose=2, iterator='linear'):
        """Compile the Theano functions used for adversarial (gradient-sign) training.

        :param model: sequence of layers; it is passed through ``init`` and its
            first element provides the symbolic input ``X``
        :param e: scale of the signed-gradient perturbation added to the input
        :param a: weight of the clean cost in the training objective; the
            adversarial cost is weighted with ``1 - a``
        :param verbose: stored on the instance
        :param iterator: iterator name, looked up in ``iterators`` first and in
            ``async_iterators`` if that fails
        """

        self.verbose = verbose
        self.model = init(model)
        try:
            self.iterator = instantiate(iterators, iterator)
        except Exception:
            # Fall back to the asynchronous iterators.  ``Exception`` rather than
            # a bare ``except`` so KeyboardInterrupt / SystemExit still propagate.
            self.iterator = instantiate(async_iterators, iterator)

        # Outputs of the last layer in training, test and inference mode.
        y_tr = self.model[-1].op({
            'dropout': True,
            'bn_active': True,
            'infer': False
        })
        y_te = self.model[-1].op({
            'dropout': False,
            'bn_active': False,
            'infer': False
        })
        y_inf = self.model[-1].op({
            'dropout': False,
            'bn_active': True,
            'infer': True
        })
        self.X = self.model[0].X
        # Target variable with as many (non-broadcastable) dimensions as the
        # output shape of the last layer.
        self.Y = T.TensorType(theano.config.floatX,
                              (False, ) * (len(model[-1].out_shape)))()

        cost = T.nnet.categorical_crossentropy(y_tr, self.Y).mean()

        # Adversarial input: step of size e along the sign of the cost gradient.
        X_adv = self.X + e * T.sgn(T.grad(cost, self.X))

        # The first layer now reads the adversarial input, so the training-mode
        # output built next is the response to X_adv.
        self.model[0].X = X_adv
        y_tr_adv = self.model[-1].op({
            'dropout': True,
            'bn_active': True,
            'infer': False
        })

        cost_adv = a * cost + (1. - a) * T.nnet.categorical_crossentropy(
            y_tr_adv, self.Y).mean()

        te_cost = T.nnet.categorical_crossentropy(y_te, self.Y).mean()

        # Adversarial example derived from the test-mode cost.
        X_te_adv = self.X + e * T.sgn(T.grad(te_cost, self.X))

        self.updates = collect_updates(self.model, cost_adv)
        self.infer_updates = collect_infer_updates(self.model)
        self.reset_updates = collect_reset_updates(self.model)
        self._train = theano.function([self.X, self.Y],
                                      cost_adv,
                                      updates=self.updates)
        self._predict = theano.function([self.X], y_te)
        self._fast_sign = theano.function([self.X, self.Y], X_te_adv)
        self._infer = theano.function([self.X],
                                      y_inf,
                                      updates=self.infer_updates)
        self._reset = theano.function([], updates=self.reset_updates)
Exemple #13
0
def rprop(param,learning_rate,gparam,mask,updates,current_cost,previous_cost,
          eta_plus=1.2,eta_minus=0.5,max_delta=50, min_delta=10e-6):
    """Return the Rprop-updated expression for ``param`` and register state updates.

    Entries where the gradient is zero keep their step, and entries where the
    masked gradient is zero keep their remembered gradient and are not moved.

    :param param: shared variable being optimised
    :param learning_rate: initial value of the per-weight step
    :param gparam: symbolic gradient with respect to ``param``
    :param mask: multiplied element-wise with the gradient (for the zero test)
        and with the increment before it is applied
    :param updates: list that receives the updates of the internal state
        (previous gradient, step, previous increment); modified in place
    :param current_cost: not used
    :param previous_cost: not used
    :param eta_plus: step multiplier when the gradient keeps its sign
    :param eta_minus: step multiplier when the gradient changes sign
    :param max_delta: upper clipping bound of the step
    :param min_delta: lower clipping bound of the step
    :return: symbolic expression ``param + inc * mask``
    """
    shape = param.shape.eval()
    previous_grad = sharedX(numpy.ones(shape), borrow=True)
    delta = sharedX(learning_rate * numpy.ones(shape), borrow=True)
    previous_inc = sharedX(numpy.zeros(shape), borrow=True)
    zero = T.zeros_like(param)

    change = previous_grad * gparam
    same_direction = T.gt(change, 0.)
    reversed_direction = T.lt(change, 0.)
    no_gradient = T.eq(gparam, 0.)
    no_masked_gradient = T.eq(mask * gparam, 0.)

    # Step size: unchanged without a gradient, otherwise grown on agreement and
    # shrunk on a reversal.
    adapted = T.switch(same_direction, delta * eta_plus,
                       T.switch(reversed_direction, delta * eta_minus, delta))
    new_delta = T.clip(T.switch(no_gradient, delta, adapted), min_delta, max_delta)

    # Remembered gradient: kept where the masked gradient vanishes, zeroed on
    # a reversal, replaced by the current gradient otherwise.
    refreshed = T.switch(same_direction, gparam,
                         T.switch(reversed_direction, zero, gparam))
    new_previous_grad = T.switch(no_masked_gradient, previous_grad, refreshed)

    # Increment: nothing where the masked gradient vanishes or on a reversal,
    # a signed step otherwise.
    descent = - T.sgn(gparam) * new_delta
    move = T.switch(same_direction, descent,
                    T.switch(reversed_direction, zero, descent))
    inc = T.switch(no_masked_gradient, zero, move)

    for variable, expression in ((previous_grad, new_previous_grad),
                                 (delta, new_delta),
                                 (previous_inc, inc)):
        updates.append((variable, expression))
    return param + inc * mask
def Update(params, gradients, velocities):
    """Build momentum updates from sign-and-clipped gradients.

    Side effects: every entry of ``velocities`` is replaced in place by its new
    symbolic expression, and the module-level ``LEARNING_RATE`` is multiplied by
    ``LEARNING_RATE_DECAY`` each time this function is called.

    :param params: variables to update
    :param gradients: symbolic gradients, aligned with ``velocities``
    :param velocities: list of velocity variables (mutated, see above)
    :return: list of ``(variable, new_expression)`` tuples: first the velocity
        updates, then the parameter updates
    """
    global MOMENTUM, LEARNING_RATE, LEARNING_RATE_DECAY

    def clipped_step(g, lower):
        # Gradient with its magnitude clipped to [lower, 9.8].
        return T.sgn(g) * T.clip(T.abs_(g), lower, 9.8)

    velocity_updates = [
        (v, v * MOMENTUM - LEARNING_RATE * clipped_step(g, 0.0001))
        for g, v in zip(gradients, velocities)
    ]

    # NOTE(review): the expressions used for the parameter step clip at 0.5,
    # the stored velocities above clip at 0.0001 -- confirm which is intended.
    for i, g in enumerate(gradients):
        velocities[i] = velocities[i] * MOMENTUM - LEARNING_RATE * clipped_step(g, 0.5)

    parameter_updates = [(p, p + v) for p, v in zip(params, velocities)]

    LEARNING_RATE *= LEARNING_RATE_DECAY
    return velocity_updates + parameter_updates
    def logp(self, value):
        """
        Compute logp.

        The result is the log of ``scale / (symmetry + 1/symmetry)`` minus
        ``value * scale * sgn(value) * symmetry**sgn(value)``.

        :param value: evaluation point
        :return: log probability at evaluation point
        """
        sign = tt.sgn(value)
        log_normaliser = tt.log(
            self.scale / (self.symmetry + (self.symmetry**-1)))
        decay = -value * self.scale * sign * (self.symmetry**sign)
        return log_normaliser + decay
Exemple #16
0
def earth_mover_distance_asym(y_pred, y_true, y_mask, axis_order='xy'):
    """Approximate earth mover distance between two masked, normalised maps.

    Probability mass is first moved along axis 3 and then along axis 2; the
    result is the total mass moved in both passes.

    :param y_pred: prediction, reshaped to the shape of ``y_true``
    :param y_true: target; indexed with four axes
    :param y_mask: multiplied element-wise with both inputs before normalising
    :param axis_order: ``'yx'`` swaps the last two axes before the computation
    :return: symbolic scalar with the approximate distance
    """
    y_pred = T.reshape(y_pred, y_true.shape)
    y_pred = y_pred * y_mask
    y_true = y_true * y_mask

    # Normalise both maps so that they sum to one.
    y_true = y_true / y_true.sum()
    y_pred = y_pred / y_pred.sum()

    if axis_order == 'yx':
        y_true = y_true.dimshuffle([0, 1, 3, 2])
        y_pred = y_pred.dimshuffle([0, 1, 3, 2])

    # calculate approximate earth mover distance to transform probability
    # distribution y_true into y_pred

    emd = 0.0

    # calculate how much probability mass has to be moved along rows, in x direction
    diff = y_pred - y_true
    move_x = diff.sum(axis=2, keepdims=True).cumsum(axis=3)[..., :, :-1]
    # calculate from which cells to take the probability mass
    move_x_weights = diff.cumsum(axis=3)[..., :, :-1]
    # use only positions where sign is right
    move_x_weights = T.set_subtensor(
        move_x_weights[T.neq(T.sgn(move_x_weights), T.sgn(move_x)).nonzero()],
        0)
    # normalize weightings to one
    # set weights to one where all of them are zero
    move_x_weights = T.set_subtensor(
        move_x_weights[T.eq(move_x_weights.sum(axis=2, keepdims=True),
                            0).nonzero()], 1)
    move_x_weights /= move_x_weights.sum(axis=2, keepdims=True)
    # apply weighting
    move_x = move_x * move_x_weights
    # built-in abs() dispatches to the symbolic absolute value
    emd += abs(move_x).sum()

    # Apply the x transport to y_true: every cell receives move_x from its
    # right neighbour.  inc_subtensor adds to the selected slice only; the
    # former "x += set_subtensor(x[...], m)" added a full copy of x, which
    # doubled the last column and then zeroed the first one.
    y_true_trans = T.inc_subtensor(y_true[..., :, :-1], move_x)
    y_true_trans = T.inc_subtensor(y_true_trans[..., :, 1:], -move_x)

    # move mass along columns, in y direction
    diff = y_pred - y_true_trans
    move_y = diff.cumsum(axis=2)[..., :-1, :]
    emd += abs(move_y).sum()

    return emd
Exemple #17
0
    def get_updates(self, params, cost):
        """Build RProp update rules for ``params``.

        For every parameter two shared variables are created: the gradient of
        the previous step (initialised to ones) and the per-weight rate
        (initialised to ``self.lr``).  The rate is multiplied by ``self.plus``
        where the sign of previous-gradient times gradient is 1 and by
        ``self.minus`` everywhere else.

        :param params: list of shared variables to optimise
        :param cost: symbolic cost that is differentiated
        :return: list of ``(variable, new_expression)`` tuples: parameter
            updates first, then the stored gradients, then the rates
        """
        grads = T.grad(cost, params)

        grads_history = []
        grads_rprop = []
        grads_rprop_new = []

        for param, grad in zip(params, grads):
            # Gradient history for RProp.
            grad_hist = theano.shared(param.get_value() * 0.0 + 1.0,
                                      name="rpop_hist_%s" % param)
            grads_history.append(grad_hist)

            # Variable where the rprop rates are stored.
            grad_rprop = theano.shared(param.get_value() * 0.0 + self.lr,
                                       name="rprop_%s" % param)
            grads_rprop.append(grad_rprop)

            # Compute the new RProp coefficients.
            rprop_sign = T.sgn(grad_hist * grad)
            grad_rprop_new = grad_rprop * (
                T.eq(rprop_sign, 1) * self.plus
                + T.neq(rprop_sign, 1) * self.minus
            )
            grads_rprop_new.append(grad_rprop_new)

        updates = [
            # Update parameters according to the RProp update rule.
            (p, p - rg * T.sgn(g))
            for p, g, rg in zip(params, grads, grads_rprop_new)
        ] + [
            # Save current gradient for the next step.
            (hg, g) for hg, g in zip(grads_history, grads)
        ] + [
            # Save the new rprop rates.
            (rg, rg_new) for rg, rg_new in zip(grads_rprop, grads_rprop_new)
        ]

        return updates
Exemple #18
0
def symGivens2(a, b):
    """
    Stable symmetric Givens rotation plus reflection.

    Parameters

        a: (theano scalar) first element of a two-vector  [a; b]
        b: (theano scalar) second element of a two-vector [a; b]
    Returns

        c  cosine(theta), theta being the implicit (counter-clockwise)
           angle of the plane rotation
        s  sine(theta)
        d  two-norm of [a; b]

    Description:
        c and s are chosen such that
            [ c  s ][a] = [d],
            [ s -c ][b]   [0]
        with d = two-norm of [a, b],
             c = a / sqrt(a^2 + b^2) = a / d,
             s = b / sqrt(a^2 + b^2) = b / d.
        To guard against overflow in sqrt(a^2 + b^2), the ratio of the
        smaller to the larger magnitude is squared instead of a and b.

    See also:
        (1) Algorithm 4.9, stable *unsymmetric* Givens rotations, in
            Golub and van Loan, Matrix Computations, 3rd edition.
        (2) MATLAB's function PLANEROT.

    Note:
        A single C op implementing this function might be considerably
        faster.
    """
    zero = constantX(0)
    one = constantX(1)

    a_is_zero = T.eq(a, zero)
    b_is_zero = T.eq(b, zero)
    b_dominates = T.gt(abs(b), abs(a))

    sign_a = T.sgn(a)
    sign_b = T.sgn(b)
    a_over_b = a / b
    b_over_a = b / a
    # sqrt(1 + ratio^2) for the two possible ratios
    norm_ab = T.sqrt(one + a_over_b ** 2)
    norm_ba = T.sqrt(one + b_over_a ** 2)

    # cosine
    c = T.switch(
        b_is_zero,
        T.switch(a_is_zero, one, sign_a),
        T.switch(a_is_zero, zero,
                 T.switch(b_dominates,
                          a_over_b * sign_b / norm_ab,
                          sign_a / norm_ba)))

    # sine
    s = T.switch(
        b_is_zero, zero,
        T.switch(a_is_zero, sign_b,
                 T.switch(b_dominates,
                          sign_b / norm_ab,
                          b_over_a * sign_a / norm_ba)))

    # norm
    d = T.switch(
        b_is_zero, abs(a),
        T.switch(a_is_zero, abs(b),
                 T.switch(b_dominates,
                          b / (sign_b / norm_ab),
                          a / (sign_a / norm_ba))))
    return c, s, d
Exemple #19
0
def model_predict(train_set_x, test_set_x, gallery_set_y, query_set_y):
    """Evaluate hashing-based retrieval with the saved network weights.

    Loads the weights named by the globals ``WEIGHTS_SAVE_PATH`` and
    ``WEIGHTS_FILE_NAME``, takes the output of layer ``'A6'`` for both input
    sets, binarises it with the sign function and prints precision/recall
    within Hamming radius 2 as well as MAP and top-``TOP_K`` precision of the
    Hamming ranking.

    Note: the ``shape`` of ``gallery_set_y`` and ``query_set_y`` is changed in
    place to a column vector.

    :param train_set_x: inputs of the gallery set
    :param test_set_x: inputs of the query set
    :param gallery_set_y: labels of the gallery set
    :param query_set_y: labels of the query set
    :return: None; results are printed
    """
    global WEIGHTS_SAVE_PATH, WEIGHTS_FILE_NAME
    # Every print below is a single-argument call, which produces the same
    # output under Python 2 and Python 3.
    if not WEIGHTS_FILE_NAME:
        print('no weights_file, please add weights file!')
        return
    print('predict start time: ' + time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time())))
    model = build_DDN_net(HASH_NUM, SPLIT_NUM, REGULARIZER_PARAMS)
    model.load_weights(WEIGHTS_SAVE_PATH + WEIGHTS_FILE_NAME)
    # Sub-model that stops at layer 'A6'.
    Deepid_output = Model(input=model.get_layer('main_input').input,
                          output=model.get_layer('A6').output)
    gallery_set_x = Deepid_output.predict(train_set_x)
    query_set_x = Deepid_output.predict(test_set_x)

    # Binarise the features with the sign function.
    gallery_binary_x = T.sgn(gallery_set_x).eval()
    query_binary_x = T.sgn(query_set_x).eval()

    train_binary_x, train_data_y = gallery_binary_x, gallery_set_y
    train_data_y.shape = (gallery_set_y.shape[0], 1)
    test_binary_x, test_data_y = query_binary_x, query_set_y
    test_data_y.shape = (query_set_y.shape[0], 1)

    # Boolean matrix (gallery x query): True where the labels agree.
    train_y_rep = repmat(train_data_y, 1, test_data_y.shape[0])
    test_y_rep = repmat(test_data_y.T, train_data_y.shape[0], 1)
    cateTrainTest = (train_y_rep == test_y_rep)
    train_data_y = train_data_y + 1
    test_data_y = test_data_y + 1

    train_data_y = np.asarray(train_data_y, dtype=int)
    test_data_y = np.asarray(test_data_y, dtype=int)

    B = compactbit(train_binary_x)
    tB = compactbit(test_binary_x)

    hammRadius = 2
    hammTrainTest = hammingDist(tB, B).T

    Ret = (hammTrainTest <= hammRadius + 0.000001)
    [Pre, Rec] = evaluate_macro(cateTrainTest, Ret)
    # The doubled space reproduces the separator of the former
    # ``print 'text = ', value`` statements.
    print('Precision with Hamming radius_2 =  %s' % (Pre,))
    print('Recall with Hamming radius_2 =  %s' % (Rec,))

    HammingRank = np.argsort(hammTrainTest, axis=0)
    [MAP, p_topN] = cat_apcal(train_data_y, test_data_y, HammingRank, TOP_K)
    print('MAP with Hamming Ranking =  %s' % (MAP,))
    print('Precision of top %d returned = %f ' % (TOP_K, p_topN))
    print('predict finish time: ' + time.strftime('%Y-%m-%d %H:%M:%S',
                                                  time.localtime(time.time())))
Exemple #20
0
def symGivens2(a, b):
    """
    Stable symmetric Givens rotation plus reflection.

    Parameters

        a: (theano scalar) first element of a two-vector  [a; b]
        b: (theano scalar) second element of a two-vector [a; b]
    Returns

        c  cosine(theta), theta being the implicit (counter-clockwise)
           angle of the plane rotation
        s  sine(theta)
        d  two-norm of [a; b]

    Description:
        The returned c and s satisfy
            [ c  s ][a] = [d],
            [ s -c ][b]   [0]
        where d = two-norm of [a, b],
              c = a / sqrt(a^2 + b^2) = a / d,
              s = b / sqrt(a^2 + b^2) = b / d.
        Overflow in sqrt(a^2 + b^2) is avoided by squaring the ratio of
        the smaller to the larger magnitude rather than a and b.

    See also:
        (1) Algorithm 4.9, stable *unsymmetric* Givens rotations, in
            Golub and van Loan, Matrix Computations, 3rd edition.
        (2) MATLAB's function PLANEROT.

    Note:
        Writing this function as a single op in C might improve speed
        considerably.
    """
    zero = constantX(0)
    one = constantX(1)

    b_vanishes = T.eq(b, zero)
    a_vanishes = T.eq(a, zero)
    b_larger = T.gt(abs(b), abs(a))

    def select(if_b_zero, if_a_zero, if_b_larger, otherwise):
        # Case analysis shared by s and d: b == 0, then a == 0, then |b| > |a|.
        return T.switch(
            b_vanishes,
            if_b_zero,
            T.switch(a_vanishes, if_a_zero, T.switch(b_larger, if_b_larger, otherwise)),
        )

    sgn_a = T.sgn(a)
    sgn_b = T.sgn(b)
    # sqrt(1 + ratio^2) with the ratio a/b or b/a respectively
    root_ab = T.sqrt(one + (a / b) ** 2)
    root_ba = T.sqrt(one + (b / a) ** 2)

    # cosine: with b == 0 it is 1 for a == 0 and sign(a) otherwise
    c = T.switch(
        b_vanishes,
        T.switch(a_vanishes, one, sgn_a),
        T.switch(
            a_vanishes,
            zero,
            T.switch(b_larger, (a / b) * sgn_b / root_ab, sgn_a / root_ba),
        ),
    )

    # sine
    s = select(zero, sgn_b, sgn_b / root_ab, (b / a) * sgn_a / root_ba)

    # two-norm
    d = select(abs(a), abs(b), b / (sgn_b / root_ab), a / (sgn_a / root_ba))
    return c, s, d
Exemple #21
0
    def __init__(self, x, N, D):
        """
        Initialize the model parameters and the symbolic predictions for
        logistic regression

        :type x: theano.tensor.vector
        :param x: symbolic variables that describes the input

        :type N: int
        :param N: total number of train instances (not used here)

        :type D: int
        :param D: dimensionality of the feature space
        """
        # Create a one dimensional tensor (i.e. a vector) for the weight vector.
        # borrow=True does not perform a deep copy of the variable and is faster.
        self.w = theano.shared(value=numpy.zeros(D,
                                                 dtype=theano.config.floatX),
                               name='w',
                               borrow=True)

        # Initialise the bias.  ``numpy.float`` was only an alias of the
        # built-in float and has been removed from NumPy, so a plain float
        # literal is used.
        self.b = theano.shared(value=0.0, name='b')

        # Symbolic definition of the logistic sigmoid function
        self.p_y_given_x = T.nnet.sigmoid(T.dot(x, self.w) + self.b)

        # Symbolic definition of how to predict the class: 1 above
        # probability 0.5, 0 below it, 0.5 exactly at 0.5
        self.y_pred = (T.sgn(self.p_y_given_x - 0.5) + 1) / 2

        # Parameters of the model
        self.params = [self.w, self.b]
Exemple #22
0
    def get_state(self):
        """Return the parent state extended with normalised stimulus filters.

        Adds ``'stim_response_x'`` and ``'stim_response_t'`` to the dictionary
        returned by the parent class.

        :return: the updated state dictionary
        """
        st = super(LatentTypeWithTuningCurve, self).get_state()
        temporal = self.stim_resp_t

        # The filters are non-identifiable: negating both the temporal and the
        # spatial filter gives the same net effect.  By convention the sign
        # that makes the temporal filter most positive is chosen.
        sign = T.sgn(T.sum(temporal, axis=0))
        # NOTE(review): the result of addbroadcast is discarded here and
        # below, so these two calls do not change `sign` or `Z`.
        T.addbroadcast(sign, 0)

        # Likewise a constant can be traded between the spatial and temporal
        # pieces.  By convention the temporal filter is scaled to norm 1.
        Z = T.sqrt(T.sum(temporal**2, axis=0))
        T.addbroadcast(Z, 0)

        # Normalised temporal response
        stim_resp_t = sign*(1.0/Z)*temporal

        # The spatial response absorbs the sign and the norm; with a
        # two-dimensional spatial layout it is reshaped as well.
        stim_resp_x = sign*Z*self.stim_resp_x
        if self.spatial_ndim == 2:
            stim_resp_x = T.reshape(stim_resp_x,
                                    self.spatial_shape + (self.R,))

        st.update({'stim_response_x' : stim_resp_x,
                   'stim_response_t' : stim_resp_t})

        return st
Exemple #23
0
def relevance_conv_z(out_relevances, inputs, weights, bias=None):
    """Redistribute the output relevances of a convolution onto its inputs.

    The forward pre-activations (convolution of ``inputs`` with ``weights``,
    plus ``bias`` if given) act as normalizers: ``out_relevances`` is divided
    by them, convolved back with the axis-swapped, flipped kernel in 'full'
    mode and multiplied by ``inputs``. When a bias is given, its share of the
    relevance is spread back through an all-ones kernel and added.

    :param out_relevances: relevances at the output of the convolution
    :param inputs: inputs that were fed to the convolution
    :param weights: convolution kernel
    :param bias: optional bias, broadcast along axis 1 of the output
    :return: relevances attributed to ``inputs``
    """
    def backward_kernel(kernel):
        # Swap the two leading axes and flip the two trailing ones.
        return kernel.dimshuffle(1,0,2,3)[:,:,::-1,::-1]

    stabilizer = 1e-3
    use_bias = bias is not None

    denominators = conv2d(inputs, weights)
    if use_bias:
        denominators += bias.dimshuffle('x',0,'x','x')
    # Push the denominators away from zero: first by their own sign, then
    # bump whatever is still exactly zero.
    denominators += (T.sgn(denominators) * stabilizer)
    denominators += (T.eq(denominators, 0) * stabilizer)

    normed_relevances = out_relevances / denominators

    # Up-convolution back to the input resolution.
    in_relevances_proper = conv2d(normed_relevances,
                                  backward_kernel(weights),
                                  border_mode='full') * inputs

    if use_bias:
        bias_relevance = bias.dimshuffle('x',0,'x','x') * normed_relevances
        # Divide the bias relevance by the number of weights per output unit
        # before convolving it back.
        weights_per_unit = T.prod(weights.shape[1:]).astype(
            theano.config.floatX)
        bias_rel_in = conv2d(bias_relevance / weights_per_unit,
                             backward_kernel(T.ones_like(weights)),
                             border_mode='full')
        in_relevances_proper += bias_rel_in

    return in_relevances_proper
Exemple #24
0
 def evaluate_net(*states):
     """Recompute every layer's activation from the current layer states.

     For each entry of ``weights`` the states of its source layers are
     multiplied by the transposed weight matrices and summed; the layer's
     activator is then applied. Layers without incoming weights get an
     all-zero activation shaped like their own state.

     :param states: current state of every layer, in the order of ``weights``
     :return: ``(activations, {}, until)`` where ``until`` is a scan stop
         condition that holds once every activation equals its state.
     :raises Exception: if a layer uses an activator that is not handled here
     """
     # Placeholder list; every slot is overwritten in the loop below.
     activations = T.fvectors(len(weights))
     for idx, (neurons, activator, isInput, isOutput, weightFrame) in enumerate(weights):
         # Contribution of every source layer feeding this layer.
         sumParts = [T.dot(states[srcIdx], w.transpose())
                     for srcIdx, w in weightFrame]

         if sumParts:
             activity = T.sum(T.stack(*sumParts), axis=0)
             if activator == TIDENTITY:
                 activation = activity
             elif activator == TLOGISTIC:
                 activation = 1. / (1. + T.exp(-activity))
             elif activator == THYPERBOLIC:
                 activation = T.tanh(activity)
             elif activator == TTHRESHOLD:
                 activation = T.sgn(activity)
             elif activator == TBIAS:
                 activation = T.ones_like(activity, dtype='float32')
             elif activator == TRADIAL:
                 activation = T.exp(-activity*activity/2.0)
             else:
                 # The previous message concatenated an unformatted '{0}'
                 # with a name that is not defined in this function; report
                 # the index of the offending layer instead.
                 raise Exception(
                     "Unknown activation function for layer {0}".format(idx))
         else:
             activation = T.zeros_like(states[idx])

         activations[idx] = activation

     # Stop iterating once no layer changes any more.
     checklist = [T.all(T.eq(a,s)) for a,s in zip(activations, states)]
     condition = T.all(T.as_tensor_variable(checklist))
     return activations, {}, theano.scan_module.until(condition )
Exemple #25
0
def irprop_star_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP* trainer (own experimental modification, not recommended for usage).

    For every parameter the flattened gradient is expanded into two n x n
    matrices holding all pairwise sums and all pairwise differences of its
    components. Each matrix gets its own step sizes and remembered
    derivative, updated with the same rule as IRPROP-; the signed steps are
    summed along axis 1 and subtracted from the parameter.

    :param positive_step: factor applied to a step whose derivative kept its sign
    :param negative_step: factor applied to a step otherwise
    :param min_step: lower bound for a step
    :param max_step: upper bound for a step
    :return: ``(shareds, updates)`` -- the created shared variables and the
        list of ``[shared, new_value]`` updates.
    """
    loss_value = loss(x, y, w)
    shareds, updates = [], []

    for name, param in parameters.items():
        n = numpy.prod(param.get_value().shape).astype(int)
        flat_grad = T.grad(loss_value, param).flatten()
        column, row = flat_grad.reshape([n, 1]), flat_grad.reshape([1, n])

        updated_param = param
        for pairwise_grad in (column + row, column - row):
            delta = theano.shared(numpy.zeros([n, n], dtype=floatX) + 1e-3)
            old_derivative = theano.shared(numpy.zeros([n, n], dtype=floatX))

            agreement = pairwise_grad * old_derivative
            new_delta = T.clip(
                T.where(agreement > 0, delta * positive_step, delta * negative_step),
                min_step, max_step)
            updates.append([delta, new_delta])

            # Forget the derivative where its sign flipped.
            updates.append([old_derivative, T.where(agreement < 0, 0, pairwise_grad)])

            signed_steps = new_delta * T.sgn(pairwise_grad)
            updated_param = updated_param - signed_steps.sum(axis=1).reshape(param.shape)
            shareds.extend([old_derivative, delta])

        updates.append([param, updated_param])

    return shareds, updates
Exemple #26
0
def Update(params, gradients, velocities):
    """Build momentum updates with sign-preserving gradient-magnitude clipping.

    Reads the module-level ``MOMENTUM``, ``LEARNING_RATE`` and
    ``LEARNING_RATE_DECAY`` and, as a side effect, multiplies
    ``LEARNING_RATE`` by ``LEARNING_RATE_DECAY`` on every call.

    The caller's ``velocities`` list is no longer modified: previously its
    entries were overwritten with symbolic expressions, so the list no
    longer held the velocity variables after the first call.

    :param params: parameters to update
    :param gradients: gradients aligned with ``velocities``
    :param velocities: velocity variables, one per gradient
    :return: list of ``(variable, new_value)`` pairs: first the velocity
        updates, then the parameter updates.
    """
    global LEARNING_RATE

    velocity_updates = [
        (v, v * MOMENTUM -
         LEARNING_RATE * T.sgn(g) * T.clip(T.abs_(g), 0.0001, 9.8))
        for g, v in zip(gradients, velocities)
    ]

    # NOTE(review): the step applied to the parameters clips the gradient
    # magnitude from below at 0.5, while the stored velocity uses 0.0001.
    # Both bounds are kept as they were -- confirm the difference is wanted.
    stepped = list(velocities)
    for i, g in enumerate(gradients):
        stepped[i] = stepped[i] * MOMENTUM - LEARNING_RATE * T.sgn(
            g) * T.clip(T.abs_(g), 0.5, 9.8)

    param_updates = velocity_updates + [(p, p + s) for p, s in zip(params, stepped)]
    LEARNING_RATE *= LEARNING_RATE_DECAY
    return param_updates
Exemple #27
0
def fd3(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    """Assemble the T1 and T2 parameter updates from finite-difference terms.

    T1 parameters are moved by ``updateC1T1 - updateC2T1``. For T2
    parameters, the difference between the gradient of
    ``classError1 + penalty`` and ``fdm.gradC1T2`` is divided by the
    module-level ``step``, multiplied by ``globalLR1`` (and reduced to its
    sign when ``params.T2onlySGN`` is set) and handed to ``update_fun``.

    :return: ``(updates, debugs)`` -- the concatenated T1 and T2 updates and
        the second element of every pair returned by ``update_fun``.
    """
    regularized_cost = mlp.classError1 + mlp.penalty
    cost_grads_T2 = T.grad(regularized_cost, mlp.paramsT2)

    # Only 'adam' selects a dedicated optimizer object for T2.
    opt2 = adam() if params.opt2 in ['adam'] else None

    # update W - (1) + (3)
    updateT1 = [(param, param + uC1 - uC2)
                for param, uC1, uC2 in zip(mlp.paramsT1, fdm.updateC1T1,
                                           fdm.updateC2T1)]

    # compute grad T2 of C1,  update T2 - [(4) - (2) ] / lr1
    updateT2 = []
    onlyT2param = []
    for param, grad, gT2 in zip(mlp.paramsT2, cost_grads_T2, fdm.gradC1T2):
        scaled_diff = (grad - gT2) / step * globalLR1
        grad_proxi = T.sgn(scaled_diff) if params.T2onlySGN else scaled_diff

        tempUp, tempPair, _ = update_fun(
            param, T.reshape(grad_proxi, param.shape), None, 'T2', {}, opt2,
            params, globalLR1, globalLR2, momentParam1, momentParam2)
        updateT2 += tempUp
        onlyT2param += tempPair

    debugs = [check for (_, check) in onlyT2param]
    return updateT1 + updateT2, debugs
Exemple #28
0
def irprop_minus_trainer(x, y, w, parameters, loss, random_stream,
                         positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP- batch trainer, see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428 .
    This is default trainer, very stable for classification.

    Every weight has its own step size. The step is multiplied by
    ``positive_step`` where the new and the remembered derivative have the
    same sign and by ``negative_step`` everywhere else, then clipped.

    :param positive_step: factor applied to the step when the derivative keeps its sign
    :param negative_step: factor applied to the step otherwise
    :param min_step: minimal change of weight during iteration
    :param max_step: maximal change of weight during iteration
    :return: ``(shareds, updates)`` -- the created shared variables and the
        list of ``[shared, new_value]`` updates.
    """
    loss_value = loss(x, y, w)
    shareds, updates = [], []
    for name, param in parameters.items():
        zeros = param.get_value() * 0.
        old_derivative = theano.shared(zeros)
        delta = theano.shared(zeros + 1e-3)
        shareds.extend([old_derivative, delta])

        new_derivative = T.grad(loss_value, param)
        agreement = new_derivative * old_derivative

        new_delta = T.clip(
            T.where(agreement > 0, delta * positive_step, delta * negative_step),
            min_step, max_step)

        updates.append([param, param - new_delta * T.sgn(new_derivative)])
        updates.append([delta, new_delta])
        # A derivative whose sign flipped is remembered as zero.
        updates.append([old_derivative, T.where(agreement < 0, 0, new_derivative)])
    return shareds, updates
def model_mod_act(nx=4, nh=100, ny=1, p=None, tau = 10., seed = 0):
    """Build the symbolic graph of a recurrent network with a multiplicative modulator.

    :param nx: number of input units
    :param nh: number of hidden units
    :param ny: number of output units
    :param p: optional list ``[Wx, Wh, Wy, bh, by]`` of existing parameters;
        fresh ones are created when it is None
    :param tau: divisor applied to the hidden-state increment
    :param seed: seed passed to ``np.random.seed`` (global side effect)
    :return: ``p, [x, rho_h, mod, t], y_T, [loss, acc], h, ha, y``
    """
    np.random.seed(seed)
    # Identity test: `p == None` would invoke element comparison on the list.
    if p is None:
        Wx = theano.shared(np.random.normal(0., 0.5, (nx, nh)).astype(theano.config.floatX))
        Wh = theano.shared(np.random.normal(0., 1./nh, (nh, nh)).astype(theano.config.floatX))
        Wy = theano.shared(np.zeros((nh,ny), dtype=theano.config.floatX))
        bh = theano.shared(np.zeros(nh, dtype=theano.config.floatX))
        by = theano.shared(np.zeros(ny, dtype=theano.config.floatX))
        p = [Wx, Wh, Wy, bh, by]
    else:
        Wx, Wh, Wy, bh, by = p[0], p[1], p[2], p[3], p[4]

    h0 = theano.shared(np.zeros(nh, dtype=theano.config.floatX))
    x = T.matrix('input_x')
    rho_h = T.matrix('rho_h')
    t = T.scalar('teachSig')
    mod = T.matrix('modulator')

    def recurrence(x_t, rho_h_t, mod_t, h_tm1):
        # Leak towards bh plus the modulated input, recurrent and rho_h drive.
        dh = (-h_tm1 + bh  + mod_t * (T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + rho_h_t)) / tau
        ha_t = h_tm1 + dh
        h_t = T.tanh(ha_t)
        s_t = T.dot(h_t, Wy) + by
        return [ha_t, h_t, s_t]

    # Only the second output (h) is fed back into the recurrence.
    ([ha, h, y], _) = theano.scan(fn=recurrence, sequences=[x, rho_h, mod], outputs_info=[dict(), h0, dict()])

    # The returned h is recomputed from ha, as in the original code.
    h = T.tanh(ha)
    y_0 = y[0, 0]
    y_T = y[-1, 0]
    loss = (((0.-y_0) ** 2.) + ((t-y_T) ** 2.)) / 2.
    # Non-zero when the sign of the final output differs from the teaching signal.
    acc = T.neq(T.sgn(y_T), t)
    return p, [x, rho_h, mod, t], y_T, [loss, acc], h, ha, y
Exemple #30
0
 def __abs__(self, other=None):
     """Return a copy of this layer whose output is the absolute value of ``out``.

     ``other`` is unused; it now defaults to None so that the built-in
     ``abs(layer)``, which passes no extra argument, works. Existing calls
     that pass it keep working.

     If the copy carries ``grads``, each one is multiplied by the sign of the
     output taken *before* the absolute value. The sign of the already
     rectified output, used previously, is never negative.

     :return: the modified copy
     """
     assert hasattr(self, 'out'), 'all layers need a default output'
     new_obj = utils.copy(self)
     signed_out = new_obj.out
     new_obj.out = abs(signed_out)
     if hasattr(new_obj, 'grads'):
         out_sign = TT.sgn(signed_out)
         new_obj.grads = [out_sign * x for x in new_obj.grads]
     return new_obj
    def adExample(self, X, y, weights, network, input_var):
        """Build adversarial examples with one signed-gradient step on the input.

        :param X: input images, copied into a (num_images, 3, 224, 224) buffer
        :param y: target labels
        :param weights: parameter values loaded into ``network`` before evaluation
        :param network: lasagne output layer
        :param input_var: symbolic input variable of ``network``
        :return: ``X + eps * sign(d loss / d input)`` evaluated on the inputs
        :raises ValueError: if ``self.loss`` is neither 'softmax' nor 'svm'
        """
        target_var = T.ivector('targets')

        prediction = lasagne.layers.get_output(network)

        # Compare strings by value; `is` only worked by accident of interning.
        if self.loss == 'softmax':
            loss = lasagne.objectives.categorical_crossentropy(
                prediction, target_var)
        elif self.loss == 'svm':
            loss = lasagne.objectives.multiclass_hinge_loss(
                prediction, target_var)
        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError("Unsupported loss: %r" % (self.loss,))

        loss = loss.mean()

        lasagne.layers.set_all_param_values(network, weights)
        Xnew = np.zeros((self.num_images, 3, 224, 224))
        Xnew[:, :, :, :] = X

        grad = T.grad(loss, input_var)
        final_examples = X + self.eps * T.sgn(grad)
        func1 = theano.function([input_var, target_var],
                                final_examples,
                                allow_input_downcast=True)
        return func1(Xnew, y)
Exemple #32
0
	def get_function(self, func_name):
		"""Map an activation name to a callable acting on a theano expression.

		Returns None (after logging an error) for 'fastsigmoid' and for any
		name that is not recognized.
		"""
		if func_name == 'tanh':
			return T.tanh
		if func_name == 'hardtanh':
			L.warning('Current hardTanh implementation is slow!')
			# Identity inside [-1, 1], sign of the input outside of it.
			return lambda z: ((abs(z) <= 1) * z) + ((1 < abs(z)) * T.sgn(z))
		if func_name == 'xtanh':
			return lambda z: T.tanh(z) + 0.1 * z
		if func_name == 'sigmoid':
			return T.nnet.sigmoid
		if func_name == 'fastsigmoid':
			L.error('T.nnet.ultra_fast_sigmoid function has some problems')
			return None
		if func_name == 'hardsigmoid':
			return T.nnet.hard_sigmoid
		if func_name == 'xsigmoid':
			return lambda z: T.nnet.sigmoid(z) + 0.1 * z
		if func_name == 'softplus':
			return T.nnet.softplus
		if func_name == 'relu':
			# Written as a mask product; T.nnet.relu needs a newer theano.
			return lambda z: z * (z > 0)
		if func_name == 'leakyrelu':
			return lambda z: T.maximum(z, 0.01 * z)
		if func_name == 'cappedrelu':
			return lambda z: T.minimum(z * (z > 0), 6)
		if func_name == 'softmax':
			return T.nnet.softmax
		if func_name == 'norm1':
			return lambda z: z / T.nlinalg.norm(z, 1)
		if func_name == 'norm2':
			# Norm computed as sqrt(z . z) rather than T.nlinalg.norm(z, 2).
			return lambda z: z / T.dot(z, z)**0.5

		L.error('Invalid function name given: ' + func_name)
		return None
def rprop_core(params,
               gradients,
               rprop_increase=1.01,
               rprop_decrease=0.99,
               rprop_min_step=0,
               rprop_max_step=100,
               learning_rate=0.01):
    """
    Rprop optimizer, written as a generator of ``(variable, new_value)`` pairs.
    See http://sci2s.ugr.es/keel/pdf/algorithm/articulo/2003-Neuro-Igel-IRprop+.pdf.

    For each parameter three pairs are yielded, in this order: the parameter,
    the remembered gradient and the step size. Steps start at
    ``learning_rate``, are scaled by ``rprop_increase`` where the gradient
    kept its sign and by ``rprop_decrease`` where it flipped, and are kept
    within ``[rprop_min_step, rprop_max_step]``.
    """
    for param, grad in zip(params, gradients):
        zeros = np.zeros_like(param.get_value())
        grad_tm1 = theano.shared(zeros, name=param.name + '_grad')
        step_tm1 = theano.shared(zeros + learning_rate,
                                 name=param.name + '_step')

        agreement = grad * grad_tm1
        kept = T.gt(agreement, 0)
        flipped = T.lt(agreement, 0)
        neutral = T.eq(agreement, 0)

        # Exactly one of the three masks is set per element.
        factor = neutral + kept * rprop_increase + flipped * rprop_decrease
        step = T.minimum(rprop_max_step,
                         T.maximum(rprop_min_step, step_tm1 * factor))

        # Zero the gradient where its sign flipped: no move there this time.
        effective_grad = grad - flipped * grad
        yield param, param - T.sgn(effective_grad) * step
        yield grad_tm1, effective_grad
        yield step_tm1, step
Exemple #34
0
def fd3(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    """Collect the updates for the T1 and the T2 parameter groups.

    T1: each parameter becomes ``param + updateC1T1 - updateC2T1``.
    T2: ``(grad - gradC1T2) / step * globalLR1`` (``step`` is a module-level
    name), optionally replaced by its sign when ``params.T2onlySGN`` is set,
    is reshaped like the parameter and passed to ``update_fun``.

    :return: ``(updateT1 + updateT2, debugs)`` where ``debugs`` holds the
        second element of every pair returned by ``update_fun``.
    """
    total_cost = mlp.classError1 + mlp.penalty
    t2_grads = T.grad(total_cost, mlp.paramsT2)

    # take opt from Adam?
    opt2 = None
    if params.opt2 in ['adam']:
        opt2 = adam()

    # update W - (1) + (3)
    updateT1 = []
    for weight, plus_term, minus_term in zip(mlp.paramsT1, fdm.updateC1T1, fdm.updateC2T1):
        updateT1 += [(weight, weight + plus_term - minus_term)]

    # compute grad T2 of C1,  update T2 - [(4) - (2) ] / lr1
    updateT2, onlyT2param = [], []
    for hyper, grad, gT2 in zip(mlp.paramsT2, t2_grads, fdm.gradC1T2):
        grad_proxi = (grad - gT2)/step*globalLR1
        if params.T2onlySGN:
            grad_proxi = T.sgn(grad_proxi)

        tempUp, tempPair, _ = update_fun(hyper, T.reshape(grad_proxi, hyper.shape), None,
                                         'T2', {}, opt2, params,
                                         globalLR1, globalLR2, momentParam1, momentParam2)
        updateT2 += tempUp
        onlyT2param += tempPair

    debugs = [pair[1] for pair in onlyT2param]
    return updateT1 + updateT2, debugs
    

    
Exemple #35
0
def relevance_conv_z(out_relevances, inputs, weights, bias=None):
    """Propagate relevances backwards through a convolution.

    ``out_relevances`` is divided by the stabilized forward pre-activations,
    convolved back with the axis-swapped, flipped kernel in "full" mode and
    multiplied by ``inputs``. With a bias, the bias share of the relevance is
    divided by the number of weights per output unit, convolved back through
    an all-ones kernel and added.

    :param out_relevances: relevances at the convolution output
    :param inputs: inputs that were fed to the convolution
    :param weights: convolution kernel
    :param bias: optional bias, broadcast along axis 1 of the output
    :return: relevances attributed to ``inputs``
    """
    eps = 1e-4

    pre_activations = conv2d(inputs, weights)
    if bias is not None:
        pre_activations += bias.dimshuffle("x", 0, "x", "x")
    # Stabilize: avoid division by zero and by very small numbers.
    pre_activations += T.sgn(pre_activations) * eps
    pre_activations += T.eq(pre_activations, 0) * eps

    normed_relevances = out_relevances / pre_activations

    # Up-convolution with the kernel's leading axes swapped and its spatial
    # axes reversed.
    flipped_weights = weights.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    in_relevances_proper = conv2d(normed_relevances, flipped_weights, border_mode="full") * inputs

    if bias is None:
        return in_relevances_proper

    bias_relevance = bias.dimshuffle("x", 0, "x", "x") * normed_relevances
    n_weights = T.prod(weights.shape[1:]).astype(theano.config.floatX)
    flipped_ones = T.ones_like(weights).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    bias_rel_in = conv2d(bias_relevance / n_weights, flipped_ones, border_mode="full")
    in_relevances_proper += bias_rel_in

    return in_relevances_proper
Exemple #36
0
 def learning_updates(self):
     """Yield Rprop ``(variable, new_value)`` pairs for every parameter.

     On first iteration the remembered gradients (``self.grads``, zeros) and
     the step sizes (``self.steps``, filled with ``self.learning_rate``) are
     created as shared variables. For each parameter three pairs are then
     yielded: the parameter, its remembered gradient and its step size.
     """
     initial_step = self.learning_rate
     self.grads = []
     self.steps = []
     for param in self.params:
         zeros = np.zeros_like(param.get_value())
         label = param.name
         self.grads.append(theano.shared(zeros, name=label + '_grad'))
         self.steps.append(
             theano.shared(zeros + initial_step, name=label + '_step'))

     for param, step_tm1, grad_tm1 in zip(self.params, self.steps,
                                          self.grads):
         grad = TT.grad(self.J, param)
         agreement = grad * grad_tm1
         kept = TT.gt(agreement, 0)
         flipped = TT.lt(agreement, 0)
         neutral = TT.eq(agreement, 0)

         # Grow the step where the sign held, shrink it where it flipped.
         factor = (neutral + kept * self.step_increase +
                   flipped * self.step_decrease)
         step = TT.minimum(self.max_step,
                           TT.maximum(self.min_step, step_tm1 * factor))

         # A flipped gradient is zeroed, so that element does not move.
         effective_grad = grad - flipped * grad
         yield param, param - TT.sgn(effective_grad) * step
         yield grad_tm1, effective_grad
         yield step_tm1, step
Exemple #37
0
 def __init__(self,input,response):
     """Set up a hinge-style loss between ``input.output`` and ``response.resp``.

     Targets greater than zero are used as they are, all others are shifted
     by +1. The difference to ``input.output`` is multiplied by the sign of
     the target, clipped to [0, 1e10] and summed into ``self.hengeloss``.
     The response and its shape are exposed as this object's output.
     """
     resp = response.resp
     shifted = T.switch(T.gt(resp,0),resp,1+resp)
     margin = T.sgn(resp)*(shifted-input.output)
     self.hengeloss = T.sum(margin.clip(0,1e10))
     self.output = response.resp
     self.output_shape = response.resp_shape
Exemple #38
0
def irprop_minus_trainer(x, y, w, parameters, loss, random_stream,
                         positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP- is batch trainer, for details see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428 .
    This is default trainer, very stable for classification.

    :param positive_step: factor by which a step is multiplied while the
        derivative keeps its sign
    :param negative_step: factor by which a step is multiplied in every
        other case
    :param min_step: minimal change of weight during iteration
    :param max_step: maximal change of weight during iteration
    :return: tuple ``(shareds, updates)`` with the created shared variables
        and the ``[shared, new_value]`` update list
    """
    shareds = []
    updates = []
    loss_value = loss(x, y, w)
    for name, param in parameters.items():
        # Per-weight state: last derivative (zeros) and step size (1e-3).
        old_derivative = theano.shared(param.get_value() * 0.)
        delta = theano.shared(param.get_value() * 0. + 1e-3)
        shareds += [old_derivative, delta]

        new_derivative = T.grad(loss_value, param)
        sign_product = new_derivative * old_derivative

        grown = delta * positive_step
        shrunk = delta * negative_step
        new_delta = T.clip(T.where(sign_product > 0, grown, shrunk), min_step, max_step)

        # After a sign flip the stored derivative is reset to zero.
        remembered = T.where(sign_product < 0, 0, new_derivative)

        updates.extend([
            [param, param - new_delta * T.sgn(new_derivative)],
            [delta, new_delta],
            [old_derivative, remembered],
        ])
    return shareds, updates
Exemple #39
0
 def __abs__(self, other=None):
     """Return a copy of the layer with ``out`` replaced by its absolute value.

     ``other`` is ignored and optional, so the built-in ``abs(layer)`` (which
     supplies no second argument) can be used; callers that pass it are
     unaffected.

     When the copy has ``grads``, they are scaled by the sign of the output
     as it was before taking the absolute value. Taking the sign afterwards,
     as the code did before, can never yield -1.

     :return: the modified copy
     """
     assert hasattr(self, 'out'), 'all layers need a default output'
     new_obj = utils.copy(self)
     raw_out = new_obj.out
     new_obj.out = abs(raw_out)
     if hasattr(new_obj, 'grads'):
         sign = TT.sgn(raw_out)
         new_obj.grads = [sign * x for x in new_obj.grads]
     return new_obj
Exemple #40
0
def rprop_updates(cost, params,
                  initial_rprop_rate=0.005,
                  minimum_rprop_rate=1e-6,
                  maximum_rprop_rate=50,
                  rprop_eta_n=0.5,
                  rprop_eta_p=1.2):
    """Build sign-based (Rprop-style) updates for ``params``.

    Every parameter gets a per-element step size and a remembered gradient
    sign (initially zero). Where the sign of the gradient differs from the
    remembered one, the parameter is left untouched and the step is
    multiplied by ``rprop_eta_n``; elsewhere the parameter moves against the
    gradient sign by the current step and the step is multiplied by
    ``rprop_eta_p``. Steps are clipped to the given range.

    The hyper-parameters were previously hard-coded; the defaults reproduce
    the old behaviour.

    :param cost: symbolic cost to differentiate
    :param params: shared variables to optimize
    :param initial_rprop_rate: initial step size
    :param minimum_rprop_rate: lower bound for the step size
    :param maximum_rprop_rate: upper bound for the step size
    :param rprop_eta_n: step factor applied where the sign changed
    :param rprop_eta_p: step factor applied where the sign was kept
    :return: list of ``(shared_variable, new_value)`` pairs
    """
    rprop_values = [
        shared(initial_rprop_rate * numpy.ones(p.get_value(borrow=True).shape,
                                               dtype=theano.config.floatX))
        for p in params
    ]
    rprop_signs = [
        shared(
            numpy.zeros(p.get_value(borrow=True).shape,
                        dtype=theano.config.floatX)) for p in params
    ]
    updates = []
    for param, value, sign in zip(params, rprop_values, rprop_signs):
        grad = T.grad(cost, param)
        sign_new = T.sgn(grad)
        sign_changed = T.neq(sign, sign_new)
        updates.append(
            (param, T.switch(sign_changed, param, param - value * sign_new)))
        updates.append((value,
                        T.clip(
                            T.switch(sign_changed, rprop_eta_n * value,
                                     rprop_eta_p * value), minimum_rprop_rate,
                            maximum_rprop_rate)))
        updates.append((sign, sign_new))
    return updates
Exemple #41
0
 def __init__(self,
              input,
              n_in,
              unit_input_step,
              threshold,
              initVmem=0.5,
              batchsize=1000):
     """Set up a thresholded, signed spike-generating layer.

     The magnitude of ``input`` scaled by ``unit_input_step`` is added to a
     stored potential ``Vmem``. Where the result reaches ``threshold`` the
     output carries the sign of the input, and ``threshold`` is subtracted
     from the potential there.

     :param input: real-valued matrix, shape = (batchsize, n_in)
     :param n_in: number of units
     :param unit_input_step: scale applied to the absolute input
     :param threshold: firing threshold
     :param initVmem: initial potential as a fraction of ``threshold``
     :param batchsize: number of rows of the stored potential
     """
     # Plain constructor arguments.
     self.input = input
     self.n_in = n_in
     self.unit_input_step = unit_input_step
     self.threshold = threshold
     self.initVmem = initVmem
     self.batchsize = batchsize

     # Split the input into sign and magnitude.
     self.signs = T.sgn(input)
     self.input_abs = abs(input)
     self.incremental_step = self.input_abs * unit_input_step

     # Stored potential, initialised to initVmem * threshold everywhere.
     start_level = np.float32(initVmem * threshold)
     self.Vmem_init = np.zeros(
         (batchsize, n_in), dtype=theano.config.floatX) + start_level
     self.Vmem = theano.shared(self.Vmem_init, name='Vmem', borrow=True)

     # Integrate, fire where the threshold is reached, then subtract it.
     self.Vmem_update_prespike = self.Vmem + self.incremental_step
     fired = T.ge(self.Vmem_update_prespike, threshold)
     self.output = fired * self.signs
     self.Vmem_update_postspike = T.cast(
         self.Vmem_update_prespike - abs(self.output) * threshold,
         theano.config.floatX)
def discrete_grads(loss,network,LR):
    """Compute Adam updates for the discrete weights and a random state-transfer probability.

    The body previously mixed tabs and spaces, which is an indentation error;
    it is now indented with spaces only. The statements are unchanged.

    NOTE(review): the code visible here stops after computing ``Prob1``. It
    neither uses ``updates`` nor returns anything -- presumably the remainder
    of the function is missing; confirm against the original source.

    :param loss: symbolic loss
    :param network: lasagne output layer
    :param LR: learning rate passed to Adam
    """
    global update_type,best_params,H,N,th # th is a parameter that controls the nonlinearity of state transfer probability

    W_params = lasagne.layers.get_all_params(network, discrete=True) #Get all the weight parameters
    layers = lasagne.layers.get_all_layers(network)

    W_grads = []
    for layer in layers:
        params = layer.get_params(discrete=True)
        if params:
            W_grads.append(theano.grad(loss, wrt=layer.W)) #Here layer.W = weight_tune(param)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,params=W_params,learning_rate=LR)

    for param, parambest in izip(W_params, best_params) :

        L = 2*H/pow(2,N) #state step length in Z_N

        a=random.random()
        # c is a binary gate: 1 when a < 0.85, otherwise 0
        if a<0.85:
           c = 1
        else:
           c = 0

        b=random.random()
        state_rand = T.round(b*pow(2,N))*L-H #state_rand is a random state in the discrete weight space Z_N

        delta_W1 =c*(state_rand-parambest) #offset from parambest towards state_rand; zero when the gate c is 0
        delta_W1_direction = T.cast(T.sgn(delta_W1),theano.config.floatX)
        dis1=T.abs_(delta_W1) #the absolute distance
        k1=delta_W1_direction*T.floor(dis1/L) #the integer part
        v1=delta_W1-k1*L #the decimal part
        Prob1= T.abs_(v1/L) #the transfer probability
        Prob1 = T.tanh(th*Prob1) #the nonlinear tanh() function accelerates the state transfer
Exemple #43
0
def discretized_laplace(mean, logscale, binsize, sample=None):
    """Build a Laplace random variable whose samples are binned to ``binsize``.

    :param mean: location of the distribution
    :param logscale: log-scale; the scale used is ``0.5 * exp(logscale)``
    :param binsize: width of a bin
    :param sample: optional value to score; when None a sample is drawn by
        inverse-CDF sampling and floored to a multiple of ``binsize``
    :return: ``RandomVariable(sample, logp, None, mean=mean, scale=scale)``
        where ``logp`` is the log-mass of the bin around ``sample``,
        flattened to 2 dimensions and summed over axis 1
    """
    scale = .5 * T.exp(logscale)
    half_bin = .5 * binsize

    if sample is None:
        u = G.rng_curand.uniform(size=mean.shape) - .5
        continuous = mean - scale * T.sgn(u) * T.log(1 - 2 * abs(u))
        sample = T.floor(continuous / binsize) * binsize  # discretize the sample

    def laplace_cdf(x):
        z = x - mean
        return .5 + .5 * T.sgn(z) * (1. - T.exp(-abs(z) / scale))

    def log_mass_direct(x):
        # General CDF-difference form; numerically unstable for large
        # |x-mean|/scale.
        return T.log(laplace_cdf(x + half_bin) - laplace_cdf(x - half_bin) + 1e-7)

    def log_mass_tail(x):
        # Closed form, only valid for |x-mean| >= half_bin.
        return -abs(x - mean) / scale + T.log(
            T.exp(half_bin / scale) - T.exp(-half_bin / scale)) - np.log(2.).astype(G.floatX)

    # Blend the two forms: the direct one inside the central bin, the
    # closed form everywhere else.
    near_mean = (abs(sample - mean) < half_bin)
    log_mass = near_mean * log_mass_direct(sample) + (1 - near_mean) * log_mass_tail(sample)

    logp = log_mass.flatten(2).sum(axis=1)
    # No entropy is provided for the discretized variable.
    return RandomVariable(sample, logp, None, mean=mean, scale=scale)
def iRpropPlus(
    cost,
    params,
    c,
    positiveStep=np.float32(1.2),
    negativeStep=np.float32(0.5),
    maxStep=np.float32(50),
    minStep=np.float32(1e-6),
):
    """Build iRprop+ updates followed by max-norm regularization.

    A single shared variable now remembers the previous cost for all
    parameters. Before, one tracker was created per parameter but only the
    last one was ever updated, so the "cost increased" test stayed false for
    every other parameter; an empty ``params`` raised a NameError.

    :param cost: symbolic cost
    :param params: iterable of per-layer iterables of shared variables
    :param c: passed to ``maxNormReg`` together with the module-level ``epsilon``
    :param positiveStep: step factor where the gradient kept its sign
    :param negativeStep: step factor where the gradient sign flipped
    :param maxStep: upper bound for a step
    :param minStep: lower bound for a step
    :return: list of ``(shared_variable, new_value)`` pairs
    """
    updates = []
    # Cost of the previous iteration, shared by all parameters.
    lastCost = theano.shared(np.float32(np.inf))
    costInc = T.gt(cost, lastCost).astype(theano.config.floatX)

    for layerParams in params:
        for param in layerParams:
            lastParamGradSign = theano.shared(param.get_value(borrow=True) * 0.0)
            lastParamDelta = theano.shared(param.get_value(borrow=True) * 0.1)

            gradient = T.grad(cost=cost, wrt=param, disconnected_inputs="raise")
            change = T.sgn(lastParamGradSign * gradient)
            changePos = T.gt(change, 0.0).astype(theano.config.floatX)
            changeNeg = T.lt(change, 0.0).astype(theano.config.floatX)
            changeZero = T.eq(change, 0.0).astype(theano.config.floatX)

            # Same sign: take the enlarged step. Sign flip: undo the previous
            # step, but only if the cost went up. No information: plain step.
            newParam = (
                param
                - changePos * T.sgn(gradient) * T.minimum(lastParamDelta * positiveStep, maxStep)
                + changeNeg * costInc * lastParamGradSign * lastParamDelta
                - changeZero * T.sgn(gradient) * lastParamDelta
            )

            # max-norm regularization
            newParam = maxNormReg(newParam, c, epsilon)

            newLastParamDelta = (
                changePos * T.minimum(lastParamDelta * positiveStep, maxStep).astype(theano.config.floatX)
                + changeNeg * T.maximum(lastParamDelta * negativeStep, minStep).astype(theano.config.floatX)
                + changeZero * lastParamDelta.astype(theano.config.floatX)
            )
            # After a sign flip the remembered sign is reset to zero.
            newLastParamGradSign = (
                changePos * T.sgn(gradient).astype(theano.config.floatX)
                + changeNeg * 0
                + changeZero * T.sgn(gradient).astype(theano.config.floatX)
            )
            updates.append((param, newParam))
            updates.append((lastParamDelta, newLastParamDelta))
            updates.append((lastParamGradSign, newLastParamGradSign))

    updates.append((lastCost, cost))
    return updates
 def forward_theano(self, x):
     """Apply the piecewise activation to the theano expression ``x``.

     Where ``|x| < self.c`` the value is ``erf(x / sqrt(2))``. Elsewhere it
     is ``(sqrt(beta**2 - 4*alpha*(gamma - |x|)) - beta) / (2*alpha)``
     carrying the sign of ``x``.
     """
     magnitude = tt.abs_(x)

     inner = tt.erf(x / 2.**0.5)

     discriminant = self.beta**2 - 4 * self.alpha * (self.gamma - magnitude)
     outer = ((discriminant**0.5 - self.beta) / (2 * self.alpha)) * tt.sgn(x)

     return tt.switch(magnitude < self.c, inner, outer)
Exemple #46
0
def laplace_diag(mean, logscale, sample=None):
    """Build a Laplace random variable with independent components.

    :param mean: location of the distribution
    :param logscale: log-scale; the scale used is ``0.5 * exp(logscale)``
    :param sample: optional value to score; drawn by inverse-CDF sampling
        from a uniform variable when None
    :return: ``RandomVariable(sample, logp, entr, mean=mean, scale=scale)``,
        with ``logp`` and ``entr`` flattened to 2 dimensions and summed over
        axis 1
    """
    scale = .5*T.exp(logscale)
    if sample is None:
        centered = G.rng_curand.uniform(size=mean.shape) - .5
        sample = mean - scale * T.sgn(centered) * T.log(1-2*abs(centered))

    per_element_logp = - logscale - abs(sample-mean) / scale
    per_element_entr = 1 + logscale

    logp = per_element_logp.flatten(2).sum(axis=1)
    entr = per_element_entr.flatten(2).sum(axis=1)
    return RandomVariable(sample, logp, entr, mean=mean, scale=scale)
def _laplace(trng, p, size=None):
    """Draw a Laplace sample parameterized by the two halves of ``p``.

    The last axis of ``p`` is split in two with ``_slice``: the first half is
    the location, the second half the log of the scale.

    :param trng: random stream providing ``uniform``
    :param p: packed parameters
    :param size: shape of the sample; defaults to the shape of the location
    """
    half = p.shape[p.ndim-1] // 2
    location = _slice(p, 0, half)
    log_scale = _slice(p, 1, half)
    if size is None:
        size = location.shape

    # Uniform on [-0.5, 0.5), then pushed through the inverse CDF.
    centered = trng.uniform(size=size, dtype=floatX) - 0.5
    return location + T.exp(log_scale) * T.sgn(centered) * T.log(1.0 - 2 * abs(centered))
Exemple #48
0
def NSigmoidP(x,
              p,
              use_noise=1,
              alpha=1.1,
              c=0.15,
              noise=None,
              half_normal=True):
    """
    Noisy Sigmoid Tanh Units: NAN with learning p
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        p: theano shared variable, a vector of parameters for p.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        c: float, standard deviation of the noise
        alpha: float, the leakage rate from the linearized function to the nonlinear one.
        noise: optional pre-drawn noise; standard-normal noise shaped like x is
        sampled from the global random stream when it is None.
        half_normal: bool, whether the noise should be sampled from half-normal or
        normal distribution.
    """
    lin_sigm = 0.25 * x + 0.5
    logger.info("c: %f" % c)
    signs = T.sgn(x)
    delta = HardSigmoid(x) - lin_sigm
    scale = c * (T.nnet.sigmoid(p * delta) - 0.5)**2
    # Test for None explicitly: the truth value of a tensor or array says
    # nothing about whether noise was supplied.
    if noise is None:
        noise = global_trng.normal(size=x.shape,
                                   avg=0,
                                   std=1.0,
                                   dtype=floatX)
    # Deterministic stand-in for the noise when it is switched off.
    noise_det = 0.
    if half_normal:
        if alpha > 1.0:
            scale *= -1.
        if not use_noise:
            noise_det = numpy.float32(0.797)
        else:
            noise = abs(noise)

    noise = use_noise * noise + (1. - use_noise) * noise_det
    res = (alpha * HardSigmoid(x) + (1. - alpha) * lin_sigm - signs * scale * noise)
    return res
Exemple #49
0
 def get_perturbation(self, dir, epsilon):
     """Scale the direction ``dir`` into a perturbation of size ``epsilon``.

     With ``self.norm_constraint == 'max'`` the result is ``epsilon`` times
     the sign of ``dir``. With ``'L2'`` the direction is normalized by
     ``self.get_normalized_vector`` and scaled by
     ``epsilon * sqrt(self.layer_sizes[0])``. Any other setting returns None.

     The mode is printed as before; the calls are written so that they work
     on both Python 2 and Python 3, and the removed ``numpy.float`` alias is
     replaced by the built-in ``float`` it stood for.
     """
     if (self.norm_constraint == 'max'):
         print('perturb:max')
         return epsilon * T.sgn(dir)
     if (self.norm_constraint == 'L2'):
         print('perturb:L2')
         dir = self.get_normalized_vector(dir)
         dir = epsilon * float(numpy.sqrt(self.layer_sizes[0])) * dir
         return dir
Exemple #50
0
def rprop_plus_updates(params, grads):
    """Build RPROP+ updates (per-element adaptive steps with backtracking).

    The previous version branched with Python ``if`` statements on symbolic
    expressions; such a test is decided once while the graph is built, not
    per element at run time. It also kept its state in throw-away numpy
    arrays and compared the gradient with the parameters instead of the
    previous gradient. The state now lives in shared variables and the
    branches are expressed with ``T.switch``:

    * gradient kept its sign: the step grows by 1.2 (at most 50) and the
      weight moves against the gradient sign;
    * gradient sign flipped: the step shrinks by 0.5 (at least ``exp(-6)``),
      the previous weight change is reverted and the remembered gradient is
      reset to zero;
    * otherwise: the weight moves by the current step.

    :param params: list of theano shared variables to optimize
    :param grads: list of symbolic gradients aligned with ``params``
    :return: list of ``(shared_variable, new_value)`` pairs covering the
        parameters and the optimizer state
    """
    # RPROP+ parameters
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50.
    # NOTE(review): exp(-6) is about 2.5e-3; 1e-6 may have been intended.
    minStep = math.exp(-6)
    initialDelta = 0.1

    updates = []
    for param, gparam in zip(params, grads):
        zeros = numpy.zeros_like(param.get_value(borrow=True))
        last_gparam = theano.shared(zeros)
        delta = theano.shared(zeros + initialDelta)
        last_weight_change = theano.shared(zeros)

        change = T.sgn(gparam * last_gparam)
        same_sign = T.gt(change, 0)
        sign_flip = T.lt(change, 0)

        new_delta = T.switch(
            same_sign,
            T.minimum(delta * positiveStep, maxStep),
            T.switch(sign_flip,
                     T.maximum(delta * negativeStep, minStep),
                     delta))
        # On a sign flip the previous change is undone instead of stepping.
        weight_change = T.switch(sign_flip,
                                 -last_weight_change,
                                 T.sgn(gparam) * new_delta)
        new_last_gparam = T.switch(sign_flip, T.zeros_like(gparam), gparam)

        # update the weights
        updates.append((param, param - weight_change))
        # store the optimizer state for the next iteration
        updates.append((delta, new_delta))
        updates.append((last_gparam, new_last_gparam))
        updates.append((last_weight_change, weight_change))

    return updates
Exemple #51
0
def NSigmoid(x,
              use_noise=1,
              alpha=1.15,
              c=0.25,
              threshold=2.0,half_normal=False):
    """
    Noisy Hard Sigmoid Units: NAN without learning p
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        c: float, standard deviation of the noise
        alpha: the leaking rate from the linearized function to the nonlinear one.
        threshold: float, the noisy branch is used where abs(x) >= threshold,
        HardSigmoid(x) elsewhere.
        half_normal: bool, if set the absolute value of the noise is used (or the
        constant 0.797 when use_noise is off) and the scale is negated for alpha > 1.
    """
    logger.info("c: %f" % c)

    overshoot = abs(x) - threshold
    scale = c * (T.nnet.sigmoid(overshoot**2)  - 0.5)**2

    # Always drawn, even if it ends up replaced by a constant below.
    noise = global_trng.normal(size=x.shape,
                               avg=0,
                               std=1.0,
                               dtype=floatX)

    if half_normal:
        if alpha > 1.0:
            scale *= -1
        noise = abs(noise) if use_noise else 0.797
    elif not use_noise:
        noise = 0.

    eps = scale * noise + alpha * overshoot
    z = x - T.sgn(x) * eps

    # 1.0 where the input is beyond the threshold, 0.0 elsewhere.
    beyond = T.cast(T.ge(abs(x), threshold), floatX)
    return beyond * z + (1. - beyond) * HardSigmoid(x)
Exemple #52
0
def get_xelm_predict_function(f_name):
    """Compile a Theano function that predicts signs from X, W and beta.

    The metric registered under ``f_name`` in ``metric_theano`` is applied to
    the symbolic ``X`` and ``W`` matrices; the result is multiplied by
    ``beta`` and reduced to its element-wise sign.

    :param f_name: key into ``metric_theano`` selecting the metric to use
    :return: compiled function taking ``(X, W, beta)`` as double matrices
    """
    symbolic_inputs = [T.dmatrix('X'), T.dmatrix('W'), T.dmatrix('beta')]
    samples, centers, out_weights = symbolic_inputs

    hidden = metric_theano[f_name](samples, centers)
    prediction = T.sgn(T.dot(hidden, out_weights))

    return theano.function(symbolic_inputs, prediction)
Exemple #53
0
  def __init__(self, input, inputLabels, nrLayers, initialWeights, initialBiases,
               activationFunction, classificationActivationFunction,
               visibleDropout, hiddenDropout,
               adversarial_training, adversarial_epsilon, adversarial_coefficient,
               training_options):
    """Store the configuration, create the shared parameters and build outputs.

    One weight and one bias shared variable is created per layer transition
    (``nrLayers - 1`` in total) from ``initialWeights`` / ``initialBiases``.
    The superclass is then initialised with all parameters, after which the
    symbolic output of the forward pass is built. When
    ``adversarial_training`` is truthy, a second forward pass is built on the
    input shifted by ``adversarial_epsilon`` times the sign of the gradient
    of the summed cost with respect to the input.
    """
    self.input = input
    self.inputLabels = inputLabels

    # Adversarial training settings.
    self.adversarial_training = adversarial_training
    self.adversarial_coefficient = adversarial_coefficient
    self.adversarial_epsilon = adversarial_epsilon

    # Dropout rates, activations and remaining options.
    self.visibleDropout = visibleDropout
    self.hiddenDropout = hiddenDropout
    self.activationFunction = activationFunction
    self.classificationActivationFunction = classificationActivationFunction
    self.training_options = training_options

    # There is one weight matrix / bias vector between consecutive layers.
    nrWeights = nrLayers - 1
    self.nrWeights = nrWeights

    # Wrap the initial values in shared variables so they can be updated.
    biases = []
    weights = []
    for layer in xrange(nrWeights):
      weight_values = np.asarray(initialWeights[layer], dtype=theanoFloat)
      weights.append(theano.shared(value=weight_values, name='W'))

      bias_values = np.asarray(initialBiases[layer], dtype=theanoFloat)
      biases.append(theano.shared(value=bias_values, name='b'))

    # Only these variables are handed over as parameters; they are the ones
    # the gradient is taken with respect to.
    params = weights + biases
    self.biases = biases

    super(MiniBatchTrainer, self).__init__(params, weights, training_options)

    # Random stream used to sample units for dropout.
    self.theanoRng = RandomStreams(seed=np.random.randint(1, 1000))
    self.output = self.forwardPass(self.input)

    if self.adversarial_training:
      # TODO(mihaela): move this to the BatchTrainer superclass?
      # This would require moving the forward functionality there
      error = T.sum(self.costFun(self.output, self.inputLabels))
      input_gradient = T.grad(error, self.input)
      perturbed_input = (self.input
                         + self.adversarial_epsilon * T.sgn(input_gradient))
      self.adversarial_output = self.forwardPass(perturbed_input)
Exemple #54
0
def get_perturbation(dir, epsilon,norm_constraint):
    """Scale a direction into a perturbation of size ``epsilon``.

    :param dir: symbolic direction to perturb along (the name shadows the
        builtin but is kept so keyword callers continue to work)
    :param epsilon: magnitude of the perturbation
    :param norm_constraint: ``'max'`` returns ``epsilon * sign(dir)``;
        ``'L2'`` returns ``epsilon`` times ``get_normalized_vector(dir)``
    :return: the perturbation expression
    :raises NotImplementedError: for any other ``norm_constraint``
    """
    # Single-argument print calls behave the same on Python 2 and Python 3,
    # unlike the Python 2-only print statement.
    if norm_constraint == 'max':
        print('perturb:max')
        return epsilon * T.sgn(dir)
    elif norm_constraint == 'L2':
        print('perturb:L2')
        dir = get_normalized_vector(dir)
        dir = epsilon * dir
        return dir
    else:
        raise NotImplementedError(
            "unsupported norm_constraint: %r" % (norm_constraint,))
Exemple #55
0
def get_rbfnet_predict_function(metric_name):
    """Compile a Theano function that predicts signs with an RBF layer.

    The metric registered under ``metric_name`` in ``metric_theano`` is
    applied to the symbolic ``X`` and ``W`` matrices, passed through
    ``exp(-b * H**2)``, multiplied by ``beta`` and reduced to its
    element-wise sign.

    :param metric_name: key into ``metric_theano`` selecting the metric
    :return: compiled function taking ``(X, W, beta, b)``
    """
    X_matrix = T.dmatrix('X')
    W_matrix = T.dmatrix('W')
    beta = T.dvector('beta')
    b = T.scalar('b')

    H_matrix = metric_theano[metric_name](X_matrix, W_matrix)
    # Use the symbolic exponential: the argument is a Theano expression, so
    # the NumPy ufunc would only work through NumPy's object fallback.
    H_rbf = T.exp(T.power(H_matrix, 2) * (-b))
    s = T.sgn(T.dot(H_rbf, beta))

    rbfnet_predict_function = theano.function([X_matrix, W_matrix, beta, b], s)
    return rbfnet_predict_function
	def sample_proposal_s(self):
		"""Draw a proposal for s around the prediction made from the current s.

		A uniform sample with the shape of ``self.s_now`` is shifted by -0.5
		and mapped through ``-sgn(u) * log(1 - 2 * |u|) * self.b``, then added
		to ``self.get_prediction(self.s_now)``. Assuming the uniform draw
		covers [0, 1), this matches inverse-CDF sampling of a Laplace
		distribution with scale ``self.b`` -- TODO confirm.
		"""
		# s is npcl-by-ns
		centred_uniform = self.theano_rng.uniform(size=T.shape(self.s_now)) - 0.5

		location = self.get_prediction(self.s_now)

		magnitude = T.log(1.0 - 2.0 * T.abs_(centred_uniform))
		return location - T.sgn(centred_uniform) * magnitude * self.b
Exemple #57
0
def NTanhPInp(x,
              p,
              use_noise=1,
              c=0.25,
              half_normal=False,alpha=1.1):
    """
    Noisy Tanh units where the noise is injected to the input: NANI with learning p.
    This function works well with discrete switching functions.
    ----------------------------------------------------
    Arguments:
        x: theano tensor variable, input of the function.
        p: theano shared variable, a vector of parameters for p.
        use_noise: int, whether to add noise or not to the activations, this is in particular
        useful for the test time, in order to disable the noise injection.
        c: float, standard deviation of the noise
        half_normal: bool, whether the noise should be sampled from half-normal or
        normal distribution.
        alpha: float, only consulted when half_normal is set: if alpha > 1.0
        the sign of c is flipped.
    Returns:
        HardTanh applied to the input plus the scaled noise.
    """

    logger.info("c: %f" % c)

    # Difference between the saturated and the raw input.
    delta = HardTanh(x) - x
    noise = global_trng.normal(size=x.shape,
                               avg=0,
                               std=1.0,
                               dtype=floatX)

    # Deterministic replacement used when the noise is switched off.
    noise_det = 0.
    if half_normal:
        if alpha > 1.0:
            c *= -1
        # Presumably sqrt(2 / pi) ~= 0.797, the mean of |N(0, 1)| -- TODO confirm.
        noise_det = 0.797
        noise = abs(noise)
    elif not use_noise:
        noise = 0.

    # Blend between sampled and deterministic noise according to use_noise.
    noise = use_noise * noise + (1. - use_noise) * noise_det
    # The small constant keeps the division finite when the noise is zero.
    scale = c * T.nnet.softplus(p * abs(delta) / (abs(noise) + 1e-10))
    res = HardTanh(x + scale * noise)
    return res