Example #1
File: ext.py Project: Beronx86/cle
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        for k, p in mainloop.updates.items():
            for key in self.keys:
                if key in str(k):
                    token = 1

                    for waiver in self.waivers:
                        if waiver in str(k):
                            token = 0

                    if token:
                        updated_param = mainloop.updates[k]

                        if self.is_vector:
                            col_norms = T.sqrt(T.sqr(updated_param).sum(axis=0))
                            desired_norms = T.clip(col_norms, 0, self.weight_norm)
                            ratio = (desired_norms / (1e-7 + col_norms))
                            mainloop.updates[k] = updated_param * ratio
                        else:
                            norm = T.sqrt(T.sqr(updated_param).sum())
                            desired_norm = T.clip(norm, 0, self.weight_norm)
                            ratio = (desired_norm / (1e-7 + norm))
                            mainloop.updates[k] = updated_param * ratio
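A standalone NumPy sketch of the same max-norm trick used above (illustrative only, not part of the cle project): columns whose L2 norm exceeds `weight_norm` are rescaled, smaller ones pass through unchanged.

import numpy as np

def clip_column_norms(W, weight_norm, eps=1e-7):
    # Rescale each column of W so its L2 norm is at most weight_norm.
    col_norms = np.sqrt((W ** 2).sum(axis=0))
    desired = np.clip(col_norms, 0, weight_norm)
    return W * (desired / (eps + col_norms))

W = np.random.randn(4, 3) * 5.0
W_clipped = clip_column_norms(W, weight_norm=1.0)
print(np.sqrt((W_clipped ** 2).sum(axis=0)))  # all column norms <= 1.0 (up to eps)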
Example #2
 def cost(self):
   """
   :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
   :returns: cost, known_grads
   """
   known_grads = None
   if self.loss == 'ce' or self.loss == 'priori':
     if self.attrs.get("target", "").endswith("[sparse:coo]"):
       assert isinstance(self.y, tuple)
       assert len(self.y) == 3
       from NativeOp import crossentropy_softmax_and_gradient_z_sparse
       y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
       ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
         self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
       return self.norm * T.sum(ce), {self.z: grad_z}
     if self.y_data_flat.type == T.ivector().type:
       # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
       # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
       #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1) # faster than line below
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
       nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
       #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
       #z_c = T.exp(self.z[:,self.y])
       #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
       #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
     else:
       nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)), self.y_data_flat[self.i].T)
     return self.norm * T.sum(nll), known_grads
   elif self.loss == 'entropy':
     h_e = T.exp(self.y_m) #(TB)
     pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape((self.index.shape[0],self.index.shape[1],self.attrs['n_out'])), 1.e-6, 1.e6) # TBD
     ee = -T.sum(pcx[self.i] * T.log(pcx[self.i])) # TB
     #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
     nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat) # TB
     ce = nll.reshape(self.index.shape) * self.index # TB
     y = self.y_data_flat.reshape(self.index.shape) * self.index # TB
     f = T.any(T.gt(y,0), axis=0) # B
     return T.sum(f * T.sum(ce, axis=0) + (1-f) * T.sum(ee, axis=0)), known_grads
     #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
     #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
   elif self.loss == 'priori':
     pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
     pcx = T.clip(pcx, 1.e-38, 1.e20)  # For pcx near zero, the gradient will likely explode.
     return -T.sum(T.log(pcx)), known_grads
   elif self.loss == 'sse':
     if self.y_data_flat.dtype.startswith('int'):
       y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
       y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim), T.shape_padright(y_f, 1))
       return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
     else:
       #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
       return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
       #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
       #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
       #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
       #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
   else:
     assert False, "unknown loss: %s" % self.loss
Example #3
    def get_constraint_updates(self):
        constraint_updates = OrderedDict() 
        if self.flags['scalar_lambd']:
            constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

        # constraint filters to have unit norm
        if self.flags['wv_norm'] in ('unit', 'max_unit'):
            wv = constraint_updates.get(self.Wv, self.Wv)
            wv_norm = T.sqrt(T.sum(wv**2, axis=0))
            if self.flags['wv_norm'] == 'unit':
                constraint_updates[self.Wv] = wv / wv_norm
            elif self.flags['wv_norm'] == 'max_unit':
                constraint_updates[self.Wv] = wv / wv_norm * T.minimum(wv_norm, 1.0)

        constraint_updates[self.scalar_norms] = T.maximum(1.0, self.scalar_norms)
        ## clip parameters to maximum values (if applicable)
        for (k,v) in self.clip_max.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(param, param, v)

        ## clip parameters to minimum values (if applicable)
        for (k,v) in self.clip_min.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(constraint_updates.get(param, param), v, param)

        return constraint_updates
Example #4
def custom_loss(y_true, y_pred):
  epsilon = 0.001
  first_log = T.log(T.clip(y_pred, 0.001, np.inf) + 1.)
  second_log = T.log(T.clip(y_true, 0.001, np.inf) + 1.)
  first_sum = T.log(T.sum(T.clip(y_pred, 0.001, np.inf))+1)
  second_sum = T.log(T.sum(T.clip(y_true, 0.001, np.inf))+1)
  return T.mean(T.square(first_log-second_log), axis=-1) + CMC_PENALTY*T.square(first_sum-second_sum)
Example #5
    def get_constraint_updates(self):
        constraint_updates = OrderedDict() 

        if self.flags['wv_norm'] == 'unit':
            constraint_updates[self.Wv] = self.Wv / self.norm_wv
        elif self.flags['wv_norm'] == 'max_unit':
            constraint_updates[self.Wv] = self.Wv / self.norm_wv * T.minimum(self.norm_wv, 1.0)

        if self.flags['scalar_lambd']:
            constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

        ## Enforce sparsity pattern on g if required ##
        if self.sparse_gmask:
            constraint_updates[self.Wg] = self.Wg * self.sparse_gmask.mask.T

        ## clip parameters to maximum values (if applicable)
        for (k,v) in self.clip_max.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(param, param, v)

        ## clip parameters to minimum values (if applicable)
        for (k,v) in self.clip_min.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(constraint_updates.get(param, param), v, param)

        return constraint_updates
Example #6
    def compute_hard_windows(self, image_shape, location, scale):
        # find topleft(front) and bottomright(back) corners for each patch
        a = location - 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)
        b = location + 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)

        # grow by three patch pixels
        a -= self.kernel.k_sigma_radius(self.cutoff, scale)
        b += self.kernel.k_sigma_radius(self.cutoff, scale)

        # clip to fit inside image and have nonempty window
        a = T.clip(a, 0, image_shape - 1)
        b = T.clip(b, a + 1, image_shape)

        if self.batched_window:
            # take the bounding box of all windows; now the slices
            # will have the same length for each sample and scan can
            # be avoided.  comes at the cost of typically selecting
            # more of the input.
            a = a.min(axis=0, keepdims=True)
            b = b.max(axis=0, keepdims=True)

        # make integer
        a = T.cast(T.floor(a), 'int16')
        b = T.cast(T.ceil(b), 'int16')

        return a, b
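A NumPy sketch of the corner-clipping step above, with a hypothetical `clip_window` helper (assuming 1-D arrays for the corners and the image shape): the top-left corner is kept inside the image and the bottom-right corner is forced past it, so the window is never empty.

import numpy as np

def clip_window(a, b, image_shape):
    # Keep the top-left corner inside the image and force b > a,
    # mirroring the T.clip calls above (standalone helper, not the class method).
    a = np.clip(a, 0, image_shape - 1)
    b = np.clip(b, a + 1, image_shape)
    return np.floor(a).astype('int16'), np.ceil(b).astype('int16')

a, b = clip_window(np.array([-3.2, 10.7]), np.array([2.5, 9.0]), np.array([32, 32]))
print(a, b)  # a clipped to >= 0, b forced to exceed a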
Example #7
def gaussian_likelihood_diagonal_variance(t, mu, sig, dim):
    """
    Gaussian Likelihood along first dimension
    Parameters
    ----------
    t   : TensorVariable
    mu  : FullyConnected (Linear)
    sig : FullyConnected (Softplus)
    dim : First dimension of the target vector t
    """
    # First clip sig
    sig_clip = T.clip(sig, 1e-40, 1e40)

    # Since the covariance matrix is diagonal, the normalization term is easier to compute,
    # and numerical overflow can be reduced by multiplying each variance by 2*pi and taking
    # the square root element-wise before forming the product
    sig_time_2pi = T.sqrt(sig_clip * 2 * math.pi)

    #######################
    #######################
    # This is the problem... product goes to 0
    normalization_coeff = T.clip(T.prod(sig_time_2pi, axis=0), 1e-40, 1e40)
    #######################
    #######################

    # Once again, the fact that sig is diagonal allows a simplification:
    # term-by-term division instead of an inverse-matrix multiplication
    exp_term = (T.exp(- 0.5 * (t-mu) * (t-mu) / sig_clip).sum(axis=0))
    pdf = exp_term / normalization_coeff
    return pdf
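The comment above flags the real problem: the product of per-dimension terms underflows. A common workaround, sketched here in NumPy rather than taken from the original code, is to work in log space, where the product becomes a sum.

import math
import numpy as np

def gaussian_log_likelihood_diag(t, mu, sig, eps=1e-40):
    # Log-density of a diagonal Gaussian, summed over the first dimension;
    # no product is ever formed, so nothing underflows to zero.
    sig = np.clip(sig, eps, 1e40)
    log_norm = -0.5 * np.log(2 * math.pi * sig)
    log_exp = -0.5 * (t - mu) ** 2 / sig
    return (log_norm + log_exp).sum(axis=0)

t = np.random.randn(100, 5)
print(gaussian_log_likelihood_diag(t, np.zeros_like(t), np.ones_like(t)).shape)  # (5,)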
Example #8
    def get_constraint_updates(self):
        
        updates = OrderedDict()

        ## unit-variance constraint on hidden-unit activations ##
        if self.flags['unit_std']:
            updates[self.Wv] = self.Wv / self.avg_hact_std

        ## clip parameters to maximum values (if applicable)
        for (k,v) in self.clip_max.iteritems():
            assert k in [param.name for param in self.params()]
            param = getattr(self, k)
            updates[param] = T.clip(param, param, v)

        ## clip parameters to minimum values (if applicable)
        for (k,v) in self.clip_min.iteritems():
            assert k in [param.name for param in self.params()]
            param = getattr(self, k)
            updates[param] = T.clip(updates.get(param, param), v, param)
        
        ## constrain lambd to be a scalar
        if self.flags['scalar_lambd']:
            lambd = updates.get(self.lambd, self.lambd)
            updates[self.lambd] = T.mean(lambd) * T.ones_like(lambd)

        return updates
Example #9
	def build_and_train_model(self,n_hu,n_hl):
		print('Building Model')

		input_phrase = T.imatrix('train_inputmatrix')
		labels = T.imatrix('trainphrase_matrix')

		network = self.define_layers(input_phrase,labels,n_hu,n_hl)

		print("Defining loss")
		#Prediction or loss
		prediction = []
		prediction.append(T.clip(lasagne.layers.get_output(network[0]),1.0e-7,1.0-1.0e-7))
		prediction.append(T.clip(lasagne.layers.get_output(network[1]),1.0e-7,1.0-1.0e-7))

		loss = l.define_loss(prediction[0],prediction[1])
		self.model = network
		#define params
		params = lasagne.layers.get_all_params(network)
		updates = lasagne.updates.adadelta(loss,params)

		#run test

		train_fn = theano.function([input_phrase,labels],[loss, prediction[0], prediction[1]],updates=updates,allow_input_downcast=True)

		print("Model and params defined now training")
		epoch = 0
		for epoch in range(self.end_epoch):
			train_loss = 0
			train_pred = []
			start_time = time.time()
			loss, predicted, phrase = train_fn(self.train_inputmatrix,self.trainphrase_matrix)
			print('Training Loss: ' + str(loss) + ' Train Epoch ' + str(epoch))
			self.save_best(loss,predicted,network)
Example #10
    def __init__(self, rng, input, filter_shape, image_shape, W=None, bias=False, padding='valid', activation=T.nnet.relu):

        assert image_shape[1] == filter_shape[1]
        self.input = input
        fan_in = numpy.prod(filter_shape[1:])
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        if W is None:
            W = theano.shared(
                numpy.asarray(
                    rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        self.W = W

        conv_out = K.conv2d(
            x=input,
            kernel=self.W,
            filter_shape=filter_shape,
            image_shape=image_shape,
            border_mode=padding
        )
        
        if bias:
            b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
            self.b = theano.shared(value=b_values, borrow=True)
            self.output = T.clip(activation(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')), 0.001, 0.999)
            self.params = [self.W, self.b]
        else:
            self.output = T.clip(activation(conv_out), 0.001, 0.999)
            self.params = [self.W]
        self.input = input
Example #11
def kl_divergence(y_true, y_pred):

    y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
    y_true = T.clip(y_true, epsilon, 1.0 - epsilon)

    kld = T.mean(y_true * ( T.log(y_true) - T.log(y_pred)))
    return kld
Example #12
    def redo_theano(self):

        self.h = shared(N.zeros(self.nhid, dtype=floatX), name="h")
        self.v = shared(N.zeros(self.nvis, dtype=floatX), name="v")

        input_v = T.vector()
        assert input_v.type.dtype == floatX

        self.init_h_v = function([input_v], updates={self.h: self.predict(input_v), self.v: input_v})

        coding_obj = self.coding_obj(self.v, self.h)
        assert len(coding_obj.type.broadcastable) == 0

        coding_grad = T.grad(coding_obj, self.h)
        assert len(coding_grad.type.broadcastable) == 1

        self.coding_obj_grad = function([], [coding_obj, coding_grad])

        self.new_h = shared(N.zeros(self.nhid, dtype=floatX), name="new_h")

        alpha = T.scalar(name="alpha")

        outside_grad = T.vector(name="outside_grad")

        new_h = T.clip(self.h * T.exp(-alpha * outside_grad), 1e-10, 1e4)

        new_obj = self.coding_obj(self.v, new_h)

        self.try_step = function([alpha, outside_grad], updates={self.new_h: new_h}, outputs=new_obj)

        self.accept_h = function([], updates={self.h: self.new_h})

        self.get_h = function([], self.h)

        V = T.matrix(name="V")
        H = T.matrix(name="H")

        coding_obj_batch = self.coding_obj_batch(V, H)

        self.code_learning_obj = function([V, H], coding_obj_batch)

        learning_grad = T.grad(coding_obj_batch, self.W)
        self.code_learning_step = function([V, H, alpha], updates={self.W: self.W - alpha * learning_grad})

        pred_obj = T.mean(T.sqr(self.predict(V) - H))

        predictor_params = [self.pred_W, self.pred_b, self.pred_g]

        pred_grads = T.grad(pred_obj, wrt=predictor_params)

        predictor_updates = {}

        for param, grad in zip(predictor_params, pred_grads):
            predictor_updates[param] = param - alpha * grad

        predictor_updates[self.pred_g] = T.clip(
            predictor_updates[self.pred_g], N.cast[floatX](0.5), N.cast[floatX](1000.0)
        )

        self.train_predictor = function([V, H, alpha], updates=predictor_updates)
Example #13
def sigmoid_readout(operators, v_in, h_L, external):
    """Sigmoid readout layer. Cost is the binary crossentropy and
    monitor is RMSE.
    :param operators: list of [weight, bias] with shapes (n_hidden, n_visible)
        and (n_visible, )
    :param h_L: shape (timesteps, n_hidden)
    :return: shape (timesteps, n_visible)
    """
    weight = operators[0]
    bias = operators[1]
    v_pred = sigmoid(T.dot(h_L, weight) + bias)  # broadcastable bias??
    v_pred_c = T.clip(v_pred, 1.0e-7, 1.0 - 1.0e-7)
    v_in_c = T.clip(v_in, 1.0e-7, 1.0 - 1.0e-7)

    # Sample is just rounded to nearest integer:
    v_sample = T.round(v_pred)
    v_sample_c = T.clip(v_sample, eps, 1.0 - eps)

    # Cost:
    # cost = 1000 * ((v_pred[:-1] - v_in[1:]) ** 2).mean()
    # cost = -T.xlogx.xlogy0(v_in_c[1:], v_pred_c[:-1]) - \
    #       T.xlogx.xlogy0(1 - v_in_c[1:], 1 - v_pred_c[:-1])
    cost = crossent(v_pred_c[:-1], v_in_c[1:])  # TODO: v_sample_c !!!
    cost = cost.mean()

    # Monitor:
    # monitor = -T.xlogx.xlogy0(v_in_c[1:], v_sample_c[:-1]) - \
    #          T.xlogx.xlogy0(1 - v_in_c[1:], 1 - v_sample_c[:-1])
    monitor = crossent(v_sample_c[:-1], v_in_c[1:])
    monitor = monitor.mean()

    return v_sample, cost, monitor, None
Example #14
 def _modify_updates(self, updates):
     if self.zero_hidbias:
         hidbias_updated = updates[self.hidbias]
         updates[self.hidbias] = tensor.clip(hidbias_updated, 0, 0)
     if self.zero_visbias:
         visbias_updated = updates[self.visbias]
         updates[self.visbias] = tensor.clip(visbias_updated, 0, 0)
Example #15
def sigmoid_readout_old(operators, v_in, h_L, g):
    """Sigmoid readout layer. Cost is the binary crossentropy and
    monitor is RMSE.
    :param operators: list of [weight, bias] with shapes (n_hidden, n_visible)
        and (n_visible, )
    :param h_L: shape (timesteps, n_hidden)
    :return: shape (timesteps, n_visible)
    """
    weight = operators[0]
    bias = operators[1]
    v_pred = g(T.dot(h_L, weight) + bias)  # broadcastable bias??
    v_pred_c = T.clip(v_pred, 1.0e-7, 1.0 - 1.0e-7)
    v_in_c = T.clip(v_in, 1.0e-7, 1.0 - 1.0e-7)

    # Cost:
    cost = -T.xlogx.xlogy0(v_in_c[1:], v_pred_c[:-1]) - T.xlogx.xlogy0(1 - v_in_c[1:], 1 - v_pred_c[:-1])
    cost = cost.sum() / v_in.shape[0]

    # Sample is just rounded to nearest integer:
    v_sample = T.round(v_pred)
    v_sample_c = T.clip(v_sample, 1.0e-7, 1.0 - 1.0e-7)

    # Monitor (needs to return something... for now):
    monitor = -T.xlogx.xlogy0(v_in_c[1:], v_sample_c[:-1]) - T.xlogx.xlogy0(1 - v_in_c[1:], 1 - v_sample_c[:-1])
    monitor = monitor.sum() / v_in.shape[0]

    return v_sample, cost, monitor, None
Example #16
def softmax_readout(operators, v_in, h_L, external):
    """Softmax readout layer. Cost is the binary crossentropy and
    monitor is RMSE.
    :param operators: list of [weight, bias] with shapes (n_hidden, n_visible)
        and (n_visible, )
    :param h_L: shape (timesteps, n_hidden)
    :return: shape (timesteps, n_visible)
    """
    weight = operators[0]
    bias = operators[1]

    v_pred = softmax(T.dot(h_L, weight) + bias)  # broadcastable bias??
    v_pred_c = T.clip(v_pred, 1.0e-7, 1.0 - 1.0e-7)
    v_in_c = T.clip(v_in, 1.0e-7, 1.0 - 1.0e-7)

    # Sampled value is just the argmax of softmax:
    v_sample = rng.multinomial(pvals=v_pred, dtype=theano.config.floatX)
    v_sample_c = T.clip(v_sample, eps, 1.0 - eps)

    # Cost:
    # cost = 1000 * ((v_pred[:-1] - v_in[1:]) ** 2).mean()
    # cost = -T.xlogx.xlogy0(v_in_c[1:], v_pred_c[:-1]) - \
    #       T.xlogx.xlogy0(1 - v_in_c[1:], 1 - v_pred_c[:-1])
    cost = crossent(v_pred_c[:-1], v_in_c[1:])
    cost = cost.mean()

    # Monitor:
    # monitor = -T.xlogx.xlogy0(v_in_c[1:], v_sample_c[:-1]) - \
    #          T.xlogx.xlogy0(1 - v_in_c[1:], 1 - v_sample_c[:-1])
    # TODO: changed monitor to v_pred_c!!!
    monitor = crossent(v_pred_c[:-1], v_in_c[1:])
    monitor = monitor.mean()

    return v_sample, cost, monitor, None
Example #17
def lcn_std_diff(x,size=9):
    # Function borrowed from bengioe_util
    p = x.reshape((1,1,48,48))
    #p = (p-TT.mean(p))/T.std(p)
    g = gaussian(size,1.591/size)
    g/=g.sum()
    g = numpy.float32(g.reshape((1,1,size,size)))
    mean = TT.nnet.conv.conv2d(p,TT.constant(g),
                              (1,1,48,48),
                              (1,1,size,size),
                              'full').reshape((48+size-1,)*2)
    mean = mean[size/2:48+size/2,
                size/2:48+size/2]
    meansq = TT.nnet.conv.conv2d(TT.sqr(p),TT.constant(g),
                                (1,1,48,48),
                                (1,1,size,size),
                                'full').reshape((48+size-1,)*2)
    meansq = meansq[size/2:48+size/2,
                    size/2:48+size/2]
    var = meansq - TT.sqr(mean)
    var = TT.clip(var, 0, 1e30)
    std = TT.sqrt(var)
    std = TT.clip(std, TT.mean(std), 1e30)
    out = (p - mean) / std
    return out - out.min()
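A NumPy sketch of the two guards above (illustrative, with a hypothetical `safe_std` helper): the variance is clamped at zero before the square root, and the per-pixel standard deviation is floored at its mean so flat regions do not blow up the normalization.

import numpy as np

def safe_std(meansq, mean):
    var = np.clip(meansq - mean ** 2, 0, 1e30)   # no negative variances from rounding
    std = np.sqrt(var)
    return np.clip(std, std.mean(), 1e30)        # floor at the mean std

print(safe_std(np.array([1.0, 2.0, 0.2499]), np.array([0.5, 1.0, 0.5])))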
Example #18
def init_process(model, gaussian, delta, fn_type):
    print("Building model and compiling functions...")
    # Prepare Theano variables for inputs and targets
    import theano.tensor as T
    input_var_list = [T.tensor4('inputs{}'.format(i))
                      for i in range(scales)]
    target_var = T.imatrix('targets')

    # Create network model
    if model == 'jy':
        print('Building JY CNN...')
        network = JY_cnn(input_var_list, gaussian, delta)
        learning_rate = 0.006
    # elif model == 'fcrnn':
    #     print('Building FCRNN...')
    #     network = FCRNN(input_var_list, delta)
    #     learning_rate = 0.0005

    print('defining loss function')
    prediction = lasagne.layers.get_output(network)
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = loss.mean()

    print('defining update')
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=learning_rate, momentum=0.9)
    # updates = lasagne.updates.adagrad(loss, params, learning_rate=learning_rate)
    

    print('defining testing method')
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.clip(test_prediction, 1e-7, 1.0 - 1e-7)

    #frame prediction
    layer_list = lasagne.layers.get_all_layers(network)
    gauss_layer = layer_list[-3]
    pre_gauss_layer = layer_list[-4] if gaussian else layer_list[-3]
    gauss_pred = lasagne.layers.get_output(gauss_layer, deterministic=True)
    pre_gauss_pred = lasagne.layers.get_output(pre_gauss_layer, deterministic=True)


    test_loss = lasagne.objectives.binary_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_pred_result = T.argmax(test_prediction, axis=1)
    target_result = T.argmax(target_var, axis=1)
    test_acc = T.mean(T.eq(test_pred_result, target_result),
                      dtype=theano.config.floatX)

    if fn_type == 'train':
        print('compiling training function')
        func = theano.function(input_var_list + [target_var], 
                    [loss, prediction, gauss_pred, pre_gauss_pred], updates=updates)
    elif fn_type == 'val' or fn_type == 'test':
        print('compiling validation and testing function')
        func = theano.function(input_var_list + [target_var], 
                    [test_loss, test_acc, test_pred_result, test_prediction, gauss_pred, pre_gauss_pred])

    return func, network
Example #19
    def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
        clip = params["grad_clip"]
        decay_rate = tensor.constant(params["decay_rate"], dtype=theano.config.floatX)
        smooth_eps = tensor.constant(params["smooth_eps"], dtype=theano.config.floatX)
        zipped_grads = [theano.shared(np.zeros_like(p.get_value()), name="%s_grad" % k) for k, p in tparams.iteritems()]
        running_grads2 = [
            theano.shared(np.zeros_like(p.get_value()), name="%s_rgrad2" % k) for k, p in tparams.iteritems()
        ]
        zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
        if clip > 0.0:
            rg2up = [
                (
                    rg2,
                    tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2), 0.0, np.inf),
                )
                for rg2, g in zip(running_grads2, grads)
            ]
        else:
            rg2up = [
                (rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2), 0.0, np.inf))
                for rg2, g in zip(running_grads2, grads)
            ]

        f_grad_shared = theano.function(inp_list, cost, updates=zgup + rg2up, name="rmsprop_f_grad_shared")

        updir = [theano.shared(p.get_value() * numpy_floatX(0.0), name="%s_updir" % k) for k, p in tparams.iteritems()]
        updir_new = [
            (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps)) for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)
        ]
        param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
        f_update = theano.function(
            [lr], [], updates=updir_new + param_up, on_unused_input="ignore", name="rmsprop_f_update"
        )

        return f_grad_shared, f_update, zipped_grads, running_grads2, updir
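A single RMSProp step with element-wise gradient clipping applied before the accumulator and the step, written as a standalone NumPy sketch rather than the shared-variable/theano.function setup above; the names and default values are assumptions for illustration.

import numpy as np

def rmsprop_step(param, grad, rg2, lr=1e-3, decay=0.95, clip=1.0, eps=1e-8):
    # Clip the raw gradient, update the running mean of squared gradients,
    # then take a step scaled by the inverse RMS.
    g = np.clip(grad, -clip, clip)
    rg2 = decay * rg2 + (1 - decay) * g ** 2
    return param - lr * g / (np.sqrt(rg2) + eps), rg2

p, rg2 = np.ones(3), np.zeros(3)
p, rg2 = rmsprop_step(p, np.array([10.0, -0.1, 0.3]), rg2)
print(p)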
Example #20
 def train(self, X, evalinter=10):
     '''
     function to call to train this NMF GD on given matrix X
     Calls trainingloop()
     '''
     self.initvars(X)
     # define errors and cost
     tErr = (1./2.) * ((self.X - T.dot(self.W, self.H))**2).sum()
     tReg = (1./2.) * ((self.W**2).sum() * self.Wreg + (self.H**2).sum() * self.Hreg)
     tCost = tErr + tReg
     # get gradients
     gW, gH = T.grad(tCost, [self.W, self.H])
     # define updates and function
     updW = (self.W, T.clip(self.W - self.lr * gW, 0, np.infty))
     updH = (self.H, T.clip(self.H - self.lr * gH, 0, np.infty))
     trainf = theano.function(
         inputs=[],
         outputs=[tErr],
         updates=[updW, updH]
     )
     normf = theano.function(
         inputs=[],
         outputs=[],
         updates=[
             (self.W, (self.W.T/T.sum(self.W, axis=1)).T),
             #
         ]
     )
     # train loop
     err = self.trainloop(X, trainf=trainf, evalinter=evalinter)
     return self.W.get_value(), self.H.get_value(), err
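A NumPy sketch of the projected-gradient idea behind `updW`/`updH` above: take a gradient step on the regularized least-squares cost, then clip at zero to keep both factors non-negative (hypothetical helper, not the class method).

import numpy as np

def projected_gd_step(X, W, H, lr=1e-3, Wreg=0.0, Hreg=0.0):
    # Gradient of 0.5*||X - WH||^2 + 0.5*(Wreg*||W||^2 + Hreg*||H||^2),
    # followed by clipping at zero (the projection onto the non-negative orthant).
    R = np.dot(W, H) - X
    gW = np.dot(R, H.T) + Wreg * W
    gH = np.dot(W.T, R) + Hreg * H
    return np.clip(W - lr * gW, 0, np.inf), np.clip(H - lr * gH, 0, np.inf)

X = np.abs(np.random.rand(6, 4))
W, H = np.abs(np.random.rand(6, 2)), np.abs(np.random.rand(2, 4))
W, H = projected_gd_step(X, W, H)
print(W.min() >= 0 and H.min() >= 0)  # True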
Example #21
    def get_output_for(self, inputs, **kwargs):
        mu_area, sigma_area, is_not_padded, slicedists = inputs

        # Rescale input
        mu_area = mu_area / self.rescale_input
        sigma_area = sigma_area / self.rescale_input

        # For each slice pair, compute if both of them are valid
        is_pair_not_padded = is_not_padded[:, :-1] + is_not_padded[:, 1:] > 1.5

        # Compute the distance between slices
        h = slicedists[:, :-1]

        # Compute mu for each slice pair
        m1 = mu_area[:, :-1]
        m2 = mu_area[:, 1:]
        eps = 1e-2
        mu_volumes = (m1 + m2 + T.sqrt(T.clip(m1*m2, eps, utils.maxfloat))) * h / 3.0
        mu_volumes = mu_volumes * is_pair_not_padded

        # Compute sigma for each slice pair
        s1 = sigma_area[:, :-1]
        s2 = sigma_area[:, 1:]
        sigma_volumes = h*(s1 + s2) / 3.0
        sigma_volumes = sigma_volumes * is_pair_not_padded

        # Compute mu and sigma per patient
        mu_volume_patient = T.sum(mu_volumes, axis=1)
        sigma_volume_patient = T.sqrt(T.clip(T.sum(sigma_volumes**2, axis=1), eps, utils.maxfloat))

        # Concat and return
        return T.concatenate([
            mu_volume_patient.dimshuffle(0, 'x'),
            sigma_volume_patient.dimshuffle(0, 'x')], axis=1)
Example #22
File: unet.py Project: Rhoana/icon
    def unet_crossentropy_loss_sampled(y_true, y_pred):
        epsilon = 1.0e-4
        y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0-epsilon))
        y_true = T.flatten(y_true)
        # this seems to work
        # it is super ugly though and I am sure there is a better way to do it
        # but I am struggling with theano to cooperate
        # filter the right indices
        classPos = 1
        classNeg = 0
        indPos   = T.eq(y_true, classPos).nonzero()[0]
        indNeg   = T.eq(y_true, classNeg).nonzero()[0]
        #pos      = y_true[ indPos ]
        #neg      = y_true[ indNeg ]

        # shuffle
        n = indPos.shape[0]
        indPos = indPos[UNET.srng.permutation(n=n)]
        n = indNeg.shape[0]
        indNeg = indNeg[UNET.srng.permutation(n=n)]
        # take equal number of samples depending on which class has less
        n_samples = T.cast(T.min([ indPos.shape[0], indNeg.shape[0]]), dtype='int64')
        #n_samples = T.cast(T.min([T.sum(y_true), T.sum(1-y_true)]), dtype='int64')

        indPos = indPos[:n_samples]
        indNeg = indNeg[:n_samples]
        #loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1-y_pred_clipped[indNeg]))
        loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(y_pred_clipped[indNeg]))
        loss_vector = T.clip(loss_vector, epsilon, 1.0-epsilon)
        average_loss = T.mean(loss_vector)
        # average_loss is symbolic, so guard against NaN with T.switch rather than a Python if
        average_loss = T.switch(T.isnan(average_loss), T.mean(y_pred_clipped[indPos]), average_loss)
        return average_loss
Example #23
def theano_mu_sigma_erf(mu_erf, sigma_erf, eps=1e-7):
    x_axis = theano.shared(np.arange(0, 600, dtype='float32')).dimshuffle('x',0)
    if sigma_erf.ndim==0:
        sigma_erf = T.clip(sigma_erf.dimshuffle('x','x'), eps, 1)
    elif sigma_erf.ndim==1:
        sigma_erf = T.clip(sigma_erf.dimshuffle(0,'x'), eps, 1)
    x = (x_axis - mu_erf.dimshuffle(0,'x')) / (sigma_erf * np.sqrt(2).astype('float32'))
    return (T.erf(x) + 1)/2
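A NumPy/SciPy sketch of the same erf-based Gaussian CDF on a fixed grid, with sigma floored to avoid division by zero (the grid length and dtype mirror the code above; the helper name and 1-D mu/sigma assumption are mine).

import numpy as np
from scipy.special import erf

def mu_sigma_cdf(mu, sigma, eps=1e-7, n=600):
    # Evaluate the Gaussian CDF at integer grid points 0..n-1 for each (mu, sigma) pair.
    x_axis = np.arange(0, n, dtype='float32')[None, :]
    sigma = np.clip(np.asarray(sigma, dtype='float32')[:, None], eps, None)
    x = (x_axis - np.asarray(mu, dtype='float32')[:, None]) / (sigma * np.sqrt(2))
    return (erf(x) + 1) / 2

print(mu_sigma_cdf(np.array([300.0]), np.array([10.0])).shape)  # (1, 600)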
Example #24
def objective(y_true, y_pred, P, Q, alpha=0., beta=0.15, dbeta=0., gamma=0.01, gamma1=-1., poos=0.23, eps=1e-6):
    '''Expects a binary class matrix instead of a vector of scalar classes.
    '''

    beta = np.float32(beta)
    dbeta = np.float32(dbeta)
    gamma = np.float32(gamma)
    poos = np.float32(poos)
    eps = np.float32(eps)

    # scale preds so that the class probas of each sample sum to 1
    y_pred += eps
    y_pred /= y_pred.sum(axis=-1, keepdims=True)

    y_true = T.cast(y_true.flatten(), 'int64')
    y1 = T.and_(T.gt(y_true, 0), T.le(y_true, Q))  # in-set
    y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
    y0sum = y0.sum() + eps  # number of oos
    y1sum = y1.sum() + eps  # number of in-set
    # we want to reduce the cross-entropy of labeled data
    # convert all oos/unlabeled to label=0
    cost0 = T.nnet.categorical_crossentropy(y_pred, T.switch(y_true <= Q, y_true, 0))
    cost0 = T.dot(y1, cost0) / y1sum  # average cost per labeled example

    if alpha:
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per unlabeled / out-of-set example
        cost0 += alpha*cost1

    # we want to increase the average entropy in each batch
    # average over batch
    if beta:
        y_pred_avg0 = T.dot(y0, y_pred) / y0sum
        y_pred_avg0 = T.clip(y_pred_avg0, eps, np.float32(1) - eps)
        y_pred_avg0 /= y_pred_avg0.sum(axis=-1, keepdims=True)
        cost2 = T.nnet.categorical_crossentropy(y_pred_avg0.reshape((1,-1)), P-dbeta)[0] # [None,:]
        cost2 = T.switch(y0sum > 0.5, cost2, 0.)  # ignore cost2 if no samples
        cost0 += beta*cost2

    # binary classifier score
    if gamma:
        y_pred0 = T.clip(y_pred[:,0], eps, np.float32(1) - eps)
        if gamma1 < 0.:
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot(np.float32(1)-poos*y0.T,T.log(np.float32(1)-y_pred0))
            cost3 /= y_pred.shape[0]
            cost0 += gamma*cost3
        elif gamma1 > 0.:
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot((np.float32(1)-poos)*y0,T.log(np.float32(1)-y_pred0))
            cost3 /= y0sum
            cost31 = - T.dot(y1, T.log(np.float32(1)-y_pred0))
            cost31 /= y1sum
            cost0 += gamma*cost3 + gamma1*cost31
        else:  # gamma1 == 0.
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot((np.float32(1)-poos)*y0, T.log(np.float32(1)-y_pred0))
            cost3 /= y0sum
            cost0 += gamma*cost3
    return cost0
Example #25
def build_objective2(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.flatten(nn.layers.get_output(model.l_target))
    targets = T.clip(targets, 0, 1)
    p_no_nodule = predictions[:,0]
    p_nodule = np.float32(1.)-p_no_nodule
    p = T.clip(p_nodule, epsilon, 1.-epsilon)
    bce = T.nnet.binary_crossentropy(p, targets)
    return T.mean(bce)
Example #26
def kl_divergence(target, prediction, eps=1e-6):
    '''Kullback-Leibler divergence'''
    prediction = T.reshape(prediction, (prediction.shape[1], prediction.shape[2]))
    target = T.reshape(target, (target.shape[1], target.shape[2]))
    prediction = T.clip(prediction, eps, 1 - eps)
    target = T.clip(target, eps, 1 - eps)

    kl = T.sum(target * T.log(target / prediction), axis=0, keepdims=True)
    return kl
Example #27
def _interpolate(im, x, y, out_height, out_width):
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # scale indices from [-1, 1] to [0, width/height].
    x = (x + 1) / 2 * width_f
    y = (y + 1) / 2 * height_f

    # Clip indices to ensure they are not out of bounds.
    max_x = width_f - 1
    max_y = height_f - 1
    x0 = T.clip(x, 0, max_x)
    x1 = T.clip(x + 1, 0, max_x)
    y0 = T.clip(y, 0, max_y)
    y1 = T.clip(y + 1, 0, max_y)

    # We need floatX for interpolation and int64 for indexing.
    x0_f = T.floor(x0)
    x1_f = T.floor(x1)
    y0_f = T.floor(y0)
    y1_f = T.floor(y1)
    x0 = T.cast(x0, 'int64')
    x1 = T.cast(x1, 'int64')
    y0 = T.cast(y0, 'int64')
    y1 = T.cast(y1, 'int64')

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width*height
    base = T.repeat(
        T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
    base_y0 = base + y0*dim2
    base_y1 = base + y1*dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a]
    Ib = im_flat[idx_b]
    Ic = im_flat[idx_c]
    Id = im_flat[idx_d]

    # calculate interpolated values
    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')
    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')
    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')
    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
    return output
Example #28
def _interpolate(im, x, y, out_height, out_width):
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # clip coordinates to [-1, 1]
    x = T.clip(x, -1, 1)
    y = T.clip(y, -1, 1)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
    # we need those in floatX for interpolation and in int64 for indexing. for
    # indexing, we need to take care they do not extend past the image.
    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1
    x0 = T.cast(x0_f, 'int64')
    y0 = T.cast(y0_f, 'int64')
    x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
    y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width*height
    base = T.repeat(
        T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
    base_y0 = base + y0*dim2
    base_y1 = base + y1*dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a]
    Ib = im_flat[idx_b]
    Ic = im_flat[idx_c]
    Id = im_flat[idx_d]

    # calculate interpolated values
    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')
    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')
    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')
    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)

    assert str(output.dtype) == theano.config.floatX, str(output.dtype)
    return output
Example #29
def _interpolate(im, x, y, out_height, out_width):
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, 'float32')
    width_f = T.cast(width, 'float32')
    zero = T.zeros([], dtype='int64')
    max_y = im.shape[1] - 1
    max_x = im.shape[2] - 1

    # scale indices from [-1, 1] to [0, width/height].
    x = (x + 1.0)*(width_f) / 2.0
    y = (y + 1.0)*(height_f) / 2.0

    x0 = T.cast(T.floor(x), 'int64')
    x1 = x0 + 1
    y0 = T.cast(T.floor(y), 'int64')
    y1 = y0 + 1

    # Clip indices to ensure they are not out of bounds.
    x0 = T.clip(x0, zero, max_x)
    x1 = T.clip(x1, zero, max_x)
    y0 = T.clip(y0, zero, max_y)
    y1 = T.clip(y1, zero, max_y)

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width*height
    base = _repeat(
        T.arange(num_batch, dtype='int32')*dim1, out_height*out_width)
    base_y0 = base + y0*dim2
    base_y1 = base + y1*dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a]
    Ib = im_flat[idx_b]
    Ic = im_flat[idx_c]
    Id = im_flat[idx_d]

    # calculate interpolated values
    x0_f = T.cast(x0, 'float32')
    x1_f = T.cast(x1, 'float32')
    y0_f = T.cast(y0, 'float32')
    y1_f = T.cast(y1, 'float32')
    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')
    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')
    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')
    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
    return output
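All three `_interpolate` variants clip the neighbour indices so that out-of-range coordinates reuse the border pixel. A minimal NumPy sketch of that index clipping (hypothetical helper, 1-D for brevity):

import numpy as np

def neighbor_indices(coords, size):
    # Floor/ceil neighbours for bilinear sampling, clipped into [0, size-1]
    # so out-of-range coordinates fall back to the border pixel.
    i0 = np.floor(coords).astype('int64')
    i1 = i0 + 1
    return np.clip(i0, 0, size - 1), np.clip(i1, 0, size - 1)

print(neighbor_indices(np.array([-0.5, 3.2, 9.9]), size=10))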
Example #30
        def _step_state(x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_, v_t_, t_t_, turn_vec_t, ctrl, exist, time_step):

            a_t_e, v_t_e, x_t_e, t_t, t_h = step(x_h_, v_h_, t_h_, turn_vec_h, x_t_, v_t_, t_t_, turn_vec_h, exist, time_step)

            t_h = common.disconnected_grad(t_h)
            t_t = common.disconnected_grad(t_t)

            # approximated dynamic of the un-observed parts in the state
            a_t_a = tt.zeros(shape=(3,2), dtype=np.float32)

            v_t_a = v_t_

            x_t_a = x_t_ + self.dt * v_t_a

            # difference in predictions
            n_v_t = v_t_e - v_t_a

            n_a_t = a_t_e - a_t_a

            n_x_t = x_t_e - x_t_a

            # disconnect the gradient of the noise signals
            n_v_t = common.disconnected_grad(n_v_t)

            n_a_t = common.disconnected_grad(n_a_t)

            n_x_t = common.disconnected_grad(n_x_t)

            # add the noise to the approximation
            a_t = a_t_a + n_a_t

            v_t = v_t_a + n_v_t

            x_t = x_t_a + n_x_t

            # update the observed part of the state
            delta_steer = ctrl[0]
            accel = ctrl[1]

            delta_steer = tt.clip(delta_steer, -np.pi/4, np.pi/4)

            angle = angle_ + delta_steer

            speed = speed_ + accel * self.dt

            speed = tt.clip(speed, 0, self.v_max)

            v_h_x = speed * tt.sin(angle)
            v_h_y = speed * tt.cos(angle)

            v_h = tt.stack([v_h_x,v_h_y])

            x_h = x_h_ + self.dt * v_h
            x_h = tt.clip(x_h, -self.bw, self.bw)

            return x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t
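A toy NumPy sketch of the three clips on the observed state above: the steering command is bounded, the resulting speed is kept in [0, v_max], and the position is kept inside the board half-width. The default values are made up for illustration.

import numpy as np

def clip_controls(delta_steer, accel, speed, dt=0.1, v_max=10.0, bw=50.0, x=0.0):
    # Bound the steering angle, the updated speed, and the updated position,
    # mirroring the tt.clip calls in the step function above.
    delta_steer = np.clip(delta_steer, -np.pi / 4, np.pi / 4)
    speed = np.clip(speed + accel * dt, 0.0, v_max)
    x = np.clip(x + dt * speed, -bw, bw)
    return delta_steer, speed, x

print(clip_controls(1.2, -50.0, 3.0))  # steering capped at pi/4, speed floored at 0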
Example #31
def clipped_gradients(gradients, gradient_clipping):
    clipped_grads = [
        T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients
    ]
    return clipped_grads
Example #32
def clip(x, min_value, max_value):
    if max_value < min_value:
        max_value = min_value
    return T.clip(x, min_value, max_value)
Example #33
def binary_crossentropy(output, target, from_logits=False):
    if from_logits:
        output = T.nnet.sigmoid(output)
    # avoid numerical instability with _EPSILON clipping
    output = T.clip(output, _EPSILON, 1.0 - _EPSILON)
    return T.nnet.binary_crossentropy(output, target)
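A NumPy sketch of why the epsilon clipping matters: predictions exactly at 0 or 1 would send the logs to -inf, while the clipped version stays finite (illustrative reimplementation, not the Keras/Theano internals).

import numpy as np

_EPSILON = 1e-7

def binary_crossentropy_np(output, target):
    # Clip predictions away from 0 and 1 so both logs stay finite.
    output = np.clip(output, _EPSILON, 1.0 - _EPSILON)
    return -(target * np.log(output) + (1 - target) * np.log(1 - output))

print(binary_crossentropy_np(np.array([0.0, 1.0, 0.5]), np.array([0.0, 1.0, 1.0])))  # finite everywhere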
Example #34
    def __init__(self,
                 extractor,
                 dataset,
                 train_batch_size=16,
                 extractor_learning_rate=1e-5,
                 ranker_learning_rate=1e-4,
                 weight_decay=1e-5,
                 optimizer=lasagne.updates.rmsprop,
                 ranker_nonlinearity=lasagne.nonlinearities.linear,
                 debug=False,
                 do_log=True):

        self.train_batch_size = train_batch_size
        self.extractor = extractor
        self.dataset = dataset
        self.weight_decay = weight_decay
        self.optimizer = optimizer
        self.ranker_nonlinearity = ranker_nonlinearity
        self.extractor_learning_rate = extractor_learning_rate
        self.ranker_learning_rate = ranker_learning_rate
        self.debug = debug
        self.do_log = do_log

        if force_not_log:
            self.do_log = False
            logger.warning('Not logging because pastalog is not installed.')

        extractor_name = self.extractor.__class__.__name__
        if extractor.augmentation:
            extractor_name = "%s-aug" % extractor_name

        self.NAME = "e:%s-d:%s-bs:%d-elr:%f-rlr:%f-opt:%s-rnl:%s-wd:%f-rs:%s" % (
            extractor_name, self.dataset.get_name(), self.train_batch_size,
            extractor_learning_rate, ranker_learning_rate,
            self.optimizer.__name__, self.ranker_nonlinearity.__name__,
            self.weight_decay, str(settings.RANDOM_SEED))
        if self.do_log:
            self.pastalog = Log('http://localhost:8100/', self.NAME)

        # TODO: check if converting these to shared variable actually improves
        # performance.
        self.input_var = T.ftensor4('inputs')
        self.target_var = T.fvector('targets')

        self.extractor.set_input_var(self.input_var,
                                     batch_size=train_batch_size)
        self.extractor_layer = self.extractor.get_output_layer()

        self.extractor_learning_rate_shared_var = theano.shared(
            np.cast['float32'](extractor_learning_rate),
            name='extractor_learning_rate')
        self.ranker_learning_rate_shared_var = theano.shared(
            np.cast['float32'](ranker_learning_rate),
            name='ranker_learning_rate')

        self.extractor_params = lasagne.layers.get_all_params(
            self.extractor_layer, trainable=True)

        self.absolute_rank_estimate, self.ranker_params = self._create_absolute_rank_estimate(
            self.extractor_layer)
        self.reshaped_input = lasagne.layers.ReshapeLayer(
            self.absolute_rank_estimate, (-1, 2))

        # the posterior estimate layer is not trainable
        self.posterior_estimate = lasagne.layers.DenseLayer(
            self.reshaped_input,
            num_units=1,
            W=lasagne.init.np.array([[1], [-1]]),
            b=lasagne.init.Constant(val=0),
            nonlinearity=lasagne.nonlinearities.sigmoid)
        self.posterior_estimate.params[self.posterior_estimate.W].remove(
            'trainable')
        self.posterior_estimate.params[self.posterior_estimate.b].remove(
            'trainable')

        # the clipping is done to prevent the model from diverging as caused by
        # binary XEnt
        self.predictions = T.clip(
            lasagne.layers.get_output(self.posterior_estimate).ravel(),
            self._epsilon, 1.0 - self._epsilon)

        self.xent_loss = lasagne.objectives.binary_crossentropy(
            self.predictions, self.target_var).mean()
        self.l2_penalty = lasagne.regularization.regularize_network_params(
            self.absolute_rank_estimate, lasagne.regularization.l2)
        self.loss = self.xent_loss + self.l2_penalty * self.weight_decay

        self.test_absolute_rank_estimate = lasagne.layers.get_output(
            self.absolute_rank_estimate, deterministic=True)

        self._create_theano_functions()
Example #35
def mean_absolute_percentage_error_loss(y_pred, y_actual, **kwargs):
    eprint("Use mean absolute percentage error. Ensure no outputs are exactly 0.")
    diff = T.abs_( (y_actual - y_pred) / T.clip(T.abs_(y_actual), epsilon, np.inf))
    return(100. * T.mean(diff, axis=-1))
Example #36
def build_objective2(model, deterministic=False, epsilon=1.e-7):
    predictions = T.flatten(
        nn.layers.get_output(model.l_out, deterministic=deterministic))
    targets = T.flatten(nn.layers.get_output(model.l_target))
    preds = T.clip(predictions, epsilon, 1. - epsilon)
    return T.mean(nn.objectives.binary_crossentropy(preds, targets))
Example #37
def rho(x):
    return tt.clip(x, 0, 1)
Example #38
def categorical_crossentropy(expected, predicted):
    """ Categorical cross-entropy error.
    """
    epsilon = smallest_positive_number()
    predicted = T.clip(predicted, epsilon, 1.0 - epsilon)
    return T.nnet.categorical_crossentropy(predicted, expected).mean()
Example #39
def test_mlp(initial_learning_rate, learning_rate_decay,
             squared_filter_length_limit, n_epochs, batch_size, dropout,
             results_file_name, layer_sizes, dataset, use_bias):
    """
    The dataset is the one from the mlp demo on deeplearning.net.  This training
    function is lifted from there almost exactly.

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


    """
    datasets = load_mnist(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    learning_rate = theano.shared(
        np.asarray(initial_learning_rate, dtype=theano.config.floatX))

    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     layer_sizes=layer_sizes,
                     use_bias=use_bias)

    # Build the expression for the cost function.
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(dropout_cost if dropout else cost, param)
        gparams.append(gparam)

    # ... and allocate memory for the momentum-smoothed versions of the gradient
    gparams_mom = []
    for param in classifier.params:
        gparam_mom = theano.shared(
            np.zeros(param.get_value(borrow=True).shape,
                     dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)

    # Compute momentum for the current epoch
    mom = ifelse(epoch < 500,
                 0.5 * (1. - epoch / 500.) + 0.99 * (epoch / 500.), 0.99)

    # Update the step direction using momentum
    updates = {}
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam

    # ... and take a step along that direction
    for param, gparam_mom in zip(classifier.params, gparams_mom):
        stepped_param = param - (1. - mom) * learning_rate * gparam_mom

        # This is a silly hack to constrain the norms of the rows of the weight
        # matrices.  This just checks if there are two dimensions to the
        # parameter and constrains it if so... maybe this is a bit silly but it
        # should work for now.
        if param.get_value(borrow=True).ndim == 2:
            squared_norms = T.sum(stepped_param**2, axis=1).reshape(
                (stepped_param.shape[0], 1))
            scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms),
                           0., 1.)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    output = dropout_cost if dropout else cost
    train_model = theano.function(
        inputs=[epoch, index],
        outputs=output,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(
        inputs=[],
        outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    best_params = None
    best_validation_errors = np.inf
    best_iter = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()

    results_file = open(results_file_name, 'wb')

    while epoch_counter < n_epochs:
        # Train this epoch
        epoch_counter = epoch_counter + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(epoch_counter, minibatch_index)

        # Compute loss on validation set
        validation_losses = [
            validate_model(i) for i in xrange(n_valid_batches)
        ]
        this_validation_errors = np.sum(validation_losses)

        # Report and save progress.
        print "epoch {}, test error {}, learning_rate={}{}".format(
            epoch_counter, this_validation_errors,
            learning_rate.get_value(borrow=True),
            " **" if this_validation_errors < best_validation_errors else "")

        best_validation_errors = min(best_validation_errors,
                                     this_validation_errors)
        results_file.write("{0}\n".format(this_validation_errors))
        results_file.flush()

        new_learning_rate = decay_learning_rate()

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_errors * 100., best_iter, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
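A NumPy sketch of the row max-norm constraint used in the update loop above (hypothetical helper): rows whose squared norm exceeds the limit are scaled back, and the clip keeps the scale factor at most 1 for the rest.

import numpy as np

def constrain_row_norms(W, squared_limit):
    # Scale down any row whose squared L2 norm exceeds squared_limit;
    # rows already inside the limit get a scale of exactly 1.
    squared_norms = (W ** 2).sum(axis=1, keepdims=True)
    scale = np.clip(np.sqrt(squared_limit / squared_norms), 0.0, 1.0)
    return W * scale

W = np.random.randn(5, 3) * 10
print((constrain_row_norms(W, 4.0) ** 2).sum(axis=1))  # all <= 4.0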
Example #40
def binary_crossentropy(expected, predicted):
    """ Binary cross-entropy error.
    """
    epsilon = smallest_positive_number()
    predicted = T.clip(predicted, epsilon, 1.0 - epsilon)
    return T.nnet.binary_crossentropy(predicted, expected).mean()
Example #41
def build_objective(model, deterministic=False, epsilon=1e-12):
    p = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.flatten(nn.layers.get_output(model.l_target))
    p = T.clip(p, epsilon, 1.-epsilon)
    bce = T.nnet.binary_crossentropy(p, targets)
    return T.mean(bce)
Example #42
def lrelu(x):
    return tensor.clip(tensor.nnet.relu(x, 1. / 3), -3.0, 3.0)
Example #43
def train_rnn():
    global vocab, CNN_FEATURE_SIZE, word_to_index, index_to_word, SEQUENCE_LENGTH, MAX_SENTENCE_LENGTH
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Load the preprocessed dataset containing features extracted by GoogLeNet
    dataset = pickle.load(open('./data/image_caption_with_cnn_features.pkl'))

    # Count word occurrences, keep sufficiently frequent words (v >= 2 below) and construct mapping int <-> word
    allwords = Counter()
    for item in dataset:
        for sentence in item['sentences']:
            allwords.update(sentence['tokens'])

    vocab = [k for k, v in allwords.items() if (v >= 2 and k not in ['best', 'Best', '!', 'ever'])]
    vocab.insert(0, '#START#')
    vocab.append('#END#')

    word_to_index = {w: i for i, w in enumerate(vocab)}
    index_to_word = {i: w for i, w in enumerate(vocab)}

    logging.info('Size of vocabulary: {0}'.format(len(vocab)))

    SEQUENCE_LENGTH = 9
    MAX_SENTENCE_LENGTH = SEQUENCE_LENGTH - 3  # 1 for image, 1 for start token, 1 for end token
    BATCH_SIZE = 75
    CNN_FEATURE_SIZE = 5
    EMBEDDING_SIZE = 1024
    LR = 0.001
    BATCH_SIZE = 75
    ITERATIONS = 19000
    # Configuration 23213: 512:0.001:20000
    # Configuration 23214: 256:0.001:20000
    # Configuration 23216: 512:0.0001:20000
    # Configuration 23217: 1024:0.0001:20000
    # Configuration 23227: 512:0.01:50000
    # Configuration 23229: 512:0.001:50000
    # Configuration 23231: 512:0.001:20000 BS=200
    # Configuration 23232: 1024:0.001:20000 BS=200
    # Configuration 23233: 1024:0.0001:20000 BS=200 SEQ = 32
    # Configuration 23234: 512:0.0001:20000 BS=200 SEQ=16
    # Configuration 23235: 512:0.0001:20000 BS=100 SEQ=8
    # Configuration 23242: 1024:0.0001:20000 BS=200 SEQ=13
    # Configuration 23243: 512:0.0001:20000 BS=200 SEQ=13 v >= 5
    # Configuration 23246: 512:0.0001:75000 BS=100 SEQ=15 removing best ! AND v >= 4
    # Config        23318: 512:0.001:25000 BS=200 SEQ=13 removing best ! AND v >= 3
    # Config        23319: 512:0.0001:25000 BS=100 SEQ=11 removing best ! AND v >= 3
    # Config        23322: 512:0.0001:25000 BS=100 SEQ=11 removing best ! AND v >= 2
    # Config        23323: 1024:0.001:25000 BS=100 SEQ=11 removing best ! AND v >= 2


    logging.info('Embeddings: {0} Learning rate: {1} Iter: {2}'.format(EMBEDDING_SIZE, LR, ITERATIONS))

    # sentence embedding maps integer sequence with dim (BATCH_SIZE, SEQUENCE_LENGTH - 1) to
    # (BATCH_SIZE, SEQUENCE_LENGTH-1, EMBEDDING_SIZE)
    l_input_sentence = lasagne.layers.InputLayer((BATCH_SIZE, SEQUENCE_LENGTH - 1))
    l_sentence_embedding = lasagne.layers.EmbeddingLayer(l_input_sentence,
                                                         input_size=len(vocab),
                                                         output_size=EMBEDDING_SIZE,
                                                         )

    # cnn embedding changes the dimensionality of the representation from CNN_FEATURE_SIZE to EMBEDDING_SIZE,
    # and reshapes to add the time dimension - final dim (BATCH_SIZE, 1, EMBEDDING_SIZE)
    l_input_cnn = lasagne.layers.InputLayer((BATCH_SIZE, CNN_FEATURE_SIZE))
    l_cnn_embedding = lasagne.layers.DenseLayer(l_input_cnn, num_units=EMBEDDING_SIZE,
                                                nonlinearity=lasagne.nonlinearities.identity)

    l_cnn_embedding = lasagne.layers.ReshapeLayer(l_cnn_embedding, ([0], 1, [1]))

    # the two are concatenated to form the RNN input with dim (BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_SIZE)
    l_rnn_input = lasagne.layers.ConcatLayer([l_cnn_embedding, l_sentence_embedding])

    l_dropout_input = lasagne.layers.DropoutLayer(l_rnn_input, p=0.5)
    l_lstm = lasagne.layers.LSTMLayer(l_dropout_input,
                                      num_units=EMBEDDING_SIZE,
                                      unroll_scan=True,
                                      grad_clipping=5.)
    l_dropout_output = lasagne.layers.DropoutLayer(l_lstm, p=0.5)

    # the RNN output is reshaped to combine the batch and time dimensions
    # dim (BATCH_SIZE * SEQUENCE_LENGTH, EMBEDDING_SIZE)
    l_shp = lasagne.layers.ReshapeLayer(l_dropout_output, (-1, EMBEDDING_SIZE))

    # decoder is a fully connected layer with one output unit for each word in the vocabulary
    l_decoder = lasagne.layers.DenseLayer(l_shp, num_units=len(vocab), nonlinearity=lasagne.nonlinearities.softmax)

    # finally, the separation between batch and time dimension is restored
    l_out = lasagne.layers.ReshapeLayer(l_decoder, (BATCH_SIZE, SEQUENCE_LENGTH, len(vocab)))

    # Define symbolic variables for the various inputs
    # cnn feature vector
    x_cnn_sym = T.matrix()

    # sentence encoded as sequence of integer word tokens
    x_sentence_sym = T.imatrix()

    # mask defines which elements of the sequence should be predicted
    mask_sym = T.imatrix()

    # ground truth for the RNN output
    y_sentence_sym = T.imatrix()

    output = lasagne.layers.get_output(l_out, {
        l_input_sentence: x_sentence_sym,
        l_input_cnn: x_cnn_sym
    })


    loss = T.mean(calc_cross_ent(output, mask_sym, y_sentence_sym))

    MAX_GRAD_NORM = 15

    all_params = lasagne.layers.get_all_params(l_out, trainable=True)

    all_grads = T.grad(loss, all_params)
    all_grads = [T.clip(g, -5, 5) for g in all_grads]
    all_grads, norm = lasagne.updates.total_norm_constraint(
        all_grads, MAX_GRAD_NORM, return_norm=True)

    updates = lasagne.updates.adam(all_grads, all_params, learning_rate=LR)

    f_train = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym],
                              [loss, norm],
                              updates=updates
                              )

    f_val = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym], loss)

    for iteration in range(ITERATIONS):
        x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(get_data_batch(dataset, BATCH_SIZE))
        loss_train, norm = f_train(x_cnn, x_sentence, mask, y_sentence)
        if not iteration % 250:
            logging.info('Iteration {} loss_train: {} norm: {}'.format(iteration, loss_train, norm))
            try:
                batch = get_data_batch(dataset, BATCH_SIZE, split='val')
                x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(batch)
                loss_val = f_val(x_cnn, x_sentence, mask, y_sentence)
                logging.info('Val loss: {}'.format(loss_val))
            except IndexError:
                continue

    param_values = lasagne.layers.get_all_param_values(l_out)
    d = {'param values': param_values,
         'vocab': vocab,
         'word_to_index': word_to_index,
         'index_to_word': index_to_word,
         }
    with open('./data/trained_lstm.pkl', 'wb') as f:
        pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
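For reference, a hedged sketch of how the pickled dictionary written above could be restored later; it assumes the same `l_out` architecture has already been rebuilt (e.g. by re-running the model-construction part of train_rnn) before the stored values are pushed back in:

import pickle
import lasagne

with open('./data/trained_lstm.pkl', 'rb') as f:
    d = pickle.load(f)

vocab = d['vocab']
word_to_index = d['word_to_index']
index_to_word = d['index_to_word']

# `l_out` must have the same layer structure as in train_rnn() for this to work.
lasagne.layers.set_all_param_values(l_out, d['param values'])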
Example #44
0
    def __init__(self,
                 arch=None,
                 lbda=1,
                 perdatapoint=False,
                 srng=RandomStreams(seed=427),
                 prior=log_normal,
                 opt='adam',
                 coupling=4,
                 coupling_dim=200,
                 pad='same',
                 stride=2,
                 pool=None,
                 uncoupled_init=0,
                 convex_combination=0):

        if arch == 'Riashat':
            kernel_width = 3
            self.kernel_width = kernel_width
            stride = 1
            self.stride = stride
            pad = 'valid'
            self.pad = pad
            self.weight_shapes = [
                (32, 1, kernel_width, kernel_width),  # -> (None, 16, 14, 14)
                (32, 32, kernel_width, kernel_width)
            ]  # -> (None, 16,  7,  7)
            self.args = [[32, kernel_width, stride, pad, rectify, 'none'],
                         [32, kernel_width, stride, pad, rectify, 'max']]
            self.pool_size = 5
        else:
            self.pool_size = 2

        self.n_kernels = np.array(self.weight_shapes)[:, 1].sum()
        self.kernel_shape = self.weight_shapes[0][:1] + self.weight_shapes[0][
            2:]
        print "kernel_shape", self.kernel_shape
        self.kernel_size = np.prod(self.weight_shapes[0])

        self.num_classes = 10
        if arch == 'Riashat':
            self.num_hids = 256
        else:
            self.num_hids = 128
        self.num_mlp_layers = 1
        self.num_mlp_params = self.num_classes + \
                              self.num_hids * self.num_mlp_layers
        self.num_cnn_params = np.sum(np.array(self.weight_shapes)[:, 0])
        self.num_params = self.num_mlp_params + self.num_cnn_params
        self.coupling = coupling
        self.extra_l2 = 0
        self.convex_combination = convex_combination

        #def __init__(self,

        self.lbda = lbda
        self.perdatapoint = perdatapoint
        self.srng = srng
        self.prior = prior
        self.__dict__.update(locals())

        if perdatapoint:
            self.wd1 = self.input_var.shape[0]
        else:
            self.wd1 = 1

    #def _get_theano_variables(self):
        self.input_var = T.matrix('input_var')
        self.input_var = T.tensor4('input_var')  # <-- for CNN
        self.target_var = T.matrix('target_var')
        self.dataset_size = T.scalar('dataset_size')
        self.learning_rate = T.scalar('learning_rate')

        #def _get_hyper_net(self):
        # initial random noise
        print self.num_params
        ep = self.srng.normal(size=(self.wd1, self.num_params), dtype=floatX)
        logdets_layers = []
        h_net = lasagne.layers.InputLayer([None, self.num_params])

        # mean and variation of the initial noise
        layer_temp = LinearFlowLayer(h_net)
        h_net = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        if self.coupling:
            layer_temp = CoupledWNDenseLayer(h_net,
                                             coupling_dim,
                                             uncoupled_init=uncoupled_init)
            h_net = IndexLayer(layer_temp, 0)
            logdets_layers.append(IndexLayer(layer_temp, 1))

            for c in range(self.coupling - 1):
                h_net = PermuteLayer(h_net, self.num_params)

                layer_temp = CoupledWNDenseLayer(h_net,
                                                 coupling_dim,
                                                 uncoupled_init=uncoupled_init)
                h_net = IndexLayer(layer_temp, 0)
                logdets_layers.append(IndexLayer(layer_temp, 1))

        if self.convex_combination:
            layer_temp = ConvexBiasLayer(
                h_net, upweight_primary=self.convex_combination)
            h_net = IndexLayer(layer_temp, 0)
            logdets_layers.append(IndexLayer(layer_temp, 1))

        self.h_net = h_net
        self.weights = lasagne.layers.get_output(h_net, ep)
        self.logdets = sum([get_output(ld, ep) for ld in logdets_layers])

        #def _get_primary_net(self):

        t = np.cast['int32'](0)
        if 1:  #self.dataset == 'mnist':
            p_net = lasagne.layers.InputLayer([None, 1, 28, 28])
        print p_net.output_shape
        inputs = {p_net: self.input_var}

        #logpw = np.float32(0.)

        for ws, args in zip(self.weight_shapes, self.args):

            num_filters = ws[0]

            # TO-DO: generalize to have multiple samples?
            weight = self.weights[0, t:t + num_filters].dimshuffle(
                0, 'x', 'x', 'x')

            num_filters = args[0]
            filter_size = args[1]
            stride = args[2]
            pad = args[3]
            nonl = args[4]
            p_net = lasagne.layers.Conv2DLayer(p_net,
                                               num_filters,
                                               filter_size,
                                               stride,
                                               pad,
                                               nonlinearity=nonl)
            p_net = stochastic_weight_norm(p_net, weight)

            if args[5] == 'max':
                p_net = lasagne.layers.MaxPool2DLayer(p_net, self.pool_size)
            #print p_net.output_shape
            t += num_filters

        for layer in range(self.num_mlp_layers):
            weight = self.weights[:, t:t + self.num_hids].reshape(
                (self.wd1, self.num_hids))
            p_net = lasagne.layers.DenseLayer(p_net,
                                              self.num_hids,
                                              nonlinearity=rectify)
            p_net = stochastic_weight_norm(p_net, weight)
            if self.extra_l2:
                self.l2_penalty = lasagne.regularization.regularize_layer_params_weighted(
                    {p_net: 3.5 / 128}, lasagne.regularization.l2)
            t += self.num_hids

        weight = self.weights[:, t:t + self.num_classes].reshape(
            (self.wd1, self.num_classes))

        p_net = lasagne.layers.DenseLayer(p_net,
                                          self.num_classes,
                                          nonlinearity=nonlinearities.softmax)
        p_net = stochastic_weight_norm(p_net, weight)

        y = T.clip(get_output(p_net, inputs), 0.001, 0.999)  # stability

        self.p_net = p_net
        self.y = y

        #def _get_params(self):

        params = lasagne.layers.get_all_params([self.h_net, self.p_net])
        self.params = list()
        for param in params:
            if type(param) is not RSSV:
                self.params.append(param)

        params0 = lasagne.layers.get_all_param_values([self.h_net, self.p_net])
        params = lasagne.layers.get_all_params([self.h_net, self.p_net])
        updates = {p: p0 for p, p0 in zip(params, params0)}
        self.reset = theano.function([], None, updates=updates)
        self.add_reset('init')

        #def _get_elbo(self):

        logdets = self.logdets
        self.logqw = -logdets
        self.logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
        self.kl = (self.logqw - self.logpw).mean()
        self.kl_term = self.kl / T.cast(self.dataset_size, floatX)
        self.logpyx = -cc(self.y, self.target_var).mean()
        self.loss = -self.logpyx + self.kl_term

        # DK - extra monitoring (TODO)
        params = self.params
        ds = self.dataset_size
        self.logpyx_grad = flatten_list(
            T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
        self.logpw_grad = flatten_list(
            T.grad(-self.logpw.mean() / ds, params,
                   disconnected_inputs='warn')).norm(2)
        self.logqw_grad = flatten_list(
            T.grad(self.logqw.mean() / ds, params,
                   disconnected_inputs='warn')).norm(2)
        self.monitored = [
            self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
            self.logpw_grad, self.logqw_grad
        ]

        #def _get_grads(self):
        grads = T.grad(self.loss, self.params)
        mgrads = lasagne.updates.total_norm_constraint(grads,
                                                       max_norm=self.max_norm)
        cgrads = [T.clip(g, -self.clip_grad, self.clip_grad) for g in mgrads]
        if self.opt == 'adam':
            self.updates = lasagne.updates.adam(
                cgrads, self.params, learning_rate=self.learning_rate)
        elif self.opt == 'momentum':
            self.updates = lasagne.updates.nesterov_momentum(
                cgrads, self.params, learning_rate=self.learning_rate)
        elif self.opt == 'sgd':
            self.updates = lasagne.updates.sgd(
                cgrads, self.params, learning_rate=self.learning_rate)

    #def _get_train_func(self):
        train = theano.function([
            self.input_var, self.target_var, self.dataset_size,
            self.learning_rate
        ],
                                self.loss,
                                updates=self.updates)
        self.train_func = train
        # DK - putting this here, because it doesn't get overwritten by subclasses
        self.monitor_func = theano.function([
            self.input_var, self.target_var, self.dataset_size,
            self.learning_rate
        ],
                                            self.monitored,
                                            on_unused_input='warn')

        #def _get_useful_funcs(self):
        self.predict_proba = theano.function([self.input_var], self.y)
        self.predict = theano.function([self.input_var], self.y.argmax(1))
Example #45
0
def kullback_leibler_divergence_loss(y_pred, y_actual, **kwargs):
    y_actual = T.clip(y_actual, epsilon, 1)
    y_pred = T.clip(y_pred, epsilon, 1)
    return T.sum(y_actual * T.log(y_actual / y_pred), axis=-1)
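# A minimal sketch (not from the original source) that compiles the KL loss
# above and checks that the clipping keeps it finite when a target probability
# is exactly zero; `epsilon` is assumed to be a small module-level constant.
import numpy as np
import theano
import theano.tensor as T

epsilon = 1e-7  # assumed value of the module-level constant used above

_y_pred = T.matrix('y_pred')
_y_actual = T.matrix('y_actual')
_kl_fn = theano.function([_y_pred, _y_actual],
                         kullback_leibler_divergence_loss(_y_pred, _y_actual),
                         allow_input_downcast=True)

# A zero entry in y_actual would otherwise contribute 0 * log(0) = nan.
print(_kl_fn(np.array([[0.5, 0.5]]), np.array([[1.0, 0.0]])))
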
    def parameters_updates(self, LR):

        updates = []

        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-8
        alpha = 0.05

        t = self.n_samples + 1
        a_t = LR * T.sqrt(1 - beta2**t) / (1 - beta1**t)

        #updates.append((self.Wba, self.Wba))

        m_t_Wa = beta1 * self.m_Wa + (1 - beta1) * self.dEdWa
        v_t_Wa = beta2 * self.v_Wa + (1 - beta2) * self.dEdWa**2
        step_Wa = a_t * m_t_Wa / (T.sqrt(v_t_Wa) + epsilon)

        if self.binary_training == True:
            step_Wa = T.clip(step_Wa, -self.W0_a, self.W0_a)

        updates.append((self.m_Wa, m_t_Wa))
        updates.append((self.v_Wa, v_t_Wa))
        updates.append((self.Wa, self.Wa - step_Wa))

        m_t_Wx = beta1 * self.m_Wx + (1 - beta1) * self.dEdWx
        v_t_Wx = beta2 * self.v_Wx + (1 - beta2) * self.dEdWx**2
        step_Wx = a_t * m_t_Wx / (T.sqrt(v_t_Wx) + epsilon)

        if self.binary_training == True:
            step_Wx = T.clip(step_Wx, -self.W0_x, self.W0_x)

        updates.append((self.m_Wx, m_t_Wx))
        updates.append((self.v_Wx, v_t_Wx))
        updates.append((self.Wx, self.Wx - step_Wx))

        if self.BN == True:
            m_t_bn_a_beta = beta1 * self.m_bn_a_beta + (
                1 - beta1) * self.dEdbn_a_beta
            v_t_bn_a_beta = beta2 * self.v_bn_a_beta + (
                1 - beta2) * self.dEdbn_a_beta**2
            step_bn_a_beta = a_t * m_t_bn_a_beta / (T.sqrt(v_t_bn_a_beta) +
                                                    epsilon)
            updates.append((self.m_bn_a_beta, m_t_bn_a_beta))
            updates.append((self.v_bn_a_beta, v_t_bn_a_beta))
            updates.append((self.bn_a_beta, self.bn_a_beta - step_bn_a_beta))

            m_t_bn_a_gamma = beta1 * self.m_bn_a_gamma + (
                1 - beta1) * self.dEdbn_a_gamma
            v_t_bn_a_gamma = beta2 * self.v_bn_a_gamma + (
                1 - beta2) * self.dEdbn_a_gamma**2
            step_bn_a_gamma = a_t * m_t_bn_a_gamma / (T.sqrt(v_t_bn_a_gamma) +
                                                      epsilon)
            updates.append((self.m_bn_a_gamma, m_t_bn_a_gamma))
            updates.append((self.v_bn_a_gamma, v_t_bn_a_gamma))
            updates.append(
                (self.bn_a_gamma, self.bn_a_gamma - step_bn_a_gamma))

            m_t_bn_b_gamma = beta1 * self.m_bn_b_gamma + (
                1 - beta1) * self.dEdbn_b_gamma
            v_t_bn_b_gamma = beta2 * self.v_bn_b_gamma + (
                1 - beta2) * self.dEdbn_b_gamma**2
            step_bn_b_gamma = a_t * m_t_bn_b_gamma / (T.sqrt(v_t_bn_b_gamma) +
                                                      epsilon)
            updates.append((self.m_bn_b_gamma, m_t_bn_b_gamma))
            updates.append((self.v_bn_b_gamma, v_t_bn_b_gamma))
            updates.append(
                (self.bn_b_gamma, self.bn_b_gamma - step_bn_b_gamma))

            m_t_bn_c_beta = beta1 * self.m_bn_c_beta + (
                1 - beta1) * self.dEdbn_c_beta
            v_t_bn_c_beta = beta2 * self.v_bn_c_beta + (
                1 - beta2) * self.dEdbn_c_beta**2
            step_bn_c_beta = a_t * m_t_bn_c_beta / (T.sqrt(v_t_bn_c_beta) +
                                                    epsilon)
            updates.append((self.m_bn_c_beta, m_t_bn_c_beta))
            updates.append((self.v_bn_c_beta, v_t_bn_c_beta))
            updates.append((self.bn_c_beta, self.bn_c_beta - step_bn_c_beta))

            m_t_bn_c_gamma = beta1 * self.m_bn_c_gamma + (
                1 - beta1) * self.dEdbn_c_gamma
            v_t_bn_c_gamma = beta2 * self.v_bn_c_gamma + (
                1 - beta2) * self.dEdbn_c_gamma**2
            step_bn_c_gamma = a_t * m_t_bn_c_gamma / (T.sqrt(v_t_bn_c_gamma) +
                                                      epsilon)
            updates.append((self.m_bn_c_gamma, m_t_bn_c_gamma))
            updates.append((self.v_bn_c_gamma, v_t_bn_c_gamma))
            updates.append(
                (self.bn_c_gamma, self.bn_c_gamma - step_bn_c_gamma))

            # very slightly biased variance estimation
            new_bn_a_mean = (1 - alpha) * self.bn_a_mean + alpha * self.a_mean
            new_bn_a_var = (1 - alpha) * self.bn_a_var + alpha * self.a_var

            new_bn_b_mean = (1 - alpha) * self.bn_b_mean + alpha * self.b_mean
            new_bn_b_var = (1 - alpha) * self.bn_b_var + alpha * self.b_var

            new_bn_c_mean = (1 - alpha) * self.bn_c_mean + alpha * self.c_mean
            new_bn_c_var = (1 - alpha) * self.bn_c_var + alpha * self.c_var

            updates.append((self.bn_a_mean, new_bn_a_mean))
            updates.append((self.bn_a_var, new_bn_a_var))

            updates.append((self.bn_b_mean, new_bn_b_mean))
            updates.append((self.bn_b_var, new_bn_b_var))

            updates.append((self.bn_c_mean, new_bn_c_mean))
            updates.append((self.bn_c_var, new_bn_c_var))

        else:
            m_t_bn_a_beta = beta1 * self.m_bn_a_beta + (
                1 - beta1) * self.dEdbn_a_beta
            v_t_bn_a_beta = beta2 * self.v_bn_a_beta + (
                1 - beta2) * self.dEdbn_a_beta**2
            step_bn_a_beta = a_t * m_t_bn_a_beta / (T.sqrt(v_t_bn_a_beta) +
                                                    epsilon)
            updates.append((self.m_bn_a_beta, m_t_bn_a_beta))
            updates.append((self.v_bn_a_beta, v_t_bn_a_beta))
            updates.append((self.bn_a_beta, self.bn_a_beta - step_bn_a_beta))

            m_t_bn_c_beta = beta1 * self.m_bn_c_beta + (
                1 - beta1) * self.dEdbn_c_beta
            v_t_bn_c_beta = beta2 * self.v_bn_c_beta + (
                1 - beta2) * self.dEdbn_c_beta**2
            step_bn_c_beta = a_t * m_t_bn_c_beta / (T.sqrt(v_t_bn_c_beta) +
                                                    epsilon)
            updates.append((self.m_bn_c_beta, m_t_bn_c_beta))
            updates.append((self.v_bn_c_beta, v_t_bn_c_beta))
            updates.append((self.bn_c_beta, self.bn_c_beta - step_bn_c_beta))

        m_t_h0 = beta1 * self.m_h0 + (1 - beta1) * self.dEdh0
        v_t_h0 = beta2 * self.v_h0 + (1 - beta2) * self.dEdh0**2
        step_h0 = a_t * m_t_h0 / (T.sqrt(v_t_h0) + epsilon)
        updates.append((self.m_h0, m_t_h0))
        updates.append((self.v_h0, v_t_h0))
        updates.append((self.h0, self.h0 - step_h0))

        m_t_c0 = beta1 * self.m_c0 + (1 - beta1) * self.dEdc0
        v_t_c0 = beta2 * self.v_c0 + (1 - beta2) * self.dEdc0**2
        step_c0 = a_t * m_t_c0 / (T.sqrt(v_t_c0) + epsilon)
        updates.append((self.m_c0, m_t_c0))
        updates.append((self.v_c0, v_t_c0))
        updates.append((self.c0, self.c0 - step_c0))

        updates.append((self.n_samples, t))

        return updates
def clip(gradient, bound):
    assert bound > 0
    return T.clip(gradient, -bound, bound)
 def hard_sigm(self, x):
     return T.clip((x + 1) / 2, 0, 1)
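A small self-contained sketch (not from the original code) of the `clip` helper in action: applied element-wise to a gradient expression, it bounds each component to [-bound, bound] before it reaches the update rule.

import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.array([3., -3.], dtype=theano.config.floatX), name='w')
cost = T.sum(w ** 2)
g = clip(T.grad(cost, w), 1.0)  # raw gradients are [6, -6]; clipped to [1, -1]

f = theano.function([], g)
print(f())  # -> [ 1., -1.]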
Example #49
0
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W_temp = T.grad(cost=cost, wrt=classifier.W)
    g_b_temp = T.grad(cost=cost, wrt=classifier.b)
    # g_norm = T._tensor_py_operators.norm(g_W_temp, 4)
    g_W_clip = T.clip(g_W_temp, -2, 2)
    g_b_clip = T.clip(g_b_temp, -2, 2)
    # b_norm = T._tensor_py_operators.norm(g_b, 4)
    # g_b = T.clip(g_b, 0, .5)
    srng = RandomStreams(seed=234)
    rv_n_w = srng.normal(g_W_temp.shape, avg=0.0, std=0.28)
    rv_n_b = srng.normal(g_b_temp.shape, avg=0.0, std=0.28)
    g_W = g_W_clip + rv_n_w
    g_b = g_b_clip + rv_n_b

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # this theano function returns training error for each minibatch
    train_model_loss = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    validation_records = []
    training_records = []
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                # compute zero-one loss on training set
                training_losses = [
                    train_model_loss(i) for i in range(n_train_batches)
                ]
                this_training_loss = numpy.mean(training_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%, training error %f %%'
                    % (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100., this_training_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break
        validation_records.append(this_validation_loss * 100)
        training_records.append(this_training_loss * 100)

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' %
         ((end_time - start_time))),
        file=sys.stderr)
    return validation_records, training_records, test_score * 100
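A hedged sketch of how the pickled best_model.pkl written above could be reused for prediction; it assumes, as in the standard Theano tutorial, that LogisticRegression stores its symbolic input as `classifier.input` and its predicted labels as `classifier.y_pred`, and that `load_data` is importable from this module:

import pickle
import theano

def predict(dataset='mnist.pkl.gz', n_examples=10):
    """Load the classifier saved by sgd_optimization_mnist and label a few test images."""
    with open('best_model.pkl', 'rb') as f:
        classifier = pickle.load(f)

    # compile a predictor from the stored symbolic graph
    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)

    test_set_x, _ = load_data(dataset)[2]
    predicted_values = predict_model(test_set_x.get_value()[:n_examples])
    print('Predicted labels for the first %i test examples:' % n_examples)
    print(predicted_values)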
Example #50
0
def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
                target_acceptance_rate, stepsize_inc, stepsize_dec,
                stepsize_min, stepsize_max, avg_acceptance_slowness):
    """This function is executed after `n_steps` of HMC sampling
    (`hmc_move` function). It creates the updates dictionary used by
    the `simulate` function. It takes care of updating: the position
    (if the move is accepted), the stepsize (to track a given target
    acceptance rate) and the average acceptance rate (computed as a
    moving average).

    Parameters
    ----------
    positions: shared variable, theano matrix
        Shared theano matrix whose rows contain the old position
    stepsize: shared variable, theano scalar
        Shared theano scalar containing current step size
    avg_acceptance_rate: shared variable, theano scalar
        Shared theano scalar containing the current average acceptance rate
    final_pos: shared variable, theano matrix
        Shared theano matrix whose rows contain the new position
    accept: theano scalar
        Boolean-type variable representing whether or not the proposed HMC move
        should be accepted or not.
    target_acceptance_rate: float
        The stepsize is modified in order to track this target acceptance rate.
    stepsize_inc: float
        Amount by which to increment stepsize when acceptance rate is too high.
    stepsize_dec: float
        Amount by which to decrement stepsize when acceptance rate is too low.
    stepsize_min: float
        Lower-bound on `stepsize`.
    stepsize_max: float
        Upper-bound on `stepsize`.
    avg_acceptance_slowness: float
        Average acceptance rate is computed as an exponential moving average.
        (1-avg_acceptance_slowness) is the weight given to the newest
        observation.

    Returns
    -------
    rval1: dictionary-like
        A dictionary of updates to be used by the `HMC_Sampler.simulate`
        function.  The updates target the position, stepsize and average
        acceptance rate.

    """

    ## POSITION UPDATES ##
    # broadcast `accept` scalar to tensor with the same dimensions as
    # final_pos.
    accept_matrix = accept.dimshuffle(0, *(('x', ) * (final_pos.ndim - 1)))
    # if accept is True, update to `final_pos` else stay put
    new_positions = TT.switch(accept_matrix, final_pos, positions)
    # end-snippet-5 start-snippet-7
    ## STEPSIZE UPDATES ##
    # if acceptance rate is too low, our sampler is too "noisy" and we reduce
    # the stepsize. If it is too high, our sampler is too conservative, we can
    # get away with a larger stepsize (resulting in better mixing).
    _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
                              stepsize * stepsize_inc, stepsize * stepsize_dec)
    # maintain stepsize in [stepsize_min, stepsize_max]

    new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)
    # new_stepsize=stepsize # TODO remove for adaptive step sizes

    # end-snippet-7 start-snippet-6
    ## ACCEPT RATE UPDATES ##
    # perform exponential moving average
    mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
    new_acceptance_rate = TT.add(avg_acceptance_slowness * avg_acceptance_rate,
                                 (1.0 - avg_acceptance_slowness) *
                                 accept.mean(dtype=mean_dtype))
    # end-snippet-6 start-snippet-8
    return [(positions, new_positions), (stepsize, new_stepsize),
            (avg_acceptance_rate, new_acceptance_rate)]
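A minimal, self-contained sketch (not from the tutorial itself, and written for the default float64 floatX; a float32 setup would want the constants cast) showing how the returned update list is typically consumed by a simulate-style function; `final_pos` and `accept` below are illustrative stand-ins for the outputs of an `hmc_move` step:

import numpy as np
import theano
import theano.tensor as TT

rng = np.random.RandomState(0)
positions = theano.shared(rng.randn(3, 2).astype(theano.config.floatX))
stepsize = theano.shared(np.asarray(0.01, dtype=theano.config.floatX))
avg_acceptance_rate = theano.shared(np.asarray(0.9, dtype=theano.config.floatX))

final_pos = positions + 0.1                  # stand-in proposal
accept = TT.ones((positions.shape[0],)) > 0  # always accept, for illustration

simulate = theano.function(
    [], [],
    updates=hmc_updates(positions, stepsize, avg_acceptance_rate,
                        final_pos=final_pos, accept=accept,
                        target_acceptance_rate=0.9,
                        stepsize_inc=1.02, stepsize_dec=0.98,
                        stepsize_min=1e-4, stepsize_max=0.5,
                        avg_acceptance_slowness=0.9))

simulate()
print(stepsize.get_value())  # adapted each call, but kept inside [1e-4, 0.5]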
Example #51
0
def _interpolate(im, x, y, out_height, out_width, border_mode):
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
    # we need those in floatX for interpolation and in int64 for indexing.
    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1

    # for indexing, we need to take care of the border mode for outside pixels.
    if border_mode == 'nearest':
        x0 = T.clip(x0_f, 0, width_f - 1)
        x1 = T.clip(x1_f, 0, width_f - 1)
        y0 = T.clip(y0_f, 0, height_f - 1)
        y1 = T.clip(y1_f, 0, height_f - 1)
    elif border_mode == 'mirror':
        w = 2 * (width_f - 1)
        x0 = T.minimum(x0_f % w, -x0_f % w)
        x1 = T.minimum(x1_f % w, -x1_f % w)
        h = 2 * (height_f - 1)
        y0 = T.minimum(y0_f % h, -y0_f % h)
        y1 = T.minimum(y1_f % h, -y1_f % h)
    elif border_mode == 'wrap':
        x0 = T.mod(x0_f, width_f)
        x1 = T.mod(x1_f, width_f)
        y0 = T.mod(y0_f, height_f)
        y1 = T.mod(y1_f, height_f)
    else:
        raise ValueError("border_mode must be one of "
                         "'nearest', 'mirror', 'wrap'")
    x0, x1, y0, y1 = (T.cast(v, 'int64') for v in (x0, x1, y0, y1))

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width*height
    base = T.repeat(
        T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
    base_y0 = base + y0*dim2
    base_y1 = base + y1*dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a]
    Ib = im_flat[idx_b]
    Ic = im_flat[idx_c]
    Id = im_flat[idx_d]

    # calculate interpolated values
    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')
    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')
    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')
    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
    return output
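For intuition, a small sketch (separate from the function above) of what the three border modes do to an out-of-range integer coordinate, here with an image width of 5 (valid indices 0..4):

import theano
import theano.tensor as T

coord = T.vector('coord')   # a floor()ed coordinate, possibly out of range
width_f = 5.0

nearest = T.clip(coord, 0, width_f - 1)
mirror = T.minimum(coord % (2 * (width_f - 1)), -coord % (2 * (width_f - 1)))
wrap = T.mod(coord, width_f)

f = theano.function([coord], [nearest, mirror, wrap], allow_input_downcast=True)
print(f([-2., 1., 6.]))
# nearest -> [0, 1, 4], mirror -> [2, 1, 2], wrap -> [3, 1, 1]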
Example #52
0
 def forward(self, inputtensor):
     x = inputtensor[0]
     x = T.clip(x, -1, 1)
     return (binaryOp(x),)
 def clipped_v(self, x):
     return T.clip(T.abs_(x), 0, 1)
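As an illustration (the real `binaryOp` is not shown in this excerpt; a plain sign function is used here as a stand-in, ignoring any straight-through gradient trick), the clip bounds activations to [-1, 1] before binarization:

import theano
import theano.tensor as T

x = T.vector('x')
binarized = T.sgn(T.clip(x, -1, 1))  # stand-in for binaryOp(T.clip(x, -1, 1))
f = theano.function([x], binarized, allow_input_downcast=True)
print(f([-5., -0.3, 0.7, 5.]))  # -> [-1., -1., 1., 1.]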
Example #54
0
        Prob1 = T.tanh(th * Prob1)  # the nonlinear tanh() accelerates the state transfer

        delta_W2 = updates[param] - param
        delta_W2_direction = T.cast(T.sgn(delta_W2), theano.config.floatX)
        dis2 = T.abs_(delta_W2)  # the absolute distance
        k2 = delta_W2_direction * T.floor(dis2 / L)  # the integer part
        v2 = delta_W2 - k2 * L  # the decimal part
        Prob2 = T.abs_(v2 / L)  # the transfer probability
        Prob2 = T.tanh(th * Prob2)  # the nonlinear tanh() accelerates the state transfer
        
        srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
        Gate1 = T.cast(srng.binomial(n=1, p=Prob1, size=T.shape(Prob1)), theano.config.floatX) # Gate1 is a binary variable with probability of Prob1 to be 1
        Gate2 = T.cast(srng.binomial(n=1, p=Prob2, size=T.shape(Prob2)), theano.config.floatX) # Gate2 is a binary variable with probability of Prob2 to be 1

        delta_W1_new=(k1+delta_W1_direction*Gate1)*L #delta_W1_new = k*L where k is an integer   
        updates_param1 = T.clip(parambest + delta_W1_new,-H,H)
        updates_param1 = weight_tune(updates_param1, -H, H)  # fine-tuning to keep every element strictly inside the discrete space

        delta_W2_new=(k2+delta_W2_direction*Gate2)*L #delta_W2_new = k*L where k is an integer  
        updates_param2 = T.clip(param + delta_W2_new,-H,H)
        updates_param2 = weight_tune(updates_param2, -H, H)  # fine-tuning to keep every element strictly inside the discrete space

        # if update_type < 100, the weight probabilistically transfers from parambest to state_rand,
        # which helps to search for the global minimum;
        # else it probabilistically transfers from param to a state nearest to updates[param]
        updates[param] = T.switch(T.lt(update_type, 100), updates_param1, updates_param2)
    
    return updates


def train(  network,
            train_fn,val_fn,
Example #55
0
 def cross_entropy_binary(self, y):
     output = T.clip(self.p_y_given_x, 1e-7, 1 - (1e-7))
     return T.sum(binary_crossentropy(output, y), axis=1)
Example #56
0
#updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE)

###########
all_params = lib.get_params(cost,
                            lambda x: hasattr(x, 'param') and x.param == True)
ip_params = lib.get_params(ip_cost, lambda x: hasattr(x, 'param') and x.param==True\
    and 'BigFrameLevel' in x.name)
other_params = [p for p in all_params if p not in ip_params]
all_params = ip_params + other_params
lib.print_params_info(ip_params, path=FOLDER_PREFIX)
lib.print_params_info(other_params, path=FOLDER_PREFIX)
lib.print_params_info(all_params, path=FOLDER_PREFIX)

ip_grads = T.grad(ip_cost, wrt=ip_params, disconnected_inputs='warn')
ip_grads = [
    T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in ip_grads
]

other_grads = T.grad(cost, wrt=other_params, disconnected_inputs='warn')
other_grads = [
    T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP))
    for g in other_grads
]

grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn')
grads = [
    T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads
]

ip_updates = lasagne.updates.adam(ip_grads, ip_params)
other_updates = lasagne.updates.adam(other_grads, other_params)
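A self-contained miniature of the same pattern (names here are illustrative, not from this script): per-element gradient clipping with T.clip followed by Adam updates on the clipped gradients.

import numpy as np
import theano
import theano.tensor as T
import lasagne

GRAD_CLIP = 1.0
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
x = T.vector('x')
cost = T.sum((x - w) ** 2)

grads = T.grad(cost, wrt=[w])
grads = [T.clip(g, -GRAD_CLIP, GRAD_CLIP) for g in grads]
updates = lasagne.updates.adam(grads, [w], learning_rate=1e-2)

train_fn = theano.function([x], cost, updates=updates, allow_input_downcast=True)
print(train_fn([10., -10., 10.]))  # one clipped-gradient Adam step on w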
Example #57
0
    l_output = ReshapeLayer(l_output_dense, (batch_size, seqlen, size + 2))

    return l_output, l_ntm


if __name__ == '__main__':
    # Define the input and expected output variable
    input_var, target_var = T.tensor3s('input', 'target')
    # The generator to sample examples from
    generator = RepeatCopyTask(batch_size=1, max_iter=1000000, size=8, min_length=3, \
        max_length=5, max_repeats=5, unary=True, end_marker=True)
    # The model (1-layer Neural Turing Machine)
    l_output, l_ntm = model(input_var, batch_size=generator.batch_size,
        size=generator.size, num_units=100, memory_shape=(128, 20))
    # The generated output variable and the loss function
    pred_var = T.clip(lasagne.layers.get_output(l_output), 1e-6, 1. - 1e-6)
    loss = T.mean(lasagne.objectives.binary_crossentropy(pred_var, target_var))
    # Create the update expressions
    params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=5e-4)
    # Compile the function for a training step, as well as the prediction function and
    # a utility function to get the inner details of the NTM
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    ntm_fn = theano.function([input_var], pred_var)
    ntm_layer_fn = theano.function([input_var], lasagne.layers.get_output(l_ntm, get_details=True))

    # Training
    try:
        scores, all_scores = [], []
        for i, (example_input, example_output) in generator:
            score = train_fn(example_input, example_output)
Example #58
0
    def __init__(self,
                 learning_rate,
                 drop_out,
                 Layers,
                 N_hidden,
                 D_input,
                 D_out,
                 Task_type='regression',
                 L2_lambda=0.0,
                 _EPSILON=1e-12,
                 fixlayer=[],
                 mid_target='0'):
        #------variables------
        #label
        self.hard_target = T.matrix('hard_target')
        #input layer
        self.l_in = lasagne.layers.InputLayer(shape=(None, D_input))
        #last hidden layer
        self.l_hid = self.l_in
        #stack hidden layers
        #l2 regularization
        self.l2_penalty = 0
        self.lr = theano.shared(
            np.array(learning_rate, dtype=theano.config.floatX))
        for i in range(Layers):
            self.l_hid = lasagne.layers.DenseLayer(
                self.l_hid,
                num_units=N_hidden,
                W=lasagne.init.HeUniform(gain='relu'),
                b=lasagne.init.Constant(0.001),
                nonlinearity=lasagne.nonlinearities.rectify)
            print('Add Dense layer')
            self.l2_penalty += lasagne.regularization.regularize_layer_params(
                self.l_hid, l2) * L2_lambda
            self.l_hid = lasagne.layers.dropout(self.l_hid, drop_out)
            print('Add Dropout layer')
        #out_layer
        if mid_target == "mid_target":
            self.l_out = lasagne.layers.DenseLayer(
                self.l_hid,
                num_units=D_out,
                nonlinearity=lasagne.nonlinearities.rectify)
            print('relu out')
        else:
            self.l_out = lasagne.layers.DenseLayer(
                self.l_hid,
                num_units=D_out,
                nonlinearity=lasagne.nonlinearities.linear)
            print('linear out')
        #select weights not to be updated
        d = 1  # how many layers have been deleted so far
        self.all_params = lasagne.layers.get_all_params(self.l_out)
        self.get_weights = lasagne.layers.get_all_param_values(self.l_out)
        for f in fixlayer:
            del self.all_params[(f - d) * 2]
            del self.all_params[(f - d) * 2]
            d += 1
        #------training function------
        #output of net for train / eval
        self.l_out_train = lasagne.layers.get_output(self.l_out,
                                                     deterministic=False)
        self.l_out_eval = lasagne.layers.get_output(self.l_out,
                                                    deterministic=True)
        if Task_type != 'regression':
            self.l_out_train = T.exp(self.l_out_train) / T.sum(
                T.exp(self.l_out_train), axis=1, keepdims=True)
            self.l_out_eval = T.exp(self.l_out_eval) / T.sum(
                T.exp(self.l_out_eval), axis=1, keepdims=True)
            print('Add Softmax output layer')
            self.l_out_train = T.clip(self.l_out_train, _EPSILON,
                                      1.0 - _EPSILON)
            self.l_out_eval = T.clip(self.l_out_eval, _EPSILON, 1.0 - _EPSILON)
        #loss function for train / eval
        if Task_type != 'regression':
            self.loss_train = T.mean(
                lasagne.objectives.categorical_crossentropy(
                    self.l_out_train, self.hard_target))
            self.loss_eval = T.mean(
                lasagne.objectives.categorical_crossentropy(
                    self.l_out_eval, self.hard_target))
        else:
            self.loss_train = T.mean(
                lasagne.objectives.squared_error(self.l_out_train,
                                                 self.hard_target))
            self.loss_eval = T.mean(
                lasagne.objectives.squared_error(self.l_out_eval,
                                                 self.hard_target))
        self.acc = T.mean(
            lasagne.objectives.categorical_accuracy(self.l_out_eval,
                                                    self.hard_target))

        #eval functions
        self.get_acc = theano.function([self.l_in.input_var, self.hard_target],
                                       self.acc)
        self.get_loss = theano.function(
            [self.l_in.input_var, self.hard_target], self.loss_eval)
        self.updates = lasagne.updates.adam(self.loss_train + self.l2_penalty,
                                            self.all_params,
                                            learning_rate=self.lr)

        #train function
        self.train = theano.function([self.l_in.input_var, self.hard_target],
                                     updates=self.updates)
        self.train_loss_acc = theano.function(
            [self.l_in.input_var, self.hard_target],
            [self.loss_eval, self.acc],
            updates=self.updates)
        #output function
        self.get_out = theano.function([self.l_in.input_var], self.l_out_eval)
        self.hid_out = theano.function([self.l_in.input_var],
                                       lasagne.layers.get_output(
                                           self.l_hid, deterministic=True))
Example #59
0
def hard_sigmoid(x):
    return T.clip((x+1.)/2.,0,1)
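A quick check of the saturation behaviour (a sketch, not part of the original source): inputs at or below -1 map to 0, inputs at or above 1 map to 1, and the function is linear in between.

import theano
import theano.tensor as T

x = T.vector('x')
f = theano.function([x], hard_sigmoid(x), allow_input_downcast=True)
print(f([-2., -1., 0., 0.5, 2.]))  # -> [0., 0., 0.5, 0.75, 1.]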
Example #60
0
def sqrt(x):
    x = T.clip(x, 0., np.inf)
    return T.sqrt(x)
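A small sketch (not from the original source) showing why the clip is there: tiny negative values, e.g. from floating-point round-off, would turn into nan under a plain T.sqrt, whereas the clipped version maps them to 0.

import numpy as np
import theano
import theano.tensor as T

v = T.vector('v')
f = theano.function([v], sqrt(v), allow_input_downcast=True)
print(f([-1e-8, 0., 4.]))  # -> [0., 0., 2.]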