Exemple #1
0
    def __init__(self, random_state=None, low=0.0, high=1.0):
        """Build symbolic pdf/nnlf/cdf/ppf expressions for U(low, high).

        Parameters
        ----------
        * `random_state`: seed or RNG forwarded to the base distribution.
        * `low` [float]: lower bound of the support.
        * `high` [float]: upper bound of the support.

        Each expression is compiled into a callable via ``self.make_``;
        ``self.X`` and ``self.p`` are symbolic inputs from the base class.
        """
        super(Uniform, self).__init__(low=low, high=high,
                                      random_state=random_state,
                                      optimizer=None)

        # pdf: constant 1/(high - low) on [low, high), zero outside.
        self.pdf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            0.,
            1. / (self.high - self.low)).ravel()
        self.make_(self.pdf_, "pdf")

        # -log pdf: log(high - low) on the support, +inf outside.
        self.nnlf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            np.inf,
            T.log(self.high - self.low)).ravel()
        self.make_(self.nnlf_, "nnlf")

        # cdf: 0 below low, linear ramp on [low, high), 1 at or above high.
        self.cdf_ = T.switch(
            T.lt(self.X, self.low),
            0.,
            T.switch(
                T.lt(self.X, self.high),
                (self.X - self.low) / (self.high - self.low),
                1.)).ravel()
        self.make_(self.cdf_, "cdf")

        # ppf: inverse cdf, linear in the probability input self.p.
        self.ppf_ = self.p * (self.high - self.low) + self.low
        self.make_(self.ppf_, "ppf", args=[self.p])
Exemple #2
0
    def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                    epsilon=1e-8, grads=None):
        """Adam updates with global-norm clipping and a non-finite guard.

        Returns ``(norm, grads, updates)``: the pre-clip gradient norm, the
        guarded gradient expressions, and the (shared var, new value) pairs.
        """
        if grads is None:
            grads = tensor.grad(loss, self.trainables)

        # Clip by global norm.
        norm = tensor.sqrt(sum(tensor.sqr(g).sum() for g in grads))
        multiplier = theanotools.clipping_multiplier(norm, max_norm)
        grads = [multiplier * g for g in grads]

        # Zero the gradients whenever the norm is NaN/Inf, negative, or huge.
        bad_norm = tensor.or_(tensor.or_(tensor.isnan(norm), tensor.isinf(norm)),
                              tensor.or_(norm < 0, norm > 1e10))
        grads = [tensor.switch(bad_norm, np.float32(0), g) for g in grads]

        # Bias-corrected learning rate for step t.
        t = self.time + 1
        lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)

        # First/second moment estimates and the resulting parameter steps.
        means_t = [beta1 * m + (1. - beta1) * g
                   for g, m in zip(grads, self.means)]
        vars_t = [beta2 * v + (1. - beta2) * tensor.sqr(g)
                  for g, v in zip(grads, self.vars)]
        steps = [lr_t * m_t / (tensor.sqrt(v_t) + epsilon)
                 for m_t, v_t in zip(means_t, vars_t)]

        updates = [(x, x - step) for x, step in zip(self.trainables, steps)]
        updates.extend(zip(self.means, means_t))
        updates.extend(zip(self.vars, vars_t))
        updates.append((self.time, t))

        return norm, grads, updates
Exemple #3
0
def get_train(U_Ot, U_R, lenW, n_facts):
    """Build the MemNN training function (margin-ranking losses + SGD step).

    `U_Ot`/`U_R` are embedding matrices for the memory-selection and response
    stages, `lenW` the vocabulary size, `n_facts` the number of supporting
    facts. NOTE(review): `zeros` and `alpha` (the SGD learning rate used in
    the updates) must be defined in the enclosing module — confirm there.
    """
    # phi_* embed a word index into a (3*lenW + 3) feature vector, placing
    # the embedding into the 1st, 2nd or 3rd lenW-sized segment.
    def phi_x1(x_t, L):
        return T.concatenate([L[x_t].reshape((-1,)), zeros((2*lenW,)), zeros((3,))], axis=0)
    def phi_x2(x_t, L):
        return T.concatenate([zeros((lenW,)), L[x_t].reshape((-1,)), zeros((lenW,)), zeros((3,))], axis=0)
    def phi_y(x_t, L):
        return T.concatenate([zeros((2*lenW,)), L[x_t].reshape((-1,)), zeros((3,))], axis=0)
    # phi_t encodes pairwise ordering features of the three indices in the
    # trailing 3 slots.
    def phi_t(x_t, y_t, yp_t, L):
        return T.concatenate([zeros(3*lenW,), T.stack(T.switch(T.lt(x_t,y_t), 1, 0), T.switch(T.lt(x_t,yp_t), 1, 0), T.switch(T.lt(y_t,yp_t), 1, 0))], axis=0)
    # Scoring function for memory selection: sum over the query terms of
    # x^T U^T U (phi(y) - phi(y') + phi_t(...)).
    def s_Ot(xs, y_t, yp_t, L):
        result, updates = theano.scan(
            lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_Ot.T),
                           T.dot(U_Ot, (phi_y(y_t, L) - phi_y(yp_t, L) + phi_t(x_t, y_t, yp_t, L)))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()
    # Scoring function for response selection against the vocabulary V.
    def sR(xs, y_t, L, V):
        result, updates = theano.scan(
            lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_R.T),
                                 T.dot(U_R, phi_y(y_t, V))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()

    # Symbolic inputs: question word, selected memories, true facts, true
    # response, margin, message list and vocabulary.
    x_t = T.iscalar('x_t')
    m = [x_t] + [T.iscalar('m_o%d' % i) for i in xrange(n_facts)]
    f = [T.iscalar('f%d_t' % i) for i in xrange(n_facts)]
    r_t = T.iscalar('r_t')
    gamma = T.scalar('gamma')
    L = T.fmatrix('L') # list of messages
    V = T.fmatrix('V') # vocab
    r_args = T.stack(*m)

    # Hinge losses over all negative memories, in both margin directions.
    # NOTE(review): `T.eq(t, T.shape(L)-1)` compares the scalar t against the
    # full shape *vector*; T.shape(L)[0]-1 looks intended — confirm.
    cost_arr = [0] * 2 * (len(m)-1)
    updates_arr = [0] * 2 * (len(m)-1)
    for i in xrange(len(m)-1):
        cost_arr[2*i], updates_arr[2*i] = theano.scan(
                lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma - s_Ot(T.stack(*m[:i+1]), f[i], t, L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])
        cost_arr[2*i+1], updates_arr[2*i+1] = theano.scan(
                lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma + s_Ot(T.stack(*m[:i+1]), t, f[i], L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])

    # Hinge loss over all negative responses in the vocabulary.
    cost1, u1 = theano.scan(
        lambda r_bar, t: T.switch(T.eq(r_t, t), 0, T.largest(gamma - sR(r_args, r_t, L, V) + sR(r_args, t, L, V), 0)),
        sequences=[V, T.arange(T.shape(V)[0])])

    # Total cost: response loss plus all memory-selection losses.
    cost = cost1.sum()
    for c in cost_arr:
        cost += c.sum()

    g_uo, g_ur = T.grad(cost, [U_Ot, U_R])

    # Plain SGD on both embedding matrices.
    train = theano.function(
        inputs=[r_t, gamma, L, V] + m + f,
        outputs=[cost],
        updates=[(U_Ot, U_Ot-alpha*g_uo), (U_R, U_R-alpha*g_ur)])
    return train
Exemple #4
0
    def compute_updates(self, training_cost, params):
        """Compute optimizer updates for ``params`` with gradient clipping.

        Gradients are rescaled so their global norm never exceeds
        ``self.cutoff``, and replaced by ``.1 * param`` when the norm is
        NaN or Inf, then handed to the optimizer named by ``self.updater``.
        """
        grads = OrderedDict(zip(params, T.grad(training_cost, params)))

        # Global-norm clipping with a non-finite guard.
        c = numpy.float32(self.cutoff)
        norm_gs = T.sqrt(sum(T.sum(g ** 2) for g in grads.values()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

        grads = OrderedDict(
            (p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization))
            for p, g in grads.items())

        # Dispatch on the configured optimizer.
        if self.updater == 'adagrad':
            return Adagrad(grads, self.lr)
        if self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        if self.updater == 'adadelta':
            return Adadelta(grads)
        if self.updater == 'rmsprop':
            return RMSProp(grads, self.lr)
        if self.updater == 'adam':
            return Adam(grads)
        raise Exception("Updater not understood!")
Exemple #5
0
    def exe(self, mainloop):
        """Normalize gradients by batch size and clip them by global norm.

        Every gradient is divided by ``self.batch_size``, rescaled so the
        global norm stays below ``self.scaler``, and replaced by
        ``0.1 * param`` whenever the squared norm is NaN or Inf.
        """
        grads = mainloop.grads

        squared_norm = 0.
        for param, grad in grads.items():
            grad = grad / self.batch_size
            grads[param] = grad
            squared_norm += (grad ** 2).sum()

        not_finite = T.or_(T.isnan(squared_norm), T.isinf(squared_norm))
        norm = T.sqrt(squared_norm)
        scaler = self.scaler / T.maximum(self.scaler, norm)

        for param, grad in grads.items():
            grads[param] = T.switch(not_finite, 0.1 * param, grad * scaler)

        mainloop.grads = grads
 def minimize(self, loss, momentum, rescale):
     """RMSProp with momentum, global-norm rescaling and a non-finite guard.

     BUG FIX: the gradient norm was square-rooted twice — ``grad_norm`` is
     already ``T.sqrt(sum of squared gradients)``, and a second ``T.sqrt``
     made the ``rescale`` threshold act on norm**0.5. The redundant sqrt is
     removed (matching the other clipping code in this project).
     """
     super(RMSPropOptimizer, self).minimize(loss)
     grads = self.gradparams
     # Global gradient norm (sqrt of the summed squared gradients).
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1E-4
     updates = []
     params = self.params
     for n, (param, grad) in enumerate(zip(params, grads)):
         # Guard against non-finite gradients, then rescale to the clip norm.
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         # RMS of recent gradients, floored for numerical stability.
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - self.lr * grad / rms_grad
         # Nesterov-style lookahead step applied to the parameters.
         update2 = momentum * momentum * memory - (
             1 + momentum) * self.lr * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))

     return updates
Exemple #7
0
    def mcmc(ll, *frvs):
        """One HMC-style leapfrog proposal + Metropolis accept step.

        NOTE(review): `observations`, `free_RVs`, `free_RVs_prop`, `epsilon`,
        `U` (a uniform sample) and `full_log_likelihood` are captured from
        the enclosing scope — confirm their definitions there.
        """
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))

        # Potential energy of the current state (negative log-likelihood).
        loglik = -full_log_likelihood(full_observations)

        # Hamiltonian: kinetic (momentum) term plus potential term.
        proposals = free_RVs_prop
        H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + loglik

# -- this should be an inner loop
        # Leapfrog: half momentum step, then a full position step.
        g = []
        g.append(tensor.grad(loglik, frvs))

        proposals = [(p - epsilon*gg[0]/2.) for p, gg in zip(proposals, g)]

        rvsp = [(rvs + epsilon*rvp) for rvs,rvp in zip(frvs, proposals)]

        # Re-evaluate the likelihood at the proposed position.
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
        new_loglik = -full_log_likelihood(full_observations)

        # Second half momentum step at the proposed position.
        gnew = []
        gnew.append(tensor.grad(new_loglik, rvsp))
        proposals = [(p - epsilon*gn[0]/2.) for p, gn in zip(proposals, gnew)]
# --

        Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + new_loglik

        # Metropolis test: accept downhill moves, otherwise accept with
        # probability exp(-dH) using the uniform sample U.
        dH = Hnew - H
        accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

        # Return the accepted state; scan terminates via `until(accept)`.
        return [tensor.switch(accept, -new_loglik, ll)] + \
            [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
            {}, theano.scan_module.until(accept)
Exemple #8
0
    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return a dropout retention mask OR-ed with the previous mask.

        In deterministic mode (or when ``self.p == 0``) no units are
        dropped, so a tensor of ones shaped like ``self.retain`` is
        returned instead of a sampled mask.
        """
        if deterministic or self.p == 0:
            return T.ones_like(self.retain, dtype=input.dtype)

        # Prefer the static input shape; fall back to the symbolic shape
        # when any dimension is unknown at compile time.
        shape = self.input_shape
        if any(dim is None for dim in shape):
            shape = input.shape

        # Collapse shared axes to size 1 so the mask broadcasts over them
        # (negative axis indices count from the end).
        if self.shared_axes:
            axes = tuple(a if a >= 0 else a + input.ndim
                         for a in self.shared_axes)
            shape = tuple(1 if i in axes else dim
                          for i, dim in enumerate(shape))

        mask = self._srng.binomial(shape, p=self.retain, dtype=input.dtype)
        mask = T.or_(mask, self.previous_mask)
        if self.shared_axes:
            bcast = tuple(bool(dim == 1) for dim in shape)
            mask = T.patternbroadcast(mask, bcast)
        return mask
 def graves_rmsprop_updates(self, params, grads, learning_rate=1e-4, alpha=0.9, epsilon=1e-4, chi=0.95):
     r"""Alex Graves' flavour of RMSProp.

     Keeps running estimates of the gradient (g) and squared gradient (n)
     and scales steps by 1/sqrt(n - g^2 + epsilon), with momentum ``alpha``.
     Each gradient is replaced by ``0.1 * param`` whenever the global
     gradient norm is NaN or Inf.

     References
     ----------
     .. [1] Graves, Alex.
         "Generating Sequences With Recurrent Neural Networks", p.23
         arXiv:1308.0850
     """
     updates = []
     grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     for idx, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param, grad)
         square_acc = self.running_square_[idx]
         avg_acc = self.running_avg_[idx]
         memory_acc = self.memory_[idx]
         new_square = chi * square_acc + (1. - chi) * grad ** 2
         new_avg = chi * avg_acc + (1. - chi) * grad
         new_memory = alpha * memory_acc - learning_rate * grad / T.sqrt(new_square - \
                     new_avg ** 2 + epsilon)
         updates.extend([
             (square_acc, new_square),
             (avg_acc, new_avg),
             (memory_acc, new_memory),
             (param, param + new_memory),
         ])
     return updates
Exemple #10
0
    def exe(self, mainloop):
        """Average gradients over the batch and clip them by global norm.

        When ``self.check_nan`` is set, gradients are additionally replaced
        by ``0.1 * param`` whenever the squared norm is NaN or Inf.
        """
        grads = mainloop.grads

        # Batch-size normalization and squared global norm, in one pass.
        squared_norm = 0.
        batch_size = T.cast(self.batch_size, dtype=theano.config.floatX)
        for param, grad in grads.items():
            grad = grad / batch_size
            grads[param] = grad
            squared_norm += (grad ** 2).sum()

        if self.check_nan:
            not_finite = T.or_(T.isnan(squared_norm), T.isinf(squared_norm))

        norm = T.sqrt(squared_norm)
        scaler = self.scaler / T.maximum(self.scaler, norm)

        if self.check_nan:
            for param, grad in grads.items():
                grads[param] = T.switch(not_finite, 0.1 * param, grad * scaler)
        else:
            for param, grad in grads.items():
                grads[param] = grad * scaler

        mainloop.grads = grads
Exemple #11
0
Fichier : ctc.py Projet : choko/ctc
def compute_cost_log_in_parallel(original_rnn_outputs, labels, func, x_ends, y_ends):
	"""CTC-style forward-pass cost in log space, batched over sequences.

	NOTE(review): `shift_matrix` and `log_shift_matrix` come from the
	enclosing module; `func` appears to be a log-space reduction (e.g.
	log-sum-exp or max) applied along axis 0 — confirm with callers.
	"""
	# Transitions skipping two labels are forbidden when the skipped label
	# is blank (0) or equal to the label two steps back; mask is -inf there.
	mask = T.log(1 - T.or_(T.eq(labels, T.zeros_like(labels)), T.eq(labels, shift_matrix(labels, 2))))

	# Start in the first label state with log-probability 0 (prob 1).
	initial_state = T.log(T.zeros_like(labels))
	initial_state = T.set_subtensor(initial_state[:,0], 0)

	def select_probabilities(rnn_outputs, label):
		return rnn_outputs[:,label]	

	rnn_outputs, _ = theano.map(select_probabilities, [original_rnn_outputs, labels])
	rnn_outputs = T.log(rnn_outputs.dimshuffle((1,0,2)))

	# One time step of the forward recursion: stay, advance one label, or
	# advance two labels (where allowed by the mask).
	def forward_step(probabilities, last_probabilities):
		all_forward_probabilities = T.stack(
			last_probabilities + probabilities,
			log_shift_matrix(last_probabilities, 1) + probabilities,
			log_shift_matrix(last_probabilities, 2) + probabilities + mask,
		)

		result = func(all_forward_probabilities, 0)
		return result

	forward_probabilities, _ = theano.scan(fn = forward_step, sequences = rnn_outputs, outputs_info = initial_state)
	forward_probabilities = forward_probabilities.dimshuffle((1,0,2))

	# Final cost per sequence: combine the last two label states at the
	# sequence's true end positions.
	def compute_cost(forward_probabilities, x_end, y_end):
		return -func(forward_probabilities[x_end-1,y_end-2:y_end])

	return theano.map(compute_cost, [forward_probabilities, x_ends, y_ends])[0]
Exemple #12
0
    def get_gradients(self, model, data, ** kwargs):
        """Return ``(gradients, updates)`` for the cost, optionally clipped.

        When ``self.gradient_clipping`` is on, gradients are rescaled so the
        global norm does not exceed ``self.max_magnitude``, and are replaced
        by ``.1 * param`` when the norm is NaN or Inf.
        """
        cost = self.expr(model=model, data=data, **kwargs)
        params = list(model.get_params())
        grads = T.grad(cost, params, disconnected_inputs='ignore')
        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            # Squared global norm; the finiteness check happens before sqrt.
            squared = sum((g ** 2).sum() for g in gradients.values())
            not_finite = T.or_(T.isnan(squared), T.isinf(squared))
            norm = T.sqrt(squared)
            scale = T.switch(T.ge(norm, self.max_magnitude),
                             self.max_magnitude / norm,
                             1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * scale)

        return gradients, OrderedDict()
Exemple #13
0
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    """Adam with global-norm gradient clipping and a non-finite guard.

    Returns ``(updates, norm)`` where ``norm`` is the squared gradient norm
    from ``norm_gs``. Non-finite gradients decay the parameter by
    ``infDecay`` instead of stepping.
    """
    grads = T.grad(cost, params)

    squared_norm = norm_gs(params, grads)
    total_norm = T.sqrt(squared_norm)
    not_finite = T.or_(T.isnan(total_norm), T.isinf(total_norm))
    # Scale factor that caps the gradient norm at max_magnitude.
    scale = T.switch(T.ge(total_norm, max_magnitude), max_magnitude / total_norm, 1.)

    # Shared step counter and bias-corrected learning rate.
    step = shared(floatX(0.))
    step_t = step + 1.
    bias1 = 1. - (1. - b1) ** step_t
    bias2 = 1. - (1. - b2) ** step_t
    lr_t = lr * (T.sqrt(bias2) / bias1)

    updates = []
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * scale)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        p_t = p - lr_t * (m_t / (T.sqrt(v_t) + e))
        updates += [(m, m_t), (v, v_t), (p, p_t)]
    updates.append((step, step_t))
    return updates, squared_norm
Exemple #14
0
def tnormal_icdf(size, avg, std, lbound, ubound, theano_rng, dtype):
    """Sample a truncated normal N(avg, std) on [lbound, ubound] by
    inverting the CDF.

    Alternative Method:
    sample = -Phi_inv(Phi(-lbound)*(1-u) + Phi(-ubound)*u)
    """

    def cdf(x):
        # Normal CDF via the error function.
        z = (x - avg) / (std * SQRT2)
        return (0.5 * (1. + T.erf(z))).astype(dtype)

    def icdf(y, eps=3e-8):
        """eps was calibrated for cublas.erfinv using float32."""
        clipped = T.clip(2. * y - 1., -1+eps, 1-eps)
        return (avg + std * SQRT2 * T.erfinv(clipped)).astype(dtype)

    u = theano_rng.uniform(size=size, dtype=dtype)

    # When the CDF mass between the bounds vanishes (or covers nearly all
    # of it) the inversion is numerically unstable; fall back to whichever
    # bound the mean lies beyond.
    cdf_range = cdf(ubound) - cdf(lbound)
    unstable = T.or_(T.lt(cdf_range, 3e-8), T.gt(cdf_range, 1-3e-8))
    fallback = T.switch(T.lt(avg, lbound), lbound, ubound)

    return T.switch(unstable, fallback, icdf(cdf(lbound) + u * cdf_range))
Exemple #15
0
def truncated_normal(size, avg, std, lbound, ubound, theano_rng, dtype):
    """Sample N(avg, std) truncated to [lbound, ubound] via the inverse CDF.

    If the inverted sample escapes the bounds (numerical instability in the
    erfinv region), the bound nearest the mean is returned instead.
    """
    def cdf(x):
        # Normal CDF via the error function.
        z = (x - avg) / (std * SQRT2)
        return (0.5 * (1. + T.erf(z))).astype(dtype)

    def icdf(p):
        clipped = T.clip(2. * p - 1., -1.+1e-6, 1.-1e-6)
        return (avg + std * SQRT2 * T.erfinv(clipped)).astype(dtype)

    u = theano_rng.uniform(size=size, dtype=dtype)

    # sample = icdf(phi(lbound) + u * [phi(ubound) - phi(lbound)])
    cdf_range = cdf(ubound) - cdf(lbound)
    sample = icdf(cdf(lbound) + u * cdf_range)

    # Out-of-bounds samples collapse to ubound when avg >= ubound,
    # otherwise to lbound.
    out_of_bounds = T.or_(sample < lbound, sample > ubound)
    fallback = T.switch(avg >= ubound, ubound, lbound)

    return T.switch(out_of_bounds, fallback, sample)
Exemple #16
0
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    """Adam with global-norm gradient clipping and a non-finite guard
    (variant of ``adamgc`` with a different ``b2`` default).

    Returns ``(updates, norm)`` where ``norm`` is the squared gradient norm
    from ``norm_gs``.
    """
    grads = T.grad(cost, params)

    squared_norm = norm_gs(params, grads)
    total_norm = T.sqrt(squared_norm)
    not_finite = T.or_(T.isnan(total_norm), T.isinf(total_norm))
    # Scale factor that caps the gradient norm at max_magnitude.
    scale = T.switch(T.ge(total_norm, max_magnitude), max_magnitude / total_norm, 1.0)

    # Shared step counter and bias-corrected learning rate.
    step = shared(floatX(0.0))
    step_t = step + 1.0
    bias1 = 1.0 - (1.0 - b1) ** step_t
    bias2 = 1.0 - (1.0 - b2) ** step_t
    lr_t = lr * (T.sqrt(bias2) / bias1)

    updates = []
    for p, g in zip(params, grads):
        # Non-finite gradients decay the parameter instead of stepping.
        g = T.switch(not_finite, infDecay * p, g * scale)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        p_t = p - lr_t * (m_t / (T.sqrt(v_t) + e))
        updates += [(m, m_t), (v, v_t), (p, p_t)]
    updates.append((step, step_t))
    return updates, squared_norm
Exemple #17
0
def theano_metrics(y_pred, y_true, n_classes, void_labels):
    """
    Returns the intersection I and union U (to compute the jaccard I/U) and the accuracy.

    :param y_pred: tensor of predictions. shape  (b*0*1, c) with c = n_classes
    :param y_true: groundtruth, shape  (b,0,1) or (b,c,0,1) with c=1
    :param n_classes: int
    :param void_labels: list of indexes of void labels
    :return: return tensors I and U of size (n_classes), and scalar acc
    """

    # Put y_pred and y_true under the same shape
    y_true = T.flatten(y_true)
    y_pred = T.argmax(y_pred, axis=1)

    # Mask of non-void groundtruth pixels (we use not_void in case the
    # prediction falls in the void class of the groundtruth).
    # BUG FIX: start from all-ones so an empty `void_labels` list no longer
    # leaves `not_void` undefined (previously a NameError).
    not_void = T.ones_like(y_true)
    for label in void_labels:
        not_void = not_void * T.neq(y_true, label)

    I = T.zeros(n_classes)
    U = T.zeros(n_classes)

    # Per-class intersection and union counts, restricted to non-void pixels.
    for i in range(n_classes):
        y_true_i = T.eq(y_true, i)
        y_pred_i = T.eq(y_pred, i)
        I = T.set_subtensor(I[i], T.sum(y_true_i * y_pred_i))
        U = T.set_subtensor(U[i], T.sum(T.or_(y_true_i, y_pred_i) * not_void))

    # Accuracy: correctly classified pixels over all non-void pixels.
    accuracy = T.sum(I) / T.sum(not_void)

    return I, U, accuracy
Exemple #18
0
 def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
     """RMSProp-with-momentum updates, with global-norm rescaling.

     Gradients are rescaled so the global norm stays below ``rescale`` and
     replaced by ``0.1 * param`` when the norm is NaN or Inf.

     BUG FIX: the gradient norm was square-rooted twice — ``grad_norm`` is
     already ``T.sqrt(sum of squared gradients)``, and a second ``T.sqrt``
     made the ``rescale`` threshold act on norm**0.5. The redundant sqrt is
     removed (matching the other clipping code in this project).
     """
     grads = T.grad(cost, params)
     # Global gradient norm (sqrt of the summed squared gradients).
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1e-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         # Guard against non-finite gradients, then rescale to the clip norm.
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         # RMS of recent gradients, floored for numerical stability.
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         # Nesterov-style lookahead step applied to the parameters.
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
Exemple #19
0
def abs(x, axis=0):
    """
    Euclidean norm of x along the given axis (note: shadows the builtin
    ``abs``).

    :param x: T.matrix or T.vector (asserted to be 1- or 2-dimensional)
    :param axis: axis to reduce over (default 0)
    :return: sqrt of the summed squares along the axis. T.vector
    """
    x = assert_op(x, T.or_(T.eq(x.ndim, 2), T.eq(x.ndim, 1)))
    squared_sum = T.sum(T.sqr(x), axis)
    return T.sqrt(squared_sum)
Exemple #20
0
def weighted_thresholded_binary_cross_entropy(preds, targets, imbalance_factor,
        lower_threshold):
    """Weighted binary cross entropy that ignores low-confidence negatives.

    The per-element loss is zeroed wherever the prediction is at or below
    ``lower_threshold`` and the target is not 1 (preds that are below the
    threshold where there is no target are ignored).
    """
    base_loss = weighted_binary_cross_entropy(preds, targets,
        imbalance_factor=imbalance_factor,)
    keep_mask = T.or_(T.gt(preds, lower_threshold), T.eq(targets, 1))
    return base_loss * keep_mask
Exemple #21
0
    def find_right_bound(prev_func_output, step, maxstep):
        """One bracketing step: double the step while f keeps decreasing.

        The scan stops (via ``until``) once f starts increasing or the step
        exceeds ``maxstep``; ``f`` comes from the enclosing scope.
        """
        func_output = f(step)

        decreased = T.gt(prev_func_output, func_output)
        step = ifelse(decreased, T.minimum(2.0 * step, maxstep), step)

        increased = T.lt(prev_func_output, func_output)
        stoprule = theano.scan_module.until(T.or_(increased, step > maxstep))
        return [func_output, step], stoprule
def filter_stop_case(d, filter_vector = None):
    """Flag rows of ``d`` equal to the module-level ``stop_case`` and zero
    them out.

    Returns ``(d_f, f)``: ``d`f`` is ``d`` (float32) with flagged rows
    zeroed, ``f`` the per-row flag vector, OR-ed with ``filter_vector``
    when one is given.
    """
    eq = T.eq(d, stop_case)
    # A row is flagged only if every element matches stop_case.
    f = T.prod(eq, axis=1)
    # BUG FIX: `if filter_vector:` evaluated the truth value of a symbolic
    # tensor, which raises (or misbehaves) for Theano variables; test
    # against None explicitly instead.
    if filter_vector is not None:
        f = T.or_(filter_vector, f) # increment new overlapped history
    # Invert the flags and mask the flagged rows out of d.
    f_inv = T.abs_(f-1)
    d_f = d.T * f_inv.T
    d_f = T.cast(d_f.T, 'float32')
    return d_f, f
Exemple #23
0
def objective(y_true, y_pred, P, Q, alpha=0., beta=0.15, dbeta=0., gamma=0.01, gamma1=-1., poos=0.23, eps=1e-6):
    '''Expects a binary class matrix instead of a vector of scalar classes.

    Combines (1) cross entropy on labeled in-set examples, (2) optional
    entropy terms on out-of-set/unlabeled examples (alpha, beta) and (3) an
    optional binary in-set/out-of-set classifier score on class 0 (gamma,
    gamma1).

    BUG FIX: in the ``gamma1 > 0.`` branch, ``cost3`` was divided by
    ``y1sum`` a second time while ``cost31`` was left unnormalized; the
    normalization now applies to ``cost31``.
    '''

    beta = np.float32(beta)
    dbeta = np.float32(dbeta)
    gamma = np.float32(gamma)
    poos = np.float32(poos)
    eps = np.float32(eps)

    # scale preds so that the class probas of each sample sum to 1
    y_pred += eps
    y_pred /= y_pred.sum(axis=-1, keepdims=True)

    y_true = T.cast(y_true.flatten(), 'int64')
    y1 = T.and_(T.gt(y_true, 0), T.le(y_true, Q))  # in-set
    y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
    y0sum = y0.sum() + eps  # number of oos
    y1sum = y1.sum() + eps  # number of in-set
    # we want to reduce cross entrophy of labeled data
    # convert all oos/unlabeled to label=0
    cost0 = T.nnet.categorical_crossentropy(y_pred, T.switch(y_true <= Q, y_true, 0))
    cost0 = T.dot(y1, cost0) / y1sum  # average cost per labeled example

    # Optional self-entropy term on the out-of-set examples.
    if alpha:
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per labeled example
        cost0 += alpha*cost1

    # we want to increase the average entrophy in each batch
    # average over batch
    if beta:
        y_pred_avg0 = T.dot(y0, y_pred) / y0sum
        y_pred_avg0 = T.clip(y_pred_avg0, eps, np.float32(1) - eps)
        y_pred_avg0 /= y_pred_avg0.sum(axis=-1, keepdims=True)
        cost2 = T.nnet.categorical_crossentropy(y_pred_avg0.reshape((1,-1)), P-dbeta)[0] # [None,:]
        cost2 = T.switch(y0sum > 0.5, cost2, 0.)  # ignore cost2 if no samples
        cost0 += beta*cost2

    # binary classifier score
    if gamma:
        y_pred0 = T.clip(y_pred[:,0], eps, np.float32(1) - eps)
        if gamma1 < 0.:
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot(np.float32(1)-poos*y0.T,T.log(np.float32(1)-y_pred0))
            cost3 /= y_pred.shape[0]
            cost0 += gamma*cost3
        elif gamma1 > 0.:
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot((np.float32(1)-poos)*y0,T.log(np.float32(1)-y_pred0))
            cost3 /= y0sum
            cost31 =  - T.dot(y1,T.log(np.float32(1)-y_pred0))
            # BUG FIX: normalize cost31 by the in-set count (previously
            # this line divided cost3 by y1sum a second time).
            cost31 /= y1sum
            cost0 += gamma*cost3 + gamma1*cost31
        else:  # gamma1 == 0.
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot((np.float32(1)-poos)*y0, T.log(np.float32(1)-y_pred0))
            cost3 /= y0sum
            cost0 += gamma*cost3
    return cost0
Exemple #24
0
 def dtw(i, q_p, b_p, Q, D, inf):
   # One DTW-style recurrence step: pick min(loop, forward), add the local
   # distance D[i] (plus a penalty), and record which branch won in k_out.
   # NOTE(review): `n0` and `big` come from the enclosing scope. `T.neg` is
   # *arithmetic* negation (-x), not logical NOT, so T.and_(T.neg(n0), i0)
   # fires when n0 is true — this looks inverted relative to the apparent
   # intent "not n0 and i0"; confirm against the enclosing scan.
   i0 = T.eq(i, 0)
   # inf = T.cast(1e10,'float32') * T.cast(T.switch(T.eq(self.n,0), T.switch(T.eq(i,0), 0, 1), 1), 'float32')
   penalty = T.switch(T.and_(T.neg(n0), i0), big, T.constant(0.0, 'float32'))
   loop = T.constant(0.0, 'float32') + q_p
   forward = T.constant(0.0, 'float32') + T.switch(T.or_(n0, i0), 0, Q[i - 1])
   opt = T.stack([loop, forward])
   k_out = T.cast(T.argmin(opt, axis=0), 'int32')
   return opt[k_out, T.arange(opt.shape[1])] + D[i] + penalty, k_out
Exemple #25
0
def jaccard_similarity(y_true, y_predicted):
    """
    y_true: tensor ({1, 0})
    y_predicted: tensor ({1, 0})
    note - we round predicted because float probabilities would not work
    """
    y_predicted = T.round(y_predicted).astype(theano.config.floatX)
    # Union: positions where either vector is nonzero.
    either_nonzero = T.or_(T.neq(y_true, 0), T.neq(y_predicted, 0))
    # NOTE(review): the numerator counts *disagreements* within the union
    # (T.neq(y_true, y_predicted)), i.e. the Jaccard *distance*, not the
    # similarity the function name suggests — confirm intent with callers.
    return T.and_(T.neq(y_true, y_predicted), either_nonzero).sum(axis=-1, dtype=theano.config.floatX) / either_nonzero.sum(axis=-1, dtype=theano.config.floatX)
Exemple #26
0
def step_clipping(params, gparams, scale=1.0):
    """Clip gradients so their global norm does not exceed ``scale``.

    Each clipped gradient is replaced by ``param * 0.1`` whenever the
    global norm is NaN or Inf.
    """
    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in gparams))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1.0, scale / grad_norm)

    params_clipping = []
    for param, gparam in izip(params, gparams):
        scaled = gparam * multiplier
        params_clipping.append(T.switch(notfinite, param * 0.1, scaled))

    return params_clipping
Exemple #27
0
    def __init__(self, low=0.0, high=1.0):
        """Constructor.

        Parameters
        ----------
        * `low` [float]:
            The lower bound.

        * `high` [float]:
            The upper bound

        Builds the symbolic pdf/nll/cdf/ppf expressions for U(low, high)
        and compiles each into a callable via ``self._make``; ``self.X``
        and ``self.p`` are symbolic inputs from the base class.
        """
        super(Uniform, self).__init__(low=low, high=high)

        # pdf: constant 1/(high - low) on [low, high), zero outside.
        self.pdf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            0.,
            1. / (self.high - self.low)).ravel()
        self._make(self.pdf_, "pdf")

        # -log pdf: log(high - low) on the support, +inf outside.
        self.nll_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            np.inf,
            T.log(self.high - self.low)).ravel()
        self._make(self.nll_, "nll")

        # cdf: 0 below low, linear ramp on [low, high), 1 at or above high.
        self.cdf_ = T.switch(
            T.lt(self.X, self.low),
            0.,
            T.switch(
                T.lt(self.X, self.high),
                (self.X - self.low) / (self.high - self.low),
                1.)).ravel()
        self._make(self.cdf_, "cdf")

        # ppf: inverse cdf, linear in the probability input self.p.
        self.ppf_ = self.p * (self.high - self.low) + self.low
        self._make(self.ppf_, "ppf", args=[self.p])
Exemple #28
0
    def _step(
            i,
            pkm1, pkm2, qkm1, qkm2,
            k1, k2, k3, k4, k5, k6, k7, k8, r
    ):
        """One double-iteration of a continued-fraction evaluation.

        Advances the numerator/denominator recurrences (pk, qk) by two
        terms, tracks the current convergent ``r = pk/qk``, and rescales
        against over/underflow. Appears to follow the cephes-style
        continued fraction for the incomplete beta function — TODO confirm
        against the enclosing scan. ``x``, ``zero``, ``one``, ``two``,
        ``k26update``, ``BIG``, ``BIGINV`` and ``THRESH`` are captured from
        the enclosing scope. The scan stops (``until``) once successive
        convergents agree to relative tolerance THRESH.
        """
        # First sub-step of the two-term recurrence.
        xk = -(x * k1 * k2) / (k3 * k4)
        pk = pkm1 + pkm2 * xk
        qk = qkm1 + qkm2 * xk
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        # Second sub-step with the alternate-sign term.
        xk = (x * k5 * k6) / (k7 * k8)
        pk = pkm1 + pkm2 * xk
        qk = qkm1 + qkm2 * xk
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        # New convergent; keep the old one when qk == 0 to avoid div-by-zero.
        old_r = r
        r = tt.switch(tt.eq(qk, zero), r, pk/qk)

        # Advance the term coefficients.
        k1 += one
        k2 += k26update
        k3 += two
        k4 += two
        k5 += one
        k6 -= k26update
        k7 += two
        k8 += two

        # Rescale the recurrence state when it grows too large (BIG) or
        # shrinks too small (BIGINV) to prevent overflow/underflow.
        big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG)
        biginv_cond = tt.or_(
            tt.lt(tt.abs_(qk), BIGINV),
            tt.lt(tt.abs_(pk), BIGINV)
        )

        pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2)
        pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1)
        qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2)
        qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1)

        pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2)
        pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1)
        qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2)
        qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1)

        # Stop once two successive convergents agree within THRESH.
        return ((pkm1, pkm2, qkm1, qkm2,
                 k1, k2, k3, k4, k5, k6, k7, k8, r),
                until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))))
Exemple #29
0
    def __init__(self, x, xr, mask, L_enc, pdrop, args):
        """Bidirectional pyramidal GRU encoder.

        Runs a forward GRU over `x` and a backward GRU over the reversed
        input `xr`, sums their outputs, then repeatedly halves the time
        dimension with Downscale layers, stacking another bidirectional GRU
        pair per level. NOTE(review): assumes `x`/`xr` are (seq_len, batch)
        index matrices and `L_enc` a (vocab, embed_dim) embedding table —
        confirm with callers.
        """
        # NOTE shape[1] is batch size since shape[0] is seq length
        outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
        flayers = list()
        blayers = list()
        # Embedding lookup for the forward and reversed sequences.
        fsubset = L_enc[x.flatten()]
        bsubset = L_enc[xr.flatten()]
        finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
        # Independent dropout masks for each direction.
        fseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        bseqmask = get_sequence_dropout_mask((x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
        finplayer = GRULayer(finp.astype(floatX), mask, fseqmask, args.rnn_dim, outputs_info, args, suffix="fenc0")
        binplayer = GRULayer(
            binp.astype(floatX), mask, bseqmask, args.rnn_dim, outputs_info, args, suffix="benc0", backwards=True
        )
        flayers.append(finplayer)
        blayers.append(binplayer)
        self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
        self.routs.append(finplayer.out + binplayer.out)
        downs = []
        for k in xrange(1, args.rlayers):
            # concatenate consecutive steps in the sequence (which are downscaled to half from the previous layer)
            d = Downscale(self.routs[-1], args.rnn_dim, suffix="ds%d" % k)
            downs.append(d)
            inp = d.out
            # Downscale the mask too: a merged step is valid if either of
            # the two original steps it covers was valid.
            twocols = mask.T.reshape([-1, 2])
            mask = T.or_(twocols[:, 0], twocols[:, 1]).reshape([mask.shape[1], -1]).T

            fseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
            bseqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
            flayer = GRULayer(
                Dropout(inp, pdrop).out, mask, fseqmask, args.rnn_dim, outputs_info, args, suffix="fenc%d" % k
            )
            blayer = GRULayer(
                Dropout(inp, pdrop).out,
                mask,
                bseqmask,
                args.rnn_dim,
                outputs_info,
                args,
                suffix="benc%d" % k,
                backwards=True,
            )
            self.routs.append(flayer.out + blayer.out)
            flayers.append(flayer)
            blayers.append(blayer)
        self.hs = self.routs[-1]  # for attention
        olayer = LayerWrapper(self.routs)
        rlayers = flayers + blayers  # NOTE careful not to assume rlayers = # layers in all cases

        # undo the temporary hack
        super(BiPyrRNNEncoder, self).__init__(rlayers, olayer, downscales=downs)
Exemple #30
0
    def shadow(self, points, lights):
        """
        Return, per point, the ray parameter of the nearer intersection
        with this (unit) sphere along the direction toward the first
        light, or -1 where there is no intersection (not shadowed).

        See: http://en.wikipedia.org/wiki/Line-sphere_intersection
        """
        to_center = points
        proj = T.tensordot(to_center, -1 * lights[0].normed_dir(), 1)
        # Discriminant of the line-sphere quadratic; >= 0 means a hit.
        disc = T.sqr(proj) - T.sum(T.mul(to_center, to_center), 2) + 1
        no_hit = T.or_(T.isnan(disc), disc <= 0)
        return T.switch(no_hit, -1, -proj - T.sqrt(disc))
Exemple #31
0
    def IoU_flatt(y_true, y_pred):
        '''Expects a binary class matrix instead of a vector of scalar classes.

        Flattens predictions and targets to one label per pixel, then
        returns a dict with per-class intersection ('I<i>') and union
        ('U<i>') counts plus overall pixel accuracy ('acc'), ignoring
        pixels whose ground truth is in `void_labels` (closure vars:
        dim_ordering, void_labels, n_classes).
        '''
        if dim_ordering == 'th':
            # Move channels last so the reshape below yields (pixels, classes).
            y_pred = K.permute_dimensions(y_pred, (0, 2, 3, 1))
        shp_y_pred = K.shape(y_pred)
        y_pred = K.reshape(y_pred, (shp_y_pred[0]*shp_y_pred[1]*shp_y_pred[2],
                           shp_y_pred[3]))  # go back to b01,c
        # shp_y_true = K.shape(y_true)
        y_true = K.cast(K.flatten(y_true), 'int32')  # b,01 -> b01
        y_pred = K.argmax(y_pred, axis=-1)

        # We use not_void in case the prediction falls in the void class of
        # the groundtruth
        for i in range(len(void_labels)):
            if i == 0:
                not_void = K.not_equal(y_true, void_labels[i])
            else:
                not_void = not_void * K.not_equal(y_true, void_labels[i])

        sum_I = K.zeros((1,), dtype='float32')

        out = {}
        for i in range(n_classes):
            y_true_i = K.equal(y_true, i)
            y_pred_i = K.equal(y_pred, i)

            if dim_ordering == 'th':
                # Theano backend: boolean tensors multiply/sum directly.
                I_i = K.sum(y_true_i * y_pred_i)
                U_i = K.sum(T.or_(y_true_i, y_pred_i) * not_void)
                # I = T.set_subtensor(I[i], I_i)
                # U = T.set_subtensor(U[i], U_i)
                sum_I = sum_I + I_i
            else:
                # TensorFlow backend: logical ops, then cast before summing.
                U_i = K.sum(K.cast(tf.logical_and(tf.logical_or(y_true_i, y_pred_i), not_void), 'float32'))
                y_true_i = K.cast(y_true_i, 'float32')
                y_pred_i = K.cast(y_pred_i, 'float32')
                I_i = K.sum(y_true_i * y_pred_i)
                sum_I = sum_I + I_i
            out['I'+str(i)] = I_i
            out['U'+str(i)] = U_i

        # Global pixel accuracy over non-void pixels.
        if dim_ordering == 'th':
            accuracy = K.sum(sum_I) / K.sum(not_void)
        else:
            accuracy = K.sum(sum_I) / tf.reduce_sum(tf.cast(not_void, 'float32'))
        out['acc'] = accuracy
        return out
Exemple #32
0
def masked_categorical_crossentropy(output, target, mask, from_logits=False):
    """Categorical cross-entropy that zeroes the loss at masked positions.

    Positions whose one-hot target hits class ``mask`` or class 0 are
    treated as padding and contribute no loss.  The result is wrapped
    in a debug ``Print`` op before being returned.
    """
    if from_logits:
        probs = T.nnet.softmax(output)
    else:
        # Renormalize so class probabilities sum to one per sample.
        probs = output / output.sum(axis=-1, keepdims=True)
    # Clip away exact 0/1 so the log stays finite.
    probs = T.clip(probs, _EPSILON, 1.0 - _EPSILON)

    loss = -T.sum(target * T.log(probs), axis=probs.ndim - 1)

    # Zero out padding positions (target class == mask or == 0).
    padded = T.or_(T.eq(target[:, :, mask], 1), T.eq(target[:, :, 0], 1))
    loss = T.set_subtensor(loss[padded.nonzero()], 0.0)

    return printing.Print('Objective', global_fn=_debug_fn)(loss)
Exemple #33
0
 def gradient_descent(self, loss):
     """Momentum gradient descent with global-norm clipping (cap 5.0)
     and a fallback pseudo-gradient when the norm is not finite."""
     gradients = T.grad(loss, self.params)
     self.momentum_velocity_ = [0.] * len(gradients)
     total_norm = T.sqrt(sum(T.sqr(g).sum() for g in gradients))
     updates = OrderedDict()
     bad_norm = T.or_(T.isnan(total_norm), T.isinf(total_norm))
     denom = T.maximum(5.0, total_norm)
     for idx, (param, g) in enumerate(zip(self.params, gradients)):
         # Non-finite norm: use 0.1 * param as a pseudo-gradient;
         # otherwise rescale so the global norm is at most 5.
         g = T.switch(bad_norm, 0.1 * param, g * (5.0 / denom))
         velocity = self.momentum_velocity_[idx]
         step = self.momentum * velocity - self.learning_rate * g
         self.momentum_velocity_[idx] = step
         updates[param] = param + step
     return updates
Exemple #34
0
    def zoom_itertion_step(x_low, y_low, y_deriv_low, x_high, y_high,
                           x_recent, y_recent, x_star):
        """One iteration of the zoom phase of a Wolfe-condition line
        search: interpolate a trial step inside the current bracket and
        shrink the bracket around an acceptable point.

        Returns the updated bracket state plus a scan `until` condition
        that fires once x_new satisfies the (strong) Wolfe conditions.
        """
        # Trial point via cubic interpolation over the bracket data.
        x_new = cubic_minimizer(x_low, y_low, y_deriv_low,
                                x_high, y_high,
                                x_recent, y_recent)

        y_new = f(x_new)
        y_deriv_new = f_deriv(x_new)

        # Stop when x_new has sufficient decrease, improves on the low
        # end, and meets the curvature condition.
        stop_loop_rule = sequential_and(
            y_new <= y0 + c1 * x_new * y_deriv_0,
            y_new < y_low,
            abs(y_deriv_new) <= -c2 * y_deriv_0,
        )

        # condition1: sufficient decrease violated (or no improvement)
        # -> x_new becomes the new high end of the bracket.
        condition1 = T.or_(
            y_new > y0 + c1 * x_new * y_deriv_0,
            y_new >= y_low
        )
        # condition2: derivative sign places the minimum on the x_low
        # side -> the old low end becomes the new high end.
        condition2 = y_deriv_new * (x_high - x_low) >= zero

        y_recent, x_recent, x_high, y_high = ifelse(
            condition1,
            [y_high, x_high, x_new, y_new],
            ifelse(
                condition2,
                [y_high, x_high, x_low, y_low],
                [y_low, x_low, x_high, y_high],
            )
        )

        # Unless condition1 held, x_new replaces the low end.
        x_low, y_low, y_deriv_low = ifelse(
            condition1,
            [x_low, y_low, y_deriv_low],
            [x_new, y_new, y_deriv_new],
        )
        x_star = x_new

        return (
            [
                x_low, y_low, y_deriv_low,
                x_high, y_high,
                y_recent, x_recent,
                x_star
            ],
            theano.scan_module.scan_utils.until(stop_loop_rule)
        )
Exemple #35
0
    def search_iteration_step(x_previous, x_current, y_previous, y_current,
                              y_deriv_previous, is_first_iteration, x_star):
        """One bracketing iteration of a Wolfe-condition line search.

        Doubles the trial step each iteration until one of three exit
        conditions triggers: sufficient decrease violated (zoom into
        the previous interval), curvature condition met (accept
        x_current), or the derivative turns non-negative (zoom with
        reversed endpoints).
        """

        y_deriv_current = f_deriv(x_current)

        # Next (doubled) trial step.
        x_new = x_current * asfloat(2)
        y_new = f(x_new)

        # Armijo/sufficient-decrease violation, or no improvement on a
        # non-first iteration.
        condition1 = T.or_(
            y_current > (y0 + c1 * x_current * y_deriv_0),
            T.and_(
                y_current >= y_previous,
                bitwise_not(is_first_iteration),
            ))
        # Strong curvature condition satisfied at x_current.
        condition2 = T.abs_(y_deriv_current) <= -c2 * y_deriv_0
        # Derivative non-negative: the minimum has been passed.
        condition3 = y_deriv_current >= zero

        x_star = ifelse(
            condition1,
            zoom(x_previous, x_current, y_previous, y_current,
                 y_deriv_previous, f, f_deriv, y0, y_deriv_0, c1, c2),
            ifelse(
                condition2,
                x_current,
                ifelse(
                    condition3,
                    zoom(x_current, x_previous, y_current, y_previous,
                         y_deriv_current, f, f_deriv, y0, y_deriv_0, c1, c2),
                    x_new,
                ),
            ),
        )
        y_deriv_previous_new = ifelse(condition1, y_deriv_previous,
                                      y_deriv_current)

        is_any_condition_satisfied = sequential_or(condition1, condition2,
                                                   condition3)
        y_current_new = ifelse(is_any_condition_satisfied, y_current, y_new)
        # Stop scanning once any exit condition fired (x_star is final)
        # or the step collapsed to zero.
        return ([
            x_current, x_new, y_current, y_current_new, y_deriv_previous_new,
            theano_false, x_star
        ],
                theano.scan_module.scan_utils.until(
                    sequential_or(
                        T.eq(x_new, zero),
                        is_any_condition_satisfied,
                    )))
Exemple #36
0
def rmsprop(cost, params, learning_rate, momentum=0.5, rescale=5.):
    """RMSProp with momentum, global-norm gradient clipping and a
    non-finite-gradient safeguard.

    Parameters
    ----------
    cost : Theano scalar expression to minimize.
    params : list of Theano shared variables to update.
    learning_rate : base step size.
    momentum : momentum coefficient (Nesterov-style double update).
    rescale : maximum allowed global gradient L2 norm.

    Returns
    -------
    list of (shared_variable, new_value_expression) update pairs.
    """
    grads = T.grad(cost=cost, wrt=params)

    running_square_ = [
        theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                      broadcastable=p.broadcastable) for p in params
    ]
    running_avg_ = [
        theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                      broadcastable=p.broadcastable) for p in params
    ]
    memory_ = [
        theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                      broadcastable=p.broadcastable) for p in params
    ]

    # Global L2 norm of all gradients.
    # BUGFIX: the norm was previously square-rooted a second time
    # (grad_norm = T.sqrt(grad_norm)), so clipping compared `rescale`
    # against sqrt(||g||) instead of ||g||.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Non-finite norm: fall back to 0.1 * param as a pseudo-gradient;
        # otherwise rescale so the global norm is at most `rescale`.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = running_avg_[n]
        new_avg = combination_coeff * old_avg + (1. - combination_coeff) * grad
        # RMS of the centered gradient (Graves-style RMSProp), floored
        # to avoid division blow-ups.
        rms_grad = T.sqrt(new_square - new_avg**2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        # Nesterov-style parameter step using the lookahead velocity.
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad

        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
Exemple #37
0
    def _step(i, pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r):
        """One (double) iteration of a continued-fraction evaluation.

        pk/qk are numerator/denominator convergents advanced by the
        standard two-term recurrence; r = pk/qk is the running value.
        The k1..k8 coefficient schedule looks like a Cephes-style
        incomplete-beta continued fraction — TODO confirm from caller.
        """
        # First fraction term (negative numerator).
        xk = -(x * k1 * k2) / (k3 * k4)
        pk = pkm1 + pkm2 * xk
        qk = qkm1 + qkm2 * xk
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        # Second fraction term (positive numerator).
        xk = (x * k5 * k6) / (k7 * k8)
        pk = pkm1 + pkm2 * xk
        qk = qkm1 + qkm2 * xk
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        # Update the running estimate, guarding against qk == 0.
        old_r = r
        r = tt.switch(tt.eq(qk, zero), r, pk / qk)

        # Advance the coefficient schedule.
        k1 += one
        k2 += k26update
        k3 += two
        k4 += two
        k5 += one
        k6 -= k26update
        k7 += two
        k8 += two

        # Rescale the convergents to avoid overflow/underflow; the ratio
        # r = pk/qk is invariant under a common scaling of p and q.
        big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG)
        biginv_cond = tt.or_(tt.lt(tt.abs_(qk), BIGINV),
                             tt.lt(tt.abs_(pk), BIGINV))

        pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2)
        pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1)
        qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2)
        qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1)

        pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2)
        pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1)
        qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2)
        qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1)

        # Converged once the relative change in r drops below THRESH.
        return (
            (pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r),
            until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))),
        )
Exemple #38
0
    def get_clip_rmsprop_updates(self,
                                 params,
                                 cost,
                                 learning_rate,
                                 momentum,
                                 rescale=5.):
        """RMSProp-with-momentum updates with global-norm gradient
        clipping and a non-finite safeguard.

        Accumulator lists are created lazily on first call and stored
        on `self` so successive calls reuse the same state.  Returns an
        OrderedDict mapping each parameter to its updated value.
        """
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        # BUGFIX: this previously tested hasattr(self, "running_average_"),
        # an attribute that is never assigned anywhere, so the accumulators
        # were re-created (i.e. reset) on every call.
        if not hasattr(self, "running_square_"):
            self.running_square_ = [0.] * len(gparams)
            self.running_avg_ = [0.] * len(gparams)
            self.updates_storage_ = [0.] * len(gparams)

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping on the global L2 norm.
        # BUGFIX: the norm was previously square-rooted twice, so the
        # clip threshold compared `rescale` against sqrt(||g||).
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            # Non-finite norm: fall back to 0.1 * param; otherwise
            # rescale so the global norm is at most `rescale`.
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            combination_coeff = 0.9
            minimum_grad = 1e-4
            old_square = self.running_square_[n]
            new_square = combination_coeff * old_square + (
                1. - combination_coeff) * T.sqr(gparam)
            old_avg = self.running_avg_[n]
            new_avg = combination_coeff * old_avg + (
                1. - combination_coeff) * gparam
            # RMS of the centered gradient, floored for stability.
            rms_grad = T.sqrt(new_square - new_avg**2)
            rms_grad = T.maximum(rms_grad, minimum_grad)
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * (gparam /
                                                                 rms_grad)
            self.running_square_[n] = new_square
            self.running_avg_[n] = new_avg
            self.updates_storage_[n] = update_step
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step

        return updates
Exemple #39
0
def clip_grad_remove_nan(grads, clip_c_shared, mt_tparams):
    """Clip gradients to a global norm and replace non-finite ones.

    Returns a (gradients, norm) pair.  Clipping (and the non-finite
    fallback) is skipped entirely when the shared clip constant is
    non-positive.
    """
    sq_norm = 0.
    for g in grads:
        sq_norm += (g * g).sum()
    bad = tensor.or_(tensor.isnan(sq_norm), tensor.isinf(sq_norm))
    norm = tensor.sqrt(sq_norm)
    if clip_c_shared.get_value() <= 0.:
        return grads, norm
    clipped = []
    for g, p in zip(grads, itemlist(mt_tparams)):
        # Rescale to the clip norm only when the norm exceeds it.
        scaled = tensor.switch(sq_norm > (clip_c_shared * clip_c_shared),
                               g / tensor.sqrt(sq_norm) * clip_c_shared, g)
        # Non-finite norm: fall back to a small pseudo-gradient.
        clipped.append(tensor.switch(bad, np.float32(0.1) * p, scaled))
    return clipped, norm
Exemple #40
0
    def distance(self, rayField):
        """
        Return the distance along each ray at which it hits this
        object, or inf where there is no hit.
        """
        rf = self.w2o(rayField)

        proj = T.tensordot(rf.rays, rf.origin, 1)
        ray_sq = T.sum(rf.rays * rf.rays, axis=2)
        disc = self._hit(rf.rays, rf.origin)
        root = T.sqrt(disc)
        # Two quadratic roots; the smaller one is the nearer hit.
        near = T.minimum((-proj - root) / ray_sq, (-proj + root) / ray_sq)
        # Non-positive or NaN discriminant means the ray misses.
        miss = T.or_(disc <= 0, T.isnan(disc))
        return T.switch(miss, float('inf'), near)
Exemple #41
0
    def compute_updates(self, training_cost, params):
        """Build optimizer updates for `params` w.r.t. `training_cost`.

        Gradients are clipped to a global norm of `self.cutoff` (with a
        non-finite fallback), optionally masked so pretrained word
        embeddings stay fixed, then handed to the updater selected by
        `self.updater` ('adagrad', 'adadelta', 'rmsprop' or 'adam').

        Raises Exception for 'sgd' (unimplemented) and unknown updaters.
        """
        updates = {}

        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))

        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []

        norm_gs = T.sqrt(sum(T.sum(g**2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

        for p, g in grads.items():
            # Non-finite norm: fall back to 0.1 * parameter.
            clip_grads.append((p,
                               T.switch(notfinite,
                                        numpy.float32(.1) * p,
                                        g * normalization)))
        grads = OrderedDict(clip_grads)

        if self.initialize_from_pretrained_word_embeddings and self.fix_pretrained_word_embeddings:
            # Keep pretrained word embeddings fixed
            logger.debug("Will use mask to fix pretrained word embeddings")
            grads[self.language_model.W_emb] = grads[
                self.language_model.
                W_emb] * self.language_model.W_emb_pretrained_mask

        else:
            logger.debug("Will train all word embeddings")

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!")
        return updates
def sgdmgc(cost, params, lr=1.0, alpha=0.1, max_magnitude=5.0, infDecay=0.1):
    """SGD with momentum and global-norm gradient clipping.

    Returns (updates, squared_norm); non-finite norms fall back to a
    pseudo-gradient of infDecay * parameter.
    """
    gradients = T.grad(cost=cost, wrt=params)
    updates = []

    norm = norm_gs(params, gradients)
    root = T.sqrt(norm)
    bad = T.or_(T.isnan(root), T.isinf(root))
    # Factor that caps the gradient norm at max_magnitude.
    scale = T.switch(T.ge(root, max_magnitude), max_magnitude / root, 1.)

    for param, g in zip(params, gradients):
        velocity = shared(param.get_value() * 0.)
        g = T.switch(bad, infDecay * param, g * scale)
        new_velocity = velocity * (1.0 - alpha) - alpha * lr * g
        updates.append((velocity, new_velocity))
        updates.append((param, param + new_velocity))

    return updates, norm
Exemple #43
0
    def get_action_results(self, last_states, actions, **kwargs):
        """Apply `actions` to `last_states` for one environment tick and
        return the pair (new_state, observation).

        The state is a per-action boolean "already tried" vector whose
        last column is the terminal flag; once a session terminates its
        state is frozen and no longer changes.
        """

        #unpack state and action
        last_state = check_list(last_states)[0]
        action = check_list(actions)[0]

        #state is a boolean vector: whether or not i-th action
        #was tried already during this session
        #last output[:,end_code] always remains 1 after first being triggered

        #whether session was active before tick
        session_active = T.eq(last_state[:, -1], 0)
        #whether session was terminated by the end of this tick
        session_terminated = T.or_(T.eq(session_active, 0),
                                   in1d(action, self.terminal_action_ids))

        batch_range = T.arange(action.shape[0])
        # Mark the chosen action as tried and record the terminal flag.
        state_after_action = T.set_subtensor(last_state[batch_range, action],
                                             1)
        state_after_action = T.set_subtensor(state_after_action[:, -1],
                                             session_terminated)

        # Sessions that had already ended keep their old state.
        new_state = T.switch(session_active.reshape([-1, 1]),
                             state_after_action, last_state)

        #if allowed to see attribute
        observed_attrs = T.switch(
            state_after_action[:, :self.attributes.shape[1]], self.attributes,
            -1)

        observation = T.concatenate(
            [
                observed_attrs,  #float32[batch,1] response
                T.extra_ops.to_one_hot(
                    action,
                    self.joint_data.shape[1]),  #what action was commited
                session_terminated.reshape(
                    [-1, 1]),  # whether session is terminated by now
            ],
            axis=1)

        return new_state, observation
Exemple #44
0
def clip_gradients_norm(gradients, threshold, parameters, fix_nan=False):
    """Scale every gradient by 1 / max(global_norm, threshold).

    NOTE(review): this divides by max(norm, threshold), so gradients
    are scaled down by 1/threshold even when the norm is already small
    — confirm this is the intended clipping scheme.

    With fix_nan set, a non-finite norm replaces each gradient by a
    pseudo-gradient of 0.1 * parameter (reported via a debug Print).
    """
    flat_sq = T.concatenate([T.sqr(g.flatten()) for g in gradients])
    norm = T.sqrt(flat_sq.sum())
    denom = T.maximum(norm, threshold)
    bad = T.or_(T.isnan(norm), T.isinf(norm)) if fix_nan else None
    result = []
    for idx, g in enumerate(gradients):
        scaled = g / denom
        if fix_nan:
            pseudo = 0.1 * parameters[idx]
            pseudo = Print(
                "NaN detected! Fixing with pseudogradient with mean:",
                ["mean"])(pseudo)
            scaled = T.switch(bad, pseudo, scaled)
        result.append(scaled)
    return result
def adamgc_(cost,
            params,
            lr=0.0002,
            b1=0.1,
            b2=0.01,
            e=1e-8,
            max_magnitude=5.0,
            infDecay=0.1):
    """Adam with bias correction, global-norm gradient clipping and a
    non-finite fallback.  Returns (updates, squared_norm).

    Note: b1/b2 here are (1 - beta1) and (1 - beta2) of the usual Adam
    formulation.
    """
    grads = T.grad(cost, params)
    updates = []

    norm = norm_gs(params, grads)
    root = T.sqrt(norm)
    bad = T.or_(T.isnan(root), T.isinf(root))
    # Factor that caps the gradient norm at max_magnitude.
    scale = T.switch(T.ge(root, max_magnitude), max_magnitude / root, 1.)

    step = shared(floatX(0.))
    step_t = step + 1.
    # Bias-corrected effective learning rate.
    corr1 = 1. - (1. - b1) ** step_t
    corr2 = 1. - (1. - b2) ** step_t
    lr_t = lr * (T.sqrt(corr2) / corr1)

    for p, g in zip(params, grads):
        g = T.switch(bad, infDecay * p, g * scale)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        p_t = p - (lr_t * (m_t / (T.sqrt(v_t) + e)))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((step, step_t))
    return updates, norm
Exemple #46
0
def th_el_nearestd(x1: tt.TensorVariable, x2: tt.TensorVariable,
                   mu: tt.TensorConstant, Ainv: tt.TensorConstant) -> tt.TensorVariable:
    """Minimum distance from the origin to each segment (x1[i], x2[i])
    after back-projection by (mu, Ainv), minimized over the K
    back-projections; returns a length-N vector.

    NOTE(review): semantics inferred from the inline numpy parallels —
    confirm against the numpy version referenced below.
    """
    # See numpy version for info/comments.

    D = x1.shape[1]

    x1g = th_el_backproject_all(x1, mu, Ainv)  # (N, K, D)
    x2g = th_el_backproject_all(x2, mu, Ainv)

    x1gf = x1g.reshape((-1, D))
    x2gf = x2g.reshape((-1, D))
    # x1gf = x1g.reshape(-1, D)
    # x2gf = x2g.reshape(-1, D)

    diff = x2gf - x1gf
    num = -matmul(x1gf.dimshuffle(0, 'x', 1), diff.dimshuffle(0, 1, 'x')).squeeze()
    # num = -np.matmul(x1gf[..., None, :], diff[..., :, None]).squeeze()
    den = matmul(diff.dimshuffle(0, 'x', 1), diff.dimshuffle(0, 1, 'x')).squeeze()
    # den = np.matmul(diff[..., None, :], diff[..., :, None]).squeeze()

    # Parameter of the closest point on the infinite line through the segment.
    t = num / den  # type: tt.TensorVariable

    # Classify: before x1 (tneg), past x2 (tbig), or within the segment (tin).
    tneg = t < 0
    tbig = t > tt.sqrt(den)
    # tbig = t > np.sqrt(den)
    tout = tt.or_(tneg, tbig)
    # tout = np.logical_or(tneg, tbig)
    tin = ~tout
    # tin = np.logical_not(tout)

    d_ = x1gf + diff * t.dimshuffle(0, 'x')
    dpoa = tt.sqrt(tt.sum(d_ * d_, axis=1, keepdims=True)).squeeze() * tin
    # dpoa = np.linalg.norm(x1gf + diff * t[:, None], axis=1) * tin
    dx1 = tt.sqrt(tt.sum(x1gf * x1gf, axis=1, keepdims=True)).squeeze() * tneg
    # dx1 = np.linalg.norm(x1gf, axis=1) * tneg
    dx2 = tt.sqrt(tt.sum(x2gf * x2gf, axis=1, keepdims=True)).squeeze() * tbig
    # dx2 = np.linalg.norm(x2gf, axis=1) * tbig

    # Exactly one of the three terms is non-zero per row.
    d = dpoa + dx1 + dx2
    d = d.reshape(x1g.shape[:-1])  # (N, K)
    return tt.min(d, axis=1)  # (N,)
Exemple #47
0
    def mcmc(ll, *frvs):
        """One Hamiltonian Monte Carlo transition over the free random
        variables `frvs`.

        Performs a single leapfrog step (half-step momentum, full-step
        position, half-step momentum) and a Metropolis test on the
        energy change dH.  Returns [new_ll] + new values, an empty
        update dict, and a scan `until(accept)` condition.
        """
        full_observations = dict(observations)
        full_observations.update(
            dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))

        loglik = -full_log_likelihood(full_observations)

        proposals = free_RVs_prop
        # Hamiltonian: kinetic energy of the momenta plus potential.
        H = tensor.add(*[tensor.sum(tensor.sqr(p))
                         for p in proposals]) / 2. + loglik

        # -- this should be an inner loop
        g = []
        g.append(tensor.grad(loglik, frvs))

        # Half-step momentum update, then full-step position update.
        proposals = [(p - epsilon * gg[0] / 2.) for p, gg in zip(proposals, g)]

        rvsp = [(rvs + epsilon * rvp) for rvs, rvp in zip(frvs, proposals)]

        full_observations = dict(observations)
        full_observations.update(
            dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
        new_loglik = -full_log_likelihood(full_observations)

        # Second half-step momentum update at the new position.
        gnew = []
        gnew.append(tensor.grad(new_loglik, rvsp))
        proposals = [(p - epsilon * gn[0] / 2.)
                     for p, gn in zip(proposals, gnew)]
        # --

        Hnew = tensor.add(*[tensor.sum(tensor.sqr(p))
                            for p in proposals]) / 2. + new_loglik

        # Metropolis accept with probability min(1, exp(-dH)); U is
        # presumably a uniform(0, 1) draw — TODO confirm.
        dH = Hnew - H
        accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

        return [tensor.switch(accept, -new_loglik, ll)] + \
            [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
            {}, theano.scan_module.until(accept)
Exemple #48
0
def compute_updates(training_cost, params, config):
    """Build Adam updates for `params` after clipping gradients to a
    global norm of 1 and guarding against non-finite gradients."""
    updates = []

    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff
    c = np.float32(1.)
    clip_grads = []

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        # Non-finite norm: use 0.1 * parameter as a pseudo-gradient.
        clip_grads.append((p, T.switch(notfinite, np.float32(.1) * p, g * normalization)))

    grads = OrderedDict(clip_grads)

    updates = Adam(grads, config.learning_rate)  # use the Adam gradient-update strategy

    return updates
    def adam(self,
             cost,
             params,
             learning_rate=0.001,
             beta1=0.9,
             beta2=0.999,
             epsilon=1e-8):
        """Adam updates with a total-norm constraint (10) on the
        gradients and a per-parameter non-finite fallback."""
        grads = T.grad(cost=cost, wrt=params)
        grads = total_norm_constraint(grads, 10)

        norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
        bad_norm = T.or_(T.isnan(norm), T.isinf(norm))

        step_prev = theano.shared(utils.floatX(0.))
        updates = OrderedDict()

        step = step_prev + 1
        # Bias-corrected step size.
        alpha = learning_rate * T.sqrt(1 - beta2 ** step) / (1 - beta1 ** step)

        for param, grad in zip(params, grads):
            # Non-finite norm: fall back to a pseudo-gradient.
            grad = T.switch(bad_norm, 0.1 * param, grad)
            value = param.get_value(borrow=True)
            m_state = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                    broadcastable=param.broadcastable)
            v_state = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                    broadcastable=param.broadcastable)

            m_new = beta1 * m_state + (1 - beta1) * grad
            v_new = beta2 * v_state + (1 - beta2) * grad ** 2
            delta = alpha * m_new / (T.sqrt(v_new) + epsilon)

            updates[m_state] = m_new
            updates[v_state] = v_new
            updates[param] = param - delta

        updates[step_prev] = step
        return updates
Exemple #50
0
def anneal_learning_rate(lr, t, method='half-life', **kwargs):
    """Attach a decay schedule to the shared variable `lr` by setting
    its `default_update` expression in terms of the iteration `t`.

    method is one of 'half-life', 'step', 'exponential', 'linear' or
    'inverse'; schedule constants come from kwargs (num_iters, decay,
    step).  Returns None.  Raises TypeError if lr is not shared and
    ValueError for missing kwargs or an unknown method.
    """
    if not isinstance(lr, (T.sharedvar.ScalarSharedVariable, T.sharedvar.TensorSharedVariable)):
        raise TypeError('lr must be a shared variable, got %s.' % type(lr))

    lr_ = lr.get_value()
    if method == 'half-life':
        num_iters = kwargs.pop('num_iters', None)
        decay = kwargs.pop('decay', .1)
        if num_iters is None:
            raise ValueError('num_iters must be provided.')

        # Multiply by `decay` exactly at 1/2 and 3/4 of the run.
        cond = T.cast(T.or_(T.eq(t, num_iters // 2), T.eq(t, 3 * num_iters // 4)), theano.config.floatX)
        lr.default_update = lr * decay * cond + (1. - cond) * lr
    elif method == 'step':
        step = kwargs.pop('step', None)
        decay = kwargs.pop('decay', .5)
        if step is None:
            raise ValueError('step must be provided.')

        # Multiply by `decay` every `step` iterations.
        cond = T.cast(T.eq(T.mod(t, step), 0), theano.config.floatX)
        lr.default_update = lr * decay * cond + (1. - cond) * lr
    elif method == 'exponential':
        decay = kwargs.pop('decay', 1e-4)
        t = T.cast(t, theano.config.floatX)
        # The remaining schedules are stateless: computed from the
        # initial value lr_ rather than the current lr.
        lr.default_update = lr_ * T.exp(-decay * t)
    elif method == 'linear':
        num_iters = kwargs.pop('num_iters', None)
        if num_iters is None:
            raise ValueError('num_iters must be provided.')

        t = T.cast(t, theano.config.floatX)
        lr.default_update = lr_ * (1. - t / np.cast[theano.config.floatX](num_iters))
    elif method == 'inverse':
        decay = kwargs.pop('decay', .01)
        t = T.cast(t, theano.config.floatX)
        lr.default_update = lr_ / (1. + decay * t)
    else:
        raise ValueError('Unknown annealing method.')
Exemple #51
0
    def get_clip_sgd_updates(self, X_sym, y_sym, params, cost, learning_rate,
                             momentum, rescale=5.):
        """Momentum SGD updates with global-norm gradient clipping.

        X_sym / y_sym are unused here but kept for interface parity
        with the other update builders.  Returns an OrderedDict mapping
        each parameter to its updated value.
        """
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping on the global L2 norm.
        # BUGFIX: the norm was previously square-rooted twice, so the
        # clip threshold compared `rescale` against sqrt(||g||).
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            # clip gradient directly, not momentum etc.
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates
Exemple #52
0
    def exe(self, mainloop):
        """
        Normalize the accumulated gradients by the batch size, then
        rescale them so their global norm does not exceed `self.scaler`.
        When `self.check_nan` is set, non-finite gradients are replaced
        with 0.1 * parameter.  Mutates `mainloop.grads` in place.
        """
        grads = mainloop.grads
        g_norm = 0.
        for p, g in grads.items():
            # Average over the batch before measuring the norm.
            g /= T.cast(self.batch_size, dtype=theano.config.floatX)
            grads[p] = g
            g_norm += (g**2).sum()
        if self.check_nan:
            not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
        g_norm = T.sqrt(g_norm)
        # Scale factor <= 1 that caps the global norm at self.scaler.
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        if self.check_nan:
            for p, g in grads.items():
                grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        else:
            for p, g in grads.items():
                grads[p] = g * scaler
        mainloop.grads = grads
Exemple #53
0
    def clip_gradient(self, params, gparams, scalar=5, check_nanF=True):
        """Rescale gradients so their global L2 norm is at most `scalar`.

        When check_nanF is set, a non-finite norm replaces each
        gradient with 0.1 times its parameter.  Mutates and returns
        `gparams` (sequence-to-sequence training helper).
        """
        sq_norm = 0.
        for gparam in gparams:
            sq_norm += (gparam**2).sum()
        if check_nanF:
            bad = T.or_(T.isnan(sq_norm), T.isinf(sq_norm))
        norm = T.sqrt(sq_norm)
        # Factor <= 1 that caps the norm at `scalar`.
        factor = scalar / T.maximum(scalar, norm)
        if check_nanF:
            for i in range(len(gparams)):
                gparams[i] = T.switch(bad, 0.1 * params[i],
                                      gparams[i] * factor)
        else:
            for i in range(len(gparams)):
                gparams[i] = gparams[i] * factor

        return gparams
Exemple #54
0
		def to_weights(d, m, p, prior):
			"""Compute per-word weights over the sentence (ms) dimension,
			normalized per row, with NaN/Inf results zeroed out.

			Shapes (from the inline comments): d indexes into self.dwe,
			m is the sentence-level mask (mw x ms), p the source of the
			word-level mask, prior a mw x ms prior — TODO confirm shapes
			against the caller.
			"""
			hid_inp = self.dwe[d, :] # mw x ms x hd
			if self.is_lstm or self.is_gru:
				logit = T.exp(T.dot(hid_inp, L0)[:,:,p])# (mw x ms) x mw
				mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
				mask = mk.dimshuffle(0, 'x', 'x')
				l2 = logit * mask # mw x ms x mw
				l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m # mw x ms 
				# Row-normalize; rows with zero mass become NaN and are zeroed below.
				w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x')
				w1 = T.switch(T.isnan(w0), 0, w0)
			else:
				if self.lm_mode == 'diag':
					B = hid_inp * Wt.dimshuffle('x', 'x', 0)
					tmp = T.tensordot(B, B.T, axes = 1)
				elif self.lm_mode == 'iden':
					logit = T.tensordot(self.dwe[d, :], self.dwe.T, axes=1)[:,:,d] # mw x ms x mw x ms
					cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0) # 1 x 1 x mw
					logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1), axis=3) / cnt # mw x ms x mw
					logit = T.exp(10*T.switch(T.isnan(logit), 0, logit)) # mw x ms x mw
					logit = T.prod(logit, axis=2) * prior # mw x ms
					sm = T.sum(logit * m, axis=1, keepdims=True) # mw x 1
					logit = (logit * m) / sm # mw x ms
					# 'iden' mode returns directly, bypassing the shared tail below.
					return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)
				else:
					tmp = T.tensordot(T.dot(hid_inp, self.params['Wt']), hid_inp.T, axes=1) # mw x ms x ms x mw
				tmp = T.exp(tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms
				tmp = tmp * m.dimshuffle('x', 'x', 0, 1)
				nrm = T.sum(tmp, axis=3)
				tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x')
				tmp = T.switch(T.isnan(tmp), 0, tmp)
				mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
				tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw
				# Product over words in log-space (zeros mapped to 1 to keep log finite).
				tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms
				tmp = tmp * prior
				tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x')
				w1 = T.switch(T.isnan(tmp), 0, tmp)
			return w1
Exemple #55
0
        def get_idx(q_nbrs, q_mem):
            """Pick the memory index used as the positive sample for the loss.

            Prefers a matching label among the retrieved neighbours and
            falls back to scanning all of memory.  Also returns a
            boolean mask saying whether any label match was found.
            """
            found_in_nbrs = T.any(q_nbrs, axis=1)
            found_in_mem = T.any(q_mem, axis=1)
            found = T.or_(found_in_nbrs, found_in_mem)

            rows = T.arange(nbrs.shape[0])
            # Neighbour hits must be mapped back to full memory indices
            # via `nbrs`; memory hits are used directly.
            idx = T.switch(found_in_nbrs,
                           nbrs[rows, tensor_choose_k(q_nbrs, self.rng, k=1)],
                           tensor_choose_k(q_mem, self.rng, k=1, random=True))

            return (idx, found)
Exemple #56
0
 def update_s(s, alphas, scorematrix, queryseq, blank, t):
     """CTC forward-variable (alpha) update for extended-label position
     s at time t.

     Even s corresponds to a blank; odd s to label queryseq[l] with
     l = (s - 1) // 2.  The s-2 transition is only allowed when the
     label differs from the previous one (standard CTC recursion).
     Returns `alphas` with alphas[s, t] filled in.
     """
     l = (s - 1) // 2
     alphas = ifelse(
         tensor.eq(s % 2, 0),
         ifelse(tensor.eq(s, 0),
                tensor.set_subtensor(
                    alphas[s, t], alphas[s, t - 1] * scorematrix[blank, t]),
                tensor.set_subtensor(
                    alphas[s,
                           t], (alphas[s, t - 1] + alphas[s - 1, t - 1]) *
                    scorematrix[blank, t]),
                name='for_blank_symbol'),
         ifelse(tensor.or_(tensor.eq(s, 1),
                           tensor.eq(queryseq[l], queryseq[l - 1])),
                tensor.set_subtensor(
                    alphas[s,
                           t], (alphas[s, t - 1] + alphas[s - 1, t - 1]) *
                    scorematrix[queryseq[l], t]),
                tensor.set_subtensor(
                    alphas[s, t],
                    (alphas[s, t - 1] + alphas[s - 1, t - 1] +
                     alphas[s - 2, t - 1]) * scorematrix[queryseq[l], t]),
                name='for_same_label_twice'))
     return alphas
Exemple #57
0
def truncated_normal(size, avg, std, lbound, ubound, theano_rng, dtype):
    """Sample from Normal(avg, std) restricted to [lbound, ubound].

    Uses inverse-CDF sampling: draw u ~ Uniform(0, 1) and map it through
    the normal quantile function over the probability mass lying between
    the two bounds.  If numerical error still lands a sample outside the
    interval, fall back to a bound (ubound when avg >= ubound, lbound
    otherwise).
    """
    def cdf(v):
        # Normal CDF evaluated via the error function.
        z = (v - avg) / (std * SQRT2)
        return (0.5 * (1. + T.erf(z))).astype(dtype)

    def quantile(p):
        # Inverse normal CDF; clipping keeps erfinv away from the +/-1
        # singularities.
        q = T.clip(2. * p - 1., -1. + 1e-6, 1. - 1e-6)
        return (avg + std * SQRT2 * T.erfinv(q)).astype(dtype)

    u = theano_rng.uniform(size=size, dtype=dtype)

    # Probability mass of the untruncated normal inside [lbound, ubound].
    mass = cdf(ubound) - cdf(lbound)
    sample = quantile(cdf(lbound) + u * mass)

    # Guard against samples escaping the interval through rounding error:
    # if avg >> ubound return ubound, if avg << lbound return lbound,
    # otherwise keep the inverse-CDF sample.
    fallback = T.switch(avg >= ubound, ubound, lbound)
    return T.switch(T.or_(sample < lbound, sample > ubound), fallback, sample)
Exemple #58
0
    def __init__(self, nc, nf, kwargs):
        """Build a deep bidirectional recurrent tagger with lasagne.

        nc -- number of output classes.
        nf -- number of input features per time step.
        kwargs -- hyper-parameter dict; every name in RDNN.param_names must
            be present (lr, gclip, activation, n_hidden, opt, drates, emb,
            fbmerge, recout, in2out, norm, gnoise, eps, truncate, fbias, ...).

        Side effect: compiles self.train_model, self.predict_model,
        self.train_model_debug, self.compute_cost and self.compute_cost_train.
        """
        assert nf; assert nc
        self.kwargs = extract_rnn_params(kwargs)
        # Copy every declared hyper-parameter onto the instance.
        for pname in RDNN.param_names:
            setattr(self, pname, kwargs[pname])

        self.lr = theano.shared(np.array(self.lr, dtype='float32'), allow_downcast=True)
        self.gclip = False if self.gclip == 0 else self.gclip # mysteriously, we need this line

        # One activation spec per hidden level; the part after '-' names the
        # recurrent layer type (e.g. 'bi-lstm' -> 'lstm').
        self.activation = [self.activation] * len(self.n_hidden)
        self.deep_ltypes = [act_str.split('-')[1] for act_str in self.activation]

        self.opt = getattr(lasagne.updates, self.opt)
        ldepth = len(self.n_hidden)

        # network
        # Gate factories: each call creates fresh (unshared) gate parameters;
        # the forget gate is biased to self.fbias.
        default_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.GlorotUniform(), 
            W_hid=lasagne.init.GlorotUniform())
        
        forget_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.GlorotUniform(), 
            W_hid=lasagne.init.GlorotUniform(),
            b=lasagne.init.Constant(self.fbias))
        
        """default_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), 
            W_hid=lasagne.init.Orthogonal())
        
        forget_gate = lambda : lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
            b=lasagne.init.Constant(self.fbias))"""

        l_in = lasagne.layers.InputLayer(shape=(None, None, nf))
        logging.debug('l_in: {}'.format(lasagne.layers.get_output_shape(l_in)))
        N_BATCH_VAR, MAX_SEQ_LEN_VAR, _ = l_in.input_var.shape # symbolic ref to input_var shape
        # l_mask = lasagne.layers.InputLayer(shape=(N_BATCH_VAR, MAX_SEQ_LEN_VAR))
        l_mask = lasagne.layers.InputLayer(shape=(None, None))
        logging.debug('l_mask: {}'.format(lasagne.layers.get_output_shape(l_mask)))

        # Optional linear (no bias, no nonlinearity) embedding of the inputs.
        curlayer = l_in
        if self.emb:
            l_reshape = lasagne.layers.ReshapeLayer(l_in, (-1, nf))
            logging.debug('l_reshape: {}'.format(lasagne.layers.get_output_shape(l_reshape)))
            l_emb = lasagne.layers.DenseLayer(l_reshape, num_units=self.emb, nonlinearity=None, b=None)
            logging.debug('l_emb: {}'.format(lasagne.layers.get_output_shape(l_emb)))
            l_emb = lasagne.layers.ReshapeLayer(l_emb, (N_BATCH_VAR, MAX_SEQ_LEN_VAR, self.emb))
            logging.debug('l_emb: {}'.format(lasagne.layers.get_output_shape(l_emb)))
            curlayer = l_emb

        # Optional dropout on the (embedded) inputs.
        if self.drates[0] > 0:
            l_in_drop = lasagne.layers.DropoutLayer(curlayer, p=self.drates[0])
            logging.debug('l_drop: {}'.format(lasagne.layers.get_output_shape(l_in_drop)))
            curlayer = l_in_drop
        # Stack of bidirectional recurrent levels; self.layers[i] feeds level i+1.
        self.layers = [curlayer]
        self.blayers = []
        for level, ltype, n_hidden in zip(range(1,ldepth+1), self.deep_ltypes, self.n_hidden):
            prev_layer = self.layers[level-1]
            # NOTE(review): l_forward/l_backward are bound only when ltype is
            # one of the handled cases; an unknown ltype raises NameError below.
            if ltype in ['relu','lrelu', 'relu6', 'elu']:
                LayerType = lasagne.layers.RecurrentLayer
                if ltype == 'relu': nonlin = lasagne.nonlinearities.rectify
                elif ltype == 'lrelu': nonlin = lasagne.nonlinearities.leaky_rectify
                elif ltype == 'relu6': nonlin = lambda x: T.min(lasagne.nonlinearities.rectify(x), 6)
                elif ltype == 'elu': nonlin = lambda x: T.switch(x >= 0, x, T.exp(x) - 1)
                # Vanilla RNNs with identity hidden-to-hidden initialisation.
                l_forward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate,
                        W_hid_to_hid=Identity(), W_in_to_hid=lasagne.init.GlorotUniform(gain='relu'), nonlinearity=nonlin)
                l_backward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate,
                        W_hid_to_hid=Identity(), W_in_to_hid=lasagne.init.GlorotUniform(gain='relu'), nonlinearity=nonlin, backwards=True)
            elif ltype == 'lstm':
                LayerType = lasagne.layers.LSTMLayer
                l_forward = LayerType(prev_layer, n_hidden, ingate=default_gate(),
                    forgetgate=forget_gate(), outgate=default_gate(), mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate)
                l_backward = LayerType(prev_layer, n_hidden, ingate=default_gate(),
                    forgetgate=forget_gate(), outgate=default_gate(), mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate, backwards=True)

            elif ltype == 'gru':
                LayerType = lasagne.layers.GRULayer
                l_forward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate)
                l_backward = LayerType(prev_layer, n_hidden, mask_input=l_mask, grad_clipping=self.gclip, gradient_steps=self.truncate, backwards=True)

            logging.debug('l_forward: {}'.format(lasagne.layers.get_output_shape(l_forward)))
            logging.debug('l_backward: {}'.format(lasagne.layers.get_output_shape(l_backward)))

            # Merge the two directions by concatenation or elementwise sum.
            if self.fbmerge == 'concat':
                l_fbmerge = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
            elif self.fbmerge == 'sum':
                l_fbmerge = lasagne.layers.ElemwiseSumLayer([l_forward, l_backward])
            logging.debug('l_fbmerge: {}'.format(lasagne.layers.get_output_shape(l_fbmerge)))

            # Per-level dropout on the merged output.
            if self.drates[level] > 0:
                l_fbmerge = lasagne.layers.DropoutLayer(l_fbmerge, p=self.drates[level])

            self.blayers.append((l_forward, l_backward))
            self.layers.append(l_fbmerge)
        
        # Optionally feed the (possibly embedded) inputs straight to the output.
        l_fbmerge = lasagne.layers.ConcatLayer([l_fbmerge, curlayer], axis=2) if self.in2out else l_fbmerge

        # Output head: recurrent (recout=1), bidirectional recurrent averaged
        # (recout=2), or a plain per-timestep dense layer (otherwise).
        if self.recout == 1:
            logging.info('using recout:%d.'%self.recout)
            l_out = lasagne.layers.RecurrentLayer(l_fbmerge, num_units=nc, mask_input=l_mask, W_hid_to_hid=Identity(),
                    W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=log_softmax)
                    # W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=lasagne.nonlinearities.softmax) CHANGED
            logging.debug('l_out: {}'.format(lasagne.layers.get_output_shape(l_out)))
        elif self.recout == 2:
            logging.info('using recout:%d.'%self.recout)
            l_fout = lasagne.layers.RecurrentLayer(l_fbmerge, num_units=nc, mask_input=l_mask, W_hid_to_hid=Identity(),
                    W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=log_softmax)
            l_bout = lasagne.layers.RecurrentLayer(l_fbmerge, num_units=nc, mask_input=l_mask, W_hid_to_hid=Identity(),
                    W_in_to_hid=lasagne.init.GlorotUniform(), nonlinearity=log_softmax, backwards=True)
            l_out = lasagne.layers.ElemwiseSumLayer([l_fout, l_bout], coeffs=0.5)
            # l_out = LogSoftMerge([l_fout, l_bout])
            logging.debug('l_out: {}'.format(lasagne.layers.get_output_shape(l_out)))
        else:
            l_reshape = lasagne.layers.ReshapeLayer(l_fbmerge, (-1, self.n_hidden[-1]*(2 if self.fbmerge=='concat' else 1)))
            logging.debug('l_reshape: {}'.format(lasagne.layers.get_output_shape(l_reshape)))
            l_rec_out = lasagne.layers.DenseLayer(l_reshape, num_units=nc, nonlinearity=log_softmax)

            logging.debug('l_rec_out: {}'.format(lasagne.layers.get_output_shape(l_rec_out)))
            l_out = lasagne.layers.ReshapeLayer(l_rec_out, (N_BATCH_VAR, MAX_SEQ_LEN_VAR, nc))
            logging.debug('l_out: {}'.format(lasagne.layers.get_output_shape(l_out)))

        # NOTE(review): l_rec_out is defined only in the else branch above, so
        # this line raises NameError when recout is 1 or 2 -- confirm intent.
        self.l_soft_out = l_rec_out
        self.output_layer = l_out

        target_output = T.tensor3('target_output')
        out_mask = T.tensor3('mask')

        """
        def cost(output):
            return -T.sum(out_mask*target_output*T.log(output))/T.sum(out_mask)
        """
        def cost(output): # expects log softmax output
            # Masked negative log-likelihood, normalised by the mask total.
            return -T.sum(out_mask*target_output*output)/T.sum(out_mask)

        cost_train = cost(lasagne.layers.get_output(l_out, deterministic=False))
        cost_eval = cost(lasagne.layers.get_output(l_out, deterministic=True))


        all_params = lasagne.layers.get_all_params(l_out, trainable=True)
        logging.debug(all_params)

        # NOTE(review): due to conditional-expression precedence this outer
        # lambda returns *another lambda* when recout != 0 (the caller would
        # have to invoke it twice) -- confirm this is intended.
        self.recout_hid2hid = lambda : l_out.get_params() if self.recout == 0 else lambda : l_out.get_params()[-1].get_value()

        grads = T.grad(cost_train, all_params)

        all_grads, total_norm = lasagne.updates.total_norm_constraint(grads, self.norm, return_norm=True)
        #all_grads.append(grads[-2])
        #all_grads.append(grads[-1])
        # If the total gradient norm is NaN/Inf, fall back to a small multiple
        # of the parameter value instead of applying a non-finite update.
        all_grads = [T.switch(T.or_(T.isnan(total_norm), T.isinf(total_norm)), p*0.01 , g) for g,p in zip(all_grads, all_params)]
        
        if self.gnoise:
            # Annealed Gaussian gradient noise with std = nu / (1 + t)^gamma,
            # where e_prev counts completed updates.
            from theano.tensor.shared_randomstreams import RandomStreams
            srng = RandomStreams(seed=1234)
            e_prev = theano.shared(lasagne.utils.floatX(0.))
            nu = 0.01
            gamma = 0.55
            gs = [g + srng.normal(T.shape(g), std=(nu / ((1 + e_prev)**gamma))) for g in all_grads]
            updates = self.opt(gs, all_params, self.lr, self.eps)
            updates[e_prev] = e_prev + 1
        else:
            updates = self.opt(all_grads, all_params, self.lr, self.eps)
        

        logging.info("Compiling functions...")
        self.train_model = theano.function(inputs=[l_in.input_var, target_output, l_mask.input_var, out_mask], outputs=cost_train, updates=updates, allow_input_downcast=True)
        self.predict_model = theano.function(
                inputs=[l_in.input_var, target_output, l_mask.input_var, out_mask],
                outputs=[cost_eval, lasagne.layers.get_output(l_out, deterministic=True)])

        # aux
        self.train_model_debug = theano.function(
                inputs=[l_in.input_var, target_output, l_mask.input_var, out_mask],
                outputs=[cost_train]+lasagne.layers.get_output([l_out, l_fbmerge], deterministic=True)+[total_norm],
                updates=updates)
        self.compute_cost = theano.function([l_in.input_var, target_output, l_mask.input_var, out_mask], cost_eval)
        self.compute_cost_train = theano.function([l_in.input_var, target_output, l_mask.input_var, out_mask], cost_train)
        # self.info_model = theano.function([],recout_hid2hid)
        logging.info("Compiling done.")
Exemple #59
0
    # Relative weight of false positives in the loss (no asymmetry when 1).
    fp_multiplier = 1
    # Squared hinge loss, weighted per class by [fp_multiplier, 1].
    # NOTE(review): the hinge form target * train_output assumes target
    # entries are encoded as +/-1 -- confirm against the data pipeline.
    loss = T.mean(
        T.sqr(T.maximum(0., 1. - target * train_output)) *
        np.asarray([fp_multiplier, 1]))
    # Classification error rate (argmax over the two output columns).
    err = T.mean(T.neq(T.argmax(train_output, axis=1), T.argmax(target,
                                                                axis=1)),
                 dtype=theano.config.floatX)
    # Class convention: face = 0, bg = 1, so predicting 1 when the truth is 0
    # is a false negative for the face class, and vice versa.
    train_1_when_0 = T.sum(T.gt(T.argmax(train_output, axis=1),
                                T.argmax(target, axis=1)),
                           dtype=theano.config.floatX)  # face = 0, bg = 1 : fn
    train_0_when_1 = T.sum(T.lt(T.argmax(train_output, axis=1),
                                T.argmax(target, axis=1)),
                           dtype=theano.config.floatX)  # fp
    # the T.invert function seems to react differently depending on theano versions...
    # True positives for the face class: prediction and truth are both 0.
    train_0_when_0 = T.sum(T.invert(
        T.or_(T.argmax(train_output, axis=1), T.argmax(target, axis=1))),
                           dtype=theano.config.floatX)
    # if this does not work, try
    # train_0_when_0 = batch_size - T.sum(T.or_(T.argmax(train_output,axis=1),T.argmax(target,axis=1))),dtype=theano.config.floatX)
    train_precision = train_0_when_0 / (train_0_when_0 + train_0_when_1
                                        )  # TP/(TP+FP)
    train_recall = train_0_when_0 / (train_0_when_0 + train_1_when_0
                                     )  # TP/(TP+FN)
    if binary:

        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
Exemple #60
0
    def __init__(self, model, state, data):
        """
        :type model: groundhog model class
        :param model: class depicting the model to be optimized

        :type state: dictionary or jobman DD object
        :param state: dictionary containing various hyper-parameters. The
            class will write into this dictionary updates like the current
            training error and so on

        :type data: groundhog dataset object
        :param data: data iterator over which training is done
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        bs = state['bs']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        srng = RandomStreams(self.rng.randint(213))
        # Shared buffers that carry the (possibly clipped) gradients from
        # train_fn to update_fn; one per parameter, same shape, zero-filled.
        self.gs = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name) for p in model.params
        ]
        self.step = 0
        self.bs = bs
        self.state = state
        self.data = data
        self.step_timer = time.time()
        # Shared copies of the model inputs; minibatch data is written into
        # these before each train_fn call ((2,)*ndim is just a placeholder).
        self.gdata = [
            theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype),
                          name=x.name) for x in model.inputs
        ]

        if 'profile' not in self.state:
            self.state['profile'] = 0

        ###################################
        # Step 1. Compile training function
        ###################################
        print 'Constructing grad function'
        loc_data = self.gdata
        lr = TT.scalar('lr')
        self.prop_exprs = [x[1] for x in model.properties]
        self.prop_names = [x[0] for x in model.properties]
        self.update_rules = [x[1] for x in model.updates]
        # Re-express gradients, update rules and monitored properties in
        # terms of the shared input buffers instead of the symbolic inputs.
        rval = theano.clone(model.param_grads + self.update_rules + \
                            self.prop_exprs + [model.train_cost],
                            replace=zip(model.inputs, loc_data))
        nparams = len(model.params)
        nouts = len(self.prop_exprs)
        nrules = len(self.update_rules)
        # Slice the cloned list back into its three groups.
        gs = rval[:nparams]
        rules = rval[nparams:nparams + nrules]
        outs = rval[nparams + nrules:]

        # NOTE(review): norm_gs is a *sum of squares* (no sqrt), so the cutoff
        # below is applied to the squared gradient norm -- confirm that
        # state['cutoff'] is calibrated for that.
        norm_gs = sum(
            TT.sum(x**2) for x, p in zip(gs, self.model.params)
            if p not in self.model.exclude_params_for_norm)
        if 'cutoff' in state and state['cutoff'] > 0:
            c = numpy.float32(state['cutoff'])
            if state['cutoff_rescale_length']:
                # Scale the cutoff with the length of the first input.
                c = c * TT.cast(loc_data[0].shape[0], 'float32')

            # If the gradient norm is NaN/Inf, replace each gradient with
            # 0.1 * parameter instead of applying a non-finite update.
            notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs))
            _gs = []
            for g, p in zip(gs, self.model.params):
                if p not in self.model.exclude_params_for_norm:
                    # Rescale by c / norm_gs when over the cutoff.
                    tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g)
                    _gs.append(
                        TT.switch(notfinite,
                                  numpy.float32(.1) * p, tmpg))
                else:
                    _gs.append(g)
            gs = _gs

        store_gs = [(s, g) for s, g in zip(self.gs, gs)]
        updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)]
        print 'Compiling grad function'
        st = time.time()
        # train_fn: reads the shared input buffers, stores gradients into
        # self.gs, applies the model's own update rules, and returns the
        # monitored properties followed by the training cost.
        self.train_fn = theano.function([],
                                        outs,
                                        name='train_function',
                                        updates=updates,
                                        givens=zip(model.inputs, loc_data),
                                        profile=self.state['profile'])
        print 'took', time.time() - st

        self.lr = numpy.float32(state['lr'])
        # Plain SGD step using the stored gradients: p <- p - scale * lr * g.
        new_params = [
            p - s * lr * g
            for s, p, g in zip(model.params_grad_scale, model.params, self.gs)
        ]
        self.update_fn = theano.function([lr], [],
                                         name='update_function',
                                         allow_input_downcast=True,
                                         updates=zip(model.params, new_params),
                                         profile=self.state['profile'])

        self.old_cost = 1e20
        self.schedules = model.get_schedules()
        # Names of the values reported per step (properties + bookkeeping).
        self.return_names = self.prop_names + \
                ['cost',
                 'time_step',
                 'whole_time',
                  'lr']