Example #1
    def marginalize_over_v_z(self, h):
        # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})

        # In theory we should use the following line:
        # energy = (h * self.b).T
        # However, when broadcasting is involved, Theano's element-wise multiplication
        # of np.NaN by 0 yields 0 instead of np.NaN, so we use T.tensordot and
        # T.diagonal instead as a workaround.
        # See Theano issue #3848 (https://github.com/Theano/Theano/issues/3848)
        energy = T.tensordot(h, self.b, axes=0)
        energy = T.diagonal(energy, axis1=1, axis2=2).T

        if self.penalty == "softplus_bi":
            energy = energy - self.beta * T.log(1 + T.exp(self.b))[:, None]

        elif self.penalty == "softplus0":
            energy = energy - self.beta * T.log(1 + T.exp(0))[:, None]

        else:
            raise NameError("Invalid penalty term")

        energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
        energy = T.sum(energy, axis=0, keepdims=True).T

        ener = T.tensordot(h, self.W, axes=0)
        ener = T.diagonal(ener, axis1=1, axis2=2)
        ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
        ener = T.sum(ener, axis=2) + self.c[None, :]
        ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)

        return -(energy + ener)
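The workaround above swaps a broadcasted element-wise product for an outer product plus a diagonal. A minimal NumPy sketch (illustration only, not project code) of the index mechanics: for h of shape (batch, n) and b of shape (n,), tensordot(h, b, axes=0) has shape (batch, n, n), and its diagonal over the last two axes recovers h[k, i] * b[i], the same values as h * b, with NaNs propagating.

import numpy as np

h = np.array([[1.0, np.nan],
              [3.0, 4.0]])          # shape (batch, n)
b = np.array([2.0, 5.0])            # shape (n,)

outer = np.tensordot(h, b, axes=0)  # outer[k, i, j] = h[k, i] * b[j]
prod = np.diagonal(outer, axis1=1, axis2=2)  # prod[k, i] = h[k, i] * b[i]

print(prod)   # the NaN in h stays NaN in the product
print(h * b)  # same values as the plain element-wise product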
Example #2
    def updates(self, cost):
        grad = T.grad(cost, self.param)
        grad2 = hessian_diagonal(cost, self.param, grad=grad)
        # calculate memory constants
        tau_rec = 1.0 / self.tau
        tau_inv_rec = 1.0 - tau_rec

        # new moving average of gradient
        g_avg_new = tau_inv_rec * self.g_avg + tau_rec * grad
        # new moving average of squared gradient
        v_avg_new = tau_inv_rec * self.v_avg + tau_rec * grad**2
        # new moving average of hessian diagonal
        h_avg_new = tau_inv_rec * self.h_avg + tau_rec * T.abs_(grad2)

        rate_unsafe = (g_avg_new ** 2) / (v_avg_new * h_avg_new)
        rate = T.switch(T.isinf(rate_unsafe) | T.isnan(rate_unsafe), self.learning_rate, rate_unsafe)

        tau_unsafe = (1 - (g_avg_new ** 2) / v_avg_new) * self.tau + 1
        tau_new = T.switch(T.isnan(tau_unsafe) | T.isinf(tau_unsafe), self.tau, tau_unsafe)

        return [(self.g_avg, g_avg_new),
                (self.v_avg, v_avg_new),
                (self.h_avg, h_avg_new),
                (self.tau, tau_new),
                (self.last_grad, grad),
                (self.last_grad2, grad2),
                (self.last_rate, rate),
                (self.param, self.param - rate * grad)]
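The updates above keep exponential moving averages of the gradient, the squared gradient and the Hessian diagonal, and derive a per-parameter rate g_avg**2 / (v_avg * h_avg) with a fallback learning rate whenever that ratio is NaN or Inf (for example on the first step, when the averages are still zero). A hedged NumPy sketch of that guard; the names and numbers are illustrative, not taken from the class above.

import numpy as np

# two parameters: the first has no accumulated statistics yet, the second has some
g_avg = np.array([0.0, 0.25])
v_avg = np.array([0.0, 0.125])
h_avg = np.array([0.0, 0.10])
fallback_rate = 0.01

with np.errstate(divide="ignore", invalid="ignore"):
    rate_unsafe = g_avg ** 2 / (v_avg * h_avg)   # 0/0 -> nan for the first entry

rate = np.where(np.isfinite(rate_unsafe), rate_unsafe, fallback_rate)
print(rate)   # [0.01  5.  ]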
Example #3
    def from_partial(self, X, dX):
        eps = 1e-10
        U, S, V = X
        dU, dS, dV = dX

        # indicators of nan/inf values
        umask = 1 - (1 - tensor.isnan(dU)) * (1 - tensor.isinf(dU))
        vmask = 1 - (1 - tensor.isnan(dV)) * (1 - tensor.isinf(dV))

        # U S V => U mask product by columns, V by rows
        smask = 1 - tensor.prod(1 - umask, axis=0) * tensor.prod(1 - vmask, axis=1)
        S = tensor.diag(S)

        dU = tensor.set_subtensor(dU[umask.nonzero()], 0.0)
        S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0)
        S_pinv = tensor.set_subtensor(S_pinv[smask.nonzero()], 0.0)
        S_pinv = tensor.diag(S_pinv)
        dV = tensor.set_subtensor(dV[vmask.nonzero()], 0.0)
        ZV = dU.dot(S_pinv)
        UtZV = dS
        ZtU = S_pinv.dot(dV)

        Zproj = (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V)))
        return Zproj
Example #4
    def marginalize_over_v_z(self, h):
        # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})

        # In theory we should use the following line:
        # energy = (h * self.b).T
        # However, when broadcasting is involved, Theano's element-wise multiplication
        # of np.NaN by 0 yields 0 instead of np.NaN, so we use T.tensordot and
        # T.diagonal instead as a workaround.
        # See Theano issue #3848 (https://github.com/Theano/Theano/issues/3848)
        energy = T.tensordot(h, self.b, axes=0)
        energy = T.diagonal(energy, axis1=1, axis2=2).T

        if self.penalty == "softplus_bi":
            energy = energy - self.beta * T.log(1 + T.exp(self.b))[:, None]

        elif self.penalty == "softplus0":
            energy = energy - self.beta * T.log(1 + T.exp(0))[:, None]

        else:
            raise NameError("Invalid penalty term")

        energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()],
                                 0)  # Remove NaN
        energy = T.sum(energy, axis=0, keepdims=True).T

        ener = T.tensordot(h, self.W, axes=0)
        ener = T.diagonal(ener, axis1=1, axis2=2)
        ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
        ener = T.sum(ener, axis=2) + self.c[None, :]
        ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)

        return -(energy + ener)
Example #5
    def get_f_scores(self):
        prediction = self.get_predictions(0.5)  # 0.5 is an arbitrary threshold

        # Different computation for R, P and F with the autoencoder
        true_pos = T.sum(prediction & self.x_as_int, axis=0)
        pos = T.sum(self.x_as_int, axis=0)
        predicted_pos = T.sum(prediction, axis=0)

        # If pos==0 (no actual positives) recall is undefined, so the numerator becomes NaN;
        # using 1 as the denominator there simply avoids the division by zero
        recalls = T.switch(T.eq(pos, 0), float('nan'), true_pos) / T.switch(T.eq(pos, 0), 1., pos)
        # Simple way out of div zero: wherever predicted_pos==0 we're setting num directly, so 1 denom is fine
        precisions = T.switch(
            T.eq(predicted_pos, 0) & T.eq(pos, 0),
            float('nan'),  # Don't penalize precision if there are no positives
            true_pos / T.switch(T.eq(predicted_pos, 0), 1., predicted_pos)
        )
        f_scores = T.switch(
            T.isnan(precisions) | T.isnan(recalls),
            float('nan'),
            2. * precisions * recalls /
            T.switch(
                precisions + recalls > 0,
                precisions + recalls,
                1.
            ),
        )
        return f_scores, precisions, recalls
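The switches above leave precision, recall and F1 as NaN where they are undefined instead of silently reporting 0 or 1. A hedged NumPy sketch of the same guarded divisions on a toy batch (illustration only).

import numpy as np

prediction = np.array([[1, 0, 0],
                       [1, 0, 0]])
target = np.array([[1, 0, 0],
                   [0, 0, 1]])

true_pos = (prediction & target).sum(axis=0)
pos = target.sum(axis=0)
predicted_pos = prediction.sum(axis=0)

recalls = np.where(pos == 0, np.nan, true_pos) / np.where(pos == 0, 1, pos)
precisions = np.where((predicted_pos == 0) & (pos == 0), np.nan,
                      true_pos / np.where(predicted_pos == 0, 1, predicted_pos))
denom = np.where(precisions + recalls > 0, precisions + recalls, 1.0)
f_scores = np.where(np.isnan(precisions) | np.isnan(recalls), np.nan,
                    2.0 * precisions * recalls / denom)
print(recalls, precisions, f_scores)   # NaN marks the undefined columns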
Example #6
    def updates(self, cost):
        grad = T.grad(cost, self.param)
        grad2 = hessian_diagonal(cost, self.param, grad=grad)
        # calculate memory constants
        tau_rec = 1.0 / self.tau
        tau_inv_rec = 1.0 - tau_rec

        # new moving average of gradient
        g_avg_new = tau_inv_rec * self.g_avg + tau_rec * grad
        # new moving average of squared gradient
        v_avg_new = tau_inv_rec * self.v_avg + tau_rec * grad**2
        # new moving average of hessian diagonal
        h_avg_new = tau_inv_rec * self.h_avg + tau_rec * T.abs_(grad2)

        rate_unsafe = (g_avg_new**2) / (v_avg_new * h_avg_new)
        rate = T.switch(
            T.isinf(rate_unsafe) | T.isnan(rate_unsafe), self.learning_rate,
            rate_unsafe)

        tau_unsafe = (1 - (g_avg_new**2) / v_avg_new) * self.tau + 1
        tau_new = T.switch(
            T.isnan(tau_unsafe) | T.isinf(tau_unsafe), self.tau, tau_unsafe)

        return [(self.g_avg, g_avg_new), (self.v_avg, v_avg_new),
                (self.h_avg, h_avg_new), (self.tau, tau_new),
                (self.last_grad, grad), (self.last_grad2, grad2),
                (self.last_rate, rate), (self.param, self.param - rate * grad)]
Example #7
def scaled_cost(x, t):
    sq_error = (x - t) ** 2
    above_thresh_sq_error = sq_error[(t > THRESHOLD).nonzero()]
    below_thresh_sq_error = sq_error[(t <= THRESHOLD).nonzero()]
    above_thresh_mean = above_thresh_sq_error.mean()
    below_thresh_mean = below_thresh_sq_error.mean()
    above_thresh_mean = ifelse(T.isnan(above_thresh_mean), 0.0, above_thresh_mean)
    below_thresh_mean = ifelse(T.isnan(below_thresh_mean), 0.0, below_thresh_mean)
    return (above_thresh_mean + below_thresh_mean) / 2.0
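scaled_cost averages the squared error separately over targets above and below THRESHOLD, and uses ifelse to replace the NaN that the mean of an empty group produces. A hedged NumPy sketch of the same handling; THRESHOLD and the inputs are made up for illustration.

import numpy as np

THRESHOLD = 0.5

def scaled_cost_np(x, t):
    sq_error = (x - t) ** 2
    above_mean = sq_error[t > THRESHOLD].mean()   # NaN (plus a warning) if no targets above
    below_mean = sq_error[t <= THRESHOLD].mean()  # NaN if no targets below
    above_mean = 0.0 if np.isnan(above_mean) else above_mean
    below_mean = 0.0 if np.isnan(below_mean) else below_mean
    return (above_mean + below_mean) / 2.0

# no target exceeds THRESHOLD here, so only the "below" mean contributes
print(scaled_cost_np(np.array([0.2, 0.3]), np.array([0.1, 0.2])))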
Example #8
    def __init__(self,
                 labels,
                 g=0.1,
                 m=0.01,
                 feature_dimension=128,
                 n_codewords=16,
                 n_feature_samples=100,
                 eta=0.01):
        """
        The labels of the objects used for the optimization.
        The objects must be in the same order when the fit function is called
        :param labels: labels of the objects used for the optimization
        :param g: BoW quantization parameter
        :param m: entropy softness parameter
        :param feature_dimension: dimension of the extracted feature vectors
        :param n_codewords: number of codewords in the dictionary
        :param n_feature_samples: number of feature vectors to use in each iteration
        :param eta: learning rate
        """

        SoftBoW.__init__(self,
                         g=g,
                         feature_dimension=feature_dimension,
                         n_codewords=n_codewords)

        self.entropy = SoftEntropy(m=m, labels=labels)
        self.entropy_loss = None
        self.learning_rate = eta
        self.n_feature_samples = n_feature_samples

        # Histograms
        self.S = self._sym_histograms(self.X)

        # Entropy loss
        self.entropy_loss = self.entropy._sym_entropy(self.S)

        # Compile loss function
        self.calculate_loss_theano = theano.function([self.X],
                                                     self.entropy_loss)

        # Define gradients w.r.t. V (and take care of NaNs)
        entropy_grad = T.grad(self.entropy_loss, self.S)
        entropy_grad = T.switch(T.isnan(entropy_grad), 0, entropy_grad)
        dictionary_grad = T.grad(self.entropy._sym_entropy(self.S),
                                 self.V,
                                 known_grads={self.S: entropy_grad})
        dictionary_grad = T.switch(T.isnan(dictionary_grad), 0,
                                   dictionary_grad)

        # Define and compile the training function
        self.updates = adam([dictionary_grad], [self.V],
                            learning_rate=self.learning_rate)
        self.train_theano = theano.function(inputs=[self.X],
                                            outputs=[self.entropy_loss],
                                            updates=self.updates)
Example #9
def get_nesterov_sgd_updates(param_list, gradients, velocities, lr, mu):
    """Do SGD updates with Nesterov momentum."""
    updates = []
    for p, g, v in zip(param_list, gradients, velocities):
        new_v = mu * v - lr * g
        new_p = p - mu * v + (1 + mu) * new_v
        has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                          T.any(T.isnan(new_v) + T.isinf(new_v)))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        updates.append((v, ifelse(has_non_finite, v, new_v)))
    return updates
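The helper above computes the Nesterov step but rolls both the parameter and the velocity back to their previous values whenever either new value contains a NaN or an Inf. A hedged NumPy sketch of that rollback for a single parameter (names and constants are illustrative).

import numpy as np

def nesterov_step(p, v, g, lr=0.1, mu=0.9):
    new_v = mu * v - lr * g
    new_p = p - mu * v + (1 + mu) * new_v
    if not (np.all(np.isfinite(new_p)) and np.all(np.isfinite(new_v))):
        return p, v        # keep the old values, as the ifelse branches above do
    return new_p, new_v

print(nesterov_step(np.ones(3), np.zeros(3), np.array([0.1, 0.2, 0.3])))
print(nesterov_step(np.ones(3), np.zeros(3), np.array([0.1, np.nan, 0.3])))  # rolled back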
Example #10
 def to_weight(d, m, p, prior):
     logit = T.tensordot(dwe[d, :], dwe.T,
                         axes=1)[:, :, d]  # mw x ms x mw x ms
     cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0)  # 1 x 1 x mw
     logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1),
                   axis=3) / cnt  # mw x ms x mw
     logit = T.exp(10 *
                   T.switch(T.isnan(logit), 0, logit))  # mw x ms x mw
     logit = T.prod(logit, axis=2) * prior  # mw x ms
     sm = T.sum(logit * m, axis=1, keepdims=True)  # mw x 1
     #mask = T.switch(T.lt(p, 0), 0, 1).dimshuffle(0, 'x') #
     logit = (logit * m) / sm  # mw x ms
     return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)
Example #11
 def predict_logK(self, x, z, params):
     if self.conditional:
         s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x)
         s_z = TT.switch(TT.isnan(z), self.n_idxs - 1, z)
     else:
         s_x = x
         s_z = z
     P_unit = self.unit(params)
     K = TT.dot(P_unit[s_x.flatten().astype('int32')],
                P_unit[s_x.flatten().astype('int32')].T)
     #K_reg = K + 1e-12 * TT.eye(x.shape[0])
     K_new = TT.dot(P_unit[s_x.flatten().astype('int32')],
                    P_unit[s_z.flatten().astype('int32')].T)
     return TT.log(K), TT.log(K_new)
Example #12
 def predict_logK(self, x, z, params):
     if self.conditional:
         s_x = TT.switch(TT.isnan(x), self.n_idxs - 1, x)
         s_z = TT.switch(TT.isnan(z), self.n_idxs - 1, z)
     else:
         s_x = x
         s_z = z
     P_unit = self.unit(params)
     K = TT.dot(P_unit[s_x.flatten().astype('int32')],
                P_unit[s_x.flatten().astype('int32')].T)
     #K_reg = K + 1e-12 * TT.eye(x.shape[0])
     K_new = TT.dot(P_unit[s_x.flatten().astype('int32')],
                    P_unit[s_z.flatten().astype('int32')].T)
     return TT.log(K), TT.log(K_new)
Example #13
    def get_clip_sgd_updates(self,
                             params,
                             cost,
                             learning_rate,
                             momentum,
                             rescale=5.):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping
        # squared norm of all gradients; check finiteness before taking the square root once
        grad_norm = sum(map(lambda x: T.sqr(x).sum(), gparams))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        grad_norm = T.sqrt(grad_norm)
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            # clip gradient directly, not momentum etc.
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates
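The clipping above rescales all gradients jointly so their norm never exceeds rescale, and falls back to 0.1 * param when the norm itself is NaN or Inf. A hedged NumPy sketch of that decision (not part of the original class).

import numpy as np

def clip_grads(params, grads, rescale=5.0):
    norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    if not np.isfinite(norm):
        return [0.1 * p for p in params]          # the not_finite fallback
    scale = rescale / max(rescale, norm)          # <= 1, so it only ever shrinks
    return [g * scale for g in grads]

grads = [np.array([3.0, 4.0]), np.array([12.0])]  # joint norm is 13
print(clip_grads([np.zeros(2), np.zeros(1)], grads))  # rescaled to norm 5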
Example #14
 def graves_rmsprop_updates(self, params, grads, learning_rate=1e-4, alpha=0.9, epsilon=1e-4, chi=0.95):
     """
      Alex Graves' RMSProp [1]_.

      .. math ::

          n_{i} &= \chi n_{i-1} + (1 - \chi) \mathrm{grad}^{2}\\
          g_{i} &= \chi g_{i-1} + (1 - \chi) \mathrm{grad}\\
          \Delta_{i} &= \alpha \Delta_{i-1} - \mathrm{learning\_rate} \cdot \mathrm{grad}
                  / \sqrt{n_{i} - g_{i}^{2} + \epsilon}\\
          w_{i} &= w_{i-1} + \Delta_{i}

      References
      ----------
      .. [1] Graves, Alex.
          "Generating Sequences With Recurrent Neural Networks", p. 23,
          arXiv:1308.0850.
     """
     updates = []
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param, grad)
         old_square = self.running_square_[n]
         old_avg = self.running_avg_[n]
         old_memory = self.memory_[n]
         new_square = chi * old_square + (1. - chi) * grad ** 2
         new_avg = chi * old_avg + (1. - chi) * grad
          new_memory = alpha * old_memory - learning_rate * grad / T.sqrt(
              new_square - new_avg ** 2 + epsilon)
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((old_memory, new_memory))
         updates.append((param, param + new_memory))
     return updates
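The rule keeps running averages of the gradient and its square, divides the gradient by the centred RMS sqrt(n_i - g_i**2 + epsilon), and folds momentum into a memory term. A hedged scalar NumPy sketch of a single step, using the default constants from the signature above.

import numpy as np

w, grad = 1.0, 0.2
n, g, memory = 0.0, 0.0, 0.0                  # running square, running average, memory
chi, alpha, lr, eps = 0.95, 0.9, 1e-4, 1e-4

n = chi * n + (1.0 - chi) * grad ** 2         # running average of grad**2
g = chi * g + (1.0 - chi) * grad              # running average of grad
memory = alpha * memory - lr * grad / np.sqrt(n - g ** 2 + eps)
w = w + memory
print(w, memory)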
Example #15
    def compute_step(self, param, previous_step):
        grad_norm = l2_norm([previous_step])
        not_finite = tensor.or_(tensor.isnan(grad_norm),
                                tensor.isinf(grad_norm))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
Example #16
File: sgd.py Project: frsong/pyrl
    def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                    epsilon=1e-8, grads=None):
        # Gradients
        if grads is None:
            grads = tensor.grad(loss, self.trainables)

        # Clipping
        norm  = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
        m     = theanotools.clipping_multiplier(norm, max_norm)
        grads = [m*g for g in grads]

        # Safeguard against numerical instability
        new_cond = tensor.or_(tensor.or_(tensor.isnan(norm), tensor.isinf(norm)),
                              tensor.or_(norm < 0, norm > 1e10))
        grads = [tensor.switch(new_cond, np.float32(0), g) for g in grads]

        # Safeguard against numerical instability
        #cond  = tensor.or_(norm < 0, tensor.or_(tensor.isnan(norm), tensor.isinf(norm)))
        #grads = [tensor.switch(cond, np.float32(0), g) for g in grads]

        # New values
        t       = self.time + 1
        lr_t    = lr*tensor.sqrt(1. - beta2**t)/(1. - beta1**t)
        means_t = [beta1*m + (1. - beta1)*g for g, m in zip(grads, self.means)]
        vars_t  = [beta2*v + (1. - beta2)*tensor.sqr(g) for g, v in zip(grads, self.vars)]
        steps   = [lr_t*m_t/(tensor.sqrt(v_t) + epsilon)
                   for m_t, v_t in zip(means_t, vars_t)]

        # Updates
        updates  = [(x, x - step) for x, step in zip(self.trainables, steps)]
        updates += [(m, m_t) for m, m_t in zip(self.means, means_t)]
        updates += [(v, v_t) for v, v_t in zip(self.vars, vars_t)]
        updates += [(self.time, t)]

        return norm, grads, updates
Example #17
    def __init__(self, n_comp=10, verbose=False):

        # Theano initialization
        self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
        self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

        T_p_x_white = T.fmatrix()
        T_lrate = T.fscalar()
        T_block = T.fscalar()
        T_unmixed = T.dot(self.T_weights,T_p_x_white) + T.addbroadcast(self.T_bias,1)
        T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))

        T_out =  self.T_weights +  T_lrate * T.dot(T_block * T.identity_like(self.T_weights) + T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
        T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1,1))
        T_max_w = T.max(self.T_weights)
        T_isnan = T.any(T.isnan(self.T_weights))

        self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                        [T_max_w, T_isnan],
                                        updates=[(self.T_weights, T_out),
                                                 (self.T_bias, T_bias_out)],
                                        allow_input_downcast=True)

        T_matrix = T.fmatrix()
        T_cov = T.dot(T_matrix,T.transpose(T_matrix))/T_block
        self.cov_fun = theano.function([T_matrix, T_block], T_cov, allow_input_downcast=True)
        
        self.loading = None
        self.sources = None
        self.weights = None
        self.n_comp = n_comp
        self.verbose = verbose
Example #18
def rmsprop(params, cost=None, gradients=None, learningrate=0.0005, rho=0.9, epsilon=1e-6):

    # Validate input
    assert not (cost is None and gradients is None), "Update function rmsprop requires either a cost scalar or a " \
                                                     "list of gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam) for dparam in pdC]
    else:
        dC = gradients

    # Init update list
    updates = []

    for param, dparam in zip(params, dC):
        # Check if layer is trainable. Skip if not.
        if not netutils.getbaggage(param, 'trainable', True):
            continue

        paramshape = param.get_value().shape
        acc = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        newacc = rho * acc + (1 - rho) * dparam ** 2
        gradscale = T.sqrt(newacc + epsilon)
        dparam = dparam / gradscale
        updates.append((acc, newacc))
        updates.append((param, param - learningrate * dparam))

    return updates
Example #19
def nan_shield(parameters, deltas, other_updates):
    delta_sum = sum(T.sum(d) for d in deltas)
    not_finite = T.isnan(delta_sum) | T.isinf(delta_sum)
    parameter_updates = [(p, T.switch(not_finite, 0.9 * p, p - d))
                         for p, d in izip(parameters, deltas)]
    other_updates = [(p, T.switch(not_finite, p, u)) for p, u in other_updates]
    return parameter_updates, other_updates
Example #20
        def lda_logp(rt, gaze, values, error_lls, s_condition_index,
                     s_subject_index, v_condition_index, v_subject_index,
                     tau_condition_index, tau_subject_index,
                     gamma_condition_index, gamma_subject_index,
                     t0_condition_index, t0_subject_index, zerotol):

            # compute drifts
            drift = glam.components.expdrift(
                v[tt.cast(v_subject_index, dtype='int32'),
                  tt.cast(v_condition_index, dtype='int32')][:, None],
                tau[tt.cast(tau_subject_index, dtype='int32'),
                    tt.cast(tau_condition_index, dtype='int32')][:, None],
                gamma[tt.cast(gamma_subject_index, dtype='int32'),
                      tt.cast(gamma_condition_index, dtype='int32')][:, None],
                values, gaze, zerotol)
            glam_ll = glam.components.tt_wienerrace_pdf(
                rt[:, None], drift,
                s[tt.cast(s_subject_index, dtype='int32'),
                  tt.cast(s_condition_index, dtype='int32')][:, None], b,
                t0[tt.cast(t0_subject_index, dtype='int32'),
                   tt.cast(t0_condition_index, dtype='int32')][:,
                                                               None], zerotol)

            # mix likelihoods
            mixed_ll = ((1 - p_error) * glam_ll +
                        p_error * error_lls[subject_idx])

            mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
            mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
            return tt.sum(tt.log(mixed_ll + zerotol))
Example #21
def adadelta(cost,
             parameters,
             param_clip=0,
             l_r=1.0,
             decay=0.95,
             consider_constant=None):
    """ 
    Each element of parameters is an array with 4 elements:
        param, update, hist_grad, hist_update
    """
    updates_for_func = OrderedDict()
    for param, update, hist_grad, hist_update in parameters:
        gparam = T.grad(cost, param, consider_constant=consider_constant)
        gparam = ifelse(T.isnan(T.sum(gparam)), T.zeros_like(gparam), gparam)
        new_hist_grad = decay * hist_grad + (1 - decay) * (gparam**2)
        new_update = -l_r * T.sqrt(hist_update + 1e-6) / T.sqrt(new_hist_grad +
                                                                1e-6) * gparam
        if (param_clip > 0):
            new_update = T.clip(new_update, -param_clip, param_clip)
        new_param = param + new_update
        new_hist_update = decay * hist_update + (1 - decay) * (new_update**2)

        # Note that the order is important
        updates_for_func[hist_grad] = new_hist_grad
        updates_for_func[update] = new_update
        updates_for_func[param] = new_param
        updates_for_func[hist_update] = new_hist_update

    return updates_for_func
Example #22
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        """
        for p, g in grads.items():
            grads[p] = g / self.batch_size
        g_norm = 0.
        for g in grads.values():
            g_norm += (g**2).sum()
        """
        g_norm = 0.
        for p, g in grads.items():
            g /= self.batch_size
            grads[p] = g
            g_norm += (g**2).sum()
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        for p, g in grads.items():
            grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        mainloop.grads = grads
Example #23
File: ext.py Project: Beronx86/cle
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        g_norm = 0.

        for p, g in grads.items():
            g /= T.cast(self.batch_size, dtype=theano.config.floatX)
            grads[p] = g
            g_norm += (g**2).sum()

        if self.check_nan:
            not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))

        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)

        if self.check_nan:
            for p, g in grads.items():
                grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        else:
            for p, g in grads.items():
                grads[p] = g * scaler

        mainloop.grads = grads
Example #24
    def unet_crossentropy_loss_sampled(y_true, y_pred):
        epsilon = 1.0e-4
        y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0-epsilon))
        y_true = T.flatten(y_true)
        # this seems to work
        # it is super ugly though and I am sure there is a better way to do it
        # but I am struggling with theano to cooperate
        # filter the right indices
        classPos = 1
        classNeg = 0
        indPos   = T.eq(y_true, classPos).nonzero()[0]
        indNeg   = T.eq(y_true, classNeg).nonzero()[0]
        #pos      = y_true[ indPos ]
        #neg      = y_true[ indNeg ]

        # shuffle
        n = indPos.shape[0]
        indPos = indPos[UNET.srng.permutation(n=n)]
        n = indNeg.shape[0]
        indNeg = indNeg[UNET.srng.permutation(n=n)]
        # take equal number of samples depending on which class has less
        n_samples = T.cast(T.min([ indPos.shape[0], indNeg.shape[0]]), dtype='int64')
        #n_samples = T.cast(T.min([T.sum(y_true), T.sum(1-y_true)]), dtype='int64')

        indPos = indPos[:n_samples]
        indNeg = indNeg[:n_samples]
        #loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1-y_pred_clipped[indNeg]))
        loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(y_pred_clipped[indNeg]))
        loss_vector = T.clip(loss_vector, epsilon, 1.0-epsilon)
        average_loss = T.mean(loss_vector)
        # a Python `if` cannot branch on a symbolic value; use T.switch on the graph instead
        average_loss = T.switch(T.isnan(average_loss),
                                T.mean(y_pred_clipped[indPos]),
                                average_loss)
        return average_loss
Example #25
        def lda_logp(rt, gaze, values, error_ll, v_index, tau_index,
                     gamma_index, s_index, t0_index, is_multiplicative,
                     zerotol):

            # compute drifts
            ## Select the right drift function
            drift = ifelse(
                is_multiplicative,
                glam.components.tt_drift_multiplicative(
                    v[0, tt.cast(v_index, dtype='int32')][:, None],
                    tau[0, tt.cast(tau_index, dtype='int32')][:, None],
                    gamma[0, tt.cast(gamma_index, dtype='int32')][:, None],
                    values, gaze, zerotol),
                glam.components.tt_drift_additive(
                    v[0, tt.cast(v_index, dtype='int32')][:, None],
                    tau[0, tt.cast(tau_index, dtype='int32')][:, None],
                    gamma[0, tt.cast(gamma_index, dtype='int32')][:, None],
                    values, gaze, zerotol))
            # drift = driftfun(v[0, tt.cast(v_index, dtype='int32')][:, None],
            #                  tau[0, tt.cast(tau_index, dtype='int32')][:, None],
            #                  gamma[0, tt.cast(gamma_index, dtype='int32')][:, None],
            #                  values,
            #                  gaze,
            #                  zerotol)
            glam_ll = glam.components.tt_wienerrace_pdf(
                rt[:, None], drift,
                s[0, tt.cast(s_index, dtype='int32')][:, None], b,
                t0[0, tt.cast(t0_index, dtype='int32')][:, None], zerotol)

            # mix likelihoods
            mixed_ll = ((1 - p_error) * glam_ll + p_error * error_ll)

            mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
            mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
            return tt.log(mixed_ll + zerotol)
Example #26
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin = bins[i]
        if i == 0:
            binned = T.switch(T.lt(x, bin), i, binned)
        else:
            ineq = T.and_(T.ge(x, bins[i - 1]), T.lt(x, bin))
            binned = T.switch(ineq, i, binned)
    binned = T.switch(T.isnan(x), len(bins), binned)
    return binned
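theano_digitize mirrors numpy.digitize for monotonically increasing bins, and additionally sends NaNs to the overflow bin. A hedged usage sketch, assuming Theano is installed and the theano_digitize above is in scope; note it returns the indices in the input's float dtype rather than as integers.

import numpy as np
import theano
import theano.tensor as T

bins = np.array([0.0, 1.0, 2.0])
x = T.vector("x")
digitize_fn = theano.function([x], theano_digitize(x, bins))

values = np.array([-0.5, 0.5, 1.5, 2.5])
print(digitize_fn(values))        # [0. 1. 2. 3.]
print(np.digitize(values, bins))  # same bin indices, as integers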
Example #27
 def minimize(self, loss, momentum, rescale):
     super(RMSPropOptimizer, self).minimize(loss)
     grads = self.gradparams
      # squared norm of all gradients; check finiteness before taking the square root once
      grad_norm = sum(map(lambda x: T.sqr(x).sum(), grads))
      not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
      grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1E-4
     updates = []
     params = self.params
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - self.lr * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * self.lr * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     
     return updates
Example #28
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin = bins[i]
        if i == 0:
            binned = T.switch(T.lt(x, bin), i, binned)
        else:
            ineq = T.and_(T.ge(x, bins[i - 1]), T.lt(x, bin))
            binned = T.switch(ineq, i, binned)
    binned = T.switch(T.isnan(x), len(bins), binned)
    return binned
Example #29
 def compute_step(self, parameter, previous_step):
     step_sum = tensor.sum(previous_step)
     not_finite = (tensor.isnan(step_sum) +
                   tensor.isinf(step_sum))
     step = tensor.switch(
         not_finite > 0, (1 - self.scaler) * parameter, previous_step)
     return step, []
Example #30
 def updates(self, params, grads, learning_rate, momentum, rescale=5.):
      # squared norm of all gradients; check finiteness before taking the square root once
      grad_norm = sum(map(lambda x: T.sqr(x).sum(), grads))
      not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
      grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1E-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (1. -
                                                  combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg**2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
Example #31
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.0)

    i = shared(floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        # e_t = shared(p.get_value() * 0.)
        # de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05  #*p_t
        # p_t = p_t + de_t
        # updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #32
    def compute_step(self, param, previous_step):
        not_finite = tensor.any(
            tensor.or_(tensor.isnan(previous_step),
                       tensor.isinf(previous_step)))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
Example #33
    def get_gradients(self, model, data, ** kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad ** 2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs,
                               1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
Example #34
File: boew.py Project: zbxzc35/boew
    def sym_entropy(self, S, mapping):
        """
        Defines the symbolic calculation of the soft entropy
        """
        if self.distance == 'euclidean':
            distances = euclidean_distance(S, self.C)
        else:
            distances = cosine_distance(S, self.C)
        Q = T.nnet.softmax(-distances / self.m)

        # Calculates the fuzzy membership vector for each histogram S
        # Q, scan_u = theano.map(fn=self.sym_get_similarity, sequences=[S])

        Nk = T.sum(Q, axis=0)

        H = T.dot(mapping.T, Q)
        P = H / Nk

        entropy_per_cluster = P * T.log2(P)
        entropy_per_cluster = T.switch(T.isnan(entropy_per_cluster), 0,
                                       entropy_per_cluster)
        entropy_per_cluster = entropy_per_cluster.sum(axis=0)

        Rk = Nk / Nk.sum()
        E = -(entropy_per_cluster * Rk).sum()
        return T.squeeze(E)
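The T.isnan switch above zeroes the 0·log2(0) cells of P, which would otherwise poison the entropy. A hedged NumPy sketch of that guard on a tiny soft-assignment matrix (the data is made up for illustration).

import numpy as np

Q = np.array([[1.0, 0.0],     # soft assignments of 3 samples to 2 clusters
              [0.5, 0.5],
              [0.0, 1.0]])
mapping = np.array([[1, 0],   # one-hot class membership of each sample
                    [1, 0],
                    [0, 1]])

Nk = Q.sum(axis=0)            # soft mass per cluster
P = mapping.T.dot(Q) / Nk     # class distribution inside each cluster

with np.errstate(divide="ignore", invalid="ignore"):
    per_cluster = P * np.log2(P)                  # 0 * log2(0) -> nan
per_cluster = np.where(np.isnan(per_cluster), 0.0, per_cluster).sum(axis=0)

Rk = Nk / Nk.sum()
E = -(per_cluster * Rk).sum()
print(E)    # soft entropy of the class mix inside each cluster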
Example #35
 def compute_step(self, parameter, previous_step):
     step_sum = tensor.sum(previous_step)
     not_finite = (tensor.isnan(step_sum) +
                   tensor.isinf(step_sum))
     step = tensor.switch(
         not_finite > 0, (1 - self.scaler) * parameter, previous_step)
     return step, []
Example #36
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)
    
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m) 
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #37
def sgd(cost,
        parameters,
        mom,
        l_r,
        gradient_clip,
        param_clip=0,
        consider_constant=None):
    """ 
    Each element of parameters is a pair:
        param, update
    """
    updates_for_func = OrderedDict()
    for param, update in parameters:
        gparam = T.grad(cost, param, consider_constant=consider_constant)
        gparam = ifelse(T.isnan(T.sum(gparam)), T.zeros_like(gparam), gparam)

        upd = mom * update - l_r * gparam
        if (gradient_clip > 0):
            gradient_len = T.sqrt(T.sum(
                upd**2)) + 0.0000001  # to avoid division by zero
            upd = ifelse(T.lt(gradient_len, gradient_clip), upd,
                         upd / gradient_len * gradient_clip)
        updates_for_func[update] = upd

        new_weight = param + upd
        if (param_clip > 0):
            new_weight = T.clip(new_weight, -param_clip, param_clip)
        updates_for_func[param] = new_weight

    return updates_for_func
Example #38
 def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
     grads = T.grad(cost, params)
      # squared norm of all gradients; check finiteness before taking the square root once
      grad_norm = sum(map(lambda x: T.sqr(x).sum(), grads))
      not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
      grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1e-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
Example #39
    def compute_updates(self, training_cost, params):
        updates = []

        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))

        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []

        norm_gs = T.sqrt(sum(T.sum(g**2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

        for p, g in grads.items():
            clip_grads.append((p,
                               T.switch(notfinite,
                                        numpy.float32(.1) * p,
                                        g * normalization)))

        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!")
        return updates
Example #40
    def get_probs(self, z_p, z_h):

        probs = get_output(self.nn_out, {self.nn_in[0]: z_p, self.nn_in[1]: z_h}, deterministic=True)

        probs = T.switch(T.isnan(probs), 0, probs)

        return probs
Example #41
    def posdef(self, x, diag):
        """ Check to determine positive definiteness of the Kronecker-structured
            covariance matrix. This operation is slow, and is thus not recommended 
            to be called repeatedly as a check during optimization. Rather, the user 
            should use this function as a guide to ensuring positive definiteness 
            of the model for varying values of the kernel parameters. 
            
            Args:
                tensor x: The input coordinates.
                tensor diag: The white noise variances. This should be an NxM 
                    array where N is the length of x and M is the size of 
                    alpha.
                    
            Returns: 
                isposdef: A boolean that is True if the covariance matrix 
                    is positive definite and False otherwise. The user will 
                    need to call ``isposdef.eval()`` to compute the returned value 
                    from the theano tensor variable. 
        """

        diag = tt.as_tensor_variable(diag)
        diag = tt.reshape(diag.T, (1, diag.size))[0]
        x = tt.as_tensor_variable(x)
        T = self.term.value(x[:, None] - x[None, :])
        if 'alpha' in vars(self):
            R = self.alpha[:, None] * self.alpha[None, :]
            K = tt.slinalg.kron(T, R)
        elif 'R' in vars(self):
            K = tt.slinalg.kron(T, self.R)
        chol = tt.slinalg.Cholesky(on_error='nan')
        L = chol(K + tt.diag(diag))
        return tt.switch(tt.any(tt.isnan(L)), np.array(False), np.array(True))
Example #42
    def compute_updates(self, training_cost, params):
        updates = []
         
        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))
        
        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []
        
        norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
         
        for p, g in grads.items():
            clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
        
        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)  
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!") 
        return updates
Example #43
    def __init__(self, n_visible, n_hidden=150, n_hidden_recurrent=100, lr=0.001, l2_norm=None, l1_norm=None):
        (v, v_sample, cost, monitor, params, updates_train,
         v_t, updates_generate, n_steps) = build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent, lr, l2_norm=l2_norm,
                                                        l1_norm=l1_norm)

        for param in params:
            gradient = T.grad(cost, param, consider_constant=[v_sample])

            # remove nan and inf values
            not_finite = T.or_(T.isnan(gradient), T.isinf(gradient))
            gradient = T.switch(not_finite, 0.1 * param, gradient)
            # max_grad = param * 1e-3
            # gradient = T.switch(T.gt(gradient, max_grad), max_grad, gradient)

            # momentum
            # velocity = shared_zeros('velocity_' + str(param.name), param.get_value(borrow=True).shape)
            # update = param - T.cast(lr, dtype=dtype) * gradient
            # x = momentum * velocity + update - param
            # updates_train[velocity] = x
            # updates_train[param] = momentum * x + update

            # rmsprop
            accu = shared_zeros('accu_' + str(param.name), param.get_value(borrow=True).shape)
            accu_new = 0.9 * accu + 0.1 * gradient ** 2
            updates_train[accu] = accu_new
            updates_train[param] = param - (lr * gradient / T.sqrt(accu_new + 1e-6))
        self.params = params
        self.train_function = theano.function([v], monitor, updates=updates_train)
        self.generate_function = theano.function([n_steps], v_t, updates=updates_generate)
Example #44
def clip(grads, threshold, square=True, params=None):
    '''
    Build the computational graph that clips the gradients when their norm exceeds the threshold.

    :type grads: list of theano variables
    :param grads: the gradients to be clipped

    :type threshold: float
    :param threshold: the threshold on the gradient norm

    :type square: bool
    :param square: if True, take the square root of the accumulated squared norm

    :type params: list of theano variables
    :param params: the parameters; used for the fallback when the norm is NaN

    :returns: the clipped gradients and the gradient norm.
    '''
    grads_norm2 = sum(tensor.sum(g**2) for g in grads)
    if square:
        grads_norm2 = tensor.sqrt(grads_norm2)
    grads_clip = [
        tensor.switch(tensor.ge(grads_norm2, threshold),
                      g / grads_norm2 * threshold, g) for g in grads
    ]
    #deal with nan
    grads_clip = [
        tensor.switch(tensor.isnan(grads_norm2), 0.01 * p, g)
        for p, g in zip(params, grads_clip)
    ]
    return grads_clip, grads_norm2
Example #45
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        g_norm = 0.

        for p, g in grads.items():
            g /= T.cast(self.batch_size, dtype=theano.config.floatX)
            grads[p] = g
            g_norm += (g**2).sum()

        if self.check_nan:
            not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))

        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)

        if self.check_nan:
            for p, g in grads.items():
                grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        else:
            for p, g in grads.items():
                grads[p] = g * scaler

        mainloop.grads = grads
Example #46
def adamgc(cost,
           params,
           lr=0.0002,
           b1=0.1,
           b2=0.001,
           e=1e-8,
           max_magnitude=5.0,
           infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude),
                           max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #47
def nan_shield(parameters, deltas, other_updates):
    delta_sum = sum(T.sum(d) for d in deltas)
    not_finite = T.isnan(delta_sum) | T.isinf(delta_sum)
    parameter_updates = [(p, T.switch(not_finite, 0.9 * p, p - d))
                         for p, d in izip(parameters, deltas)]
    other_updates = [(p, T.switch(not_finite, p, u))
                     for p, u in other_updates]
    return parameter_updates, other_updates
Example #48
def clip(clip_size,parameters,gradients):
    grad_mag = T.sqrt(sum(T.sum(T.sqr(g)) for g in gradients))
    exploded = T.isnan(grad_mag) | T.isinf(grad_mag)
    scale = clip_size / T.maximum(clip_size,grad_mag)

    return [ T.switch(exploded,
                    0.1 * p,
                    scale * g
                ) for p,g in zip(parameters,gradients) ]
Example #49
def log_add(lna, lnb):
    """
    Compute the ln(a+b) given {lna,lnb}
    :param
    :return: ln(a+b)
    """
    max_ = tensor.maximum(lna, lnb)
    result = (max_ + tensor.log1p(tensor.exp(lna + lnb - 2 * max_)))   #log1p(x) = log(1+x)
    return tensor.switch(tensor.isnan(result), max_, result)
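log_add uses the identity ln(a+b) = m + ln(1 + e^{ln a + ln b - 2m}) with m = max(ln a, ln b); since ln a + ln b - 2m equals min(ln a, ln b) - m, the exponent is never positive and the exponential cannot overflow, and the final switch catches the case where the subtraction yields NaN (e.g. both inputs -inf). A hedged NumPy check of the identity (not project code).

import numpy as np

def log_add_np(lna, lnb):
    m = np.maximum(lna, lnb)
    with np.errstate(invalid="ignore"):
        result = m + np.log1p(np.exp(lna + lnb - 2 * m))
    return np.where(np.isnan(result), m, result)   # e.g. both inputs are -inf

a, b = 0.3, 1.7
print(log_add_np(np.log(a), np.log(b)), np.log(a + b))   # the two values agree
print(log_add_np(np.array(-np.inf), np.array(-np.inf)))  # -inf, not NaN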
Example #50
    def __init__(self, paramMap, loss, learning_rate): 
        g_mom = {}
        updates = {}
        sqr_gradients = {}
        paramObjLst = paramMap.values()

        obj2Grad = {}

        l2_loss = 0.0

        for param in paramObjLst:
            gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape,dtype=theano.config.floatX))
            g_mom[param] = gparam_mom

            sqr_grad = theano.shared(np.zeros(param.get_value(borrow=True).shape,dtype=theano.config.floatX))

            sqr_gradients[param] = sqr_grad

            l2_loss += T.sum(param**2)

        gradLst = T.grad(loss, paramObjLst)

        for i in range(0, len(paramObjLst)): 
            obj2Grad[paramObjLst[i]] = gradLst[i]

        for param in paramObjLst:
            gparam = g_mom[param]
            sqr_grad = sqr_gradients[param]
            #new_gradient = T.grad(loss, param)

            new_gradient = obj2Grad[param]

            scaling_factor  = 1.0 #T.maximum(1.0, (T.sqrt(T.sum(T.sqr(new_gradient)))))
            #Divide by the norm of the gradient if it is greater than one
            new_gradient = new_gradient / scaling_factor

            new_gradient = T.switch(T.isnan(new_gradient), 0.0, new_gradient)

            mom = 0.7

            learning_rate_use = learning_rate# / (T.sqrt(sqr_grad) + 1.0)

            updates[gparam] = T.cast(mom * gparam - (1.0 - mom) * learning_rate_use * new_gradient, theano.config.floatX)

            updates[sqr_grad] = T.cast(T.clip(sqr_grad + T.abs_(new_gradient), 0.0, 10000.0), theano.config.floatX)

        for param in paramObjLst:
            updated_value = param + updates[g_mom[param]]


            if param.ndim == 2:
                updated_value = normalize(updated_value)

            updates[param] = T.cast(updated_value, theano.config.floatX)

        self.updates = updates
Example #51
def replace_nans(tensor):
    """
    convert nans and infs to float_max.
    convert -infs to float_min.
    """
    tensor = T.switch(T.isnan(tensor), sys.float_info.max, tensor)
    return T.switch(T.isinf(tensor),
                    T.switch(T.lt(tensor, 0),
                             sys.float_info.min,
                             sys.float_info.max),
                    tensor)
Example #52
    def grads(self, cost):
        grad_dict = {}

        grads = T.grad(cost, self.params)
        for param, grad in zip(self.params, grads):
            grad = T.switch(T.isnan(grad), 0.0, grad)
            if param in self.param_masks:
                grad = grad * self.param_masks[param]
            grad_dict[param] = grad

        return grad_dict
Example #53
    def marginalize_over_v_z(self, h):
        # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})

        if self.penalty == "softplus_bi":
            energy = (h * self.b).T - self.beta * T.log(1 + T.exp(self.b))[:, None]
        elif self.penalty == "softplus0":
            energy = (h * self.b).T - self.beta * T.log(1 + T.exp(0))[:, None]
        else:
            raise NameError("Invalid penalty term")

        energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
        energy = T.sum(energy, axis=0, keepdims=True).T

        ener = T.tensordot(h, self.W, axes=0)
        ener = T.diagonal(ener, axis1=1, axis2=2)
        ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
        ener = T.sum(ener, axis=2) + self.c[None, :]
        ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)

        return -(energy + ener)
Example #54
def custom_loss(y_true_mach, y_pred):
    # y_pred = T.clip(y_pred, _EPSILON, 1.0 - _EPSILON)
    # Flatten() is crucial, i don't know why :)
    y_true_mach = y_true_mach.flatten()
    # Find out non -1 targets
    nz = T.neq(y_true_mach, -1).nonzero()[0]

    # loss can be nan if for this position no target is available in the batch
    # replace nan loss with a value near 0
    # May slow down training, take care of this with another approach.
    loss = -T.log(y_pred[nz, T.cast(y_true_mach[nz], "uint16")]).mean()
    return ifelse(T.isnan(loss), 0.0001, loss)
Example #55
    def get_cost_grads_updates(self, x):
        ha, h, ya, y = self.network.propVHV(x, noise_std=self.train_hypers['noise_std'])

        q = T.switch(T.isnan(self.q), h.mean(axis=0),
                     0.9*self.q + 0.1*h.mean(axis=0))

        lamb = T.cast(self.train_hypers['lamb'], self.dtype)
        rho = T.cast(self.train_hypers['rho'], self.dtype)
        cost = ((x - y)**2).mean(axis=0).sum() + lamb*(T.abs_(q - rho)).sum()

        updates = {self.q: q}
        return cost, self.grads(cost), updates
Example #56
def safe_logaddexp(a, b):
    """Symbolic log(exp(a) + exp(b)). The edge case where `a` - `b` is undefined is handled by
    setting the difference to 0. This occurs if both `a` and `b` are +inf or -inf.

    Returns:
        symbolic log(exp(a) + exp(b))
    """
    diff = b - a
    safe_diff = tt.switch(tt.isnan(diff), 0, diff)
    return tt.switch(safe_diff >= 0,
                     b + tt.log1p(tt.exp(-safe_diff)),
                     a + tt.log1p(tt.exp(safe_diff)))
Example #57
def step_clipping(params, gparams, scale=1.0):
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1.0, scale / grad_norm)
    _g = []
    for param, gparam in izip(params, gparams):
        tmp_g = gparam * multiplier
        _g.append(T.switch(notfinite, param * 0.1, tmp_g))

    params_clipping = _g

    return params_clipping
Example #58
    def shadow(self, points, lights):
        """
        Returns whether points are in shadow of this object.

        See: http://en.wikipedia.org/wiki/Line-sphere_intersection
        """
        y = points  # vector from points to our center
        x = T.tensordot(y, -1*lights[0].normed_dir(), 1)
        decider = T.sqr(x) - T.sum(T.mul(y, y), 2) + 1

        # if shadow, below is >= 0
        is_nan_or_nonpos = T.or_(T.isnan(decider), decider <= 0)
        return T.switch(is_nan_or_nonpos, -1, -x - T.sqrt(decider))