Example #1
File: mlp.py Project: caglar/prmlp
    def sgd_updates_adagrad(self, cost, learning_rate):
        """
        Return the dictionary of parameter-specific learning-rate updates
        computed with the AdaGrad algorithm.
        """
        # Initialize the bookkeeping containers.
        accumulators = {}
        e0s = {}
        learn_rates = []
        ups = {}

        # Initialize the accumulator and the base learning rate (epsilon_0)
        # for each parameter.
        for param in self.params:
            accumulators[param] = theano.shared(value=as_floatX(0.),
                                                name="acc_%s" % param.name)
            e0s[param] = as_floatX(learning_rate)

        self.grads = [T.grad(cost, p) for p in self.params]

        # Compute the per-parameter learning rates. The accumulator update
        # holds the L2 norm of the current gradient.
        for param, gp in zip(self.params, self.grads):
            acc = accumulators[param]
            ups[acc] = T.sqrt((gp ** 2).sum())
            learn_rates.append(e0s[param] / ups[acc])

        # Build the parameter updates from the scaled gradients.
        updates = [(p, p - step * gp)
                   for step, p, gp in zip(learn_rates, self.params, self.grads)]
        p_up = dict(updates)
        safe_update(ups, p_up)
        return ups
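
Note that the accumulator in the loop above is reassigned to the L2 norm of the current gradient on every update and never sums squared gradients across steps, so the effective step is the base rate divided by the current gradient norm rather than by the usual running accumulator. For comparison, below is a minimal sketch of the canonical AdaGrad accumulation in the same Theano style; `adagrad_updates` is a hypothetical standalone helper written for illustration (it assumes `params` is a list of Theano shared variables and `cost` a scalar expression) and is not part of the prmlp project.

# Minimal sketch of canonical AdaGrad (hypothetical helper, not from the project).
import numpy
import theano
import theano.tensor as T

def adagrad_updates(params, cost, learning_rate=0.01, epsilon=1e-8):
    updates = []
    for param, grad in zip(params, T.grad(cost, params)):
        # Running sum of squared gradients, shaped like the parameter itself.
        acc = theano.shared(numpy.zeros_like(param.get_value()),
                            name="acc_%s" % param.name)
        new_acc = acc + T.sqr(grad)
        step = learning_rate * grad / (T.sqrt(new_acc) + epsilon)
        updates.append((acc, new_acc))
        updates.append((param, param - step))
    return updates

The resulting (variable, expression) pairs can be passed as the `updates` argument of `theano.function`, just as the dictionary returned by `sgd_updates_adagrad` is.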
Example #2
    def __init__(self, inputs, cost,
            layers, max_col_norm=None,
            loss_based_pooling=False,
            pooling_loss=None,
            learning_rate=0.01,
            momentum=None,
            rmsprop=True,
            adadelta=False,
            center_grads=False,
            rho=0.96,
            epsilon=1e-8,
            use_nesterov=True,
            seed=None,
            rng=None,
            constants=None, **kw):

        self.loss_based_pooling = loss_based_pooling
        self.rng = rng
        params = [layer.W for layer in layers] + [layer.b for layer in layers]
        self.learning_rate = theano.shared(numpy.asarray(learning_rate, dtype=theano.config.floatX))
        self.layers = layers
        self.max_col_norm = max_col_norm
        #Initialize parameters for rmsprop:
        accumulators = OrderedDict({})
        accumulators_mgrad = OrderedDict({})
        exp_sqr_grads = OrderedDict({})
        exp_sqr_ups = OrderedDict({})
        e0s = OrderedDict({})
        learn_rates = []
        from utils import as_floatX

        gparams = []
        for param in params:
            eps_p = numpy.zeros_like(param.get_value())

            accumulators[param] = theano.shared(value=as_floatX(eps_p),
                                                name="acc_%s" % param.name)
            accumulators_mgrad[param] = theano.shared(value=as_floatX(eps_p),
                                                      name="acc_mgrad_%s" % param.name)
            exp_sqr_grads[param] = theano.shared(value=as_floatX(eps_p),
                                                 name="exp_grad_%s" % param.name)
            exp_sqr_ups[param] = theano.shared(value=as_floatX(eps_p),
                                               name="exp_ups_%s" % param.name)
            e0s[param] = as_floatX(learning_rate)
            gparam = T.grad(cost, param, consider_constant=constants)
            gparams.append(gparam)

        updates = OrderedDict({})

        i = 0

        for param, gparam in zip(params, gparams):
            if rmsprop:
                # RMSProp: keep an exponential moving average of the squared
                # gradients; the scaling factor is its square root, summed over
                # axis 0 and clipped below by epsilon.
                acc = accumulators[param]
                rms_grad = rho * acc + (1 - rho) * T.sqr(gparam)

                updates[acc] = rms_grad
                val = T.maximum(T.sqrt(T.sum(rms_grad, axis=0)), epsilon)

                learn_rates.append(e0s[param] / val)

                if center_grads:
                    # Optionally center the gradient by subtracting a moving
                    # average of the raw gradients.
                    acc_mg = accumulators_mgrad[param]
                    mean_grad = rho * acc_mg + (1 - rho) * gparam
                    gparam = gparam - mean_grad
                    updates[acc_mg] = mean_grad
                if momentum and not use_nesterov:
                    memory = theano.shared(param.get_value() * 0.)
                    updates[param] = param - memory
                    updates[memory] = momentum * memory + learn_rates[i] * gparam
                elif use_nesterov:
                    memory = theano.shared(param.get_value() * 0.)
                    new_memo = momentum * memory - e0s[param] * gparam
                    #new_memo = momentum * memory - learn_rates[i] * gparam
                    updates[memory] = new_memo
                    updates[param] = param + (momentum * new_memo - e0s[param] * gparam) / val
                else:
                    updates[param] = param - learn_rates[i] * gparam
                i += 1
            elif adadelta:
                # AdaDelta: maintain running averages of the squared gradients
                # and of the squared updates, and scale each step by the ratio
                # of their RMS values.
                exp_sg = exp_sqr_grads[param]
                exp_su = exp_sqr_ups[param]
                up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gparam)
                updates[exp_sg] = up_exp_sg
                step = -(T.sqrt(exp_su + epsilon) /
                         T.sqrt(up_exp_sg + epsilon)) * gparam
                updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
                updates[param] = param + step
            else:
                if momentum and not use_nesterov:
                    memory = theano.shared(param.get_value() * 0.)
                    updates[param] = param - memory
                    updates[memory] = momentum * memory + learning_rate * gparam
                elif use_nesterov:
                    memory = theano.shared(param.get_value() * 0.)
                    new_memo = momentum * memory - learning_rate * gparam
                    updates[memory] = new_memo
                    updates[param] = param + momentum * new_memo - learning_rate * gparam
                else:
                    updates[param] = param - learning_rate * gparam

        if max_col_norm is not None:
            updates = self.constrain_weights(layers, updates, max_col_norm)

        self.updates = updates
        self._train = theano.function(inputs, outputs=cost, updates=updates)
        self._constrain_inputs = theano.function(inputs, outputs=T.argsort(pooling_loss, axis=0))
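
The constructor above builds the whole update rule symbolically and compiles it into `self._train`. For readers who only need the core rule of the rmsprop branch, here is a minimal standalone sketch under the assumption that `params` is a list of Theano shared variables and `cost` a scalar expression; `rmsprop_updates` is a hypothetical helper written for illustration and omits the gradient centering, momentum/Nesterov, and column-norm options handled above.

# Minimal sketch of plain RMSProp (hypothetical helper, not from the project).
import theano
import theano.tensor as T

def rmsprop_updates(params, cost, learning_rate=0.01, rho=0.96, epsilon=1e-8):
    updates = []
    for param, grad in zip(params, T.grad(cost, params)):
        # Exponential moving average of the squared gradient, one per parameter.
        acc = theano.shared(param.get_value() * 0., name="acc_%s" % param.name)
        new_acc = rho * acc + (1 - rho) * T.sqr(grad)
        step = learning_rate * grad / T.sqrt(new_acc + epsilon)
        updates.append((acc, new_acc))
        updates.append((param, param - step))
    return updates

These pairs can be handed directly to `theano.function(inputs, outputs=cost, updates=rmsprop_updates(params, cost))`, mirroring how the constructor passes its `updates` dictionary to `theano.function`.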