Example #1
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1,2,3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q,self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
                    max_iter = 100,
                    line_search_mode = 'exhaustive',
                    verbose = True,
                    objective = kl,
                    conjugate = True,
                    params = [ p.mu, p.beta, q.mu, q.beta ],
                    param_constrainers = [ p.censor_updates,
                        q.censor_updates ])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:

            if config.floatX == 'float32':
                neg_tol = 4.8e-7
            else:
                neg_tol = 0.

            if kl < - neg_tol:
                raise AssertionError("KL divergence should "
                    "be non-negative but is "+
                    str(kl))

            warnings.warn("KL divergence is not very numerically stable, evidently")

        tol = 6e-5
        if kl > tol:
            print 'kl:',kl
            print 'tol:',tol
        assert kl <= tol
        assert not (kl > tol )
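For reference, the quantity being driven to zero above has a closed form when q and p are diagonal Gaussians parameterized by a mean mu and a precision beta (variance 1/beta). The NumPy sketch below spells that formula out; it is an assumption about what kl_divergence computes, but it is consistent with the test's expectation that the divergence vanishes exactly when q equals p.

import numpy as np

def diag_gaussian_kl(mu_q, beta_q, mu_p, beta_p):
    """KL(q || p) for diagonal Gaussians with means mu and precisions beta."""
    var_q, var_p = 1. / beta_q, 1. / beta_p
    return 0.5 * np.sum(np.log(var_p / var_q)
                        + (var_q + (mu_q - mu_p) ** 2) / var_p
                        - 1.)

rng = np.random.RandomState(0)
mu = rng.randn(5)
beta = rng.uniform(.1, 10., 5)
# identical parameters -> the divergence is exactly zero, the optimum the test checks for
assert np.allclose(diag_gaussian_kl(mu, beta, mu, beta), 0.)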
Example #2
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1,2,3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q,self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
                    objective = kl,
                    params = [ p.mu, p.beta, q.mu, q.beta ],
                    param_constrainers = [ p.censor_updates,
                        q.censor_updates ])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:
            raise AssertionError("KL divergence should "
                    "be non-negative but is "+
                    str(kl))

        tol = 5.4e-5
        assert kl <= tol
        assert not (kl > tol )
Example #3
    def fit(self, params=None, l1=.0, l2=.0):
        """
        Fit the model by minimizing the Leave One Out (LOO) loss using gradient-based optimization.
        """
        loo_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R, self.eta, self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param ** 2)

        regularized_loo_loss = loo_loss + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_loo_loss, params=params, inputs=[], verbose=1)

        minimizer.minimize()
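The fit method above simply adds symbolic L1 and L2 penalties to the LOO loss before handing it to the optimizer. The same construction in plain NumPy, on a made-up quadratic base loss (a sketch for illustration only; the target value 1.0 is arbitrary):

import numpy as np

def regularized_loss(w, l1=0.0, l2=0.0):
    base = np.sum((w - 1.0) ** 2)      # toy base loss standing in for the LOO loss
    L1 = np.sum(np.abs(w))             # lasso-style penalty
    L2 = np.sum(w ** 2)                # ridge-style penalty
    return base + l1 * L1 + l2 * L2

w = np.ones(4)
# with no regularization w == 1 is optimal; the penalties pull the optimum toward zero
assert regularized_loss(w) < regularized_loss(w, l1=.1, l2=.1)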
Example #4
    def createGradientFunctions(self):
        #create
        X = T.dmatrices("X")
        mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f",
                                           "R")
        mu = sharedX(np.random.normal(10, 10, (self.dimTheta, 1)), name='mu')
        logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)),
                           name='logSigma')
        logLambd = sharedX(np.matrix(np.random.uniform(0, 10)),
                           name='logLambd')
        logLambd = T.patternbroadcast(T.dmatrix("logLambd"), [1, 1])
        negKL = 0.5 * T.sum(1 + 2 * logSigma - mu**2 - T.exp(logSigma)**2)
        theta = mu + T.exp(logSigma) * v
        W = theta
        y = X[:, 0]
        X_sim = X[:, 1:]
        f = (T.dot(X_sim, W) + u).flatten()

        gradvariables = [mu, logSigma, logLambd]

        logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 *
                        ((y - f) / (T.exp(logLambd)))**2)

        logp = (negKL + logLike) / self.m

        optimizer = -logp

        self.negKL = th.function([mu, logSigma],
                                 negKL,
                                 on_unused_input='ignore')
        self.f = th.function(gradvariables + [X, u, v],
                             f,
                             on_unused_input='ignore')
        self.logLike = th.function(gradvariables + [X, u, v],
                                   logLike,
                                   on_unused_input='ignore')
        derivatives = T.grad(logp, gradvariables)
        derivatives.append(logp)

        self.gradientfunction = th.function(gradvariables + [X, u, v],
                                            derivatives,
                                            on_unused_input='ignore')
        self.lowerboundfunction = th.function(gradvariables + [X, u, v],
                                              logp,
                                              on_unused_input='ignore')

        self.optimizer = BatchGradientDescent(objective=optimizer,
                                              params=gradvariables,
                                              inputs=[X, u, v],
                                              conjugate=True,
                                              max_iter=1)
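Two pieces of the snippet above are standard variational-inference ingredients: negKL is the (negated) closed-form KL between N(mu, sigma^2) and a unit Gaussian prior, and theta = mu + exp(logSigma) * v is the reparameterization trick with v ~ N(0, 1). A NumPy reading of those two formulas (an interpretation of the expressions above, not this module's API):

import numpy as np

rng = np.random.RandomState(0)
mu = rng.randn(3, 1)
logSigma = rng.uniform(0, 1, (3, 1))

# negated KL( N(mu, sigma^2) || N(0, 1) ), summed over coordinates
neg_kl = 0.5 * np.sum(1 + 2 * logSigma - mu ** 2 - np.exp(logSigma) ** 2)
assert neg_kl <= 0.   # KL divergence is non-negative, so its negation never exceeds zero

# reparameterization: a sample of theta that is differentiable w.r.t. mu and logSigma
v = rng.randn(3, 1)
theta = mu + np.exp(logSigma) * v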
Example #5
    def fit(self, params=None, l1=.0, l2=.0):
        """
        Fit the model by minimizing the Leave One Out (LOO) loss using gradient-based optimization.
        """
        loo_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R,
                                      self.eta, self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param**2)

        regularized_loo_loss = loo_loss + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_loo_loss,
                                         params=params,
                                         inputs=[],
                                         verbose=1)

        minimizer.minimize()
Example #6
        #Get the objective function
        nce = DNCE(noise_distribution)
        J = nce(model,X,Y)

        accs = []
        for Y_i in Y:
            pos_prob = 1./(1.+T.exp(model.free_energy(X)-model.free_energy(Y_i)))
            acc = (pos_prob > .5).mean()
            accs.append(acc)
        acc = sum(accs) / float(len(accs))

        print '\tinit accuracy ',function([],acc)()

        #Minimize the objective function with batch gradient descent
        minimizer = BatchGradientDescent( objective = J,
                                            params = model.get_params(),
                                            param_constrainers = [ model.censor_updates ])

        print '\tinit obj:',minimizer.obj()
        #minimizer.verbose = True
        minimizer.minimize()
        print '\tfinal obj:',minimizer.obj()

        recovered_beta = model.beta.get_value()
        recovered_mu = model.mu.get_value()

        print '\trecovered beta:',recovered_beta
        print '\trecovered mu:',recovered_mu

        kl = kl_divergence(true, model)
        kl = function([],kl)()
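The accuracy check above uses the identity that, for an energy-based model with free energy F and p(x) proportional to exp(-F(x)), the probability that x rather than y was drawn from the model is 1 / (1 + exp(F(x) - F(y))). A standalone NumPy illustration with a toy quadratic free energy (the data and noise here are synthetic stand-ins):

import numpy as np

def free_energy(x):
    return 0.5 * np.sum(x ** 2, axis=1)   # low energy (high probability) near the origin

rng = np.random.RandomState(0)
data = 0.1 * rng.randn(1000, 2)           # concentrated near the origin: low energy
noise = 3.0 * rng.randn(1000, 2)          # spread far from the origin: high energy

pos_prob = 1. / (1. + np.exp(free_energy(data) - free_energy(noise)))
acc = (pos_prob > .5).mean()
assert acc > .9   # the model prefers the data point over the noise point almost always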
Example #7
def test_batch_gradient_descent():
        """ Verify that batch gradient descent works by checking that
        it minimizes a quadratic function f(x) = x^T A x + b^T x + c
        correctly for several sampled values of A, b, and c.
        The ground truth minimizer is x = np.linalg.solve(A,-b)"""

        n = 3

        A = T.matrix(name = 'A')
        b = T.vector(name = 'b')
        c = T.scalar(name = 'c')

        x = sharedX( np.zeros((n,)) , name = 'x')

        half = np.cast[config.floatX](0.5)

        obj = half * T.dot(T.dot(x,A),x)+T.dot(b,x)+c

        minimizer = BatchGradientDescent(
                        objective = obj,
                        params = [ x],
                        inputs = [ A, b, c])

        num_samples = 3

        rng = np.random.RandomState([1,2,3])

        for i in xrange(num_samples):
            A = np.cast[config.floatX](rng.randn(int(1.5 * n), n))
            A = np.cast[config.floatX](np.dot(A.T,A))
            A += np.cast[config.floatX](np.identity(n) * .02)
            b = np.cast[config.floatX](rng.randn(n))
            c = np.cast[config.floatX](rng.randn())
            x.set_value(np.cast[config.floatX](rng.randn(n)))

            analytical_x = np.linalg.solve(A,-b)

            actual_obj = minimizer.minimize(A,b,c)
            actual_x = x.get_value()

            #Check that the value returned by the minimize method
            #is the objective function value at the parameters
            #chosen by the minimize method
            cur_obj = minimizer.obj(A,b,c)
            assert np.allclose(actual_obj, cur_obj)

            x.set_value(analytical_x)
            analytical_obj = minimizer.obj(A,b,c)

            #make sure the objective function is accurate to first 4 digits
            condition1 = not np.allclose(analytical_obj, actual_obj)
            condition2 = np.abs(analytical_obj-actual_obj) >= 1e-4 * np.abs(analytical_obj)

            if (config.floatX == 'float64' and condition1) \
                    or (config.floatX == 'float32' and condition2):
                print 'objective function value came out wrong on sample ',i
                print 'analytical obj', analytical_obj
                print 'actual obj',actual_obj

                """
                The following section of code was used to verify that numerical
                error can make the objective function look non-convex

                print 'Checking for numerically induced non-convex behavior'
                def f(x):
                    return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c

                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()

                x = actual_x.copy()
                prev = f(x)
                print prev
                step_size = 1e-4
                x += step_size * d
                cur = f(x)
                print cur
                cur_sgn = np.sign(cur-prev)
                flip_cnt = 0
                for i in xrange(10000):
                    x += step_size * d
                    prev = cur
                    cur = f(x)
                    print cur
                    prev_sgn = cur_sgn
                    cur_sgn = np.sign(cur-prev)
                    if cur_sgn != prev_sgn:
                        print 'flip'
                        flip_cnt += 1
                        if flip_cnt > 1:
                            print "Non-convex!"

                            from matplotlib import pyplot as plt
                            y = []

                            x = actual_x.copy()
                            for j in xrange(10000):
                                y.append(f(x))
                                x += step_size * d

                            plt.plot(y)
                            plt.show()

                            assert False

                print 'None found'
                """

                #print 'actual x',actual_x
                #print 'A:'
                #print A
                #print 'b:'
                #print b
                #print 'c:'
                #print c
                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                x_grad = minimizer.param_to_grad_shared[x]
                actual_grad =  x_grad.get_value()
                correct_grad = 0.5 * np.dot(A,x.get_value())+ 0.5 * np.dot(A.T, x.get_value()) +b
                if not np.allclose(actual_grad, correct_grad):
                    print 'gradient was wrong at convergence point'
                    print 'actual grad: '
                    print actual_grad
                    print 'correct grad: '
                    print correct_grad
                    print 'max difference: ',np.abs(actual_grad-correct_grad).max()
                    assert False


                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()
                step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \
                        + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d))

                g = np.dot(A,actual_x)+b
                deriv = np.dot(g,d)

                print 'directional deriv at actual', deriv
                print 'optimal step_len', step_len
                optimal_x = actual_x - d * step_len
                g = np.dot(A,optimal_x) + b
                deriv = np.dot(g,d)

                print 'directional deriv at optimal: ',deriv
                x.set_value(optimal_x)
                print 'obj at optimal: ',minimizer.obj(A,b,c)



                print 'eigenvalue range:'
                val, vec = np.linalg.eig(A)
                print (val.min(),val.max())
                print 'condition number: ',(val.max()/val.min())
                assert False
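The ground-truth check in this test rests on the gradient of f(x) = 0.5 x^T A x + b^T x + c being A x + b for symmetric A, so a positive-definite quadratic has the unique minimizer x = np.linalg.solve(A, -b). A compact NumPy restatement of that reasoning:

import numpy as np

rng = np.random.RandomState(0)
n = 3
M = rng.randn(2 * n, n)
A = np.dot(M.T, M) + 0.02 * np.identity(n)    # symmetric positive definite, as in the test
b = rng.randn(n)
c = rng.randn()

def f(x):
    return 0.5 * np.dot(x, np.dot(A, x)) + np.dot(b, x) + c

x_star = np.linalg.solve(A, -b)               # the gradient A x + b vanishes here
assert np.allclose(np.dot(A, x_star) + b, 0.)
# any perturbation increases f, because the Hessian A is positive definite
assert f(x_star) <= f(x_star + 1e-3 * rng.randn(n))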
Example #8
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self, cost, batch_size=None, batches_per_iter=None,
                 updates_per_batch = 10,
                 monitoring_batches=None, monitoring_dataset=None,
                 termination_criterion = None, set_batch_size = False,
                 reset_alpha = True, conjugate = False,
                 min_init_alpha = .001,
                 reset_conjugate = True, line_search_mode = None,
                 verbose_optimization=False, scale_step=1., theano_function_mode=None,
                 init_alpha=None, seed=None):
        """
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to optimization.BatchGradientDescent's
                   max_iter parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None


        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train, loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        self.topo = X.ndim != 2
        Y = T.matrix()
        Y.name = 'BGD_Y'

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)


        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult, dataset=self.monitoring_dataset.values()[0])


        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                batch_size=self.batch_size,
                targets=self.cost.supervised,
                num_batches=self.batches_per_iter,
                topo=self.topo,
                rng = rng)
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    stry = str(Y).replace('\n',' ')
                    mode.record.handle_line('data Y '+stry+'\n')
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, Y)
            else:
                args = [ data ]
                X = data
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, None)
            self.before_step(model)
            self.optimizer.minimize(*args)
            self.after_step(model)
            model.monitor.report_batch( X.shape[0] )

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)

    def before_step(self, model):
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step * param.get_value()
                param.set_value(value)
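Putting the constructor arguments documented above together, a hypothetical configuration of this training algorithm could look like the sketch below. It is only a sketch: my_cost, my_model, train_set, and valid_set are placeholders, and the import path assumes the pylearn2 layout this class is taken from.

from pylearn2.training_algorithms.bgd import BGD

algorithm = BGD(cost=my_cost,                  # a pylearn2 Cost (placeholder)
                batch_size=1000,               # large batches, since each one is line-searched
                updates_per_batch=10,          # forwarded to BatchGradientDescent as max_iter
                conjugate=True,                # use conjugate rather than raw gradient directions
                line_search_mode='exhaustive',
                monitoring_dataset={'valid': valid_set})
algorithm.setup(my_model, dataset=train_set)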
Example #9
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train, loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        self.topo = X.ndim != 2
        Y = T.matrix()
        Y.name = 'BGD_Y'

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)


        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult, dataset=self.monitoring_dataset.values()[0])


        self.first = True
        self.bSetup = True
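Stripped of Theano, monitoring, and constraints, the optimizer that setup() builds is essentially steepest descent with a line search along each (optionally conjugate) direction. Below is a library-free sketch of that core loop, specialized to a quadratic so the line search has a closed form; it illustrates the idea rather than pylearn2's implementation.

import numpy as np

def quad_steepest_descent(A, b, x, max_iter=100):
    """Minimize 0.5 x^T A x + b^T x for symmetric positive definite A
    using steepest descent with an exact line search."""
    for _ in range(max_iter):
        g = np.dot(A, x) + b                              # gradient
        if np.linalg.norm(g) < 1e-10:
            break
        alpha = np.dot(g, g) / np.dot(g, np.dot(A, g))    # exact minimizer along -g
        x = x - alpha * g
    return x

A = np.array([[3., 1.], [1., 2.]])
b = np.array([1., -1.])
x = quad_steepest_descent(A, b, np.zeros(2))
assert np.allclose(x, np.linalg.solve(A, -b), atol=1e-6)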
Example #10
    def setup_impl(self, model, dataset, algorithm):
        cost = algorithm.cost

        root = model.get_param_vector()

        dim = root.size

        rng = self.rng


        points = rng.randn(self.num_points, self.num_basis_vectors)
        points = points.astype(root.dtype)
        points *= self.scale

        if self.include_root:
            points[0, :] = 0.

        if not hasattr(self, 'cost_fn'):
            # Cargo-cult the data_specs boilerplate needed to evaluate the cost function
            # =======================================
            data_specs = cost.get_data_specs(model)
            mapping = DataSpecsMapping(data_specs)
            space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
            source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

            # Build a flat tuple of Theano Variables, one for each space.
            # We want that so that if the same space/source is specified
            # more than once in data_specs, only one Theano Variable
            # is generated for it, and the corresponding value is passed
            # only once to the compiled Theano function.
            theano_args = []
            for space, source in safe_zip(space_tuple, source_tuple):
                name = '%s[%s]' % (self.__class__.__name__, source)
                arg = space.make_theano_batch(name=name,
                                              batch_size=self.batch_size)
                theano_args.append(arg)
            theano_args = tuple(theano_args)

            # Methods of `cost` need args to be passed in a format compatible
            # with data_specs
            nested_args = mapping.nest(theano_args)
            fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
            self.on_load_batch = fixed_var_descr.on_load_batch

            cost_value = cost.expr(model, nested_args,
                                        ** fixed_var_descr.fixed_vars)
            # End cargo culting
            # ======================

            print "Compiling cost function..."
            cost_fn = function(theano_args, cost_value)
            self.cost_fn = cost_fn
        else:
            cost_fn = self.cost_fn

        cost_values = np.zeros(self.num_points)


        data = list(dataset.get_batch_design(self.batch_size,
            include_labels=True))
        from pylearn2.utils.one_hot import one_hot
        data[1] = one_hot(data[1])


        if self.method == 'gaussian':
            basis = rng.normal(size=(dim, self.num_basis_vectors)).astype(root.dtype)
        elif self.method == 'element':
            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                basis[rng.randint(dim), i] = 1.
        elif self.method == 'gradient':
            if not hasattr(self, 'grad_fn'):
                self.grad_fn = function(theano_args, grad(cost_value, model.get_params()))
            grad_fn = self.grad_fn

            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                ipt = list(dataset.get_batch_design(1, include_labels=True))
                label = ipt[1]
                assert label.size == 1
                label = label[0]
                one_hot = np.zeros((1, 10,),dtype='float32')
                one_hot[0, label] = 1
                ipt[1] = one_hot
                g = grad_fn(*ipt)
                basis[:,i] = np.concatenate([e.reshape(e.size) for e in g], axis=0)
        else:
            assert False

        basis /= np.sqrt(np.square(basis).sum(axis=0))

        # Orthogonalize basis
        for i in xrange(self.num_basis_vectors):
            v = basis[:,i ].copy()
            for j in xrange(i):
                u = basis[:, j].copy()
                v -= np.dot(u, v) * u
            norm = np.sqrt(np.square(v).sum())
            assert norm > 1e-4
            v /= norm
            basis[:,i] = v


        for i in xrange(self.num_points):
            print "Evaluating cost at point ", i

            point = points[i, :]
            full_point = root + np.dot(basis, point)
            model.set_param_vector(full_point)

            cost_values[i] = cost_fn(*data)
            print cost_values[i]


        from pylearn2.utils import sharedX
        import theano.tensor as T

        print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!"

        if not hasattr(self, 'fit_quad'):
            points = sharedX(points)
            #from theano import config
            #config.compute_test_value = 'raise'
            cost_values = sharedX(cost_values)
            A = sharedX(np.zeros((self.num_basis_vectors, self.num_basis_vectors)))
            if self.psd:
                mat = T.dot(A.T, A)
            else:
                mat = A
            b = sharedX(np.zeros(self.num_basis_vectors))
            c = sharedX(0.)
            half_quad = T.dot(points, mat)
            quad = (points * half_quad).sum(axis=1)
            lin = T.dot(points, b)
            pred = quad + lin + c

            from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

            mse = T.square(pred - cost_values).mean()
            mae = abs(pred - cost_values).mean()

            obj = locals()[self.fitting_cost]

            fit_quad = BatchGradientDescent(obj, params = [A, b, c],
                    max_iter = self.num_basis_vectors ** 2,
                    verbose = 3, tol = None,
                    init_alpha = None, min_init_alpha = 1e-7,
                    reset_alpha = False, conjugate = True,
                    reset_conjugate = False,
                    line_search_mode = 'exhaustive')
            self.fit_quad = fit_quad
            self.A = A
            self.b = b
            self.c = c
            self.points = points
            self.cost_values = cost_values
        else:
            self.A.set_value(.001 * np.identity(self.A.get_value().shape[0], dtype=self.A.dtype))
            self.b.set_value(self.b.get_value() * 0.)
            self.c.set_value(self.c.get_value() * 0.)
            self.points.set_value(points)
            self.cost_values.set_value(cost_values.astype(self.cost_values.dtype))

        self.fit_quad.minimize()

        print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!"

        if self.use_solver:
            if self.psd:
                Av = self.A.get_value()
                mat_v = np.dot(Av.T, Av)
            else:
                mat_v = self.A.get_value()
            bv = self.b.get_value()

            # minimize for x^T A x + b^T x + c
            # -> solve 2 A x + b = 0
            # Ax = - b / 2

            print "********** mat_v", mat_v.min(), mat_v.max()
            x, ignored_residuals, ignored_rank, ignored_singular_values = np.linalg.lstsq(mat_v, - 0.5 * bv)
            print "********** soln: ", x.min(), x.mean(), x.max()
            print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max()
            assert x.ndim == 1, x.shape
            prod = np.dot(basis, x)
            norm = np.sqrt(np.square(prod).sum())
            print "*************** Moving params by ",norm
            vector = root + prod
            model.set_param_vector(vector)

        else: # use minimizer
            if not hasattr(self, 'fit_params'):
                self.vector = sharedX(points.get_value().mean(axis=0))
                vector = self.vector
                obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

                def constrain(d):
                    assert vector in d
                    n = d[vector]
                    norm = T.sqrt(T.square(n).sum())
                    desired_norm = T.clip(norm, 0., self.max_jump_norm)
                    d[vector] = n * desired_norm / norm

                self.fit_params = BatchGradientDescent(obj, params=[vector],
                    max_iter = self.num_basis_vectors,
                    verbose = 3, tol=None,
                    param_constrainers = [constrain],
                    init_alpha = None, min_init_alpha = 1e-3,
                    reset_alpha=False, conjugate=True, reset_conjugate=False,
                    line_search_mode='exhaustive')
            else:
                self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype))

            self.fit_params.minimize()

            model.set_param_vector(root + np.dot(basis , self.vector.get_value()))
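The "find its minimum" step works because the fitted surrogate x^T M x + b^T x + c has gradient 2 M x + b when M is symmetric, so its minimizer solves M x = -b / 2, which is exactly what the lstsq call computes. A small NumPy check of that reasoning on a synthetic surrogate (not a fitted one):

import numpy as np

rng = np.random.RandomState(0)
k = 4                                    # stands in for num_basis_vectors
R = rng.randn(k, k)
M = np.dot(R.T, R)                       # positive semi-definite, as in the psd branch
b = rng.randn(k)
c = rng.randn()

def surrogate(x):
    return np.dot(x, np.dot(M, x)) + np.dot(b, x) + c

x_star = np.linalg.lstsq(M, -0.5 * b)[0]          # solve 2 M x + b = 0
# the stationary point is a minimum: moving away never decreases the surrogate
probe = x_star + 1e-3 * rng.randn(k)
assert surrogate(x_star) <= surrogate(probe)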
Example #11
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train. Loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        try:
            if self.cost.is_stochastic():
                raise TypeError("BGD is not compatible with stochastic "
                                "costs.")
        except NotImplementedError:
            warnings.warn("BGD is not compatible with stochastic costs "
                          "and cannot determine whether the current cost is "
                          "stochastic.")

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size is None or
                        model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
            model, nested_args, ** fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of function f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # this function enables capturing "mapping" and "f", while
        # enabling the "*data" syntax
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(
                dataset=self.monitoring_dataset,
                cost=self.cost,
                batch_size=self.monitoring_batch_size,
                num_batches=self.monitoring_batches,
                obj_prereqs=obj_prereqs,
                cost_monitoring_args=fixed_var_descr.fixed_vars)

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.modify_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=None,
                val=self.optimizer.ave_step_size,
                data_specs=(NullSpace(), ''),
                dataset=first_value(self.monitoring_dataset))
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=None,
                val=self.optimizer.ave_grad_size,
                data_specs=(NullSpace(), ''),
                dataset=first_value(self.monitoring_dataset))
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=None,
                val=self.optimizer.ave_grad_mult,
                data_specs=(NullSpace(), ''),
                dataset=first_value(self.monitoring_dataset))

        self.first = True
        self.bSetup = True
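The capture helper above exists because closures created inside a loop bind their free variables late: without passing f as an argument (and mapping as a default argument), every wrapper in the obj_prereqs list would end up calling the last f from the loop. A standalone illustration of the pitfall and the fix:

def make_scalers_buggy(scales):
    # late binding: every lambda looks up `scale` when called, after the loop has ended
    return [lambda x: scale * x for scale in scales]

def make_scalers_fixed(scales):
    # a default argument freezes the current value of `scale` for each lambda
    return [lambda x, scale=scale: scale * x for scale in scales]

assert [g(10) for g in make_scalers_buggy([1, 2, 3])] == [30, 30, 30]
assert [g(10) for g in make_scalers_fixed([1, 2, 3])] == [10, 20, 30]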
Example #12
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self,
                 cost,
                 batch_size=None,
                 batches_per_iter=None,
                 updates_per_batch=10,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 termination_criterion=None,
                 set_batch_size=False,
                 reset_alpha=True,
                 conjugate=False,
                 min_init_alpha=None,
                 reset_conjugate=True,
                 line_search_mode=None):
        """
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to optimization.BatchGradientDescent's
                   max_iter parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = np.random.RandomState([2012, 10, 16])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train, loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" %
                        (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches.")

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode)

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    targets=self.cost.supervised,
                                    num_batches=self.batches_per_iter,
                                    topo=self.topo,
                                    rng=rng)
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
            else:
                args = [data]
                X = data
            self.optimizer.minimize(*args)
            model.monitor.report_batch(X.shape[0])

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)
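setup(), train(), and continue_learning() above follow the TrainingAlgorithm contract, so a driver (in pylearn2 the Train object plays this role) would exercise them roughly as sketched here, assuming algorithm, model, and dataset objects already exist:

algorithm.setup(model, dataset)
while algorithm.continue_learning(model):
    # one pass of line-searched batch updates over the iterator built in train()
    algorithm.train(dataset=dataset)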
Example #13
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train, loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.set_batch_size:
            model.set_batch_size(self.batch_size)

        if self.batch_size is None:
            self.batch_size = model.force_batch_size

        model.cost = self.cost
        model.mask_gen = self.mask_gen

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        prereq = self.get_setup_batch_object()
        #We want to use big batches. We need to make several theano calls on each
        #batch. To avoid paying the GPU latency every time, we use a shared variable
        #but the shared variable needs to stay allocated during the time that the
        #monitor is working, and we don't want the monitor to increase the memory
        #overhead. So we make the monitor work off of the same shared variable
        space = model.get_input_space()
        X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
        self.space = space
        rng = np.random.RandomState([2012, 7, 20])
        test_mask = space.get_origin_batch(model.batch_size)
        test_mask = rng.randint(0, 2, test_mask.shape)
        if hasattr(self.mask_gen,
                   'sync_channels') and self.mask_gen.sync_channels:
            if test_mask.ndim != 4:
                raise NotImplementedError()
            test_mask = test_mask[:, :, :, 0]
            assert test_mask.ndim == 3
        drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
        self.drop_mask = drop_mask
        assert drop_mask.ndim == test_mask.ndim

        Y = None
        drop_mask_Y = None
        if self.cost.supervised:
            Y = sharedX(
                model.get_output_space().get_origin_batch(model.batch_size),
                'BGD_Y')
            self.Y = Y
            test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
            drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                                  name='drop_mask_Y')
            self.drop_mask_Y = drop_mask_Y
            dmx, dmy = self.mask_gen(X, Y)
            updates = OrderedDict([ (drop_mask, dmx),\
                    (drop_mask_Y, dmy)] )
        else:
            updates = OrderedDict([(drop_mask, self.mask_gen(X))])

        obj = self.cost(model,
                        X,
                        Y,
                        drop_mask=drop_mask,
                        drop_mask_Y=drop_mask_Y)
        gradients, gradient_updates = self.cost.get_gradients(
            model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            theano_rng = MRG_RandomStreams(2012 + 11 + 20)
            for elem in flatten([
                    model.inference_procedure.V_dropout,
                    model.inference_procedure.H_dropout
            ]):
                updates[elem] = theano_rng.binomial(
                    p=include_prob, size=elem.shape, dtype=elem.dtype,
                    n=1) / include_prob
        self.update_mask = function([], updates=updates)

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None
            assert X.name is not None
            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            assert X.name is not None
            wtf = self.cost.get_monitoring_channels(model,
                                                    X=X,
                                                    Y=Y,
                                                    drop_mask=drop_mask,
                                                    drop_mask_Y=drop_mask_Y)
            for key in wtf:
                channels[key] = wtf[key]

            for dataset_name in self.monitoring_dataset:

                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'

                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)
                # We only need to install the prereq once to make sure it gets
                # run. Adding it more times shouldn't hurt, but be careful:
                # each call to "self.setup_batch" returns a new object with a
                # different id, and if you install n of those, the prereq will
                # run n times. It won't cause any wrong results, just a big
                # slowdown.
                warnings.warn("This is weird -- ipt=(X,Y) tells the monitor "
                              "to replace X, Y with the givens dict, but you "
                              "don't actually want them to be replaced.")
                ipt = X
                if Y is not None:
                    ipt = [X, Y]
                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset,
                                         prereqs=[prereq])

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = []

                    prereqs = list(prereqs)
                    prereqs.append(prereq)

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        self.accumulate = self.combine_batches > 1
        if self.accumulate:
            self.inputs = [
                elem for elem in [X, Y, drop_mask, drop_mask_Y]
                if elem is not None
            ]
        else:
            self.inputs = None

        self.optimizer = BatchGradientDescent(
            objective=obj,
            inputs=self.inputs,
            verbose=1,
            gradients=gradients,
            gradient_updates=gradient_updates,
            params=model.get_params(),
            lr_scalers=model.get_lr_scalers(),
            param_constrainers=[model.censor_updates],
            max_iter=self.max_iter,
            tol=3e-7,
            init_alpha=self.init_alpha,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            accumulate=self.accumulate,
            theano_function_mode=self.theano_function_mode)
        self.X = X

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=ipt,
                val=self.optimizer.ave_step_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=ipt,
                val=self.optimizer.ave_grad_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=ipt,
                val=self.optimizer.ave_grad_mult,
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True
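
The comments in the setup above describe the shared-variable strategy for big batches: copy each batch to the device once, then make several Theano calls against the same buffer. Below is a standalone sketch of that pattern using plain theano.shared in place of pylearn2's sharedX; the shapes and the toy objective are made up for illustration.

import numpy as np
import theano
import theano.tensor as T

# One shared buffer holds the current batch; compiled functions read it
# directly and take no inputs.
X_buf = theano.shared(np.zeros((128, 10), dtype=theano.config.floatX),
                      name='X_buf')

# Several compiled functions can work off the same buffer (e.g. one for the
# objective and one for monitoring) without re-copying the batch per call.
objective = theano.function([], T.sqr(X_buf).sum())
row_means = theano.function([], X_buf.mean(axis=1))

def process_batch(batch):
    # Pay the host-to-device transfer once per batch ...
    X_buf.set_value(batch.astype(theano.config.floatX))
    # ... then make as many Theano calls as needed against the same buffer.
    return objective(), row_means()
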
Exemple #14
class DNCE_Algorithm(object):
    def __init__(self, noise, batch_size=1000, batches_per_iter=10,
                 noise_per_clean=30, monitoring_batches=-1,
                 monitoring_dataset=None):
        """
        if batch_size is None, reverts to the force_batch_size field of the
        model
        """
        self.batch_size, self.batches_per_iter = batch_size, batches_per_iter
        if monitoring_dataset is None:
            assert monitoring_batches == -1
        self.monitoring_dataset = monitoring_dataset
        self.monitoring_batches = monitoring_batches
        self.bSetup = False
        self.noise = noise
        self.noise_per_clean = noise_per_clean

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        self.monitor = Monitor.get_monitor(model)
        X = T.matrix()
        Y = T.matrix()
        dnce = DNCE( self.noise)
        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None
            self.monitor.set_dataset(dataset=self.monitoring_dataset,
                                mode="sequential",
                                batch_size=self.batch_size,
                                num_batches=self.monitoring_batches)
            X.tag.test_value = self.monitoring_dataset.get_batch_design(2)
            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            dnce.noise_per_clean = self.noise_per_clean
            obj = dnce(model,X)
            dnce.noise_per_clean = None
            self.monitor.add_channel('DNCE',ipt=X,val=obj)

            for name in channels:
                J = channels[name]
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = None

                if Y is not None:
                    ipt = (X,Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=name,
                                         ipt=ipt,
                                         val=J,
                                         prereqs=prereqs)

        X = sharedX( dataset.get_batch_design(1), 'X')
        Y = []
        updates = {}
        for i in xrange(self.noise_per_clean):
            Y_i = sharedX( X.get_value().copy() )
            updates[Y_i] = self.noise.random_design_matrix(X)
            Y.append(Y_i)
        self.update_noise = function([], updates = updates)


        obj = dnce(model,X,Y)

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            params = model.get_params(),
                            param_constrainers = [ model.censor_updates ],
                            max_iter = 5)
        self.X = X
        self.Y = Y


        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        if self.batch_size is None:
            batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                assert (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size)

        for i in xrange(self.batches_per_iter):
            self.X.set_value(dataset.get_batch_design(self.batch_size))
            self.update_noise()
            self.optimizer.minimize()
            model.monitor.report_batch( batch_size )
        return True
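
The DNCE setup above keeps noise_per_clean shared copies of the clean batch and refreshes them all with a single compiled function (self.update_noise). Below is a standalone sketch of that pattern, using Gaussian noise from MRG_RandomStreams as a stand-in for self.noise.random_design_matrix; the shapes and the noise model are made up for illustration.

from collections import OrderedDict

import numpy as np
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

floatX = theano.config.floatX
theano_rng = MRG_RandomStreams(2013)

# Shared clean batch and several shared noisy copies of it.
X = theano.shared(np.zeros((100, 4), dtype=floatX), name='X')
noisy_copies = [theano.shared(X.get_value().copy(), name='Y_%d' % i)
                for i in range(3)]

# One update per copy; compiling them into a single function means a single
# Theano call refreshes every copy at once.
updates = OrderedDict((Y_i, X + theano_rng.normal(size=X.shape, dtype=floatX))
                      for Y_i in noisy_copies)
update_noise = theano.function([], updates=updates)

X.set_value(np.random.randn(100, 4).astype(floatX))
update_noise()  # every noisy copy now holds a fresh corruption of X
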
Exemple #15
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely \
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
                model, nested_args, ** fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of functions f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # The capture closure below captures "mapping" and "f" while still
        # allowing the "*data" calling syntax.
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            self.monitor.setup(
                    dataset=self.monitoring_dataset,
                    cost=self.cost,
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                    obj_prereqs=obj_prereqs,
                    cost_monitoring_args=fixed_var_descr.fixed_vars)

            # TODO : Why is this commented?
            '''
            channels = model.get_monitoring_channels(theano_args)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, theano_args, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=cost_value,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             data_specs=data_specs,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)
                '''

        params = model.get_params()


        self.optimizer = BatchGradientDescent(
                            objective = cost_value,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = theano_args,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                    name='ave_step_size',
                    ipt=None,
                    val=self.optimizer.ave_step_size,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                    name='ave_grad_size',
                    ipt=None,
                    val=self.optimizer.ave_grad_size,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                    name='ave_grad_mult',
                    ipt=None,
                    val=self.optimizer.ave_grad_mult,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True
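
The capture helper in the setup above exists so that each on_load_batch callback, which expects a single flattened data tuple, can be handed to the monitor as a prereq called with f(*data). Below is a pure-Python sketch of the same closure trick; the mapping and the callback are made-up stand-ins for DataSpecsMapping and a real prereq.

class FakeMapping(object):
    """Stand-in for DataSpecsMapping: flattening is just the identity here."""

    def flatten(self, args, return_tuple=True):
        return tuple(args)

def on_load_batch(flat_data):
    # A prereq that expects exactly one argument: the flattened data tuple.
    print('got a batch with %d sources' % len(flat_data))

def capture(f, mapping=FakeMapping()):
    # Wrap f so the monitor can call it as new_f(*data); the closure keeps
    # references to both f and mapping.
    new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
    return new_f

prereq = capture(on_load_batch)
prereq('X_batch', 'y_batch')  # prints: got a batch with 2 sources
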
Exemple #16
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()
        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None
            self.monitor.add_dataset(dataset=self.monitoring_dataset,
                                mode="sequential",
                                batch_size=self.batch_size,
                                num_batches=self.monitoring_batches)
            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            #TODO: currently only supports unsupervised costs, support supervised too
            obj = self.cost(model,X)
            self.monitor.add_channel('batch_gd_objective',ipt=X,val=obj)

            for name in channels:
                J = channels[name]
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = None

                if Y is not None:
                    ipt = (X,Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=name,
                                         ipt=ipt,
                                         val=J,
                                         prereqs=prereqs)


        obj = self.cost(model,X)

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            params = model.get_params(),
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = [ X ],
                            verbose = True,
                            max_iter = self.updates_per_batch)


        self.first = True
        self.bSetup = True
Exemple #17
0
_, model_path = sys.argv
from pylearn2.utils import serial
model = serial.load(model_path)
d = model.discriminator
import gc
del model
gc.collect()
from pylearn2.utils import sharedX
X = sharedX(d.get_input_space().get_origin_batch(1))
obj = -d.fprop(X).sum()
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent as BGD
import theano.tensor as T
def norm_constraint(updates):
    # Param constrainer: receives the dict of proposed symbolic updates and
    # modifies it in place, rescaling the new value of X by the norm of the
    # current X.
    assert X in updates
    updates[X] = updates[X] / (1e-7 + T.sqrt(T.sqr(X).sum()))
opt = BGD(objective=obj, params=[X], param_constrainers=[norm_constraint], conjugate=True, reset_conjugate=False,
        reset_alpha=False, line_search_mode='exhaustive', verbose=3, max_iter=20)
results = []
import numpy as np
rng = np.random.RandomState([1, 2, 3])
for i in xrange(10):
    X.set_value(rng.randn(*X.get_value().shape).astype(X.dtype) / 10.)
    opt.minimize()
    Xv = X.dimshuffle(3, 1, 2, 0).eval()
    results.append(Xv)
X = np.concatenate(results, axis=0)
from pylearn2.gui.patch_viewer import make_viewer
v = make_viewer(X)
v.show()
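
norm_constraint above illustrates the param_constrainers contract: each constrainer receives the dict of symbolic updates the optimizer is about to apply and edits it in place. Below is a minimal sketch of another constrainer in the same style, clipping a parameter to a box; the parameter W and the bounds are made up for illustration.

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.zeros((4, 4), dtype=theano.config.floatX), name='W')

def box_constraint(updates):
    # Called with the proposed symbolic updates; modify the dict in place so
    # the new value of W is clipped to [-1, 1].
    if W in updates:
        updates[W] = T.clip(updates[W], -1., 1.)

# Quick check of the constrainer on a hand-built update:
updates = {W: W + 5.}
box_constraint(updates)
f = theano.function([], updates=updates)
f()
print(W.get_value().max())  # 1.0

# It would be passed to the optimizer the same way as norm_constraint above:
# BGD(objective=..., params=[W], param_constrainers=[box_constraint], ...)
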

Exemple #18
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" %
                        (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches.")

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode)

        self.first = True
        self.bSetup = True
Exemple #19
outputs = model.fprop(normed, return_all=True)

output = outputs[layer_idx]
neuron = output[tuple(idxs)]

from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

bgd = BatchGradientDescent(objective=-neuron,
        params=[X],
        inputs=None,
        max_iter=100,
        lr_scalers=None,
        verbose=3,
        tol=None,
        init_alpha=None,
        min_init_alpha=1e-3,
        reset_alpha=True,
        conjugate=True,
        gradients=None,
        gradient_updates=None,
        accumulate=False,
        theano_function_mode=None,
        param_constrainers=None)

bgd.minimize()


X = normed.eval()[:,:,:,0].transpose(1,2,0)
import numpy as np
X /= np.abs(X).max()
print (X.min(), X.max())
Exemple #20
p, h = state

p_shape = layer.get_output_space().shape
i = p_shape[0] / 2
j = p_shape[1] / 2

act = p[0,filter_idx,i,j]

obj = - act + norm_penalty * T.square(X).sum()

assert obj.ndim == 0

optimizer = BatchGradientDescent(objective = obj,
        params = [X],
        inputs = None,
        param_constrainers = None,
        max_iter = 1000,
        verbose = True,
        tol = None,
        init_alpha = (.001, .005, .01, .05, .1))

optimizer.minimize()

img = X.get_value()[0,:,:,:]

print 'max mag: ',np.abs(img).max()
print 'norm: ',np.square(img).sum()
print 'min: ',img.min()
print 'max: ',img.max()

img /= np.abs(img).max()
Exemple #21
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(
        self,
        cost,
        batch_size=None,
        batches_per_iter=None,
        updates_per_batch=10,
        monitoring_batches=None,
        monitoring_dataset=None,
        termination_criterion=None,
        set_batch_size=False,
        reset_alpha=True,
        conjugate=False,
        min_init_alpha=None,
        reset_conjugate=True,
        line_search_mode=None,
    ):
        """
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to the optimization.BatchGradientDescent's
                   max_iter parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = np.random.RandomState([2012, 10, 16])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, "force_batch_size"):
                if not (model.force_batch_size <= 0 or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" % (batch_size, model.force_batch_size)
                    )

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches."
            )

        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError(
                    "model.get_monitoring_channels must return a " "dictionary, but it returned " + str(channels)
                )
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == "":
                    prefix = ""
                else:
                    prefix = dataset_name + "_"
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(
                    dataset=monitoring_dataset,
                    mode="sequential",
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                )

                self.monitor.add_channel(prefix + "objective", ipt=ipt, val=obj, dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(
                        name=prefix + name, ipt=ipt, val=J, dataset=monitoring_dataset, prereqs=prereqs
                    )

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
        )

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = "shuffled_sequential"
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(
            mode=train_iteration_mode,
            batch_size=self.batch_size,
            targets=self.cost.supervised,
            num_batches=self.batches_per_iter,
            topo=self.topo,
            rng=rng,
        )
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
            else:
                args = [data]
                X = data
            self.optimizer.minimize(*args)
            model.monitor.report_batch(X.shape[0])

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)
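
The BGD docstring in the example above captures the key difference from SGD: on each minibatch, the optimizer takes up to updates_per_batch steps, each with a partial line search along the gradient, instead of one fixed-size step. Below is a toy numpy illustration of that idea on a least-squares objective; this is not pylearn2 code, and the backtracking rule and constants are made up.

import numpy as np

def loss(w, X, y):
    return 0.5 * np.mean((X.dot(w) - y) ** 2)

def grad(w, X, y):
    return X.T.dot(X.dot(w) - y) / len(y)

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X.dot(np.arange(5.)) + 0.1 * rng.randn(200)

w = np.zeros(5)
updates_per_batch = 10
for _ in range(updates_per_batch):  # several updates on the *same* batch
    g = grad(w, X, y)
    alpha, current = 1.0, loss(w, X, y)
    # Crude backtracking line search: shrink the step until the loss improves.
    while loss(w - alpha * g, X, y) >= current and alpha > 1e-10:
        alpha *= 0.5
    w -= alpha * g
print(loss(w, X, y))  # should end up near the noise floor (about 0.005)
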
Exemple #22
class InpaintAlgorithm(object):
    def __init__(self,
                 mask_gen,
                 cost,
                 batch_size=None,
                 batches_per_iter=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 max_iter=5,
                 suicide=False,
                 init_alpha=None,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 termination_criterion=None,
                 set_batch_size=False,
                 line_search_mode=None,
                 min_init_alpha=1e-3,
                 duplicate=1,
                 combine_batches=1,
                 scale_step=1.,
                 theano_function_mode=None):
        """
        If batch_size is None, reverts to the force_batch_size field of the
        model.
        """
        assert False  # deprecated

        if line_search_mode is None and init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)

        self.__dict__.update(locals())
        del self.self
        if monitoring_dataset is None:
            assert monitoring_batches == None
        if isinstance(monitoring_dataset, Dataset):
            self.monitoring_dataset = {'': monitoring_dataset}
        self.bSetup = False
        self.rng = np.random.RandomState([2012, 10, 17])

    def setup_batch(self, X, Y=None):
        assert not isinstance(X, tuple)
        self.X.set_value(X)
        assert self.cost.supervised == (Y is not None)
        if Y is not None:
            assert Y.ndim == 2
            assert self.Y.ndim == 2
            self.Y.set_value(Y)
        self.update_mask()

    def get_setup_batch_object(self):
        return SetupBatch(self)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.set_batch_size:
            model.set_batch_size(self.batch_size)

        if self.batch_size is None:
            self.batch_size = model.force_batch_size

        model.cost = self.cost
        model.mask_gen = self.mask_gen

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        prereq = self.get_setup_batch_object()
        # We want to use big batches, and we need to make several theano calls
        # on each batch. To avoid paying the GPU latency every time, we use a
        # shared variable. The shared variable needs to stay allocated while
        # the monitor is working, and we don't want the monitor to increase the
        # memory overhead, so we make the monitor work off of the same shared
        # variable.
        space = model.get_input_space()
        X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
        self.space = space
        rng = np.random.RandomState([2012, 7, 20])
        test_mask = space.get_origin_batch(model.batch_size)
        test_mask = rng.randint(0, 2, test_mask.shape)
        if hasattr(self.mask_gen,
                   'sync_channels') and self.mask_gen.sync_channels:
            if test_mask.ndim != 4:
                raise NotImplementedError()
            test_mask = test_mask[:, :, :, 0]
            assert test_mask.ndim == 3
        drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
        self.drop_mask = drop_mask
        assert drop_mask.ndim == test_mask.ndim

        Y = None
        drop_mask_Y = None
        if self.cost.supervised:
            Y = sharedX(
                model.get_output_space().get_origin_batch(model.batch_size),
                'BGD_Y')
            self.Y = Y
            test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
            drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                                  name='drop_mask_Y')
            self.drop_mask_Y = drop_mask_Y
            dmx, dmy = self.mask_gen(X, Y)
            updates = OrderedDict([ (drop_mask, dmx),\
                    (drop_mask_Y, dmy)] )
        else:
            updates = OrderedDict([(drop_mask, self.mask_gen(X))])

        obj = self.cost(model,
                        X,
                        Y,
                        drop_mask=drop_mask,
                        drop_mask_Y=drop_mask_Y)
        gradients, gradient_updates = self.cost.get_gradients(
            model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            theano_rng = MRG_RandomStreams(2012 + 11 + 20)
            for elem in flatten([
                    model.inference_procedure.V_dropout,
                    model.inference_procedure.H_dropout
            ]):
                updates[elem] = theano_rng.binomial(
                    p=include_prob, size=elem.shape, dtype=elem.dtype,
                    n=1) / include_prob
        self.update_mask = function([], updates=updates)

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None
            assert X.name is not None
            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            assert X.name is not None
            wtf = self.cost.get_monitoring_channels(model,
                                                    X=X,
                                                    Y=Y,
                                                    drop_mask=drop_mask,
                                                    drop_mask_Y=drop_mask_Y)
            for key in wtf:
                channels[key] = wtf[key]

            for dataset_name in self.monitoring_dataset:

                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'

                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)
                # We only need to install the prereq once to make sure it gets
                # run. Adding it more times shouldn't hurt, but be careful:
                # each call to "self.setup_batch" returns a new object with a
                # different id, and if you install n of those, the prereq will
                # run n times. It won't cause any wrong results, just a big
                # slowdown.
                warnings.warn("This is weird -- ipt=(X,Y) tells the monitor "
                              "to replace X, Y with the givens dict, but you "
                              "don't actually want them to be replaced.")
                ipt = X
                if Y is not None:
                    ipt = [X, Y]
                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset,
                                         prereqs=[prereq])

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = []

                    prereqs = list(prereqs)
                    prereqs.append(prereq)

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        self.accumulate = self.combine_batches > 1
        if self.accumulate:
            self.inputs = [
                elem for elem in [X, Y, drop_mask, drop_mask_Y]
                if elem is not None
            ]
        else:
            self.inputs = None

        self.optimizer = BatchGradientDescent(
            objective=obj,
            inputs=self.inputs,
            verbose=1,
            gradients=gradients,
            gradient_updates=gradient_updates,
            params=model.get_params(),
            lr_scalers=model.get_lr_scalers(),
            param_constrainers=[model.censor_updates],
            max_iter=self.max_iter,
            tol=3e-7,
            init_alpha=self.init_alpha,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            accumulate=self.accumulate,
            theano_function_mode=self.theano_function_mode)
        self.X = X

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=ipt,
                val=self.optimizer.ave_step_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=ipt,
                val=self.optimizer.ave_grad_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=ipt,
                val=self.optimizer.ave_grad_mult,
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def before_step(self, model):
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [param.get_value() for param in self.params]

    def after_step(self, model):
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1. - self.scale_step
                         ) * value + self.scale_step * param.get_value()
                param.set_value(value)

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        if self.batch_size is None:
            batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                assert (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size)

        assert self.batch_size % self.duplicate == 0
        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size //
                                    self.duplicate,
                                    num_batches=self.batches_per_iter,
                                    targets=self.cost.supervised,
                                    topo=self.X.ndim != 2,
                                    rng=rng)

        accum_batches = []

        if self.accumulate:
            warnings.warn(
                "InpaintAlg.train wastes time setting shared variables only to pull their value back out."
            )

        for data in iterator:
            if self.cost.supervised:
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    stry = str(Y).replace('\n', ' ')
                    mode.record.handle_line('data Y ' + stry + '\n')
                if self.duplicate > 1:
                    Y = np.concatenate([Y] * self.duplicate, axis=0)
                self.Y.set_value(Y)
            else:
                X = data

            if self.duplicate > 1:
                X = np.concatenate([X] * self.duplicate, axis=0)
            self.X.set_value(X)

            self.update_mask()
            if self.accumulate:
                accum_batches.append(
                    [elem.get_value() for elem in self.inputs])
                if len(accum_batches) == self.combine_batches:
                    self.before_step(model)
                    self.optimizer.minimize(*accum_batches)
                    self.after_step(model)
                    actual_batch_size = sum(
                        [batch[0].shape[0] for batch in accum_batches])
                    model.monitor.report_batch(actual_batch_size)
                    accum_batches = []
            else:
                self.before_step(model)
                self.optimizer.minimize()
                self.after_step(model)
                actual_batch_size = X.shape[0]
                model.monitor.report_batch(actual_batch_size)
        assert len(accum_batches) == 0

    def continue_learning(self, model):
        if self.termination_criterion is not None:
            return self.termination_criterion(self.model)
        return True
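
before_step and after_step above implement the scale_step option: the move the optimizer made during minimize is damped by linearly interpolating between the saved old parameters and the proposed new ones, new = (1 - scale_step) * old + scale_step * proposed. A small numpy illustration of that interpolation (the numbers are made up):

import numpy as np

scale_step = 0.25
old = np.array([1.0, 2.0, 3.0])        # value saved in before_step
proposed = np.array([5.0, 2.0, -1.0])  # value written by optimizer.minimize

# after_step keeps only a quarter of the move the optimizer proposed:
damped = (1. - scale_step) * old + scale_step * proposed
print(damped)  # prints an array of all 2.0
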
Exemple #23
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self, cost=None, batch_size=None, batches_per_iter=None,
                 updates_per_batch=10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion = None,
                 set_batch_size=False, reset_alpha=True, conjugate=False,
                 min_init_alpha=.001, reset_conjugate=True,
                 line_search_mode=None, verbose_optimization=False,
                 scale_step=1., theano_function_mode=None, init_alpha=None,
                 seed=None):
        """
        Parameters
        ----------
        cost : pylearn2.costs.Cost
            A pylearn2 Cost, or None, in which case model.get_default_cost() \
            will be used
        batch_size : int
            Like the SGD TrainingAlgorithm, this TrainingAlgorithm still \
            iterates over minibatches of data. The difference is that this \
            class uses partial line searches to choose the step size along \
            each gradient direction, and can do repeated updates on the same \
            batch. The assumption is that you use big enough minibatches with \
            this algorithm that a large step size will generalize reasonably \
            well to other minibatches. To implement true Batch Gradient \
            Descent, set the batch_size to the total number of examples \
            available. If batch_size is None, it will revert to the model's \
            force_batch_size attribute.
        batches_per_iter : int
            WRITEME
        updates_per_batch : int
            Passed through to the optimization.BatchGradientDescent's \
            `max_iter` parameter
        monitoring_batches : WRITEME
        monitoring_dataset: Dataset or dict
            A Dataset or a dictionary mapping string dataset names to Datasets
        termination_criterion : WRITEME
        set_batch_size : bool
            If True, BGD will attempt to override the model's \
            `force_batch_size` attribute by calling set_batch_size on it.
        reset_alpha : bool
            Passed through to the optimization.BatchGradientDescent \
            parameter of the same name
        conjugate : bool
            Passed through to the optimization.BatchGradientDescent \
            parameter of the same name
        min_init_alpha : float
            WRITEME
        reset_conjugate : bool
            Passed through to the optimization.BatchGradientDescent \
            parameter of the same name
        line_search_mode : WRITEME
        verbose_optimization : bool
            WRITEME
        scale_step : float
            WRITEME
        theano_function_mode : WRITEME
        init_alpha : WRITEME
        seed : WRITEME
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == None


        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely \
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
                model, nested_args, ** fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of functions f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # This wrapper captures "mapping" and "f" while still supporting the
        # "*data" calling syntax.
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f
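
        # Binding `mapping` as a default argument fixes its value when each
        # wrapper is created, so the lambdas built in the comprehension below
        # do not depend on any later rebinding of `mapping`; `f` is captured
        # as an ordinary argument of `capture`.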

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            self.monitor.setup(
                    dataset=self.monitoring_dataset,
                    cost=self.cost,
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                    obj_prereqs=obj_prereqs,
                    cost_monitoring_args=fixed_var_descr.fixed_vars)

            # TODO : Why is this commented?
            '''
            channels = model.get_monitoring_channels(theano_args)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, theano_args, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=cost_value,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             data_specs=data_specs,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)
                '''

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)
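
        # Each call to self.optimizer.minimize(*data) (made once per batch in
        # `train` below) binds one batch of data to `inputs` and runs up to
        # `updates_per_batch` line-search steps of (optionally conjugate)
        # gradient descent on that batch.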

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                    name='ave_step_size',
                    ipt=None,
                    val=self.optimizer.ave_step_size,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                    name='ave_grad_size',
                    ipt=None,
                    val=self.optimizer.ave_grad_size,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                    name='ave_grad_mult',
                    ipt=None,
                    val=self.optimizer.ave_grad_mult,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        .. todo::

            WRITEME
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)
        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with BGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    num_batches=self.batches_per_iter,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng)
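
        # Because the iterator was built from `flat_data_specs` with
        # return_tuple=True, each `data` tuple below is flat and its entries
        # line up with the flattened sources from setup() (for instance, a
        # features batch followed by a targets batch for a typical supervised
        # cost; illustrative only, the actual sources come from the cost's
        # data_specs).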

        mode = self.theano_function_mode
        for data in iterator:
            if ('targets' in source_tuple and mode is not None
                    and hasattr(mode, 'record')):
                Y = data[source_tuple.index('targets')]
                stry = str(Y).replace('\n',' ')
                mode.record.handle_line('data Y '+stry+'\n')

            for on_load_batch in self.on_load_batch:
                on_load_batch(mapping.nest(data))

            self.before_step(model)
            self.optimizer.minimize(*data)
            self.after_step(model)
            actual_batch_size = flat_data_specs[0].np_batch_size(data)
            model.monitor.report_batch(actual_batch_size)

    def continue_learning(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.termination_criterion is None:
            return True
        else:
            rval = self.termination_criterion.continue_learning(self.model)
            assert rval in [True, False, 0, 1]
            return rval

    def before_step(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.scale_step != 1.:
            for param, value in safe_zip(self.params, self.value):
                value = ((1. - self.scale_step) * value +
                         self.scale_step * param.get_value())
                param.set_value(value)
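
    # A small numeric illustration of the `scale_step` damping implemented by
    # `before_step` and `after_step` above (hypothetical values, not from the
    # original source): with scale_step = 0.5, a parameter value of 1.0 before
    # the step and an optimizer result of 3.0, the value kept is
    # (1. - 0.5) * 1.0 + 0.5 * 3.0 == 2.0, i.e. the optimizer's move is
    # applied at half strength.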
Exemple #24
0
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, "force_batch_size"):
                if not (model.force_batch_size <= 0 or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" % (batch_size, model.force_batch_size)
                    )

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                "it is intractable, but BGD uses the cost function value "
                "to do line searches."
            )

        if self.monitoring_dataset is not None:
            if not any(d.has_targets() for d in self.monitoring_dataset.values()):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError(
                    "model.get_monitoring_channels must return a " "dictionary, but it returned " + str(channels)
                )
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == "":
                    prefix = ""
                else:
                    prefix = dataset_name + "_"
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(
                    dataset=monitoring_dataset,
                    mode="sequential",
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                )

                self.monitor.add_channel(prefix + "objective", ipt=ipt, val=obj, dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(
                        name=prefix + name, ipt=ipt, val=J, dataset=monitoring_dataset, prereqs=prereqs
                    )

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt
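
        # Normalize the optimizer's `inputs`: a single X is wrapped in a list
        # so that `inputs` is always a sequence, while an (X, Y) tuple is
        # passed through unchanged.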

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
        )

        self.first = True
        self.bSetup = True