Code example #1
File: test_mnd.py Project: Alienfeel/pylearn2
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1,2,3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q,self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
                    max_iter = 100,
                    line_search_mode = 'exhaustive',
                    verbose = True,
                    objective = kl,
                    conjugate = True,
                    params = [ p.mu, p.beta, q.mu, q.beta ],
                    param_constrainers = [ p.censor_updates,
                        q.censor_updates ])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:

            if config.floatX == 'float32':
                neg_tol = 4.8e-7
            else:
                neg_tol = 0.

            if kl < - neg_tol:
                raise AssertionError("KL divergence should "
                    "be non-negative but is "+
                    str(kl))

            warnings.warn("KL divergence is not very numerically stable, evidently")

        tol = 6e-5
        if kl > tol:
            print 'kl:',kl
            print 'tol:',tol
        assert kl <= tol
        assert not (kl > tol )
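Note: this test drives KL(q||p) to zero by jointly optimizing the means and precisions of two diagonal Gaussians; the divergence is non-negative and reaches zero only when q and p coincide, which is what the tolerance checks verify. A minimal numpy sketch of that quantity (illustrative, not pylearn2's kl_divergence; it assumes mu and beta parameterize a diagonal Gaussian with beta as the per-dimension precision, which the excerpt above does not confirm):

import numpy as np

def diag_gaussian_kl(mu_q, beta_q, mu_p, beta_p):
    # KL(q||p) for diagonal Gaussians with means mu and precisions beta.
    return 0.5 * np.sum(np.log(beta_q / beta_p) + beta_p / beta_q
                        + beta_p * (mu_q - mu_p) ** 2 - 1.)

mu = np.array([0.3, -1.2])
beta = np.array([2.0, 0.5])
print(diag_gaussian_kl(mu, beta, mu, beta))       # 0.0 when q == p
print(diag_gaussian_kl(mu + 1., beta, mu, beta))  # > 0 otherwise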
Code example #2
File: test_mnd.py Project: capybaralet/current
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1,2,3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q,self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
                    max_iter = 100,
                    line_search_mode = 'exhaustive',
                    verbose = True,
                    objective = kl,
                    conjugate = True,
                    params = [ p.mu, p.beta, q.mu, q.beta ],
                    param_constrainers = [ p.censor_updates,
                        q.censor_updates ])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:

            if config.floatX == 'float32':
                neg_tol = 4.8e-7
            else:
                neg_tol = 0.

            if kl < - neg_tol:
                raise AssertionError("KL divergence should "
                    "be non-negative but is "+
                    str(kl))

            warnings.warn("KL divergence is not very numerically stable, evidently")

        tol = 5.4e-5
        if kl > tol:
            print 'kl:',kl
            print 'tol:',tol
        assert kl <= tol
        assert not (kl > tol )
Code example #3
    def fit(self, params=None, l1=.0, l2=.0):
        """
        Fit the model by minimizing the Leave One Out (LOO) loss using gradient-based optimization.
        """
        loo_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R, self.eta, self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param ** 2)

        regularized_loo_loss = loo_loss + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_loo_loss, params=params, inputs=[], verbose=1)

        minimizer.minimize()
Code example #4
    def fit(self, params=None, l1=.0, l2=.0):
        NLL = self.loss_symbolic(self.L, self.y, self.mu, self.R, self.eta,
                                 self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param**2)

        regularized_NLL = NLL + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_NLL,
                                         params=params,
                                         inputs=[],
                                         verbose=1)

        minimizer.minimize()
Code example #5
    def fit(self, params=None, l1=.0, l2=.0):
        """
        Fit the model by minimizing the Leave One Out (LOO) loss using gradient-based optimization.
        """
        loo_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R,
                                      self.eta, self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param**2)

        regularized_loo_loss = loo_loss + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_loo_loss,
                                         params=params,
                                         inputs=[],
                                         verbose=1)

        minimizer.minimize()
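Note: the fit() methods above build the L1 and L2 penalties symbolically by accumulating T.sum(abs(param)) and T.sum(param ** 2) over the parameter list, scaling them by l1 and l2, and handing the regularized objective to BatchGradientDescent. A minimal standalone Theano sketch of that construction, assuming Theano is installed; the shared variable w and the stand-in loss are illustrative, not from the project above:

import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.array([1.0, -2.0, 0.5]), name='w')
loss = T.sum((w - 1.) ** 2)          # stand-in for the model's loss

l1, l2 = 0.01, 0.001
L1 = T.sum(abs(w))                   # symbolic L1 penalty
L2 = T.sum(w ** 2)                   # symbolic L2 penalty
regularized_loss = loss + l1 * L1 + l2 * L2
print(regularized_loss.eval())       # evaluates the symbolic expression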
Code example #6
File: test_mnd.py Project: wqren/pylearn
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1,2,3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q,self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
                    objective = kl,
                    params = [ p.mu, p.beta, q.mu, q.beta ],
                    param_constrainers = [ p.censor_updates,
                        q.censor_updates ])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:
            raise AssertionError("KL divergence should "
                    "be non-negative but is "+
                    str(kl))

        tol = 5.4e-5
        assert kl <= tol
        assert not (kl > tol )
Code example #7
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1, 2, 3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1, 10., (dim, )).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1, 10., (dim, )).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q, self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
            objective=kl,
            params=[p.mu, p.beta, q.mu, q.beta],
            param_constrainers=[p.censor_updates, q.censor_updates])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:
            raise AssertionError("KL divergence should "
                                 "be non-negative but is " + str(kl))

        tol = 5.4e-5
        assert kl <= tol
        assert not (kl > tol)
Code example #8
File: optimal_input.py Project: vd114/galatea
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

bgd = BatchGradientDescent(objective=-neuron,
        params=[X],
        inputs=None,
        max_iter=100,
        lr_scalers=None,
        verbose=3,
        tol=None,
        init_alpha=None,
        min_init_alpha=1e-3,
        reset_alpha=True,
        conjugate=True,
        gradients=None,
        gradient_updates=None,
        accumulate=False,
        theano_function_mode=None,
        param_constrainers=None)

bgd.minimize()


X = normed.eval()[:,:,:,0].transpose(1,2,0)
import numpy as np
X /= np.abs(X).max()
print (X.min(), X.max())

from pylearn2.utils.image import show
show(X)
Code example #9
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self,
                 cost,
                 batch_size=None,
                 batches_per_iter=None,
                 updates_per_batch=10,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 termination_criterion=None,
                 set_batch_size=False,
                 reset_alpha=True,
                 conjugate=False,
                 min_init_alpha=None,
                 reset_conjugate=True,
                 line_search_mode=None):
        """
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to the optimization.BatchGradientDescent's
                   max_iters parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = np.random.RandomState([2012, 10, 16])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" %
                        (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches.")

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode)

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    targets=self.cost.supervised,
                                    num_batches=self.batches_per_iter,
                                    topo=self.topo,
                                    rng=rng)
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
            else:
                args = [data]
                X = data
            self.optimizer.minimize(*args)
            model.monitor.report_batch(X.shape[0])

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)
Code example #10
File: realest.py Project: vd114/galatea
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent as BGD
import theano.tensor as T


def norm_constraint(updates):
    assert X in updates
    updates[X] = updates[X] / (1e-7 + T.sqrt(T.sqr(X).sum()))


opt = BGD(objective=obj,
          params=[X],
          param_constrainers=[norm_constraint],
          conjugate=True,
          reset_conjugate=False,
          reset_alpha=False,
          line_search_mode='exhaustive',
          verbose=3,
          max_iter=20)
results = []
import numpy as np
rng = np.random.RandomState([1, 2, 3])
for i in xrange(10):
    X.set_value(rng.randn(*X.get_value().shape).astype(X.dtype) / 10.)
    opt.minimize()
    Xv = X.dimshuffle(3, 1, 2, 0).eval()
    results.append(Xv)
X = np.concatenate(results, axis=0)
from pylearn2.gui.patch_viewer import make_viewer
v = make_viewer(X)
v.show()
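Note: norm_constraint above illustrates the param_constrainers interface: each constrainer is a callable that receives the optimizer's proposed updates dictionary and rewrites the symbolic update for its parameter in place (here rescaling X, although it divides by the norm of the current value of X rather than of the proposed update). A minimal sketch of another constrainer, assuming Theano is installed; the name nonneg_constraint and the shared variable are illustrative, not from the projects above:

import numpy as np
import theano
import theano.tensor as T

X = theano.shared(np.zeros((4, 4), dtype=theano.config.floatX), name='X')

def nonneg_constraint(updates):
    # Clip the proposed new value of X at zero before it is written back.
    if X in updates:
        updates[X] = T.maximum(updates[X], 0.)

# Would be passed as param_constrainers=[nonneg_constraint] to BatchGradientDescent.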
Code example #11
def test_batch_gradient_descent():
        """ Verify that batch gradient descent works by checking that
        it minimizes a quadratic function f(x) = x^T A x + b^T x + c
        correctly for several sampled values of A, b, and c.
        The ground truth minimizer is x = np.linalg.solve(A,-b)"""

        n = 3

        A = T.matrix(name = 'A')
        b = T.vector(name = 'b')
        c = T.scalar(name = 'c')

        x = sharedX( np.zeros((n,)) , name = 'x')

        half = np.cast[config.floatX](0.5)

        obj = half * T.dot(T.dot(x,A),x)+T.dot(b,x)+c

        minimizer = BatchGradientDescent(
                        objective = obj,
                        params = [ x],
                        inputs = [ A, b, c])

        num_samples = 3

        rng = np.random.RandomState([1,2,3])

        for i in xrange(num_samples):
            A = np.cast[config.floatX](rng.randn(1.5*n,n))
            A = np.cast[config.floatX](np.dot(A.T,A))
            A += np.cast[config.floatX](np.identity(n) * .02)
            b = np.cast[config.floatX](rng.randn(n))
            c = np.cast[config.floatX](rng.randn())
            x.set_value(np.cast[config.floatX](rng.randn(n)))

            analytical_x = np.linalg.solve(A,-b)

            actual_obj = minimizer.minimize(A,b,c)
            actual_x = x.get_value()

            #Check that the value returned by the minimize method
            #is the objective function value at the parameters
            #chosen by the minimize method
            cur_obj = minimizer.obj(A,b,c)
            assert np.allclose(actual_obj, cur_obj)

            x.set_value(analytical_x)
            analytical_obj = minimizer.obj(A,b,c)

            #make sure the objective function is accurate to first 4 digits
            condition1 = not np.allclose(analytical_obj, actual_obj)
            condition2 = np.abs(analytical_obj-actual_obj) >= 1e-4 * np.abs(analytical_obj)

            if (config.floatX == 'float64' and condition1) \
                    or (config.floatX == 'float32' and condition2):
                print 'objective function value came out wrong on sample ',i
                print 'analytical obj', analytical_obj
                print 'actual obj',actual_obj

                """
                The following section of code was used to verify that numerical
                error can make the objective function look non-convex

                print 'Checking for numerically induced non-convex behavior'
                def f(x):
                    return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c

                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()

                x = actual_x.copy()
                prev = f(x)
                print prev
                step_size = 1e-4
                x += step_size * d
                cur = f(x)
                print cur
                cur_sgn = np.sign(cur-prev)
                flip_cnt = 0
                for i in xrange(10000):
                    x += step_size * d
                    prev = cur
                    cur = f(x)
                    print cur
                    prev_sgn = cur_sgn
                    cur_sgn = np.sign(cur-prev)
                    if cur_sgn != prev_sgn:
                        print 'flip'
                        flip_cnt += 1
                        if flip_cnt > 1:
                            print "Non-convex!"

                            from matplotlib import pyplot as plt
                            y = []

                            x = actual_x.copy()
                            for j in xrange(10000):
                                y.append(f(x))
                                x += step_size * d

                            plt.plot(y)
                            plt.show()

                            assert False

                print 'None found'
                """

                #print 'actual x',actual_x
                #print 'A:'
                #print A
                #print 'b:'
                #print b
                #print 'c:'
                #print c
                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                x_grad = minimizer.param_to_grad_shared[x]
                actual_grad =  x_grad.get_value()
                correct_grad = 0.5 * np.dot(A,x.get_value())+ 0.5 * np.dot(A.T, x.get_value()) +b
                if not np.allclose(actual_grad, correct_grad):
                    print 'gradient was wrong at convergence point'
                    print 'actual grad: '
                    print actual_grad
                    print 'correct grad: '
                    print correct_grad
                    print 'max difference: ',np.abs(actual_grad-correct_grad).max()
                    assert False


                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()
                step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \
                        + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d))

                g = np.dot(A,actual_x)+b
                deriv = np.dot(g,d)

                print 'directional deriv at actual', deriv
                print 'optimal step_len', step_len
                optimal_x = actual_x - d * step_len
                g = np.dot(A,optimal_x) + b
                deriv = np.dot(g,d)

                print 'directional deriv at optimal: ',deriv
                x.set_value(optimal_x)
                print 'obj at optimal: ',minimizer.obj(A,b,c)



                print 'eigenvalue range:'
                val, vec = np.linalg.eig(A)
                print (val.min(),val.max())
                print 'condition number: ',(val.max()/val.min())
                assert False
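Note: the ground-truth check in this test follows directly from differentiating the quadratic. For the symmetric A built here (A^T A plus a small ridge), the gradient of 0.5 * x^T A x + b^T x + c is A x + b, so the minimizer solves A x = -b, i.e. np.linalg.solve(A, -b). A small self-contained numpy check of that fact (illustrative, not part of the test):

import numpy as np

rng = np.random.RandomState(0)
n = 3
A = rng.randn(2 * n, n)
A = np.dot(A.T, A) + 0.02 * np.identity(n)     # symmetric positive definite
b = rng.randn(n)

x_star = np.linalg.solve(A, -b)                # stationary point: A x + b = 0
print(np.allclose(np.dot(A, x_star) + b, 0.))  # True: gradient vanishes at x_star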
Code example #12
File: synth6.py Project: cc13ny/galatea
        for Y_i in Y:
            pos_prob = 1./(1.+T.exp(model.free_energy(X)-model.free_energy(Y_i)))
            acc = (pos_prob > .5).mean()
            accs.append(acc)
        acc = sum(accs) / float(len(accs))

        print '\tinit accuracy ',function([],acc)()

        #Minimize the objective function with batch gradient descent
        minimizer = BatchGradientDescent( objective = J,
                                            params = model.get_params(),
                                            param_constrainers = [ model.censor_updates ])

        print '\tinit obj:',minimizer.obj()
        #minimizer.verbose = True
        minimizer.minimize()
        print '\tfinal obj:',minimizer.obj()

        recovered_beta = model.beta.get_value()
        recovered_mu = model.mu.get_value()

        print '\trecovered beta:',recovered_beta
        print '\trecovered mu:',recovered_mu

        kl = kl_divergence(true, model)
        kl = function([],kl)()
        assert kl >= 0.0

        print '\tkl was ',kl
        print '\tfinal accuracy ',function([],acc)()
        kls[trial,idx1] = kl
Code example #13
def test_batch_gradient_descent():
    """ Verify that batch gradient descent works by checking that
        it minimizes a quadratic function f(x) = x^T A x + b^T x + c
        correctly for several sampled values of A, b, and c.
        The ground truth minimizer is x = np.linalg.solve(A,-b)"""

    n = 3

    A = T.matrix(name='A')
    b = T.vector(name='b')
    c = T.scalar(name='c')

    x = sharedX(np.zeros((n, )), name='x')

    half = np.cast[config.floatX](0.5)

    obj = half * T.dot(T.dot(x, A), x) + T.dot(b, x) + c

    minimizer = BatchGradientDescent(objective=obj,
                                     params=[x],
                                     inputs=[A, b, c])

    num_samples = 3

    rng = np.random.RandomState([1, 2, 3])

    for i in xrange(num_samples):
        A = np.cast[config.floatX](rng.randn(1.5 * n, n))
        A = np.cast[config.floatX](np.dot(A.T, A))
        A += np.cast[config.floatX](np.identity(n) * .02)
        b = np.cast[config.floatX](rng.randn(n))
        c = np.cast[config.floatX](rng.randn())
        x.set_value(np.cast[config.floatX](rng.randn(n)))

        analytical_x = np.linalg.solve(A, -b)

        actual_obj = minimizer.minimize(A, b, c)
        actual_x = x.get_value()

        #Check that the value returned by the minimize method
        #is the objective function value at the parameters
        #chosen by the minimize method
        cur_obj = minimizer.obj(A, b, c)
        assert np.allclose(actual_obj, cur_obj)

        x.set_value(analytical_x)
        analytical_obj = minimizer.obj(A, b, c)

        #make sure the objective function is accurate to first 4 digits
        condition1 = not np.allclose(analytical_obj, actual_obj)
        condition2 = np.abs(analytical_obj -
                            actual_obj) >= 1e-4 * np.abs(analytical_obj)

        if (config.floatX == 'float64' and condition1) \
                or (config.floatX == 'float32' and condition2):
            print 'objective function value came out wrong on sample ', i
            print 'analytical obj', analytical_obj
            print 'actual obj', actual_obj
            """
                The following section of code was used to verify that numerical
                error can make the objective function look non-convex

                print 'Checking for numerically induced non-convex behavior'
                def f(x):
                    return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c

                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()

                x = actual_x.copy()
                prev = f(x)
                print prev
                step_size = 1e-4
                x += step_size * d
                cur = f(x)
                print cur
                cur_sgn = np.sign(cur-prev)
                flip_cnt = 0
                for i in xrange(10000):
                    x += step_size * d
                    prev = cur
                    cur = f(x)
                    print cur
                    prev_sgn = cur_sgn
                    cur_sgn = np.sign(cur-prev)
                    if cur_sgn != prev_sgn:
                        print 'flip'
                        flip_cnt += 1
                        if flip_cnt > 1:
                            print "Non-convex!"

                            from matplotlib import pyplot as plt
                            y = []

                            x = actual_x.copy()
                            for j in xrange(10000):
                                y.append(f(x))
                                x += step_size * d

                            plt.plot(y)
                            plt.show()

                            assert False

                print 'None found'
                """

            #print 'actual x',actual_x
            #print 'A:'
            #print A
            #print 'b:'
            #print b
            #print 'c:'
            #print c
            x.set_value(actual_x)
            minimizer._compute_grad(A, b, c)
            x_grad = minimizer.param_to_grad_shared[x]
            actual_grad = x_grad.get_value()
            correct_grad = 0.5 * np.dot(A, x.get_value()) + 0.5 * np.dot(
                A.T, x.get_value()) + b
            if not np.allclose(actual_grad, correct_grad):
                print 'gradient was wrong at convergence point'
                print 'actual grad: '
                print actual_grad
                print 'correct grad: '
                print correct_grad
                print 'max difference: ', np.abs(actual_grad -
                                                 correct_grad).max()
                assert False

            minimizer._normalize_grad()
            d = minimizer.param_to_grad_shared[x].get_value()
            step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \
                    + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d))

            g = np.dot(A, actual_x) + b
            deriv = np.dot(g, d)

            print 'directional deriv at actual', deriv
            print 'optimal step_len', step_len
            optimal_x = actual_x - d * step_len
            g = np.dot(A, optimal_x) + b
            deriv = np.dot(g, d)

            print 'directional deriv at optimal: ', deriv
            x.set_value(optimal_x)
            print 'obj at optimal: ', minimizer.obj(A, b, c)

            print 'eigenvalue range:'
            val, vec = np.linalg.eig(A)
            print(val.min(), val.max())
            print 'condition number: ', (val.max() / val.min())
            assert False
Code example #14
class InpaintAlgorithm(object):
    def __init__(self,
                 mask_gen,
                 cost,
                 batch_size=None,
                 batches_per_iter=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 max_iter=5,
                 suicide=False,
                 init_alpha=None,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 termination_criterion=None,
                 set_batch_size=False,
                 line_search_mode=None,
                 min_init_alpha=1e-3,
                 duplicate=1,
                 combine_batches=1,
                 scale_step=1.,
                 theano_function_mode=None):
        assert False  # deprecated
        """
        if batch_size is None, reverts to the force_batch_size field of the
        model
        """

        if line_search_mode is None and init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)

        self.__dict__.update(locals())
        del self.self
        if monitoring_dataset is None:
            assert monitoring_batches == None
        if isinstance(monitoring_dataset, Dataset):
            self.monitoring_dataset = {'': monitoring_dataset}
        self.bSetup = False
        self.rng = np.random.RandomState([2012, 10, 17])

    def setup_batch(self, X, Y=None):
        assert not isinstance(X, tuple)
        self.X.set_value(X)
        assert self.cost.supervised == (Y is not None)
        if Y is not None:
            assert Y.ndim == 2
            assert self.Y.ndim == 2
            self.Y.set_value(Y)
        self.update_mask()

    def get_setup_batch_object(self):
        return SetupBatch(self)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.set_batch_size:
            model.set_batch_size(self.batch_size)

        if self.batch_size is None:
            self.batch_size = model.force_batch_size

        model.cost = self.cost
        model.mask_gen = self.mask_gen

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        prereq = self.get_setup_batch_object()
        #We want to use big batches. We need to make several theano calls on each
        #batch. To avoid paying the GPU latency every time, we use a shared variable
        #but the shared variable needs to stay allocated during the time that the
        #monitor is working, and we don't want the monitor to increase the memory
        #overhead. So we make the monitor work off of the same shared variable
        space = model.get_input_space()
        X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
        self.space = space
        rng = np.random.RandomState([2012, 7, 20])
        test_mask = space.get_origin_batch(model.batch_size)
        test_mask = rng.randint(0, 2, test_mask.shape)
        if hasattr(self.mask_gen,
                   'sync_channels') and self.mask_gen.sync_channels:
            if test_mask.ndim != 4:
                raise NotImplementedError()
            test_mask = test_mask[:, :, :, 0]
            assert test_mask.ndim == 3
        drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
        self.drop_mask = drop_mask
        assert drop_mask.ndim == test_mask.ndim

        Y = None
        drop_mask_Y = None
        if self.cost.supervised:
            Y = sharedX(
                model.get_output_space().get_origin_batch(model.batch_size),
                'BGD_Y')
            self.Y = Y
            test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
            drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                                  name='drop_mask_Y')
            self.drop_mask_Y = drop_mask_Y
            dmx, dmy = self.mask_gen(X, Y)
            updates = OrderedDict([ (drop_mask, dmx),\
                    (drop_mask_Y, dmy)] )
        else:
            updates = OrderedDict([(drop_mask, self.mask_gen(X))])

        obj = self.cost(model,
                        X,
                        Y,
                        drop_mask=drop_mask,
                        drop_mask_Y=drop_mask_Y)
        gradients, gradient_updates = self.cost.get_gradients(
            model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            theano_rng = MRG_RandomStreams(2012 + 11 + 20)
            for elem in flatten([
                    model.inference_procedure.V_dropout,
                    model.inference_procedure.H_dropout
            ]):
                updates[elem] = theano_rng.binomial(
                    p=include_prob, size=elem.shape, dtype=elem.dtype,
                    n=1) / include_prob
        self.update_mask = function([], updates=updates)

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None
            assert X.name is not None
            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            assert X.name is not None
            wtf = self.cost.get_monitoring_channels(model,
                                                    X=X,
                                                    Y=Y,
                                                    drop_mask=drop_mask,
                                                    drop_mask_Y=drop_mask_Y)
            for key in wtf:
                channels[key] = wtf[key]

            for dataset_name in self.monitoring_dataset:

                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'

                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)
                #we only need to put the prereq in once to make sure it gets run
                #adding it more times shouldn't hurt, but be careful
                #each time you say "self.setup_batch" you get a new object with a
                #different id, and if you install n of those the prereq will run n
                #times. It won't cause any wrong results, just a big slowdown
                warnings.warn(
                    "This is weird-- ipt=(X,Y)=tell the monitor to replace X, Y with the givens dict, "
                    " but you don't actually want them to be replaced.")
                ipt = X
                if Y is not None:
                    ipt = [X, Y]
                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset,
                                         prereqs=[prereq])

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = []

                    prereqs = list(prereqs)
                    prereqs.append(prereq)

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        self.accumulate = self.combine_batches > 1
        if self.accumulate:
            self.inputs = [
                elem for elem in [X, Y, drop_mask, drop_mask_Y]
                if elem is not None
            ]
        else:
            self.inputs = None

        self.optimizer = BatchGradientDescent(
            objective=obj,
            inputs=self.inputs,
            verbose=1,
            gradients=gradients,
            gradient_updates=gradient_updates,
            params=model.get_params(),
            lr_scalers=model.get_lr_scalers(),
            param_constrainers=[model.censor_updates],
            max_iter=self.max_iter,
            tol=3e-7,
            init_alpha=self.init_alpha,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            accumulate=self.accumulate,
            theano_function_mode=self.theano_function_mode)
        self.X = X

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=ipt,
                val=self.optimizer.ave_step_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=ipt,
                val=self.optimizer.ave_grad_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=ipt,
                val=self.optimizer.ave_grad_mult,
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def before_step(self, model):
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [param.get_value() for param in self.params]

    def after_step(self, model):
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1. - self.scale_step
                         ) * value + self.scale_step * param.get_value()
                param.set_value(value)

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        if self.batch_size is None:
            batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                assert (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size)

        assert self.batch_size % self.duplicate == 0
        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size //
                                    self.duplicate,
                                    num_batches=self.batches_per_iter,
                                    targets=self.cost.supervised,
                                    topo=self.X.ndim != 2,
                                    rng=rng)

        accum_batches = []

        if self.accumulate:
            warnings.warn(
                "InpaintAlg.train wastes time setting shared variables only to pull their value back out."
            )

        for data in iterator:
            if self.cost.supervised:
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    stry = str(Y).replace('\n', ' ')
                    mode.record.handle_line('data Y ' + stry + '\n')
                if self.duplicate > 1:
                    Y = np.concatenate([Y] * self.duplicate, axis=0)
                self.Y.set_value(Y)
            else:
                X = data

            if self.duplicate > 1:
                X = np.concatenate([X] * self.duplicate, axis=0)
            self.X.set_value(X)

            self.update_mask()
            if self.accumulate:
                accum_batches.append(
                    [elem.get_value() for elem in self.inputs])
                if len(accum_batches) == self.combine_batches:
                    self.before_step(model)
                    self.optimizer.minimize(*accum_batches)
                    self.after_step(model)
                    actual_batch_size = sum(
                        [batch[0].shape[0] for batch in accum_batches])
                    model.monitor.report_batch(actual_batch_size)
                    accum_batches = []
            else:
                self.before_step(model)
                self.optimizer.minimize()
                self.after_step(model)
                actual_batch_size = X.shape[0]
                model.monitor.report_batch(actual_batch_size)
        assert len(accum_batches) == 0

    def continue_learning(self, model):
        if self.termination_criterion is not None:
            return self.termination_criterion(self.model)
        return True
Code example #15
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class


    Parameters
    ----------
    cost : pylearn2.costs.Cost
        A pylearn2 Cost, or None, in which case model.get_default_cost() \
        will be used
    batch_size : int
        Like the SGD TrainingAlgorithm, this TrainingAlgorithm still \
        iterates over minibatches of data. The difference is that this \
        class uses partial line searches to choose the step size along \
        each gradient direction, and can do repeated updates on the same \
        batch. The assumption is that you use big enough minibatches with \
        this algorithm that a large step size will generalize reasonably \
        well to other minibatches. To implement true Batch Gradient \
        Descent, set the batch_size to the total number of examples \
        available. If batch_size is None, it will revert to the model's \
        force_batch_size attribute.
    batches_per_iter : int
        WRITEME
    updates_per_batch : int
        Passed through to the optimization.BatchGradientDescent's \
        `max_iters parameter`
    monitoring_batch_size : int
        Size of monitoring batches.
    monitoring_batches : WRITEME
    monitoring_dataset: Dataset or dict
        A Dataset or a dictionary mapping string dataset names to Datasets
    termination_criterion : WRITEME
    set_batch_size : bool
        If True, BGD will attempt to override the model's \
        `force_batch_size` attribute by calling set_batch_size on it.
    reset_alpha : bool
        Passed through to the optimization.BatchGradientDescent's \
        `max_iters parameter`
    conjugate : bool
        Passed through to the optimization.BatchGradientDescent's \
        `max_iters parameter`
    min_init_alpha : float
        WRITEME
    reset_conjugate : bool
        Passed through to the optimization.BatchGradientDescent's \
        `max_iters parameter`
    line_search_mode : WRITEME
    verbose_optimization : bool
        WRITEME
    scale_step : float
        WRITEME
    theano_function_mode : WRITEME
    init_alpha : WRITEME
    seed : WRITEME
    """
    def __init__(self,
                 cost=None,
                 batch_size=None,
                 batches_per_iter=None,
                 updates_per_batch=10,
                 monitoring_batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 termination_criterion=None,
                 set_batch_size=False,
                 reset_alpha=True,
                 conjugate=False,
                 min_init_alpha=.001,
                 reset_conjugate=True,
                 line_search_mode=None,
                 verbose_optimization=False,
                 scale_step=1.,
                 theano_function_mode=None,
                 init_alpha=None,
                 seed=None):

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None
            assert monitoring_batch_size is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = make_np_rng(seed, [2012, 10, 16],
                               which_method=["randn", "randint"])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely \
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
            model, nested_args, **fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of function f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # this function enables capturing "mapping" and "f", while
        # enabling the "*data" syntax
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None
                    and self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               obj_prereqs=obj_prereqs,
                               cost_monitoring_args=fixed_var_descr.fixed_vars)

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=None,
                val=self.optimizer.ave_step_size,
                data_specs=(NullSpace(), ''),
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=None,
                val=self.optimizer.ave_grad_size,
                data_specs=(NullSpace(), ''),
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=None,
                val=self.optimizer.ave_grad_mult,
                data_specs=(NullSpace(), ''),
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        .. todo::

            WRITEME
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)
        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with BGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    num_batches=self.batches_per_iter,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng)

        mode = self.theano_function_mode
        for data in iterator:
            if ('targets' in source_tuple and mode is not None
                    and hasattr(mode, 'record')):
                Y = data[source_tuple.index('targets')]
                stry = str(Y).replace('\n', ' ')
                mode.record.handle_line('data Y ' + stry + '\n')

            for on_load_batch in self.on_load_batch:
                on_load_batch(mapping.nest(data))

            self.before_step(model)
            self.optimizer.minimize(*data)
            self.after_step(model)
            actual_batch_size = flat_data_specs[0].np_batch_size(data)
            model.monitor.report_batch(actual_batch_size)

    def continue_learning(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.termination_criterion is None:
            return True
        else:
            rval = self.termination_criterion.continue_learning(self.model)
            assert rval in [True, False, 0, 1]
            return rval

    def before_step(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [param.get_value() for param in self.params]

    def after_step(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step \
                        * param.get_value()
                param.set_value(value)
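
The before_step/after_step pair above implements a damped parameter update whenever scale_step is not 1: after the optimizer takes its step, each parameter is pulled back toward its pre-step value by linear interpolation. A minimal numpy sketch of that interpolation (the function name and values are illustrative, not pylearn2 API):

import numpy as np

def damped_update(old_value, stepped_value, scale_step):
    """Blend pre-step and post-step parameter values.

    scale_step = 1.0 keeps the optimizer's full step; smaller values
    shrink the step toward the old value, as in after_step above.
    """
    return (1. - scale_step) * old_value + scale_step * stepped_value

old = np.zeros(3)
stepped = np.array([1., 2., 3.])
print(damped_update(old, stepped, 0.5))   # [ 0.5  1.   1.5]
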
Code example #16
0
File: bgd.py Project: alouisos/pylearn2
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self, cost=None, batch_size=None, batches_per_iter=None,
                 updates_per_batch=10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion = None,
                 set_batch_size=False, reset_alpha=True, conjugate=False,
                 min_init_alpha=.001, reset_conjugate=True,
                 line_search_mode=None, verbose_optimization=False,
                 scale_step=1., theano_function_mode=None, init_alpha=None,
                 seed=None):
        """
        Parameters
        ----------
        cost : pylearn2.costs.Cost
            A pylearn2 Cost, or None, in which case model.get_default_cost() \
            will be used
        batch_size : int
            Like the SGD TrainingAlgorithm, this TrainingAlgorithm still \
            iterates over minibatches of data. The difference is that this \
            class uses partial line searches to choose the step size along \
            each gradient direction, and can do repeated updates on the same \
            batch. The assumption is that you use big enough minibatches with \
            this algorithm that a large step size will generalize reasonably \
            well to other minibatches. To implement true Batch Gradient \
            Descent, set the batch_size to the total number of examples \
            available. If batch_size is None, it will revert to the model's \
            force_batch_size attribute.
        batches_per_iter : int
            WRITEME
        updates_per_batch : int
            Passed through to the optimization.BatchGradientDescent's \
            `max_iter` parameter
        monitoring_batches : WRITEME
        monitoring_dataset: Dataset or dict
            A Dataset or a dictionary mapping string dataset names to Datasets
        termination_criterion : WRITEME
        set_batch_size : bool
            If True, BGD will attempt to override the model's \
            `force_batch_size` attribute by calling set_batch_size on it.
        reset_alpha : bool
            Passed through to the optimization.BatchGradientDescent \
            parameter of the same name
        conjugate : bool
            Passed through to the optimization.BatchGradientDescent \
            parameter of the same name
        min_init_alpha : float
            WRITEME
        reset_conjugate : bool
            Passed through to the optimization.BatchGradientDescent \
            parameter of the same name
        line_search_mode : WRITEME
        verbose_optimization : bool
            WRITEME
        scale_step : float
            WRITEME
        theano_function_mode : WRITEME
        init_alpha : WRITEME
        seed : WRITEME
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == None


        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely \
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    ** fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
                model, nested_args, ** fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of function f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # this function enables capturing "mapping" and "f", while
        # enabling the "*data" syntax
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            self.monitor.setup(
                    dataset=self.monitoring_dataset,
                    cost=self.cost,
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                    obj_prereqs=obj_prereqs,
                    cost_monitoring_args=fixed_var_descr.fixed_vars)

            # TODO : Why is this commented?
            '''
            channels = model.get_monitoring_channels(theano_args)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, theano_args, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=cost_value,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             data_specs=data_specs,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)
                '''

        params = model.get_params()


        self.optimizer = BatchGradientDescent(
                            objective = cost_value,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = theano_args,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                    name='ave_step_size',
                    ipt=None,
                    val=self.optimizer.ave_step_size,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                    name='ave_grad_size',
                    ipt=None,
                    val=self.optimizer.ave_grad_size,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                    name='ave_grad_mult',
                    ipt=None,
                    val=self.optimizer.ave_grad_mult,
                    data_specs=(NullSpace(), ''),
                    dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        .. todo::

            WRITEME
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)
        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundent tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError("Unable to train with BGD, because "
                    "the cost does not actually use data from the data set. "
                    "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=train_iteration_mode,
                batch_size=self.batch_size,
                num_batches=self.batches_per_iter,
                data_specs=flat_data_specs, return_tuple=True,
                rng = rng)

        mode = self.theano_function_mode
        for data in iterator:
            if ('targets' in source_tuple and mode is not None
                    and hasattr(mode, 'record')):
                Y = data[source_tuple.index('targets')]
                stry = str(Y).replace('\n',' ')
                mode.record.handle_line('data Y '+stry+'\n')

            for on_load_batch in self.on_load_batch:
                on_load_batch(mapping.nest(data))

            self.before_step(model)
            self.optimizer.minimize(*data)
            self.after_step(model)
            actual_batch_size = flat_data_specs[0].np_batch_size(data)
            model.monitor.report_batch(actual_batch_size)

    def continue_learning(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.termination_criterion is None:
            return True
        else:
            rval = self.termination_criterion.continue_learning(self.model)
            assert rval in [True, False, 0, 1]
            return rval

    def before_step(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        """
        .. todo::

            WRITEME
        """
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step * param.get_value()
                param.set_value(value)
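
The batch_size docstring above notes that true batch gradient descent falls out of this class when the batch size equals the whole training set. A hedged usage sketch under that reading; my_cost, my_model, and train_set are placeholder objects, get_num_examples() is assumed to be available on the dataset, and in practice pylearn2's Train object (which also consults continue_learning) would drive the loop:

algorithm = BGD(
    cost=my_cost,
    batch_size=train_set.get_num_examples(),   # one batch = the full dataset
    batches_per_iter=1,
    updates_per_batch=10,                       # line-searched updates on that batch
    conjugate=True,                             # nonlinear conjugate gradient directions
    line_search_mode='exhaustive',
    monitoring_dataset={'train': train_set})

algorithm.setup(my_model, train_set)
for epoch in range(10):                         # stand-in for the Train object's loop
    algorithm.train(train_set)
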
Code example #17
0
File: optimal_input.py Project: vd114/galatea
act = p[0,filter_idx,i,j]

obj = - act + norm_penalty * T.square(X).sum()

assert obj.ndim == 0

optimizer = BatchGradientDescent(objective = obj,
        params = [X],
        inputs = None,
        param_constrainers = None,
        max_iter = 1000,
        verbose = True,
        tol = None,
        init_alpha = (.001, .005, .01, .05, .1))

optimizer.minimize()

img = X.get_value()[0,:,:,:]

print 'max mag: ',np.abs(img).max()
print 'norm: ',np.square(img).sum()
print 'min: ',img.min()
print 'max: ',img.max()

img /= np.abs(img).max()

img *= .5
img += 1

show(img)
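
The snippet above builds the usual activation-maximization objective: the negated unit activation plus a squared-norm penalty on the input. The penalty is what keeps the optimal input finite; for a toy linear "activation" act(x) = w.x the minimizer has the closed form x* = w / (2 * norm_penalty). A small numpy check of that reasoning (w and the learning rate are made up for the illustration, and plain gradient descent stands in for BatchGradientDescent):

import numpy as np

w = np.array([1.0, -2.0, 0.5])    # hypothetical filter weights
norm_penalty = 0.1

def objective(x):
    # J(x) = -w.x + norm_penalty * ||x||^2, mirroring the obj above
    return -w.dot(x) + norm_penalty * np.square(x).sum()

def gradient(x):
    return -w + 2. * norm_penalty * x

x = np.zeros_like(w)
for _ in range(500):
    x -= 0.1 * gradient(x)

print(x)                          # ~ [ 5. -10.   2.5]
print(w / (2. * norm_penalty))    # analytic minimizer, same values
print(objective(x))               # penalized objective at the optimum
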
Code example #18
0
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(
        self,
        cost,
        batch_size=None,
        batches_per_iter=None,
        updates_per_batch=10,
        monitoring_batches=None,
        monitoring_dataset=None,
        termination_criterion=None,
        set_batch_size=False,
        reset_alpha=True,
        conjugate=False,
        min_init_alpha=None,
        reset_conjugate=True,
        line_search_mode=None,
    ):
        """
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to the optimization.BatchGradientDescent's
                   max_iters parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = np.random.RandomState([2012, 10, 16])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, "force_batch_size"):
                if not (model.force_batch_size <= 0 or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" % (batch_size, model.force_batch_size)
                    )

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches."
            )

        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError(
                    "model.get_monitoring_channels must return a " "dictionary, but it returned " + str(channels)
                )
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == "":
                    prefix = ""
                else:
                    prefix = dataset_name + "_"
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(
                    dataset=monitoring_dataset,
                    mode="sequential",
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                )

                self.monitor.add_channel(prefix + "objective", ipt=ipt, val=obj, dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(
                        name=prefix + name, ipt=ipt, val=J, dataset=monitoring_dataset, prereqs=prereqs
                    )

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
        )

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = "shuffled_sequential"
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(
            mode=train_iteration_mode,
            batch_size=self.batch_size,
            targets=self.cost.supervised,
            num_batches=self.batches_per_iter,
            topo=self.topo,
            rng=rng,
        )
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
            else:
                args = [data]
                X = data
            self.optimizer.minimize(*args)
            model.monitor.report_batch(X.shape[0])

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)
Code example #19
0
File: dnce_alg.py Project: vd114/galatea
class DNCE_Algorithm(object):
    def __init__(self, noise, batch_size=1000, batches_per_iter=10,
                     noise_per_clean = 30,
                 monitoring_batches=-1, monitoring_dataset=None):
        """
        if batch_size is None, reverts to the force_batch_size field of the
        model
        """
        self.batch_size, self.batches_per_iter = batch_size, batches_per_iter
        if monitoring_dataset is None:
            assert monitoring_batches == -1
        self.monitoring_dataset = monitoring_dataset
        self.monitoring_batches = monitoring_batches
        self.bSetup = False
        self.noise = noise
        self.noise_per_clean = noise_per_clean

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        self.monitor = Monitor.get_monitor(model)
        X = T.matrix()
        Y = T.matrix()
        dnce = DNCE( self.noise)
        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None
            self.monitor.set_dataset(dataset=self.monitoring_dataset,
                                mode="sequential",
                                batch_size=self.batch_size,
                                num_batches=self.monitoring_batches)
            X.tag.test_value = self.monitoring_dataset.get_batch_design(2)
            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            dnce.noise_per_clean = self.noise_per_clean
            obj = dnce(model,X)
            dnce.noise_per_clean = None
            self.monitor.add_channel('DNCE',ipt=X,val=obj)

            for name in channels:
                J = channels[name]
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = None

                if Y is not None:
                    ipt = (X,Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=name,
                                         ipt=ipt,
                                         val=J,
                                         prereqs=prereqs)

        X = sharedX( dataset.get_batch_design(1), 'X')
        Y = []
        updates = {}
        for i in xrange(self.noise_per_clean):
            Y_i = sharedX( X.get_value().copy() )
            updates[Y_i] = self.noise.random_design_matrix(X)
            Y.append(Y_i)
        self.update_noise = function([], updates = updates)


        obj = dnce(model,X,Y)

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            params = model.get_params(),
                            param_constrainers = [ model.censor_updates ],
                            max_iter = 5)
        self.X = X
        self.Y = Y


        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        if self.batch_size is None:
            batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                assert (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size)

        for i in xrange(self.batches_per_iter):
            self.X.set_value(dataset.get_batch_design(self.batch_size))
            self.update_noise()
            self.optimizer.minimize()
            model.monitor.report_batch( batch_size )
        return True
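
Example #19 keeps the clean batch in a shared variable X, pre-allocates noise_per_clean shared copies of it, and compiles a single update_noise function that refreshes all of them at once, so no data has to be passed at call time. A stripped-down sketch of that shared-variable refresh pattern in plain Theano; the Gaussian corruption is a stand-in for whatever self.noise.random_design_matrix actually does:

import numpy as np
import theano
from theano import function, shared
from theano.tensor.shared_randomstreams import RandomStreams

rng = np.random.RandomState(0)
srng = RandomStreams(seed=1)

# shared "clean" batch and several shared noise copies of it
X = shared(rng.randn(5, 3).astype(theano.config.floatX), name='X')
updates = {}
noise_copies = []
for i in range(4):
    Y_i = shared(X.get_value().copy(), name='Y_%d' % i)
    # hypothetical corruption: clean batch plus small Gaussian noise
    updates[Y_i] = X + .1 * srng.normal(size=(5, 3), dtype=theano.config.floatX)
    noise_copies.append(Y_i)

update_noise = function([], updates=updates)
update_noise()   # refresh every noise copy in one compiled call
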
Code example #20
0
File: bgd.py Project: gdesjardins/pylearn
class BGD(object):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self, cost, batch_size=None, batches_per_iter=10,
                 updates_per_batch = 10,
                 monitoring_batches=-1, monitoring_dataset=None,
                 termination_criterion = None):
        """
        if batch_size is None, reverts to the force_batch_size field of the
        model
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == -1
        self.bSetup = False
        self.termination_criterion = termination_criterion

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()
        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None
            self.monitor.add_dataset(dataset=self.monitoring_dataset,
                                mode="sequential",
                                batch_size=self.batch_size,
                                num_batches=self.monitoring_batches)
            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            #TODO: currently only supports unsupervised costs, support supervised too
            obj = self.cost(model,X)
            self.monitor.add_channel('batch_gd_objective',ipt=X,val=obj)

            for name in channels:
                J = channels[name]
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = None

                if Y is not None:
                    ipt = (X,Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=name,
                                         ipt=ipt,
                                         val=J,
                                         prereqs=prereqs)


        obj = self.cost(model,X)

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            params = model.get_params(),
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = [ X ],
                            verbose = True,
                            max_iter = self.updates_per_batch)


        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        for i in xrange(self.batches_per_iter):
            X = get_data(self.batch_size)
            self.optimizer.minimize(X)
            model.monitor.report_batch( batch_size )
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)
Code example #21
0
File: realest.py Project: cc13ny/galatea
_, model_path = sys.argv
from pylearn2.utils import serial
model = serial.load(model_path)
d = model.discriminator
import gc
del model
gc.collect()
from pylearn2.utils import sharedX
X = sharedX(d.get_input_space().get_origin_batch(1))
obj =  -d.fprop(X).sum()
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent as BGD
import theano.tensor as T
def norm_constraint(updates):
    assert X in updates
    updates[X] = updates[X] / (1e-7 + T.sqrt(T.sqr(X).sum()))
opt = BGD(objective=obj, params=[X], param_constrainers=[norm_constraint], conjugate=True, reset_conjugate=False,
        reset_alpha=False, line_search_mode='exhaustive', verbose=3, max_iter=20)
results = []
import numpy as np
rng = np.random.RandomState([1, 2, 3])
for i in xrange(10):
    X.set_value(rng.randn(*X.get_value().shape).astype(X.dtype) / 10.)
    opt.minimize()
    Xv = X.dimshuffle(3, 1, 2, 0).eval()
    results.append(Xv)
X = np.concatenate(results, axis=0)
from pylearn2.gui.patch_viewer import make_viewer
v = make_viewer(X)
v.show()
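
In the snippet above, the param_constrainers receive the optimizer's proposed updates dictionary and may rewrite entries in place before they are applied; norm_constraint rescales the new X by the norm of the current X. A hedged variant that instead projects the proposed value onto a fixed-radius L2 ball (make_max_norm_constrainer is an illustrative helper, not part of pylearn2):

import theano.tensor as T

def make_max_norm_constrainer(shared_var, max_norm=1.):
    """Return a constrainer that clips shared_var's proposed new value
    to an L2 ball of radius max_norm."""
    def constrain(updates):
        assert shared_var in updates
        proposed = updates[shared_var]
        norm = T.sqrt(T.sqr(proposed).sum())
        scale = T.minimum(1., max_norm / (1e-7 + norm))
        updates[shared_var] = proposed * scale
    return constrain

# e.g. BGD(objective=obj, params=[X],
#          param_constrainers=[make_max_norm_constrainer(X)], ...)
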

Code example #22
0
                1. + T.exp(model.free_energy(X) - model.free_energy(Y_i)))
            acc = (pos_prob > .5).mean()
            accs.append(acc)
        acc = sum(accs) / float(len(accs))

        print '\tinit accuracy ', function([], acc)()

        #Minimize the objective function with batch gradient descent
        minimizer = BatchGradientDescent(
            objective=J,
            params=model.get_params(),
            param_constrainers=[model.censor_updates])

        print '\tinit obj:', minimizer.obj()
        #minimizer.verbose = True
        minimizer.minimize()
        print '\tfinal obj:', minimizer.obj()

        recovered_beta = model.beta.get_value()
        recovered_mu = model.mu.get_value()

        print '\trecovered beta:', recovered_beta
        print '\trecovered mu:', recovered_mu

        kl = kl_divergence(true, model)
        kl = function([], kl)()
        assert kl >= 0.0

        print '\tkl was ', kl
        print '\tfinal accuracy ', function([], acc)()
        kls[trial, idx1] = kl
Code example #23
0
File: __init__.py Project: cc13ny/galatea
class WarmStart(TrainExtension):

    def __init__(self, num_basis_vectors, num_points, scale, max_jump_norm = 1.,
            method = 'gradient', fitting_cost = 'mse', include_root = False,
            num_applications = -1, psd = False, use_solver = False, reps=1):
        self.__dict__.update(locals())
        del self.self
        self.batch_size = 1000
        self.rng = np.random.RandomState([2014, 5, 8, 2])

    def setup(self, model, dataset, algorithm):
        """
        Train calls this immediately upon instantiation,
        before any monitoring is done.

        This subclass uses it to warm-start the parameters.

        Parameters
        ----------
        model : pylearn2.models.Model
            The model object being trained.

        dataset : pylearn2.datasets.Dataset
            The dataset object being trained.

        algorithm : pylearn2.training_algorithms.TrainingAlgorithm
            The object representing the training algorithm being
            used to train the model.
            *This must be a TrainingAlgorithm that has a `cost`
            attribute that is a pylearn2 `Cost`, such as `SGD`
            or `BGD`.*
        """

        if self.num_applications == 0:
            return
        self.num_applications -= 1

        for i in xrange(self.reps):
            self.setup_impl(model, dataset, algorithm)

    def setup_impl(self, model, dataset, algorithm):
        cost = algorithm.cost

        root = model.get_param_vector()

        dim = root.size

        rng = self.rng


        points = rng.randn(self.num_points, self.num_basis_vectors)
        points = points.astype(root.dtype)
        points *= self.scale

        if self.include_root:
            points[0, :] = 0.

        if not hasattr(self, 'cost_fn'):
            # Replicate the boilerplate needed to evaluate the cost function
            # =======================================
            data_specs = cost.get_data_specs(model)
            mapping = DataSpecsMapping(data_specs)
            space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
            source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

            # Build a flat tuple of Theano Variables, one for each space.
            # We want that so that if the same space/source is specified
            # more than once in data_specs, only one Theano Variable
            # is generated for it, and the corresponding value is passed
            # only once to the compiled Theano function.
            theano_args = []
            for space, source in safe_zip(space_tuple, source_tuple):
                name = '%s[%s]' % (self.__class__.__name__, source)
                arg = space.make_theano_batch(name=name,
                                              batch_size=self.batch_size)
                theano_args.append(arg)
            theano_args = tuple(theano_args)

            # Methods of `cost` need args to be passed in a format compatible
            # with data_specs
            nested_args = mapping.nest(theano_args)
            fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
            self.on_load_batch = fixed_var_descr.on_load_batch

            cost_value = cost.expr(model, nested_args,
                                        ** fixed_var_descr.fixed_vars)
            # End cargo culting
            # ======================

            print "Compiling cost function..."
            cost_fn = function(theano_args, cost_value)
            self.cost_fn = cost_fn
        else:
            cost_fn = self.cost_fn

        cost_values = np.zeros(self.num_points)


        data = list(dataset.get_batch_design(self.batch_size,
            include_labels=True))
        from pylearn2.utils.one_hot import one_hot
        data[1] = one_hot(data[1])


        if self.method == 'gaussian':
            # rng.randn draws an array of shape (dim, num_basis_vectors);
            # rng.normal(dim, self.num_basis_vectors) would only return a scalar.
            basis = rng.randn(dim, self.num_basis_vectors).astype(root.dtype)
        elif self.method == 'element':
            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                basis[rng.randint(dim), i] = 1.
        elif self.method == 'gradient':
            if not hasattr(self, 'grad_fn'):
                self.grad_fn = function(theano_args, grad(cost_value, model.get_params()))
            grad_fn = self.grad_fn

            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                ipt = list(dataset.get_batch_design(1, include_labels=True))
                label = ipt[1]
                assert label.size == 1
                label = label[0]
                one_hot = np.zeros((1, 10,),dtype='float32')
                one_hot[0, label] = 1
                ipt[1] = one_hot
                g = grad_fn(*ipt)
                basis[:,i] = np.concatenate([e.reshape(e.size) for e in g], axis=0)
        else:
            assert False

        basis /= np.sqrt(np.square(basis).sum(axis=0))

        # Orthogonalize basis
        for i in xrange(self.num_basis_vectors):
            v = basis[:,i ].copy()
            # project out every previously orthogonalized vector (j = 0 .. i-1)
            for j in xrange(i):
                u = basis[:, j].copy()
                v -= np.dot(u, v) * u
            norm = np.sqrt(np.square(v).sum())
            assert norm > 1e-4
            v /= norm
            basis[:,i] = v


        for i in xrange(self.num_points):
            print "Evaluating cost at point ", i

            point = points[i, :]
            full_point = root + np.dot(basis, point)
            model.set_param_vector(full_point)

            cost_values[i] = cost_fn(*data)
            print cost_values[i]


        from pylearn2.utils import sharedX
        import theano.tensor as T

        print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!"

        if not hasattr(self, 'fit_quad'):
            points = sharedX(points)
            #from theano import config
            #config.compute_test_value = 'raise'
            cost_values = sharedX(cost_values)
            A = sharedX(np.zeros((self.num_basis_vectors, self.num_basis_vectors)))
            if self.psd:
                mat = T.dot(A.T, A)
            else:
                mat = A
            b = sharedX(np.zeros(self.num_basis_vectors))
            c = sharedX(0.)
            half_quad = T.dot(points, mat)
            quad = (points * half_quad).sum(axis=1)
            lin = T.dot(points, b)
            pred = quad + lin + c

            from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

            mse = T.square(pred - cost_values).mean()
            mae = abs(pred - cost_values).mean()

            obj = locals()[self.fitting_cost]

            fit_quad = BatchGradientDescent(obj, params = [A, b, c],
                    max_iter = self.num_basis_vectors ** 2,
                    verbose = 3, tol = None,
                    init_alpha = None, min_init_alpha = 1e-7,
                    reset_alpha = False, conjugate = True,
                    reset_conjugate = False,
                    line_search_mode = 'exhaustive')
            self.fit_quad = fit_quad
            self.A = A
            self.b = b
            self.c = c
            self.points = points
            self.cost_values = cost_values
        else:
            self.A.set_value(.001 * np.identity(self.A.get_value().shape[0], dtype=self.A.dtype))
            self.b.set_value(self.b.get_value() * 0.)
            self.c.set_value(self.c.get_value() * 0.)
            self.points.set_value(points)
            self.cost_values.set_value(cost_values.astype(self.cost_values.dtype))

        self.fit_quad.minimize()

        print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!"

        if self.use_solver:
            if self.psd:
                Av = self.A.get_value()
                mat_v = np.dot(Av.T, Av)
            else:
                mat_v = self.A.get_value()
            bv = self.b.get_value()

            # minimize for x^T A x + b^T x + c
            # -> solve 2 A x + b = 0
            # Ax = - b / 2

            print "********** mat_v", mat_v.min(), mat_v.max()
            x, ignored_residuals, ignored_rank, ignored_singular_values = np.linalg.lstsq(mat_v, - 0.5 * bv)
            print "********** soln: ", x.min(), x.mean(), x.max()
            print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max()
            assert x.ndim == 1, x.shape
            prod = np.dot(basis, x)
            norm = np.sqrt(np.square(prod).sum())
            print "*************** Moving params by ",norm
            vector = root + prod
            model.set_param_vector(vector)

        else: # use minimizer
            if not hasattr(self, 'fit_params'):
                self.vector = sharedX(points.get_value().mean(axis=0))
                vector = self.vector
                obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

                def constrain(d):
                    assert vector in d
                    n = d[vector]
                    norm = T.sqrt(T.square(n).sum())
                    desired_norm = T.clip(norm, 0., self.max_jump_norm)
                    d[vector] = n * desired_norm / norm

                self.fit_params = BatchGradientDescent(obj, params=[vector],
                    max_iter = self.num_basis_vectors,
                    verbose = 3, tol=None,
                    param_constrainers = [constrain],
                    init_alpha = None, min_init_alpha = 1e-3,
                    reset_alpha=False, conjugate=True, reset_conjugate=False,
                    line_search_mode='exhaustive')
            else:
                self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype))

            self.fit_params.minimize()

            model.set_param_vector(root + np.dot(basis , self.vector.get_value()))
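
The comment block in the solver branch above states the closed form it relies on: for the fitted quadratic f(x) = x^T A x + b^T x + c the gradient is 2 A x + b, so the stationary point solves A x = -b/2 (lstsq is used there so a singular A still yields a least-squares solution). A quick numpy sanity check of that algebra on a random positive-definite A:

import numpy as np

rng = np.random.RandomState(0)
M = rng.randn(4, 4)
A = M.T.dot(M) + np.eye(4)        # symmetric positive definite, so the quadratic is convex
b = rng.randn(4)

x_star = np.linalg.solve(A, -0.5 * b)
grad_at_x_star = 2. * A.dot(x_star) + b
print(np.allclose(grad_at_x_star, 0.))   # True: the gradient vanishes at the solved point
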
Code example #24
0
File: bgd.py Project: poolio/pylearn
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self, cost, batch_size=None, batches_per_iter=None,
                 updates_per_batch = 10,
                 monitoring_batches=None, monitoring_dataset=None,
                 termination_criterion = None, set_batch_size = False,
                 reset_alpha = True, conjugate = False,
                 min_init_alpha = .001,
                 reset_conjugate = True, line_search_mode = None,
                 verbose_optimization=False, scale_step=1., theano_function_mode=None,
                 init_alpha=None, seed=None):
        """
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to the optimization.BatchGradientDescent's
                   max_iters parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == None


        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        self.topo = X.ndim != 2
        Y = T.matrix()
        Y.name = 'BGD_Y'

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)


        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult, dataset=self.monitoring_dataset.values()[0])


        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                batch_size=self.batch_size,
                targets=self.cost.supervised,
                num_batches=self.batches_per_iter,
                topo=self.topo,
                rng = rng)
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    stry = str(Y).replace('\n',' ')
                    mode.record.handle_line('data Y '+stry+'\n')
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, Y)
            else:
                args = [ data ]
                X = data
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, None)
            self.before_step(model)
            self.optimizer.minimize(*args)
            self.after_step(model)
            model.monitor.report_batch( X.shape[0] )

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)

    def before_step(self, model):
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step * param.get_value()
                param.set_value(value)
Code example #25
0
File: bgd.py Project: sdmassey27/pylearn2
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""
    def __init__(self, cost=None, batch_size=None, batches_per_iter=None,
                 updates_per_batch = 10,
                 monitoring_batches=None, monitoring_dataset=None,
                 termination_criterion = None, set_batch_size = False,
                 reset_alpha = True, conjugate = False,
                 min_init_alpha = .001,
                 reset_conjugate = True, line_search_mode = None,
                 verbose_optimization=False, scale_step=1., theano_function_mode=None,
                 init_alpha=None, seed=None):
        """
        cost: a pylearn2 Cost, or None, in which case model.get_default_cost()
                will be used
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
                    still iterates over minibatches of data. The difference
                    is that this class uses partial line searches to choose
                    the step size along each gradient direction, and can do
                    repeated updates on the same batch. The assumption is
                    that you use big enough minibatches with this algorithm that
                    a large step size will generalize reasonably well to other
                    minibatches.
                    To implement true Batch Gradient Descent, set the batch_size
                    to the total number of examples available.
                    If batch_size is None, it will revert to the model's force_batch_size
                    attribute.
        set_batch_size: If True, BGD will attempt to override the model's force_batch_size
                attribute by calling set_batch_size on it.
        updates_per_batch: Passed through to optimization.BatchGradientDescent's
                   max_iter parameter
        reset_alpha, conjugate, reset_conjugate: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        monitoring_dataset: A Dataset or a dictionary mapping string dataset names to Datasets
        """

        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify the model
        based on the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train, loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
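        # Build symbolic inputs. A 4D (topological) input batch puts BGD in
        # topo mode, which assumes ('b', 0, 1, 'c') axes; a 2D batch is a
        # flat design matrix.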
        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        self.topo = X.ndim != 2
        if self.topo:
            assert self.model.get_input_space().axes == ('b', 0, 1, 'c')
        Y = T.matrix()
        Y.name = 'BGD_Y'
        if config.compute_test_value != 'off':
            X.tag.test_value = self.model.get_input_space().get_origin_batch(self.batch_size).astype(X.dtype)
            Y_batch = self.model.get_output_space().get_origin_batch(self.batch_size).astype(Y.dtype)
            assert Y_batch.ndim == 2
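            # Fill the test-value targets with a cycling one-hot pattern so
            # every row has a single active class.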
            for i in xrange(Y_batch.shape[0]):
                Y_batch[i, i % Y_batch.shape[1]] = 1
            Y.tag.test_value = Y_batch

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)


        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        # TODO: replace the following if block with a call to monitor.setup
        # (it does the same thing, which would reduce code duplication);
        # some BGD-specific channels like ave_step_size may still need to be added here manually
        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult, dataset=self.monitoring_dataset.values()[0])


        self.first = True
        self.bSetup = True

    def train(self, dataset):
        assert self.bSetup
        model = self.model
        batch_size = self.batch_size

        if self.topo:
            get_data = dataset.get_batch_topo
        else:
            get_data = dataset.get_batch_design

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
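        # shuffled_sequential is a stochastic iteration mode, so the iterator
        # needs an RNG; a deterministic mode would get rng=None below.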
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                batch_size=self.batch_size,
                targets=self.cost.supervised,
                num_batches=self.batches_per_iter,
                topo=self.topo,
                rng = rng)
        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    stry = str(Y).replace('\n',' ')
                    mode.record.handle_line('data Y '+stry+'\n')
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, Y)
            else:
                args = [ data ]
                X = data
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, None)
            self.before_step(model)
            self.optimizer.minimize(*args)
            self.after_step(model)
            model.monitor.report_batch( X.shape[0] )

    def continue_learning(self, model):
        if self.termination_criterion is None:
            return True
        else:
            rval = self.termination_criterion.continue_learning(self.model)
            assert rval in [True, False, 0, 1]
            return rval

    def before_step(self, model):
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        if self.scale_step != 1:
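            # Blend the values saved in before_step with the optimizer's result;
            # scale_step < 1 damps each BGD step toward the previous parameters.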
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step * param.get_value()
                param.set_value(value)
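
A rough sketch of the driver loop these methods imply; in pylearn2 this role is normally played by the Train object, and model and train_set below stand in for an already-built Model and Dataset (illustrative names and values, not part of the example above):

algorithm = BGD(batch_size=1000, updates_per_batch=10, conjugate=True)
algorithm.setup(model, train_set)
while algorithm.continue_learning(model):   # always True while termination_criterion is None
    algorithm.train(dataset=train_set)

With termination_criterion left at None this loop never stops on its own, so in practice a criterion from pylearn2.termination_criteria is supplied, and the Train object also runs the model's monitor between passes.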