Ejemplo n.º 1
0
 def __init__(self, inputs, outputs = None, updates = None):
     """
     Build a batch-averaging stand-in for a theano function.

     The constructor accepts the same `inputs`, `outputs` and `updates`
     expressions one would hand to a theano function. The difference is in
     how the object is called: __call__ is given batches (each batch being a
     list holding the inputs for that batch) rather than the input variables
     directly. The value returned is the average of the function across the
     batches, weighted by batch size, and the average of all updates is
     applied as well.

     Unlike theano.function, shared variables are accepted as inputs and may
     be assigned to; they keep the value of the last batch once __call__
     returns.
     """
     # Weight of one batch: its size (leading axis of the first input)
     # divided by the total number of examples, which is supplied at call
     # time through the extra scalar input `total_examples`.
     batch_size = T.cast(inputs[0].shape[0], 'float32')
     total_examples = T.scalar()
     batch_fraction = batch_size / total_examples

     self.has_updates = updates is not None
     transformed_updates = OrderedDict()
     if self.has_updates:
         # `_clear` sets every updated variable to 0 * var; the transformed
         # updates then add each batch's weighted update onto the variable.
         zeroing = [(var, 0. * var) for var in updates]
         self._clear = function([], updates=zeroing)
         for var in updates:
             transformed_updates[var] = var + batch_fraction * updates[var]

     # Inputs exposing `get_value` are treated as shared variables.
     self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
     true_inputs = self._true_inputs(inputs)
     self._shared = self._shared_inputs(inputs)

     if outputs is not None:
         if isinstance(outputs, list):
             outputs = [output * batch_fraction for output in outputs]
         else:
             outputs = [outputs * batch_fraction]

     self._func = function(true_inputs + [total_examples],
                           outputs=outputs,
                           updates=transformed_updates)
Ejemplo n.º 2
0
 def __init__(self, inputs, outputs=None, updates=None):
     """
     Compile the theano functions backing this batch-weighted wrapper.

     `inputs` are the symbolic inputs; the leading axis of the first one
     gives the batch size. `outputs` may be None, a single expression or a
     list of expressions. `updates`, when given, maps each variable to its
     update expression. Outputs and updates are both scaled by
     batch_size / total_examples, where `total_examples` is an extra scalar
     input appended after the result of `self._true_inputs(inputs)`.
     """
     batch_size = T.cast(inputs[0].shape[0], 'float32')
     total_examples = T.scalar()
     # Share of the whole data set represented by the current batch.
     weight = batch_size / total_examples

     self.has_updates = updates is not None
     transformed_updates = OrderedDict()
     if self.has_updates:
         # Compiled function that multiplies every updated variable by zero.
         reset_pairs = [(var, 0. * var) for var in updates]
         self._clear = function([], updates=reset_pairs)
         for var in updates:
             transformed_updates[var] = var + weight * updates[var]

     # True for every input that exposes `get_value`.
     self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
     true_inputs = self._true_inputs(inputs)
     self._shared = self._shared_inputs(inputs)

     if outputs is not None:
         if isinstance(outputs, list):
             outputs = [output * weight for output in outputs]
         else:
             outputs = [outputs * weight]

     self._func = function(true_inputs + [total_examples],
                           outputs=outputs,
                           updates=transformed_updates)
Ejemplo n.º 3
0
 def get_fixed_var_descr(self, model, X, Y=None):
     """
     Build a FixedVarDescr exposing `sup_counter`.

     `fixed_vars` maps 'sup_aux_var' to `sup_counter`, and the single
     `on_load_batch` entry is a compiled function of `[X, Y]` whose only
     effect is to add one to `sup_counter`.
     """
     increment = function([X, Y], updates=[(sup_counter, sup_counter + 1)])
     descr = FixedVarDescr()
     descr.fixed_vars = {'sup_aux_var': sup_counter}
     descr.on_load_batch = [increment]
     return descr
Ejemplo n.º 4
0
def test_vector_to_conv_c01b_invertible():
    """
    Round-trip check of format_as between Conv2DSpace and VectorSpace.

    Uses the ('c', 0, 1, 'b') axis format and verifies that converting a
    batch to the other space and back reproduces the original values, in
    both directions.
    """
    rng = np.random.RandomState([2013, 5, 1])
    batch_size, rows, cols, channels = 3, 4, 5, 2

    conv = Conv2DSpace([rows, cols], channels=channels, axes=('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    # conv -> vec -> conv
    conv_in = conv.make_batch_theano()
    conv_as_vec = conv.format_as(conv_in, vec)
    conv_back = vec.format_as(conv_as_vec, conv)

    # vec -> conv -> vec
    vec_in = vec.make_batch_theano()
    vec_as_conv = vec.format_as(vec_in, conv)
    vec_back = conv.format_as(vec_as_conv, vec)

    round_trip = function([conv_in, vec_in], [conv_back, vec_back])

    # Random numeric batches with the shape and dtype each space expects.
    conv_shape = conv.get_origin_batch(batch_size).shape
    conv_value = rng.randn(*conv_shape).astype(conv_in.dtype)
    vec_shape = vec.get_origin_batch(batch_size).shape
    vec_value = rng.randn(*vec_shape).astype(vec_in.dtype)

    conv_result, vec_result = round_trip(conv_value, vec_value)

    np.testing.assert_allclose(conv_result, conv_value)
    np.testing.assert_allclose(vec_result, vec_value)
Ejemplo n.º 5
0
def test_vector_to_conv_c01b_invertible():
    """
    Check that Conv2DSpace <-> VectorSpace formatting is invertible.

    The Conv2DSpace uses the ('c', 0, 1, 'b') axis format. A batch is sent
    through format_as to the other space and back, starting from each of
    the two spaces, and the result must match the input.
    """
    rng = np.random.RandomState([2013, 5, 1])

    batch_size = 3
    rows, cols = 4, 5
    channels = 2

    conv = Conv2DSpace([rows, cols], channels=channels,
                       axes=('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    def random_batch(space, symbolic):
        # Random values shaped like the space's origin batch, cast to the
        # dtype of the matching symbolic input.
        shape = space.get_origin_batch(batch_size).shape
        return rng.randn(*shape).astype(symbolic.dtype)

    X = conv.make_batch_theano()
    Z = vec.format_as(conv.format_as(X, vec), conv)

    A = vec.make_batch_theano()
    C = conv.format_as(vec.format_as(A, conv), vec)

    f = function([X, A], [Z, C])

    # The conv batch is drawn before the vector batch.
    X_value = random_batch(conv, X)
    A_value = random_batch(vec, A)

    Z_value, C_value = f(X_value, A_value)

    np.testing.assert_allclose(Z_value, X_value)
    np.testing.assert_allclose(C_value, A_value)
Ejemplo n.º 6
0
    def get_fixed_var_descr(self, model, X, Y):
        """
        Build the FixedVarDescr holding the drop masks.

        Creates shared variables for the masks, registers them in
        `fixed_vars`, and installs a single `on_load_batch` function of
        `[X, Y]` whose updates overwrite those shared variables with newly
        generated values.

        Parameters
        ----------
        model : object
            Must provide `batch_size`, `get_input_space()` and
            `inference_procedure`; `hidden_layers` is also read when the
            inference procedure has a `V_dropout` attribute.
        X : symbolic variable
            Input batch, passed to `self.mask_gen`.
        Y : symbolic variable
            Target batch. Must not be None, even when `self.supervised` is
            false.

        Returns
        -------
        rval : FixedVarDescr
            `fixed_vars` contains 'drop_mask' and, when `self.supervised`
            is true, 'drop_mask_Y'.
        """

        assert Y is not None

        batch_size = model.batch_size

        # Shared storage for the input mask, shaped like one origin batch of
        # the model's input space.
        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = 'drop_mask'

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs=[X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space = X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size,))
            drop_mask_Y.name = 'drop_mask_Y'
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            # NOTE(review): assumes FixedVarDescr() already provides a
            # `fixed_vars` mapping -- confirm.
            rval.fixed_vars['drop_mask_Y'] =  drop_mask_Y
        if self.mask_gen.sync_channels:
            # The generated mask has one axis fewer than the stored mask;
            # adding it to zeros_like(X) with a broadcastable trailing axis
            # expands it to X's shape.
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = 'raw_update_X'
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = 'zeros_like_X'
            # The dimshuffle pattern is hard-coded for a 3-axis update_X.
            update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
            update_X.name = 'update_X'
        updates[drop_mask_X] = update_X

        rval.fixed_vars['drop_mask'] = drop_mask_X

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            # Fixed seed (2012+11+20). Each update below draws a 0/1
            # binomial sample and divides it by its inclusion probability.
            theano_rng = MRG_RandomStreams(2012+11+20)
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
            # When the last hidden layer's type name contains "Softmax", the
            # final H_dropout entry is updated with include_prob_Y instead.
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob

        # One compiled function applies all of the collected updates.
        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval
Ejemplo n.º 7
0
    def get_fixed_var_descr(self, model, X, Y):
        """
        Build the FixedVarDescr holding the drop masks.

        Creates shared variables for the masks, registers them in
        `fixed_vars`, and installs a single `on_load_batch` function of
        `[X, Y]` whose updates overwrite those shared variables with newly
        generated values.

        Parameters
        ----------
        model : object
            Must provide `batch_size`, `get_input_space()` and
            `inference_procedure`; `hidden_layers` is also read when the
            inference procedure has a `V_dropout` attribute.
        X : symbolic variable
            Input batch, passed to `self.mask_gen`.
        Y : symbolic variable
            Target batch. Must not be None, even when `self.supervised` is
            false.

        Returns
        -------
        rval : FixedVarDescr
            `fixed_vars` contains 'drop_mask' and, when `self.supervised`
            is true, 'drop_mask_Y'.
        """

        assert Y is not None

        batch_size = model.batch_size

        # Shared storage for the input mask, shaped like one origin batch of
        # the model's input space.
        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = 'drop_mask'

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs=[X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space = X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size,))
            drop_mask_Y.name = 'drop_mask_Y'
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            # NOTE(review): assumes FixedVarDescr() already provides a
            # `fixed_vars` mapping -- confirm.
            rval.fixed_vars['drop_mask_Y'] =  drop_mask_Y
        if self.mask_gen.sync_channels:
            # The generated mask has one axis fewer than the stored mask;
            # adding it to zeros_like(X) with a broadcastable trailing axis
            # expands it to X's shape.
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = 'raw_update_X'
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = 'zeros_like_X'
            # The dimshuffle pattern is hard-coded for a 3-axis update_X.
            update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
            update_X.name = 'update_X'
        updates[drop_mask_X] = update_X

        rval.fixed_vars['drop_mask'] = drop_mask_X

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            # Fixed seed (2012+11+20). Each update below draws a 0/1
            # binomial sample and divides it by its inclusion probability.
            theano_rng = MRG_RandomStreams(2012+11+20)
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
            # When the last hidden layer's type name contains "Softmax", the
            # final H_dropout entry is updated with include_prob_Y instead.
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob

        # One compiled function applies all of the collected updates.
        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval
Ejemplo n.º 8
0
    def __init__(self, dataset, batch_size, num_batches, topo, targets, rng):
        """
        Compile a sampler that draws batches from `dataset.s3c`.

        Parameters
        ----------
        dataset : object
            Must expose an `s3c` attribute with a `random_design_matrix`
            method; that method supplies the symbolic samples.
        batch_size : int
            Number of examples per batch. Required.
        num_batches : int
            Number of batches. Required; `num_examples` is set to
            `num_batches * batch_size`.
        topo : object
            Stored on the instance; not otherwise used here.
        targets : bool
            Passed to `random_design_matrix` as `return_all`. When true the
            result must be three non-None values, otherwise a single
            Variable.
        rng : numpy.random.RandomState, list or None
            Source of the seed for the Theano random stream. None selects
            the fixed seed [2013, 4, 22]; a list is used as the seed of a
            new RandomState.

        Raises
        ------
        ValueError
            If `batch_size` or `num_batches` is None.
        """
        # Reject unusable arguments first, so that a bad call fails before
        # the RNG is advanced and before a Theano function is compiled.
        if batch_size is None:
            raise ValueError("must specify batch size, there is infinite data.")
        if num_batches is None:
            raise ValueError("must specify a number of batches, there is infinite 'data'")

        if rng is None:
            rng = np.random.RandomState([2013, 4, 22])
        if isinstance(rng, list):
            rng = np.random.RandomState(rng)

        # Store every constructor argument (with `rng` already normalized)
        # as an attribute. No other local may be defined above this line,
        # or it would be stored on the instance too.
        self.__dict__.update(locals())
        del self.self

        theano_rng = MRG_RandomStreams(rng.randint(2 ** 16))

        samples = dataset.s3c.random_design_matrix(batch_size, theano_rng = theano_rng,
                            return_all = targets)
        assert samples is not None
        if targets:
            assert len(samples) == 3
            assert not any(sample is None for sample in samples)
        else:
            assert isinstance(samples, Variable)

        warnings.warn("This is recompiled every time we make a new iterator, just compile it once per iteration mode. Keep in mind the rng is part of the mode though-- the monitor wants to see the same stuff every time.")
        self.f = function([], samples)

        self.num_examples = num_batches * batch_size
Ejemplo n.º 9
0
        def get_fixed_var_descr(self, model, X, Y, **kwargs):
            """
            Build a FixedVarDescr exposing `unsup_counter`.

            The `Y` argument is ignored: a new matrix variable takes its
            place as the second input of the compiled function, whose only
            effect is to add one to `unsup_counter`.
            """
            # The caller's Y is deliberately replaced by a new placeholder.
            Y = T.matrix()
            increment = function([X, Y],
                                 updates=[(unsup_counter, unsup_counter + 1)])

            descr = FixedVarDescr()
            descr.fixed_vars = {'unsup_aux_var': unsup_counter}
            descr.on_load_batch = [increment]
            return descr
Ejemplo n.º 10
0
    def __init__(self, inputs, outputs=None, updates=None):
        """
        Build a batch-averaging stand-in for a theano function.

        The same expressions that would be given to a theano function are
        given here. When calling the object, however, batches are passed
        instead of the input variables themselves; each batch is a list
        containing the inputs for that batch. The result is the average of
        the function over the batches, weighted by batch size, and the
        average of all updates is applied too.

        In contrast to theano.function, an input may be a shared variable
        and this object may assign to it. Such shared variables hold the
        value of the last batch once __call__ returns.

        Parameters
        ----------
        inputs : list
            Symbolic inputs. The leading axis of the first one gives the
            batch size; inputs exposing `get_value` are treated as shared
            variables.
        outputs : expression or list of expressions, optional
            Output expression(s). Anything that is not a list is wrapped
            into a one-element list.
        updates : mapping, optional
            Maps each variable to its update expression.
        """
        batch_size = T.cast(inputs[0].shape[0], 'float32')
        total_examples = T.scalar()
        # Share of the whole data set represented by the current batch.
        batch_fraction = batch_size / total_examples

        self.has_updates = updates is not None
        transformed_updates = OrderedDict()
        if self.has_updates:
            # `_clear` sets every updated variable to 0 * var; the
            # transformed updates add the weighted update onto the variable.
            zeroing = [(var, 0. * var) for var in updates]
            self._clear = function([], updates=zeroing)
            for var in updates:
                transformed_updates[var] = (var +
                                            batch_fraction * updates[var])

        self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
        true_inputs = self._true_inputs(inputs)
        self._shared = self._shared_inputs(inputs)

        if outputs is not None:
            if isinstance(outputs, list):
                outputs = [output * batch_fraction for output in outputs]
            else:
                outputs = [outputs * batch_fraction]

        self._func = function(true_inputs + [total_examples],
                              outputs=outputs,
                              updates=transformed_updates)
Ejemplo n.º 11
0
 def __init__(self, inputs, outputs = None, updates = None):
     """
     Compile the theano functions backing this batch-weighted wrapper.

     Every output and every update expression is multiplied by
     batch_size / total_examples. The batch size is the leading axis of
     the first input, and `total_examples` is a scalar input appended
     after the result of `self._true_inputs(inputs)`.
     """
     batch_size = T.cast(inputs[0].shape[0], 'float32')
     total_examples = T.scalar()
     scale = batch_size / total_examples

     self.has_updates = updates is not None
     transformed_updates = OrderedDict()
     if self.has_updates:
         # Compiled function that multiplies every updated variable by zero.
         self._clear = function([],
                                updates=[(var, 0. * var) for var in updates])
         for var in updates:
             transformed_updates[var] = var + scale * updates[var]

     # True for every input that exposes `get_value`.
     self._shared_mask = [hasattr(elem, 'get_value') for elem in inputs]
     true_inputs = self._true_inputs(inputs)
     self._shared = self._shared_inputs(inputs)

     if outputs is not None:
         if isinstance(outputs, list):
             outputs = [output * scale for output in outputs]
         else:
             outputs = [outputs * scale]

     self._func = function(true_inputs + [total_examples],
                           outputs=outputs,
                           updates=transformed_updates)
Ejemplo n.º 12
0
 def enforce_constraints(self):
     """
     Apply every constraint encoded by self.modify_updates.

     Starts from updates that map each parameter to itself, lets
     self.modify_updates alter them, then compiles and runs a function
     applying the result.
     """
     params = self.get_params()
     identity_updates = OrderedDict(izip_no_length_check(params, params))
     self.modify_updates(identity_updates)
     apply_constraints = function([], updates=identity_updates)
     apply_constraints()
Ejemplo n.º 13
0
 def enforce_constraints(self):
     """
     Apply every constraint encoded by self.censor_updates.

     Starts from updates that map each parameter to itself, lets
     self.censor_updates alter them, then compiles and runs a function
     applying the result.
     """
     params = self.get_params()
     identity_updates = OrderedDict(izip_no_length_check(params, params))
     self.censor_updates(identity_updates)
     apply_constraints = function([], updates=identity_updates)
     apply_constraints()
Ejemplo n.º 14
0
        def get_fixed_var_descr(self, model, X, Y, **kwargs):
            """
            Build a FixedVarDescr exposing `unsup_counter`.

            The incoming `Y` is discarded and replaced by a new matrix
            variable, which becomes the second input of a compiled function
            that adds one to `unsup_counter`.
            """
            descr = FixedVarDescr()
            descr.fixed_vars = {'unsup_aux_var': unsup_counter}

            # Replace the caller's Y with a new symbolic matrix.
            Y = T.matrix()
            counter_step = [(unsup_counter, unsup_counter + 1)]
            descr.on_load_batch = [function([X, Y], updates=counter_step)]

            return descr
Ejemplo n.º 15
0
        def get_fixed_var_descr(self, model, data):
            """
            Build a FixedVarDescr exposing `sup_counter`.

            `data` is validated against the space of this object's data
            specs. The single `on_load_batch` callback ignores its argument
            and adds one to `sup_counter`.
            """
            self.get_data_specs(model)[0].validate(data)

            increment = function([], updates=[(sup_counter, sup_counter + 1)])

            def on_load(data):
                # The batch is not needed: only the counter is advanced.
                increment()

            descr = FixedVarDescr()
            descr.fixed_vars = {'sup_aux_var': sup_counter}
            descr.on_load_batch = [on_load]
            return descr
Ejemplo n.º 16
0
def get_obj_func(model):
    """
    Compile a function of (X, Y) returning the misclassification rate.

    The inputs are masked with `mask_gen`, and the model's last inference
    output is compared with the targets through their argmax along axis 1.
    MLP_Wrapper models go through `mf_missing`; any other model goes
    through `inference_procedure.do_inpainting`.
    """
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    true_labels = T.argmax(Y, axis=1)
    drop_mask = mask_gen(X, X_space=model.get_input_space())

    if isinstance(model, MLP_Wrapper):
        states = model.mf_missing(X, drop_mask)
    else:
        # Targets are passed as zeros with an all-ones target mask.
        target_mask = T.ones_like(T.cast(true_labels, 'float32'))
        states = model.inference_procedure.do_inpainting(
            X,
            Y=T.zeros_like(Y),
            drop_mask=drop_mask,
            drop_mask_Y=target_mask)

    predicted_labels = T.argmax(states[-1], axis=1)
    error_rate = T.neq(true_labels, predicted_labels).mean()
    return function([X, Y], error_rate)
Ejemplo n.º 17
0
        def get_fixed_var_descr(self, model, data):
            """
            Build a FixedVarDescr exposing `sup_counter`.

            Validates `data` against the space of this object's data specs
            and registers one `on_load_batch` callback that adds one to
            `sup_counter`, whatever batch it is given.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            descr = FixedVarDescr()
            descr.fixed_vars = {'sup_aux_var': sup_counter}

            counter_step = [(sup_counter, sup_counter + 1)]
            increment = function([], updates=counter_step)

            def on_load(data):
                # The batch itself is unused.
                increment()

            descr.on_load_batch = [on_load]
            return descr
Ejemplo n.º 18
0
    def get_weights_topo(self):
        """
        Return the weights as a numeric batch in ('b', 0, 1, 'c') format.

        The transformer's single parameter is transposed, reshaped to
        (dim, rows, cols, channels) and converted from the input space's
        axis order.

        Raises
        ------
        NotImplementedError
            If the input space is not a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()

        # The transformer must have exactly one parameter.
        (weights,) = self.transformer.get_params()

        topo_shape = (self.dim, space.shape[0], space.shape[1],
                      space.nchannels)
        topo = Conv2DSpace.convert(weights.T.reshape(topo_shape),
                                   space.axes, ('b', 0, 1, 'c'))

        # Compile and immediately evaluate to get a numeric value.
        return function([], topo)()
Ejemplo n.º 19
0
        def get_fixed_var_descr(self, model, data, **kwargs):
            """
            Build a FixedVarDescr exposing `unsup_counter`.

            `data` is validated against the space of this object's data
            specs and flattened through a DataSpecsMapping. The single
            `on_load_batch` callback ignores its batch and adds one to
            `unsup_counter`.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            mapping = DataSpecsMapping(specs)
            # The flattened tuple is not used afterwards; the call is kept
            # so that any error it raises still surfaces here.
            data_tuple = mapping.flatten(data, return_tuple=True)
            increment = function([],
                                 updates=[(unsup_counter, unsup_counter + 1)])

            # Defaults bind the current values of `mapping` and the
            # compiled function at definition time.
            def on_load(batch, mapping=mapping, theano_func=increment):
                return theano_func()

            descr = FixedVarDescr()
            descr.fixed_vars = {'unsup_aux_var': unsup_counter}
            descr.on_load_batch = [on_load]

            return descr
    def get_weights_topo(self):
        """
        Return the weights as a numeric batch in ('b', 0, 1, 'c') format.

        The transformer's single parameter is transposed, reshaped to
        (dim, rows, cols, num_channels) and converted from the input
        space's axis order.

        Raises
        ------
        NotImplementedError
            If the input space is not a Conv2DSpace.
        """
        space = self.input_space
        if not isinstance(space, Conv2DSpace):
            raise NotImplementedError()

        # The transformer must have exactly one parameter.
        (weights,) = self.transformer.get_params()

        topo_shape = (self.dim, space.shape[0], space.shape[1],
                      space.num_channels)
        reshaped = weights.T.reshape(topo_shape)
        topo = Conv2DSpace.convert(reshaped, space.axes, ('b', 0, 1, 'c'))

        # Compile and immediately evaluate to get a numeric value.
        return function([], topo)()
Ejemplo n.º 21
0
def get_obj_func(model):
    """
    Compile a function of (X, Y) returning the misclassification rate.

    A drop mask from `mask_gen` is applied to the inputs. The argmax along
    axis 1 of the model's last inference output is compared with the
    argmax of the targets, and the mean disagreement is returned.
    """
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    y = T.argmax(Y, axis=1)
    drop_mask = mask_gen(X, X_space=model.get_input_space())

    if isinstance(model, MLP_Wrapper):
        inferred = model.mf_missing(X, drop_mask)
    else:
        # Targets are passed as zeros with an all-ones target mask.
        inpaint_kwargs = dict(Y=T.zeros_like(Y),
                              drop_mask=drop_mask,
                              drop_mask_Y=T.ones_like(T.cast(y, 'float32')))
        inferred = model.inference_procedure.do_inpainting(X,
                                                           **inpaint_kwargs)

    y_hat = T.argmax(inferred[-1], axis=1)
    return function([X, Y], T.neq(y, y_hat).mean())
Ejemplo n.º 22
0
        def get_fixed_var_descr(self, model, data, **kwargs):
            """
            Build a FixedVarDescr exposing `unsup_counter`.

            After validating `data` against the space of this object's data
            specs and flattening it, registers one `on_load_batch` callback
            that adds one to `unsup_counter` and ignores the batch it is
            given.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            descr = FixedVarDescr()
            descr.fixed_vars = {'unsup_aux_var': unsup_counter}

            mapping = DataSpecsMapping(specs)
            # Result unused below; the call is kept so that any error it
            # raises still surfaces here.
            data_tuple = mapping.flatten(data, return_tuple=True)
            counter_step = [(unsup_counter, unsup_counter + 1)]
            theano_func = function([], updates=counter_step)

            # Defaults bind `mapping` and `theano_func` at definition time.
            def on_load(batch, mapping=mapping, theano_func=theano_func):
                return theano_func()

            descr.on_load_batch = [on_load]

            return descr
Ejemplo n.º 23
0
        def get_fixed_var_descr(self, model, data):
            """
            Build a FixedVarDescr exposing `sup_counter`.

            The descriptor records this object's data specs. Its single
            `on_load_batch` callback takes a batch formatted like those
            specs, flattens it and calls a compiled function that adds one
            to `sup_counter`.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            # `function` needs its inputs as a flat tuple.
            mapping = DataSpecsMapping(specs)
            flat_data = mapping.flatten(data, return_tuple=True)
            increment = function(flat_data,
                                 updates=[(sup_counter, sup_counter + 1)])

            # The callback receives numeric data laid out like
            # `rval.data_specs`, so it has to flatten the batch as well.
            # The defaults bind `mapping` and the compiled function to
            # their values at definition time.
            def on_load(batch, mapping=mapping, theano_func=increment):
                flat_batch = mapping.flatten(batch, return_tuple=True)
                return theano_func(*flat_batch)

            rval = FixedVarDescr()
            rval.fixed_vars = {'sup_aux_var': sup_counter}
            rval.data_specs = specs
            rval.on_load_batch = [on_load]
            return rval
Ejemplo n.º 24
0
        def get_fixed_var_descr(self, model, data, **kwargs):
            """
            Build a FixedVarDescr exposing `unsup_counter`.

            The descriptor records this object's data specs. Its single
            `on_load_batch` callback takes a batch formatted like those
            specs, flattens it and calls a compiled function that adds one
            to `unsup_counter`.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            # `function` needs a flat, non-redundant tuple of inputs.
            mapping = DataSpecsMapping(specs)
            data_tuple = mapping.flatten(data, return_tuple=True)
            increment = function(data_tuple,
                                 updates=[(unsup_counter, unsup_counter + 1)])

            # The callback receives numeric data laid out like
            # `rval.data_specs`, so it has to flatten the batch as well.
            # The defaults bind `mapping` and the compiled function to
            # their values at definition time.
            def on_load(batch, mapping=mapping, theano_func=increment):
                flat_batch = mapping.flatten(batch, return_tuple=True)
                return theano_func(*flat_batch)

            rval = FixedVarDescr()
            rval.fixed_vars = {'unsup_aux_var': unsup_counter}
            rval.data_specs = specs
            rval.on_load_batch = [on_load]

            return rval
Ejemplo n.º 25
0
        def get_fixed_var_descr(self, model, data):
            """
            Build a FixedVarDescr exposing `sup_counter`.

            `data` is validated against the space of this object's data
            specs, which are stored on the descriptor. The single
            `on_load_batch` callback flattens the batch it receives and
            runs a compiled function that adds one to `sup_counter`.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            rval = FixedVarDescr()
            rval.fixed_vars = {'sup_aux_var': sup_counter}
            rval.data_specs = specs

            # Inputs to `function` must be a flat tuple.
            mapping = DataSpecsMapping(specs)
            flat_data = mapping.flatten(data, return_tuple=True)
            counter_step = [(sup_counter, sup_counter + 1)]
            theano_func = function(flat_data, updates=counter_step)

            # Batches arrive formatted as `rval.data_specs` and are
            # flattened before the call. Default arguments bind `mapping`
            # and `theano_func` to their values at definition time.
            def on_load(batch, mapping=mapping, theano_func=theano_func):
                return theano_func(*mapping.flatten(batch,
                                                    return_tuple=True))

            rval.on_load_batch = [on_load]
            return rval
Ejemplo n.º 26
0
    def setup(self, model, dataset, algorithm):
        """
        Record the model's parameter vector and compile its cost function.

        Stores `model.get_param_vector()` as `self.origin`, the cost's
        `on_load_batch` callbacks as `self.on_load_batch`, and a compiled
        function evaluating the algorithm's cost as `self.cost_fn`.
        `dataset` is not used here.
        """
        self.origin = model.get_param_vector()

        cost = algorithm.cost

        # --- Boilerplate required to evaluate the cost expression ---
        data_specs = cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # One Theano variable per flattened (space, source) pair, so that a
        # space/source listed several times in data_specs gets a single
        # variable and its value is passed only once to the compiled
        # function.
        owner = self.__class__.__name__
        theano_args = tuple(
            space.make_theano_batch(name='%s[%s]' % (owner, source),
                                    batch_size=self.batch_size)
            for space, source in safe_zip(space_tuple, source_tuple))

        # The cost's methods expect arguments nested like data_specs.
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = cost.expr(model, nested_args,
                               **fixed_var_descr.fixed_vars)
        # --- End of boilerplate ---

        print("Compiling cost function...")
        self.cost_fn = function(theano_args, cost_value)
Ejemplo n.º 27
0
        def get_fixed_var_descr(self, model, data, **kwargs):
            """
            Build a FixedVarDescr exposing `unsup_counter`.

            `data` is validated against the space of this object's data
            specs, which are stored on the descriptor. The single
            `on_load_batch` callback flattens the batch it receives and
            runs a compiled function that adds one to `unsup_counter`.
            """
            specs = self.get_data_specs(model)
            specs[0].validate(data)

            rval = FixedVarDescr()
            rval.fixed_vars = {'unsup_aux_var': unsup_counter}
            rval.data_specs = specs

            # `function` needs a flat, non-redundant tuple of inputs.
            mapping = DataSpecsMapping(specs)
            data_tuple = mapping.flatten(data, return_tuple=True)
            counter_step = [(unsup_counter, unsup_counter + 1)]
            theano_func = function(data_tuple, updates=counter_step)

            # Batches arrive formatted as `rval.data_specs` and are
            # flattened before the call. Default arguments bind `mapping`
            # and `theano_func` to their values at definition time.
            def on_load(batch, mapping=mapping, theano_func=theano_func):
                return theano_func(*mapping.flatten(batch,
                                                    return_tuple=True))

            rval.on_load_batch = [on_load]

            return rval
Ejemplo n.º 28
0
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.

        Sets `self.prereqs`, `self.begin_record_entry` and `self.accum`
        (one accumulation function per dataset), and registers every
        attribute name created during the call through
        `self.register_names_to_del`.
        """
        self._dirty = False

        # Snapshot of the attribute names, compared at the end to find the
        # names this call added.
        init_names = dir(self)
        # Collect, per dataset, the prerequisites of all channels, without
        # duplicates and in first-seen order.
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # begin_record_entry resets every channel's shared accumulator to 0.
        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(inputs=[], updates=updates, mode=self.theano_function_mode,
                    name = 'Monitor.begin_record_entry')
        # NOTE(review): these two assignments are overwritten by the
        # per-dataset lists below before being read.
        updates = OrderedDict()
        givens = OrderedDict()
        #Get the appropriate kind of theano variable to represent the data the model
        #acts on
        X = self.model.get_input_space().make_theano_batch(name = "monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        # Y only exists when labels are required.
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(name = "monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including channel '+key+'\n')
            log.info('\t%s' % key)
        # One iterator per dataset, used here only for its num_examples.
        it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size)]
        num_examples = [np.cast[config.floatX](float(i.num_examples)) for i in it]
        # One givens dict and one updates dict per dataset.
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            n = num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                g[channel.graph_input[0]] = X
                # NOTE(review): Y is unbound here when require_label is
                # false -- confirm that this combination cannot occur.
                g[channel.graph_input[1]] = Y
            else:
                g[channel.graph_input] = X
            if n == 0:
                raise ValueError("Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            # Each batch adds its channel value weighted by
            # (batch size / number of examples in the dataset).
            val = channel.val * T.cast(X.shape[batch_index], config.floatX) / n
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' \
                                + key.name + ' has dtype ' + key.dtype + \
                                ' but is driven by an expression with type ' + \
                                up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key '+var_descriptor(elem)+'\n')
                        mode.record.handle_line('g val '+var_descriptor(g[elem])+'\n')
                    for elem in u:
                        mode.record.handle_line('u key '+var_descriptor(elem)+'\n')
                        mode.record.handle_line('u val '+var_descriptor(u[elem])+'\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they might just monitor the model
                    # parameters, or some shared variable updated by the training algorithm, so we
                    # need to ignore the unused input error
                    # NOTE(review): no on_unused_input argument is passed
                    # below; presumably the `function` in scope handles
                    # unused inputs itself -- confirm.
                    self.accum.append(function([X, Y], givens=g, updates=u, mode=self.theano_function_mode,
                            name=function_name))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling unsupervised accum\n')
                    self.accum.append(function([X], givens=g, updates=u, mode=self.theano_function_mode,
                            name=function_name))
            # NOTE(review): `mode` here is whatever the loops above last
            # assigned; it is unbound if there are no channels and no
            # datasets but self.accum is non-empty (which cannot happen
            # with the code above, as accum is then empty too).
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output '+var_descriptor(elem)+'\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        # Attribute names added during this call are registered through
        # register_names_to_del.
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])
Ejemplo n.º 29
0
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and the
        channel definitions have changed since last we called it, or if the
        theano functions are unavailable for any other reason (first time they
        are needed after construction or deserialization, etc.)

        All channels attached to the same dataset are compiled as part of the
        same theano function so that the theano optimizations can eliminate
        subexpressions that are shared between multiple channels.

        Side effects on `self`: clears `_dirty`, rebuilds the data specs,
        and (re)assigns `prereqs`, `begin_record_entry`, `num_examples` and
        `accum` (one compiled function per dataset). Attribute names that
        appear on `self` during this call are passed to
        `register_names_to_del`.

        Raises
        ------
        ValueError
            If a channel uses data and the iterator of its dataset reports
            0 examples (the channel value is divided by that count).
        TypeError
            If the dtype of a channel's shared variable differs from the
            dtype of the expression that updates it.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        init_names = dir(self)

        # Collect, per dataset, the de-duplicated prereqs of its channels.
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # begin_record_entry resets every channel accumulator to zero.
        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args, )
        assert len(nested_theano_args) == (len(self.channels) + 1)

        # The function mode does not change during this call, so look it up
        # once instead of relying on a loop-local binding further down.
        mode = self.theano_function_mode
        recording = mode is not None and hasattr(mode, 'record')

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if recording:
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [
            d.iterator(mode=i,
                       num_batches=n,
                       batch_size=b,
                       data_specs=self._flat_data_specs,
                       return_tuple=True)
            for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                        self._num_batches, self._batch_size)
        ]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]

        # One givens / updates dictionary per dataset.
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                # Guard the divisor that is actually used below. The previous
                # check tested `n`, a name that only existed because the list
                # comprehension above leaks its loop variable in Python 2
                # (and which held the last dataset's num_batches).
                if cur_num_examples == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                # Weight the batch value by its share of the examples so the
                # accumulated sum over all batches is the dataset average.
                val = (channel.val * T.cast(batch_size, config.floatX) /
                       cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if recording:
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if recording:
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(
                    function(theano_args,
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
            for a in self.accum:
                if recording:
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

        # Everything that became an attribute during this call is reported
        # to register_names_to_del.
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])
Ejemplo n.º 30
0
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.

        Side effects on `self`: clears `_dirty` and (re)assigns `prereqs`,
        `begin_record_entry`, `num_examples` and `accum` (one compiled
        function per dataset). Attribute names that appear on `self` during
        this call are passed to `register_names_to_del`.

        Raises
        ------
        ValueError
            If the iterator of a channel's dataset reports 0 examples (the
            channel value is divided by that count).
        TypeError
            If the dtype of a channel's shared variable differs from the
            dtype of the expression that updates it.
        """
        self._dirty = False

        init_names = dir(self)

        # Collect, per dataset, the de-duplicated prereqs of its channels.
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # begin_record_entry resets every channel accumulator to zero.
        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')

        # Get the appropriate kind of theano variable to represent the data the
        # model acts on
        X = self.model.get_input_space().make_theano_batch(name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(
                name="monitoring_Y")

        # The function mode does not change during this call, so look it up
        # once instead of relying on a loop-local binding further down.
        mode = self.theano_function_mode
        recording = mode is not None and hasattr(mode, 'record')

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if recording:
                mode.record.handle_line(
                    'compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo)
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                          self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]

        # One givens / updates dictionary per dataset.
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            # Substitute the shared monitoring batch variables for the
            # channel's own graph inputs.
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            # Guard the divisor that is actually used below. The previous
            # check tested `n`, a name that only existed because the list
            # comprehension above leaks its loop variable in Python 2 (and
            # which held the last dataset's num_batches).
            if cur_num_examples == 0:
                raise ValueError(
                    "Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            # Weight the batch value by its share of the examples so the
            # accumulated sum over all batches is the dataset average.
            val = channel.val * T.cast(X.shape[batch_index],
                                       config.floatX) / cur_num_examples
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable '
                                        + key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression with type ' +
                                        up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if recording:
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if recording:
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they might
                    # just monitor the model parameters, or some shared
                    # variable updated by the training algorithm, so we need
                    # to ignore the unused input error
                    self.accum.append(
                        function([X, Y],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
                else:
                    if recording:
                        mode.record.handle_line(
                            'compiling unsupervised accum\n')
                    self.accum.append(
                        function([X],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
            for a in self.accum:
                if recording:
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

        # Everything that became an attribute during this call is reported
        # to register_names_to_del.
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])
Ejemplo n.º 31
0
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):
        """
        Compile the Theano functions used to minimize `objective` by batch
        gradient descent. Every argument is also stored on the instance
        under its own name.

        objective: a theano expression to be minimized
                   should be a function of params and,
                   if provided, inputs
        params: A list of theano shared variables.
                These are the optimization variables
        inputs: (Optional) A list of theano variables
                to serve as inputs to the graph.
        param_constrainers: (Optional) A list of callables
                to be called on all updates dictionaries to
                be applied to params. This is how you implement
                constrained optimization.
        max_iter: stored on the instance; not otherwise used in the
                constructor.
        lr_scalers: (Optional) a mapping from params to a factor that
                multiplies the step size alpha for that param.
        verbose: if true, print progress messages while the gradient and
                objective functions are compiled.
        tol: stored as self.tol. If None, self.tol is 1e-6 when
                objective.dtype is "float32" and 3e-7 otherwise.
        init_alpha: sequence of initial step sizes, stored as a tuple of
                floats. If None, defaults to (.001, .005, .01, .05, .1)
                when line_search_mode is None and to (.5, 1.) when
                line_search_mode is 'exhaustive'.
        min_init_alpha: stored on the instance; not otherwise used in the
                constructor.
        reset_alpha: If True, reverts to using init_alpha after
                each call. If False, the final set of alphas
                is used at the start of the next call to minimize.
        conjugate: If True, tries to pick conjugate gradient directions.
                For the directions to be truly conjugate, you must use
                line_search_mode = 'exhaustive' and the objective function
                must be quadratic.
                Using line_search_mode = 'exhaustive' on a non-quadratic
                objective function implements nonlinear conjugate gradient
                descent.
        reset_conjugate:
                has no effect unless conjugate == True
                if reset_conjugate == True,
                    reverts to direction of steepest descent for the first
                    step in each call to minimize.
                otherwise, tries to make the new search direction
                conjugate to the last one (even though the objective
                function might be totally different on each call to
                minimize)
        gradients: if None, compute the gradients of obj using T.grad
                otherwise, a dictionary mapping from params to expressions
                for their gradients (this allows you to use approximate
                gradients computed with something other than T.grad)
        gradient_updates: a dictionary of shared variable updates to run
                each time the gradient is computed
        line_search_mode: None or 'exhaustive' (anything else fails an
                assertion).
        accumulate: if true, the gradient and objective functions are
                built with Accumulator instead of a plain theano function.
        theano_function_mode: passed as `mode` to every theano function
                compiled here (not to the Accumulator instances).

        Calling the ``minimize'' method with values for
        ``inputs'' will update ``params'' to minimize
        ``objective''.
        """

        # At this point locals() holds only the constructor arguments and
        # self, so this stores every argument as an attribute of the same
        # name; the spurious `self.self` entry is removed right after.
        self.__dict__.update(locals())
        del self.self

        # Choose the default step sizes according to the line search mode.
        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        # Only the local names are defaulted here; self.inputs and
        # self.param_constrainers keep the value that was passed in.
        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        # For each param, create a shared variable of the same shape that
        # the compiled gradient function fills with the gradient.
        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ', t2 - t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        # _cache_values copies each param into a cache; _goto_alpha then
        # sets each param to cache - alpha * (optional lr scaler) * gradient.
        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        # Each constrainer receives the updates dictionary before it is
        # compiled into _goto_alpha.
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        # Euclidean norm of all stored gradients taken together;
        # _normalize_grad divides every stored gradient by it and returns it.
        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        # ave_grad_size is a weighted average of the gradient norm; with the
        # initial new_weight of 1 it is simply the latest norm.
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
            1. - self.new_weight) * self.ave_grad_size

        self._normalize_grad = function(
            [],
            norm,
            updates=normalize_grad_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            # One extra shared variable per gradient, holding the gradient
            # saved by _store_old_grad.
            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                                 'old_' + elem.name)

            # Stores the current gradient multiplied by `norm` (the stored
            # gradients are divided by the norm in _normalize_grad).
            self._store_old_grad = function(
                [norm],
                updates=OrderedDict([(grad_to_old_grad[g], g * norm)
                                     for g in grad_to_old_grad]),
                mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                # Sum of elementwise products over paired lists of tensors.
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            # The 1e-7 in the denominator keeps the division defined when
            # the old gradient is all zeros.
            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)
            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            # NOTE(review): `grad` is the callable used above as
            # grad(objective, param), not a gradient variable, so this
            # assertion looks vacuous -- confirm what was meant to be checked.
            assert grad not in grad_to_old_grad

            # New search direction: current gradient plus beta times the
            # stored old gradient.
            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g])
                                      for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function(
                [],
                updates=make_conjugate_updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')

        # Default tolerance depends on the dtype of the objective.
        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        # Shared scalars initialised to zero; nothing in the constructor
        # updates them.
        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Ejemplo n.º 32
0
    def __init__(self, objective, params, inputs = None,
            param_constrainers = None, max_iter = -1,
            lr_scalers = None, verbose = 0, tol = None,
            init_alpha = None, min_init_alpha = 1e-3,
            reset_alpha = True, conjugate = False,
            reset_conjugate = True, gradients = None,
            gradient_updates = None, line_search_mode = None,
            accumulate = False, theano_function_mode=None):
        """
        objective: a theano expression to be minimized
                       should be a function of params and,
                       if provided, inputs
            params: A list of theano shared variables.
                    These are the optimization variables
            inputs: (Optional) A list of theano variables
                    to serve as inputs to the graph.
            param_constrainers: (Optional) A list of callables
                    to be called on all updates dictionaries to
                    be applied to params. This is how you implement
                    constrained optimization.
            reset_alpha: If True, reverts to using init_alpha after
                        each call. If False, the final set of alphas
                        is used at the start of the next call to minimize.
            conjugate: If True, tries to pick conjugate gradient directions.
                       For the directions to be truly conjugate, you must use
                       line_search_mode = 'exhaustive' and the objective function
                       must be quadratic.
                       Using line_search_mode = 'exhaustive' on a non-quadratic objective
                       function implements nonlinear conjugate gradient descent.
            reset_conjugate:
                    has no effect unless conjugate == True
                    if reset_conjugate == True,
                        reverts to direction of steepest descent for the first
                        step in each call to minimize.
                    otherwise, tries to make the new search direction
                    conjugate to the last one (even though the objective function
                    might be totally different on each call to minimize)
            gradients: if None, compute the gradients of obj using T.grad
                    otherwise, a dictionary mapping from params to expressions
                    for their gradients (this allows you to use approximate
                    gradients computed with something other than T.grad)
            gradient_updates: a dictionary of shared variable updates to run
                each time the gradient is computed

            Calling the ``minimize'' method with values for
            for ``inputs'' will update ``params'' to minimize
            ``objective''.
        """

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha  = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [ param for param in params ]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX( param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates = updates)
        else:
            self._compute_grad = function(inputs, updates = updates,
                    mode=self.theano_function_mode,
                    name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ',t2-t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                    name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name = 'alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha], updates=goto_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

            self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm)
                for g in grad_to_old_grad]), mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function([], updates=make_conjugate_updates,
                    mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')


        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Ejemplo n.º 33
0
    def setup_impl(self, model, dataset, algorithm):
        cost = algorithm.cost

        root = model.get_param_vector()

        dim = root.size

        rng = self.rng


        points = rng.randn(self.num_points, self.num_basis_vectors)
        points = points.astype(root.dtype)
        points *= self.scale

        if self.include_root:
            points[0, :] = 0.

        if not hasattr(self, 'cost_fn'):
            # Cargo cult all the Pascal bullshit needed to evaluate the f*****g cost function now
            # =======================================
            data_specs = cost.get_data_specs(model)
            mapping = DataSpecsMapping(data_specs)
            space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
            source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

            # Build a flat tuple of Theano Variables, one for each space.
            # We want that so that if the same space/source is specified
            # more than once in data_specs, only one Theano Variable
            # is generated for it, and the corresponding value is passed
            # only once to the compiled Theano function.
            theano_args = []
            for space, source in safe_zip(space_tuple, source_tuple):
                name = '%s[%s]' % (self.__class__.__name__, source)
                arg = space.make_theano_batch(name=name,
                                              batch_size=self.batch_size)
                theano_args.append(arg)
            theano_args = tuple(theano_args)

            # Methods of `cost` need args to be passed in a format compatible
            # with data_specs
            nested_args = mapping.nest(theano_args)
            fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
            self.on_load_batch = fixed_var_descr.on_load_batch

            cost_value = cost.expr(model, nested_args,
                                        ** fixed_var_descr.fixed_vars)
            # End cargo culting
            # ======================

            print "Compiling cost function..."
            cost_fn = function(theano_args, cost_value)
            self.cost_fn = cost_fn
        else:
            cost_fn = self.cost_fn

        cost_values = np.zeros(self.num_points)


        data = list(dataset.get_batch_design(self.batch_size,
            include_labels=True))
        from pylearn2.utils.one_hot import one_hot
        data[1] = one_hot(data[1])


        if self.method == 'gaussian':
            basis = rng.normal(dim, self.num_basis_vectors).astype(root.dtype)
        elif self.method == 'element':
            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                basis[rng.randint(dim), i] = 1.
        elif self.method == 'gradient':
            if not hasattr(self, 'grad_fn'):
                self.grad_fn = function(theano_args, grad(cost_value, model.get_params()))
            grad_fn = self.grad_fn

            basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
            for i in xrange(self.num_basis_vectors):
                ipt = list(dataset.get_batch_design(1, include_labels=True))
                label = ipt[1]
                assert label.size == 1
                label = label[0]
                one_hot = np.zeros((1, 10,),dtype='float32')
                one_hot[0, label] = 1
                ipt[1] = one_hot
                g = grad_fn(*ipt)
                basis[:,i] = np.concatenate([e.reshape(e.size) for e in g], axis=0)
        else:
            assert False

        basis /= np.sqrt(np.square(basis).sum(axis=0))

        # Orthogonalize basis
        for i in xrange(self.num_basis_vectors):
            v = basis[:,i ].copy()
            for j in xrange(i - 1):
                u = basis[:, j].copy()
                v -= np.dot(u, v) * u
            norm = np.sqrt(np.square(v).sum())
            assert norm > 1e-4
            v /= norm
            basis[:,i] = v


        for i in xrange(self.num_points):
            print "Evaluating cost at point ", i

            point = points[i, :]
            full_point = root + np.dot(basis, point)
            model.set_param_vector(full_point)

            cost_values[i] = cost_fn(*data)
            print cost_values[i]


        from pylearn2.utils import sharedX
        import theano.tensor as T

        print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!"

        if not hasattr(self, 'fit_quad'):
            points = sharedX(points)
            #from theano import config
            #config.compute_test_value = 'raise'
            cost_values = sharedX(cost_values)
            A = sharedX(np.zeros((self.num_basis_vectors, self.num_basis_vectors)))
            if self.psd:
                mat = T.dot(A.T, A)
            else:
                mat = A
            b = sharedX(np.zeros(self.num_basis_vectors))
            c = sharedX(0.)
            half_quad = T.dot(points, mat)
            quad = (points * half_quad).sum(axis=1)
            lin = T.dot(points, b)
            pred = quad + lin + c

            from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

            mse = T.square(pred - cost_values).mean()
            mae = abs(pred - cost_values).mean()

            obj = locals()[self.fitting_cost]

            fit_quad = BatchGradientDescent(obj, params = [A, b, c],
                    max_iter = self.num_basis_vectors ** 2,
                    verbose = 3, tol = None,
                    init_alpha = None, min_init_alpha = 1e-7,
                    reset_alpha = False, conjugate = True,
                    reset_conjugate = False,
                    line_search_mode = 'exhaustive')
            self.fit_quad = fit_quad
            self.A = A
            self.b = b
            self.c = c
            self.points = points
            self.cost_values = cost_values
        else:
            self.A.set_value(.001 * np.identity(self.A.get_value().shape[0], dtype=self.A.dtype))
            self.b.set_value(self.b.get_value() * 0.)
            self.c.set_value(self.c.get_value() * 0.)
            self.points.set_value(points)
            self.cost_values.set_value(cost_values.astype(self.cost_values.dtype))

        self.fit_quad.minimize()

        print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!"

        if self.use_solver:
            if self.psd:
                Av = self.A.get_value()
                mat_v = np.dot(Av.T, Av)
            else:
                mat_v = self.A.get_value()
            bv = self.b.get_value()

            # minimize for x^T A x + b^T x + c
            # -> solve 2 A x + b = 0
            # Ax = - b / 2

            print "********** mat_v", mat_v.min(), mat_v.max()
            x, ignored_residuals, ignored_rank, ignored_singular_values = np.linalg.lstsq(mat_v, - 0.5 * bv)
            print "********** soln: ", x.min(), x.mean(), x.max()
            print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max()
            assert x.ndim == 1, x.shape
            prod = np.dot(basis, x)
            norm = np.sqrt(np.square(prod).sum())
            print "*************** Moving params by ",norm
            vector = root + prod
            model.set_param_vector(vector)

        else: # use minimizer
            if not hasattr(self, 'fit_params'):
                self.vector = sharedX(points.get_value().mean(axis=0))
                vector = self.vector
                obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

                def constrain(d):
                    assert vector in d
                    n = d[vector]
                    norm = T.sqrt(T.square(n).sum())
                    desired_norm = T.clip(norm, 0., self.max_jump_norm)
                    d[vector] = n * desired_norm / norm

                self.fit_params = BatchGradientDescent(obj, params=[vector],
                    max_iter = self.num_basis_vectors,
                    verbose = 3, tol=None,
                    param_constrainers = [constrain],
                    init_alpha = None, min_init_alpha = 1e-3,
                    reset_alpha=False, conjugate=True, reset_conjugate=False,
                    line_search_mode='exhaustive')
            else:
                self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype))

            self.fit_params.minimize()

            model.set_param_vector(root + np.dot(basis , self.vector.get_value()))
Ejemplo n.º 34
0
 def get_fixed_var_descr(self, model, X, Y=None):
     """
     Return a FixedVarDescr that exposes ``sup_counter`` under the key
     'sup_aux_var' and whose single on-load-batch function increments
     ``sup_counter`` by one.
     """
     descr = FixedVarDescr()
     descr.fixed_vars = dict(sup_aux_var=sup_counter)
     # Compiled with X and Y as inputs; its only effect is the counter update.
     increment = [(sup_counter, sup_counter + 1)]
     descr.on_load_batch = [function([X, Y], updates=increment)]
     return descr
Ejemplo n.º 35
0
from galatea.maxout import GCN_C01B2
from pylearn2.space import Conv2DSpace
from pylearn2.space import VectorSpace
from pylearn2.utils import function

# Contrast-normalization layer applied to 32x32 three-channel batches laid
# out with axes ('c', 0, 1, 'b').
layer = GCN_C01B2(layer_name='unused')
space = Conv2DSpace(shape=[32, 32], num_channels=3, axes=('c', 0, 1, 'b'))
layer.set_input_space(space)

# Symbolic batch shared by the two compiled functions below.
X = space.make_batch_theano()

# gcn: runs the layer's forward pass on a batch.
gcn = function([X], layer.fprop(X))

# flatten: reformats a batch from `space` into a 32*32*3 vector space.
vector_space = VectorSpace(32 * 32 * 3)
flatten = function([X], space.format_as(X, vector_space))

# Zero-initialised float32 buffers: one entry per feature for `mean`, one
# per feature pair for `cov`.
mean = np.zeros((32 * 32 * 3,), dtype='float32')
cov = np.zeros((32 * 32 * 3, 32 * 32 * 3), dtype='float32')

dataset.X = dataset.X.astype('float32')

r_ofs = c_ofs = 8

Ejemplo n.º 36
0
def get_obj_func(model):
    """
    Compile and return a function of two batches (one made from the model's
    input space, one from its output space) that evaluates
    ``cost(model, X, Y)``.
    """
    input_batch = model.get_input_space().make_batch_theano()
    target_batch = model.get_output_space().make_batch_theano()
    return function([input_batch, target_batch],
                    cost(model, input_batch, target_batch))
Ejemplo n.º 37
0
    def get_fixed_var_descr(self, model, data):
        """
        Create the shared dropout masks used as fixed variables and the
        function that refreshes them.

        Parameters
        ----------
        model : object
            Must expose ``batch_size``, ``get_input_space()``,
            ``inference_procedure`` and ``hidden_layers``.
        data : tuple
            ``(X, Y)`` pair; ``Y`` must not be None.

        Returns
        -------
        descr : FixedVarDescr
            ``fixed_vars`` maps "drop_mask" (and "drop_mask_Y" when
            ``self.supervised`` is true) to shared variables;
            ``on_load_batch`` holds a single compiled function of
            ``[X, Y]`` that applies every mask update.
        """
        features, targets = data
        assert targets is not None

        num_examples = model.batch_size

        mask_X = sharedX(model.get_input_space().get_origin_batch(num_examples))
        mask_X.name = "drop_mask"

        input_space = model.get_input_space()

        mask_updates = OrderedDict()
        descr = FixedVarDescr()
        fn_inputs = [features, targets]

        if self.supervised:
            # A second mask is kept for the targets.
            mask_Y = sharedX(np.ones(num_examples))
            mask_Y.name = "drop_mask_Y"
            new_mask_X, new_mask_Y = self.mask_gen(features, targets, input_space)
            mask_updates[mask_Y] = new_mask_Y
            descr.fixed_vars["drop_mask_Y"] = mask_Y
        else:
            new_mask_X = self.mask_gen(features, X_space=input_space)

        if self.mask_gen.sync_channels:
            # The generated mask has one dimension fewer than the stored
            # mask; add a trailing broadcastable axis and expand it to the
            # shape of X by adding zeros.
            assert new_mask_X.ndim == mask_X.ndim - 1
            new_mask_X.name = "raw_update_X"
            blank = T.zeros_like(features)
            blank.name = "zeros_like_X"
            new_mask_X = blank + new_mask_X.dimshuffle(0, 1, 2, "x")
            new_mask_X.name = "update_X"
        mask_updates[mask_X] = new_mask_X

        descr.fixed_vars["drop_mask"] = mask_X

        if hasattr(model.inference_procedure, "V_dropout"):
            procedure = model.inference_procedure
            keep_hidden = procedure.include_prob
            keep_visible = procedure.include_prob_V
            keep_target = procedure.include_prob_Y

            theano_rng = make_theano_rng(None, 2012 + 10 + 20, which_method="binomial")

            def resample(var, keep_prob):
                # Binary draw with the variable's shape and dtype, divided by
                # the keep probability.
                draw = theano_rng.binomial(p=keep_prob, size=var.shape, dtype=var.dtype, n=1)
                return draw / keep_prob

            # The order of the resample() calls below matches the original
            # sequence of binomial draws: visible, then target, then hidden.
            for var in flatten([procedure.V_dropout]):
                mask_updates[var] = resample(var, keep_visible)

            if "Softmax" in str(type(model.hidden_layers[-1])):
                # The last H_dropout entry uses the target keep probability.
                hidden_masks = procedure.H_dropout[:-1]
                target_mask = procedure.H_dropout[-1]
                mask_updates[target_mask] = resample(target_mask, keep_target)
            else:
                hidden_masks = procedure.H_dropout

            for var in flatten(hidden_masks):
                mask_updates[var] = resample(var, keep_hidden)

        descr.on_load_batch = [utils.function(fn_inputs, updates=mask_updates)]

        return descr
Ejemplo n.º 38
0
    def __init__(self, objective, params, inputs=None,
                 param_constrainers=None, max_iter=-1,
                 lr_scalers=None, verbose=0, tol=None,
                 init_alpha=None, min_init_alpha=1e-3,
                 reset_alpha=True, conjugate=False,
                 reset_conjugate=True, gradients=None,
                 gradient_updates=None, line_search_mode=None,
                 accumulate=False, theano_function_mode=None):
        """
        Compile the Theano functions used for batch gradient descent on
        `objective` with respect to `params`.

        Every constructor argument is also stored on the instance under its
        own name (see the ``locals()`` snapshot below).

        Parameters
        ----------
        objective : symbolic expression
            The expression to minimize. Its gradient is taken with `grad`
            and its ``dtype`` selects the default `tol`.
        params : iterable of shared variables
            The variables to optimize. Each must support ``get_value()``
            and have a ``name`` attribute (which may be None).
        inputs : list, optional
            Inputs of the compiled gradient and objective functions.
            None is treated as an empty list.
        param_constrainers : list or tuple of callables, optional
            Each is called once with the dictionary of parameter updates
            used by ``_goto_alpha`` and may modify it in place.
        max_iter, min_init_alpha, reset_alpha, reset_conjugate
            Not used in this constructor beyond being stored on `self`.
        lr_scalers : dict, optional
            Maps a parameter to a factor multiplying the step size `alpha`
            for that parameter.
        verbose : int
            If non-zero, progress messages are logged while compiling.
        tol : float, optional
            Stored as ``self.tol``. If None, 1e-6 is used when
            ``objective.dtype`` is "float32" and 3e-7 otherwise.
        init_alpha : sequence of numbers, optional
            Stored as a tuple of floats in ``self.init_alpha``. If None,
            defaults to (.001, .005, .01, .05, .1) when `line_search_mode`
            is None and to (.5, 1.) otherwise.
        conjugate : bool
            If true, the ``_store_old_grad`` and ``_make_conjugate``
            functions are compiled as well.
        gradients : dict, optional
            Maps params to expressions for their gradients. Params missing
            from it (or all params, if None) use ``grad(objective, param)``.
        gradient_updates : dict, optional
            Extra shared-variable updates included in the gradient function.
        line_search_mode : str, optional
            Must be None or 'exhaustive'.
        accumulate : bool
            If true, the gradient and objective functions are built with
            `Accumulator` instead of `function`.
        theano_function_mode : optional
            Passed as ``mode`` to every call to `function`.
        """

        # Store every argument as an attribute of the same name; at this
        # point locals() holds only `self` and the arguments.
        self.__dict__.update(locals())
        del self.self

        # Choose default candidate step sizes according to the line search.
        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        # Only the local names are replaced here; self.inputs and
        # self.param_constrainers keep the values that were passed in.
        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        # For each parameter, create a shared variable of the same shape
        # that the gradient function fills with the gradient.
        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            logger.info('batch gradient class compiling gradient function')
        t1 = time.time()
        # Note that theano_function_mode is not forwarded to Accumulator.
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.info('done. Took {0}'.format(t2-t1))

        if self.verbose:
            logger.info('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            logger.info('done')

        # _cache_values copies every parameter into its cache variable;
        # _goto_alpha then sets each parameter to
        # cache - alpha * lr_scaler * stored gradient.
        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        # Constrainers may rewrite the parameter updates before compilation.
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function(
            [alpha],
            updates=goto_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._goto_alpha')

        # Euclidean norm of all stored gradients taken together.
        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in
                           self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        # _normalize_grad divides every stored gradient by that norm and
        # returns the norm.
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        # ave_grad_size is a weighted average of its old value and the new
        # norm, with weight new_weight on the new norm.
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = \
            self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = \
            function([],
                     norm,
                     updates=normalize_grad_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            # One "old gradient" shared variable per stored gradient.
            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = \
                    sharedX(elem.get_value(), 'old_'+elem.name)

            # Takes a value for `norm` as input and stores g * norm in each
            # old-gradient variable.
            # NOTE(review): presumably this undoes the division performed by
            # _normalize_grad -- confirm against the caller.
            self._store_old_grad = \
                function([norm],
                         updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                             for g_ in grad_to_old_grad]),
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            def dot_product(x, y):
                # Inner product of two equally long lists of tensors.
                return sum([(x_elem * y_elem).sum()
                           for x_elem, y_elem in safe_zip(x, y)])

            # (g.g - g.g_old) / (g_old.g_old); the 1e-7 keeps the
            # denominator away from zero.
            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            # beta_pr is the Polak-Ribiere formula for beta.
            # According to wikipedia, the beta to use for NCG is "a matter of
            # heuristics or taste" but max(0, beta_pr) is "a popular choice...
            # which provides direction reset automatically." (ie, it is meant
            # to revert to steepest descent when you have traveled far enough
            # that the objective function is behaving non-quadratically enough
            # that the conjugate gradient formulas aren't working anymore)

            # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            # NOTE(review): `grad` is the name called above as
            # grad(objective, param), so this membership test looks
            # vacuous -- confirm what it was meant to check.
            assert grad not in grad_to_old_grad

            # New direction: current gradient plus beta times the old one.
            make_conjugate_updates = \
                [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

            # If the mode has a `record` attribute, log a description of
            # every variable and update expression.
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate var '
                        + var_descriptor(v) + '\n')
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate update '
                        + var_descriptor(u) + '\n')

            self._make_conjugate = \
                function([], updates=make_conjugate_updates,
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate output '
                        + var_descriptor(output) + '\n')

        # Default tolerance depends on the dtype of the objective.
        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        # Shared scalars initialised to zero; they are not updated in this
        # constructor.
        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Ejemplo n.º 39
0
    def run(self):
        mm = self.monitor        

        updates = OrderedDict()
        for channel in mm.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)        
        mm.begin_record_entry = function(inputs=[], updates=updates, mode=mm.theano_function_mode,
                    name = 'Monitor.begin_record_entry')


        updates = OrderedDict()
        givens = OrderedDict()
        theano_args = mm._flat_data_specs[0].make_theano_batch(
                ['monitoring_%s' % s for s in mm._flat_data_specs[1]])

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = mm._flat_data_specs[0].batch_size(theano_args)
        nested_theano_args = mm._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args,)        
        
        assert len(nested_theano_args) == (len(mm.channels) + 1)

        for key in sorted(mm.channels.keys()):
            mode = mm.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including channel '+key+'\n')
            #log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                         data_specs=mm._flat_data_specs,
                         return_tuple=True) \
              for d, i, n, b in safe_izip(mm._datasets, mm._iteration_mode,
                                    mm._num_batches, mm._batch_size)]
        mm.num_examples = [np.cast[config.floatX](float(i.num_examples)) for i in it]


        givens = [OrderedDict() for d in mm._datasets]
        updates = [OrderedDict() for d in mm._datasets]

        #for i, channel in enumerate(mm.channels.values()):
        for i, dw_name in enumerate(mm.channels.keys()):            
            if dw_name in self.p_channel:
                channel = mm.channels[dw_name]
                
                index = mm._datasets.index(channel.dataset)
                d = mm._datasets[index]
                g = givens[index]
                cur_num_examples = mm.num_examples[index]
                u = updates[index]

                # Flatten channel.graph_input and the appropriate part of
                # nested_theano_args, to iterate jointly over them.
                c_mapping = DataSpecsMapping(channel.data_specs)
                channel_inputs = c_mapping.flatten(channel.graph_input,
                                                   return_tuple=True)                
                inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                           return_tuple=True)

                for (channel_X, X) in safe_izip(channel_inputs, inputs):
                    assert channel_X not in g or g[channel_X] is X
                    #print channel_X.type , X.type
                    assert channel_X.type == X.type
                    g[channel_X] = X

                if batch_size == 0:
                    # No channel does need any data, so there is not need to
                    # average results, and we will call the accum functions only
                    # once.
                    # TODO: better handling of channels not needing data when
                    # some other channels need data.
                    assert len(mm._flat_data_specs[1]) == 0
                    val = channel.val
                else:
                    if n == 0:
                        raise ValueError("Iterating over 0 examples results in divide by 0")
                    val = (channel.val * T.cast(batch_size, config.floatX)
                            / cur_num_examples)
                u[channel.val_shared] = channel.val_shared + val
            
        mm.accum = []
        for idx, packed in enumerate(safe_izip(givens, updates)):
            g, u = packed
            mode = mm.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for elem in g:
                    mode.record.handle_line('g key '+var_descriptor(elem)+'\n')
                    mode.record.handle_line('g val '+var_descriptor(g[elem])+'\n')
                for elem in u:
                    mode.record.handle_line('u key '+var_descriptor(elem)+'\n')
                    mode.record.handle_line('u val '+var_descriptor(u[elem])+'\n')
            function_name = 'Monitor.accum[%d]' % idx
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling supervised accum\n')
            # Some channels may not depend on the data, ie, they might just monitor the model
            # parameters, or some shared variable updated by the training algorithm, so we
            # need to ignore the unused input error
            mm.accum.append(function(theano_args,
                                       givens=g,
                                       updates=u,
                                       mode=mm.theano_function_mode,
                                       name=function_name))
        for a in mm.accum:
            if mode is not None and hasattr(mode, 'record'):
                for elem in a.maker.fgraph.outputs:
                    mode.record.handle_line('accum output '+var_descriptor(elem)+'\n')
            #log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
            datasets = mm._datasets
            
        # Set all channels' val_shared to 0
        mm.begin_record_entry()
        
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               mm._iteration_mode,
                                               mm._batch_size,
                                               mm._num_batches,
                                               mm.accum,
                                               mm._rng_seed,
                                               mm.num_examples):
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=mm._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If mm._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(mm._flat_data_specs[1]) == 0:
                X = ()
                mm.run_prereqs(X, d)
                a(*X)

            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    mm.run_prereqs(X, d)
                    a(*X)
                    actual_ne += mm._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                            "it had " + str(ne) + " examples total, but at "
                            "runtime it gave us " + str(actual_ne) + ".")
        # end for d
        t = time.time() - mm.t0
        if self.p_save != None:
            b= open(self.p_save,'a')
            b.write("\tEpochs seen: %d\n" % mm._epochs_seen)
	    b.write("\tTime Elapse: %s\n" % str(datetime.timedelta(seconds=t)))
        print("Monitoring step:")
        print("\tEpochs seen: %d" % mm._epochs_seen)
        print("\tBatches seen: %d" % mm._num_batches_seen)
        print("\tTime Elapse: %s" % str(datetime.timedelta(seconds=t)))
        #print("\tExamples seen: %d" % mm._examples_seen)
        #print mm.channels
        for channel_name in self.p_channel:                
            if channel_name in mm.channels:
                channel = mm.channels[channel_name]
                channel.time_record.append(t)
                channel.batch_record.append(mm._num_batches_seen)
                channel.example_record.append(mm._examples_seen)
                channel.epoch_record.append(mm._epochs_seen)
                val = channel.val_shared.get_value()
                # naive hack: ...
                #channel.val_shared.set_value(0)
                channel.val_record.append(val)
                if abs(val) < 1e4:
                    val_str = str(val)
                else:
                    val_str = '%.3e' % val
                print "\t%s: %s" % (channel_name, val_str)
                if self.p_save!=None:
                    b.write("\t%s: %s\n" % (channel_name, val_str)) 
        # clean up        
        if self.p_save!=None:
            b.close() 
        mm._epochs_seen += 1
Ejemplo n.º 40
0
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        One accumulation function is compiled per dataset, and every
        channel attached to that dataset contributes an update to it, so
        that the theano optimizations can eliminate subexpressions that
        are shared between multiple channels.

        Attributes written: `_dirty`, `prereqs`, `begin_record_entry`,
        `num_examples` and `accum`. Any attribute name that did not exist
        on entry is reported through `register_names_to_del`.

        Raises
        ------
        ValueError
            If a channel that needs data is attached to a dataset whose
            iterator reports zero batches or zero examples (averaging
            over it would divide by zero).
        TypeError
            If the dtype of a channel's shared accumulator differs from
            the dtype of the expression that updates it.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        # Snapshot of the attribute names present before compilation, used
        # at the very end to find the names created by this method.
        init_names = set(dir(self))

        # The function mode does not change during this call, so it is read
        # once instead of inside every loop.
        mode = self.theano_function_mode
        has_record = mode is not None and hasattr(mode, 'record')

        # Gather the prerequisites of every channel, grouped by dataset and
        # without duplicates (first-seen order is preserved).
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset_prereqs = self.prereqs.setdefault(channel.dataset, [])
                for prereq in channel.prereqs:
                    if prereq not in dataset_prereqs:
                        dataset_prereqs.append(prereq)

        # begin_record_entry resets every channel accumulator to zero.
        reset_updates = OrderedDict()
        for channel in self.channels.values():
            reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=reset_updates,
                mode=mode,
                name='Monitor.begin_record_entry'
            )

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args,)
        # Entry i + 1 of the nested args is paired with the i-th channel in
        # the loop below, hence one more entry than there are channels.
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if has_record:
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)

        # Iterators are built here only to learn how many examples each
        # dataset will yield.
        iterators = [ds.iterator(mode=it_mode, num_batches=n_batches,
                                 batch_size=b_size,
                                 data_specs=self._flat_data_specs,
                                 return_tuple=True)
                     for ds, it_mode, n_batches, b_size
                     in safe_izip(self._datasets, self._iteration_mode,
                                  self._num_batches, self._batch_size)]
        self.num_examples = [float(iterator.num_examples)
                             for iterator in iterators]

        # One (givens, updates) pair per dataset; channels fill them in.
        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            g = givens[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel needs any data, so there is no need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                # Look the counts up for *this channel's* dataset. The guard
                # must run before the division below, otherwise a
                # ZeroDivisionError would pre-empt it.
                cur_num_examples = self.num_examples[index]
                if self._num_batches[index] == 0 or cur_num_examples == 0:
                    raise ValueError("Iterating over 0 examples results in "
                                     "divide by 0")
                inv_cur_num_examples = as_floatX(1. / cur_num_examples)
                # Weight each batch by its share of the dataset so that the
                # accumulated value is the per-example average.
                val = (channel.val * T.cast(batch_size, config.floatX)
                       * inv_cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        # %-formatting keeps the message readable even when
                        # the shared variable has no name.
                        raise TypeError(
                            'Monitoring channel shared variable %s has '
                            'dtype %s but is driven by an expression with '
                            'type %s' % (key.name, key.dtype, up[key].dtype))

            self.accum = []
            for idx, (g, u) in enumerate(safe_izip(givens, updates)):
                function_name = 'Monitor.accum[%d]' % idx
                if has_record:
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so unused inputs must be
                # tolerated.
                # NOTE(review): nothing is passed here to request that;
                # presumably the `function` helper in scope handles it --
                # confirm.
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=mode,
                                           name=function_name))
            for a in self.accum:
                if has_record:
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))

        # Report every attribute created by this call.
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])
Ejemplo n.º 41
0
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):
        """
        Compile the Theano functions used for batch gradient descent.

        Every argument is first stored verbatim as an attribute of the same
        name. The constructor then builds one shared variable per parameter
        to hold its gradient and one to cache its value, and compiles the
        functions `_compute_grad`, `obj`, `_cache_values`, `_goto_alpha`
        and `_normalize_grad` (plus `_store_old_grad` and
        `_make_conjugate` when `conjugate` is true).

        Parameters
        ----------
        objective : Theano expression
            Expression that is differentiated with `grad` and compiled as
            `self.obj`. Its `dtype` selects the default `tol`.
        params : iterable of shared variables
            Each element must provide `get_value()` and a `name`
            attribute. Iterated several times, so it must not be a
            one-shot iterator.
        inputs : list, optional
            Inputs of the compiled gradient and objective functions.
            `None` is treated as an empty list.
        param_constrainers : list or tuple of callables, optional
            Each is called with the dictionary of parameter updates before
            `_goto_alpha` is compiled. `None` is treated as an empty list.
        max_iter : int, optional
            Only stored as an attribute by this method.
        lr_scalers : mapping, optional
            Maps a parameter to a factor multiplied into the step size
            `alpha` for that parameter.
        verbose : int or bool, optional
            When truthy, progress messages are logged during compilation.
        tol : float, optional
            Stored as `self.tol`. When `None`, 1e-6 is used if
            `objective.dtype` is "float32" and 3e-7 otherwise.
        init_alpha : iterable of numbers, optional
            Converted to a tuple of floats in `self.init_alpha`. When
            `None`, defaults to (.001, .005, .01, .05, .1) if
            `line_search_mode` is `None` and to (.5, 1.) otherwise.
        min_init_alpha : float, optional
            Only stored as an attribute by this method.
        reset_alpha : bool, optional
            Only stored as an attribute by this method.
        conjugate : bool, optional
            When true, the conjugate-gradient helper functions are
            compiled as well.
        reset_conjugate : bool, optional
            Only stored as an attribute by this method.
        gradients : mapping, optional
            Maps a parameter to a gradient expression to use instead of
            calling `grad(objective, param)`.
        gradient_updates : mapping, optional
            Extra updates merged into those of the gradient function.
        line_search_mode : str, optional
            Must be `None` or 'exhaustive' (enforced with an assertion).
        accumulate : bool, optional
            When true, `_compute_grad` and `obj` are built with
            `Accumulator` instead of `function`.
        theano_function_mode : optional
            Passed as `mode` to every `function` call made here.
        """

        # Store every argument as an attribute of the same name. This must
        # stay the first statement: any local bound before it would also
        # become an attribute.
        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        # Only the local names are defaulted below; self.inputs and
        # self.param_constrainers keep the value that was passed in
        # (possibly None).
        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        # For each parameter, create a zero-initialised shared variable of
        # the same shape, and make the gradient function write the symbolic
        # gradient into it.
        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            logger.info('batch gradient class compiling gradient function')
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.info('done. Took {0}'.format(t2 - t1))

        if self.verbose:
            logger.info('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            logger.info('done')

        # _cache_values copies every parameter into its cache variable;
        # _goto_alpha then sets each parameter to
        # cache - alpha * (optional lr scaler) * stored gradient.
        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        # The return value of each constrainer is ignored, so presumably
        # they are expected to modify goto_updates in place -- TODO confirm.
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        # Euclidean norm of all stored gradients taken together.
        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        # _normalize_grad divides every stored gradient by that norm and
        # returns the norm.
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        # ave_grad_size is updated as a blend of the new norm and its
        # previous value, weighted by new_weight (initially 1).
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = \
            self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = \
            function([],
                     norm,
                     updates=normalize_grad_updates,
                     mode=self.theano_function_mode,
                     name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            # One "old gradient" shared variable per stored gradient,
            # initialised with the gradient's current value.
            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = \
                    sharedX(elem.get_value(), 'old_'+elem.name)

            # Takes the `norm` expression itself as an input and stores
            # gradient * norm; presumably this undoes the normalisation
            # done by _normalize_grad -- TODO confirm against the caller.
            self._store_old_grad = \
                function([norm],
                         updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                             for g_ in grad_to_old_grad]),
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            def dot_product(x, y):
                # Sum of elementwise products over two parallel lists of
                # tensors.
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            # The 1e-7 term presumably keeps the denominator non-zero.
            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            # beta_pr is the Polak-Ribiere formula for beta.
            # According to wikipedia, the beta to use for NCG is "a matter of
            # heuristics or taste" but max(0, beta_pr) is "a popular choice...
            # which provides direction reset automatically." (ie, it is meant
            # to revert to steepest descent when you have traveled far enough
            # that the objective function is behaving non-quadratically enough
            # that the conjugate gradient formulas aren't working anymore)

            # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            # NOTE(review): `grad` is the same name that is called as a
            # function earlier in this method, so this tests whether that
            # callable is a key of grad_to_old_grad, which looks always
            # true. A different name may have been intended -- confirm.
            assert grad not in grad_to_old_grad

            # New search direction: gradient + beta * old gradient.
            make_conjugate_updates = \
                [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate var ' +
                        var_descriptor(v) + '\n')
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate update ' +
                        var_descriptor(u) + '\n')

            self._make_conjugate = \
                function([], updates=make_conjugate_updates,
                         mode=self.theano_function_mode,
                         name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line(
                        'BatchGradientDescent._make_conjugate output ' +
                        var_descriptor(output) + '\n')

        # Default tolerance depends on the precision of the objective.
        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        # Shared scalars initialised to zero; they are not updated in this
        # method.
        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Ejemplo n.º 42
0
def get_obj_func(model):
    """
    Compile a function that evaluates `cost` for `model` on one batch.

    A symbolic batch is created from the model's input space and another
    from its output space. The compiled function takes those two batches,
    in that order, and returns the value of `cost(model, X, Y)`.
    """
    features = model.get_input_space().make_batch_theano()
    targets = model.get_output_space().make_batch_theano()
    return function([features, targets], cost(model, features, targets))