Beispiel #1
0
    def get_fixed_var_descr(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """

        assert Y is not None

        batch_size = model.batch_size

        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = 'drop_mask'

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs=[X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space = X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size,))
            drop_mask_Y.name = 'drop_mask_Y'
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            rval.fixed_vars['drop_mask_Y'] =  drop_mask_Y
        if self.mask_gen.sync_channels:
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = 'raw_update_X'
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = 'zeros_like_X'
            update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
            update_X.name = 'update_X'
        updates[drop_mask_X] = update_X

        rval.fixed_vars['drop_mask'] = drop_mask_X

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            theano_rng = MRG_RandomStreams(2012+11+20)
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob

        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval
Beispiel #2
0
    def get_fixed_var_descr(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """

        assert Y is not None

        batch_size = model.batch_size

        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = 'drop_mask'

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs=[X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space = X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size,))
            drop_mask_Y.name = 'drop_mask_Y'
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            rval.fixed_vars['drop_mask_Y'] =  drop_mask_Y
        if self.mask_gen.sync_channels:
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = 'raw_update_X'
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = 'zeros_like_X'
            update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
            update_X.name = 'update_X'
        updates[drop_mask_X] = update_X

        rval.fixed_vars['drop_mask'] = drop_mask_X

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            theano_rng = MRG_RandomStreams(2012+11+20)
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob

        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval
Beispiel #3
0
    def get_gradients(self, model, data, **kwargs):
        self.get_data_specs(model)[0].validate(data)
        obj, scratch = self.base_cost(model, data, return_locals=True, **kwargs)
        if self.supervised:
            assert isinstance(data, (list, tuple))
            assert len(data) == 2
            (X, Y) = data
        else:
            X, = data
        interm_grads = OrderedDict()


        H_hat = scratch['H_hat']
        terms = scratch['terms']
        hidden_layers = scratch['hidden_layers']

        grads = OrderedDict()

        assert len(H_hat) == len(terms)
        assert len(terms) == len(hidden_layers)
        num_layers = len(hidden_layers)
        for i in xrange(num_layers):
            state = H_hat[i]
            layer = model.hidden_layers[i]
            term = terms[i]

            if term == 0.:
                continue
            else:
                print 'term is ',term

            if i == 0:
                state_below = X
                layer_below = model.visible_layer
            else:
                layer_below = model.hidden_layers[i-1]
                state_below = H_hat[i-1]
            state_below = layer_below.upward_state(state_below)

            components = flatten(state)

            real_grads = T.grad(term, components)

            fake_state = layer.linear_feed_forward_approximation(state_below)

            fake_components = flatten(fake_state)
            real_grads = OrderedDict(safe_zip(fake_components, real_grads))

            params = list(layer.get_params())
            fake_grads = T.grad(cost=None, consider_constant=flatten(state_below),
                    wrt=params, known_grads = real_grads)

            for param, grad in safe_zip(params, fake_grads):
                if param in grads:
                    grads[param] = grads[param] + grad
                else:
                    grads[param] = grad

        return grads, OrderedDict()
Beispiel #4
0
    def get_gradients(self, model, X, Y=None, **kwargs):
        obj, scratch = self.base_cost(model,
                                      X,
                                      Y,
                                      return_locals=True,
                                      **kwargs)

        interm_grads = OrderedDict()

        H_hat = scratch['H_hat']
        terms = scratch['terms']
        hidden_layers = scratch['hidden_layers']

        grads = OrderedDict()

        assert len(H_hat) == len(terms)
        assert len(terms) == len(hidden_layers)
        num_layers = len(hidden_layers)
        for i in xrange(num_layers):
            state = H_hat[i]
            layer = model.hidden_layers[i]
            term = terms[i]

            if term == 0.:
                continue
            else:
                print 'term is ', term

            if i == 0:
                state_below = X
                layer_below = model.visible_layer
            else:
                layer_below = model.hidden_layers[i - 1]
                state_below = H_hat[i - 1]
            state_below = layer_below.upward_state(state_below)

            components = flatten(state)

            real_grads = T.grad(term, components)

            fake_state = layer.linear_feed_forward_approximation(state_below)

            fake_components = flatten(fake_state)
            real_grads = OrderedDict(safe_zip(fake_components, real_grads))

            params = list(layer.get_params())
            fake_grads = T.grad(cost=None,
                                consider_constant=flatten(state_below),
                                wrt=params,
                                known_grads=real_grads)

            for param, grad in safe_zip(params, fake_grads):
                if param in grads:
                    grads[param] = grads[param] + grad
                else:
                    grads[param] = grad

        return grads, OrderedDict()
Beispiel #5
0
 def nan_check(i, node, fn):
     inputs = fn.inputs
     # TODO: figure out why individual inputs are themselves lists sometimes
     for x in flatten(inputs):
         do_check_on(x, node, fn, True)
     fn()
     outputs = fn.outputs
     for j, x in enumerate(flatten(outputs)):
         do_check_on(x, node, fn, False)
Beispiel #6
0
    def get_monitoring_channels(self, data):
        """
        .. todo::

            WRITEME
        """
        space, source = self.get_monitoring_data_specs()
        space.validate(data)
        X = data
        history = self.mf(X, return_history=True)
        q = history[-1]

        rval = OrderedDict()

        ch = self.visible_layer.get_monitoring_channels()
        for key in ch:
            rval['vis_' + key] = ch[key]

        for state, layer in safe_zip(q, self.hidden_layers):
            ch = layer.get_monitoring_channels()
            for key in ch:
                rval[layer.layer_name + '_' + key] = ch[key]
            ch = layer.get_monitoring_channels_from_state(state)
            for key in ch:
                rval['mf_' + layer.layer_name + '_' + key] = ch[key]

        if len(history) > 1:
            prev_q = history[-2]

            flat_q = flatten(q)
            flat_prev_q = flatten(prev_q)

            mx = None
            for new, old in safe_zip(flat_q, flat_prev_q):
                cur_mx = abs(new - old).max()
                if new is old:
                    logger.error('{0} is {1}'.format(new, old))
                    assert False
                if mx is None:
                    mx = cur_mx
                else:
                    mx = T.maximum(mx, cur_mx)

            rval['max_var_param_diff'] = mx

            for layer, new, old in safe_zip(self.hidden_layers,
                                            q, prev_q):
                sum_diff = 0.
                for sub_new, sub_old in safe_zip(flatten(new), flatten(old)):
                    sum_diff += abs(sub_new - sub_old).sum()
                denom = self.batch_size * \
                    layer.get_total_state_space().get_total_dimension()
                denom = np.cast[config.floatX](denom)
                rval['mean_'+layer.layer_name+'_var_param_diff'] = \
                    sum_diff / denom

        return rval
Beispiel #7
0
    def get_monitoring_channels(self, data):
        """
        .. todo::

            WRITEME
        """
        space, source = self.get_monitoring_data_specs()
        space.validate(data)
        X = data
        history = self.mf(X, return_history=True)
        q = history[-1]

        rval = OrderedDict()

        ch = self.visible_layer.get_monitoring_channels()
        for key in ch:
            rval['vis_' + key] = ch[key]

        for state, layer in safe_zip(q, self.hidden_layers):
            ch = layer.get_monitoring_channels()
            for key in ch:
                rval[layer.layer_name + '_' + key] = ch[key]
            ch = layer.get_monitoring_channels_from_state(state)
            for key in ch:
                rval['mf_' + layer.layer_name + '_' + key] = ch[key]

        if len(history) > 1:
            prev_q = history[-2]

            flat_q = flatten(q)
            flat_prev_q = flatten(prev_q)

            mx = None
            for new, old in safe_zip(flat_q, flat_prev_q):
                cur_mx = abs(new - old).max()
                if new is old:
                    logger.error('{0} is {1}'.format(new, old))
                    assert False
                if mx is None:
                    mx = cur_mx
                else:
                    mx = T.maximum(mx, cur_mx)

            rval['max_var_param_diff'] = mx

            for layer, new, old in safe_zip(self.hidden_layers,
                                            q, prev_q):
                sum_diff = 0.
                for sub_new, sub_old in safe_zip(flatten(new), flatten(old)):
                    sum_diff += abs(sub_new - sub_old).sum()
                denom = self.batch_size * \
                    layer.get_total_state_space().get_total_dimension()
                denom = np.cast[config.floatX](denom)
                rval['mean_'+layer.layer_name+'_var_param_diff'] = \
                    sum_diff / denom

        return rval
Beispiel #8
0
 def nan_check(i, node, fn):
     inputs = fn.inputs
     # TODO: figure out why individual inputs are themselves lists sometimes
     for x in flatten(inputs):
         do_check_on(x, node, fn, True)
     fn()
     outputs = fn.outputs
     for j, x in enumerate(flatten(outputs)):
         do_check_on(x, node, fn, False)
Beispiel #9
0
    def _get_standard_neg(self, model, layer_to_chains):
        params = list(model.get_params())

        warnings.warn("""TODO: reduce variance of negative phase by
                         integrating out the even-numbered layers. The
                         Rao-Blackwellize method can do this for you when
                         expected gradient = gradient of expectation, but
                         doing this in general is trickier.""")
        #layer_to_chains = model.rao_blackwellize(layer_to_chains)
        expected_energy_p = model.energy(
            layer_to_chains[model.visible_layer],
            [layer_to_chains[layer] for layer in model.hidden_layers]
        ).mean()

        samples = flatten(layer_to_chains.values())
        for i, sample in enumerate(samples):
            if sample.name is None:
                sample.name = 'sample_'+str(i)

        neg_phase_grads = OrderedDict(
            safe_zip(params, T.grad(-expected_energy_p, params,
                                    consider_constant=samples,
                                    disconnected_inputs='ignore'))
        )
        return neg_phase_grads
Beispiel #10
0
    def _get_standard_neg(self, model, layer_to_chains):
        """
        .. todo::

            WRITEME
        """
        params = list(model.get_params())

        warnings.warn("""TODO: reduce variance of negative phase by
                         integrating out the even-numbered layers. The
                         Rao-Blackwellize method can do this for you when
                         expected gradient = gradient of expectation, but
                         doing this in general is trickier.""")
        #layer_to_chains = model.rao_blackwellize(layer_to_chains)
        expected_energy_p = model.energy(
            layer_to_chains[model.visible_layer],
            [layer_to_chains[layer] for layer in model.hidden_layers]).mean()

        samples = flatten(layer_to_chains.values())
        for i, sample in enumerate(samples):
            if sample.name is None:
                sample.name = 'sample_' + str(i)

        neg_phase_grads = OrderedDict(
            safe_zip(
                params,
                T.grad(-expected_energy_p,
                       params,
                       consider_constant=samples,
                       disconnected_inputs='ignore')))
        return neg_phase_grads
Beispiel #11
0
        def nan_check(i, node, fn):
            """
            Runs `fn` while checking its inputs and outputs for NaNs / Infs

            Parameters
            ----------
            i : currently ignored (TODO: determine why it is here or remove)
            node : theano.gof.Apply
                The Apply node currently being executed
            fn : callable
                The thunk to execute for this Apply node
            """
            inputs = fn.inputs
            # TODO: figure out why individual inputs are themselves lists sometimes
            for x in flatten(inputs):
                do_check_on(x, node, fn, True)
            fn()
            outputs = fn.outputs
            for j, x in enumerate(flatten(outputs)):
                do_check_on(x, node, fn, False)
Beispiel #12
0
        def nan_check(i, node, fn):
            """
            Runs `fn` while checking its inputs and outputs for NaNs / Infs

            Parameters
            ----------
            i : currently ignored (TODO: determine why it is here or remove)
            node : theano.gof.Apply
                The Apply node currently being executed
            fn : callable
                The thunk to execute for this Apply node
            """
            inputs = fn.inputs
            # TODO: figure out why individual inputs are themselves lists sometimes
            for x in flatten(inputs):
                do_check_on(x, node, fn, True)
            fn()
            outputs = fn.outputs
            for j, x in enumerate(flatten(outputs)):
                do_check_on(x, node, fn, False)
Beispiel #13
0
    def _get_sampling_pos(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """
        layer_to_clamp = OrderedDict([(model.visible_layer, True)])
        layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])
        if self.supervised:
            # note: if the Y layer changes to something without linear energy,
            #       we'll need to make the expected energy clamp Y in the
            #       positive phase
            assert isinstance(model.hidden_layers[-1], dbm.Softmax)
            layer_to_clamp[model.hidden_layers[-1]] = True
            layer_to_pos_samples[model.hidden_layers[-1]] = Y
            hid = model.hidden_layers[:-1]
        else:
            assert Y is None
            hid = model.hidden_layers

        for layer in hid:
            mf_state = layer.init_mf_state()

            def recurse_zeros(x):
                if isinstance(x, tuple):
                    return tuple([recurse_zeros(e) for e in x])
                return x.zeros_like()

            layer_to_pos_samples[layer] = recurse_zeros(mf_state)

        layer_to_pos_samples = model.mcmc_steps(
            layer_to_state=layer_to_pos_samples,
            layer_to_clamp=layer_to_clamp,
            num_steps=self.num_gibbs_steps,
            theano_rng=self.theano_rng)

        q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

        pos_samples = flatten(q)

        # The gradients of the expected energy under q are easy, we can just
        # do that in theano
        expected_energy_q = model.energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(
            safe_zip(
                params,
                T.grad(expected_energy_q,
                       params,
                       consider_constant=pos_samples,
                       disconnected_inputs='ignore')))
        return gradients
Beispiel #14
0
    def _get_sampling_pos(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """
        layer_to_clamp = OrderedDict([(model.visible_layer, True)])
        layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])
        if self.supervised:
            # note: if the Y layer changes to something without linear energy,
            #       we'll need to make the expected energy clamp Y in the
            #       positive phase
            assert isinstance(model.hidden_layers[-1], Softmax)
            layer_to_clamp[model.hidden_layers[-1]] = True
            layer_to_pos_samples[model.hidden_layers[-1]] = Y
            hid = model.hidden_layers[:-1]
        else:
            assert Y is None
            hid = model.hidden_layers

        for layer in hid:
            mf_state = layer.init_mf_state()

            def recurse_zeros(x):
                if isinstance(x, tuple):
                    return tuple([recurse_zeros(e) for e in x])
                return x.zeros_like()

            layer_to_pos_samples[layer] = recurse_zeros(mf_state)

        layer_to_pos_samples = model.mcmc_steps(
            layer_to_state=layer_to_pos_samples,
            layer_to_clamp=layer_to_clamp,
            num_steps=self.num_gibbs_steps,
            theano_rng=self.theano_rng,
        )

        q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

        pos_samples = flatten(q)

        # The gradients of the expected energy under q are easy, we can just
        # do that in theano
        expected_energy_q = model.energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(
            safe_zip(
                params, T.grad(expected_energy_q, params, consider_constant=pos_samples, disconnected_inputs="ignore")
            )
        )
        return gradients
Beispiel #15
0
    def _get_variational_pos(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """
        if self.supervised:
            assert Y is not None
            # note: if the Y layer changes to something without linear energy,
            # we'll need to make the expected energy clamp Y in the positive
            # phase
            assert isinstance(model.hidden_layers[-1], dbm.Softmax)

        q = model.mf(X, Y)
        """
            Use the non-negativity of the KL divergence to construct a lower
            bound on the log likelihood. We can drop all terms that are
            constant with repsect to the model parameters:

            log P(v) = L(v, q) + KL(q || P(h|v))
            L(v, q) = log P(v) - KL(q || P(h|v))
            L(v, q) = log P(v) - sum_h q(h) log q(h) + q(h) log P(h | v)
            L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
            L(v, q) = log P(v) + sum_h q(h) log P(h, v)
                               - sum_h q(h) log P(v) + const
            L(v, q) = sum_h q(h) log P(h, v) + const
            L(v, q) = sum_h q(h) -E(h, v) - log Z + const

            so the cost we want to minimize is
            expected_energy + log Z + const


            Note: for the RBM, this bound is exact, since the KL divergence
                  goes to 0.
        """

        variational_params = flatten(q)

        # The gradients of the expected energy under q are easy, we can just
        # do that in theano
        expected_energy_q = model.expected_energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(
            safe_zip(
                params,
                T.grad(expected_energy_q,
                       params,
                       consider_constant=variational_params,
                       disconnected_inputs='ignore')))
        return gradients
Beispiel #16
0
    def _get_variational_pos(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """
        if self.supervised:
            assert Y is not None
            # note: if the Y layer changes to something without linear energy,
            # we'll need to make the expected energy clamp Y in the positive
            # phase
            assert isinstance(model.hidden_layers[-1], Softmax)

        q = model.mf(X, Y)

        """
            Use the non-negativity of the KL divergence to construct a lower
            bound on the log likelihood. We can drop all terms that are
            constant with repsect to the model parameters:

            log P(v) = L(v, q) + KL(q || P(h|v))
            L(v, q) = log P(v) - KL(q || P(h|v))
            L(v, q) = log P(v) - sum_h q(h) log q(h) + q(h) log P(h | v)
            L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
            L(v, q) = log P(v) + sum_h q(h) log P(h, v)
                               - sum_h q(h) log P(v) + const
            L(v, q) = sum_h q(h) log P(h, v) + const
            L(v, q) = sum_h q(h) -E(h, v) - log Z + const

            so the cost we want to minimize is
            expected_energy + log Z + const


            Note: for the RBM, this bound is exact, since the KL divergence
                  goes to 0.
        """

        variational_params = flatten(q)

        # The gradients of the expected energy under q are easy, we can just
        # do that in theano
        expected_energy_q = model.expected_energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(
            safe_zip(params, T.grad(expected_energy_q,
                                    params,
                                    consider_constant=variational_params,
                                    disconnected_inputs='ignore'))
        )
        return gradients
Beispiel #17
0
    def _get_toronto_neg(self, model, layer_to_chains):
        """
        .. todo::

            WRITEME
        """
        # Ruslan Salakhutdinov's undocumented negative phase from
        # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
        # IG copied it here without fully understanding it, so it
        # only applies to exactly the same model structure as
        # in that code.

        assert isinstance(model.visible_layer, dbm.BinaryVector)
        assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
        assert model.hidden_layers[0].pool_size == 1
        assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
        assert model.hidden_layers[1].pool_size == 1
        assert isinstance(model.hidden_layers[2], dbm.Softmax)
        assert len(model.hidden_layers) == 3

        params = list(model.get_params())

        V_samples = layer_to_chains[model.visible_layer]
        H1_samples, H2_samples, Y_samples = [
            layer_to_chains[layer] for layer in model.hidden_layers
        ]

        H1_mf = model.hidden_layers[0].mf_update(
            state_below=model.visible_layer.upward_state(V_samples),
            state_above=model.hidden_layers[1].downward_state(H2_samples),
            layer_above=model.hidden_layers[1])
        Y_mf = model.hidden_layers[2].mf_update(
            state_below=model.hidden_layers[1].upward_state(H2_samples))
        H2_mf = model.hidden_layers[1].mf_update(
            state_below=model.hidden_layers[0].upward_state(H1_mf),
            state_above=model.hidden_layers[2].downward_state(Y_mf),
            layer_above=model.hidden_layers[2])

        expected_energy_p = model.energy(V_samples,
                                         [H1_mf, H2_mf, Y_samples]).mean()

        constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

        neg_phase_grads = OrderedDict(
            safe_zip(
                params,
                T.grad(-expected_energy_p, params,
                       consider_constant=constants)))
        return neg_phase_grads
Beispiel #18
0
    def _get_toronto_neg(self, model, layer_to_chains):
        """
        .. todo::

            WRITEME
        """
        # Ruslan Salakhutdinov's undocumented negative phase from
        # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
        # IG copied it here without fully understanding it, so it
        # only applies to exactly the same model structure as
        # in that code.

        assert isinstance(model.visible_layer, BinaryVector)
        assert isinstance(model.hidden_layers[0], BinaryVectorMaxPool)
        assert model.hidden_layers[0].pool_size == 1
        assert isinstance(model.hidden_layers[1], BinaryVectorMaxPool)
        assert model.hidden_layers[1].pool_size == 1
        assert isinstance(model.hidden_layers[2], Softmax)
        assert len(model.hidden_layers) == 3

        params = list(model.get_params())

        V_samples = layer_to_chains[model.visible_layer]
        H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for
                                             layer in model.hidden_layers]

        H1_mf = model.hidden_layers[0].mf_update(
            state_below=model.visible_layer.upward_state(V_samples),
            state_above=model.hidden_layers[1].downward_state(H2_samples),
            layer_above=model.hidden_layers[1])
        Y_mf = model.hidden_layers[2].mf_update(
            state_below=model.hidden_layers[1].upward_state(H2_samples))
        H2_mf = model.hidden_layers[1].mf_update(
            state_below=model.hidden_layers[0].upward_state(H1_mf),
            state_above=model.hidden_layers[2].downward_state(Y_mf),
            layer_above=model.hidden_layers[2])

        expected_energy_p = model.energy(
            V_samples, [H1_mf, H2_mf, Y_samples]
        ).mean()

        constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

        neg_phase_grads = OrderedDict(
            safe_zip(params, T.grad(-expected_energy_p, params,
                                    consider_constant=constants)))
        return neg_phase_grads
    keep = keep_func(filter_me.X, filter_me.y)

    keep = keep.astype('bool')

    filter_me.X = filter_me.X[keep, :]
    filter_me.y = filter_me.y[keep, :]



dropout = hasattr(model.inference_procedure, 'V_dropout')
if dropout:
    include_prob = model.inference_procedure.include_prob
    theano_rng = MRG_RandomStreams(2012+11+20)
    updates = {}
    for elem in flatten([model.inference_procedure.V_dropout, model.inference_procedure.H_dropout]):
        updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob
    do_dropout = function([], updates=updates)


while True:
    if dropout:
        do_dropout()

    if cost.supervised:
        X, Y = dataset.get_batch_design(m, include_labels = True)
    else:
        X = dataset.get_batch_design(m)
    if topo:
        X = dataset.get_topological_view(X)
Beispiel #20
0
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.set_batch_size:
            model.set_batch_size(self.batch_size)

        if self.batch_size is None:
            self.batch_size = model.force_batch_size

        model.cost = self.cost
        model.mask_gen = self.mask_gen

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        prereq = self.get_setup_batch_object()
        #We want to use big batches. We need to make several theano calls on each
        #batch. To avoid paying the GPU latency every time, we use a shared variable
        #but the shared variable needs to stay allocated during the time that the
        #monitor is working, and we don't want the monitor to increase the memory
        #overhead. So we make the monitor work off of the same shared variable
        space = model.get_input_space()
        X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
        self.space = space
        rng = np.random.RandomState([2012, 7, 20])
        test_mask = space.get_origin_batch(model.batch_size)
        test_mask = rng.randint(0, 2, test_mask.shape)
        if hasattr(self.mask_gen,
                   'sync_channels') and self.mask_gen.sync_channels:
            if test_mask.ndim != 4:
                raise NotImplementedError()
            test_mask = test_mask[:, :, :, 0]
            assert test_mask.ndim == 3
        drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
        self.drop_mask = drop_mask
        assert drop_mask.ndim == test_mask.ndim

        Y = None
        drop_mask_Y = None
        if self.cost.supervised:
            Y = sharedX(
                model.get_output_space().get_origin_batch(model.batch_size),
                'BGD_Y')
            self.Y = Y
            test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
            drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                                  name='drop_mask_Y')
            self.drop_mask_Y = drop_mask_Y
            dmx, dmy = self.mask_gen(X, Y)
            updates = OrderedDict([ (drop_mask, dmx),\
                    (drop_mask_Y, dmy)] )
        else:
            updates = OrderedDict([(drop_mask, self.mask_gen(X))])

        obj = self.cost(model,
                        X,
                        Y,
                        drop_mask=drop_mask,
                        drop_mask_Y=drop_mask_Y)
        gradients, gradient_updates = self.cost.get_gradients(
            model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            theano_rng = MRG_RandomStreams(2012 + 11 + 20)
            for elem in flatten([
                    model.inference_procedure.V_dropout,
                    model.inference_procedure.H_dropout
            ]):
                updates[elem] = theano_rng.binomial(
                    p=include_prob, size=elem.shape, dtype=elem.dtype,
                    n=1) / include_prob
        self.update_mask = function([], updates=updates)

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None
            assert X.name is not None
            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            assert X.name is not None
            wtf = self.cost.get_monitoring_channels(model,
                                                    X=X,
                                                    Y=Y,
                                                    drop_mask=drop_mask,
                                                    drop_mask_Y=drop_mask_Y)
            for key in wtf:
                channels[key] = wtf[key]

            for dataset_name in self.monitoring_dataset:

                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'

                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)
                #we only need to put the prereq in once to make sure it gets run
                #adding it more times shouldn't hurt, but be careful
                #each time you say "self.setup_batch" you get a new object with a
                #different id, and if you install n of those the prereq will run n
                #times. It won't cause any wrong results, just a big slowdown
                warnings.warn(
                    "This is weird-- ipt=(X,Y)=tell the monitor to replace X, Y with the givens dict, "
                    " but you don't actually want them to be replaced.")
                ipt = X
                if Y is not None:
                    ipt = [X, Y]
                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset,
                                         prereqs=[prereq])

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = []

                    prereqs = list(prereqs)
                    prereqs.append(prereq)

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        self.accumulate = self.combine_batches > 1
        if self.accumulate:
            self.inputs = [
                elem for elem in [X, Y, drop_mask, drop_mask_Y]
                if elem is not None
            ]
        else:
            self.inputs = None

        self.optimizer = BatchGradientDescent(
            objective=obj,
            inputs=self.inputs,
            verbose=1,
            gradients=gradients,
            gradient_updates=gradient_updates,
            params=model.get_params(),
            lr_scalers=model.get_lr_scalers(),
            param_constrainers=[model.censor_updates],
            max_iter=self.max_iter,
            tol=3e-7,
            init_alpha=self.init_alpha,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            accumulate=self.accumulate,
            theano_function_mode=self.theano_function_mode)
        self.X = X

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=ipt,
                val=self.optimizer.ave_step_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=ipt,
                val=self.optimizer.ave_grad_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=ipt,
                val=self.optimizer.ave_grad_mult,
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True
Beispiel #21
0
    keep_func = function([gX, gY], keep)

    keep = keep_func(filter_me.X, filter_me.y)

    keep = keep.astype('bool')

    filter_me.X = filter_me.X[keep, :]
    filter_me.y = filter_me.y[keep, :]

dropout = hasattr(model.inference_procedure, 'V_dropout')
if dropout:
    include_prob = model.inference_procedure.include_prob
    theano_rng = MRG_RandomStreams(2012 + 11 + 20)
    updates = {}
    for elem in flatten([
            model.inference_procedure.V_dropout,
            model.inference_procedure.H_dropout
    ]):
        updates[elem] = theano_rng.binomial(
            p=include_prob, size=elem.shape, dtype=elem.dtype,
            n=1) / include_prob
    do_dropout = function([], updates=updates)

while True:
    if dropout:
        do_dropout()

    if cost.supervised:
        X, Y = dataset.get_batch_design(m, include_labels=True)
    else:
        X = dataset.get_batch_design(m)
    if topo:
Beispiel #22
0
    def get_gradients(self, model, X, Y=None):
        """
        PCD approximation to the gradient.
        Keep in mind this is a cost, so we use
        the negative log likelihood.
        """

        layer_to_clamp = OrderedDict([(model.visible_layer, True )])
        layer_to_pos_samples = OrderedDict([(model.visible_layer, X)])
        if self.supervised:
            assert Y is not None
            # note: if the Y layer changes to something without linear energy,
            # we'll need to make the expected energy clamp Y in the positive phase
            assert isinstance(model.hidden_layers[-1], dbm.Softmax)
            layer_to_clamp[model.hidden_layers[-1]] = True
            layer_to_pos_samples[model.hidden_layers[-1]] = Y
            hid = model.hidden_layers[:-1]
        else:
            assert Y is None
            hid = model.hidden_layers

        for layer in hid:
            mf_state = layer.init_mf_state()
            def recurse_zeros(x):
                if isinstance(x, tuple):
                    return tuple([recurse_zeros(e) for e in x])
                return x.zeros_like()
            layer_to_pos_samples[layer] = recurse_zeros(mf_state)

        layer_to_pos_samples = model.mcmc_steps(
                layer_to_state=layer_to_pos_samples,
                layer_to_clamp=layer_to_clamp,
                num_steps=self.num_gibbs_steps,
                theano_rng = self.theano_rng)

        q = [layer_to_pos_samples[layer] for layer in model.hidden_layers]

        pos_samples = flatten(q)

        # The gradients of the expected energy under q are easy, we can just do that in theano
        expected_energy_q = model.energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(safe_zip(params, T.grad(expected_energy_q, params,
            consider_constant = pos_samples,
            disconnected_inputs = 'ignore')))

        """
        d/d theta log Z = (d/d theta Z) / Z
                        = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
                        = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
                        = - sum_h sum_v P(v,h)  d/d theta E(v,h)
        """

        layer_to_chains = model.make_layer_to_state(self.num_chains)

        def recurse_check(l):
            if isinstance(l, (list, tuple)):
                for elem in l:
                    recurse_check(elem)
            else:
                assert l.get_value().shape[0] == self.num_chains

        recurse_check(layer_to_chains.values())

        model.layer_to_chains = layer_to_chains

        # Note that we replace layer_to_chains with a dict mapping to the new
        # state of the chains
        updates, layer_to_chains = model.get_sampling_updates(layer_to_chains,
                self.theano_rng, num_steps=self.num_gibbs_steps,
                return_layer_to_updated = True)


        if self.toronto_neg:
            # Ruslan Salakhutdinov's undocumented negative phase from
            # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
            # IG copied it here without fully understanding it, so it
            # only applies to exactly the same model structure as
            # in that code.

            assert isinstance(model.visible_layer, dbm.BinaryVector)
            assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[0].pool_size == 1
            assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[1].pool_size == 1
            assert isinstance(model.hidden_layers[2], dbm.Softmax)
            assert len(model.hidden_layers) == 3

            V_samples = layer_to_chains[model.visible_layer]
            H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for layer in model.hidden_layers]

            H1_mf = model.hidden_layers[0].mf_update(state_below=model.visible_layer.upward_state(V_samples),
                                                    state_above=model.hidden_layers[1].downward_state(H2_samples),
                                                    layer_above=model.hidden_layers[1])
            Y_mf = model.hidden_layers[2].mf_update(state_below=model.hidden_layers[1].upward_state(H2_samples))
            H2_mf = model.hidden_layers[1].mf_update(state_below=model.hidden_layers[0].upward_state(H1_mf),
                                                    state_above=model.hidden_layers[2].downward_state(Y_mf),
                                                    layer_above=model.hidden_layers[2])

            expected_energy_p = model.energy(V_samples, [H1_mf, H2_mf, Y_samples]).mean()

            constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant = constants)))
        else:
            warnings.warn("""TODO: reduce variance of negative phase by integrating out
                    the even-numbered layers. The Rao-Blackwellize method can do this
                    for you when expected gradient = gradient of expectation, but doing
                    this in general is trickier.""")
            #layer_to_chains = model.rao_blackwellize(layer_to_chains)
            expected_energy_p = model.energy(layer_to_chains[model.visible_layer],
                    [layer_to_chains[layer] for layer in model.hidden_layers]).mean()

            samples = flatten(layer_to_chains.values())
            for i, sample in enumerate(samples):
                if sample.name is None:
                    sample.name = 'sample_'+str(i)

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant
                = samples, disconnected_inputs='ignore')))


        for param in list(gradients.keys()):
            #print param.name,': '
            #print theano.printing.min_informative_str(neg_phase_grads[param])
            gradients[param] =  neg_phase_grads[param]  + gradients[param]

        return gradients, updates
Beispiel #23
0
    def get_gradients(self, model, X, Y=None):
        """
        PCD approximation to the gradient of the bound.
        Keep in mind this is a cost, so we are upper bounding
        the negative log likelihood.
        """

        if self.supervised:
            assert Y is not None
            # note: if the Y layer changes to something without linear energy,
            # we'll need to make the expected energy clamp Y in the positive phase
            assert isinstance(model.hidden_layers[-1], dbm.Softmax)



        q = model.mf(X, Y)


        """
            Use the non-negativity of the KL divergence to construct a lower bound
            on the log likelihood. We can drop all terms that are constant with
            repsect to the model parameters:

            log P(v) = L(v, q) + KL(q || P(h|v))
            L(v, q) = log P(v) - KL(q || P(h|v))
            L(v, q) = log P(v) - sum_h q(h) log q(h) + q(h) log P(h | v)
            L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const
            L(v, q) = log P(v) + sum_h q(h) log P(h, v) - sum_h q(h) log P(v) + const
            L(v, q) = sum_h q(h) log P(h, v) + const
            L(v, q) = sum_h q(h) -E(h, v) - log Z + const

            so the cost we want to minimize is
            expected_energy + log Z + const


            Note: for the RBM, this bound is exact, since the KL divergence goes to 0.
        """

        variational_params = flatten(q)

        # The gradients of the expected energy under q are easy, we can just do that in theano
        expected_energy_q = model.expected_energy(X, q).mean()
        params = list(model.get_params())
        gradients = OrderedDict(safe_zip(params, T.grad(expected_energy_q, params,
            consider_constant = variational_params,
            disconnected_inputs = 'ignore')))

        """
        d/d theta log Z = (d/d theta Z) / Z
                        = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
                        = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
                        = - sum_h sum_v P(v,h)  d/d theta E(v,h)
        """

        layer_to_chains = model.make_layer_to_state(self.num_chains)

        def recurse_check(l):
            if isinstance(l, (list, tuple)):
                for elem in l:
                    recurse_check(elem)
            else:
                assert l.get_value().shape[0] == self.num_chains

        recurse_check(layer_to_chains.values())

        model.layer_to_chains = layer_to_chains

        # Note that we replace layer_to_chains with a dict mapping to the new
        # state of the chains
        updates, layer_to_chains = model.get_sampling_updates(layer_to_chains,
                self.theano_rng, num_steps=self.num_gibbs_steps,
                return_layer_to_updated = True)


        if self.toronto_neg:
            # Ruslan Salakhutdinov's undocumented negative phase from
            # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
            # IG copied it here without fully understanding it, so it
            # only applies to exactly the same model structure as
            # in that code.

            assert isinstance(model.visible_layer, dbm.BinaryVector)
            assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[0].pool_size == 1
            assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
            assert model.hidden_layers[1].pool_size == 1
            assert isinstance(model.hidden_layers[2], dbm.Softmax)
            assert len(model.hidden_layers) == 3

            V_samples = layer_to_chains[model.visible_layer]
            H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for layer in model.hidden_layers]

            H1_mf = model.hidden_layers[0].mf_update(state_below=model.visible_layer.upward_state(V_samples),
                                                    state_above=model.hidden_layers[1].downward_state(H2_samples),
                                                    layer_above=model.hidden_layers[1])
            Y_mf = model.hidden_layers[2].mf_update(state_below=model.hidden_layers[1].upward_state(H2_samples))
            H2_mf = model.hidden_layers[1].mf_update(state_below=model.hidden_layers[0].upward_state(H1_mf),
                                                    state_above=model.hidden_layers[2].downward_state(Y_mf),
                                                    layer_above=model.hidden_layers[2])

            expected_energy_p = model.energy(V_samples, [H1_mf, H2_mf, Y_samples]).mean()

            constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant = constants)))
        else:
            warnings.warn("""TODO: reduce variance of negative phase by integrating out
                    the even-numbered layers. The Rao-Blackwellize method can do this
                    for you when expected gradient = gradient of expectation, but doing
                    this in general is trickier.""")
            #layer_to_chains = model.rao_blackwellize(layer_to_chains)
            expected_energy_p = model.energy(layer_to_chains[model.visible_layer],
                    [layer_to_chains[layer] for layer in model.hidden_layers]).mean()

            samples = flatten(layer_to_chains.values())
            for i, sample in enumerate(samples):
                if sample.name is None:
                    sample.name = 'sample_'+str(i)

            neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant
                = samples, disconnected_inputs='ignore')))


        for param in list(gradients.keys()):
            gradients[param] = neg_phase_grads[param] + gradients[param]

        return gradients, updates
Beispiel #24
0
    def expr(self, model, data, drop_mask = None, drop_mask_Y = None,
            return_locals = False, include_toronto = True, ** kwargs):
        """
        .. todo::

            WRITEME
        """

        if self.supervised:
            X, Y = data
        else:
            X = data
            Y = None

        if not self.supervised:
            assert drop_mask_Y is None
            # ignore Y if some other cost is supervised and has made it get
            # passed in (can this still happen after the (space, source)
            # interface change?)
            Y = None
        if self.supervised:
            assert Y is not None
            if drop_mask is not None:
                assert drop_mask_Y is not None

        if not hasattr(model,'cost'):
            model.cost = self
        if not hasattr(model,'mask_gen'):
            model.mask_gen = self.mask_gen

        dbm = model

        X_space = model.get_input_space()

        if drop_mask is None:
            if self.supervised:
                drop_mask, drop_mask_Y = self.mask_gen(X, Y, X_space=X_space)
            else:
                drop_mask = self.mask_gen(X, X_space=X_space)

        if drop_mask_Y is not None:
            assert drop_mask_Y.ndim == 1

        if drop_mask.ndim < X.ndim:
            if self.mask_gen is not None:
                assert self.mask_gen.sync_channels
            if X.ndim != 4:
                raise NotImplementedError()
            drop_mask = drop_mask.dimshuffle(0,1,2,'x')

        if not hasattr(self,'noise'):
            self.noise = False

        history = dbm.do_inpainting(X, Y = Y, drop_mask = drop_mask,
                drop_mask_Y = drop_mask_Y, return_history = True,
                noise = self.noise,
                niter = self.niter, block_grad = self.block_grad)
        final_state = history[-1]

        new_drop_mask = None
        new_drop_mask_Y = None
        new_history = [ None for state in history ]

        if not hasattr(self, 'both_directions'):
            self.both_directions = False
        if self.both_directions:
            new_drop_mask = 1. - drop_mask
            if self.supervised:
                new_drop_mask_Y = 1. - drop_mask_Y
            new_history = dbm.do_inpainting(X, Y = Y,
                    drop_mask=new_drop_mask,
                    drop_mask_Y=new_drop_mask_Y, return_history=True,
                    noise = self.noise,
                    niter = self.niter, block_grad = self.block_grad)

        new_final_state = new_history[-1]

        total_cost, sublocals = self.cost_from_states(final_state,
                new_final_state, dbm, X, Y, drop_mask, drop_mask_Y,
                new_drop_mask, new_drop_mask_Y,
                return_locals=True)
        l1_act_cost = sublocals['l1_act_cost']
        inpaint_cost = sublocals['inpaint_cost']
        reweighted_act_cost = sublocals['reweighted_act_cost']

        if not hasattr(self, 'robustness'):
            self.robustness = None
        if self.robustness is not None:
            inpainting_H_hat = history[-1]['H_hat']
            mf_H_hat = dbm.mf(X, Y=Y)
            if self.supervised:
                inpainting_H_hat = inpainting_H_hat[:-1]
                mf_H_hat = mf_H_hat[:-1]
                for ihh, mhh in safe_izip(flatten(inpainting_H_hat),
                        flatten(mf_H_hat)):
                    total_cost += self.robustness * T.sqr(mhh-ihh).sum()

        if not hasattr(self, 'toronto_act_targets'):
            self.toronto_act_targets = None
        toronto_act_cost = None
        if self.toronto_act_targets is not None and include_toronto:
            toronto_act_cost = 0.
            H_hat = history[-1]['H_hat']
            for s, c, t in zip(H_hat, self.toronto_act_coeffs,
                    self.toronto_act_targets):
                if c == 0.:
                    continue
                s, _ = s
                m = s.mean(axis=0)
                toronto_act_cost += c * T.sqr(m-t).mean()
            total_cost += toronto_act_cost

        if return_locals:
            return locals()

        total_cost.name = 'total_inpaint_cost'

        return total_cost
Beispiel #25
0
    def get_gradients(self, model, data, **kwargs):
        """
        .. todo::

            WRITEME
        """
        self.get_data_specs(model)[0].validate(data)
        obj, scratch = self.base_cost.expr(model,
                                           data,
                                           return_locals=True,
                                           **kwargs)
        if self.supervised:
            assert isinstance(data, (list, tuple))
            assert len(data) == 2
            (X, Y) = data
        else:
            X = data

        H_hat = scratch['H_hat']
        terms = scratch['terms']
        hidden_layers = scratch['hidden_layers']

        grads = OrderedDict()

        assert len(H_hat) == len(terms)
        assert len(terms) == len(hidden_layers)
        num_layers = len(hidden_layers)
        for i in xrange(num_layers):
            state = H_hat[i]
            layer = model.hidden_layers[i]
            term = terms[i]

            if term == 0.:
                continue
            else:
                print 'term is ', term

            if i == 0:
                state_below = X
                layer_below = model.visible_layer
            else:
                layer_below = model.hidden_layers[i - 1]
                state_below = H_hat[i - 1]
            state_below = layer_below.upward_state(state_below)

            components = flatten(state)

            real_grads = T.grad(term, components)

            fake_state = layer.linear_feed_forward_approximation(state_below)

            fake_components = flatten(fake_state)
            real_grads = OrderedDict(safe_zip(fake_components, real_grads))

            params = list(layer.get_params())
            fake_grads = pylearn2.utils.grad(
                cost=None,
                consider_constant=flatten(state_below),
                wrt=params,
                known_grads=real_grads)

            for param, grad in safe_zip(params, fake_grads):
                if param in grads:
                    grads[param] = grads[param] + grad
                else:
                    grads[param] = grad

        return grads, OrderedDict()
Beispiel #26
0
    def __call__(self,
                 model,
                 X,
                 Y=None,
                 drop_mask=None,
                 drop_mask_Y=None,
                 return_locals=False,
                 include_toronto=True,
                 **kwargs):
        """
        .. todo::

            WRITEME
        """

        if not self.supervised:
            assert drop_mask_Y is None
            Y = None  # ignore Y if some other cost is supervised and has made it get passed in
        if self.supervised:
            assert Y is not None
            if drop_mask is not None:
                assert drop_mask_Y is not None

        if not hasattr(model, 'cost'):
            model.cost = self
        if not hasattr(model, 'mask_gen'):
            model.mask_gen = self.mask_gen

        dbm = model

        X_space = model.get_input_space()

        if drop_mask is None:
            if self.supervised:
                drop_mask, drop_mask_Y = self.mask_gen(X, Y, X_space=X_space)
            else:
                drop_mask = self.mask_gen(X, X_space=X_space)

        if drop_mask_Y is not None:
            assert drop_mask_Y.ndim == 1

        if drop_mask.ndim < X.ndim:
            if self.mask_gen is not None:
                assert self.mask_gen.sync_channels
            if X.ndim != 4:
                raise NotImplementedError()
            drop_mask = drop_mask.dimshuffle(0, 1, 2, 'x')

        if not hasattr(self, 'noise'):
            self.noise = False

        history = dbm.do_inpainting(X,
                                    Y=Y,
                                    drop_mask=drop_mask,
                                    drop_mask_Y=drop_mask_Y,
                                    return_history=True,
                                    noise=self.noise,
                                    niter=self.niter,
                                    block_grad=self.block_grad)
        final_state = history[-1]

        new_drop_mask = None
        new_drop_mask_Y = None
        new_history = [None for state in history]

        if not hasattr(self, 'both_directions'):
            self.both_directions = False
        if self.both_directions:
            new_drop_mask = 1. - drop_mask
            if self.supervised:
                new_drop_mask_Y = 1. - drop_mask_Y
            new_history = dbm.do_inpainting(X,
                                            Y=Y,
                                            drop_mask=new_drop_mask,
                                            drop_mask_Y=new_drop_mask_Y,
                                            return_history=True,
                                            noise=self.noise,
                                            niter=self.niter,
                                            block_grad=self.block_grad)

        new_final_state = new_history[-1]

        total_cost, sublocals = self.cost_from_states(final_state,
                                                      new_final_state,
                                                      dbm,
                                                      X,
                                                      Y,
                                                      drop_mask,
                                                      drop_mask_Y,
                                                      new_drop_mask,
                                                      new_drop_mask_Y,
                                                      return_locals=True)
        l1_act_cost = sublocals['l1_act_cost']
        inpaint_cost = sublocals['inpaint_cost']
        reweighted_act_cost = sublocals['reweighted_act_cost']

        if not hasattr(self, 'robustness'):
            self.robustness = None
        if self.robustness is not None:
            inpainting_H_hat = history[-1]['H_hat']
            mf_H_hat = dbm.mf(X, Y=Y)
            if self.supervised:
                inpainting_H_hat = inpainting_H_hat[:-1]
                mf_H_hat = mf_H_hat[:-1]
                for ihh, mhh in safe_izip(flatten(inpainting_H_hat),
                                          flatten(mf_H_hat)):
                    total_cost += self.robustness * T.sqr(mhh - ihh).sum()

        if not hasattr(self, 'toronto_act_targets'):
            self.toronto_act_targets = None
        toronto_act_cost = None
        if self.toronto_act_targets is not None and include_toronto:
            toronto_act_cost = 0.
            H_hat = history[-1]['H_hat']
            for s, c, t in zip(H_hat, self.toronto_act_coeffs,
                               self.toronto_act_targets):
                if c == 0.:
                    continue
                s, _ = s
                m = s.mean(axis=0)
                toronto_act_cost += c * T.sqr(m - t).mean()
            total_cost += toronto_act_cost

        if return_locals:
            return locals()

        total_cost.name = 'total_inpaint_cost'

        return total_cost
Beispiel #27
0
    def get_fixed_var_descr(self, model, data):
        """
        .. todo::

            WRITEME
        """

        X, Y = data

        assert Y is not None

        batch_size = model.batch_size

        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = "drop_mask"

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs = [X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space=X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size))
            drop_mask_Y.name = "drop_mask_Y"
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            rval.fixed_vars["drop_mask_Y"] = drop_mask_Y
        if self.mask_gen.sync_channels:
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = "raw_update_X"
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = "zeros_like_X"
            update_X = zeros_like_X + update_X.dimshuffle(0, 1, 2, "x")
            update_X.name = "update_X"
        updates[drop_mask_X] = update_X

        rval.fixed_vars["drop_mask"] = drop_mask_X

        if hasattr(model.inference_procedure, "V_dropout"):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            theano_rng = make_theano_rng(None, 2012 + 10 + 20, which_method="binomial")
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = (
                    theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
                )
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] = (
                    theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob
                )

        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval