Example #1
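    # context assumed by this excerpt (not shown here): the method belongs to an MDN-with-SVI class,
    # and the surrounding module is expected to import numpy as np, theano, theano.tensor as tt, and
    # a neural-net module as nn, and to define rng (a numpy RandomState), dtype, and isposint; the
    # use of xrange/izip suggests Python 2 (or itertools.izip). Treat these exact names as
    # assumptions about the rest of the file.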
    def __init__(self, n_inputs, n_hiddens, act_fun, n_outputs, n_components):
        """
        Constructs an svi mdn with a given architecture. Note that the mdn has full precision matrices.
        :param n_inputs: dimensionality of the input
        :param n_hiddens: list with number of hidden units in the net
        :param act_fun: activation function type to use in the net
        :param n_outputs: dimensionality of the output
        :param n_components: number of mixture components
        :return: None
        """

        # check if inputs are of the right type
        assert isposint(n_inputs), 'Number of inputs must be a positive integer.'
        assert isposint(n_outputs), 'Number of outputs must be a positive integer.'
        assert isposint(n_components), 'Number of components must be a positive integer.'
        assert isinstance(n_hiddens, list), \
            'Number of hidden units must be a list of positive integers.'
        for h in n_hiddens:
            assert isposint(h), \
                'Number of hidden units must be a list of positive integers.'
        assert act_fun in ['logistic', 'tanh', 'linear', 'relu', 'softplus'], \
            'Unsupported activation function.'

        # construct the net
        self.net = nn.NeuralNetSvi(n_inputs)
        for h in n_hiddens:
            self.net.addLayer(h, act_fun)
        self.input = self.net.hs[0]
        self.srng = self.net.srng
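
        # the feedforward body is built layer by layer; self.net.hs[-1] (used throughout below)
        # is the symbolic output of the last hidden layer, and srng is the shared random stream
        # used to draw the reparameterization noise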

        # from here on, the naming of the theano variables may look cryptic, but it follows a consistent scheme
        # each variable name has 3 or 4 letters, with the following meanings:
        # 1st letter: m=mean, s=variance, u=noise, z=random sample. note that s can also be the log std where convenient
        # 2nd letter: W=weights, b=biases, a=activations
        # 3rd letter: a=mixing coefficients, m=means, U=precision factors
        # 4th letter: s if it's a list of variables, nothing otherwise
        # in general, capital means matrix, lowercase means vector(s)
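
        # all stochastic outputs below follow the same reparameterization pattern: given the last
        # hidden layer h, variational mean parameters (mW, mb) and log-std parameters (sW, sb),
        # the code draws
        #     mean      m = h.dot(mW) + mb
        #     variance  s = (h**2).dot(exp(2*sW)) + exp(2*sb)
        #     sample    z = m + sqrt(s) * u,   with u ~ N(0, 1)
        # i.e. noise is injected per activation rather than per weight (the local
        # reparameterization pattern); the per-block comments describe what each z is used for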

        # mixing coefficients
        self.mWa = theano.shared(
            (rng.randn(self.net.n_outputs, n_components) /
             np.sqrt(self.net.n_outputs + 1)).astype(dtype),
            name='mWa')
        self.mba = theano.shared(rng.randn(n_components).astype(dtype),
                                 name='mba')
        self.sWa = theano.shared(
            -5.0 * np.ones([self.net.n_outputs, n_components], dtype=dtype),
            name='sWa')
        self.sba = theano.shared(-5.0 * np.ones(n_components, dtype=dtype),
                                 name='sba')
        uaa = self.srng.normal((self.net.hs[-1].shape[0], n_components),
                               dtype=dtype)
        maa = tt.dot(self.net.hs[-1], self.mWa) + self.mba
        saa = tt.dot(self.net.hs[-1]**2, tt.exp(2 * self.sWa)) + tt.exp(
            2 * self.sba)
        zaa = tt.sqrt(saa) * uaa + maa
        self.a = tt.nnet.softmax(zaa)
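
        # zaa is one noisy linear readout per component; the softmax turns it into mixing
        # coefficients a that are positive and sum to one for every row of the minibatch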

        # mixture means
        # the mean of each component is calculated separately. consider vectorizing this
        self.mWms = [
            theano.shared((rng.randn(self.net.n_outputs, n_outputs) /
                           np.sqrt(self.net.n_outputs + 1)).astype(dtype),
                          name='mWm' + str(i)) for i in xrange(n_components)
        ]
        self.mbms = [
            theano.shared(rng.randn(n_outputs).astype(dtype),
                          name='mbm' + str(i)) for i in xrange(n_components)
        ]
        self.sWms = [
            theano.shared(
                -5.0 * np.ones([self.net.n_outputs, n_outputs], dtype=dtype),
                name='sWm' + str(i)) for i in xrange(n_components)
        ]
        self.sbms = [
            theano.shared(-5.0 * np.ones(n_outputs, dtype=dtype),
                          name='sbm' + str(i)) for i in xrange(n_components)
        ]
        uams = [
            self.srng.normal((self.net.hs[-1].shape[0], n_outputs),
                             dtype=dtype) for i in xrange(n_components)
        ]
        mams = [
            tt.dot(self.net.hs[-1], mWm) + mbm
            for mWm, mbm in izip(self.mWms, self.mbms)
        ]
        sams = [
            tt.dot(self.net.hs[-1]**2, tt.exp(2 * sWm)) + tt.exp(2 * sbm)
            for sWm, sbm in izip(self.sWms, self.sbms)
        ]
        zams = [
            tt.sqrt(sam) * uam + mam
            for sam, uam, mam in izip(sams, uams, mams)
        ]
        self.ms = zams
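
        # each mixture component gets its own reparameterized linear readout of the last hidden
        # layer, so self.ms is a list of n_components tensors, each of shape (batch, n_outputs)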

        # mixture precisions
        # note that U here is an upper triangular matrix such that U'*U is the precision
        self.mWUs = [
            theano.shared((rng.randn(self.net.n_outputs, n_outputs**2) /
                           np.sqrt(self.net.n_outputs + 1)).astype(dtype),
                          name='mWU' + str(i)) for i in xrange(n_components)
        ]
        self.mbUs = [
            theano.shared(rng.randn(n_outputs**2).astype(dtype),
                          name='mbU' + str(i)) for i in xrange(n_components)
        ]
        self.sWUs = [
            theano.shared(
                -5.0 *
                np.ones([self.net.n_outputs, n_outputs**2], dtype=dtype),
                name='sWU' + str(i)) for i in xrange(n_components)
        ]
        self.sbUs = [
            theano.shared(-5.0 * np.ones(n_outputs**2, dtype=dtype),
                          name='sbU' + str(i)) for i in xrange(n_components)
        ]
        uaUs = [
            self.srng.normal((self.net.hs[-1].shape[0], n_outputs**2),
                             dtype=dtype) for i in xrange(n_components)
        ]
        maUs = [
            tt.dot(self.net.hs[-1], mWU) + mbU
            for mWU, mbU in izip(self.mWUs, self.mbUs)
        ]
        saUs = [
            tt.dot(self.net.hs[-1]**2, tt.exp(2 * sWU)) + tt.exp(2 * sbU)
            for sWU, sbU in izip(self.sWUs, self.sbUs)
        ]
        zaUs = [
            tt.sqrt(saU) * uaU + maU
            for saU, uaU, maU in izip(saUs, uaUs, maUs)
        ]
        zaUs_reshaped = [
            tt.reshape(zaU, [-1, n_outputs, n_outputs]) for zaU in zaUs
        ]
        triu_mask = np.triu(np.ones([n_outputs, n_outputs], dtype=dtype), 1)
        diag_mask = np.eye(n_outputs, dtype=dtype)
        self.Us = [
            triu_mask * zaU + diag_mask * tt.exp(diag_mask * zaU)
            for zaU in zaUs_reshaped
        ]
        ldetUs = [
            tt.sum(tt.sum(diag_mask * zaU, axis=2), axis=1)
            for zaU in zaUs_reshaped
        ]
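
        # the parameterization above guarantees valid precision matrices: zaU is reshaped to
        # (batch, n_outputs, n_outputs), its strict upper triangle is kept as-is and its diagonal
        # is exponentiated, so each U is upper triangular with positive diagonal and U'U is
        # symmetric positive definite. ldetUs holds log|det U| = sum of the (pre-exp) diagonal
        # entries, which is reused in the log density below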

        # log probabilities
        self.y = tt.matrix('y')
        lprobs_comps = [
            -0.5 * tt.sum(tt.sum(
                (self.y - m).dimshuffle([0, 'x', 1]) * U, axis=2)**2,
                          axis=1) + ldetU
            for m, U, ldetU in izip(self.ms, self.Us, ldetUs)
        ]
        self.lprobs = tt.log(
            tt.sum(tt.exp(tt.stack(lprobs_comps, axis=1) + tt.log(self.a)),
                   axis=1)) - (0.5 * n_outputs * np.log(2 * np.pi))
        self.mlprob = -tt.mean(self.lprobs)
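
        # per component, the exponent is -0.5 * ||U (y - m)||^2 and ldetU contributes log|det U|,
        # which together give the log density of a Gaussian with precision U'U (up to the constant
        # subtracted at the end); the components are then mixed with the coefficients self.a.
        # note that this is a plain log(sum(exp(...))); a numerically stabilized logsumexp could
        # be substituted here if underflow becomes an issue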

        # all parameters in one container
        self.uas = self.net.uas + [uaa] + uams + uaUs
        self.mas = self.net.mas + [maa] + mams + maUs
        self.zas = self.net.zas + [zaa] + zams + zaUs
        self.mps = self.net.mps + [
            self.mWa, self.mba
        ] + self.mWms + self.mbms + self.mWUs + self.mbUs
        self.sps = self.net.sps + [
            self.sWa, self.sba
        ] + self.sWms + self.sbms + self.sWUs + self.sbUs
        self.parms = self.mps + self.sps
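
        # mps collects the variational means and sps the variational log stds of all weights and
        # biases (net + mixing coefficients + means + precision factors); parms is the full list
        # that an SVI-style training loop would update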

        # theano evaluation functions, will be compiled when first needed
        self.eval_comps_f = None
        self.eval_lprobs_f = None
        self.eval_comps_f_rand = None
        self.eval_lprobs_f_rand = None

        # save these for later
        self.n_inputs = self.net.n_inputs
        self.n_outputs = n_outputs
        self.n_components = n_components
        self.act_fun = act_fun
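
        # illustrative construction (names outside this excerpt, such as the enclosing class
        # name, are assumptions):
        #   mdn = MDN_SVI(n_inputs=5, n_hiddens=[50, 50], act_fun='tanh',
        #                 n_outputs=3, n_components=8)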