def __init__(self, n_inputs, n_hiddens, act_fun, n_outputs, n_components):
    """
    Constructs an svi mdn with a given architecture. Note that the mdn has full precision matrices.
    :param n_inputs: dimensionality of the input
    :param n_hiddens: list with number of hidden units in the net
    :param act_fun: activation function type to use in the net
    :param n_outputs: dimensionality of the output
    :param n_components: number of mixture components
    :return: None
    """

    # check if inputs are of the right type
    assert isposint(n_inputs), 'Number of inputs must be a positive integer.'
    assert isposint(n_outputs), 'Number of outputs must be a positive integer.'
    assert isposint(n_components), 'Number of components must be a positive integer.'
    assert isinstance(n_hiddens, list), 'Number of hidden units must be a list of positive integers.'
    for h in n_hiddens:
        assert isposint(h), 'Number of hidden units must be a list of positive integers.'
    assert act_fun in ['logistic', 'tanh', 'linear', 'relu', 'softplus'], 'Unsupported activation function.'

    # construct the net
    self.net = nn.NeuralNetSvi(n_inputs)
    for h in n_hiddens:
        self.net.addLayer(h, act_fun)
    self.input = self.net.hs[0]
    self.srng = self.net.srng

    # the naming scheme of the theano variables from now on might look a bit cryptic, but it actually
    # makes sense. each variable name has 3 or 4 letters, with the following meanings:
    # 1st letter: m=mean, s=variance, u=noise, z=random. note that s can also be the log std if convenient
    # 2nd letter: W=weights, b=biases or a=activations
    # 3rd letter: a=mixing coefficients, m=means, U=precisions
    # 4th letter: s if it's a list of variables, nothing otherwise
    # in general, capital means matrix, lowercase means vector(s)

    # mixing coefficients
    self.mWa = theano.shared((rng.randn(self.net.n_outputs, n_components) /
                              np.sqrt(self.net.n_outputs + 1)).astype(dtype), name='mWa')
    self.mba = theano.shared(rng.randn(n_components).astype(dtype), name='mba')
    self.sWa = theano.shared(-5.0 * np.ones([self.net.n_outputs, n_components], dtype=dtype), name='sWa')
    self.sba = theano.shared(-5.0 * np.ones(n_components, dtype=dtype), name='sba')
    uaa = self.srng.normal((self.net.hs[-1].shape[0], n_components), dtype=dtype)
    maa = tt.dot(self.net.hs[-1], self.mWa) + self.mba
    saa = tt.dot(self.net.hs[-1] ** 2, tt.exp(2 * self.sWa)) + tt.exp(2 * self.sba)
    zaa = tt.sqrt(saa) * uaa + maa
    self.a = tt.nnet.softmax(zaa)
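    # Explanatory note (added; derivable from the code above): the four lines ending in
    # softmax sample the pre-softmax activations via the local reparameterisation trick.
    # Because the variational posterior over (Wa, ba) is Gaussian with means (mWa, mba)
    # and log stds (sWa, sba), the pre-activation z = h.Wa + ba given the last hidden
    # layer h is itself Gaussian, so it is sampled directly:
    #     mean = h.mWa + mba
    #     var  = (h ** 2).exp(2 * sWa) + exp(2 * sba)
    #     z    = mean + sqrt(var) * u,    u ~ N(0, I)
    # The same pattern is repeated below for the component means and precisions.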
    # mixture means
    # the mean of each component is calculated separately; consider vectorizing this
    self.mWms = [theano.shared((rng.randn(self.net.n_outputs, n_outputs) /
                                np.sqrt(self.net.n_outputs + 1)).astype(dtype),
                               name='mWm' + str(i)) for i in xrange(n_components)]
    self.mbms = [theano.shared(rng.randn(n_outputs).astype(dtype),
                               name='mbm' + str(i)) for i in xrange(n_components)]
    self.sWms = [theano.shared(-5.0 * np.ones([self.net.n_outputs, n_outputs], dtype=dtype),
                               name='sWm' + str(i)) for i in xrange(n_components)]
    self.sbms = [theano.shared(-5.0 * np.ones(n_outputs, dtype=dtype),
                               name='sbm' + str(i)) for i in xrange(n_components)]
    uams = [self.srng.normal((self.net.hs[-1].shape[0], n_outputs), dtype=dtype) for i in xrange(n_components)]
    mams = [tt.dot(self.net.hs[-1], mWm) + mbm for mWm, mbm in izip(self.mWms, self.mbms)]
    sams = [tt.dot(self.net.hs[-1] ** 2, tt.exp(2 * sWm)) + tt.exp(2 * sbm) for sWm, sbm in izip(self.sWms, self.sbms)]
    zams = [tt.sqrt(sam) * uam + mam for sam, uam, mam in izip(sams, uams, mams)]
    self.ms = zams

    # mixture precisions
    # note that U here is an upper triangular matrix such that U'*U is the precision
    self.mWUs = [theano.shared((rng.randn(self.net.n_outputs, n_outputs ** 2) /
                                np.sqrt(self.net.n_outputs + 1)).astype(dtype),
                               name='mWU' + str(i)) for i in xrange(n_components)]
    self.mbUs = [theano.shared(rng.randn(n_outputs ** 2).astype(dtype),
                               name='mbU' + str(i)) for i in xrange(n_components)]
    self.sWUs = [theano.shared(-5.0 * np.ones([self.net.n_outputs, n_outputs ** 2], dtype=dtype),
                               name='sWU' + str(i)) for i in xrange(n_components)]
    self.sbUs = [theano.shared(-5.0 * np.ones(n_outputs ** 2, dtype=dtype),
                               name='sbU' + str(i)) for i in xrange(n_components)]
    uaUs = [self.srng.normal((self.net.hs[-1].shape[0], n_outputs ** 2), dtype=dtype) for i in xrange(n_components)]
    maUs = [tt.dot(self.net.hs[-1], mWU) + mbU for mWU, mbU in izip(self.mWUs, self.mbUs)]
    saUs = [tt.dot(self.net.hs[-1] ** 2, tt.exp(2 * sWU)) + tt.exp(2 * sbU) for sWU, sbU in izip(self.sWUs, self.sbUs)]
    zaUs = [tt.sqrt(saU) * uaU + maU for saU, uaU, maU in izip(saUs, uaUs, maUs)]
    zaUs_reshaped = [tt.reshape(zaU, [-1, n_outputs, n_outputs]) for zaU in zaUs]
    triu_mask = np.triu(np.ones([n_outputs, n_outputs], dtype=dtype), 1)
    diag_mask = np.eye(n_outputs, dtype=dtype)
    self.Us = [triu_mask * zaU + diag_mask * tt.exp(diag_mask * zaU) for zaU in zaUs_reshaped]
    ldetUs = [tt.sum(tt.sum(diag_mask * zaU, axis=2), axis=1) for zaU in zaUs_reshaped]

    # log probabilities
    self.y = tt.matrix('y')
    lprobs_comps = [-0.5 * tt.sum(tt.sum((self.y - m).dimshuffle([0, 'x', 1]) * U, axis=2) ** 2, axis=1) + ldetU
                    for m, U, ldetU in izip(self.ms, self.Us, ldetUs)]
    self.lprobs = tt.log(tt.sum(tt.exp(tt.stack(lprobs_comps, axis=1) + tt.log(self.a)), axis=1)) \
                  - (0.5 * n_outputs * np.log(2 * np.pi))
    self.mlprob = -tt.mean(self.lprobs)

    # all parameters in one container
    self.uas = self.net.uas + [uaa] + uams + uaUs
    self.mas = self.net.mas + [maa] + mams + maUs
    self.zas = self.net.zas + [zaa] + zams + zaUs
    self.mps = self.net.mps + [self.mWa, self.mba] + self.mWms + self.mbms + self.mWUs + self.mbUs
    self.sps = self.net.sps + [self.sWa, self.sba] + self.sWms + self.sbms + self.sWUs + self.sbUs
    self.parms = self.mps + self.sps

    # theano evaluation functions, will be compiled when first needed
    self.eval_comps_f = None
    self.eval_lprobs_f = None
    self.eval_comps_f_rand = None
    self.eval_lprobs_f_rand = None

    # save these for later
    self.n_inputs = self.net.n_inputs
    self.n_outputs = n_outputs
    self.n_components = n_components
    self.act_fun = act_fun
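# A minimal usage sketch (added; the class name MDN_SVI and all numbers below are
# illustrative assumptions, not taken from the source). It assumes this __init__
# belongs to such a class, and that the enclosing module provides `nn`, `rng`,
# `dtype` and `isposint`, plus the theano/numpy/itertools imports this Python 2
# code relies on (izip, xrange):
#
#     mdn = MDN_SVI(n_inputs=5, n_hiddens=[50, 50], act_fun='tanh',
#                   n_outputs=3, n_components=8)
#     # mdn.input and mdn.y are the theano matrices to feed inputs/targets into;
#     # mdn.mlprob is the negative mean log likelihood, to be minimised w.r.t.
#     # mdn.parms (together with the SVI KL penalty on mps/sps) during training.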