def mk_stochastic(T):
    '''
    % MK_STOCHASTIC Ensure the argument is a stochastic matrix, i.e., the sum over the last dimension is 1.
    % [T,Z] = mk_stochastic(T)
    %
    % If T is a vector, it will sum to 1.
    % If T is a matrix, each row will sum to 1.
    % If T is a 3D array, then sum_k T(i,j,k) = 1 for all i,j.
    '''
    T = np.asfarray(T)
    if T.ndim == 1 or (T.ndim == 2 and (T.shape[0] == 1 or T.shape[1] == 1)):  # isvector
        T, Z = normalise(T)
    elif T.ndim == 2:  # matrix
        T = np.asmatrix(T)
        Z = np.sum(T, 1)
        # Set zeros to 1 before dividing.
        # This is valid since Z(i) = 0 iff T(i,j) = 0 for all j.
        S = Z + (Z == 0)
        norm = np.tile(S, (1, T.shape[1]))
        T = np.divide(T, norm)
    else:  # multi-dimensional array
        ns = T.shape
        T = np.asmatrix(np.reshape(T, (np.prod(ns[0:-1]), ns[-1])))
        Z = np.sum(T, 1)
        S = Z + (Z == 0)
        norm = np.tile(S, (1, ns[-1]))
        T = np.divide(T, norm)
        T = np.reshape(np.asarray(T), ns)
    return T, Z
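# Hedged usage sketch (not from the original source): mk_stochastic rescales
# the rows of a nonnegative matrix to sum to 1. Assumes numpy is imported as
# np and normalise is in scope, as elsewhere in this module.
def _demo_mk_stochastic():
    A = np.random.rand(3, 4)  # arbitrary nonnegative matrix
    T, Z = mk_stochastic(A)
    # Every row of the result is a probability distribution.
    assert np.allclose(np.asarray(T).sum(axis=1), 1.0)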
def testMatrix(self):
    v = np.array([[1, 2], [1, 2]])
    vn, s = normalise(v)
    assert np.all(
        np.abs(vn - np.array([[0.16666667, 0.33333333],
                              [0.16666667, 0.33333333]])) < 1e-3)
    assert s == 6
def mixgauss_init(M, data, cov_type, method='kmeans'):
    '''
    % MIXGAUSS_INIT Initial parameter estimates for a mixture of Gaussians
    % function [mu, Sigma, weights] = mixgauss_init(M, data, cov_type, method)
    %
    % INPUTS:
    % data(:,t) is the t'th example
    % M = num. mixture components
    % cov_type = 'full', 'diag' or 'spherical'
    % method = 'rnd' (choose centers randomly from data) or 'kmeans' (uses sklearn's GMM)
    %
    % OUTPUTS:
    % mu(:,k)
    % Sigma(:,:,k)
    % weights(k)
    '''
    if isinstance(data, list):
        data = np.hstack(data)
    elif data.ndim == 3:
        O, T, N = data.shape
        data = np.reshape(np.transpose(data, (0, 2, 1)), (O, T * N))
    d, T = data.shape
    if method == 'rnd':
        C = np.atleast_2d(np.cov(data))
        Sigma = np.transpose(np.tile(np.diag(np.diag(C)) * 0.5, (M, 1, 1)), (2, 1, 0))
        # Initialize each mean to a random data point
        indices = np.arange(T)
        np.random.shuffle(indices)
        mu = data[:, indices[0:M]]
        weights, _ = normalise(np.ones((M, 1)))
    elif method == 'kmeans':
        gmm = GMM(n_components=M, covariance_type=cov_type,
                  thresh=1e-2, min_covar=1e-3,
                  n_iter=5, n_init=1, params='wmc', init_params='wmc')
        gmm.fit(data.T)
        mu = gmm.means_.T
        weights = np.asmatrix(gmm.weights_).T
        covars = gmm.covars_
        Sigma = np.zeros((d, d, M))
        for m in range(M):
            if cov_type == 'diag':
                Sigma[:, :, m] = np.diag(covars[m, :])
            elif cov_type == 'full':
                # sklearn's GMM stores 'full' covariances as (M, d, d),
                # so index the component along the first axis
                Sigma[:, :, m] = covars[m, :, :]
            elif cov_type == 'spherical':
                Sigma[:, :, m] = covars[m] * np.eye(d)
    return mu, Sigma, weights
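# Hedged usage sketch (not from the original source): initialize a 2-component
# mixture from random 3-dimensional data with the 'rnd' method, which needs
# only numpy. Shapes follow the docstring: mu is (d, M), Sigma is (d, d, M).
def _demo_mixgauss_init():
    data = np.random.randn(3, 100)  # data(:, t) is the t'th example
    mu, Sigma, weights = mixgauss_init(2, data, cov_type='diag', method='rnd')
    assert mu.shape == (3, 2) and Sigma.shape == (3, 3, 2)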
def testTensor(self):
    v = np.array([[[1, 2], [1, 2]],
                  [[1, 2], [1, 2]]])
    vn, s = normalise(v)
    assert np.all(
        np.abs(vn - np.array([[[0.08333333, 0.16666667],
                               [0.08333333, 0.16666667]],
                              [[0.08333333, 0.16666667],
                               [0.08333333, 0.16666667]]])) < 1e-3)
    assert s == 12
def fit(self, obs):
    obs = self._convertObs(obs)
    O = obs[0].shape[0]
    M = self.n_mix
    Q = self.n_components
    if 's' in self.init_params:
        self.startprob_, _ = normalise(self.startprob_)
    if 't' in self.init_params:
        self.transmat_, _ = mk_stochastic(self.transmat_)
    if 'm' in self.init_params or 'c' in self.init_params:
        mu0, Sigma0, weights0 = mixgauss_init(Q * M, obs,
                                              cov_type=self._covariance_type)
        if 'm' in self.init_params:
            self.means_ = np.transpose(np.reshape(mu0, (O, M, Q)), (0, 2, 1))
        if 'c' in self.init_params:
            self.covars_ = np.transpose(np.reshape(Sigma0, (O, O, M, Q)),
                                        (0, 1, 3, 2))
    mixmat0, _ = mk_stochastic(np.random.rand(Q, M))
    self.LL, prior1, transmat1, mu1, Sigma1, mixmat1 = mhmm_em(
        data=obs, prior=self.startprob_, transmat=self.transmat_,
        mu=self.means_, Sigma=self.covars_, mixmat=mixmat0,
        max_iter=self.n_iter, thresh=self.thresh,
        cov_type=self._covariance_type,
        adj_trans='t' in self.params,
        adj_mix='w' in self.params,
        adj_mu='m' in self.params,
        adj_Sigma='c' in self.params)
    self.startprob_ = prior1
    self.transmat_ = transmat1
    self.means_ = mu1
    self.covars_ = Sigma1
    self.weights_ = mixmat1
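# Hedged usage sketch (not from the original source). The class name GMHMM is
# hypothetical -- substitute whatever mixture-of-Gaussians HMM class this
# fit() method belongs to; the sklearn-style attribute names (n_components,
# n_mix, startprob_, transmat_) are taken from the code above.
#
#     model = GMHMM(n_components=3, n_mix=2)   # hypothetical constructor
#     model.fit([np.random.randn(4, 50)])      # list of (O, T) observation sequences
#     print(model.transmat_)                   # learned Q x Q transition matrix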
def testVectorCol(self):
    v = np.array([[1, 2]])
    vn, s = normalise(v)
    assert np.all(np.abs(vn - np.array([0.33333, 0.66666])) < 1e-3)
    assert s == 3
def mhmm_em(data, prior, transmat, mu, Sigma, mixmat=None, **kwargs):
    '''
    % LEARN_MHMM Compute the ML parameters of an HMM with (mixtures of) Gaussians output using EM.
    % [ll_trace, prior, transmat, mu, sigma, mixmat] = learn_mhmm(data, ...
    %    prior0, transmat0, mu0, sigma0, mixmat0, ...)
    %
    % Notation: Q(t) = hidden state, Y(t) = observation, M(t) = mixture variable
    %
    % INPUTS:
    % data{ex}(:,t) or data(:,t,ex) if all sequences have the same length
    % prior(i) = Pr(Q(1) = i)
    % transmat(i,j) = Pr(Q(t+1)=j | Q(t)=i)
    % mu(:,j,k) = E[Y(t) | Q(t)=j, M(t)=k]
    % Sigma(:,:,j,k) = Cov[Y(t) | Q(t)=j, M(t)=k]
    % mixmat(j,k) = Pr(M(t)=k | Q(t)=j): set to [] or ones(Q,1) if only one mixture component
    %
    % Optional parameters may be passed as 'param_name', param_value pairs.
    % Parameter names are shown below; default values in [] - if none, argument is mandatory.
    %
    % 'max_iter' - max number of EM iterations [10]
    % 'thresh' - convergence threshold [1e-4]
    % 'verbose' - if 1, print out loglik at every iteration [1]
    % 'cov_type' - 'full', 'diag' or 'spherical' ['full']
    %
    % To clamp some of the parameters, so learning does not change them:
    % 'adj_prior' - if 0, do not change prior [1]
    % 'adj_trans' - if 0, do not change transmat [1]
    % 'adj_mix' - if 0, do not change mixmat [1]
    % 'adj_mu' - if 0, do not change mu [1]
    % 'adj_Sigma' - if 0, do not change Sigma [1]
    %
    % If the number of mixture components differs depending on Q, just set the trailing
    % entries of mixmat to 0, e.g., 2 components if Q=1, 3 components if Q=2,
    % then set mixmat(1,3)=0. In this case, B2(1,3,:)=1.0.
    '''
    max_iter = kwargs.pop('max_iter', 10)
    thresh = kwargs.pop('thresh', 1e-4)
    verbose = kwargs.pop('verbose', True)
    cov_type = kwargs.pop('cov_type', 'full')
    adj_prior = kwargs.pop('adj_prior', True)
    adj_trans = kwargs.pop('adj_trans', True)
    adj_mix = kwargs.pop('adj_mix', True)
    adj_mu = kwargs.pop('adj_mu', True)
    adj_Sigma = kwargs.pop('adj_Sigma', True)

    previous_loglik = -np.Inf
    loglik = 0
    converged = False
    num_iter = 1
    LL = []

    if not isinstance(data, list):
        data = [data[:, :, i] for i in range(data.shape[2])]
    numex = len(data)
    O = data[0].shape[0]
    Q = len(prior)
    if mixmat is None:  # 'mixmat == None' would compare elementwise on arrays
        mixmat = np.ones((Q, 1))
    M = mixmat.shape[1]
    if M == 1:
        adj_mix = False

    while (num_iter <= max_iter) and not converged:
        # E step
        loglik, exp_num_trans, exp_num_visits1, postmix, m, ip, op = \
            ess_mhmm(prior, transmat, mixmat, mu, Sigma, data)

        # M step
        if adj_prior:
            prior, _ = normalise(exp_num_visits1)
        if adj_trans:
            transmat, _ = mk_stochastic(exp_num_trans)
        if adj_mix:
            mixmat, _ = mk_stochastic(postmix)
        if adj_mu or adj_Sigma:
            postmixx = np.reshape(np.transpose(postmix), (M * Q,))
            mm = np.reshape(np.transpose(m, (0, 2, 1)), (O, M * Q))
            opp = np.reshape(np.transpose(op, (0, 1, 3, 2)), (O * O, M * Q))
            ipp = np.reshape(np.transpose(ip), (M * Q,))
            mu2, Sigma2 = mixgauss_Mstep(postmixx, mm, opp, ipp, cov_type=cov_type)
            if adj_mu:
                mu = np.transpose(np.reshape(mu2, (O, M, Q)), (0, 2, 1))
            if adj_Sigma:
                Sigma = np.transpose(np.reshape(Sigma2, (O, O, M, Q)), (0, 1, 3, 2))

        if verbose:
            print('iteration %d, loglik = %f' % (num_iter, loglik))
        num_iter = num_iter + 1
        converged, _ = em_converged(loglik, previous_loglik, thresh)
        previous_loglik = loglik
        LL.append(loglik)

    return LL, prior, transmat, mu, Sigma, mixmat
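# Hedged usage sketch (not from the original source): a few EM iterations on
# synthetic data with Q=2 states, M=1 mixture component, and O=2 dimensions.
# Assumes the helpers mhmm_em relies on (ess_mhmm, mixgauss_Mstep,
# em_converged) are in scope; shapes follow the docstring above.
def _demo_mhmm_em():
    O, Q, M, T = 2, 2, 1, 50
    data = [np.random.randn(O, T)]                    # one observation sequence
    prior, _ = normalise(np.random.rand(Q))
    transmat, _ = mk_stochastic(np.random.rand(Q, Q))
    mu = np.random.randn(O, Q, M)                     # mu(:,j,k)
    Sigma = np.tile(np.eye(O)[:, :, np.newaxis, np.newaxis], (1, 1, Q, M))
    LL, prior, transmat, mu, Sigma, mixmat = mhmm_em(
        data, prior, transmat, mu, Sigma, max_iter=3, verbose=False)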
def fwdback(init_state_distrib, transmat, obslik, **kwargs):
    '''
    % FWDBACK Compute the posterior probs. in an HMM using the forwards backwards algo.
    %
    % [alpha, beta, gamma, loglik, xi, gamma2] = fwdback(init_state_distrib, transmat, obslik, ...)
    %
    % Notation:
    % Y(t) = observation, Q(t) = hidden state, M(t) = mixture variable (for MOG outputs)
    % A(t) = discrete input (action) (for POMDP models)
    %
    % INPUT:
    % init_state_distrib(i) = Pr(Q(1) = i)
    % transmat(i,j) = Pr(Q(t) = j | Q(t-1)=i)
    %  or transmat{a}(i,j) = Pr(Q(t) = j | Q(t-1)=i, A(t-1)=a) if there are discrete inputs
    % obslik(i,t) = Pr(Y(t)| Q(t)=i)
    %   (Compute obslik using eval_pdf_xxx on your data sequence first.)
    %
    % Optional parameters may be passed as 'param_name', param_value pairs.
    % Parameter names are shown below; default values in [] - if none, argument is mandatory.
    %
    % For HMMs with MOG outputs: if you want to compute gamma2, you must specify
    % 'obslik2' - obslik2(i,j,t) = Pr(Y(t)| Q(t)=i,M(t)=j)  []
    % 'mixmat' - mixmat(i,j) = Pr(M(t) = j | Q(t)=i)  []
    %  or mixmat{t}(m,q) if not stationary
    %
    % For HMMs with discrete inputs:
    % 'act' - act(t) = action performed at step t
    %
    % Optional arguments:
    % 'fwd_only' - if 1, only do a forwards pass and set beta=[], gamma2=[]  [0]
    % 'scaled' - if 1, normalize alphas and betas to prevent underflow [1]
    % 'maximize' - if 1, use max-product instead of sum-product [0]
    %
    % OUTPUTS:
    % alpha(i,t) = p(Q(t)=i | y(1:t)) (or p(Q(t)=i, y(1:t)) if scaled=0)
    % beta(i,t) = p(y(t+1:T) | Q(t)=i)*p(y(t+1:T)|y(1:t)) (or p(y(t+1:T) | Q(t)=i) if scaled=0)
    % gamma(i,t) = p(Q(t)=i | y(1:T))
    % loglik = log p(y(1:T))
    % xi(i,j,t-1) = p(Q(t-1)=i, Q(t)=j | y(1:T)) - NO LONGER COMPUTED
    % xi_summed(i,j) = sum_{t=1}^{T-1} xi(i,j,t) - change made by Herbert Jaeger
    % gamma2(j,k,t) = p(Q(t)=j, M(t)=k | y(1:T)) (only for MOG outputs)
    %
    % If fwd_only = 1, these become
    % alpha(i,t) = p(Q(t)=i | y(1:t))
    % beta = []
    % gamma(i,t) = p(Q(t)=i | y(1:t))
    % xi(i,j,t-1) = p(Q(t-1)=i, Q(t)=j | y(1:t))
    % gamma2 = []
    %
    % Note: we only compute xi if it is requested as a return argument, since it can be very large.
    % Similarly, we only compute gamma2 on request (and if using MOG outputs).
    %
    % Examples:
    %
    % [alpha, beta, gamma, loglik] = fwdback(pi, A, multinomial_prob(sequence, B));
    %
    % [B, B2] = mixgauss_prob(data, mu, Sigma, mixmat);
    % [alpha, beta, gamma, loglik, xi, gamma2] = fwdback(pi, A, B, 'obslik2', B2, 'mixmat', mixmat);
    '''
    obslik2 = kwargs.pop('obslik2', None)
    mixmat = kwargs.pop('mixmat', None)
    fwd_only = kwargs.pop('fwd_only', False)
    scaled = kwargs.pop('scaled', True)
    act = kwargs.pop('act', None)
    maximize = kwargs.pop('maximize', False)
    compute_xi = kwargs.pop('compute_xi', obslik2 is not None)
    compute_gamma2 = kwargs.pop('compute_gamma2',
                                obslik2 is not None and mixmat is not None)

    init_state_distrib = np.asmatrix(init_state_distrib)
    obslik = np.asmatrix(obslik)
    Q, T = obslik.shape

    if act is None:
        act = np.zeros((T,), dtype=int)  # integer dtype so act can index transmat
        transmat = transmat[np.newaxis, :, :]

    scale = np.ones((T,))
    # scale(t) = Pr(O(t) | O(1:t-1)) = 1/c(t) as defined by Rabiner (1989).
    # Hence prod_t scale(t) = Pr(O(1)) Pr(O(2)|O(1)) Pr(O(3) | O(1:2)) ... = Pr(O(1), ..., O(T))
    # or log P = sum_t log scale(t).
    # Rabiner suggests multiplying beta(t) by scale(t), but we can instead
    # normalise beta(t) - the constants will cancel when we compute gamma.
    loglik = 0
    alpha = np.asmatrix(np.zeros((Q, T)))
    gamma = np.asmatrix(np.zeros((Q, T)))
    if compute_xi:
        xi_summed = np.zeros((Q, Q))
    else:
        xi_summed = None

    ######## Forwards ########

    t = 0
    alpha[:, t] = np.multiply(init_state_distrib, obslik[:, t].T).T
    if scaled:
        # [alpha(:,t), scale(t)] = normaliseC(alpha(:,t));
        alpha[:, t], scale[t] = normalise(alpha[:, t])
    # assert(approxeq(sum(alpha(:,t)),1))
    for t in range(1, T):
        # trans = transmat(:,:,act(t-1))';
        trans = transmat[act[t - 1]]
        if maximize:
            m = max_mult(trans.T, alpha[:, t - 1])
            # A = repmat(alpha(:,t-1), [1 Q]);
            # m = max(trans .* A, [], 1);
        else:
            m = np.dot(trans.T, alpha[:, t - 1])
        alpha[:, t] = np.multiply(m, obslik[:, t])
        if scaled:
            # [alpha(:,t), scale(t)] = normaliseC(alpha(:,t));
            alpha[:, t], scale[t] = normalise(alpha[:, t])
        if compute_xi and fwd_only:  # useful for online EM
            # xi(:,:,t-1) = normaliseC((alpha(:,t-1) * obslik(:,t)') .* trans);
            xi_summed = xi_summed + normalise(
                np.multiply(np.dot(alpha[:, t - 1], obslik[:, t].T), trans))[0]
        # assert(approxeq(sum(alpha(:,t)),1))

    if scaled:
        if np.any(scale == 0):
            loglik = -np.Inf
        else:
            loglik = np.sum(np.log(scale), 0)
    else:
        # alpha[:, T] was an off-by-one error (MATLAB's alpha(:,T) is the last column)
        loglik = np.log(np.sum(alpha[:, T - 1], 0))

    if fwd_only:
        gamma = alpha
        beta = None
        gamma2 = None
        return alpha, beta, gamma, loglik, xi_summed, gamma2

    ######## Backwards ########

    beta = np.asmatrix(np.zeros((Q, T)))
    if compute_gamma2:
        if isinstance(mixmat, list):
            M = mixmat[0].shape[1]
        else:
            M = mixmat.shape[1]
        gamma2 = np.zeros((Q, M, T))
    else:
        gamma2 = None

    beta[:, T - 1] = np.ones((Q, 1))
    # %gamma(:,T) = normaliseC(alpha(:,T) .* beta(:,T));
    gamma[:, T - 1], _ = normalise(np.multiply(alpha[:, T - 1], beta[:, T - 1]))
    t = T - 1
    if compute_gamma2:
        denom = obslik[:, t] + (obslik[:, t] == 0)  # replace 0s with 1s before dividing
        if isinstance(mixmat, list):  # in case mixmat is a list of per-step matrices
            gamma2[:, :, t] = np.divide(
                np.multiply(np.multiply(obslik2[:, :, t], mixmat[t]),
                            np.tile(gamma[:, t], (1, M))),
                np.tile(denom, (1, M)))
        else:
            gamma2[:, :, t] = np.divide(
                np.multiply(np.multiply(obslik2[:, :, t], mixmat),
                            np.tile(gamma[:, t], (1, M))),
                np.tile(denom, (1, M)))
        # TODO: tiling and asmatrix might be slow; maybe remove
        # gamma2(:,:,t) = normaliseC(obslik2(:,:,t) .* mixmat .* repmat(gamma(:,t), [1 M])); % wrong!
    for t in range(T - 2, -1, -1):
        b = np.multiply(beta[:, t + 1], obslik[:, t + 1])
        # trans = transmat(:,:,act(t));
        trans = transmat[act[t]]
        if maximize:
            B = np.tile(b.T, (Q, 1))
            beta[:, t] = np.max(np.multiply(trans, B), 1)
        else:
            beta[:, t] = np.dot(trans, b)
        if scaled:
            # beta(:,t) = normaliseC(beta(:,t));
            beta[:, t], _ = normalise(beta[:, t])
        # gamma(:,t) = normaliseC(alpha(:,t) .* beta(:,t));
        gamma[:, t], _ = normalise(np.multiply(alpha[:, t], beta[:, t]))
        if compute_xi:
            # xi(:,:,t) = normaliseC((trans .* (alpha(:,t) * b')));
            xi_summed = xi_summed + normalise(
                np.multiply(trans, np.dot(alpha[:, t], b.T)))[0]
        if compute_gamma2:
            denom = obslik[:, t] + (obslik[:, t] == 0)  # replace 0s with 1s before dividing
            if isinstance(mixmat, list):  # in case mixmat is a list of per-step matrices
                gamma2[:, :, t] = np.divide(
                    np.multiply(np.multiply(obslik2[:, :, t], mixmat[t]),
                                np.tile(gamma[:, t], (1, M))),
                    np.tile(denom, (1, M)))
            else:
                gamma2[:, :, t] = np.divide(
                    np.multiply(np.multiply(obslik2[:, :, t], mixmat),
                                np.tile(gamma[:, t], (1, M))),
                    np.tile(denom, (1, M)))
            # gamma2(:,:,t) = normaliseC(obslik2(:,:,t) .* mixmat .* repmat(gamma(:,t), [1 M]));

    # We now explain the equation for gamma2.
    # Let zt = y(1:t-1, t+1:T) be all observations except y(t).
    # gamma2(Q,M,t) = P(Qt,Mt|yt,zt) = P(yt|Qt,Mt,zt) P(Qt,Mt|zt) / P(yt|zt)
    #               = P(yt|Qt,Mt) P(Mt|Qt) P(Qt|zt) / P(yt|zt)
    # Now gamma(Q,t) = P(Qt|yt,zt) = P(yt|Qt) P(Qt|zt) / P(yt|zt)
    # hence
    # P(Qt,Mt|yt,zt) = P(yt|Qt,Mt) P(Mt|Qt) [P(Qt|yt,zt) P(yt|zt) / P(yt|Qt)] / P(yt|zt)
    #                = P(yt|Qt,Mt) P(Mt|Qt) P(Qt|yt,zt) / P(yt|Qt)

    return alpha, beta, gamma, loglik, xi_summed, gamma2
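# Hedged usage sketch (not from the original source): forwards-backwards on a
# toy 2-state HMM with a stand-in observation-likelihood matrix, where
# obslik(i,t) = Pr(Y(t) | Q(t)=i) as described in the docstring above.
def _demo_fwdback():
    Q, T = 2, 4
    prior = np.array([0.6, 0.4])
    transmat = np.array([[0.9, 0.1],
                         [0.2, 0.8]])
    obslik = np.random.rand(Q, T)  # in practice, compute this from your data
    alpha, beta, gamma, loglik, xi_summed, gamma2 = fwdback(prior, transmat, obslik)
    # Smoothed posteriors over states sum to 1 at every time step.
    assert np.allclose(np.asarray(gamma).sum(axis=0), 1.0)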