def setPostFactors(self, obsModel=None, SS=None, LP=None, Data=None,
                   nu=0, B=0, m=0, kappa=0,
                   **kwargs):
    ''' Set attribute Post to provided values.
    '''
    self.ClearCache()
    if obsModel is not None:
        if hasattr(obsModel, 'Post'):
            self.Post = obsModel.Post.copy()
            self.K = self.Post.K
        else:
            self.setPostFromEstParams(obsModel.EstParams)
        return

    if LP is not None and Data is not None:
        SS = self.calcSummaryStats(Data, None, LP)

    if SS is not None:
        self.updatePost(SS)
    else:
        m = as2D(m)
        if m.shape[1] != self.D:
            m = m.T.copy()
        K, _ = m.shape
        self.Post = ParamBag(K=K, D=self.D)
        self.Post.setField('nu', as1D(nu), dims=('K'))
        self.Post.setField('B', B, dims=('K', 'D', 'D'))
        self.Post.setField('m', m, dims=('K', 'D'))
        self.Post.setField('kappa', as1D(kappa), dims=('K'))
    self.K = self.Post.K
def buildCostMatrix(zHat, zTrue):
    ''' Construct cost matrix for alignment of estimated and true sequences

    Args
    --------
    zHat : 1D array
        each entry is an integer label in {0, 1, ... Kest-1}
    zTrue : 1D array
        each entry is an integer label in {0, 1, ... Ktrue-1}
        with optional negative state labels

    Returns
    --------
    CostMatrix : 2D array, size Ktrue x Kest
        CostMatrix[j,k] = count of events across all timesteps,
        where j is assigned, but k is not.
    '''
    zHat = as1D(zHat)
    zTrue = as1D(zTrue)
    Ktrue = int(np.max(zTrue)) + 1
    Kest = int(np.max(zHat)) + 1
    K = np.maximum(Ktrue, Kest)
    CostMatrix = np.zeros((K, K))
    for ktrue in range(K):
        for kest in range(K):
            CostMatrix[ktrue, kest] = np.sum(
                np.logical_and(zTrue == ktrue, zHat != kest))
    return CostMatrix
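# Illustrative toy usage of buildCostMatrix (my own example, not from the
# original source; numbers computed by hand). With two true and two estimated
# states, entry [j, k] counts timesteps labeled j in zTrue but not k in zHat.
#
#     zTrue = np.asarray([0, 0, 1, 1])
#     zHat = np.asarray([0, 1, 1, 1])
#     C = buildCostMatrix(zHat, zTrue)
#     # Expected:
#     # C[0, 0] = 1   (t=1: true 0, est 1)
#     # C[0, 1] = 1   (t=0: true 0, est 0)
#     # C[1, 0] = 2   (t=2,3: true 1, est 1)
#     # C[1, 1] = 0
#
# A zero entry [j, k] means estimated state k covers true state j perfectly,
# which is why the Hungarian alignment below seeks a minimum-cost matching.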
def packParamBagForPost(nu=None, beta=None, m=None, kappa=None,
                        D=None,
                        Post=None,
                        **kwargs):
    ''' '''
    m = as2D(m)
    beta = as2D(beta)

    if D is None:
        D = m.shape[1]
    if m.shape[1] != D:
        m = m.T.copy()
    if beta.shape[1] != D:
        beta = beta.T.copy()

    K, _ = m.shape
    if Post is None:
        Post = ParamBag(K=K, D=D)
    else:
        assert isinstance(Post, ParamBag)
        assert Post.K == K
        assert Post.D == D
    Post.setField('nu', as1D(nu), dims=('K'))
    Post.setField('beta', beta, dims=('K', 'D'))
    Post.setField('m', m, dims=('K', 'D'))
    Post.setField('kappa', as1D(kappa), dims=('K'))
    return Post
def packParamBagForPost(pnu_K=None,
                        ptau_K=None,
                        w_KE=None,
                        P_KEE=None,
                        Post=None,
                        **kwargs):
    ''' Parse provided array args and pack into parameter bag

    Returns
    -------
    Post : ParamBag, with K clusters
    '''
    pnu_K = as1D(pnu_K)
    ptau_K = as1D(ptau_K)
    w_KE = as2D(w_KE)
    P_KEE = as3D(P_KEE)

    K = pnu_K.size
    E = w_KE.shape[1]
    if Post is None:
        Post = ParamBag(K=K, D=E - 1, E=E)
    elif not hasattr(Post, 'E'):
        Post.E = E
    assert Post.K == K
    assert Post.D == E - 1
    assert Post.E == E
    Post.setField('pnu_K', pnu_K, dims=('K'))
    Post.setField('ptau_K', ptau_K, dims=('K'))
    Post.setField('w_KE', w_KE, dims=('K', 'E'))
    Post.setField('P_KEE', P_KEE, dims=('K', 'E', 'E'))
    return Post
def __init__(self, word_id=None, word_count=None, doc_range=None,
             vocab_size=0, vocabList=None, vocabfile=None,
             summary=None,
             nDocTotal=None, TrueParams=None, **kwargs):
    ''' Constructor for BagOfWordsData object.

    Represents bag-of-words dataset via several 1D vectors,
    where each entry gives the id/count of a single token.

    Parameters
    -------
    word_id : 1D array, size U
        word_id[i] gives the int type for distinct word i.
        Range: [0, 1, 2, 3, ... vocab_size-1]
    word_count : 1D array, size U
        word_count[i] gives the number of times distinct word i
        appears in its corresponding document.
        Range: [1, 2, 3, ...]
    doc_range : 1D array, size nDoc+1
        Defines section of each corpus-wide vector which belongs
        to each document d:
            word_id_d = self.word_id[doc_range[d]:doc_range[d+1]]
            word_ct_d = self.word_count[doc_range[d]:doc_range[d+1]]
    vocab_size : int
        size of set of possible vocabulary words
    vocabList : list of strings
        specifies the word associated with each vocab type id
    nDoc : int
        number of groups (documents) represented in memory
        by this object.
    nDocTotal : int
        total size of the corpus
        will differ from nDoc when this dataset represents a
        small minibatch of some larger corpus.
    TrueParams : None [default], or dict of true parameters.
    '''
    self.word_id = as1D(toCArray(word_id, dtype=np.int32))
    self.word_count = as1D(toCArray(word_count, dtype=np.float64))
    self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
    self.vocab_size = int(vocab_size)

    if summary is not None:
        self.summary = summary

    # Store "true" parameters that generated toy-data, if provided
    if TrueParams is not None:
        self.TrueParams = TrueParams

    # Add dictionary of vocab words, if provided
    if vocabList is not None:
        self.vocabList = vocabList
    elif vocabfile is not None:
        with open(vocabfile, 'r') as f:
            self.vocabList = [x.strip() for x in f.readlines()]
    else:
        self.vocabList = None
    if vocab_size == 0 and self.vocabList is not None:
        self.vocab_size = len(self.vocabList)

    self._verify_attributes()
    self._set_corpus_size_attributes(nDocTotal)
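# Illustrative sketch (my own toy numbers, not from the source) of how the
# flat word_id / word_count / doc_range encoding documented above fits
# together. Doc 0 holds word 0 twice and word 2 once; doc 1 holds word 1
# three times. Only numpy is assumed.
#
#     import numpy as np
#     word_id = np.asarray([0, 2, 1], dtype=np.int32)
#     word_count = np.asarray([2.0, 1.0, 3.0])
#     doc_range = np.asarray([0, 2, 3], dtype=np.int32)
#     for d in range(doc_range.size - 1):
#         start, stop = doc_range[d], doc_range[d + 1]
#         print('doc %d: ids %s counts %s' % (
#             d, word_id[start:stop], word_count[start:stop]))
#
# Passing these three arrays (plus vocab_size=4) to the constructor above
# should yield a two-document dataset.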
def __init__(self, X=None, doc_range=None, nDocTotal=None,
             Xprev=None, TrueZ=None,
             TrueParams=None, fileNames=None, summary=None, **kwargs):
    ''' Create an instance of GroupXData for provided array X

    Post Condition
    ---------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.Xprev : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.doc_range : 1D array, size nDoc+1
    '''
    self.X = as2D(toCArray(X, dtype=np.float64))
    self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
    if summary is not None:
        self.summary = summary
    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))

    # Verify attributes are consistent
    self._set_dependent_params(doc_range, nDocTotal)
    self._check_dims()

    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = dict()
        for key, arr in TrueParams.items():
            self.TrueParams[key] = toCArray(arr)

    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size

    # Add optional source files for each group/sequence
    if fileNames is not None:
        if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
            fileNames = fileNames[0, 0]
        if len(fileNames) > 1:
            self.fileNames = [str(x).strip()
                              for x in np.squeeze(fileNames)]
        else:
            self.fileNames = [str(fileNames[0])]

    # Add extra data attributes custom for the dataset
    for key in kwargs:
        if hasattr(self, key):
            continue
        if not key.startswith("__"):
            arr = np.squeeze(as1D(kwargs[key]))
            if arr.shape == ():
                try:
                    arr = float(arr)
                except TypeError:
                    continue
            setattr(self, key, arr)
def calcHammingDistance(zTrue, zHat, excludeNegLabels=1, verbose=0, **kwargs):
    ''' Compute Hamming distance: sum of all timesteps with different labels.

    Normalizes result to be within [0, 1].

    Args
    --------
    zHat : 1D array
        each entry is an integer label in {0, 1, ... Kest-1}
    zTrue : 1D array
        each entry is an integer label in {0, 1, ... Ktrue-1}

    Returns
    ------
    d : float
        Hamming distance from zTrue to zHat, normalized to lie in [0, 1].

    Examples
    ------
    >>> calcHammingDistance([0, 0, 1, 1], [0, 0, 1, 1])
    0.0
    >>> calcHammingDistance([0, 0, 1, 1], [0, 0, 1, 2])
    0.25
    >>> calcHammingDistance([0, 0, 1, 1], [1, 1, 0, 0])
    1.0
    >>> calcHammingDistance([1, 1, 0, -1], [1, 1, 0, 0])
    0.0
    >>> calcHammingDistance([-1, -1, -2, 3], [1, 2, 3, 3])
    0.0
    >>> calcHammingDistance([-1, -1, 0, 1], [1, 2, 0, 1], excludeNegLabels=1)
    0.0
    >>> calcHammingDistance([-1, -1, 0, 1], [1, 2, 0, 1], excludeNegLabels=0)
    0.5
    '''
    zHat = as1D(zHat)
    zTrue = as1D(zTrue)
    if excludeNegLabels:
        assert np.sum(zHat < 0) == 0
        good_tstep_mask = zTrue >= 0
        nGood = np.sum(good_tstep_mask)
        if verbose and np.sum(good_tstep_mask) < zTrue.size:
            print('EXCLUDED %d/%d timesteps' % (np.sum(zTrue < 0), zTrue.size))
        dist = np.sum(zTrue[good_tstep_mask] != zHat[good_tstep_mask])
        dist = dist / float(nGood)
    else:
        dist = np.sum(zTrue != zHat) / float(zHat.size)
    return dist
def getPrefixForLapQuery(taskpath, lapQuery):
    ''' Search among checkpoint laps for one nearest to query.

    Returns
    --------
    prefix : str
        For lap 1, prefix = 'Lap0001.000'.
        For lap 5.5, prefix = 'Lap0005.500'.
    lap : float
        lap checkpoint for saved params closest to lapQuery
    '''
    try:
        saveLaps = np.loadtxt(os.path.join(taskpath, 'snapshot_lap.txt'))
    except IOError:
        fileList = glob.glob(os.path.join(taskpath, 'Lap*Topic*'))
        if len(fileList) == 0:
            fileList = glob.glob(os.path.join(taskpath, 'Lap*.log_prob_w'))
        assert len(fileList) > 0
        saveLaps = list()
        for fpath in sorted(fileList):
            basename = fpath.split(os.path.sep)[-1]
            lapstr = basename[3:11]
            saveLaps.append(float(lapstr))
        saveLaps = np.sort(np.asarray(saveLaps))
    saveLaps = as1D(saveLaps)
    if lapQuery is None:
        bestLap = saveLaps[-1]  # take final saved value
    else:
        distances = np.abs(lapQuery - saveLaps)
        bestLap = saveLaps[np.argmin(distances)]
    return makePrefixForLap(bestLap), bestLap
def eta2pi(eta_d):
    eta_d = as1D(np.asarray(eta_d))
    pi_d = np.ones(eta_d.size + 1)
    pi_d[:-1] = np.exp(eta_d)
    pi_d[:-1] += 1e-100
    pi_d /= (1.0 + np.sum(pi_d[:-1]))
    return pi_d
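# Minimal check (my own example, assuming only numpy): eta2pi is the softmax
# of eta with an implicit final logit of zero, so a zero vector maps to the
# uniform point on the simplex; pi2eta (defined below) inverts the mapping.
#
#     pi = eta2pi(np.asarray([0.0, 0.0]))
#     assert np.allclose(pi, [1.0 / 3, 1.0 / 3, 1.0 / 3])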
def setPostFactors(self, obsModel=None, SS=None, LP=None, Data=None,
                   nu=0, B=0, M=0, V=0,
                   **kwargs):
    ''' Set Post attribute to provided values.
    '''
    self.ClearCache()
    if obsModel is not None:
        if hasattr(obsModel, 'Post'):
            self.Post = obsModel.Post.copy()
        else:
            self.setPostFromEstParams(obsModel.EstParams)
        self.K = self.Post.K
        return

    if LP is not None and Data is not None:
        SS = self.calcSummaryStats(Data, None, LP)

    if SS is not None:
        self.updatePost(SS)
    else:
        M = as3D(M)
        B = as3D(B)
        V = as3D(V)

        K, D, E = M.shape
        assert D == self.D
        assert E == self.E
        self.Post = ParamBag(K=K, D=self.D, E=self.E)
        self.Post.setField('nu', as1D(nu), dims=('K'))
        self.Post.setField('B', B, dims=('K', 'D', 'D'))
        self.Post.setField('M', M, dims=('K', 'D', 'E'))
        self.Post.setField('V', V, dims=('K', 'E', 'E'))
    self.K = self.Post.K
def generateRandomBinaryDataFromMixture(**kwargs):
    for key in Defaults:
        if key not in kwargs:
            kwargs[key] = Defaults[key]
    phi = makePhi(**kwargs)
    nObsTotal = kwargs['nObsTotal']
    PRNG = np.random.RandomState(kwargs['seed'])

    # Select number of observations from each cluster
    beta = 1.0 / K * np.ones(K)
    if nObsTotal < 2 * K:
        # force examples from every cluster
        nPerCluster = np.ceil(nObsTotal / K) * np.ones(K)
    else:
        nPerCluster = as1D(PRNG.multinomial(nObsTotal, beta, size=1))
    nPerCluster = np.int32(nPerCluster)

    # Generate data from each cluster!
    X = np.zeros((nObsTotal, D))
    Z = np.zeros(nObsTotal, dtype=np.int32)
    start = 0
    for k in xrange(K):
        stop = start + nPerCluster[k]
        X[start:stop] = np.float64(
            PRNG.rand(nPerCluster[k], D) < phi[k, :][np.newaxis, :])
        Z[start:stop] = k
        start = stop
    TrueParams = dict()
    TrueParams['beta'] = beta
    TrueParams['phi'] = phi
    TrueParams['Z'] = Z
    return XData(X, TrueParams=TrueParams)
def createParamBagForPrior(
        Data=None, D=0,
        pnu=0, ptau=None,
        w_E=0,
        P_EE=None, P_diag_E=None, P_diag_val=1.0,
        Prior=None,
        **kwargs):
    ''' Initialize Prior ParamBag attribute.

    Returns
    -------
    Prior : ParamBag
        with dimension attributes K, D, E
        with parameter attributes pnu, ptau, w_E, P_EE
    '''
    if Data is None:
        D = int(D)
    else:
        D = int(Data.dim)
    E = D + 1

    # Init parameters of 1D Wishart prior on delta
    pnu = np.maximum(pnu, 1e-9)
    ptau = np.maximum(ptau, 1e-9)

    # Initialize precision matrix of the weight vector
    if P_EE is not None:
        P_EE = np.asarray(P_EE)
    elif P_diag_E is not None:
        P_EE = np.diag(np.asarray(P_diag_E))
    else:
        P_EE = np.diag(P_diag_val * np.ones(E))
    assert P_EE.ndim == 2
    assert P_EE.shape == (E, E)

    # Initialize mean of the weight vector
    w_E = as1D(np.asarray(w_E))
    if w_E.size < E:
        w_E = np.tile(w_E, E)[:E]
    assert w_E.ndim == 1
    assert w_E.size == E

    if Prior is None:
        Prior = ParamBag(K=0, D=D, E=E)
    if not hasattr(Prior, 'E'):
        Prior.E = E
    assert Prior.D == D
    assert Prior.E == E
    Prior.setField('pnu', pnu, dims=None)
    Prior.setField('ptau', ptau, dims=None)
    Prior.setField('w_E', w_E, dims=('E'))
    Prior.setField('P_EE', P_EE, dims=('E', 'E'))

    Pw_E = np.dot(P_EE, w_E)
    wPw_1 = np.dot(w_E, Pw_E)
    Prior.setField('Pw_E', Pw_E, dims=('E'))
    Prior.setField('wPw_1', wPw_1, dims=None)
    return Prior
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    if not hasattr(Data, 'X_NE'):
        Data.X_NE = np.hstack([Data.X, np.ones(Data.nObs)[:, np.newaxis]])

    Y_N = Data.Y
    X_NE = Data.X_NE
    E = X_NE.shape[1]

    if 'resp' in LP:
        # Dense responsibility calculations
        resp = LP['resp']
        K = resp.shape[1]
        S_yy_K = dotATB(resp, np.square(Y_N)).flatten()
        S_yx_KE = dotATB(resp, Y_N * X_NE)

        # Expected outer product
        S_xxT_KEE = np.zeros((K, E, E))
        sqrtResp_k_N = np.sqrt(resp[:, 0])
        sqrtR_X_k_NE = sqrtResp_k_N[:, np.newaxis] * X_NE
        S_xxT_KEE[0] = dotATA(sqrtR_X_k_NE)
        for k in xrange(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k_N)
            np.multiply(sqrtResp_k_N[:, np.newaxis], X_NE, out=sqrtR_X_k_NE)
            S_xxT_KEE[k] = dotATA(sqrtR_X_k_NE)
    else:
        raise ValueError("TODO")
        spR = LP['spR']
        K = spR.shape[1]

    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim, E=E)
    elif not hasattr(SS, 'E'):
        SS._Fields.E = E
    SS.setField('xxT_KEE', S_xxT_KEE, dims=('K', 'E', 'E'))
    SS.setField('yx_KE', S_yx_KE, dims=('K', 'E'))
    SS.setField('yy_K', S_yy_K, dims=('K'))
    # Expected count for each k
    # Usually computed by allocmodel. But just in case...
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    # SS.setField("N_K", SS.N, dims="K")
    return SS
def calcSmoothedMu(self, X, W=None):
    ''' Compute smoothed estimate of mean of statistic xxT.

    Args
    ----
    X : 2D array, size N x D

    Returns
    -------
    Mu_1 : 2D array, size D x D
        Expected value of Cov[ X[n] ]
    Mu_2 : 1D array, size D
        Expected value of Mean[ X[n] ]
    '''
    if X is None:
        Mu1 = self.Prior.B / self.Prior.nu
        Mu2 = self.Prior.m
        return Mu1, Mu2

    if X.ndim == 1:
        X = X[np.newaxis, :]
    N, D = X.shape

    # Compute suff stats
    if W is None:
        sum_wxxT = np.dot(X.T, X)
        sum_wx = np.sum(X, axis=0)
        sum_w = X.shape[0]
    else:
        W = as1D(W)
        sqrtWX = np.sqrt(W)[:, np.newaxis] * X
        sum_wxxT = np.dot(sqrtWX.T, sqrtWX)
        sum_wx = np.dot(W, X)
        sum_w = np.sum(W)

    kappa = self.Prior.kappa + sum_w
    m = (self.Prior.m * self.Prior.kappa + sum_wx) / kappa
    Mu_2 = m

    prior_kmmT = self.Prior.kappa * np.outer(self.Prior.m, self.Prior.m)
    post_kmmT = kappa * np.outer(m, m)
    B = sum_wxxT + self.Prior.B + prior_kmmT - post_kmmT
    Mu_1 = B / (self.Prior.nu + sum_w)

    assert Mu_1.ndim == 2
    assert Mu_1.shape == (D, D,)
    assert Mu_2.shape == (D,)
    return Mu_1, Mu_2
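# Small self-contained check (my own toy numbers, assuming only numpy) that
# the weighted and unweighted sufficient-statistic branches above agree when
# every weight is one, i.e. sqrt(W)-scaled outer products reduce to X.T X and
# np.dot(W, X) reduces to the plain column sums.
#
#     import numpy as np
#     rng = np.random.RandomState(0)
#     X = rng.randn(5, 3)
#     W = np.ones(5)
#     sqrtWX = np.sqrt(W)[:, np.newaxis] * X
#     assert np.allclose(np.dot(sqrtWX.T, sqrtWX), np.dot(X.T, X))
#     assert np.allclose(np.dot(W, X), X.sum(axis=0))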
def calcSmoothedMu(self, X, W=None):
    ''' Compute smoothed estimate of mean of statistic xxT.

    Args
    ----
    X : 2D array, size N x D

    Returns
    -------
    Mu_1 : 1D array, size D
        Expected value of Var[ X[n,d] ]
    Mu_2 : 1D array, size D
        Expected value of Mean[ X[n] ]
    '''
    if X is None:
        Mu1 = self.Prior.beta / self.Prior.nu
        Mu2 = self.Prior.m
        return Mu1, Mu2

    if X.ndim == 1:
        X = X[np.newaxis, :]
    N, D = X.shape

    # Compute suff stats
    if W is None:
        sum_wxx = np.sum(np.square(X), axis=0)
        sum_wx = np.sum(X, axis=0)
        sum_w = X.shape[0]
    else:
        W = as1D(W)
        sum_wxx = np.dot(W, np.square(X))
        sum_wx = np.dot(W, X)
        sum_w = np.sum(W)

    post_kappa = self.Prior.kappa + sum_w
    post_m = (self.Prior.m * self.Prior.kappa + sum_wx) / post_kappa
    Mu_2 = post_m

    prior_kmm = self.Prior.kappa * (self.Prior.m * self.Prior.m)
    post_kmm = post_kappa * (post_m * post_m)
    post_beta = sum_wxx + self.Prior.beta + prior_kmm - post_kmm
    Mu_1 = post_beta / (self.Prior.nu + sum_w)

    assert Mu_1.ndim == 1
    assert Mu_1.shape == (D,)
    assert Mu_2.shape == (D,)
    return Mu_1, Mu_2
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    X = Data.X
    D = Data.dim
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # Compute expected outer-product statistic
        S_xxT = np.zeros((K, Data.dim, Data.dim))
        sqrtResp_k = np.sqrt(resp[:, 0])
        sqrtRX_k = sqrtResp_k[:, np.newaxis] * Data.X
        S_xxT[0] = dotATA(sqrtRX_k)
        for k in xrange(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k)
            np.multiply(sqrtResp_k[:, np.newaxis], Data.X, out=sqrtRX_k)
            S_xxT[k] = dotATA(sqrtRX_k)

        sqrtResp = np.sqrt(resp)
        xxT = np.zeros((K, D, D))
        for k in xrange(K):
            xxT[k] = dotATA(sqrtResp[:, k][:, np.newaxis] * Data.X)
        assert np.allclose(xxT, S_xxT)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # Compute expected outer-product statistic
        S_xxT = calcSpRXXT(X=X, spR_csr=spR)

    if SS is None:
        SS = SuffStatBag(K=K, D=D)
    # Expected outer-product for each state k
    SS.setField('xxT', S_xxT, dims=('K', 'D', 'D'))
    # Expected count for each k
    # Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
def calcSummaryStats(Data, SS, LP, DataAtomType='doc', **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    if 'resp' in LP:
        K = LP['resp'].shape[1]
    else:
        K = LP['spR'].shape[1]
        nnzPerRow = LP['nnzPerRow']
    if SS is None:
        SS = SuffStatBag(K=K, D=Data.vocab_size)
    if DataAtomType == 'doc':
        # X : 2D sparse matrix, size nDoc x vocab_size
        X = Data.getSparseDocTypeCountMatrix()
        # WordCounts : 2D array, size K x vocab_size
        # obtained by sparse matrix multiply
        # here, '*' operator does this because X is sparse matrix type
        Nvec = None
        if 'resp' in LP:
            WordCounts = LP['resp'].T * X
            if not hasattr(SS, 'N'):
                Nvec = LP['resp'].sum(axis=0)
        else:
            WordCounts = (LP['spR'].T * X).toarray()
            if not hasattr(SS, 'N'):
                Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        if Nvec is not None:
            SS.setField('N', Nvec, dims=('K'))
    else:
        # X : 2D sparse matrix, size V x N
        X = Data.getSparseTokenTypeCountMatrix()
        if 'resp' in LP:
            WordCounts = (X * LP['resp']).T  # matrix-matrix product
        else:
            WordCounts = (X * LP['spR']).T.toarray()
    SS.setField('WordCounts', WordCounts, dims=('K', 'D'))
    SS.setField('SumWordCounts', np.sum(WordCounts, axis=1), dims=('K'))
    return SS
def pi2eta(pi_d):
    ''' Transform vector on simplex to unconstrained real vector

    Returns
    -------
    eta : 1D array, size K-1

    Examples
    --------
    >>> print float(pi2eta(eta2pi(0.42)))
    0.42
    >>> print float(pi2eta(eta2pi(-1.337)))
    -1.337
    >>> print pi2eta(eta2pi([-1, 0, 1]))
    [-1. 0. 1.]
    '''
    pi_d = as1D(np.asarray(pi_d))
    eta_d = pi_d[:-1] / pi_d[-1]
    np.log(eta_d, out=eta_d)
    return eta_d
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    X = Data.X
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # 1/2: Compute mean statistic
        S_x = dotATB(resp, X)
        # 2/2: Compute expected outer-product statistic
        S_xx = calcRXX_withDenseResp(resp, X)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # 1/2: Compute mean statistic
        S_x = spR.T * X
        # 2/2: Compute expected outer-product statistic
        S_xx = calcSpRXX(X=X, spR_csr=spR)
    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim)
    # Expected mean for each state k
    SS.setField('x', S_x, dims=('K', 'D'))
    # Expected sum-of-squares for each state k
    SS.setField('xx', S_xx, dims=('K', 'D'))
    # Expected count for each k
    # Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
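# A plain-numpy sketch (my own, not from the source) of what the dense-resp
# branch above is expected to compute for this diagonal-covariance model:
# per-component weighted first and second moments plus counts. dotATB and
# calcRXX_withDenseResp are assumed to behave like the expressions below.
#
#     import numpy as np
#     rng = np.random.RandomState(0)
#     X = rng.randn(6, 2)                        # N=6 observations, D=2
#     resp = rng.dirichlet(np.ones(3), size=6)   # N x K responsibilities, K=3
#     S_x = np.dot(resp.T, X)                    # K x D, like dotATB(resp, X)
#     S_xx = np.dot(resp.T, np.square(X))        # K x D weighted sum of squares
#     N_k = resp.sum(axis=0)                     # K expected counts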
def calcSmoothedMu(self, X, W=None):
    ''' Compute smoothed estimate of mean of statistic xxT.

    Args
    ----
    X : 2D array, size N x D

    Returns
    -------
    Mu : 2D array, size D x D
    '''
    Prior_nu = self.Prior.nu - self.D - 1
    # Prior_nu = self.Prior.nu
    if X is None:
        Mu = self.Prior.B / (Prior_nu)
        return Mu

    if X.ndim == 1:
        X = X[np.newaxis, :]
    N, D = X.shape

    # Compute suff stats
    if W is None:
        sum_wxxT = np.dot(X.T, X)
        sum_w = X.shape[0]
    else:
        W = as1D(W)
        wX = np.sqrt(W)[:, np.newaxis] * X
        sum_wxxT = np.dot(wX.T, wX)
        sum_w = np.sum(W)

    Mu = (self.Prior.B + sum_wxxT) / (Prior_nu + sum_w)
    assert Mu.ndim == 2
    assert Mu.shape == (D, D,)
    return Mu
def plotGauss1DFromHModel(hmodel,
                          compListToPlot=None,
                          compsToHighlight=None,
                          activeCompIDs=None,
                          MaxKToDisplay=50,
                          proba_thr=0.0001,
                          ax_handle=None,
                          Colors=Colors,
                          dataset=None,
                          **kwargs):
    ''' Make line plot of pdf for each component (1D observations).
    '''
    if ax_handle is not None:
        pylab.sca(ax_handle)

    if compsToHighlight is not None:
        compsToHighlight = as1D(np.asarray(compsToHighlight))
    else:
        compsToHighlight = list()
    if compListToPlot is None:
        compListToPlot = np.arange(0, hmodel.obsModel.K)
    if activeCompIDs is None:
        activeCompIDs = np.arange(0, hmodel.obsModel.K)

    # Load appearance probabilities as single vector
    if hmodel.allocModel.K == hmodel.obsModel.K:
        w = hmodel.allocModel.get_active_comp_probs()
    else:
        w = np.ones(hmodel.obsModel.K)

    if dataset is not None:
        if hasattr(dataset, 'X'):
            pylab.hist(dataset.X[:, 0], 50, density=1)
            # Xtile = np.tile(Data.X[:, 0], (2, 1))
            # ys = 0.1 * np.arange(2)
            # pylab.plot(Xtile, ys, 'k-')

    nSkip = 0
    nGood = 0
    for ii, compID in enumerate(compListToPlot):
        if compID not in activeCompIDs:
            continue

        kk = np.flatnonzero(activeCompIDs == compID)
        assert kk.size == 1
        kk = kk[0]

        if w[kk] < proba_thr and compID not in compsToHighlight:
            nSkip += 1
            continue

        mu = hmodel.obsModel.get_mean_for_comp(kk)
        sigma2 = hmodel.obsModel.get_covar_mat_for_comp(kk)

        if len(compsToHighlight) == 0 or compID in compsToHighlight:
            color = Colors[ii % len(Colors)]
            plotGauss1D(mu, sigma2, color=color)
        elif kk not in compsToHighlight:
            plotGauss1D(mu, sigma2, color='k')

        nGood += 1
        if nGood >= MaxKToDisplay:
            print('DISPLAY LIMIT EXCEEDED. Showing %d/%d components'
                  % (nGood, len(activeCompIDs)))
            break

    if nSkip > 0:
        print('SKIPPED %d comps with size below %.2f' % (nSkip, proba_thr))
def loadTopicModel(matfilepath,
                   queryLap=None,
                   prefix=None,
                   returnWordCounts=0,
                   returnTPA=0,
                   normalizeTopics=0,
                   normalizeProbs=0,
                   **kwargs):
    ''' Load saved topic model

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, lapQuery = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath, prefix,
                                        returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix((data, indices, indptr),
                                                 shape=(K, vocab_size))
        except KeyError:
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix((data, (rowIDs, colIDs)),
                                                 shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')

        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    infAlg = 'VB'
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
def from_dict(self, Dict):
    self.inferType = Dict['inferType']
    self.K = Dict['K']
    self.uhat = as1D(Dict['uhat'])
def alignEstimatedStateSeqToTruth(zHat, zTrue, useInfo=None, returnInfo=False):
    ''' Relabel the states in zHat to minimize the hamming-distance to zTrue

    Args
    --------
    zHat : 1D array
        each entry is an integer label in {0, 1, ... Kest-1}
    zTrue : 1D array
        each entry is an integer label in {0, 1, ... Ktrue-1}

    Returns
    --------
    zHatAligned : 1D array
        relabeled version of zHat that aligns to zTrue
    AInfo : dict
        information about the alignment
    '''
    zHat = as1D(zHat)
    zTrue = as1D(zTrue)
    Kest = zHat.max() + 1
    Ktrue = zTrue.max() + 1

    if useInfo is None:
        if not hasMunkres:
            raise ImportError(
                "alignEstimatedStateSeqToTruth requires the munkres package."
                + " Please install via 'pip install munkres'")
        CostMatrix = buildCostMatrix(zHat, zTrue)
        MunkresAlg = munkres.Munkres()
        tmpAlignedRowColPairs = MunkresAlg.compute(CostMatrix)
        AlignedRowColPairs = list()
        OrigToAlignedMap = dict()
        AlignedToOrigMap = dict()
        for (ktrue, kest) in tmpAlignedRowColPairs:
            if kest < Kest:
                AlignedRowColPairs.append((ktrue, kest))
                OrigToAlignedMap[kest] = ktrue
                AlignedToOrigMap[ktrue] = kest
    else:
        # Unpack existing alignment info
        AlignedRowColPairs = useInfo['AlignedRowColPairs']
        CostMatrix = useInfo['CostMatrix']
        AlignedToOrigMap = useInfo['AlignedToOrigMap']
        OrigToAlignedMap = useInfo['OrigToAlignedMap']
        Ktrue = useInfo['Ktrue']
        Kest = useInfo['Kest']
        assert np.allclose(Ktrue, zTrue.max() + 1)
        Khat = zHat.max() + 1
        # Account for extra states present in zHat
        # that have never been aligned before.
        # They should align to the next available UID in set
        # [Ktrue, Ktrue+1, Ktrue+2, ...]
        # so they don't get confused for a true label
        ktrueextra = np.max([r for r, c in AlignedRowColPairs])
        ktrueextra = int(np.maximum(ktrueextra + 1, Ktrue))
        for khat in np.arange(Kest, Khat + 1):
            if khat in OrigToAlignedMap:
                continue
            OrigToAlignedMap[khat] = ktrueextra
            AlignedToOrigMap[ktrueextra] = khat
            AlignedRowColPairs.append((ktrueextra, khat))
            ktrueextra += 1

    zHatA = -1 * np.ones_like(zHat)
    for kest in np.unique(zHat):
        mask = zHat == kest
        zHatA[mask] = OrigToAlignedMap[kest]
    assert np.all(zHatA >= 0)

    if returnInfo:
        return zHatA, dict(CostMatrix=CostMatrix,
                           AlignedRowColPairs=AlignedRowColPairs,
                           OrigToAlignedMap=OrigToAlignedMap,
                           AlignedToOrigMap=AlignedToOrigMap,
                           Ktrue=Ktrue,
                           Kest=Kest)
    else:
        return zHatA
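# Hypothetical usage (my own toy labels, not from the source), assuming the
# munkres package is installed. Here zHat is a pure permutation of zTrue
# (est 2 -> true 0, est 0 -> true 1, est 1 -> true 2), so the minimum-cost
# alignment should relabel zHat to reproduce zTrue exactly.
#
#     zTrue = np.asarray([0, 0, 1, 1, 2, 2])
#     zHat = np.asarray([2, 2, 0, 0, 1, 1])
#     zHatA = alignEstimatedStateSeqToTruth(zHat, zTrue)
#     assert np.all(zHatA == zTrue)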
def alignEstimatedStateSeqToTruth(zHat, zTrue, useInfo=None, returnInfo=False,
                                  standardize_order_of_extras=True):
    ''' Relabel the states in zHat to minimize the hamming-distance to zTrue

    Args
    --------
    zHat : 1D array
        each entry is an integer label in {0, 1, ... Kest-1}
    zTrue : 1D array
        each entry is an integer label in {0, 1, ... Ktrue-1}

    Returns
    --------
    zHatAligned : 1D array
        relabeled version of zHat that aligns to zTrue
    AInfo : dict
        information about the alignment
    '''
    zHat = as1D(zHat)
    zTrue = as1D(zTrue)
    Kest = zHat.max() + 1
    Ktrue = zTrue.max() + 1

    if useInfo is None:
        if not hasMunkres:
            raise ImportError(
                "alignEstimatedStateSeqToTruth requires the munkres package."
                + " Please install via 'pip install munkres'")
        CostMatrix = buildCostMatrix(zHat, zTrue)
        MunkresAlg = munkres.Munkres()
        tmpAlignedRowColPairs = MunkresAlg.compute(CostMatrix)
        AlignedRowColPairs = list()
        OrigToAlignedMap = dict()
        AlignedToOrigMap = dict()
        for (ktrue, kest) in tmpAlignedRowColPairs:
            if kest < Kest:
                AlignedRowColPairs.append((ktrue, kest))
                OrigToAlignedMap[kest] = ktrue
                AlignedToOrigMap[ktrue] = kest

        if Ktrue < Kest and \
                np.unique(zTrue).size < np.unique(zHat).size and \
                standardize_order_of_extras:
            # All extra states will have ids in Ktrue, Ktrue+1, Ktrue+2, ...
            # Break ties by assigning these in ascending order
            # by first appearance
            extra_states = []
            for (kt, ke) in AlignedRowColPairs:
                if kt >= Ktrue and ke in zHat:
                    extra_states.append(ke)
            rank = dict()
            for x in sorted(extra_states):
                match_ids = np.flatnonzero(zHat == x)
                if match_ids.size == 0:
                    continue
                rank[x] = match_ids[0]
            ## Renumber by rank from smallest to largest
            old_ids, rank_vals = zip(*list(rank.items()))
            old_ids = np.asarray(old_ids)
            new_ids = Ktrue + np.argsort(np.asarray(rank_vals))
            old2new = dict(zip(old_ids, new_ids))
            OrigToAlignedMap = dict()
            AlignedToOrigMap = dict()
            for a, b in AlignedRowColPairs:
                newa = old2new.get(b, a)
                OrigToAlignedMap[b] = newa
                AlignedToOrigMap[newa] = b
            AlignedRowColPairs = list(AlignedToOrigMap.items())
    else:
        # Unpack existing alignment info
        AlignedRowColPairs = useInfo['AlignedRowColPairs']
        CostMatrix = useInfo['CostMatrix']
        AlignedToOrigMap = useInfo['AlignedToOrigMap']
        OrigToAlignedMap = useInfo['OrigToAlignedMap']
        Ktrue = useInfo['Ktrue']
        Kest = useInfo['Kest']
        assert Ktrue >= zTrue.max() + 1
        Khat = zHat.max() + 1
        # Account for extra states present in zHat
        # that have never been aligned before.
        # They should align to the next available UID in set
        # [Ktrue, Ktrue+1, Ktrue+2, ...]
        # so they don't get confused for a true label
        ktrueextra = np.max([r for r, c in AlignedRowColPairs])
        ktrueextra = int(np.maximum(ktrueextra + 1, Ktrue))
        for khat in np.arange(Kest, Khat + 1):
            if khat in OrigToAlignedMap:
                continue
            OrigToAlignedMap[khat] = ktrueextra
            AlignedToOrigMap[ktrueextra] = khat
            AlignedRowColPairs.append((ktrueextra, khat))
            ktrueextra += 1

    zHatA = -1 * np.ones_like(zHat)
    for kest in np.unique(zHat):
        mask = zHat == kest
        zHatA[mask] = OrigToAlignedMap[kest]
    assert np.all(zHatA >= 0)

    if returnInfo:
        return zHatA, dict(CostMatrix=CostMatrix,
                           AlignedRowColPairs=AlignedRowColPairs,
                           OrigToAlignedMap=OrigToAlignedMap,
                           AlignedToOrigMap=AlignedToOrigMap,
                           Ktrue=Ktrue,
                           Kest=Kest)
    else:
        return zHatA
def loadTopicModelFromTxtFiles(snapshotPath,
                               returnTPA=False,
                               returnWordCounts=False,
                               normalizeProbs=True,
                               normalizeTopics=True,
                               **kwargs):
    ''' Load from snapshot text files.

    Returns
    -------
    hmodel
    '''
    Mdict = dict()
    possibleKeys = [
        'K', 'probs', 'alpha', 'beta', 'lam', 'gamma',
        'nTopics', 'nTypes', 'vocab_size'
    ]
    keyMap = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    for key in possibleKeys:
        try:
            arr = np.loadtxt(snapshotPath + "/%s.txt" % (key))
            if key in keyMap:
                Mdict[keyMap[key]] = arr
            else:
                Mdict[key] = arr
        except Exception:
            pass
    assert 'K' in Mdict
    assert 'lam' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])

    if os.path.exists(snapshotPath + "/topics.txt"):
        Mdict['topics'] = np.loadtxt(snapshotPath + "/topics.txt")
        Mdict['topics'] = as2D(toCArray(Mdict['topics'], dtype=np.float64))
        assert Mdict['topics'].ndim == 2
        assert Mdict['topics'].shape == (K, V)
    else:
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(snapshotPath + "/TopicWordCount_indices.txt",
                              dtype=np.int32)
        if os.path.exists(snapshotPath + "/TopicWordCount_cscindptr.txt"):
            TWC_cscindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_cscindptr.txt",
                dtype=np.int32)
            TWC = scipy.sparse.csc_matrix(
                (TWC_data, TWC_inds, TWC_cscindptr), shape=(K, V))
        else:
            TWC_csrindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_indptr.txt",
                dtype=np.int32)
            TWC = scipy.sparse.csr_matrix(
                (TWC_data, TWC_inds, TWC_csrindptr), shape=(K, V))
        Mdict['WordCounts'] = TWC.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha

    # BUILD HMODEL FROM LOADED TXT
    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
def __init__(self, X=None, nObsTotal=None, TrueZ=None,
             Xprev=None, Y=None,
             TrueParams=None, name=None, summary=None,
             dtype='auto',
             row_names=None,
             column_names=None,
             y_column_names=None,
             xprev_column_names=None,
             do_copy=True,
             **kwargs):
    ''' Constructor for XData instance given in-memory dense array X.

    Post Condition
    ---------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    '''
    if dtype == 'auto':
        dtype = X.dtype
    if not do_copy and X.dtype == dtype:
        self.X = as2D(X)
    else:
        self.X = as2D(toCArray(X, dtype=dtype))

    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=dtype))
    if Y is not None:
        self.Y = as2D(toCArray(Y, dtype=dtype))

    # Verify attributes are consistent
    self._set_dependent_params(nObsTotal=nObsTotal)
    self._check_dims(do_copy=do_copy)

    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = TrueParams
    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size
    if summary is not None:
        self.summary = summary
    if name is not None:
        self.name = str(name)

    # Add optional row names
    if row_names is None:
        self.row_names = map(str, range(self.nObs))
    else:
        assert len(row_names) == self.nObs
        self.row_names = map(str, row_names)

    # Add optional column names
    if column_names is None:
        self.column_names = map(
            lambda n: "dim_%d" % n, range(self.dim))
    else:
        assert len(column_names) == self.dim
        self.column_names = map(str, column_names)
if __name__ == '__main__':
    N = 7
    muG = np.asarray([0.02, 0.1, 0.5, 0.9, 0.98])
    muG = N * np.vstack([muG, 1.0 - muG]).T.copy()
    print(muG)
    phiG = mu2phi(muG, N)
    print(phiG)
    print(phi2mu(mu2phi(muG, N), N))
    print('BREGMAN Dist Mat:')
    print(bregmanDiv(muG, muG, N))

    for row in range(phiG.shape[0]):
        phi = as1D(phiG[row])
        mu = phi2mu(phi, N)
        print('phi = %.3f' % (phi))
        print('mu = ', mu)
        print('----------')
        cdf = 0.0
        for xVec in generateAllXForFixedN(N):
            pdf = pdf_Phi(xVec, phi, N)
            cdf += pdf
            print('%.3f %.3f %.3f %s' % (
                cdf, pdf, pdf_Mu(xVec, mu, N), xVec))
            # print pdf_Phi(xVec, mu2phi(mu,N), N),
def from_dict(self, Dict):
    self.inferType = Dict['inferType']
    self.K = Dict['K']
    self.rho = as1D(Dict['rho'])
    self.omega = as1D(Dict['omega'])
numPi_d, numf, numInfo = estimatePi2(
    ids_d, cts_d, topics, alpha,
    scale=1.0,  # /np.sum(cts_d),
    approx_grad=True)
print("Numerical Pi[%d]:\n %s" % (d, pi2str(numPi_d)))

estPi_d, f, Info = estimatePi2(
    ids_d, cts_d, topics, alpha,
    scale=1.0,  # /np.sum(cts_d)
    approx_grad=False,
    )
print("Estimated Pi[%d]:\n %s" % (d, pi2str(estPi_d)))

PRNG = np.random.RandomState(d)
nMatch = 0
nRep = 10
for rep in range(nRep):
    initPi_d = as1D(PRNG.dirichlet(K * np.ones(K), size=1))
    estPiFromRand, f2, I2 = estimatePi2(
        ids_d, cts_d, topics, alpha,
        scale=1.0,  # /np.sum(cts_d),
        piInit=initPi_d,
        approx_grad=False)
    if np.allclose(estPi_d, estPiFromRand, rtol=0, atol=atol):
        nMatch += 1
    else:
        print("initrandom Pi[%d]:\n %s" % (d, pi2str(estPiFromRand)))
        print(f)
        print(f2)
print("%d/%d random inits within %s" % (nMatch, nRep, atol))