def setPostFactors(self, obsModel=None, SS=None, LP=None, Data=None, lam=None, WordCounts=None, **kwargs):
    ''' Set the Post attribute from one of several possible inputs.

    Precedence: an existing obsModel wins; otherwise sufficient stats
    (computed from LP/Data if needed); otherwise raw lam/WordCounts arrays.
    '''
    self.ClearCache()
    # Copy directly from another observation model, if one is given.
    if obsModel is not None:
        if hasattr(obsModel, 'Post'):
            self.Post = obsModel.Post.copy()
            self.K = self.Post.K
        else:
            self.setPostFromEstParams(obsModel.EstParams)
        return
    # Derive sufficient stats from local params when necessary.
    if LP is not None and Data is not None:
        SS = self.calcSummaryStats(Data, None, LP)
    if SS is not None:
        self.updatePost(SS)
    else:
        # Build posterior pseudo-count array directly from raw inputs.
        if WordCounts is None:
            lamPost = as2D(lam)
        else:
            lamPost = as2D(WordCounts) + lam
        nComp, nDim = lamPost.shape
        self.Post = ParamBag(K=nComp, D=nDim)
        self.Post.setField('lam', lamPost, dims=('K', 'D'))
    self.K = self.Post.K
def packParamBagForPost(nu=None, beta=None, m=None, kappa=None, D=None, Post=None, **kwargs):
    ''' Pack posterior arrays nu, beta, m, kappa into a ParamBag.

    Arrays are coerced to the expected orientations; a fresh ParamBag is
    allocated unless an existing one with matching sizes is supplied.

    Returns
    -------
    Post : ParamBag, with K clusters and D dims
    '''
    m = as2D(m)
    beta = as2D(beta)
    # Infer the data dimension from m when not given explicitly.
    if D is None:
        D = m.shape[1]
    # Transpose any array whose column count does not match D.
    if m.shape[1] != D:
        m = m.T.copy()
    if beta.shape[1] != D:
        beta = beta.T.copy()
    K = m.shape[0]
    if Post is None:
        Post = ParamBag(K=K, D=D)
    else:
        assert isinstance(Post, ParamBag)
        assert Post.K == K
        assert Post.D == D
    Post.setField('nu', as1D(nu), dims=('K'))
    Post.setField('beta', beta, dims=('K', 'D'))
    Post.setField('m', m, dims=('K', 'D'))
    Post.setField('kappa', as1D(kappa), dims=('K'))
    return Post
def read_from_graphtxtfile(cls, filepath, nEdgesTotal=None, nNodesTotal=None, settingspath=None, **kwargs):
    ''' Static constructor loading .graph file into GraphXData instance.

    Parameters
    ----------
    filepath : str
        Path to whitespace-delimited text file with 4 integer columns;
        columns 1,2 are the edge endpoints, column 3 the edge observation.
    nEdgesTotal : int, optional
        Total edge count for the whole dataset.
    nNodesTotal : int, optional
        Total node count for the whole dataset.
    settingspath : str, optional
        Path to a key=value settings file that may define
        N/nNodesTotal and E/nEdgesTotal, overriding the args above.

    Returns
    -------
    Data : GraphXData
    '''
    # Optionally read dataset-size totals from a key=value settings file.
    if settingspath is not None:
        with open(settingspath, 'r') as f:
            for line in f.readlines():
                if line.count('='):
                    # NOTE(review): comprehension var f shadows the file
                    # handle; harmless since the handle is not reused after.
                    fields = [f.strip() for f in line.split('=')]
                    assert len(fields) == 2
                    if fields[0] == 'N' or fields[0] == 'nNodesTotal':
                        nNodesTotal = int(fields[1])
                    if fields[0] == 'E' or fields[0] == 'nEdgesTotal':
                        nEdgesTotal = int(fields[1])
    txtData = np.loadtxt(filepath, dtype=np.int32)
    assert txtData.ndim == 2
    assert txtData.shape[1] == 4
    # Column 0 is ignored (presumably a row id — TODO confirm).
    edges = txtData[:, [1, 2]]
    X = as2D(txtData[:, 3])
    # Make sure X and edges have correct dims
    if X.shape[0] != edges.shape[0]:
        X = X.T
    return cls(nNodesTotal=nNodesTotal, nEdgesTotal=nEdgesTotal, edges=edges, X=X)
def packParamBagForPost(pnu_K=None, ptau_K=None, w_KE=None, P_KEE=None, Post=None, **kwargs):
    ''' Parse provided array args and pack into parameter bag

    Returns
    -------
    Post : ParamBag, with K clusters
    '''
    pnu_K = as1D(pnu_K)
    ptau_K = as1D(ptau_K)
    w_KE = as2D(w_KE)
    P_KEE = as3D(P_KEE)
    # Cluster count and expanded feature dimension come from the arrays.
    nClust = pnu_K.size
    nDimE = w_KE.shape[1]
    if Post is None:
        Post = ParamBag(K=nClust, D=nDimE - 1, E=nDimE)
    elif not hasattr(Post, 'E'):
        Post.E = nDimE
    assert Post.K == nClust
    assert Post.D == nDimE - 1
    assert Post.E == nDimE
    Post.setField('pnu_K', pnu_K, dims=('K'))
    Post.setField('ptau_K', ptau_K, dims=('K'))
    Post.setField('w_KE', w_KE, dims=('K', 'E'))
    Post.setField('P_KEE', P_KEE, dims=('K', 'E', 'E'))
    return Post
def setEstParams(self, obsModel=None, SS=None, LP=None, Data=None, mu=None, Sigma=None, **kwargs):
    ''' Create EstParams ParamBag with fields mu, Sigma '''
    self.ClearCache()
    # Copy point estimates straight from another observation model.
    if obsModel is not None:
        self.EstParams = obsModel.EstParams.copy()
        self.K = self.EstParams.K
        return
    # Derive sufficient stats from local params when necessary.
    if LP is not None and Data is not None:
        SS = self.calcSummaryStats(Data, None, LP)
    if SS is not None:
        self.updateEstParams(SS)
    else:
        # Build EstParams directly from provided mu/Sigma arrays.
        Sigma = as3D(Sigma)
        nComp, nDim = Sigma.shape[0], Sigma.shape[1]
        mu = as2D(mu)
        # Accept transposed mu; orient so rows index components.
        if mu.shape[0] != nComp:
            mu = mu.T
        assert mu.shape[0] == nComp
        assert mu.shape[1] == nDim
        self.EstParams = ParamBag(K=nComp, D=nDim)
        self.EstParams.setField('mu', mu, dims=('K', 'D'))
        self.EstParams.setField('Sigma', Sigma, dims=('K', 'D', 'D'))
    self.K = self.EstParams.K
def log_pdf_dirichlet(PiMat, alphavec):
    ''' Return scalar log probability for Dir(PiMat | alphavec)

    Each of the J rows of PiMat is treated as an independent draw from a
    K-dimensional symmetric-or-general Dirichlet with parameter alphavec.

    Parameters
    ----------
    PiMat : 1D or 2D array, J x K
        Each row is a probability vector.
    alphavec : float, 0-d array, or 1D array of size K
        Dirichlet concentration parameter(s).

    Returns
    -------
    logp : scalar float
    '''
    # Tiny offset keeps log() finite for exactly-zero entries.
    PiMat = as2D(PiMat + 1e-100)
    J, K = PiMat.shape
    if isinstance(alphavec, float):
        alphavec = alphavec * np.ones(K)
    elif alphavec.ndim == 0:
        alphavec = alphavec * np.ones(K)
    assert alphavec.size == K
    # Log normalization constant of a single Dirichlet.
    cDir = gammaln(np.sum(alphavec)) - np.sum(gammaln(alphavec))
    # BUGFIX: the normalizer appears once per ROW of PiMat (J rows total).
    # Previous code multiplied by K, which is wrong whenever J != K.
    return J * cDir + np.sum(np.dot(np.log(PiMat), alphavec - 1.0))
def setPostFactors(self, obsModel=None, SS=None, LP=None, Data=None, lam1=None, lam0=None, **kwargs):
    ''' Set attribute Post to provided values. '''
    self.ClearCache()
    # Copy directly from another observation model, if one is given.
    if obsModel is not None:
        if hasattr(obsModel, 'Post'):
            self.Post = obsModel.Post.copy()
            self.K = self.Post.K
        else:
            self.setPostFromEstParams(obsModel.EstParams)
        return
    # Derive sufficient stats from local params when necessary.
    if LP is not None and Data is not None:
        SS = self.calcSummaryStats(Data, None, LP)
    if SS is not None:
        self.updatePost(SS)
    else:
        # Build posterior directly from raw Beta pseudo-count arrays.
        onCounts = as2D(lam1)
        offCounts = as2D(lam0)
        if onCounts.shape[-1] != self.D:
            # Arrays arrived transposed: flip them, or fail loudly.
            if onCounts.shape[0] != self.D:
                raise ValueError("Bad dimension for lam1, lam0")
            onCounts = onCounts.T.copy()
            offCounts = offCounts.T.copy()
        self.Post = ParamBag(K=onCounts.shape[0], D=self.D)
        self.Post.setField('lam1', onCounts, dims=self.CompDims + ('D', ))
        self.Post.setField('lam0', offCounts, dims=self.CompDims + ('D', ))
    self.K = self.Post.K
def SummaryAlg_py(PiInit, PiMat, SoftEv, margPrObs, fMsg, bMsg, mPairIDs=None):
    ''' Summarize pairwise state assignments for one sequence (pure Python).

    Combines forward/backward messages into expected transition counts,
    an entropy table, and entropy terms for candidate merge pairs.

    Parameters
    ----------
    PiInit : 1D array, size K, initial-state probabilities
    PiMat : 2D array, K x K, state transition probabilities
    SoftEv : 2D array, T x K, per-timestep soft evidence
    margPrObs : 1D array, size T, per-timestep normalization constants
    fMsg : 2D array, T x K, forward messages
    bMsg : 2D array, T x K, backward messages
    mPairIDs : optional list/array of merge-candidate pairs (kA, kB)

    Returns
    -------
    TransStateCount : 2D array, K x K, expected transition counts
    Htable : 2D array, K x K, entropy of the pairwise responsibilities
    mHtable : 2D array, 2M x K, entropy rows for each of M merge pairs
    '''
    K = PiInit.size
    T = SoftEv.shape[0]
    # Normalize mPairIDs into a 2D int array; M is the number of pairs.
    if mPairIDs is None:
        M = 0
    else:
        if len(mPairIDs) == 0:
            M = 0
        else:
            mPairIDs = as2D(np.asarray(mPairIDs, dtype=np.int32))
            assert mPairIDs.ndim == 2
            assert mPairIDs.shape[1] == 2
            assert mPairIDs.shape[0] > 0
            M = mPairIDs.shape[0]
    mHtable = np.zeros((2 * M, K))
    respPair_t = np.zeros((K, K))
    Htable = np.zeros((K, K))
    TransStateCount = np.zeros((K, K))
    for t in xrange(1, T):
        # Joint posterior over (state at t-1, state at t).
        respPair_t = np.outer(fMsg[t - 1], bMsg[t] * SoftEv[t])
        respPair_t *= PiMat / margPrObs[t]
        TransStateCount += respPair_t
        # Tiny offset keeps log() finite for exactly-zero responsibilities.
        respPair_t += 1e-100
        rowwiseSum = np.sum(respPair_t, axis=1)
        # Accumulate sum of p*log(p) - p*log(rowsum); negated below.
        Htable += respPair_t * np.log(respPair_t) \
            - respPair_t * np.log(rowwiseSum)[:, np.newaxis]
    if M > 0:
        # Full pairwise responsibilities needed for merge-pair entropy.
        respPair = calcRespPair_fast(PiMat, SoftEv, margPrObs, fMsg, bMsg, K, T, doCopy=1)
        for m in xrange(M):
            kA = mPairIDs[m, 0]
            kB = mPairIDs[m, 1]
            # Each merge pair contributes two rows to mHtable.
            mHtable[2 * m:2 * m + 2] = calc_sub_Htable_forMergePair(respPair, kA, kB)
    # Negate the accumulated sum so Htable holds positive entropy terms.
    Htable *= -1
    return TransStateCount, Htable, mHtable
def createPrior(self, Data, nu=0, B=None, ECovMat=None, sF=1.0, **kwargs):
    ''' Initialize Prior ParamBag attribute.

    Post Condition
    ------
    Prior expected covariance matrix set to match provided value.
    '''
    dim = self.D
    # Degrees of freedom must exceed dim + 1 so E[Cov] is well-defined.
    nu = np.maximum(nu, dim + 2)
    if B is not None:
        B = as2D(B)
    else:
        if ECovMat is None or isinstance(ECovMat, str):
            ECovMat = createECovMatFromUserInput(dim, Data, ECovMat, sF)
        # Choose B so that E[Cov] = B / (nu - D - 1) equals ECovMat.
        B = ECovMat * (nu - dim - 1)
    Prior = ParamBag(K=0, D=dim)
    Prior.setField('nu', nu, dims=None)
    Prior.setField('B', B, dims=('D', 'D'))
    self.Prior = Prior
def setPostFactors(self, obsModel=None, SS=None, LP=None, Data=None, nu=0, B=0, m=0, kappa=0, **kwargs):
    ''' Set attribute Post to provided values. '''
    self.ClearCache()
    # Copy directly from another observation model, if one is given.
    if obsModel is not None:
        if hasattr(obsModel, 'Post'):
            self.Post = obsModel.Post.copy()
            self.K = self.Post.K
        else:
            self.setPostFromEstParams(obsModel.EstParams)
        return
    # Derive sufficient stats from local params when necessary.
    if LP is not None and Data is not None:
        SS = self.calcSummaryStats(Data, None, LP)
    if SS is not None:
        self.updatePost(SS)
    else:
        # Build posterior directly from raw Normal-Wishart arrays.
        meanArr = as2D(m)
        # Accept transposed means; orient so rows index components.
        if meanArr.shape[1] != self.D:
            meanArr = meanArr.T.copy()
        nComp = meanArr.shape[0]
        self.Post = ParamBag(K=nComp, D=self.D)
        self.Post.setField('nu', as1D(nu), dims=('K'))
        self.Post.setField('B', B, dims=('K', 'D', 'D'))
        self.Post.setField('m', meanArr, dims=('K', 'D'))
        self.Post.setField('kappa', as1D(kappa), dims=('K'))
    self.K = self.Post.K
def createPrior(self, Data, D=None, E=None, nu=0, B=None, M=None, V=None, ECovMat=None, sF=1.0, VMat='eye', sV=1.0, MMat='zero', sM=1.0, **kwargs):
    ''' Initialize Prior ParamBag attribute.

    Post Condition
    ------
    Prior expected covariance matrix set to match provided value.
    '''
    # Resolve dims D (output) and E (regressor), from Data when available.
    if Data is None:
        if D is None:
            raise ValueError("Need to specify dimension D")
        if E is None:
            raise ValueError("Need to specify dimension E")
    else:
        if D is None:
            D = Data.X.shape[1]
        else:
            assert D == Data.X.shape[1]
        if E is None:
            E = Data.Xprev.shape[1]
        else:
            assert E == Data.Xprev.shape[1]
    # Degrees of freedom must exceed D + 1 so E[Cov] is well-defined.
    nu = np.maximum(nu, D + 2)
    # Scale matrix B of the Wishart prior on the noise covariance.
    if B is None:
        if ECovMat is None or isinstance(ECovMat, str):
            ECovMat = createECovMatFromUserInput(D, Data, ECovMat, sF)
        B = ECovMat * (nu - D - 1)
    B = as2D(B)
    # Mean matrix M of the prior on the regression weights.
    if M is not None:
        M = as2D(M)
    elif MMat == 'zero':
        M = np.zeros((D, E))
    elif MMat == 'eye':
        assert D <= E
        M = sM * np.eye(D)
        M = np.hstack([M, np.zeros((D, E - D))])
        assert M.shape == (D, E)
    else:
        raise ValueError('Unrecognized MMat: %s' % (MMat))
    # Column covariance V of the prior on the regression weights.
    if V is not None:
        V = as2D(V)
    elif VMat == 'eye':
        V = sV * np.eye(E)
    elif VMat == 'same':
        assert D == E
        V = sV * ECovMat
    else:
        raise ValueError('Unrecognized VMat: %s' % (VMat))
    self.Prior = ParamBag(K=0, D=D, E=E)
    self.Prior.setField('nu', nu, dims=None)
    self.Prior.setField('B', B, dims=('D', 'D'))
    self.Prior.setField('V', V, dims=('E', 'E'))
    self.Prior.setField('M', M, dims=('D', 'E'))
def saveTopicModel(hmodel, SS, fpath, prefix, didExactUpdateWithSS=True, tryToSparsifyOutput=False, doLinkBest=False, sparseEPS=0.002, **kwargs):
    ''' Write TopicModel to .mat formatted file on disk.

    Collects allocation-model and observation-model parameters into a
    flat dict, then writes each entry as a text file inside a
    "<prefix>TopicSnapshot/" directory under fpath.

    Post Condition
    ------
    Topic model info written to file at location fpath/prefixTopicModel.mat
    '''
    EstPDict = dict()
    # Active comp probabilities
    if hasattr(hmodel.allocModel, 'rho'):
        EstPDict['rho'] = hmodel.allocModel.rho
        EstPDict['omega'] = hmodel.allocModel.omega
    EstPDict['probs'] = np.asarray(hmodel.allocModel.get_active_comp_probs(), dtype=np.float32)
    if hasattr(hmodel.allocModel, 'alpha'):
        EstPDict['alpha'] = hmodel.allocModel.alpha
    if hasattr(hmodel.allocModel, 'gamma'):
        EstPDict['gamma'] = hmodel.allocModel.gamma
    # Collapse a uniform prior vector down to a single scalar.
    lamPrior = hmodel.obsModel.Prior.lam
    if np.allclose(lamPrior, lamPrior[0]):
        lamPrior = lamPrior[0]
    EstPDict['lam'] = np.asarray(lamPrior, dtype=np.float64)
    EstPDict['K'] = hmodel.obsModel.K
    EstPDict['vocab_size'] = hmodel.obsModel.D
    if SS is not None:
        if hasattr(SS, 'nDoc'):
            EstPDict['nDoc'] = SS.nDoc
        EstPDict['countvec'] = np.sum(SS.WordCounts, axis=1)
    # Detect multinomial observation model by class name.
    isMult = str(type(hmodel.obsModel)).count('Mult') > 0
    # Obsmodel parameters
    # Remember, if no update has occurred,
    # then we'd be saving suff stats that are *not* in sync with model params
    if isMult and SS is not None and didExactUpdateWithSS:
        # Save sparsified suff-stat word counts (values below sparseEPS dropped).
        SparseWordCounts = np.asarray(SS.WordCounts, dtype=np.float32)
        SparseWordCounts[SparseWordCounts < sparseEPS] = 0
        SparseWordCounts = scipy.sparse.csr_matrix(SparseWordCounts)
        EstPDict['TopicWordCount_data'] = SparseWordCounts.data
        EstPDict['TopicWordCount_indices'] = SparseWordCounts.indices
        EstPDict['TopicWordCount_indptr'] = SparseWordCounts.indptr
    elif isMult and tryToSparsifyOutput:
        # Recover effective counts from posterior by subtracting the prior.
        effWordCount = np.asarray(hmodel.obsModel.Post.lam, dtype=np.float32)
        effWordCount -= lamPrior
        effWordCount[effWordCount < sparseEPS] = 0
        SparseWordCounts = scipy.sparse.csr_matrix(effWordCount)
        EstPDict['TopicWordCount_data'] = SparseWordCounts.data
        EstPDict['TopicWordCount_indices'] = SparseWordCounts.indices
        EstPDict['TopicWordCount_indptr'] = SparseWordCounts.indptr
    else:
        # Temporary point estimate of topic-by-word matrix
        # TODO: handle EM case where these estimates already exist
        hmodel.obsModel.setEstParamsFromPost(hmodel.obsModel.Post)
        EstPDict['topics'] = hmodel.obsModel.EstParams.phi
        # Drop the temporary attribute so the model is left unchanged.
        delattr(hmodel.obsModel, 'EstParams')
    outdirpath = os.path.join(fpath, prefix + "TopicSnapshot/")
    try:
        os.mkdir(outdirpath)
    except OSError as e:
        # Tolerate a pre-existing directory; re-raise anything else.
        if not str(e).count("File exists"):
            raise e
    floatFmt = '%.5e'
    for key in EstPDict:
        outtxtpath = os.path.join(outdirpath, key + ".txt")
        if isinstance(EstPDict[key], np.ndarray):
            arr = EstPDict[key]
            if arr.ndim == 0 or EstPDict[key].size == 1:
                # Scalar: write as int when exact, else as a float.
                val = None
                try:
                    val = int(EstPDict[key])
                    assert np.allclose(val, EstPDict[key])
                    val = '%d' % (val)
                except ValueError:
                    val = float(EstPDict[key])
                    val = floatFmt % (val)
                except AssertionError:
                    # int() truncated the value; fall back to float format.
                    val = float(EstPDict[key])
                    val = floatFmt % (val)
                if val is None:
                    val = str(EstPDict[key])
                with open(outtxtpath, 'w') as f:
                    f.write(str(val) + "\n")
            else:
                # Index arrays are saved as ints, everything else as floats.
                if key.count('indices') or key.count('indptr'):
                    np.savetxt(outtxtpath, as2D(arr), fmt='%d')
                else:
                    np.savetxt(outtxtpath, as2D(arr), fmt=floatFmt)
        else:
            with open(outtxtpath, 'w') as f:
                f.write(str(EstPDict[key]) + "\n")
def __init__(self, X=None, nObsTotal=None, TrueZ=None, Xprev=None, Y=None, TrueParams=None, name=None, summary=None, dtype='auto', row_names=None, column_names=None, y_column_names=None, xprev_column_names=None, do_copy=True, **kwargs):
    ''' Constructor for XData instance given in-memory dense array X.

    Post Condition
    ---------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    '''
    # Infer dtype from X itself unless explicitly provided.
    # NOTE(review): assumes X is array-like with a .dtype — TODO confirm.
    if dtype == 'auto':
        dtype = X.dtype
    if not do_copy and X.dtype == dtype:
        self.X = as2D(X)
    else:
        self.X = as2D(toCArray(X, dtype=dtype))
    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=dtype))
    if Y is not None:
        self.Y = as2D(toCArray(Y, dtype=dtype))
    # Verify attributes are consistent
    self._set_dependent_params(nObsTotal=nObsTotal)
    self._check_dims(do_copy=do_copy)
    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = TrueParams
    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size
    if summary is not None:
        self.summary = summary
    if name is not None:
        self.name = str(name)
    # Add optional row names.
    # BUGFIX: previously user-provided row_names were silently discarded
    # (self.row_names was always set to arange(nObs) and the assert only
    # re-checked that default). Now provided names are validated & stored.
    if row_names is None:
        self.row_names = np.arange(0, self.nObs, 1)
    else:
        row_names = list(row_names)
        assert len(row_names) == self.nObs
        self.row_names = map(str, row_names)
    # Add optional column names
    if column_names is None:
        self.column_names = map(lambda n: "dim_%d" % n, range(self.dim))
    else:
        assert len(column_names) == self.dim
        self.column_names = map(str, column_names)
def createECovMatFromUserInput(D=0, Data=None, ECovMat='eye', sF=1.0):
    ''' Create expected covariance matrix defining Wishart prior.

    User specifies desired type of expected covariance matrix.

    Args
    ----
    D : positive integer, size of each observation
    Data : [optional] dataset to use to make Sigma in data-driven way
    ECovMat : string name of the procedure to use to create Sigma
        * 'eye' : sF * identity matrix
        * 'covdata' : sF * data covariance matrix
        * 'diagcovdata' : sF * diagonal of the data covariance matrix
        * 'covfirstdiff' / 'diagcovfirstdiff' : covariance of X - Xprev
        * 'fromtruelabels' : weighted average, across true clusters, of
          the sample covariance of data belonging to each cluster

    Returns
    -------
    Sigma : 2D array, size D x D
        Symmetric and positive definite.
    '''
    if Data is not None:
        assert D == Data.dim
    if ECovMat == 'eye':
        return sF * np.eye(D)
    if ECovMat == 'covdata':
        return sF * np.cov(Data.X.T, bias=1)
    if ECovMat == 'diagcovdata':
        fullCov = as2D(np.cov(Data.X.T, bias=1))  # as2D handles case of D=1
        return sF * np.diag(np.diag(fullCov))
    if ECovMat == 'covfirstdiff' or ECovMat == 'diagcovfirstdiff':
        if not hasattr(Data, 'Xprev'):
            raise ValueError(
                'covfirstdiff only applies to auto-regressive datasets')
        E = Data.Xprev.shape[1]
        assert E >= D
        Xdiff = Data.X - Data.Xprev[:, :D]
        diffCov = np.cov(Xdiff.T, bias=1)
        if ECovMat == 'covfirstdiff':
            return sF * diffCov
        return sF * np.diag(np.diag(diffCov))
    if ECovMat == 'fromtruelabels':
        # Empirical-Bayes style: weighted average of per-class sample
        # covariances, weights proportional to class sizes.
        if hasattr(Data, 'TrueLabels'):
            Z = Data.TrueLabels
        else:
            Z = Data.TrueParams['Z']
        labelVals = np.unique(Z)
        nLabels = len(labelVals)
        classWeight = np.zeros(nLabels)
        classCov = np.zeros((nLabels, D, D))
        for loc, val in enumerate(labelVals):
            members = Z == val
            classWeight[loc] = np.sum(members)
            classCov[loc] = np.cov(Data.X[members].T, bias=1)
        classWeight = classWeight / np.sum(classWeight)
        # Small ridge keeps the result positive definite.
        Sigma = 1e-8 * np.eye(D)
        for k in range(nLabels):
            Sigma += classWeight[k] * classCov[k]
        return Sigma
    raise ValueError('Unrecognized ECovMat procedure %s' % (ECovMat))
def loadTopicModelFromTxtFiles(snapshotPath, returnTPA=False, returnWordCounts=False, normalizeProbs=True, normalizeTopics=True, **kwargs):
    ''' Load from snapshot text files.

    Reads per-parameter .txt files written by saveTopicModel from the
    directory snapshotPath, and either returns the raw
    (topics, probs, alpha) tuple or reconstructs a full HModel.

    Returns
    -------
    hmodel
        (or topics, probs, alpha if returnTPA;
         or hmodel, WordCounts if returnWordCounts)
    '''
    Mdict = dict()
    possibleKeys = [
        'K', 'probs', 'alpha', 'beta', 'lam', 'gamma', 'nTopics', 'nTypes',
        'vocab_size'
    ]
    # Legacy key names are remapped to the current ones.
    keyMap = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    for key in possibleKeys:
        try:
            arr = np.loadtxt(snapshotPath + "/%s.txt" % (key))
            if key in keyMap:
                Mdict[keyMap[key]] = arr
            else:
                Mdict[key] = arr
        except Exception:
            # Best-effort: optional/missing parameter files are skipped.
            pass
    assert 'K' in Mdict
    assert 'lam' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])
    if os.path.exists(snapshotPath + "/topics.txt"):
        # Dense topics matrix was saved directly.
        Mdict['topics'] = np.loadtxt(snapshotPath + "/topics.txt")
        Mdict['topics'] = as2D(toCArray(Mdict['topics'], dtype=np.float64))
        assert Mdict['topics'].ndim == 2
        assert Mdict['topics'].shape == (K, V)
    else:
        # Rebuild word counts from a sparse matrix saved as three arrays
        # (data/indices/indptr), in either CSC or CSR layout.
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(snapshotPath + "/TopicWordCount_indices.txt",
                              dtype=np.int32)
        if os.path.exists(snapshotPath + "/TopicWordCount_cscindptr.txt"):
            TWC_cscindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_cscindptr.txt",
                dtype=np.int32)
            TWC = scipy.sparse.csc_matrix((TWC_data, TWC_inds, TWC_cscindptr),
                                          shape=(K, V))
        else:
            TWC_csrindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_indptr.txt", dtype=np.int32)
            TWC = scipy.sparse.csr_matrix((TWC_data, TWC_inds, TWC_csrindptr),
                                          shape=(K, V))
        Mdict['WordCounts'] = TWC.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            # Posterior pseudo-counts = counts + prior lam.
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]
        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            # Fall back to uniform topic probabilities.
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)
        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha
    # BUILD HMODEL FROM LOADED TXT
    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    # Presence of gamma indicates an HDP (vs finite) topic model.
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
def loadTopicModel(matfilepath, queryLap=None, prefix=None, returnWordCounts=0, returnTPA=0, normalizeTopics=0, normalizeProbs=0, **kwargs):
    ''' Load saved topic model

    Dispatches to the right loader depending on which artifacts exist on
    disk: MEDLDA .log_prob_w files, text snapshot directories, or a
    single TopicModel.mat file.

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, lapQuery = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel
    # MEDLDA-format output takes priority when present.
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath, prefix,
                                        returnTPA=returnTPA)
    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        # Text-snapshot directories: pick the one matching prefix,
        # or the latest lap when no prefix is available.
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)
    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        # Rebuild the dense word-count matrix from sparse storage.
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            # CSR triplet layout (data/indices/indptr).
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix((data, indices, indptr),
                                                 shape=(K, vocab_size))
        except KeyError:
            # COO layout with 1-based (MATLAB-style) row/col ids.
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix((data, (rowIDs, colIDs)),
                                                 shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            # Posterior pseudo-counts = counts + prior lam.
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]
        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            # Fall back to uniform topic probabilities.
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)
        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha
    infAlg = 'VB'
    # Presence of gamma indicates an HDP (vs finite) topic model.
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
def __init__(self, edges=None, X=None, AdjMat=None, nNodesTotal=None, nEdgesTotal=None, nNodes=None, TrueParams=None, nodeNames=None, nodeZ=None, **kwargs):
    ''' Construct a GraphXData object.

    Pass either a full adjacency matrix (nNodes x nNodes x D), or a list
    of edges and associated observations.

    Args
    -----
    edges : 2D array, shape nEdges x 2
    X : 2D array, shape nEdges x D
    AdjMat : 3D array, shape nNodes x nNodes x D
        Defines adjacency matrix of desired graph.
        Assumes D=1 if 2D array specified.

    Returns
    --------
    Data : GraphXData
    '''
    self.isSparse = False
    self.TrueParams = TrueParams
    # Convert a dense adjacency matrix into edge list + observations.
    if AdjMat is not None:
        AdjMat = np.asarray(AdjMat)
        if AdjMat.ndim == 2:
            # Treat a 2D matrix as having a single observation dim (D=1).
            AdjMat = AdjMat[:, :, np.newaxis]
        nNodes = AdjMat.shape[0]
        edges = makeEdgesForDenseGraphWithNNodes(nNodes)
        X = np.zeros((edges.shape[0], AdjMat.shape[-1]))
        for eid, (i, j) in enumerate(edges):
            X[eid] = AdjMat[i, j]
    if AdjMat is None and (X is None or edges is None):
        raise ValueError(
            'Must specify adjacency matrix AdjMat, or ' +
            'a list of edges and corresponding dense observations X')
    # Create core attributes
    self.edges = toCArray(as2D(edges), dtype=np.int32)
    self.X = toCArray(as2D(X), dtype=np.float64)
    # Verify all edges are unique (raise error otherwise)
    # Encode each (src, dst) pair as a single base-N integer so
    # duplicates can be found with one np.unique call.
    N = self.edges.max() + 1
    edgeAsBaseNInteger = self.edges[:, 0] * N + self.edges[:, 1]
    nUniqueEdges = np.unique(edgeAsBaseNInteger).size
    if nUniqueEdges < self.edges.shape[0]:
        raise ValueError("Provided edges must be unique.")
    # Discard self loops
    nonselfloopmask = self.edges[:, 0] != self.edges[:, 1]
    if np.sum(nonselfloopmask) < self.edges.shape[0]:
        self.edges = self.edges[nonselfloopmask].copy()
        self.X = self.X[nonselfloopmask].copy()
    self._set_size_attributes(nNodesTotal=nNodesTotal,
                              nEdgesTotal=nEdgesTotal)
    self._verify_attributes()
    # Optional ground-truth labels per node.
    if TrueParams is None:
        if nodeZ is not None:
            self.TrueParams = dict()
            self.TrueParams['nodeZ'] = nodeZ
    else:
        self.TrueParams = TrueParams
    if nodeNames is not None:
        self.nodeNames = nodeNames
def calcLocalParams(Data, LP, transTheta=None, startTheta=None, limitMemoryLP=1, hmm_feature_method_LP='forward+backward', mPairIDs=None, cslice=(0, None), **kwargs):
    ''' Compute local parameters for provided dataset.

    Returns
    -------
    LP : dict of local params, with fields
    * resp : 2D array, nAtom x K
    if limitMemoryLP=0:
        * respPair : 3D array, nAtom x K x K
    if limitMemoryLP=1:
        * TransCount : 3D array, nSeq x K x K
    '''
    # Unpack soft evidence 2D array
    logLik = LP['E_log_soft_ev']
    nAtom, K = logLik.shape
    # Calculate trans prob 2D array
    # Expected log transition probs under the Dirichlet posterior,
    # exponentiated in place.
    digammaSumTransTheta = digamma(np.sum(transTheta[:K, :K + 1], axis=1))
    transPi = digamma(transTheta[:K, :K]) - \
        digammaSumTransTheta[:, np.newaxis]
    np.exp(transPi, out=transPi)
    # Calculate LOG of start state prob vector
    logstartPi = digamma(startTheta[:K]) - \
        digamma(np.sum(startTheta[:K + 1]))
    # Set starting probs to uniform,
    # because Line A below updates first state's logLik to include logstartPi
    startPi = np.ones(K)
    logMargPr = np.empty(Data.nDoc)
    resp = np.empty((nAtom, K))
    # Unpack pairs to track for merging.
    if mPairIDs is None:
        mPairIDs = np.zeros((0, 2))
        M = 0
    else:
        if len(mPairIDs) == 0:
            mPairIDs = np.zeros((0, 2))
            M = 0
        else:
            mPairIDs = as2D(mPairIDs)
            M = mPairIDs.shape[0]
    assert mPairIDs.shape[1] == 2
    if hmm_feature_method_LP == 'forward':
        # Forward-only pass: store the forward messages as features.
        fmsg = np.zeros_like(LP['E_log_soft_ev'])
        # Run forward backward algorithm on each sequence n
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            logLik_n = logLik[start:stop]
            # Adding in start state probs, in log space for stability.
            logLik_n[0] += logstartPi
            PiInit, PiMat, K = _parseInput_TransParams(startPi, transPi)
            logSoftEv = _parseInput_SoftEv(logLik_n, K)
            T = logSoftEv.shape[0]
            SoftEv, lognormC = expLogLik(logSoftEv)
            fmsg_n, margPrObs = FwdAlg(PiInit, PiMat, SoftEv)
            if not np.all(np.isfinite(margPrObs)):
                raise ValueError('NaN values found. Numerical badness!')
            fmsg[start:stop] = fmsg_n
        LP['fmsg'] = fmsg
    elif limitMemoryLP:
        # Track sufficient statistics directly at each sequence.
        TransCount = np.empty((Data.nDoc, K, K))
        Htable = np.empty((Data.nDoc, K, K))
        mHtable = np.zeros((2 * M, K))
        # Run forward backward algorithm on each sequence n
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            logLik_n = logLik[start:stop]
            # Adding in start state probs, in log space for stability.
            logLik_n[0] += logstartPi  # Line A
            # Run fwd-fwd alg and record result.
            resp_n, lp_n, TransCount_n, Htable_n, mHtable_n = \
                FwdBwdAlg_LimitMemory(startPi, transPi, logLik_n, mPairIDs)
            resp[start:stop] = resp_n
            logMargPr[n] = lp_n
            TransCount[n] = TransCount_n
            Htable[n] = Htable_n
            mHtable += mHtable_n
        LP['resp'] = resp
        LP['evidence'] = np.sum(logMargPr)
        LP['TransCount'] = TransCount
        LP['Htable'] = Htable
        LP['mHtable'] = mHtable
    else:
        # Track pair-wise assignment probs for each sequence
        respPair = np.empty((nAtom, K, K))
        # Run the forward backward algorithm on each sequence
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            logLik_n = logLik[start:stop]
            # Adding in start state probs, in log space for stability.
            logLik_n[0] += logstartPi  # Line A
            resp_n, respPair_n, lp_n = \
                FwdBwdAlg(startPi, transPi, logLik_n)
            resp[start:stop] = resp_n
            respPair[start:stop] = respPair_n
            logMargPr[n] = lp_n
        LP['evidence'] = np.sum(logMargPr)
        LP['resp'] = resp
        LP['respPair'] = respPair
    # ... end if statement on limitMemoryLP
    return LP
def __init__(self, X=None, doc_range=None, nDocTotal=None, Xprev=None, TrueZ=None, TrueParams=None, fileNames=None, summary=None, **kwargs):
    ''' Create an instance of GroupXData for provided array X

    Post Condition
    ---------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.Xprev : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.doc_range : 1D array, size nDoc+1
    '''
    self.X = as2D(toCArray(X, dtype=np.float64))
    self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
    if summary is not None:
        self.summary = summary
    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))
    # Verify attributes are consistent
    self._set_dependent_params(doc_range, nDocTotal)
    self._check_dims()
    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = dict()
        for key, arr in TrueParams.items():
            self.TrueParams[key] = toCArray(arr)
    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size
    # Add optional source files for each group/sequence
    if fileNames is not None:
        # Unwrap a 1x1 object array (presumably an artifact of
        # MATLAB-style saving — TODO confirm).
        if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
            fileNames = fileNames[0, 0]
        if len(fileNames) > 1:
            self.fileNames = [
                str(x).strip() for x in np.squeeze(fileNames)
            ]
        else:
            self.fileNames = [str(fileNames[0])]
    # Add extra data attributes custom for the dataset
    for key in kwargs:
        # Never overwrite attributes set above; skip dunder-style keys.
        if hasattr(self, key):
            continue
        if not key.startswith("__"):
            arr = np.squeeze(as1D(kwargs[key]))
            # Convert 0-d arrays to plain floats when possible.
            if arr.shape == ():
                try:
                    arr = float(arr)
                except TypeError:
                    continue
            setattr(self, key, arr)