def __init__(self, X=None, doc_range=None, nDocTotal=None,
             Xprev=None, TrueZ=None, TrueParams=None,
             fileNames=None, summary=None, **kwargs):
    ''' Create an instance of GroupXData for provided array X

    Parameters
    ----------
    X : array-like
        Observed data. Converted to a 2D float64 array.
    doc_range : array-like
        Group boundaries. Converted to a 1D int32 array.
    nDocTotal : optional
        Passed through to self._set_dependent_params.
    Xprev : array-like, optional
        Converted to a 2D float64 array when provided.
    TrueZ : array-like, optional
        True hard labels. Stored as TrueParams['Z'], and the number of
        distinct labels is stored as TrueParams['K'].
    TrueParams : dict, optional
        Each value is converted with toCArray and stored under the
        same key in self.TrueParams.
    fileNames : sequence, optional
        Source file name(s) for each group/sequence.
    summary : optional
        Stored as self.summary when not None.
    **kwargs
        Extra dataset-specific attributes. A keyword is attached to
        the instance unless its name starts with "__" or the instance
        already has an attribute of that name. Scalar values are
        stored as Python floats; scalars that cannot be converted to
        float are skipped.

    Post Condition
    ---------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.Xprev : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.doc_range : 1D array, size nDoc+1
    '''
    self.X = as2D(toCArray(X, dtype=np.float64))
    self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
    if summary is not None:
        self.summary = summary
    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))

    # Verify attributes are consistent
    self._set_dependent_params(doc_range, nDocTotal)
    self._check_dims()

    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = {
            key: toCArray(arr) for key, arr in TrueParams.items()}
    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size

    # Add optional source files for each group/sequence
    if fileNames is not None:
        # Unwrap a 1 x 1 array container to reach the entry it holds
        if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
            fileNames = fileNames[0, 0]
        if len(fileNames) > 1:
            self.fileNames = [
                str(x).strip() for x in np.squeeze(fileNames)]
        else:
            self.fileNames = [str(fileNames[0])]

    # Add extra data attributes custom for the dataset
    for key, value in kwargs.items():
        # Never overwrite an existing attribute or accept dunder keys
        if hasattr(self, key) or key.startswith("__"):
            continue
        arr = np.squeeze(as1D(value))
        if arr.shape == ():
            try:
                arr = float(arr)
            except (TypeError, ValueError):
                # float() raises ValueError (not TypeError) for
                # non-numeric strings; skip those scalars as well
                # instead of aborting construction.
                continue
        setattr(self, key, arr)
def createPrior(self, Data, nu=0, B=None, ECovMat=None, sF=1.0, **kwargs):
    ''' Initialize Prior ParamBag attribute.

    Parameters
    ----------
    Data : dataset object
        Forwarded to createECovMatFromUserInput when ECovMat must be
        built.
    nu : scalar
        Raised to at least D + 2 before being stored.
    B : array-like, optional
        Used directly (as a 2D array) when provided.
    ECovMat : array, str or None
        When B is None: a None or string value is resolved through
        createECovMatFromUserInput; any other value is used as given.
    sF : scalar
        Forwarded to createECovMatFromUserInput.

    Post Condition
    ------
    self.Prior is a ParamBag with fields 'nu' and 'B'.
    Prior expected covariance matrix set to match provided value.
    '''
    dim = self.D
    nu = np.maximum(nu, dim + 2)

    if B is not None:
        B = as2D(B)
    else:
        mustBuildECov = ECovMat is None or isinstance(ECovMat, str)
        if mustBuildECov:
            ECovMat = createECovMatFromUserInput(dim, Data, ECovMat, sF)
        # Scale the expected covariance by (nu - D - 1) to obtain B
        B = ECovMat * (nu - dim - 1)

    self.Prior = ParamBag(K=0, D=dim)
    fieldSpecs = (
        ('nu', nu, None),
        ('B', B, ('D', 'D')),
    )
    for fieldName, fieldValue, fieldDims in fieldSpecs:
        self.Prior.setField(fieldName, fieldValue, dims=fieldDims)
def SummaryAlg_cpp(initPi, transPi, SoftEv, margPrObs, fMsg, bMsg,
                   mPairIDs=None, order='C'):
    ''' Run the compiled SummaryAlg routine for a single HMM sequence.

    Implemented in C++/Eigen.

    Parameters
    ----------
    initPi, transPi, margPrObs, fMsg, bMsg : array-like
        Converted with np.asarray(..., order=order) and handed to the
        compiled library unchanged otherwise.
    SoftEv : 2D array, T x K
        Its shape defines T and K for the library call.
    mPairIDs : array-like with 2 columns, optional
        None or an empty sequence means M = 0 pairs.
    order : str
        Memory layout. Only 'C' (row-major) is supported.

    Returns
    -------
    TransStateCount : 2D array, K x K
    Htable : 2D array, K x K
    mHtable : 2D array, 2*M x K

    Raises
    ------
    ValueError
        If the compiled library is not available.
    NotImplementedError
        If order is not 'C'.
    AssertionError
        If a non-empty mPairIDs does not have exactly 2 columns.
    '''
    if not hasEigenLibReady:
        raise ValueError("Cannot find library %s. Please recompile."
                         % (libfilename))
    if order != 'C':
        raise NotImplementedError("LibFwdBwd only supports row-major order.")

    # Prep inputs
    T, K = SoftEv.shape
    initPi = np.asarray(initPi, order=order)
    transPi = np.asarray(transPi, order=order)
    SoftEv = np.asarray(SoftEv, order=order)
    margPrObs = np.asarray(margPrObs, order=order)
    fMsg = np.asarray(fMsg, order=order)
    bMsg = np.asarray(bMsg, order=order)

    if mPairIDs is None or len(mPairIDs) == 0:
        M = 0
        mPairIDs = np.zeros((0, 2))
    else:
        mPairIDs = as2D(np.asarray(mPairIDs, dtype=np.float64))
        M = mPairIDs.shape[0]
        assert mPairIDs.shape[1] == 2

    # Allocate outputs
    TransStateCount = np.zeros((K, K), order=order)
    Htable = np.zeros((K, K), order=order)
    mHtable = np.zeros((2 * M, K), order=order)

    # Execute C++ code; the three output tables allocated above are
    # passed in and are expected to be filled in-place by the library.
    lib.SummaryAlg(initPi, transPi, SoftEv, margPrObs, fMsg, bMsg,
                   TransStateCount, Htable, mPairIDs, mHtable, K, T, M)
    return TransStateCount, Htable, mHtable
def loadTopicModel(matfilepath, queryLap=None, prefix=None,
                   returnWordCounts=0, returnTPA=0,
                   normalizeTopics=0, normalizeProbs=0, **kwargs):
    ''' Load saved topic model

    Parameters
    ----------
    matfilepath : str
        Directory searched for saved output (MEDLDA "*.log_prob_w"
        files, "Lap*TopicSnapshot" entries, or a "TopicModel.mat" file).
    queryLap : optional
        Used only when prefix is None, to look up the prefix via
        getPrefixForLapQuery.
    prefix : str, optional
        Prefix identifying which saved lap to load.
    returnWordCounts : int/bool
        If true, also return the word count array.
    returnTPA : int/bool
        If true, return topics, probs, alpha instead of an HModel.
    normalizeTopics : int/bool
        If true, each row of topics is divided by its row sum.
    normalizeProbs : int/bool
        If true, probs is divided by its sum.

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    eta : 1D array (if returnTPA and 'eta' was saved)
    hmodel : HModel (if not returnTPA)
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)

    When MEDLDA files or topic snapshots are found, the result of
    loadTopicModelFromMEDLDA / loadTopicModelFromTxtFiles is returned
    directly.

    Raises
    ------
    ValueError
        If returnTPA is set and alpha is neither saved in the file nor
        available in the 'alpha' environment variable.
    '''
    if prefix is None:
        prefix, _ = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel

    if glob.glob(os.path.join(matfilepath, "*.log_prob_w")):
        return loadTopicModelFromMEDLDA(
            matfilepath, prefix, returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    if snapshotList:
        if prefix is None:
            # No prefix requested: take the last snapshot in sorted order
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            # Take the last snapshot whose path contains the prefix.
            # NOTE(review): stays None when nothing matches and is then
            # handed to loadTopicModelFromTxtFiles as-is -- confirm
            # that this is the intended failure mode.
            matches = [p for p in snapshotList if prefix in p]
            snapshotPath = matches[-1] if matches else None
        return loadTopicModelFromTxtFiles(
            snapshotPath,
            normalizeTopics=normalizeTopics,
            normalizeProbs=normalizeProbs,
            returnWordCounts=returnWordCounts,
            returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)

    if 'SparseWordCount_data' in Mdict:
        # Rebuild dense word counts from the saved sparse representation
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
        except KeyError:
            # Fall back to (row, col) fields; these are shifted down
            # by one before being used as indices.
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix(
                (data, (rowIDs, colIDs)), shape=(K, vocab_size))
        else:
            WordCounts = scipy.sparse.csr_matrix(
                (data, indices, indptr), shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K (uniform if not saved)
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            # Not saved in the file: fall back to the environment
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    # Otherwise build a full HModel from the saved parameters
    infAlg = 'VB'
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel