def loadDictFromMatfile(matfilepath):
    ''' Load dict of numpy arrays from a .mat-format file on disk.

    This is a wrapper around scipy.io.loadmat which converts the returned
    numpy arrays into a standard aligned format.

    Returns
    -------
    D : dict
        Each key/value pair is a parameter name and a numpy array
        loaded from the provided mat file.
        We ensure before returning that each array has properties:
        * C alignment
        * Original 2D shape has been squeezed as much as possible
            * (1,1) becomes a 0-dimensional (scalar) array
            * (1,N) or (N,1) become 1D arrays
        * flags.aligned is True
        * flags.owndata is True
        * dtype.byteorder is '='

    Examples
    --------
    >>> import scipy.io
    >>> Dorig = dict(scalar=5, scalar1DN1=np.asarray([3.14,]))
    >>> Dorig['arr1DN3'] = np.asarray([1,2,3])
    >>> scipy.io.savemat('Dorig.mat', Dorig, oned_as='row')
    >>> D = loadDictFromMatfile('Dorig.mat')
    >>> D['scalar']
    array(5)
    >>> D['scalar1DN1']
    array(3.14)
    >>> D['arr1DN3']
    array([1, 2, 3])
    '''
    Dtmp = scipy.io.loadmat(matfilepath)
    D = dict([x for x in Dtmp.items() if not x[0].startswith('__')])
    for key in D:
        if not isinstance(D[key], np.ndarray):
            continue
        x = D[key]
        # np.str_ replaces np.unicode_, which was removed in NumPy 2.0
        if isinstance(x[0], np.str_):
            if x.size == 1:
                D[key] = str(x[0])
            else:
                D[key] = tuple([str(s) for s in x])
            continue
        if x.ndim == 2:
            x = np.squeeze(x)
        if str(x.dtype).count('int'):
            arr = toCArray(x, dtype=np.int32)
        else:
            arr = toCArray(x, dtype=np.float64)
        assert arr.dtype.byteorder == '='
        assert arr.flags.aligned is True
        assert arr.flags.owndata is True
        D[key] = arr
    return D
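
# Minimal round-trip sketch for loadDictFromMatfile. Illustrative only:
# the file path and keys below are hypothetical, and this assumes np,
# scipy.io, and toCArray are importable at module level, as above.
def _demo_loadDictFromMatfile(tmpfilepath='DemoDict.mat'):
    Dorig = dict(weights=np.eye(3), bias=np.asarray([[0.5, 1.5]]))
    scipy.io.savemat(tmpfilepath, Dorig, oned_as='row')
    D = loadDictFromMatfile(tmpfilepath)
    assert D['weights'].flags.owndata and D['weights'].flags.aligned
    assert D['bias'].ndim == 1  # (1, N) row vector squeezed to 1D
    return D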
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    -------
    SS : SuffStatBag object, with K components.
    '''
    if not hasattr(Data, 'X_NE'):
        Data.X_NE = np.hstack([Data.X, np.ones(Data.nObs)[:, np.newaxis]])

    Y_N = Data.Y
    X_NE = Data.X_NE
    E = X_NE.shape[1]

    if 'resp' in LP:
        # Dense responsibility calculations
        resp = LP['resp']
        K = resp.shape[1]
        S_yy_K = dotATB(resp, np.square(Y_N)).flatten()
        S_yx_KE = dotATB(resp, Y_N * X_NE)
        # Expected outer product
        S_xxT_KEE = np.zeros((K, E, E))
        sqrtResp_k_N = np.sqrt(resp[:, 0])
        sqrtR_X_k_NE = sqrtResp_k_N[:, np.newaxis] * X_NE
        S_xxT_KEE[0] = dotATA(sqrtR_X_k_NE)
        for k in range(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k_N)
            np.multiply(sqrtResp_k_N[:, np.newaxis], X_NE, out=sqrtR_X_k_NE)
            S_xxT_KEE[k] = dotATA(sqrtR_X_k_NE)
    else:
        # Sparse responsibilities (LP['spR']) are not yet supported here.
        raise NotImplementedError(
            "Sparse responsibility summaries not implemented " +
            "for this observation model.")

    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim, E=E)
    elif not hasattr(SS, 'E'):
        SS._Fields.E = E
    SS.setField('xxT_KEE', S_xxT_KEE, dims=('K', 'E', 'E'))
    SS.setField('yx_KE', S_yx_KE, dims=('K', 'E'))
    SS.setField('yy_K', S_yy_K, dims='K')
    # Expected count for each k
    # Usually computed by allocmodel. But just in case...
    # Only the dense path reaches here, so sum the dense responsibilities.
    if not hasattr(SS, 'N'):
        SS.setField('N', LP['resp'].sum(axis=0), dims='K')
    return SS
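
# Usage sketch for the regression summary statistics defined directly
# above. Illustrative only: _ToyRegrData is a hypothetical stand-in for a
# bnpy XData-style object with X, Y, nObs, and dim attributes, and this
# assumes SuffStatBag, dotATB, and dotATA are importable as above.
def _demo_calcSummaryStats_regression():
    class _ToyRegrData(object):
        pass
    Data = _ToyRegrData()
    Data.X = np.linspace(0, 1, 8)[:, np.newaxis]   # N=8 inputs, D=1
    Data.Y = 2.0 * Data.X + 0.1                    # noiseless line outputs
    Data.nObs, Data.dim = Data.X.shape
    resp = np.full((8, 2), 0.5)                    # uniform K=2 soft resp.
    SS = calcSummaryStats(Data, None, dict(resp=resp))
    # E = D + 1 because a constant bias column is appended to X.
    assert SS.xxT_KEE.shape == (2, 2, 2)
    return SS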
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    -------
    SS : SuffStatBag object, with K components.
    '''
    X = Data.X
    D = Data.dim
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # Compute expected outer-product statistic,
        # reusing per-component buffers to avoid K full-size temporaries
        S_xxT = np.zeros((K, D, D))
        sqrtResp_k = np.sqrt(resp[:, 0])
        sqrtRX_k = sqrtResp_k[:, np.newaxis] * X
        S_xxT[0] = dotATA(sqrtRX_k)
        for k in range(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k)
            np.multiply(sqrtResp_k[:, np.newaxis], X, out=sqrtRX_k)
            S_xxT[k] = dotATA(sqrtRX_k)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # Compute expected outer-product statistic
        S_xxT = calcSpRXXT(X=X, spR_csr=spR)

    if SS is None:
        SS = SuffStatBag(K=K, D=D)
    # Expected outer-product for each state k
    SS.setField('xxT', S_xxT, dims=('K', 'D', 'D'))
    # Expected count for each k
    # Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
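
# Usage sketch for the zero-mean Gaussian summary statistics defined
# directly above. Illustrative only: _ToyData is a hypothetical stand-in
# for a bnpy XData object with X and dim attributes.
def _demo_calcSummaryStats_zeromean():
    class _ToyData(object):
        pass
    Data = _ToyData()
    Data.X = np.random.RandomState(0).randn(10, 3)  # N=10, D=3
    Data.dim = Data.X.shape[1]
    resp = np.ones((10, 1))                         # all mass on one state
    SS = calcSummaryStats(Data, None, dict(resp=resp))
    # With a single component holding all responsibility, xxT is X^T X.
    assert np.allclose(SS.xxT[0], np.dot(Data.X.T, Data.X))
    return SS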
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    -------
    SS : SuffStatBag object, with K components.
    '''
    X = Data.X
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # 1/2: Compute mean statistic
        S_x = dotATB(resp, X)
        # 2/2: Compute expected outer-product statistic
        S_xx = calcRXX_withDenseResp(resp, X)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # 1/2: Compute mean statistic
        S_x = spR.T * X
        # 2/2: Compute expected outer-product statistic
        S_xx = calcSpRXX(X=X, spR_csr=spR)
    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim)
    # Expected mean for each state k
    SS.setField('x', S_x, dims=('K', 'D'))
    # Expected sum-of-squares for each state k
    SS.setField('xx', S_xx, dims=('K', 'D'))
    # Expected count for each k
    # Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
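
# Usage sketch for the Gaussian summary statistics defined directly above.
# Illustrative only: _ToyData is a hypothetical stand-in for bnpy's XData,
# and this assumes dotATB and calcRXX_withDenseResp are importable as above.
def _demo_calcSummaryStats_gauss():
    class _ToyData(object):
        pass
    Data = _ToyData()
    Data.X = np.asarray([[1.0, 2.0], [3.0, 4.0]])  # N=2, D=2
    Data.dim = 2
    resp = np.asarray([[1.0, 0.0], [0.0, 1.0]])    # hard assignments, K=2
    SS = calcSummaryStats(Data, None, dict(resp=resp))
    # Each row of x is the responsibility-weighted sum of observations,
    # so hard one-to-one assignments recover the rows of X exactly.
    assert np.allclose(SS.x, Data.X)
    assert np.allclose(SS.N, [1.0, 1.0])
    return SS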
def loadTopicModelFromTxtFiles(snapshotPath, returnTPA=False,
                               returnWordCounts=False,
                               normalizeProbs=True,
                               normalizeTopics=True,
                               **kwargs):
    ''' Load from snapshot text files.

    Returns
    -------
    hmodel
    '''
    Mdict = dict()
    possibleKeys = [
        'K', 'probs', 'alpha', 'beta', 'lam', 'gamma',
        'nTopics', 'nTypes', 'vocab_size']
    keyMap = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    for key in possibleKeys:
        try:
            arr = np.loadtxt(snapshotPath + "/%s.txt" % (key))
            if key in keyMap:
                Mdict[keyMap[key]] = arr
            else:
                Mdict[key] = arr
        except Exception:
            pass
    assert 'K' in Mdict
    assert 'lam' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])
    if os.path.exists(snapshotPath + "/topics.txt"):
        Mdict['topics'] = np.loadtxt(snapshotPath + "/topics.txt")
        Mdict['topics'] = as2D(toCArray(Mdict['topics'], dtype=np.float64))
        assert Mdict['topics'].ndim == 2
        assert Mdict['topics'].shape == (K, V)
    else:
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(
            snapshotPath + "/TopicWordCount_indices.txt", dtype=np.int32)
        if os.path.exists(snapshotPath + "/TopicWordCount_cscindptr.txt"):
            TWC_cscindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_cscindptr.txt",
                dtype=np.int32)
            TWC = scipy.sparse.csc_matrix(
                (TWC_data, TWC_inds, TWC_cscindptr), shape=(K, V))
        else:
            TWC_csrindptr = np.loadtxt(
                snapshotPath + "/TopicWordCount_indptr.txt", dtype=np.int32)
            TWC = scipy.sparse.csr_matrix(
                (TWC_data, TWC_inds, TWC_csrindptr), shape=(K, V))
        Mdict['WordCounts'] = TWC.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]
        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)
        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha
    # BUILD HMODEL FROM LOADED TXT
    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
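
# Usage sketch for loadTopicModelFromTxtFiles. Illustrative only: the
# snapshotPath argument is a hypothetical directory written by a bnpy
# training run, containing K.txt, lam.txt, vocab_size.txt, and either
# topics.txt or sparse TopicWordCount_*.txt files.
def _demo_loadTopicModelFromTxtFiles(snapshotPath):
    topics, probs, alpha = loadTopicModelFromTxtFiles(
        snapshotPath, returnTPA=True,
        normalizeTopics=True, normalizeProbs=True)
    assert np.allclose(topics.sum(axis=1), 1.0)  # rows are distributions
    assert np.allclose(probs.sum(), 1.0)
    return topics, probs, alpha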
def loadTopicModel(matfilepath, queryLap=None, prefix=None,
                   returnWordCounts=0, returnTPA=0,
                   normalizeTopics=0, normalizeProbs=0, **kwargs):
    ''' Load saved topic model

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, lapQuery = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath, prefix,
                                        returnTPA=returnTPA)
    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(
            snapshotPath,
            normalizeTopics=normalizeTopics,
            normalizeProbs=normalizeProbs,
            returnWordCounts=returnWordCounts,
            returnTPA=returnTPA)
    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix(
                (data, indices, indptr), shape=(K, vocab_size))
        except KeyError:
            # Fall back to 1-indexed (i, j) coordinate format
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix(
                (data, (rowIDs, colIDs)), shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]
        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)
        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha
    infAlg = 'VB'
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
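
# Usage sketch for loadTopicModel. Illustrative only: taskpath below is a
# hypothetical bnpy output directory containing Lap*TopicModel.mat or
# Lap*TopicSnapshot entries.
def _demo_loadTopicModel(taskpath, queryLap=None):
    hmodel = loadTopicModel(taskpath, queryLap=queryLap)
    # Or fetch the raw (topics, probs, alpha) parameterization instead.
    # Models saved with an 'eta' field return a 4-tuple, so slice the
    # first three entries to stay robust to either case.
    out = loadTopicModel(
        taskpath, queryLap=queryLap, returnTPA=1,
        normalizeTopics=1, normalizeProbs=1)
    topics, probs, alpha = out[0], out[1], out[2]
    assert topics.shape[0] == probs.size
    return hmodel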
def __init__(self, edges=None, X=None, AdjMat=None,
             nNodesTotal=None, nEdgesTotal=None, nNodes=None,
             TrueParams=None, nodeNames=None, nodeZ=None,
             **kwargs):
    ''' Construct a GraphXData object.

    Pass either a full adjacency matrix (nNodes x nNodes x D),
    or a list of edges and associated observations.

    Args
    ----
    edges : 2D array, shape nEdges x 2
    X : 2D array, shape nEdges x D
    AdjMat : 3D array, shape nNodes x nNodes x D
        Defines adjacency matrix of desired graph.
        Assumes D=1 if 2D array specified.

    Returns
    -------
    Data : GraphXData
    '''
    self.isSparse = False
    self.TrueParams = TrueParams

    if AdjMat is not None:
        AdjMat = np.asarray(AdjMat)
        if AdjMat.ndim == 2:
            AdjMat = AdjMat[:, :, np.newaxis]
        nNodes = AdjMat.shape[0]
        edges = makeEdgesForDenseGraphWithNNodes(nNodes)
        X = np.zeros((edges.shape[0], AdjMat.shape[-1]))
        for eid, (i, j) in enumerate(edges):
            X[eid] = AdjMat[i, j]

    if AdjMat is None and (X is None or edges is None):
        raise ValueError(
            'Must specify adjacency matrix AdjMat, or ' +
            'a list of edges and corresponding dense observations X')

    # Create core attributes
    self.edges = toCArray(as2D(edges), dtype=np.int32)
    self.X = toCArray(as2D(X), dtype=np.float64)

    # Verify all edges are unique (raise error otherwise)
    N = self.edges.max() + 1
    edgeAsBaseNInteger = self.edges[:, 0] * N + self.edges[:, 1]
    nUniqueEdges = np.unique(edgeAsBaseNInteger).size
    if nUniqueEdges < self.edges.shape[0]:
        raise ValueError("Provided edges must be unique.")

    # Discard self loops
    nonselfloopmask = self.edges[:, 0] != self.edges[:, 1]
    if np.sum(nonselfloopmask) < self.edges.shape[0]:
        self.edges = self.edges[nonselfloopmask].copy()
        self.X = self.X[nonselfloopmask].copy()

    self._set_size_attributes(nNodesTotal=nNodesTotal,
                              nEdgesTotal=nEdgesTotal)
    self._verify_attributes()

    if TrueParams is None:
        if nodeZ is not None:
            self.TrueParams = dict()
            self.TrueParams['nodeZ'] = nodeZ
    else:
        self.TrueParams = TrueParams
    if nodeNames is not None:
        self.nodeNames = nodeNames
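
# Usage sketch: build a small dense graph from a binary adjacency matrix.
# Illustrative only; assumes the GraphXData class this constructor belongs
# to is defined and importable.
def _demo_GraphXData():
    AdjMat = np.asarray([
        [0, 1, 0],
        [0, 0, 1],
        [1, 0, 0]])
    Data = GraphXData(AdjMat=AdjMat)
    # Self loops are discarded, so 3 nodes yield at most 6 directed edges,
    # each carrying a D=1 observation read off the adjacency matrix.
    assert Data.X.shape[1] == 1
    return Data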
def __init__(self, X=None, doc_range=None, nDocTotal=None,
             Xprev=None, TrueZ=None,
             TrueParams=None, fileNames=None, summary=None,
             **kwargs):
    ''' Create an instance of GroupXData for provided array X

    Post Condition
    --------------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.Xprev : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    self.doc_range : 1D array, size nDoc+1
    '''
    self.X = as2D(toCArray(X, dtype=np.float64))
    self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
    if summary is not None:
        self.summary = summary
    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))

    # Verify attributes are consistent
    self._set_dependent_params(doc_range, nDocTotal)
    self._check_dims()

    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = dict()
        for key, arr in TrueParams.items():
            self.TrueParams[key] = toCArray(arr)

    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size

    # Add optional source files for each group/sequence
    if fileNames is not None:
        if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
            fileNames = fileNames[0, 0]
        if len(fileNames) > 1:
            self.fileNames = [str(x).strip()
                              for x in np.squeeze(fileNames)]
        else:
            self.fileNames = [str(fileNames[0])]

    # Add extra data attributes custom for the dataset
    for key in kwargs:
        if hasattr(self, key):
            continue
        if not key.startswith("__"):
            arr = np.squeeze(as1D(kwargs[key]))
            if arr.shape == ():
                try:
                    arr = float(arr)
                except TypeError:
                    continue
            setattr(self, key, arr)
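
# Usage sketch: three documents of unequal length packed into one dataset.
# Illustrative only; assumes the GroupXData class this constructor belongs
# to is defined and importable.
def _demo_GroupXData():
    X = np.random.RandomState(0).randn(6, 2)  # N=6 total observations
    doc_range = np.asarray([0, 2, 3, 6])      # docs of size 2, 1, and 3
    Data = GroupXData(X=X, doc_range=doc_range)
    assert Data.doc_range.size == 3 + 1       # nDoc + 1 boundary entries
    return Data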
def __init__(self, X=None, nObsTotal=None, TrueZ=None,
             Xprev=None, Y=None,
             TrueParams=None, name=None, summary=None,
             dtype='auto',
             row_names=None,
             column_names=None,
             y_column_names=None,
             xprev_column_names=None,
             do_copy=True,
             **kwargs):
    ''' Constructor for XData instance given in-memory dense array X.

    Post Condition
    --------------
    self.X : 2D array, size N x D
        with standardized dtype, alignment, byteorder.
    '''
    if dtype == 'auto':
        dtype = X.dtype
    if not do_copy and X.dtype == dtype:
        self.X = as2D(X)
    else:
        self.X = as2D(toCArray(X, dtype=dtype))
    if Xprev is not None:
        self.Xprev = as2D(toCArray(Xprev, dtype=dtype))
    if Y is not None:
        self.Y = as2D(toCArray(Y, dtype=dtype))

    # Verify attributes are consistent
    self._set_dependent_params(nObsTotal=nObsTotal)
    self._check_dims(do_copy=do_copy)

    # Add optional true parameters / true hard labels
    if TrueParams is not None:
        self.TrueParams = TrueParams
    if TrueZ is not None:
        if not hasattr(self, 'TrueParams'):
            self.TrueParams = dict()
        self.TrueParams['Z'] = as1D(toCArray(TrueZ))
        self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size
    if summary is not None:
        self.summary = summary
    if name is not None:
        self.name = str(name)

    # Add optional row names
    if row_names is None:
        self.row_names = np.arange(0, self.nObs, 1)
    else:
        assert len(list(row_names)) == self.nObs
        self.row_names = [str(x) for x in row_names]

    # Add optional column names
    if column_names is None:
        self.column_names = ["dim_%d" % n for n in range(self.dim)]
    else:
        assert len(column_names) == self.dim
        self.column_names = [str(c) for c in column_names]
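
# Usage sketch: wrap a dense array as XData with custom column names.
# Illustrative only; assumes the XData class this constructor belongs to
# is defined and importable, with nObs and dim set by
# _set_dependent_params as above.
def _demo_XData():
    X = np.asarray([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    Data = XData(X=X, column_names=['height', 'weight'])
    assert Data.nObs == 3 and Data.dim == 2
    assert Data.column_names == ['height', 'weight']
    return Data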