コード例 #1
0
    def setPostFactors(self,
                       obsModel=None,
                       SS=None,
                       LP=None,
                       Data=None,
                       lam=None,
                       WordCounts=None,
                       **kwargs):
        ''' Set attribute Post to provided values.

        Values may come from another obsModel, from suff stats SS
        (possibly computed from Data and LP), or directly from the
        raw arrays lam / WordCounts.
        '''
        self.ClearCache()

        # Copy the posterior wholesale from another observation model.
        if obsModel is not None:
            if hasattr(obsModel, 'Post'):
                self.Post = obsModel.Post.copy()
                self.K = self.Post.K
            else:
                self.setPostFromEstParams(obsModel.EstParams)
            return

        # Derive suff stats from local params when possible.
        if LP is not None and Data is not None:
            SS = self.calcSummaryStats(Data, None, LP)

        if SS is None:
            # Build Post directly from the provided arrays.
            if WordCounts is None:
                lamPost = as2D(lam)
            else:
                lamPost = as2D(WordCounts) + lam
            nComp, nDim = lamPost.shape
            self.Post = ParamBag(K=nComp, D=nDim)
            self.Post.setField('lam', lamPost, dims=('K', 'D'))
        else:
            self.updatePost(SS)
        self.K = self.Post.K
コード例 #2
0
def packParamBagForPost(nu=None,
                        beta=None,
                        m=None,
                        kappa=None,
                        D=None,
                        Post=None,
                        **kwargs):
    ''' Pack provided posterior arrays nu, beta, m, kappa into a ParamBag.

    Arrays m and beta are coerced to 2D and transposed when needed so
    their trailing axis has length D.

    Returns
    -------
    Post : ParamBag with K clusters of dimension D
    '''
    m = as2D(m)
    beta = as2D(beta)

    # Infer the dimension from m when not given explicitly.
    if D is None:
        D = m.shape[1]

    # Orient each array so columns correspond to dimensions.
    if m.shape[1] != D:
        m = m.T.copy()
    if beta.shape[1] != D:
        beta = beta.T.copy()

    nClust = m.shape[0]
    if Post is None:
        Post = ParamBag(K=nClust, D=D)
    else:
        # Reuse the caller-provided bag, but verify its sizes agree.
        assert isinstance(Post, ParamBag)
        assert Post.K == nClust
        assert Post.D == D
    Post.setField('nu', as1D(nu), dims=('K'))
    Post.setField('beta', beta, dims=('K', 'D'))
    Post.setField('m', m, dims=('K', 'D'))
    Post.setField('kappa', as1D(kappa), dims=('K'))
    return Post
コード例 #3
0
 def read_from_graphtxtfile(cls,
                            filepath,
                            nEdgesTotal=None,
                            nNodesTotal=None,
                            settingspath=None,
                            **kwargs):
     ''' Static constructor loading .graph file into GraphXData instance.

     Args
     ----
     filepath : str, path to whitespace-delimited integer text file with
         4 columns per row; columns 1 and 2 give edge endpoints, column 3
         the observation value.
     nEdgesTotal : int, optional, may be overridden by settings file.
     nNodesTotal : int, optional, may be overridden by settings file.
     settingspath : str, optional path to a "key = value" text file that
         can define N/nNodesTotal and E/nEdgesTotal.

     Returns
     -------
     Data : GraphXData
     '''
     # Optionally read total node/edge counts from a settings file with
     # lines like "N = 100" or "nEdgesTotal = 500".
     if settingspath is not None:
         with open(settingspath, 'r') as f:
             for line in f.readlines():
                 if line.count('='):
                     # NOTE(review): the comprehension variable f shadows
                     # the open file handle -- works, but confusing.
                     fields = [f.strip() for f in line.split('=')]
                     assert len(fields) == 2
                     if fields[0] == 'N' or fields[0] == 'nNodesTotal':
                         nNodesTotal = int(fields[1])
                     if fields[0] == 'E' or fields[0] == 'nEdgesTotal':
                         nEdgesTotal = int(fields[1])
     txtData = np.loadtxt(filepath, dtype=np.int32)
     assert txtData.ndim == 2
     assert txtData.shape[1] == 4
     # Columns 1, 2 hold the edge endpoints; column 3 the observation.
     edges = txtData[:, [1, 2]]
     X = as2D(txtData[:, 3])
     # Make sure X and edges have correct dims
     # (as2D yields a 1 x nEdges row; transpose it to nEdges x 1).
     if X.shape[0] != edges.shape[0]:
         X = X.T
     return cls(nNodesTotal=nNodesTotal,
                nEdgesTotal=nEdgesTotal,
                edges=edges,
                X=X)
コード例 #4
0
def packParamBagForPost(pnu_K=None,
                        ptau_K=None,
                        w_KE=None,
                        P_KEE=None,
                        Post=None,
                        **kwargs):
    ''' Parse provided array args and pack into parameter bag

    Returns
    -------
    Post : ParamBag, with K clusters
    '''
    # Coerce each argument to its expected rank.
    pnu_K = as1D(pnu_K)
    ptau_K = as1D(ptau_K)
    w_KE = as2D(w_KE)
    P_KEE = as3D(P_KEE)

    nClust = pnu_K.size
    nDimE = w_KE.shape[1]
    # Create a fresh bag, or patch a provided one that lacks dim E.
    if Post is None:
        Post = ParamBag(K=nClust, D=nDimE - 1, E=nDimE)
    elif not hasattr(Post, 'E'):
        Post.E = nDimE
    # Bag sizes must agree with the arrays being packed.
    assert Post.K == nClust
    assert Post.D == nDimE - 1
    assert Post.E == nDimE
    for fieldName, arr, dims in [
            ('pnu_K', pnu_K, ('K')),
            ('ptau_K', ptau_K, ('K')),
            ('w_KE', w_KE, ('K', 'E')),
            ('P_KEE', P_KEE, ('K', 'E', 'E'))]:
        Post.setField(fieldName, arr, dims=dims)
    return Post
コード例 #5
0
ファイル: GaussObsModel.py プロジェクト: zhaottcrystal/bnpy
    def setEstParams(self,
                     obsModel=None,
                     SS=None,
                     LP=None,
                     Data=None,
                     mu=None,
                     Sigma=None,
                     **kwargs):
        ''' Create EstParams ParamBag with fields mu, Sigma

        Values may come from another obsModel, from suff stats SS
        (possibly computed from Data and LP), or directly from the
        provided mu / Sigma arrays.
        '''
        self.ClearCache()
        # Copy point estimates wholesale from another observation model.
        if obsModel is not None:
            self.EstParams = obsModel.EstParams.copy()
            self.K = self.EstParams.K
            return

        # Derive suff stats from local params when possible.
        if LP is not None and Data is not None:
            SS = self.calcSummaryStats(Data, None, LP)

        if SS is not None:
            self.updateEstParams(SS)
        else:
            # Build EstParams directly from the raw arrays.
            Sigma = as3D(Sigma)
            K, D, D2 = Sigma.shape
            mu = as2D(mu)
            # Allow mu provided as D x K; orient it to K x D.
            if mu.shape[0] != K:
                mu = mu.T
            assert mu.shape[0] == K
            assert mu.shape[1] == D
            self.EstParams = ParamBag(K=K, D=D)
            self.EstParams.setField('mu', mu, dims=('K', 'D'))
            self.EstParams.setField('Sigma', Sigma, dims=('K', 'D', 'D'))
            # NOTE(review): self.K is updated only on this branch; when SS
            # is given, updateEstParams presumably sets self.K -- confirm,
            # since the sibling setPostFactors methods assign self.K
            # unconditionally at the end.
            self.K = self.EstParams.K
コード例 #6
0
def log_pdf_dirichlet(PiMat, alphavec):
    ''' Return scalar log probability for Dir(PiMat | alphavec)

    Each row of PiMat is treated as an independent Dirichlet draw
    with parameter vector alphavec.

    Args
    ----
    PiMat : 2D array, J x K (1D input promoted to 1 x K)
        Each row is a probability vector.
    alphavec : 1D array, size K, or scalar (broadcast to length K)

    Returns
    -------
    logp : scalar float
        sum_j log Dir(PiMat[j] | alphavec)
    '''
    # Tiny offset keeps log() finite when entries are exactly zero.
    # np.atleast_2d replaces the project helper as2D (equivalent here).
    PiMat = np.atleast_2d(PiMat + 1e-100)
    J, K = PiMat.shape
    if isinstance(alphavec, float):
        alphavec = alphavec * np.ones(K)
    elif alphavec.ndim == 0:
        alphavec = alphavec * np.ones(K)
    assert alphavec.size == K
    # Log normalization constant of a single Dirichlet draw.
    cDir = gammaln(np.sum(alphavec)) - np.sum(gammaln(alphavec))
    # BUGFIX: the normalizer appears once per ROW (J draws), not once per
    # column; the previous K * cDir was only correct for square PiMat.
    return J * cDir + np.sum(np.dot(np.log(PiMat), alphavec - 1.0))
コード例 #7
0
    def setPostFactors(self,
                       obsModel=None,
                       SS=None,
                       LP=None,
                       Data=None,
                       lam1=None,
                       lam0=None,
                       **kwargs):
        ''' Set attribute Post to provided values.

        Values may come from another obsModel, from suff stats SS
        (possibly computed from Data and LP), or directly from the
        raw pseudo-count arrays lam1 / lam0.
        '''
        self.ClearCache()

        # Copy the posterior wholesale from another observation model.
        if obsModel is not None:
            if hasattr(obsModel, 'Post'):
                self.Post = obsModel.Post.copy()
                self.K = self.Post.K
            else:
                self.setPostFromEstParams(obsModel.EstParams)
            return

        # Derive suff stats from local params when possible.
        if LP is not None and Data is not None:
            SS = self.calcSummaryStats(Data, None, LP)

        if SS is None:
            # Build Post directly from the raw arrays.
            lam1 = as2D(lam1)
            lam0 = as2D(lam0)
            # Orient arrays so the trailing axis matches self.D.
            if lam1.shape[-1] != self.D:
                if lam1.shape[0] != self.D:
                    raise ValueError("Bad dimension for lam1, lam0")
                lam1 = lam1.T.copy()
                lam0 = lam0.T.copy()
            self.Post = ParamBag(K=lam1.shape[0], D=self.D)
            self.Post.setField('lam1', lam1, dims=self.CompDims + ('D', ))
            self.Post.setField('lam0', lam0, dims=self.CompDims + ('D', ))
        else:
            self.updatePost(SS)
        self.K = self.Post.K
コード例 #8
0
ファイル: HMMUtil.py プロジェクト: zhaottcrystal/bnpy
def SummaryAlg_py(PiInit, PiMat, SoftEv, margPrObs, fMsg, bMsg,
                  mPairIDs=None):
    ''' Summarize pairwise transition statistics for one HMM sequence.

    Pure-python reference implementation.

    Args
    ----
    PiInit : 1D array, size K, initial state probabilities
    PiMat : 2D array, K x K, transition probabilities
    SoftEv : 2D array, T x K, soft evidence (likelihoods)
    margPrObs : 1D array, size T, per-step normalization constants
    fMsg : 2D array, T x K, forward messages
    bMsg : 2D array, T x K, backward messages
    mPairIDs : optional list of (kA, kB) merge candidate pairs

    Returns
    -------
    TransStateCount : 2D array, K x K, expected transition counts
    Htable : 2D array, K x K, entropy of pairwise responsibilities
    mHtable : 2D array, 2M x K, entropy terms for each merge pair
    '''
    K = PiInit.size
    T = SoftEv.shape[0]
    # Normalize merge-pair input to an M x 2 integer array (M may be 0).
    if mPairIDs is None or len(mPairIDs) == 0:
        M = 0
    else:
        mPairIDs = as2D(np.asarray(mPairIDs, dtype=np.int32))
        assert mPairIDs.ndim == 2
        assert mPairIDs.shape[1] == 2
        assert mPairIDs.shape[0] > 0
        M = mPairIDs.shape[0]
    mHtable = np.zeros((2 * M, K))

    Htable = np.zeros((K, K))
    TransStateCount = np.zeros((K, K))
    # BUGFIX: xrange is a NameError under python3; range is identical
    # here in behavior.
    for t in range(1, T):
        # Pairwise responsibilities resp(z_{t-1}, z_t) for this step.
        respPair_t = np.outer(fMsg[t - 1], bMsg[t] * SoftEv[t])
        respPair_t *= PiMat / margPrObs[t]
        TransStateCount += respPair_t

        # Accumulate entropy; the offset keeps log() finite at zero.
        respPair_t += 1e-100
        rowwiseSum = np.sum(respPair_t, axis=1)
        Htable += respPair_t * np.log(respPair_t) \
            - respPair_t * np.log(rowwiseSum)[:, np.newaxis]

    if M > 0:
        respPair = calcRespPair_fast(PiMat, SoftEv,
                                     margPrObs, fMsg, bMsg,
                                     K, T, doCopy=1)
        for m in range(M):
            kA = mPairIDs[m, 0]
            kB = mPairIDs[m, 1]
            mHtable[2 * m:2 * m + 2] = \
                calc_sub_Htable_forMergePair(respPair, kA, kB)

    Htable *= -1
    return TransStateCount, Htable, mHtable
コード例 #9
0
    def createPrior(self, Data, nu=0, B=None, ECovMat=None, sF=1.0, **kwargs):
        ''' Initialize Prior ParamBag attribute.

        Post Condition
        ------
        Prior expected covariance matrix set to match provided value.
        '''
        D = self.D
        # Degrees of freedom must exceed D + 1 so E[Cov] is defined.
        nu = np.maximum(nu, D + 2)
        if B is not None:
            B = as2D(B)
        else:
            # Derive B from the desired expected covariance matrix.
            if ECovMat is None or isinstance(ECovMat, str):
                ECovMat = createECovMatFromUserInput(D, Data, ECovMat, sF)
            B = ECovMat * (nu - D - 1)
        self.Prior = ParamBag(K=0, D=D)
        self.Prior.setField('nu', nu, dims=None)
        self.Prior.setField('B', B, dims=('D', 'D'))
コード例 #10
0
ファイル: GaussObsModel.py プロジェクト: zhaottcrystal/bnpy
    def setPostFactors(self,
                       obsModel=None,
                       SS=None,
                       LP=None,
                       Data=None,
                       nu=0,
                       B=0,
                       m=0,
                       kappa=0,
                       **kwargs):
        ''' Set attribute Post to provided values.

        Values may come from another obsModel, from suff stats SS
        (possibly computed from Data and LP), or directly from the
        raw arrays nu, B, m, kappa.
        '''
        self.ClearCache()

        # Copy the posterior wholesale from another observation model.
        if obsModel is not None:
            if hasattr(obsModel, 'Post'):
                self.Post = obsModel.Post.copy()
                self.K = self.Post.K
            else:
                self.setPostFromEstParams(obsModel.EstParams)
            return

        # Derive suff stats from local params when possible.
        if LP is not None and Data is not None:
            SS = self.calcSummaryStats(Data, None, LP)

        if SS is None:
            # Build Post directly from the raw arrays.
            m = as2D(m)
            # Allow m provided as D x K; orient it to K x D.
            if m.shape[1] != self.D:
                m = m.T.copy()
            self.Post = ParamBag(K=m.shape[0], D=self.D)
            self.Post.setField('nu', as1D(nu), dims=('K'))
            self.Post.setField('B', B, dims=('K', 'D', 'D'))
            self.Post.setField('m', m, dims=('K', 'D'))
            self.Post.setField('kappa', as1D(kappa), dims=('K'))
        else:
            self.updatePost(SS)
        self.K = self.Post.K
コード例 #11
0
    def createPrior(self,
                    Data,
                    D=None,
                    E=None,
                    nu=0,
                    B=None,
                    M=None,
                    V=None,
                    ECovMat=None,
                    sF=1.0,
                    VMat='eye',
                    sV=1.0,
                    MMat='zero',
                    sM=1.0,
                    **kwargs):
        ''' Initialize Prior ParamBag attribute.

        Builds a prior with fields nu (scalar), B (D x D),
        M (D x E), V (E x E).

        Post Condition
        ------
        Prior expected covariance matrix set to match provided value.
        '''
        # Resolve dimensions D (observations) and E (regressors) from
        # explicit args or from the dataset itself.
        if Data is None:
            if D is None:
                raise ValueError("Need to specify dimension D")
            if E is None:
                raise ValueError("Need to specify dimension E")
        if Data is not None:
            if D is None:
                D = Data.X.shape[1]
            else:
                assert D == Data.X.shape[1]
            if E is None:
                E = Data.Xprev.shape[1]
            else:
                assert E == Data.Xprev.shape[1]

        # Degrees of freedom must exceed D + 1 so E[Cov] is defined.
        nu = np.maximum(nu, D + 2)
        if B is None:
            if ECovMat is None or isinstance(ECovMat, str):
                ECovMat = createECovMatFromUserInput(D, Data, ECovMat, sF)
            B = ECovMat * (nu - D - 1)
        B = as2D(B)

        # Prior mean matrix M for the regression weights.
        if M is None:
            if MMat == 'zero':
                M = np.zeros((D, E))
            elif MMat == 'eye':
                assert D <= E
                M = sM * np.eye(D)
                # Pad with zero columns so M is exactly D x E.
                M = np.hstack([M, np.zeros((D, E - D))])
                assert M.shape == (D, E)
            else:
                raise ValueError('Unrecognized MMat: %s' % (MMat))
        else:
            M = as2D(M)

        # Prior column covariance V of the regression weights.
        if V is None:
            if VMat == 'eye':
                V = sV * np.eye(E)
            elif VMat == 'same':
                assert D == E
                # NOTE(review): ECovMat is only computed above when B is
                # None; if B was passed explicitly, ECovMat may still be
                # None or a string here -- confirm callers never combine
                # an explicit B with VMat='same'.
                V = sV * ECovMat
            else:
                raise ValueError('Unrecognized VMat: %s' % (VMat))
        else:
            V = as2D(V)

        self.Prior = ParamBag(K=0, D=D, E=E)
        self.Prior.setField('nu', nu, dims=None)
        self.Prior.setField('B', B, dims=('D', 'D'))
        self.Prior.setField('V', V, dims=('E', 'E'))
        self.Prior.setField('M', M, dims=('D', 'E'))
コード例 #12
0
ファイル: ModelWriter.py プロジェクト: zhaottcrystal/bnpy
def saveTopicModel(hmodel,
                   SS,
                   fpath,
                   prefix,
                   didExactUpdateWithSS=True,
                   tryToSparsifyOutput=False,
                   doLinkBest=False,
                   sparseEPS=0.002,
                   **kwargs):
    ''' Write TopicModel to .mat formatted file on disk.

    Args
    ----
    hmodel : HModel to serialize (Mult obsmodels get sparse counts).
    SS : SuffStatBag or None, suff stats aligned with hmodel.
    fpath : str, directory in which to create the snapshot folder.
    prefix : str, filename prefix for the snapshot folder.
    didExactUpdateWithSS : bool, True when model params are in sync with
        SS, so SS.WordCounts may be saved as the topic-word counts.
    tryToSparsifyOutput : bool, if True store thresholded counts derived
        from the posterior instead of dense topic estimates.
    doLinkBest : bool, accepted but not used in this function.
    sparseEPS : float, threshold below which counts are zeroed out.

    Post Condition
    ------
    Topic model info written to file at location
        fpath/prefixTopicModel.mat
    '''
    EstPDict = dict()

    # Active comp probabilities
    if hasattr(hmodel.allocModel, 'rho'):
        EstPDict['rho'] = hmodel.allocModel.rho
        EstPDict['omega'] = hmodel.allocModel.omega
    EstPDict['probs'] = np.asarray(hmodel.allocModel.get_active_comp_probs(),
                                   dtype=np.float32)
    if hasattr(hmodel.allocModel, 'alpha'):
        EstPDict['alpha'] = hmodel.allocModel.alpha
    if hasattr(hmodel.allocModel, 'gamma'):
        EstPDict['gamma'] = hmodel.allocModel.gamma
    # Store the prior lam; collapse to a scalar when it is symmetric.
    lamPrior = hmodel.obsModel.Prior.lam
    if np.allclose(lamPrior, lamPrior[0]):
        lamPrior = lamPrior[0]
    EstPDict['lam'] = np.asarray(lamPrior, dtype=np.float64)

    EstPDict['K'] = hmodel.obsModel.K
    EstPDict['vocab_size'] = hmodel.obsModel.D
    if SS is not None:
        if hasattr(SS, 'nDoc'):
            EstPDict['nDoc'] = SS.nDoc
        EstPDict['countvec'] = np.sum(SS.WordCounts, axis=1)
    isMult = str(type(hmodel.obsModel)).count('Mult') > 0
    # Obsmodel parameters
    # Remember, if no update has occurred,
    # then we'd be saving suff stats that are *not* in sync with model params
    if isMult and SS is not None and didExactUpdateWithSS:
        # Threshold tiny counts to zero, then store as CSR components.
        SparseWordCounts = np.asarray(SS.WordCounts, dtype=np.float32)
        SparseWordCounts[SparseWordCounts < sparseEPS] = 0
        SparseWordCounts = scipy.sparse.csr_matrix(SparseWordCounts)
        EstPDict['TopicWordCount_data'] = SparseWordCounts.data
        EstPDict['TopicWordCount_indices'] = SparseWordCounts.indices
        EstPDict['TopicWordCount_indptr'] = SparseWordCounts.indptr
    elif isMult and tryToSparsifyOutput:
        # Recover effective counts as posterior lam minus prior lam.
        effWordCount = np.asarray(hmodel.obsModel.Post.lam, dtype=np.float32)
        effWordCount -= lamPrior
        effWordCount[effWordCount < sparseEPS] = 0
        SparseWordCounts = scipy.sparse.csr_matrix(effWordCount)
        EstPDict['TopicWordCount_data'] = SparseWordCounts.data
        EstPDict['TopicWordCount_indices'] = SparseWordCounts.indices
        EstPDict['TopicWordCount_indptr'] = SparseWordCounts.indptr
    else:
        # Temporary point estimate of topic-by-word matrix
        # TODO: handle EM case where these estimates already exist
        hmodel.obsModel.setEstParamsFromPost(hmodel.obsModel.Post)
        EstPDict['topics'] = hmodel.obsModel.EstParams.phi
        delattr(hmodel.obsModel, 'EstParams')

    # Create the snapshot directory; tolerate it already existing.
    outdirpath = os.path.join(fpath, prefix + "TopicSnapshot/")
    try:
        os.mkdir(outdirpath)
    except OSError as e:
        if not str(e).count("File exists"):
            raise e

    # Write each entry to its own text file inside the snapshot dir.
    floatFmt = '%.5e'
    for key in EstPDict:
        outtxtpath = os.path.join(outdirpath, key + ".txt")
        if isinstance(EstPDict[key], np.ndarray):
            arr = EstPDict[key]
            if arr.ndim == 0 or EstPDict[key].size == 1:
                # Scalar entry: prefer integer formatting; fall back to
                # float format when int() fails or loses precision.
                val = None
                try:
                    val = int(EstPDict[key])
                    assert np.allclose(val, EstPDict[key])
                    val = '%d' % (val)
                except ValueError:
                    val = float(EstPDict[key])
                    val = floatFmt % (val)
                except AssertionError:
                    val = float(EstPDict[key])
                    val = floatFmt % (val)

                if val is None:
                    val = str(EstPDict[key])

                with open(outtxtpath, 'w') as f:
                    f.write(str(val) + "\n")
            else:
                # Index arrays stay integer-formatted; all else as floats.
                if key.count('indices') or key.count('indptr'):
                    np.savetxt(outtxtpath, as2D(arr), fmt='%d')
                else:
                    np.savetxt(outtxtpath, as2D(arr), fmt=floatFmt)
        else:
            with open(outtxtpath, 'w') as f:
                f.write(str(EstPDict[key]) + "\n")
コード例 #13
0
    def __init__(self,
                 X=None,
                 nObsTotal=None,
                 TrueZ=None,
                 Xprev=None,
                 Y=None,
                 TrueParams=None,
                 name=None,
                 summary=None,
                 dtype='auto',
                 row_names=None,
                 column_names=None,
                 y_column_names=None,
                 xprev_column_names=None,
                 do_copy=True,
                 **kwargs):
        ''' Constructor for XData instance given in-memory dense array X.

        Args
        ----
        X : 2D array, N x D observations.
        nObsTotal : int, optional full-dataset size (for minibatches).
        TrueZ : 1D array, optional true hard assignments.
        Xprev : 2D array, optional lagged observations.
        Y : 2D array, optional targets.
        dtype : numpy dtype, or 'auto' to inherit X.dtype.
        do_copy : bool, if False (and dtype matches) wrap X w/o copying.
        y_column_names, xprev_column_names : accepted but not used here.

        Post Condition
        ---------
        self.X : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        '''
        # NOTE(review): dtype='auto' reads X.dtype, so X must already be
        # an ndarray -- confirm callers never pass a plain list here.
        if dtype == 'auto':
            dtype = X.dtype
        if not do_copy and X.dtype == dtype:
            self.X = as2D(X)
        else:
            # toCArray standardizes dtype/alignment/byteorder (copies).
            self.X = as2D(toCArray(X, dtype=dtype))

        if Xprev is not None:
            self.Xprev = as2D(toCArray(Xprev, dtype=dtype))
        if Y is not None:
            self.Y = as2D(toCArray(Y, dtype=dtype))

        # Verify attributes are consistent
        self._set_dependent_params(nObsTotal=nObsTotal)
        self._check_dims(do_copy=do_copy)

        # Add optional true parameters / true hard labels
        if TrueParams is not None:
            self.TrueParams = TrueParams
        if TrueZ is not None:
            if not hasattr(self, 'TrueParams'):
                self.TrueParams = dict()
            self.TrueParams['Z'] = as1D(toCArray(TrueZ))
            self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size
        if summary is not None:
            self.summary = summary
        if name is not None:
            self.name = str(name)

        # Add optional row names
        # this line is added by Tingting
        self.row_names = np.arange(0, self.nObs, 1)

        # NOTE(review): the row_names argument is effectively ignored --
        # both branches leave self.row_names as the arange above, since
        # the assignment from the parameter is commented out. Confirm
        # this is intentional.
        if row_names is None:
            self.row_names = np.arange(0, self.nObs, 1)
            ## map(str, range(self.nObs))
        else:
            assert len(list(self.row_names)) == self.nObs
            # self.row_names = map(str, row_names)

        # Add optional column names
        # NOTE(review): under python3, map() returns a lazy iterator, so
        # column_names would not behave like a list -- confirm intended
        # python version (surrounding code uses py2 idioms like xrange).
        if column_names is None:
            self.column_names = map(lambda n: "dim_%d" % n, range(self.dim))
        else:
            assert len(column_names) == self.dim
            self.column_names = map(str, column_names)
コード例 #14
0
ファイル: GaussObsModel.py プロジェクト: zhaottcrystal/bnpy
def createECovMatFromUserInput(D=0, Data=None, ECovMat='eye', sF=1.0):
    ''' Create expected covariance matrix defining Wishart prior.

    User specifies desired type of expected covariance matrix.

    Args
    ----
    D : positive integer, size of each observation
    Data : [optional] dataset to use to make Sigma in data-driven way
    ECovMat : string name of the procedure to use to create Sigma

        * 'eye' : sF * identity matrix
        * 'covdata' : sF * sample covariance of Data.X
        * 'diagcovdata' : sF * diagonal of the sample covariance
        * 'covfirstdiff' : sF * covariance of X - Xprev
            (auto-regressive datasets only)
        * 'diagcovfirstdiff' : diagonal variant of 'covfirstdiff'
        * 'fromtruelabels' : frequency-weighted average of per-cluster
            sample covariances, using the dataset's true labels

    Returns
    -------
    Sigma : 2D array, size D x D
        Symmetric and positive definite.
    '''
    if Data is not None:
        assert D == Data.dim
    if ECovMat == 'eye':
        return sF * np.eye(D)
    if ECovMat == 'covdata':
        return sF * np.cov(Data.X.T, bias=1)
    if ECovMat == 'diagcovdata':
        # as2D handles the D=1 edge case, where np.cov returns a scalar.
        fullCov = as2D(np.cov(Data.X.T, bias=1))
        return sF * np.diag(np.diag(fullCov))
    if ECovMat == 'covfirstdiff' or ECovMat == 'diagcovfirstdiff':
        if not hasattr(Data, 'Xprev'):
            raise ValueError(
                'covfirstdiff only applies to auto-regressive datasets')
        E = Data.Xprev.shape[1]
        assert E >= D
        Xdiff = Data.X - Data.Xprev[:, :D]
        diffCov = np.cov(Xdiff.T, bias=1)
        if ECovMat == 'covfirstdiff':
            return sF * diffCov
        return sF * np.diag(np.diag(diffCov))
    if ECovMat == 'fromtruelabels':
        # Empirical-Bayes style: average the per-class sample
        # covariances, weighted by class frequency.
        if hasattr(Data, 'TrueLabels'):
            Z = Data.TrueLabels
        else:
            Z = Data.TrueParams['Z']
        Zvals = np.unique(Z)
        Kmax = len(Zvals)
        wHat = np.zeros(Kmax)
        SampleCov = np.zeros((Kmax, D, D))
        for kLoc, kVal in enumerate(Zvals):
            mask = Z == kVal
            wHat[kLoc] = np.sum(mask)
            SampleCov[kLoc] = np.cov(Data.X[mask].T, bias=1)
        wHat = wHat / np.sum(wHat)
        # Tiny ridge keeps the result positive definite.
        Sigma = 1e-8 * np.eye(D)
        for k in range(Kmax):
            Sigma += wHat[k] * SampleCov[k]
        return Sigma
    raise ValueError('Unrecognized ECovMat procedure %s' % (ECovMat))
コード例 #15
0
def loadTopicModelFromTxtFiles(snapshotPath,
                               returnTPA=False,
                               returnWordCounts=False,
                               normalizeProbs=True,
                               normalizeTopics=True,
                               **kwargs):
    ''' Load from snapshot text files.

    Reads the per-key .txt files written by saveTopicModel and
    reconstructs either the raw (topics, probs, alpha) arrays or a
    full HModel.

    Args
    ----
    snapshotPath : str, directory holding K.txt, lam.txt, etc.
    returnTPA : bool, if True return (topics, probs, alpha) tuple
        instead of an HModel.
    returnWordCounts : bool, if True also return the WordCounts array.
    normalizeProbs, normalizeTopics : bool, normalize loaded arrays
        so each sums to one.

    Returns
    -------
    hmodel
    '''
    # Load whichever scalar/array keys exist on disk; keyMap translates
    # legacy names (beta/nTopics/nTypes) to their current equivalents.
    Mdict = dict()
    possibleKeys = [
        'K', 'probs', 'alpha', 'beta', 'lam', 'gamma', 'nTopics', 'nTypes',
        'vocab_size'
    ]
    keyMap = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    for key in possibleKeys:
        try:
            arr = np.loadtxt(snapshotPath + "/%s.txt" % (key))
            if key in keyMap:
                Mdict[keyMap[key]] = arr
            else:
                Mdict[key] = arr
        except Exception:
            # Missing or unreadable keys are simply skipped.
            pass
    assert 'K' in Mdict
    assert 'lam' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])

    if os.path.exists(snapshotPath + "/topics.txt"):
        # Dense topic-word probabilities were saved directly.
        Mdict['topics'] = np.loadtxt(snapshotPath + "/topics.txt")
        Mdict['topics'] = as2D(toCArray(Mdict['topics'], dtype=np.float64))
        assert Mdict['topics'].ndim == 2
        assert Mdict['topics'].shape == (K, V)
    else:
        # Sparse topic-word counts saved as CSC or CSR components.
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(snapshotPath + "/TopicWordCount_indices.txt",
                              dtype=np.int32)
        if os.path.exists(snapshotPath + "/TopicWordCount_cscindptr.txt"):
            TWC_cscindptr = np.loadtxt(snapshotPath +
                                       "/TopicWordCount_cscindptr.txt",
                                       dtype=np.int32)
            TWC = scipy.sparse.csc_matrix((TWC_data, TWC_inds, TWC_cscindptr),
                                          shape=(K, V))
        else:
            TWC_csrindptr = np.loadtxt(snapshotPath +
                                       "/TopicWordCount_indptr.txt",
                                       dtype=np.int32)
            TWC = scipy.sparse.csr_matrix((TWC_data, TWC_inds, TWC_csrindptr),
                                          shape=(K, V))

        Mdict['WordCounts'] = TWC.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            # Posterior pseudo-counts: counts plus Dirichlet prior lam.
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            # Default to uniform when no probs were saved.
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            # Fall back to an environment-variable override.
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha

    # BUILD HMODEL FROM LOADED TXT
    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    # Presence of gamma distinguishes HDP from finite topic models.
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
コード例 #16
0
def loadTopicModel(matfilepath,
                   queryLap=None,
                   prefix=None,
                   returnWordCounts=0,
                   returnTPA=0,
                   normalizeTopics=0,
                   normalizeProbs=0,
                   **kwargs):
    ''' Load saved topic model

    Dispatches on what is present on disk: MEDLDA files, a
    TopicSnapshot directory, or a TopicModel.mat file.

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, lapQuery = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel
    # MEDLDA-format output takes priority when present.
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath,
                                        prefix,
                                        returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        # NOTE(review): prefix was already filled in above when None, so
        # this branch looks unreachable -- confirm getPrefixForLapQuery
        # can return None for prefix.
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            # Pick the snapshot directory matching the requested prefix.
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        # Rebuild the dense K x V count matrix from sparse components;
        # fall back to (i, j) coordinate arrays (1-based, from MATLAB).
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix((data, indices, indptr),
                                                 shape=(K, vocab_size))
        except KeyError:
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix((data, (rowIDs, colIDs)),
                                                 shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            # Posterior pseudo-counts: counts plus Dirichlet prior lam.
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            # Default to uniform when no probs were saved.
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            # Fall back to an environment-variable override.
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    infAlg = 'VB'
    # Presence of gamma distinguishes HDP from finite topic models.
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
コード例 #17
0
    def __init__(self,
                 edges=None,
                 X=None,
                 AdjMat=None,
                 nNodesTotal=None,
                 nEdgesTotal=None,
                 nNodes=None,
                 TrueParams=None,
                 nodeNames=None,
                 nodeZ=None,
                 **kwargs):
        ''' Construct a GraphXData object.

        Pass either a full adjacency matrix (nNodes x nNodes x D), 
        or a list of edges and associated observations.

        Args
        -----
        edges : 2D array, shape nEdges x 2
        X : 2D array, shape nEdges x D
        AdjMat : 3D array, shape nNodes x nNodes x D
            Defines adjacency matrix of desired graph.
            Assumes D=1 if 2D array specified.

        Returns
        --------
        Data : GraphXData
        '''
        self.isSparse = False
        # NOTE(review): TrueParams is assigned again at the bottom of
        # this method; this first assignment appears redundant.
        self.TrueParams = TrueParams

        # Expand a dense adjacency matrix into (edges, X) form.
        if AdjMat is not None:
            AdjMat = np.asarray(AdjMat)
            if AdjMat.ndim == 2:
                # Treat a 2D matrix as D=1.
                AdjMat = AdjMat[:, :, np.newaxis]
            nNodes = AdjMat.shape[0]
            edges = makeEdgesForDenseGraphWithNNodes(nNodes)
            X = np.zeros((edges.shape[0], AdjMat.shape[-1]))
            for eid, (i, j) in enumerate(edges):
                X[eid] = AdjMat[i, j]

        if AdjMat is None and (X is None or edges is None):
            raise ValueError(
                'Must specify adjacency matrix AdjMat, or ' +
                'a list of edges and corresponding dense observations X')

        # Create core attributes
        self.edges = toCArray(as2D(edges), dtype=np.int32)
        self.X = toCArray(as2D(X), dtype=np.float64)

        # Verify all edges are unique (raise error otherwise)
        # Encode each (src, dst) pair as a single base-N integer so
        # duplicates can be detected with one np.unique call.
        N = self.edges.max() + 1
        edgeAsBaseNInteger = self.edges[:, 0] * N + self.edges[:, 1]
        nUniqueEdges = np.unique(edgeAsBaseNInteger).size
        if nUniqueEdges < self.edges.shape[0]:
            raise ValueError("Provided edges must be unique.")

        # Discard self loops
        nonselfloopmask = self.edges[:, 0] != self.edges[:, 1]
        if np.sum(nonselfloopmask) < self.edges.shape[0]:
            self.edges = self.edges[nonselfloopmask].copy()
            self.X = self.X[nonselfloopmask].copy()

        self._set_size_attributes(nNodesTotal=nNodesTotal,
                                  nEdgesTotal=nEdgesTotal)
        self._verify_attributes()

        # Attach optional true parameters / labels / node names.
        if TrueParams is None:
            if nodeZ is not None:
                self.TrueParams = dict()
                self.TrueParams['nodeZ'] = nodeZ
        else:
            self.TrueParams = TrueParams
        if nodeNames is not None:
            self.nodeNames = nodeNames
コード例 #18
0
ファイル: HMMUtil.py プロジェクト: zhaottcrystal/bnpy
def calcLocalParams(Data, LP,
                    transTheta=None, startTheta=None,
                    limitMemoryLP=1,
                    hmm_feature_method_LP='forward+backward',
                    mPairIDs=None,
                    cslice=(0, None),
                    **kwargs):
    ''' Compute local parameters for provided dataset.

    Parameters
    ----------
    Data : dataset object with nDoc sequences; Data.doc_range[n] gives the
        start index of sequence n among the nAtom total timesteps
    LP : dict, must contain field
        * E_log_soft_ev : 2D array, nAtom x K, log soft evidence per atom
    transTheta : 2D array of transition Dirichlet pseudo-counts; column K
        (one past the active states) holds leftover mass and is included
        in the normalizing row sums
    startTheta : 1D array of start-state Dirichlet pseudo-counts,
        length >= K+1 (entry K again holds leftover mass)
    limitMemoryLP : if truthy (and not in 'forward' mode), store compact
        per-sequence summary tables instead of the full respPair array
    hmm_feature_method_LP : 'forward' computes forward messages only
    mPairIDs : optional list/array of (kA, kB) candidate merge pairs;
        entropy tables for these pairs are accumulated into mHtable
    cslice : slice bounds (unused in this function body)

    Returns
    -------
    LP : dict of local params, with fields
        * resp : 2D array, nAtom x K
        if limitMemoryLP=0:
            * respPair : 3D array, nAtom x K x K
        if limitMemoryLP=1:
            * TransCount : 3D array, nSeq x K x K

    Notes
    -----
    Mutates LP['E_log_soft_ev'] in place: the first atom of each sequence
    gets the log start-state probabilities added to it ("Line A" below).
    Because of this, the explicit start distribution passed to the
    forward-backward routines is uniform.
    '''
    # Unpack soft evidence 2D array
    logLik = LP['E_log_soft_ev']
    nAtom, K = logLik.shape

    # Calculate trans prob 2D array:
    # exp(digamma(theta_jk) - digamma(sum_l theta_jl)), the geometric-mean
    # transition probabilities under the Dirichlet posterior. The row sum
    # runs over K+1 columns so leftover mass is part of the normalizer.
    digammaSumTransTheta = digamma(np.sum(transTheta[:K, :K + 1], axis=1))
    transPi = digamma(transTheta[:K, :K]) - digammaSumTransTheta[:, np.newaxis]
    np.exp(transPi, out=transPi)

    # Calculate LOG of start state prob vector (kept in log space; folded
    # into each sequence's first-atom soft evidence below for stability)
    logstartPi = digamma(startTheta[:K]) - digamma(np.sum(startTheta[:K + 1]))

    # Set starting probs to uniform,
    # because Line A below updates first state's logLik to include logstartPi
    startPi = np.ones(K)
    logMargPr = np.empty(Data.nDoc)
    resp = np.empty((nAtom, K))

    # Unpack pairs to track for merging.
    # Normalize mPairIDs to a 2D array with M rows of (kA, kB) pairs;
    # M = 0 when no merge pairs are tracked.
    if mPairIDs is None:
        mPairIDs = np.zeros((0, 2))
        M = 0
    else:
        if len(mPairIDs) == 0:
            mPairIDs = np.zeros((0, 2))
            M = 0
        else:
            mPairIDs = as2D(mPairIDs)
            M = mPairIDs.shape[0]
    assert mPairIDs.shape[1] == 2
    if hmm_feature_method_LP == 'forward':
        # Forward-only mode: compute just the forward messages fmsg.
        fmsg = np.zeros_like(LP['E_log_soft_ev'])
        # Run forward backward algorithm on each sequence n
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            logLik_n = logLik[start:stop]
            # Adding in start state probs, in log space for stability.
            # NOTE: in-place update of LP['E_log_soft_ev'] via this view.
            logLik_n[0] += logstartPi

            PiInit, PiMat, K = _parseInput_TransParams(startPi, transPi)
            logSoftEv = _parseInput_SoftEv(logLik_n, K)
            T = logSoftEv.shape[0]
            SoftEv, lognormC = expLogLik(logSoftEv)
            fmsg_n, margPrObs = FwdAlg(PiInit, PiMat, SoftEv)
            if not np.all(np.isfinite(margPrObs)):
                raise ValueError('NaN values found. Numerical badness!')
            fmsg[start:stop] = fmsg_n
        LP['fmsg'] = fmsg

    elif limitMemoryLP:
        # Track sufficient statistics directly at each sequence.
        # Per-sequence K x K tables replace the full nAtom x K x K respPair.
        TransCount = np.empty((Data.nDoc, K, K))
        Htable = np.empty((Data.nDoc, K, K))
        mHtable = np.zeros((2 * M, K))

        # Run forward backward algorithm on each sequence n
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            logLik_n = logLik[start:stop]
            # Adding in start state probs, in log space for stability.
            logLik_n[0] += logstartPi  # Line A

            # Run fwd-fwd alg and record result.
            resp_n, lp_n, TransCount_n, Htable_n, mHtable_n = \
                FwdBwdAlg_LimitMemory(startPi, transPi, logLik_n, mPairIDs)
            resp[start:stop] = resp_n
            logMargPr[n] = lp_n
            TransCount[n] = TransCount_n
            Htable[n] = Htable_n
            mHtable += mHtable_n  # accumulated across sequences

        LP['resp'] = resp
        LP['evidence'] = np.sum(logMargPr)
        LP['TransCount'] = TransCount
        LP['Htable'] = Htable
        LP['mHtable'] = mHtable
    else:
        # Track pair-wise assignment probs for each sequence
        respPair = np.empty((nAtom, K, K))

        # Run the forward backward algorithm on each sequence
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            logLik_n = logLik[start:stop]
            # Adding in start state probs, in log space for stability.
            logLik_n[0] += logstartPi  # Line A

            resp_n, respPair_n, lp_n = \
                FwdBwdAlg(startPi, transPi, logLik_n)
            resp[start:stop] = resp_n
            respPair[start:stop] = respPair_n
            logMargPr[n] = lp_n

        LP['evidence'] = np.sum(logMargPr)
        LP['resp'] = resp
        LP['respPair'] = respPair
    # ... end if statement on limitMemoryLP

    return LP
# ===== Code example #19 (0 votes) =====
# File: GroupXData.py — project: zhaottcrystal/bnpy
    def __init__(self,
                 X=None,
                 doc_range=None,
                 nDocTotal=None,
                 Xprev=None,
                 TrueZ=None,
                 TrueParams=None,
                 fileNames=None,
                 summary=None,
                 **kwargs):
        ''' Create an instance of GroupXData for provided array X

        Post Condition
        ---------
        self.X : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.Xprev : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.doc_range : 1D array, size nDoc+1
        '''
        # Standardize core arrays (dtype, alignment, byteorder).
        self.X = as2D(toCArray(X, dtype=np.float64))
        self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
        if summary is not None:
            self.summary = summary
        if Xprev is not None:
            self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))

        # Derive size attributes, then verify shapes are consistent.
        self._set_dependent_params(doc_range, nDocTotal)
        self._check_dims()

        # Optional ground-truth parameters: store a standardized copy.
        if TrueParams is not None:
            self.TrueParams = dict(
                (key, toCArray(arr)) for key, arr in TrueParams.items())

        # Optional ground-truth hard labels for each atom.
        if TrueZ is not None:
            if not hasattr(self, 'TrueParams'):
                self.TrueParams = dict()
            trueZarr = as1D(toCArray(TrueZ))
            self.TrueParams['Z'] = trueZarr
            self.TrueParams['K'] = np.unique(trueZarr).size

        # Optional source-file name for each group/sequence.
        if fileNames is not None:
            # Unwrap a 1x1 object array (e.g. loaded from a .mat file).
            if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
                fileNames = fileNames[0, 0]
            if len(fileNames) > 1:
                self.fileNames = [str(name).strip()
                                  for name in np.squeeze(fileNames)]
            else:
                self.fileNames = [str(fileNames[0])]

        # Attach any extra dataset-specific attributes passed via kwargs,
        # skipping names already taken and dunder-style keys.
        for key in kwargs:
            if hasattr(self, key) or key.startswith("__"):
                continue
            val = np.squeeze(as1D(kwargs[key]))
            if val.shape == ():
                try:
                    val = float(val)
                except TypeError:
                    # Non-numeric scalar: silently skip, as before.
                    continue
            setattr(self, key, val)