Esempio n. 1
0
def calcSummaryStats(Data, LP,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    ''' Calculate summary statistics for given data slice and local params.

    Returns
    -------
    SS : SuffStatBag
    '''
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)

    resp = LP['resp']
    K = resp.shape[1]
    startLocIDs = Data.doc_range[:-1]
    StartStateCount = np.sum(resp[startLocIDs], axis=0)
    N = np.sum(resp, axis=0)

    if 'TransCount' in LP:
        TransStateCount = np.sum(LP['TransCount'], axis=0)
    else:
        respPair = LP['respPair']
        TransStateCount = np.sum(respPair, axis=0)

    SS = SuffStatBag(K=K, D=Data.dim, M=M)
    SS.setField('StartStateCount', StartStateCount, dims=('K'))
    SS.setField('TransStateCount', TransStateCount, dims=('K', 'K'))
    SS.setField('N', N, dims=('K'))
    SS.setField('nDoc', Data.nDoc, dims=None)

    if doPrecompEntropy or 'Htable' in LP:
        # Compute entropy terms!
        # 'Htable', 'Hstart' will both be in Mdict
        Mdict = calcELBO_NonlinearTerms(Data=Data,
                                        LP=LP, returnMemoizedDict=1)
        SS.setELBOTerm('Htable', Mdict['Htable'], dims=('K', 'K'))
        SS.setELBOTerm('Hstart', Mdict['Hstart'], dims=('K'))

    if doPrecompMergeEntropy:
        subHstart, subHtable = HMMUtil.PrecompMergeEntropy_SpecificPairs(
            LP, Data, mPairIDs)
        SS.setMergeTerm('Hstart', subHstart, dims=('M'))
        SS.setMergeTerm('Htable', subHtable, dims=('M', 2, 'K'))
        SS.mPairIDs = np.asarray(mPairIDs)

    if trackDocUsage:
        # Track how often topic appears in a seq. with mass > thresh.
        DocUsage = np.zeros(K)
        for n in xrange(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            DocUsage += np.sum(LP['resp'][start:stop], axis=0) > 0.01
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
    return SS
Esempio n. 2
0
def calcSummaryStats(Data,
                     LP,
                     doPrecompEntropy=False,
                     doPrecompMergeEntropy=False,
                     mPairIDs=None,
                     mergePairSelection=None,
                     trackDocUsage=False,
                     **kwargs):
    ''' Calculate sufficient statistics for global updates.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        for certain merge candidates.

    Returns
    -------
    SS : SuffStatBag with K components
        Summarizes for this mixture model, with fields
        * N : 1D array, size K
            N[k] = expected number of items assigned to comp k

        Also has optional ELBO field when precompELBO is True
        * ElogqZ : 1D array, size K
            Vector of entropy contributions from each comp.
            ElogqZ[k] = \sum_{n=1}^N resp[n,k] log resp[n,k]

        Also has optional Merge field when precompMergeELBO is True
        * ElogqZ : 2D array, size K x K
            Each term is scalar entropy of merge candidate
    '''
    if mPairIDs is not None and len(mPairIDs) > 0:
        M = len(mPairIDs)
    else:
        M = 0
    if 'resp' in LP:
        Nvec = np.sum(LP['resp'], axis=0)
        K = Nvec.size
    else:
        # Sparse assignment case
        Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        K = LP['spR'].shape[1]

    if hasattr(Data, 'dim'):
        SS = SuffStatBag(K=K, D=Data.dim, M=M)
    else:
        SS = SuffStatBag(K=K, D=Data.vocab_size, M=M)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(LP=LP, returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))

    if doPrecompMergeEntropy:
        m_Hresp = None
        if 'resp' in LP:
            m_Hresp = -1 * NumericUtil.calcRlogR_specificpairs(
                LP['resp'], mPairIDs)
        elif 'spR' in LP:
            if LP['nnzPerRow'] > 1:
                m_Hresp = calcSparseMergeRlogR(spR_csr=LP['spR'],
                                               nnzPerRow=LP['nnzPerRow'],
                                               mPairIDs=mPairIDs)
        else:
            raise ValueError("Need resp or spR in LP")
        if m_Hresp is not None:
            assert m_Hresp.size == len(mPairIDs)
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))
    if trackDocUsage:
        Usage = np.sum(LP['resp'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', Usage, dims='K')

    return SS
Esempio n. 3
0
def calcSummaryStats(Dslice,
                     LP=None,
                     alpha=None,
                     alphaEbeta=None,
                     doTrackTruncationGrowth=0,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mergePairSelection=None,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    """ Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)

    Returns
    -------
    SS : SuffStatBag with K components
        Relevant fields
        * nDoc : scalar float
            Counts total documents available in provided data.
        * sumLogPi : 1D array, size K
            Entry k equals \sum_{d in docs} E[ \log \pi_{dk} ]
        * sumLogPiRem : scalar float
            Equals sum over docs of probability of inactive topics.

        Also has optional ELBO field when precompELBO is True
        * Hvec : 1D array, size K
            Vector of entropy contributions from each comp.
            Hvec[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
    """
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)
    K = LP['DocTopicCount'].shape[1]
    if 'digammaSumTheta' not in LP:
        digammaSumTheta = digamma(LP['theta'].sum(axis=1) + LP['thetaRem'])
        LP['digammaSumTheta'] = digammaSumTheta  # Used for merges

    if 'ElogPi' not in LP:
        LP['ElogPiRem'] = digamma(LP['thetaRem']) - LP['digammaSumTheta']
        LP['ElogPi'] = digamma(LP['theta']) - \
            LP['digammaSumTheta'][:, np.newaxis]

    SS = SuffStatBag(K=K, D=Dslice.dim, M=M)
    SS.setField('nDoc', Dslice.nDoc, dims=None)
    SS.setField('sumLogPi', np.sum(LP['ElogPi'], axis=0), dims='K')
    if 'ElogPiEmptyComp' in LP:
        sumLogPiEmptyComp = np.sum(LP['ElogPiEmptyComp']) - \
            np.sum(LP['ElogPiOrigComp'])
        SS.setField('sumLogPiEmptyComp', sumLogPiEmptyComp, dims=None)
    if doTrackTruncationGrowth:
        remvec = np.zeros(K)
        remvec[K - 1] = np.sum(LP['ElogPiRem'])
        SS.setField('sumLogPiRemVec', remvec, dims='K')
    else:
        SS.setField('sumLogPiRem', np.sum(LP['ElogPiRem']), dims=None)

    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(Data=Dslice,
                                        LP=LP,
                                        returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))
        SS.setELBOTerm('slackTheta', Mdict['slackTheta'], dims='K')
        SS.setELBOTerm('gammalnTheta', Mdict['gammalnTheta'], dims='K')
        if 'ElogPiEmptyComp' in LP:
            SS.setELBOTerm('slackThetaEmptyComp', Mdict['slackThetaEmptyComp'])
            SS.setELBOTerm('gammalnThetaEmptyComp',
                           Mdict['gammalnThetaEmptyComp'])
            SS.setELBOTerm('HrespEmptyComp', Mdict['HrespEmptyComp'])

        else:
            SS.setELBOTerm('gammalnSumTheta',
                           Mdict['gammalnSumTheta'],
                           dims=None)
            SS.setELBOTerm('slackThetaRem', Mdict['slackThetaRem'], dims=None)
            SS.setELBOTerm('gammalnThetaRem',
                           Mdict['gammalnThetaRem'].sum(),
                           dims=None)

    if doPrecompMergeEntropy:
        if mPairIDs is None:
            raise NotImplementedError("TODO: all pairs for merges")
        m_Hresp = calcHrespForSpecificMergePairs(LP, Dslice, mPairIDs)
        if m_Hresp is not None:
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))

        m_sumLogPi = np.zeros(M)
        m_gammalnTheta = np.zeros(M)
        m_slackTheta = np.zeros(M)
        for m, (kA, kB) in enumerate(mPairIDs):
            theta_vec = LP['theta'][:, kA] + LP['theta'][:, kB]
            ElogPi_vec = digamma(theta_vec) - LP['digammaSumTheta']
            m_gammalnTheta[m] = np.sum(gammaln(theta_vec))
            m_sumLogPi[m] = np.sum(ElogPi_vec)
            # slack = (Ndm - theta_dm) * E[log pi_dm]
            slack_vec = ElogPi_vec
            slack_vec *= -1 * (alphaEbeta[kA] + alphaEbeta[kB])
            m_slackTheta[m] = np.sum(slack_vec)
        SS.setMergeTerm('gammalnTheta', m_gammalnTheta, dims=('M'))
        SS.setMergeTerm('sumLogPi', m_sumLogPi, dims=('M'))
        SS.setMergeTerm('slackTheta', m_slackTheta, dims=('M'))

        # Uncomment this for verification of merge calculations.
        # for (kA, kB) in mPairIDs:
        #      self.verifySSForMergePair(Data, SS, LP, kA, kB)
        # .... end merge computations

    # Selection terms (computes doc-topic correlation)
    if mergePairSelection is not None:
        if mergePairSelection.count('corr') > 0:
            Tmat = LP['DocTopicCount']
            SS.setSelectionTerm('DocTopicPairMat',
                                np.dot(Tmat.T, Tmat),
                                dims=('K', 'K'))
            SS.setSelectionTerm('DocTopicSum', np.sum(Tmat, axis=0), dims='K')

    if trackDocUsage:
        # Track num of times a topic appears nontrivially in a doc
        DocUsage = np.sum(LP['DocTopicCount'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
        Pi = LP['theta'] / LP['theta'].sum(axis=1)[:, np.newaxis]
        SumPi = np.sum(Pi, axis=0)
        SS.setSelectionTerm('SumPi', SumPi, dims='K')
    return SS