Example #1
def summarizeRestrictedLocalStep_HDPTopicModel(Dslice=None,
                                               curModel=None,
                                               curLPslice=None,
                                               curSSwhole=None,
                                               ktarget=None,
                                               targetUID=None,
                                               xUIDs=None,
                                               mUIDPairs=None,
                                               xObsModel=None,
                                               xInitSS=None,
                                               **kwargs):
    ''' Perform one restricted local step and summarize it.

    Returns
    -------
    xSSslice : SuffStatBag
    Info : dict with other information
    '''
    # Determine which uid to target
    if ktarget is None:
        assert targetUID is not None
        ktarget = curSSwhole.uid2k(targetUID)
    elif targetUID is None:
        assert ktarget is not None
        targetUID = curSSwhole.uids[ktarget]
    assert targetUID == curSSwhole.uids[ktarget]
    # Determine how many new uids to make
    Kfresh = len(xUIDs)
    # Verify provided sufficient stats used to initialize the new clusters, if any.
    if xInitSS is not None:
        assert xInitSS.K == Kfresh
        xInitSS.setUIDs(xUIDs)
    # Create a temporary observation model for the Kfresh new clusters,
    # unless one was provided already.
    if xObsModel is None:
        xObsModel = curModel.obsModel.copy()
    if xInitSS is not None:
        xObsModel.update_global_params(xInitSS)
    assert xObsModel.K == Kfresh
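    # Split the target comp's appearance probability among the Kfresh
    # new comps, reserving an emptyPi fraction to leave behind at ktarget.
    # xPiVec and emptyPi together sum to the original comp's mass, as the
    # conservation asserts later in this module require.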
    xPiVec, emptyPi = make_xPiVec_and_emptyPi(curModel=curModel,
                                              xInitSS=xInitSS,
                                              ktarget=ktarget,
                                              Kfresh=Kfresh,
                                              **kwargs)
    xalphaPi = curModel.allocModel.alpha * xPiVec
    thetaEmptyComp = curModel.allocModel.alpha * emptyPi

    # Perform restricted inference!
    # xLPslice contains local params for all Kfresh expansion clusters
    xLPslice = restrictedLocalStep_HDPTopicModel(Dslice=Dslice,
                                                 curLPslice=curLPslice,
                                                 ktarget=ktarget,
                                                 xObsModel=xObsModel,
                                                 xalphaPi=xalphaPi,
                                                 thetaEmptyComp=thetaEmptyComp,
                                                 **kwargs)
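    # The restricted step must also track HrespOrigComp, the entropy of
    # the original target comp's resp, which is needed later to assemble
    # a valid proposal summary.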
    assert "HrespOrigComp" in xLPslice

    # Summarize this expanded local parameter pack
    xSSslice = curModel.get_global_suff_stats(Dslice,
                                              xLPslice,
                                              trackDocUsage=1,
                                              doPrecompEntropy=1,
                                              doTrackTruncationGrowth=1)
    xSSslice.setUIDs(xUIDs)
    assert xSSslice.hasELBOTerm("Hresp")
    if emptyPi > 0:
        assert xSSslice.hasELBOTerm("HrespEmptyComp")

    # If desired, add merge terms into the expanded summaries.
    if mUIDPairs is not None and len(mUIDPairs) > 0:
        Mdict = curModel.allocModel.calcMergeTermsFromSeparateLP(
            Data=Dslice,
            LPa=curLPslice,
            SSa=curSSwhole,
            LPb=xLPslice,
            SSb=xSSslice,
            mUIDPairs=mUIDPairs)
        xSSslice.setMergeUIDPairs(mUIDPairs)
        for key, arr in list(Mdict.items()):
            xSSslice.setMergeTerm(key, arr, dims='M')
    # Prepare dict of info for debugging/inspection
    Info = dict()
    Info['Kfresh'] = Kfresh
    Info['xInitSS'] = xInitSS
    Info['xLPslice'] = xLPslice
    Info['xPiVec'] = xPiVec
    Info['emptyPi'] = emptyPi
    return xSSslice, Info
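
# A minimal usage sketch (hypothetical setup: assumes Dslice, curModel,
# curLPslice, and curSSwhole come from an existing bnpy training run,
# and that 1001 and 1002 are fresh uids not already in curSSwhole):
#
#   xSSslice, Info = summarizeRestrictedLocalStep_HDPTopicModel(
#       Dslice=Dslice,
#       curModel=curModel,
#       curLPslice=curLPslice,
#       curSSwhole=curSSwhole,
#       targetUID=curSSwhole.uids[0],
#       xUIDs=[1001, 1002],
#       xInitSS=xInitSS)  # SuffStatBag with xInitSS.K == 2
#   assert Info['Kfresh'] == 2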
Example #2
# Assumed imports, following standard bnpy conventions:
import numpy as np
from scipy.special import digamma, gammaln

from bnpy.util import NumericUtil
def makeExpansionLPFromZ_HDPTopicModel(Dslice=None,
                                       curModel=None,
                                       curLPslice=None,
                                       ktarget=None,
                                       xInitSS=None,
                                       targetZ=None,
                                       atomType=None,
                                       chosenDataIDs=None,
                                       emptyPiFrac=None,
                                       **kwargs):
    ''' Create expanded local parameters from Z assignments on target subset.

    Returns
    -------
    xLP : dict with fields
        resp : N x Kfresh
        DocTopicCount : D x Kfresh
        theta : D x Kfresh
        ElogPi : D x Kfresh
    '''
    Kfresh = targetZ.max() + 1
    N = curLPslice['resp'].shape[0]
    # Compute prior probability of each proposed comp
    xPiVec, emptyPi = make_xPiVec_and_emptyPi(curModel=curModel,
                                              ktarget=ktarget,
                                              Kfresh=Kfresh,
                                              xInitSS=xInitSS,
                                              **kwargs)
    xalphaPi = curModel.allocModel.alpha * xPiVec
    emptyalphaPi = curModel.allocModel.alpha * emptyPi

    # Compute likelihood under each proposed comp
    xObsModel = curModel.obsModel.copy()
    xObsModel.update_global_params(xInitSS)
    xLPslice = xObsModel.calc_local_params(Dslice)
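    # xLPslice['E_log_soft_ev'] is an N x Kfresh matrix of expected
    # log likelihoods: one row per atom, one column per proposed comp.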

    # Initialize xresp so each atom's row is normalized.
    # This is the default for non-target atoms.
    xresp = xLPslice['E_log_soft_ev']
    xresp += np.log(xalphaPi)  # log prior probability
    xresp -= xresp.max(axis=1)[:, np.newaxis]
    assert np.allclose(xresp.max(axis=1), 0.0)

    np.exp(xresp, out=xresp)
    xresp /= xresp.sum(axis=1)[:, np.newaxis]

    # Now, replace all targeted atoms with an all-or-nothing assignment
    if atomType == 'doc' and curModel.getAllocModelName().count('HDP'):
        if curModel.getObsModelName().count('Mult'):
            for pos, d in enumerate(chosenDataIDs):
                start = Dslice.doc_range[d]
                stop = Dslice.doc_range[d + 1]
                xresp[start:stop, :] = 1e-100
                xresp[start:stop, targetZ[pos]] = 1.0
        elif curModel.getObsModelName().count('Bern'):
            # Assign all words in each targeted doc
            # to the corresponding cluster in targetZ
            for pos, d in enumerate(chosenDataIDs):
                bstart = Dslice.vocab_size * d
                bstop = Dslice.vocab_size * (d + 1)
                xresp[bstart:bstop, :] = 1e-100
                xresp[bstart:bstop, targetZ[pos]] = 1.0
                #words_d = Dslice.word_id[
                #    Dslice.doc_range[d]:Dslice.doc_range[d+1]]
                #xresp[bstart + words_d, :] = 1e-100
                #xresp[bstart + words_d, targetZ[pos]] = 1.0

    else:
        for pos, n in enumerate(chosenDataIDs):
            xresp[n, :] = 1e-100
            xresp[n, targetZ[pos]] = 1.0
    assert np.allclose(1.0, xresp.sum(axis=1))

    # Make resp consistent with ktarget comp
    xresp *= curLPslice['resp'][:, ktarget][:, np.newaxis]
    np.maximum(xresp, 1e-100, out=xresp)
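    # Floor at 1e-100 so later entropy terms can take log(resp) safely.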

    # Create xDocTopicCount
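    # Aggregate responsibility mass per document: Mult data weights each
    # word-type row by its count, Bern data has one row per (doc, vocab
    # word) pair, and other data types sum the rows in the doc's range.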
    xDocTopicCount = np.zeros((Dslice.nDoc, Kfresh))
    for d in range(Dslice.nDoc):
        start = Dslice.doc_range[d]
        stop = Dslice.doc_range[d + 1]
        if hasattr(Dslice, 'word_id') and \
                curModel.getObsModelName().count('Mult'):
            xDocTopicCount[d] = np.dot(Dslice.word_count[start:stop],
                                       xresp[start:stop])
        elif hasattr(Dslice, 'word_id') and \
                curModel.getObsModelName().count('Bern'):
            bstart = d * Dslice.vocab_size
            bstop = (d + 1) * Dslice.vocab_size
            xDocTopicCount[d] = np.sum(xresp[bstart:bstop], axis=0)
        else:
            xDocTopicCount[d] = np.sum(xresp[start:stop], axis=0)
    # Create xtheta
    xtheta = xDocTopicCount + xalphaPi[np.newaxis, :]

    # Package up into xLPslice
    xLPslice['resp'] = xresp
    xLPslice['DocTopicCount'] = xDocTopicCount
    xLPslice['theta'] = xtheta
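    # Sanity check: the expansion must exactly redistribute the target
    # comp's mass, with the emptyPi remainder accounted for in theta.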
    assert np.allclose(xDocTopicCount.sum(axis=1),
                       curLPslice['DocTopicCount'][:, ktarget])
    assert np.allclose(
        xtheta.sum(axis=1) + emptyalphaPi, curLPslice['theta'][:, ktarget])

    # Compute other LP quantities related to log prob (topic | doc)
    # and fill these into the expanded LP dict
    digammaSumTheta = curLPslice['digammaSumTheta'].copy()
    xLPslice['digammaSumTheta'] = digammaSumTheta
    xLPslice['ElogPi'] = \
        digamma(xLPslice['theta']) - digammaSumTheta[:, np.newaxis]
    xLPslice['thetaRem'] = curLPslice['thetaRem'].copy()
    xLPslice['ElogPiRem'] = curLPslice['ElogPiRem'].copy()

    # Compute quantities related to leaving ktarget almost empty,
    # as we expand and transfer mass to other comps
    if emptyalphaPi > 0:
        thetaEmptyComp = emptyalphaPi
        ElogPiEmptyComp = digamma(thetaEmptyComp) - digammaSumTheta
        xLPslice['thetaEmptyComp'] = thetaEmptyComp
        xLPslice['ElogPiEmptyComp'] = ElogPiEmptyComp

    # Compute quantities related to OrigComp, the original target cluster.
    # These need to be tracked and turned into relevant summaries
    # so that they can be used to create a valid proposal state "propSS".
    xLPslice['ElogPiOrigComp'] = curLPslice['ElogPi'][:, ktarget]
    xLPslice['gammalnThetaOrigComp'] = \
        np.sum(gammaln(curLPslice['theta'][:, ktarget]))
    slack = curLPslice['DocTopicCount'][:, ktarget] - \
        curLPslice['theta'][:, ktarget]
    xLPslice['slackThetaOrigComp'] = np.sum(slack *
                                            curLPslice['ElogPi'][:, ktarget])

    if hasattr(Dslice, 'word_count') and \
            xLPslice['resp'].shape[0] == Dslice.word_count.size:
        xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogRdotv(
            curLPslice['resp'][:, ktarget], Dslice.word_count)
    else:
        xLPslice['HrespOrigComp'] = -1 * NumericUtil.calcRlogR(
            curLPslice['resp'][:, ktarget])
    return xLPslice
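
# A minimal usage sketch (hypothetical setup: assumes the usual bnpy
# objects plus a hard-assignment vector targetZ over the chosen docs):
#
#   targetZ = np.asarray([0, 1, 1, 0])  # labels 0..Kfresh-1, so Kfresh=2
#   xLP = makeExpansionLPFromZ_HDPTopicModel(
#       Dslice=Dslice,
#       curModel=curModel,
#       curLPslice=curLPslice,
#       ktarget=0,
#       xInitSS=xInitSS,  # SuffStatBag with xInitSS.K == 2
#       targetZ=targetZ,
#       atomType='doc',
#       chosenDataIDs=[3, 7, 8, 12],
#       emptyPiFrac=0.01)
#   assert xLP['resp'].shape[1] == targetZ.max() + 1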