Example #1
0
def create_model_with_new_comps(bigModel,
                                bigSS,
                                freshData,
                                Q=None,
                                Plan=None,
                                **kwargs):
    ''' Create a fresh model (and matching suff stats) for the target data.

      The fresh model starts as a copy of bigModel and is then initialized
      by the routine named in kwargs['creationRoutine']. Unless
      kwargs['creationDoUpdateFresh'] is false, the proposal is refined with
      local/global update steps and then optionally cleaned up
      (delete empty comps, delete-to-improve-ELBO, or merges).

      Args
      -------
      bigModel : HModel, copied to produce the fresh model
      bigSS : SuffStatBag, not read by this function
      freshData : target dataset used to initialize/refine the fresh model
      Q : required (not None) when creationRoutine == 'xspectral'
      Plan : dict, its 'targetWordFreq' entry is read when
             creationRoutine == 'targetWordFreq'
      kwargs : options; keys read here are creationRoutine, Kfresh,
             creationDoUpdateFresh, creationNumIters, birthDebug,
             cleanupDeleteEmpty, cleanupMinSize,
             cleanupDeleteToImproveFresh, cleanupMergeToImproveFresh

      Returns
      -------
      freshModel : HModel with Kfresh components,
                     scale *may not* be consistent with target dataset
      freshSS : SuffStatBag with Kfresh components,
                     scale will be consistent with target dataset
      Info : dict of diagnostics. Holds debug copies of the model when
                     kwargs['birthDebug'] is set, 'evBound' after
                     delete-to-improve cleanup, and the entries reported
                     by the last merge pass (if any merges were attempted).

      Raises
      -------
      BirthProposalError : if the refined proposal has fewer than 2 comps.
                     (Not checked on the creationDoUpdateFresh=0 path.)
    '''
    Info = dict()
    freshModel = bigModel.copy()

    if kwargs['creationRoutine'] == 'targetWordFreq':
        freshModel.set_global_params(
            beta=np.ones(1),
            K=1,
            topics=Plan['targetWordFreq'][np.newaxis, :],
            wordcountTotal=freshData.word_count.sum())
    elif kwargs['creationRoutine'] == 'findmissingtopics':
        freshModel = create_new_model_findmissingtopics(
            freshModel, freshData, bigModel, **kwargs)
    elif kwargs['creationRoutine'] == 'xspectral':
        assert Q is not None
        freshModel = create_new_model_expandedspectral(freshModel, Q,
                                                       freshData, bigModel,
                                                       **kwargs)
    elif kwargs['creationRoutine'] == 'spectralOnTarget':
        freshModel = create_new_model_spectralOnTarget(freshModel, freshData,
                                                       bigModel, **kwargs)
    else:
        # Any other routine name is forwarded to the model's own initializer
        freshModel.init_global_params(freshData,
                                      K=kwargs['Kfresh'],
                                      initname=kwargs['creationRoutine'],
                                      **kwargs)

    logPhase('Creation')
    log('CreationRoutine: ' + kwargs['creationRoutine'], 'debug')
    log('Kfresh=%d' % (freshModel.obsModel.K), 'debug')

    if not kwargs['creationDoUpdateFresh']:
        # Create freshSS that would produce (nearly) same freshModel.obsModel
        # after a call to update_global_params.
        # freshSS must exist before it can be zeroed: previously it was read
        # here without ever being assigned (UnboundLocalError). Build it from
        # the fresh model so it has the right fields and Kfresh components.
        # NOTE(review): confirm this is the intended way to obtain the bag.
        freshLP = freshModel.calc_local_params(freshData)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshSS._Fields.setAllFieldsToZero()
        if hasattr(freshSS, 'WordCounts'):
            topics = freshSS.WordCounts
            priorvec = freshModel.obsModel.obsPrior.lamvec
            for k in range(freshSS.K):
                # Subtract the prior so only the "data" part remains
                topics[k, :] = freshModel.obsModel.comp[k].lamvec - priorvec
            freshSS.setField('WordCounts', topics, dims=('K', 'D'))
        return freshModel, freshSS, Info

    # Record initial model for posterity
    if kwargs['birthDebug']:
        Info['freshModelInit'] = freshModel.copy()

    # Complete several iterations to improve this fresh proposal
    for step in range(kwargs['creationNumIters']):
        freshLP = freshModel.calc_local_params(freshData, **fastParams)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if step < 3 or (step + 1) % 10 == 0:
            logPosVector(freshSS.N,
                         label='iter %3d' % (step + 1),
                         level='debug')
        if step > 1:
            # Stop early once per-comp counts have essentially converged
            maxDiff = np.max(np.abs(freshSS.N - prevN))
            if maxDiff < 1.0:
                break
        prevN = freshSS.N.copy()

    logPosVector(freshSS.N, label='after creation', level='moreinfo')
    if kwargs['birthDebug']:
        Info['freshModelRefined'] = freshModel.copy()

    if kwargs['cleanupDeleteEmpty']:
        Kbefore = freshSS.K
        freshModel, freshSS = BirthCleanup.delete_empty_comps(freshData,
                                                              freshModel,
                                                              freshSS,
                                                              Korig=0,
                                                              **kwargs)
        # Re-sync local params, suff stats and model after the deletion
        freshLP = freshModel.calc_local_params(freshData)
        freshSS = freshModel.get_global_suff_stats(freshData, freshLP)
        freshModel.update_global_params(freshSS)
        if freshSS.K < Kbefore:
            msg = 'after remove empty (size < %d)' % (kwargs['cleanupMinSize'])
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if kwargs['cleanupDeleteToImproveFresh']:
        freshModel, freshSS, ELBO = BirthCleanup.delete_comps_to_improve_ELBO(
            freshData, freshModel, LP=freshLP)
        Info['evBound'] = ELBO
        if kwargs['birthDebug']:
            Info['freshModelPostDelete'] = freshModel.copy()

    elif kwargs['cleanupMergeToImproveFresh']:
        Korig = freshSS.K
        # Keep merging until a pass accepts nothing or one comp remains
        while freshSS.K > 1:
            mPairIDs, MM = MergePlanner.preselect_candidate_pairs(
                freshModel,
                freshSS,
                preselect_routine='wholeELBO',
                doLimitNumPairs=0,
                returnScoreMatrix=1,
                **kwargs)
            freshLP = freshModel.calc_local_params(freshData)
            freshSS = freshModel.get_global_suff_stats(freshData,
                                                       freshLP,
                                                       doPrecompEntropy=1,
                                                       doPrecompMergeEntropy=1,
                                                       mPairIDs=mPairIDs)
            freshModel.update_global_params(freshSS)
            freshELBO = freshModel.calc_evidence(SS=freshSS)
            # Use a separate name for the merge report so the diagnostics
            # already stored in Info (debug model copies) are not discarded.
            freshModel, freshSS, freshELBO, mergeInfo = \
                MergeMove.run_many_merge_moves(
                    freshModel, freshSS, freshELBO,
                    mPairIDs, M=MM,
                    isBirthCleanup=1,
                    logFunc=log)
            Info.update(mergeInfo)
            if len(mergeInfo['AcceptedPairs']) == 0:
                break
        if freshSS.K < Korig:
            msg = 'after merges'
            logPosVector(freshSS.N, label=msg, level='moreinfo')

    if freshSS.K < 2:
        msg = "BIRTH failed. Fresh proposal does not prefer multiple comps."
        raise BirthProposalError(msg)

    return freshModel, freshSS, Info
Example #2
0
def refine_expanded_model_with_VB_iters(xbigModel,
                                        freshData,
                                        xbigSS=None,
                                        Korig=0,
                                        **kwargs):
    ''' Run repeated local/global update passes on the expanded model.

        Each pass computes local params and suff stats for freshData,
        optionally drops new comps (index >= Korig) whose count falls below
        kwargs['cleanupMinSize'], then updates the model's global params
        from xbigSS temporarily augmented with the fresh suff stats.

        Args
        --------
        xbigModel : HModel to refine, updated in-place
        freshData : dataset used for every local step
        xbigSS : SuffStatBag, with K + Kfresh comps,
                                   scale equal to bigData only
        Korig : int, number of original comps; only comps at index
                     Korig or later may be removed
        kwargs : options; keys read here are refineNumIters, birthDebug,
                     cleanupDeleteEmpty, cleanupMinSize. The full dict is
                     also forwarded to calc_local_params.

        Returns
        --------
        model : HModel, with K + Kfresh comps
                        scale equal to bigData + freshData
        freshSS : SuffStatBag, with K + Kfresh comps
                        scale equal to freshData
        freshLP : dict of local parameters for freshData
        xInfo : dict with 'origIDs' (indices of surviving comps) and,
                        when kwargs['birthDebug'] is set, the arrays
                        'traceBeta', 'traceN', 'traceELBO'

        Raises
        --------
        BirthProposalError : if, after cleanup in some pass, no new comps
                        remain

        Updates (in-place)
        ----------
        xbigSS : SuffStatBag, with K + Kfresh comps
                         scale with equal to bigData only
    '''
    logPhase('Refinement')

    info = dict()
    keptIDs = list(range(xbigSS.K))

    numIters = kwargs['refineNumIters']
    betaTrace = np.zeros((numIters, xbigSS.K))
    countTrace = np.zeros((numIters, xbigSS.K))
    elboTrace = np.zeros(numIters)

    xfreshLP = None
    for it in range(numIters):
        xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
        xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)

        # Columns of comps deleted in earlier passes simply stay at zero
        countTrace[it, keptIDs] = xfreshSS.N
        if kwargs['birthDebug']:
            betaTrace[it, keptIDs] = \
                xbigModel.allocModel.get_active_comp_probs()
            elboTrace[it] = xbigModel.calc_evidence(
                freshData, xfreshSS, xfreshLP)

        # Log the first three passes, then every fifth
        if it < 3 or (it + 1) % 5 == 0:
            logPosVector(countTrace[it, Korig:], label='iter %3d' % (it + 1))

        # For all but last iteration, attempt removing empty topics
        isLastIter = it >= numIters - 1
        if kwargs['cleanupDeleteEmpty'] and not isLastIter:
            # Walk from the highest index down so removals never shift
            # the position of a comp that is still to be visited
            for k in range(xfreshSS.K - 1, Korig - 1, -1):
                if xfreshSS.N[k] < kwargs['cleanupMinSize']:
                    xfreshSS.removeComp(k)
                    xbigSS.removeComp(xbigSS.K - 1)  # last in order!
                    del keptIDs[k]

        if xfreshSS.K == Korig:
            raise BirthProposalError(
                "BIRTH failed. After refining, no comps > cleanupMinSize.")

        # Temporarily fold the fresh stats into xbigSS for the global step,
        # then take them back out so xbigSS keeps its original scale
        xbigSS += xfreshSS
        xbigModel.allocModel.update_global_params(xbigSS)
        xbigModel.obsModel.update_global_params(xbigSS)
        xbigSS -= xfreshSS

    # One final local step so returned LP/SS match the final model
    xfreshLP = xbigModel.calc_local_params(freshData, xfreshLP, **kwargs)
    xfreshSS = xbigModel.get_global_suff_stats(freshData, xfreshLP)
    log('Final Assignment Counts')
    logPosVector(xfreshSS.N[Korig:], label='final')

    if kwargs['birthDebug']:
        info.update(traceBeta=betaTrace,
                    traceN=countTrace,
                    traceELBO=elboTrace)
    info['origIDs'] = keptIDs

    return xbigModel, xfreshSS, xfreshLP, info
Example #3
0
def delete_comps_from_expanded_model_to_improve_ELBO(Data,
                                                     xbigModel,
                                                     xbigSS,
                                                     xfreshSS,
                                                     xfreshLP=None,
                                                     Korig=0,
                                                     **kwargs):
    ''' Attempts deleting components K, K-1, K-2, ... Korig,
         keeping (and building on) any proposals that improve the ELBO

       Args
       ---------
        Data : dataset passed through to the candidate builders
        xbigModel : HModel with K comps
        xbigSS : SuffStatBag with K comps
        xfreshSS : SuffStatBag with K comps
        xfreshLP : local params, used only when
                   kwargs['cleanupDeleteViaLP'] is set
        Korig : int, comps with index below Korig are never candidates
        kwargs : options; keys read here are cleanupDeleteViaLP and
                   cleanupRaiseErrorWhenAllDeleted

       Returns
       ---------
        model : HModel with Knew comps
        bigSS : SuffStatBag with Knew comps
        freshSS : SuffStatBag with Knew comps
        ELBO : evidence lower bound for the returned model,
               as computed from freshSS
        origIDs : list of the original indices of the comps that remain

       Raises
       ---------
        BirthProposalError : if every new comp was deleted and
               kwargs['cleanupRaiseErrorWhenAllDeleted'] is set
    '''
    logPhase('Cleanup')

    K = xbigSS.K
    assert xbigSS.K == xfreshSS.K
    assert xbigModel.obsModel.K == K

    # Must be a real list: entries are deleted in place below, which a
    # Python 3 range object does not support.
    origIDs = list(range(K))

    xfreshELBO = xbigModel.calc_evidence(SS=xfreshSS)
    if K == 1:
        # Nothing to delete. Return the same 5-tuple shape as the normal
        # path so callers can always unpack the result the same way.
        return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs

    # Visit candidates from the highest index down, so an accepted deletion
    # never shifts the position of a comp that is still to be visited.
    for k in reversed(range(Korig, K)):
        if kwargs['cleanupDeleteViaLP']:
            rbigModel, rbigSS, rfreshSS, rfreshELBO, rfreshLP = \
                _make_xcandidate_LP(
                    xbigModel, Data,
                    xbigSS, xfreshSS, xfreshLP,
                    k, **kwargs)
        else:
            rbigModel, rbigSS, rfreshSS, rfreshELBO = _make_xcandidate(
                xbigModel, Data, xbigSS, xfreshSS, k)
        # If ELBO has improved (or tied), set current model to delete comp k
        if rfreshELBO >= xfreshELBO:
            log('Deletion accepted. prop %.5e > cur %.5e' %
                (rfreshELBO, xfreshELBO))
            logPosVector(xfreshSS.N[Korig:])

            xbigSS = rbigSS
            xfreshSS = rfreshSS
            xbigModel = rbigModel
            xfreshELBO = rfreshELBO
            if kwargs['cleanupDeleteViaLP']:
                xfreshLP = rfreshLP
            del origIDs[k]

        if xfreshSS.K == 1:
            break
        # end loop over comps to delete

    if xbigSS.K == Korig and kwargs['cleanupRaiseErrorWhenAllDeleted']:
        log('FAILED. Deleting all new comps improves ELBO.')
        msg = "FAILED. After expansion, deleting all new comps improves ELBO."
        raise BirthProposalError(msg)
    return xbigModel, xbigSS, xfreshSS, xfreshELBO, origIDs