Example #1
0
def inferDocTopicCountForDocFromHModel(docData, hmodel, alpha=0.5, **LPkwargs):
    ''' Infer per-document topic counts for one document under an hmodel.

    Returns
    -------
    DocTopicCount_d : 1D array, size K
        Expected count of tokens assigned to each topic.
    Info : dict
        Convergence diagnostics from the single-doc local step.
    '''
    # Convert log soft evidence into positive likelihoods.
    # Shifting each row by its max keeps exp() from overflowing;
    # afterwards every row's largest entry is exactly 1.
    localParams = hmodel.obsModel.calc_local_params(docData)
    Lik_d = localParams['E_log_soft_ev']
    rowMax = Lik_d.max(axis=1)
    Lik_d -= rowMax[:, np.newaxis]
    np.exp(Lik_d, out=Lik_d)

    # Scaled prior over active topics: 1D array of size K.
    alphaEbeta = alpha * hmodel.allocModel.get_active_comp_probs()
    DocTopicCount_d, _, _, Info = calcLocalParams_SingleDoc(
        1.0, Lik_d, alphaEbeta, alphaEbetaRem=None, **LPkwargs)
    # With unit weight per token, counts must sum to the token count.
    assert np.allclose(DocTopicCount_d.sum(), Lik_d.shape[0])
    return DocTopicCount_d, Info
Example #2
0
def inferDocTopicCountForDoc(word_id, word_ct, topics, probs, alpha,
                             **LPkwargs):
    ''' Infer per-document topic counts for one bag-of-words document.

    Parameters describe a point-estimated topic model:
    ``topics`` has shape K x vocab_size, ``probs`` is a size-K weight vector.

    Returns
    -------
    DocTopicCount_d : 1D array, size K
        Expected count of tokens assigned to each topic.
    Info : dict
        Convergence diagnostics from the single-doc local step.
    '''
    K = probs.size
    nTopicRows, vocab_size = topics.shape
    assert K == nTopicRows
    # Work with the transpose (vocab_size x K) so each column is one
    # topic's distribution over the vocabulary, summing to one.
    topicsT = topics.T.copy()
    assert np.allclose(np.sum(topicsT, axis=0), 1.0)
    # Lik_d : 2D array, size N x K, one row per distinct word in the doc.
    # Each row is non-negative.
    Lik_d = np.asarray(topicsT[word_id, :].copy(), dtype=np.float64)
    # Scaled prior over topics: 1D array of size K.
    alphaEbeta = np.asarray(alpha * probs, dtype=np.float64)
    DocTopicCount_d, _, _, Info = calcLocalParams_SingleDoc(
        word_ct, Lik_d, alphaEbeta, alphaEbetaRem=None, **LPkwargs)
    # Counts must account for every token in the document.
    assert np.allclose(DocTopicCount_d.sum(), word_ct.sum())
    return DocTopicCount_d, Info
Example #3
0
def calcLocalParams(Data,
                    LP,
                    alphaEbeta=None,
                    alphaEbetaRem=None,
                    alpha=None,
                    initDocTopicCountLP='setDocProbsToEGlobalProbs',
                    cslice=(0, None),
                    nnzPerRowLP=0,
                    doSparseOnlyAtFinalLP=0,
                    **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model


    Kwargs
    ------
    initDocTopicCountLP : str, must be one of the options below
        'useDocTopicCountIfProvided'
            If provided LP contains 'DocTopicCount' array, 
            will use that to initialize the per-doc counts.
            Otherwise, will default back to 'setDocProbsToEGlobalProbs'.
        'setDocProbsToEGlobalProbs'
            Initialize doc-topic probas directly using global probas.
            Recommended for the first time a document is processed.
        'setDocTopicCountToAllZeros' 
            Initialize doc-topic counts to all zeros.

    Returns
    -------
    LP : dict
        Local parameter fields
        resp : 2D array, N x K
        DocTopicCount : 2D array, nDoc x K
        model-specific fields for doc-topic probabilities
    '''
    # Normalize cslice into a well-formed (start, stop) tuple.
    assert isinstance(cslice, tuple)
    if len(cslice) != 2:
        cslice = (0, None)
    elif cslice[0] is None:
        cslice = (0, None)
    nDoc = calcNumDocFromSlice(Data, cslice)

    # Identify the observation model that produced E_log_soft_ev,
    # falling back on a dataset attribute when LP doesn't say.
    if 'obsModelName' in LP:
        obsModelName = LP['obsModelName']
    elif hasattr(Data, 'word_count'):
        obsModelName = 'Mult'
    else:
        obsModelName = 'Gauss'
    # Unpack the problem size
    N, K = LP['E_log_soft_ev'].shape
    # Prepare the initial DocTopicCount matrix,
    # Useful for warm starts of the local step.
    initDocTopicCount = None
    if 'DocTopicCount' in LP:
        if LP['DocTopicCount'].shape == (nDoc, K):
            initDocTopicCount = LP['DocTopicCount'].copy()

    # Output buffers, filled in per-document by the loop below.
    sumRespTilde = np.zeros(N)
    DocTopicCount = np.zeros((nDoc, K))
    DocTopicProb = np.zeros((nDoc, K))
    # Prepare the extra terms
    if alphaEbeta is None:
        assert alpha is not None
        alphaEbeta = alpha * np.ones(K)
    else:
        # Keep only entries for the K active topics.
        alphaEbeta = alphaEbeta[:K]
    # Prepare the likelihood matrix
    # Make sure it is C-contiguous, so that matrix ops are very fast
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    if (nnzPerRowLP <= 0 or nnzPerRowLP >= K) or doSparseOnlyAtFinalLP:
        DO_DENSE = True
        # Dense Representation
        # Shift each row so its max log-lik is 0, then exponentiate
        # in place -- guards exp() against overflow.
        Lik -= Lik.max(axis=1)[:, np.newaxis]
        NumericUtil.inplaceExp(Lik)
    else:
        DO_DENSE = False
        nnzPerRowLP = np.minimum(nnzPerRowLP, K)
        # Flat storage for a CSR matrix with exactly nnzPerRowLP
        # nonzeros per row, assembled after the per-doc loop.
        spR_data = np.zeros(N * nnzPerRowLP, dtype=np.float64)
        spR_colids = np.zeros(N * nnzPerRowLP, dtype=np.int32)
    slice_start = Data.doc_range[cslice[0]]

    if not DO_DENSE and obsModelName.count('Mult'):
        if initDocTopicCountLP.count('fastfirstiter'):
            # Precompute sparse responsibilities used to warm-start
            # each document's topic counts inside the loop.
            init_spR = calcInitSparseResp(LP,
                                          alphaEbeta,
                                          nnzPerRowLP=nnzPerRowLP,
                                          **kwargs)

    # Aggregate convergence diagnostics across all documents.
    AggInfo = dict()
    AggInfo['maxDiff'] = np.zeros(Data.nDoc)
    AggInfo['iter'] = np.zeros(Data.nDoc, dtype=np.int32)

    if 'restartLP' in kwargs and kwargs['restartLP']:
        AggInfo['nRestartsAccepted'] = np.zeros(1, dtype=np.int32)
        AggInfo['nRestartsTried'] = np.zeros(1, dtype=np.int32)
    else:
        AggInfo['nRestartsAccepted'] = None
        AggInfo['nRestartsTried'] = None

    # Run the iterative local step for each document in the slice.
    for d in range(nDoc):
        start = Data.doc_range[cslice[0] + d]
        stop = Data.doc_range[cslice[0] + d + 1]
        # Bernoulli models lay out Lik with vocab_size rows per doc;
        # all other models use one row per token/observation.
        if hasattr(Data, 'word_count') and obsModelName.count('Bern'):
            lstart = d * Data.vocab_size
            lstop = (d + 1) * Data.vocab_size
        else:
            lstart = start - slice_start
            lstop = stop - slice_start
        if hasattr(Data, 'word_count') and not obsModelName.count('Bern'):
            wc_d = Data.word_count[start:stop].copy()
        else:
            wc_d = 1.0  # unit weight per observation
        initDTC_d = None
        if initDocTopicCountLP == 'useDocTopicCountIfProvided':
            if initDocTopicCount is not None:
                if DO_DENSE:
                    initDTC_d = initDocTopicCount[d]
                else:
                    DocTopicCount[d] = initDocTopicCount[d]
            else:
                # No warm-start counts available: fall back to the
                # default initialization for this and later docs.
                initDocTopicCountLP = 'setDocProbsToEGlobalProbs'
        if not DO_DENSE and initDocTopicCountLP.count('fastfirstiter'):
            if obsModelName.count('Mult'):
                DocTopicCount[d, :] = wc_d * init_spR[Data.word_id[start:stop]]
        if not DO_DENSE:
            m_start = nnzPerRowLP * start
            m_stop = nnzPerRowLP * stop

            # SPARSE RESP
            # Writes results in place into DocTopicCount[d], the flat
            # spR_* buffers, and the AggInfo diagnostic vectors.
            calcSparseLocalParams_SingleDoc(
                wc_d,
                Lik[lstart:lstop],
                alphaEbeta,
                topicCount_d_OUT=DocTopicCount[d],
                spResp_data_OUT=spR_data[m_start:m_stop],
                spResp_colids_OUT=spR_colids[m_start:m_stop],
                nnzPerRowLP=nnzPerRowLP,
                initDocTopicCountLP=initDocTopicCountLP,
                d=d,
                maxDiffVec=AggInfo['maxDiff'],
                numIterVec=AggInfo['iter'],
                nRAcceptVec=AggInfo['nRestartsAccepted'],
                nRTrialVec=AggInfo['nRestartsTried'],
                **kwargs)
        else:
            Lik_d = Lik[lstart:lstop].copy()  # Local copy
            (DocTopicCount[d], DocTopicProb[d],
                sumRespTilde[lstart:lstop], Info_d) \
                = calcLocalParams_SingleDoc(
                    wc_d, Lik_d, alphaEbeta, alphaEbetaRem,
                    DocTopicCount_d=initDTC_d,
                    initDocTopicCountLP=initDocTopicCountLP,
                    **kwargs)
            AggInfo = updateConvergenceInfoForDoc_d(d, Info_d, AggInfo, Data)

    #if initDocTopicCountLP.startswith('fast'):
    #    AggInfo['time_extra'] = telapsed
    LP['DocTopicCount'] = DocTopicCount
    # Sanity check: over the full dataset, total topic counts must equal
    # total word counts.
    # NOTE(review): this reads LP['obsModelName'] directly, but the
    # fallback earlier only set a *local* variable -- if the key is
    # absent and Data has word_count, this raises KeyError. Confirm
    # callers always populate LP['obsModelName'] for Mult data.
    if hasattr(Data, 'word_count') and LP['obsModelName'] == 'MultObsModel':
        if cslice is None or (cslice[0] == 0 and cslice[1] is None):
            assert np.allclose(np.sum(DocTopicCount), np.sum(Data.word_count))
    LP = updateLPGivenDocTopicCount(LP, DocTopicCount, alphaEbeta,
                                    alphaEbetaRem)
    if DO_DENSE:
        LP = updateLPWithResp(LP,
                              Data,
                              Lik,
                              DocTopicProb,
                              sumRespTilde,
                              cslice,
                              nnzPerRowLP=nnzPerRowLP,
                              doSparseOnlyAtFinalLP=doSparseOnlyAtFinalLP)
    else:
        # Assemble the N x K sparse responsibility matrix from the flat
        # buffers; every row holds exactly nnzPerRowLP entries.
        indptr = np.arange(0, (N + 1) * nnzPerRowLP,
                           nnzPerRowLP,
                           dtype=np.int32)
        LP['spR'] = scipy.sparse.csr_matrix((spR_data, spR_colids, indptr),
                                            shape=(N, K))
        LP['nnzPerRow'] = nnzPerRowLP

    LP['Info'] = AggInfo
    writeLogMessageForManyDocs(Data, AggInfo, LP, **kwargs)
    return LP