Ejemplo n.º 1
0
def calcLocalParams(Data,
                    LP,
                    aModel,
                    methodLP='scratch',
                    routineLP='simple',
                    **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model

    Args
    ----
    Data : dataset object
        This function reads Data.nDoc and, when present, Data.batchID.
    LP : dict of local params
        Must contain 'E_log_soft_ev', a 2D array whose column count is
        used as K. If that array is already a C-contiguous ndarray it is
        exponentiated IN PLACE (np.asarray does not copy in that case).
        May contain 'DocTopicCount' from a previous visit.
    aModel : allocation model
        Must provide updateLPGivenDocTopicCount(LP, DocTopicCount).
    methodLP : str
        'memo' reuses LP['DocTopicCount'] as a warm start when its shape
        is (Data.nDoc, K); any other value starts without one.
    routineLP : str
        'simple' or 'fast'. Selects the per-dataset inference routine.
    **kwargs : forwarded to the inference routine. Keys read here:
        restartremovejunkLP (optional, treated as 0 when absent),
        lapFrac, batchID, convThrLP.

    Returns
    -------
    LP : dict of local params, with fields
    * DocTopicCount
    * resp
    * model-specific fields for doc-topic probabilities
    * Info : convergence info from the 'simple' routine, or None when
      routineLP == 'fast' (that routine returns no such info).

    Raises
    ------
    ValueError
        If routineLP is neither 'simple' nor 'fast'.
    '''
    kwargs['methodLP'] = methodLP
    # Optional setting: a missing key means "no junk-topic restarts"
    # rather than a KeyError.
    restartMode = kwargs.get('restartremovejunkLP', 0)

    # Prepare the log soft ev matrix
    # Make sure it is C-contiguous, so that matrix ops are very fast
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    # Subtract each row's max before exponentiating to avoid overflow
    Lik -= Lik.max(axis=1)[:, np.newaxis]
    NumericUtil.inplaceExp(Lik)
    K = Lik.shape[1]

    # A warm start is only usable if it matches the current problem size
    hasDocTopicCount = 'DocTopicCount' in LP \
        and LP['DocTopicCount'].shape == (Data.nDoc, K)
    if methodLP == 'memo' and hasDocTopicCount:
        initDocTopicCount = LP['DocTopicCount']
    else:
        initDocTopicCount = None

    if routineLP == 'simple':
        DocTopicCount, Prior, sumR, AI = calcDocTopicCountForData_Simple(
            Data, aModel, Lik, initDocTopicCount=initDocTopicCount, **kwargs)
    elif routineLP == 'fast':
        DocTopicCount, Prior, sumR = calcDocTopicCountForData_Fast(
            Data, aModel, Lik, initDocTopicCount=initDocTopicCount, **kwargs)
        # The fast routine returns no convergence info. Bind AI explicitly
        # so the code below does not fail with a NameError.
        AI = None
    else:
        raise ValueError('Unrecognized routine ' + routineLP)

    LP['DocTopicCount'] = DocTopicCount
    LP = aModel.updateLPGivenDocTopicCount(LP, DocTopicCount)
    LP = updateLPWithResp(LP, Data, Lik, Prior, sumR)

    if restartMode == 1:
        LP, RInfo = removeJunkTopicsFromAllDocs(aModel, Data, LP, **kwargs)

    # Build a one-line convergence summary for the batch being tracked.
    # Requires convergence info, so it is skipped for the fast routine.
    isTrackedBatch = (
        AI is not None
        and 'lapFrac' in kwargs
        and 'batchID' in kwargs
        and hasattr(Data, 'batchID')
        and Data.batchID == kwargs['batchID'])
    if isTrackedBatch:
        perc = [0, 5, 10, 50, 90, 95, 100]
        siter = ' '.join(
            ['%4d' % np.percentile(AI['iter'], p) for p in perc])
        sdiff = ' '.join(
            ['%6.4f' % np.percentile(AI['maxDiff'], p) for p in perc])
        # Number of docs whose final change exceeded the threshold
        nFail = np.sum(AI['maxDiff'] > kwargs['convThrLP'])
        msg = '%4.2f %3d %4d %s %s' % (kwargs['lapFrac'], Data.batchID,
                                       nFail, siter, sdiff)

        if restartMode == 1:
            msg += " %4d/%4d %4d/%4d" % (
                RInfo['nDocRestartsAccepted'], RInfo['nDocRestartsTried'],
                RInfo['nRestartsAccepted'], RInfo['nRestartsTried'])
        elif restartMode == 2:
            msg += " %4d/%4d" % (AI['nRestartsAccepted'],
                                 AI['nRestartsTried'])
        # NOTE(review): msg is assembled but never written anywhere in
        # this function -- presumably a logger call belongs here; confirm.

    LP['Info'] = AI
    return LP
Ejemplo n.º 2
0
def calcLocalParams(Data,
                    LP,
                    alphaEbeta=None,
                    alphaEbetaRem=None,
                    alpha=None,
                    initDocTopicCountLP='scratch',
                    cslice=(0, None),
                    nnzPerRowLP=0,
                    doSparseOnlyAtFinalLP=0,
                    **kwargs):
    ''' Calculate all local parameters for provided dataset under a topic model

    Iterates over the documents selected by cslice, runs the single-doc
    local step for each (dense or sparse), and aggregates the results.

    Args
    ----
    Data : dataset object
        Reads Data.doc_range and Data.nDoc, plus Data.word_count,
        Data.word_id and Data.vocab_size when they apply.
    LP : dict
        Must contain 'E_log_soft_ev' (2D array, N x K). In dense mode this
        array is exponentiated IN PLACE when it is already C-contiguous.
        May contain 'obsModelName' and a previous 'DocTopicCount'.
    alphaEbeta : 1D array or None
        Truncated to its first K entries. If None, alpha must be given and
        alpha * ones(K) is used instead.
    alphaEbetaRem : forwarded unchanged to the single-doc routine and to
        updateLPGivenDocTopicCount.
    alpha : scalar or None
        Only used when alphaEbeta is None.
    initDocTopicCountLP : str
        Initialization strategy. 'memo' warm-starts from LP['DocTopicCount']
        when its shape is (nDoc, K), and otherwise falls back to
        'setDocProbsToEGlobalProbs'. Values containing 'fastfirstiter'
        trigger a sparse initialization for 'Mult' observation models.
    cslice : tuple (start, stop)
        cslice[0] is used as the index of the first document processed.
        Malformed values are replaced by (0, None).
    nnzPerRowLP : int
        Number of nonzeros kept per row of the sparse responsibilities.
        Values <= 0 or >= K select the dense representation.
    doSparseOnlyAtFinalLP : truthy to force the dense path in this loop.
        Also forwarded to updateLPWithResp.
    **kwargs : forwarded to the helper routines. 'restartLP' is read here.

    Returns
    -------
    LP : dict
        Local parameter fields
        resp : 2D array, N x K
        DocTopicCount : 2D array, nDoc x K
        model-specific fields for doc-topic probabilities
        In sparse mode, 'spR' (scipy CSR matrix, N x K) and 'nnzPerRow'
        are stored instead of calling updateLPWithResp.
        Info : dict of aggregated convergence information.
    '''
    assert isinstance(cslice, tuple)
    if len(cslice) != 2:
        cslice = (0, None)
    elif cslice[0] is None:
        cslice = (0, None)
    nDoc = calcNumDocFromSlice(Data, cslice)

    # Determine the observation model: explicit name wins, otherwise
    # guess from whether the dataset carries word counts.
    if 'obsModelName' in LP:
        obsModelName = LP['obsModelName']
    elif hasattr(Data, 'word_count'):
        obsModelName = 'Mult'
    else:
        obsModelName = 'Gauss'
    # Unpack the problem size
    N, K = LP['E_log_soft_ev'].shape
    # Prepare the initial DocTopicCount matrix,
    # Useful for warm starts of the local step.
    initDocTopicCount = None
    if 'DocTopicCount' in LP:
        if LP['DocTopicCount'].shape == (nDoc, K):
            initDocTopicCount = LP['DocTopicCount'].copy()
    sumRespTilde = np.zeros(N)
    DocTopicCount = np.zeros((nDoc, K))
    DocTopicProb = np.zeros((nDoc, K))
    # Prepare the extra terms
    if alphaEbeta is None:
        assert alpha is not None
        alphaEbeta = alpha * np.ones(K)
    else:
        alphaEbeta = alphaEbeta[:K]
    # Prepare the likelihood matrix
    # Make sure it is C-contiguous, so that matrix ops are very fast
    Lik = np.asarray(LP['E_log_soft_ev'], order='C')
    if (nnzPerRowLP <= 0 or nnzPerRowLP >= K) or doSparseOnlyAtFinalLP:
        DO_DENSE = True
        # Dense Representation
        # Subtract each row's max before exponentiating to avoid overflow
        Lik -= Lik.max(axis=1)[:, np.newaxis]
        NumericUtil.inplaceExp(Lik)
    else:
        DO_DENSE = False
        nnzPerRowLP = np.minimum(nnzPerRowLP, K)
        # Flat CSR buffers: row i of Lik owns entries
        # [i * nnzPerRowLP, (i + 1) * nnzPerRowLP).
        spR_data = np.zeros(N * nnzPerRowLP, dtype=np.float64)
        spR_colids = np.zeros(N * nnzPerRowLP, dtype=np.int32)
    slice_start = Data.doc_range[cslice[0]]

    if not DO_DENSE and obsModelName.count('Mult'):
        if initDocTopicCountLP.count('fastfirstiter'):
            init_spR = calcInitSparseResp(LP,
                                          alphaEbeta,
                                          nnzPerRowLP=nnzPerRowLP,
                                          **kwargs)

    AggInfo = dict()
    AggInfo['maxDiff'] = np.zeros(Data.nDoc)
    AggInfo['iter'] = np.zeros(Data.nDoc, dtype=np.int32)

    if 'restartLP' in kwargs and kwargs['restartLP']:
        AggInfo['nRestartsAccepted'] = np.zeros(1, dtype=np.int32)
        AggInfo['nRestartsTried'] = np.zeros(1, dtype=np.int32)
    else:
        AggInfo['nRestartsAccepted'] = None
        AggInfo['nRestartsTried'] = None

    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for d in range(nDoc):
        # start/stop: absolute token offsets into the Data arrays.
        start = Data.doc_range[cslice[0] + d]
        stop = Data.doc_range[cslice[0] + d + 1]
        # lstart/lstop: row offsets of this doc within Lik.
        if hasattr(Data, 'word_count') and obsModelName.count('Bern'):
            lstart = d * Data.vocab_size
            lstop = (d + 1) * Data.vocab_size
        else:
            lstart = start - slice_start
            lstop = stop - slice_start
        if hasattr(Data, 'word_count') and not obsModelName.count('Bern'):
            wc_d = Data.word_count[start:stop].copy()
        else:
            wc_d = 1.0
        initDTC_d = None
        if initDocTopicCountLP == 'memo':
            if initDocTopicCount is not None:
                if DO_DENSE:
                    initDTC_d = initDocTopicCount[d]
                else:
                    DocTopicCount[d] = initDocTopicCount[d]
            else:
                # No usable warm start: switch strategy for this and all
                # remaining documents.
                initDocTopicCountLP = 'setDocProbsToEGlobalProbs'
        if not DO_DENSE and initDocTopicCountLP.count('fastfirstiter'):
            if obsModelName.count('Mult'):
                DocTopicCount[d, :] = wc_d * init_spR[Data.word_id[start:stop]]
        if not DO_DENSE:
            # Index the sparse buffers by Lik row (slice-relative), to
            # match both Lik[lstart:lstop] and the indptr built below.
            # Absolute token offsets would fall outside the N-row buffers
            # whenever cslice does not start at document 0.
            m_start = nnzPerRowLP * lstart
            m_stop = nnzPerRowLP * lstop

            # SPARSE RESP
            calcSparseLocalParams_SingleDoc(
                wc_d,
                Lik[lstart:lstop],
                alphaEbeta,
                topicCount_d_OUT=DocTopicCount[d],
                spResp_data_OUT=spR_data[m_start:m_stop],
                spResp_colids_OUT=spR_colids[m_start:m_stop],
                nnzPerRowLP=nnzPerRowLP,
                initDocTopicCountLP=initDocTopicCountLP,
                d=d,
                maxDiffVec=AggInfo['maxDiff'],
                numIterVec=AggInfo['iter'],
                nRAcceptVec=AggInfo['nRestartsAccepted'],
                nRTrialVec=AggInfo['nRestartsTried'],
                **kwargs)
        else:
            Lik_d = Lik[lstart:lstop].copy()  # Local copy
            (DocTopicCount[d], DocTopicProb[d],
                sumRespTilde[lstart:lstop], Info_d) \
                = calcLocalParams_SingleDoc(
                    wc_d, Lik_d, alphaEbeta, alphaEbetaRem,
                    DocTopicCount_d=initDTC_d,
                    initDocTopicCountLP=initDocTopicCountLP,
                    **kwargs)
            AggInfo = updateConvergenceInfoForDoc_d(d, Info_d, AggInfo, Data)
    LP['DocTopicCount'] = DocTopicCount
    if hasattr(Data, 'word_count'):
        # Sanity check, only valid when the whole dataset was processed:
        # total assigned mass must equal the total word count.
        if cslice[0] == 0 and cslice[1] is None:
            assert np.allclose(np.sum(DocTopicCount), np.sum(Data.word_count))
    LP = updateLPGivenDocTopicCount(LP, DocTopicCount, alphaEbeta,
                                    alphaEbetaRem)
    if DO_DENSE:
        LP = updateLPWithResp(LP,
                              Data,
                              Lik,
                              DocTopicProb,
                              sumRespTilde,
                              cslice,
                              nnzPerRowLP=nnzPerRowLP,
                              doSparseOnlyAtFinalLP=doSparseOnlyAtFinalLP)
    else:
        # Every row holds exactly nnzPerRowLP entries, so the CSR row
        # pointer is an arithmetic progression.
        indptr = np.arange(0, (N + 1) * nnzPerRowLP,
                           nnzPerRowLP,
                           dtype=np.int32)
        LP['spR'] = scipy.sparse.csr_matrix((spR_data, spR_colids, indptr),
                                            shape=(N, K))
        LP['nnzPerRow'] = nnzPerRowLP

    LP['Info'] = AggInfo
    writeLogMessageForManyDocs(Data, AggInfo, LP, **kwargs)
    return LP