Example 1
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    X = Data.X
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # 1/2: Compute mean statistic
        S_x = dotATB(resp, X)
        # 2/2: Compute expected outer-product statistic
        S_xxT = np.zeros((K, Data.dim, Data.dim))
        sqrtResp_k = np.sqrt(resp[:, 0])
        sqrtRX_k = sqrtResp_k[:, np.newaxis] * Data.X
        S_xxT[0] = dotATA(sqrtRX_k)
        for k in range(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k)
            np.multiply(sqrtResp_k[:, np.newaxis], Data.X, out=sqrtRX_k)
            S_xxT[k] = dotATA(sqrtRX_k)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # 1/2: Compute mean statistic
        S_x = spR.T * X
        # 2/2: Compute expected outer-product statistic
        S_xxT = calcSpRXXT(X=X, spR_csr=spR)

    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim)
    # Expected mean for each state k
    SS.setField('x', S_x, dims=('K', 'D'))
    # Expected outer-product for each state k
    SS.setField('xxT', S_xxT, dims=('K', 'D', 'D'))
    # Expected count for each k
    #  Usually computed by allocmodel. But just in case...
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
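
A minimal NumPy sketch (not from the source) verifying the sqrt-resp trick used above, assuming the bnpy helpers satisfy dotATA(A) = A.T @ A and dotATB(A, B) = A.T @ B: scaling the rows of X by sqrt(resp[:, k]) before dotATA reproduces the expected outer-product statistic.

import numpy as np

rng = np.random.RandomState(0)
N, D, K = 50, 3, 4
X = rng.randn(N, D)
resp = rng.dirichlet(np.ones(K), size=N)   # N x K, rows sum to one

# Direct definition: S_xxT[k] = sum_n resp[n, k] * outer(x_n, x_n)
S_direct = np.einsum('nk,nd,ne->kde', resp, X, X)

# sqrt-resp trick, as in the loop above
S_trick = np.empty((K, D, D))
for k in range(K):
    sqrtRX_k = np.sqrt(resp[:, k])[:, np.newaxis] * X
    S_trick[k] = sqrtRX_k.T @ sqrtRX_k     # dotATA(sqrtRX_k)

assert np.allclose(S_direct, S_trick)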
Example 2
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
        ''' Create sufficient stats needed for global param updates

        Args
        -------
        Data : bnpy data object
        LP : Dictionary containing the local parameters. Expected to contain:
            resp : Data.nObs x K array
            respPair : Data.nObs x K x K array (from the def. of respPair, note
                       respPair[0,:,:] is undefined)

        Returns
        -------
        SS : SuffStatBag with fields
            StartStateCount : A vector of length K with entry k being
                             resp(z_{1k}) = resp[0,k]
            TransStateCount : A K x K matrix where TransStateCount[i,j] =
                           sum_{n=2}^{Data.nObs} respPair(z_{n-1,i}, z_{n,j})
            N : A vector of length K with entry k being
                sum_{n=1}^{Data.nObs} resp(z_{n,k})

            The first two of these are used by FiniteHMM.update_global_params,
            and the third is used by ObsModel.update_global_params.

        (see the documentation for information about resp and respPair)
        '''
        resp = LP['resp']
        respPair = LP['respPair']
        K = resp.shape[1]
        startLocIDs = Data.doc_range[:-1]

        StartStateCount = np.sum(resp[startLocIDs], axis=0)
        N = np.sum(resp, axis=0)
        TransStateCount = np.sum(respPair, axis=0)

        SS = SuffStatBag(K=K, D=Data.dim)
        SS.setField('StartStateCount', StartStateCount, dims=('K'))
        SS.setField('TransStateCount', TransStateCount, dims=('K', 'K'))
        SS.setField('N', N, dims=('K'))

        if doPrecompEntropy is not None:
            entropy = self.elbo_entropy(Data, LP)
            SS.setELBOTerm('Elogqz', entropy, dims=None)
        return SS
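
A toy illustration (hypothetical data, single sequence) of the three HMM statistics gathered above; Data.doc_range is assumed to hold sequence boundaries, so doc_range[:-1] indexes the first timestep of each sequence.

import numpy as np

rng = np.random.RandomState(0)
T, K = 6, 3
resp = rng.dirichlet(np.ones(K), size=T)   # T x K
respPair = rng.rand(T, K, K)
respPair[0] = 0.0                          # respPair[0,:,:] is undefined
doc_range = np.asarray([0, T])             # one sequence spanning all T steps

startLocIDs = doc_range[:-1]
StartStateCount = np.sum(resp[startLocIDs], axis=0)  # state mass at t = 1
TransStateCount = np.sum(respPair, axis=0)           # K x K transition mass
N = np.sum(resp, axis=0)                             # total usage per state

assert np.allclose(N.sum(), T)             # resp rows each sum to one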
Example 3
    def calcHardMergeGap(self, SS, kA, kB):
        ''' Calculate scalar improvement in ELBO for hard merge of comps kA, kB

        Does *not* include any entropy.

        Returns
        ---------
        L : scalar
        '''
        m_K = SS.K - 1
        m_SS = SuffStatBag(K=SS.K, D=0)
        m_SS.setField('StartStateCount', SS.StartStateCount.copy(), dims='K')
        m_SS.setField('TransStateCount', SS.TransStateCount.copy(),
                      dims=('K', 'K'))
        m_SS.mergeComps(kA, kB)

        # Create candidate beta vector
        m_beta = StickBreakUtil.rho2beta(self.rho)
        m_beta[kA] += m_beta[kB]
        m_beta = np.delete(m_beta, kB, axis=0)

        # Create candidate rho and omega vectors
        m_rho = StickBreakUtil.beta2rho(m_beta, m_K)
        m_omega = np.delete(self.omega, kB)

        # Create candidate startTheta
        m_startTheta = self.startAlpha * m_beta.copy()
        m_startTheta[:m_K] += m_SS.StartStateCount

        # Create candidate transTheta
        m_transTheta = self.transAlpha * np.tile(m_beta, (m_K, 1))
        if self.kappa > 0:
            m_transTheta[:, :m_K] += self.kappa * np.eye(m_K)
        m_transTheta[:, :m_K] += m_SS.TransStateCount

        # Evaluate objective func. for both candidate and current model
        Lcur = calcELBO_LinearTerms(
            SS=SS, rho=self.rho, omega=self.omega,
            startTheta=self.startTheta, transTheta=self.transTheta,
            alpha=self.transAlpha, startAlpha=self.startAlpha,
            gamma=self.gamma, kappa=self.kappa)

        Lprop = calcELBO_LinearTerms(
            SS=m_SS, rho=m_rho, omega=m_omega,
            startTheta=m_startTheta, transTheta=m_transTheta,
            alpha=self.transAlpha, startAlpha=self.startAlpha,
            gamma=self.gamma, kappa=self.kappa)

        # Note: this gap relies on the fact that all nonlinear ELBO terms
        # are entropies, which this method deliberately excludes.
        return Lprop - Lcur
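
A sketch of just the beta-vector surgery performed above, with the StickBreakUtil conversions left out: merging components kA and kB pools their appearance probabilities and deletes the kB entry, leaving total mass unchanged.

import numpy as np

beta = np.asarray([0.4, 0.3, 0.2, 0.1])   # last entry = leftover mass
kA, kB = 0, 2

m_beta = beta.copy()
m_beta[kA] += m_beta[kB]                  # pool the mass of kA and kB
m_beta = np.delete(m_beta, kB, axis=0)    # drop the merged-away entry

assert np.allclose(m_beta.sum(), beta.sum())
print(m_beta)                             # [0.6 0.3 0.1]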
Example 4
    def calcSummaryStatsForContigBlock(self, Data, SS=None, a=0, b=0):
        ''' Calculate sufficient stats for a single contiguous block of data
        '''
        if SS is None:
            SS = SuffStatBag(K=1, D=Data.dim)

        SS.setField('N', (b - a) * np.ones(1), dims='K')
        SS.setField('x',
                    np.sum(Data.X[a:b], axis=0)[np.newaxis, :],
                    dims=('K', 'D'))
        SS.setField('xxT',
                    dotATA(Data.X[a:b])[np.newaxis, :, :],
                    dims=('K', 'D', 'D'))
        return SS
Example 5
    def calcSummaryStatsForContigBlock(self, Data, a=0, b=0, **kwargs):
        ''' Calculate summary stats for a contiguous block of the data.

        Returns
        --------
        SS : SuffStatBag object, with 1 component.
        '''
        Xab = Data.X[a:b]  # 2D array, Nab x D
        CountON = np.sum(Xab, axis=0)[np.newaxis, :]
        CountOFF = (b - a) - CountON

        SS = SuffStatBag(K=1, D=Data.dim)
        SS.setField('N', np.asarray([b - a], dtype=np.float64), dims='K')
        SS.setField('Count1', CountON, dims=('K', 'D'))
        SS.setField('Count0', CountOFF, dims=('K', 'D'))
        return SS
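
A quick check (toy binary data, plain NumPy) of the ON/OFF bookkeeping above: Count1 sums the ones in each dimension over the block, Count0 is its complement, and the two always add up to the block length b - a.

import numpy as np

rng = np.random.RandomState(0)
X = (rng.rand(10, 4) < 0.3).astype(np.float64)   # binary data, N x D
a, b = 2, 8

CountON = np.sum(X[a:b], axis=0)[np.newaxis, :]  # 1 x D
CountOFF = (b - a) - CountON

assert np.allclose(CountON + CountOFF, b - a)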
Example 6
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    X = Data.X
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # 1/2: Compute mean statistic
        S_x = dotATB(resp, X)
        # 2/2: Compute expected outer-product statistic
        S_xx = calcRXX_withDenseResp(resp, X)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # 1/2: Compute mean statistic
        S_x = spR.T * X
        # 2/2: Compute expected outer-product statistic
        S_xx = calcSpRXX(X=X, spR_csr=spR)
    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim)
    # Expected mean for each state k
    SS.setField('x', S_x, dims=('K', 'D'))
    # Expected sum-of-squares for each state k
    SS.setField('xx', S_xx, dims=('K', 'D'))
    # Expected count for each k
    #  Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
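
A hypothetical stand-in for calcRXX_withDenseResp, assuming it computes the diagonal-covariance statistic S_xx[k, d] = sum_n resp[n, k] * X[n, d]**2; that reduces to a single matrix product against the elementwise square of X.

import numpy as np

rng = np.random.RandomState(0)
N, D, K = 40, 2, 3
X = rng.randn(N, D)
resp = rng.dirichlet(np.ones(K), size=N)

S_x = resp.T @ X            # the dotATB(resp, X) mean statistic, K x D
S_xx = resp.T @ (X ** 2)    # assumed calcRXX_withDenseResp result, K x D

for k in range(K):
    direct_k = np.sum(resp[:, k][:, np.newaxis] * X ** 2, axis=0)
    assert np.allclose(S_xx[k], direct_k)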
Example 7
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=0, **kwargs):
        ''' Compute sufficient stats for provided dataset and local params

        Returns
        -------
        SS : SuffStatBag
            Updated fields
            * NodeStateCount : 2D array, nNodes x K
            * N : 2D array, size K x K
        '''
        K = LP['resp'].shape[-1]

        V = Data.nNodes
        SS = SuffStatBag(K=K, D=Data.dim, V=V)

        # NodeStateCount_src[i,k]
        #   Num edges assigned to topic k associated with node i as source
        srcResp = LP['resp'].sum(axis=2)
        NodeStateCount_src = Data.getSparseSrcNodeMat() * srcResp
        # Equivalent but slower: for loop
        # NodeStateCount_src = np.zeros((Data.nNodes, K))
        # for i in range(Data.nNodes):
        #     mask_i = Data.edges[:,0] == i
        #     NodeStateCount_src[i,:] = srcResp[mask_i].sum(axis=0)

        # NodeStateCount_rcv[i,k]
        #   Num edges assigned to topic k associated with node i as receiver
        rcvResp = LP['resp'].sum(axis=1)
        NodeStateCount_rcv = Data.getSparseRcvNodeMat() * rcvResp

        # Summing src counts and rcv counts gives the total
        SS.setField('NodeStateCount',
                    NodeStateCount_src + NodeStateCount_rcv,
                    dims=('V', 'K'))
        # Compute total atoms assigned to each cluster pair
        Nresp = np.sum(LP['resp'], axis=0)
        SS.setField('N', Nresp, dims=('K', 'K'))

        if doPrecompEntropy:
            # Remember, resp has shape nEdges x K x K
            # So, need to sum so we track scalar entropy, not K x K
            Hresp = calcLentropyAsScalar(LP)
            SS.setELBOTerm('Hresp', Hresp, dims=None)
        return SS
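
A small sketch of the sparse aggregation above, under the assumption that Data.getSparseSrcNodeMat() returns an nNodes x nEdges indicator matrix with entry (i, e) = 1 when node i is the source of edge e; multiplying it by srcResp then reproduces the commented-out per-node loop.

import numpy as np
import scipy.sparse as sp

edges = np.asarray([[0, 1], [0, 2], [1, 2]])   # (src, rcv) node pairs
nNodes, nEdges, K = 3, edges.shape[0], 2
srcResp = np.random.RandomState(0).rand(nEdges, K)

SrcMat = sp.csr_matrix(
    (np.ones(nEdges), (edges[:, 0], np.arange(nEdges))),
    shape=(nNodes, nEdges))
NodeStateCount_src = SrcMat * srcResp          # nNodes x K

# Same result via the slow explicit loop
ref = np.zeros((nNodes, K))
for i in range(nNodes):
    ref[i] = srcResp[edges[:, 0] == i].sum(axis=0)

assert np.allclose(NodeStateCount_src, ref)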
Example 8
def calcSummaryStats(Data, SS, LP, DataAtomType='doc', **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    if 'resp' in LP:
        K = LP['resp'].shape[1]
    else:
        K = LP['spR'].shape[1]
        nnzPerRow = LP['nnzPerRow']
    if SS is None:
        SS = SuffStatBag(K=K, D=Data.vocab_size)
    if DataAtomType == 'doc':
        # X : 2D sparse matrix, size nDoc x vocab_size
        X = Data.getSparseDocTypeCountMatrix()
        # WordCounts : 2D array, size K x vocab_size
        # obtained by sparse matrix multiply
        # here, '*' operator does this because X is sparse matrix type
        Nvec = None
        if 'resp' in LP:
            WordCounts = LP['resp'].T * X
            if not hasattr(SS, 'N'):
                Nvec = LP['resp'].sum(axis=0)
        else:
            WordCounts = (LP['spR'].T * X).toarray()
            if not hasattr(SS, 'N'):
                Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        if Nvec is not None:
            SS.setField('N', Nvec, dims=('K'))
    else:
        # 2D sparse matrix, size V x N
        X = Data.getSparseTokenTypeCountMatrix()
        if 'resp' in LP:
            WordCounts = (X * LP['resp']).T  # matrix-matrix product
        else:
            WordCounts = (X * LP['spR']).T.toarray()
    SS.setField('WordCounts', WordCounts, dims=('K', 'D'))
    SS.setField('SumWordCounts', np.sum(WordCounts, axis=1), dims=('K'))
    return SS
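
A toy check (hypothetical counts) of the DataAtomType == 'doc' branch above: '*' between the dense resp.T and a sparse doc-by-type count matrix performs matrix multiplication, and because each row of resp sums to one, total word mass is preserved.

import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
nDoc, V, K = 5, 7, 3
X = sp.csr_matrix(rng.poisson(1.0, size=(nDoc, V)).astype(np.float64))
resp = rng.dirichlet(np.ones(K), size=nDoc)    # nDoc x K, rows sum to one

WordCounts = resp.T * X                        # K x V via sparse matmul
assert WordCounts.shape == (K, V)
assert np.allclose(WordCounts.sum(), X.sum())  # every token fully allocated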
    """
Example 9
    def calcSummaryStatsForContigBlock(self, Data, SS=None, a=0, b=0):
        ''' Calculate sufficient stats for a single contiguous block of data
        '''
        D = Data.X.shape[1]
        E = Data.Xprev.shape[1]

        if SS is None:
            SS = SuffStatBag(K=1, D=D, E=E)
        elif not hasattr(SS, 'E'):
            SS._Fields.E = E

        ppT = dotATA(Data.Xprev[a:b])[np.newaxis, :, :]
        xxT = dotATA(Data.X[a:b])[np.newaxis, :, :]
        pxT = dotATB(Data.Xprev[a:b], Data.X[a:b])[np.newaxis, :, :]

        SS.setField('N', (b - a) * np.ones(1), dims='K')
        SS.setField('xxT', xxT, dims=('K', 'D', 'D'))
        SS.setField('ppT', ppT, dims=('K', 'E', 'E'))
        SS.setField('pxT', pxT, dims=('K', 'E', 'D'))
        return SS
Example 10
    def init_global_params(self, Data, K=0, **initArgs):
        ''' Initialize rho, omega, and theta to reasonable values.

        This is only called by "from scratch" init routines.
        '''
        self.K = K
        self.rho = OptimizerRhoOmega.create_initrho(K)
        self.omega = (1.0 + self.gamma) * np.ones(K)

        # To initialize theta, perform standard update given rho, omega
        # but with "empty" sufficient statistics.
        SS = SuffStatBag(K=self.K, D=Data.dim)
        SS.setField('StartStateCount', np.ones(K), dims=('K'))
        SS.setField('TransStateCount', np.ones((K, K)), dims=('K', 'K'))
        self.transTheta, self.startTheta = self._calcTheta(SS)
Example 11
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate sufficient statistics for local params at data slice.

    Returns
    -------
    SS
    '''
    X = Data.X
    Xprev = Data.Xprev
    resp = LP['resp']
    K = resp.shape[1]
    D = Data.X.shape[1]
    E = Data.Xprev.shape[1]

    if SS is None:
        SS = SuffStatBag(K=K, D=D, E=E)
    elif not hasattr(SS, 'E'):
        SS._Fields.E = E

    # Expected count for each k
    #  Usually computed by allocmodel. But just in case...
    if not hasattr(SS, 'N'):
        SS.setField('N', np.sum(resp, axis=0), dims='K')

    # Expected outer products
    sqrtResp = np.sqrt(resp)
    xxT = np.empty((K, D, D))
    ppT = np.empty((K, E, E))
    pxT = np.empty((K, E, D))
    for k in range(K):
        sqrtResp_k = sqrtResp[:, k][:, np.newaxis]
        xxT[k] = dotATA(sqrtResp_k * Data.X)
        ppT[k] = dotATA(sqrtResp_k * Data.Xprev)
        pxT[k] = np.dot(Data.Xprev.T, resp[:, k][:, np.newaxis] * Data.X)
    SS.setField('xxT', xxT, dims=('K', 'D', 'D'))
    SS.setField('ppT', ppT, dims=('K', 'E', 'E'))
    SS.setField('pxT', pxT, dims=('K', 'E', 'D'))
    return SS
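
A consistency sketch for the auto-regressive cross statistic above (toy data, with E taken equal to D): the pxT[k] computed in the loop equals sum_n resp[n, k] * outer(xprev_n, x_n).

import numpy as np

rng = np.random.RandomState(0)
N, D, K = 30, 2, 3
X = rng.randn(N, D)
Xprev = rng.randn(N, D)                    # lagged data; here E == D
resp = rng.dirichlet(np.ones(K), size=N)

pxT = np.empty((K, D, D))
for k in range(K):
    pxT[k] = np.dot(Xprev.T, resp[:, k][:, np.newaxis] * X)

pxT_direct = np.einsum('nk,ne,nd->ked', resp, Xprev, X)
assert np.allclose(pxT, pxT_direct)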
Example 12
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=None, **kwargs):
        ''' Calculate the sufficient statistics for global parameter updates

        Only adds stats relevant for this allocModel.
        Other stats are added by the obsModel.

        Args
        -------
        Data : bnpy data object
        LP : local param dict with fields
              resp : Data.nObs x K array,
                       where resp[n,k] = posterior resp of comp k
        doPrecompEntropy : boolean flag
                      indicates whether to precompute ELBO terms in advance
                      used for memoized learning algorithms (moVB)

        Returns
        -------
        SS : SuffStats for K components, with field
              N : vector of dimension K,
                   effective number of observations assigned to each comp
              Npair : matrix of dimensions K x K, where Npair[l,m] =
                      effective # of obs x_{ij} with z_{il} and z_{jm}

        '''
        Npair = np.sum(LP['resp'], axis=0)
        self.Npair = Npair
        N = np.sum(LP['respSingle'], axis=0)

        SS = SuffStatBag(K=N.shape[0], D=Data.dim)
        SS.setField('Npair', Npair, dims=('K', 'K'))
        SS.setField('N', N, dims=('K',))
        if doPrecompEntropy is not None:
            ElogqZ_vec = self.E_logqZ(LP)
            SS.setELBOTerm('ElogqZ', ElogqZ_vec, dims=('K',))
        return SS
Example 13
    def calcSummaryStatsForContigBlock(self,
                                       Data,
                                       SS=None,
                                       a=None,
                                       b=None,
                                       **kwargs):
        ''' Calculate summary statistics for specific block of dataset

        Returns
        --------
        SS : SuffStatBag object, with 1 component.
        '''
        if SS is None:
            SS = SuffStatBag(K=1, D=Data.dim)

        # Expected count
        SS.setField('N', (b - a) * np.ones(1, dtype=np.float64), dims='K')

        # Expected outer-product
        xxT = dotATA(Data.X[a:b])[np.newaxis, :, :]
        SS.setField('xxT', xxT, dims=('K', 'D', 'D'))
        return SS
Example 14
    def get_global_suff_stats(self, Data, LP, doPrecompEntropy=0, **kwargs):
        ''' Compute sufficient stats for provided dataset and local params

        Returns
        -------
        SS : SuffStatBag with K components and fields
            * sumSource : nNodes x K
            * sumReceiver : nNodes x K
        '''
        V = Data.nNodes
        K = LP['resp'].shape[-1]
        SS = SuffStatBag(K=K, D=Data.dim, V=V)
        if 'NodeStateCount' not in LP:
            assert 'resp' in LP
            LP = self.initLPFromResp(Data, LP)
        SS.setField('NodeStateCount', LP['NodeStateCount'], dims=('V', 'K'))
        if np.allclose(LP['resp'].sum(axis=1).min(), 1.0):
            # If the LP fully represents all present edges,
            # then the NodeStateCount should as well.
            assert np.allclose(SS.NodeStateCount.sum(), 2 * Data.nEdges)
        SS.setField('N', LP['N_fg'], dims=('K', ))
        SS.setField('scaleFactor', Data.nEdges, dims=None)

        if 'Ldata_bg' in LP:
            SS.setELBOTerm('Ldata_bg', LP['Ldata_bg'], dims=None)

        if doPrecompEntropy:
            Hresp_fg = LP['Lentropy_fg']  # = -1 * calcRlogR(LP['resp'])
            Hresp_bg = LP['Lentropy_bg']

            SS.setELBOTerm('Hresp', Hresp_fg, dims='K')
            SS.setELBOTerm('Hresp_bg', Hresp_bg, dims=None)

        return SS
Example 15
def init_global_params(obsModel,
                       Data,
                       K=0,
                       seed=0,
                       initname='randexamples',
                       initBlockLen=20,
                       **kwargs):
    ''' Initialize parameters for Bernoulli obsModel, in place.

    Parameters
    -------
    obsModel : AbstractObsModel subclass
        Observation model object to initialize.
    Data   : bnpy.data.DataObj
        Dataset to use to drive initialization.
        obsModel dimensions must match this dataset.
    initname : str
        name of routine used to do initialization
        Options: 'randexamples', 'randexamplesbydist', 'kmeans'

    Returns
    -------
    initLP : dict
        Local parameters used for initialization

    Post Condition
    --------------
    obsModel has valid global parameters.
    Either its EstParams or Post attribute will contain K components.
    '''
    PRNG = np.random.RandomState(seed)
    AdjMat = Data.toAdjacencyMatrix()
    if AdjMat.ndim == 3:
        AdjMat = AdjMat[:, :, 0]

    K = np.minimum(K, Data.nNodes)
    if len(obsModel.CompDims) == 2:
        CompDims = (K, K)
    else:
        CompDims = (K, )

    if initname == 'randexamples':
        # Pick K nodes at random in provided graph,
        # and set all edges belonging to that node to one cluster.
        nNodes = AdjMat.shape[0]
        chosenNodes = PRNG.choice(nNodes, size=K, replace=False)

    elif initname == 'randexamplesbydist':
        # Choose K items from the Data,
        #  selecting the first at random,
        # then subsequently proportional to euclidean distance to the closest
        # item
        objID = PRNG.choice(AdjMat.shape[0])
        chosenNodes = list([objID])
        minDistVec = np.inf * np.ones(AdjMat.shape[0])
        for k in range(1, K):
            curDistVec = np.sum((AdjMat - AdjMat[objID])**2, axis=1)
            minDistVec = np.minimum(minDistVec, curDistVec)
            sum_minDistVec = np.sum(minDistVec)
            if sum_minDistVec > 0:
                p = minDistVec / sum_minDistVec
            else:
                D = minDistVec.size
                p = 1.0 / D * np.ones(D)
            objID = PRNG.choice(Data.nNodes, p=p)
            chosenNodes.append(objID)

    elif initname == 'kmeans':
        # Fill in resp matrix with hard-clustering from K-means
        # using an initialization with K randomly selected points from X
        np.random.seed(seed)
        centroids, chosenNodes = kmeans2(data=AdjMat, k=K, minit='points')
    else:
        raise NotImplementedError('Unrecognized initname ' + initname)

    # Convert chosen nodes into temporary local param dict
    # tempLP will have proper resp (nEdges x K), etc.
    tempLP = chosenNodes_to_LP(chosenNodes, CompDims, Data, PRNG=PRNG, K=K)
    # Perform summary step and global step to get obsModel global params.
    SS = SuffStatBag(K=K, D=Data.dim)
    SS = obsModel.get_global_suff_stats(Data, SS, tempLP)
    obsModel.update_global_params(SS)
    return tempLP
Example 16
def init_global_params(obsModel,
                       Data,
                       K=0,
                       seed=0,
                       initname='randexamples',
                       initBlockLen=20,
                       **kwargs):
    ''' Initialize parameters for Gaussian obsModel, in place.

    Parameters
    -------
    obsModel : bnpy.obsModel subclass
        Observation model object to initialize.
    Data : bnpy.data.DataObj
        Dataset to use to drive initialization.
        obsModel dimensions must match this dataset.
    initname : str
        name of routine used to do initialization
        Options: ['randexamples', 'randexamplesbydist', 'kmeans',
                  'randcontigblocks', 'randsoftpartition',
                 ]

    Post Condition
    -------
    obsModel has valid global parameters.
    Either its EstParams or Post attribute will contain K components.
    '''
    K = int(K)
    PRNG = np.random.RandomState(seed)
    X = Data.X
    if initname == 'randexamples':
        # Choose K items uniformly at random from the Data
        #    then component params by M-step given those single items
        resp = np.zeros((Data.nObs, K))
        permIDs = PRNG.permutation(Data.nObs).tolist()
        for k in range(K):
            resp[permIDs[k], k] = 1.0

    elif initname == 'randexamplesbydist':
        # Choose K items from the Data,
        #  selecting the first at random,
        # then subsequently proportional to euclidean distance to the closest
        # item
        objID = PRNG.choice(Data.nObs)
        chosenObjIDs = list([objID])
        minDistVec = np.inf * np.ones(Data.nObs)
        for k in range(1, K):
            curDistVec = np.sum((Data.X - Data.X[objID])**2, axis=1)
            minDistVec = np.minimum(minDistVec, curDistVec)
            objID = PRNG.choice(Data.nObs, p=minDistVec / minDistVec.sum())
            chosenObjIDs.append(objID)
        resp = np.zeros((Data.nObs, K))
        for k in range(K):
            resp[chosenObjIDs[k], k] = 1.0

    elif initname == 'randcontigblocks':
        # Choose K contig blocks of provided size from the Data,
        #  selecting each block at random from a particular sequence
        if hasattr(Data, 'doc_range'):
            doc_range = Data.doc_range.copy()
        else:
            doc_range = np.asarray([0, Data.X.shape[0]])
        nDoc = doc_range.size - 1
        docIDs = np.arange(nDoc)
        PRNG.shuffle(docIDs)
        resp = np.zeros((Data.nObs, K))
        for k in range(K):
            n = docIDs[k % nDoc]
            start = doc_range[n]
            stop = doc_range[n + 1]
            T = stop - start
            if initBlockLen >= T:
                a = start
                b = stop
            else:
                a = start + PRNG.choice(T - initBlockLen)
                b = a + initBlockLen
            resp[a:b, k] = 1.0

    elif initname == 'randsoftpartition':
        # Randomly assign all data items some mass in each of K components
        #  then create component params by M-step given that soft partition
        resp = PRNG.gamma(1.0 / (K * K), 1, size=(Data.nObs, K))
        resp[resp < 1e-3] = 0
        rsum = np.sum(resp, axis=1)
        badIDs = rsum < 1e-8
        # if any rows have no content, just set them to unif resp.
        if np.any(badIDs):
            resp[badIDs] = 1.0 / K
            rsum[badIDs] = 1
        resp = resp / rsum[:, np.newaxis]
        assert np.allclose(np.sum(resp, axis=1), 1.0)

    elif initname == 'kmeans':
        # Fill in resp matrix with hard-clustering from K-means
        # using an initialization with K randomly selected points from X
        np.random.seed(seed)
        centroids, labels = kmeans2(data=Data.X, k=K, minit='points')
        resp = np.zeros((Data.nObs, K))
        for t in range(Data.nObs):
            resp[t, labels[t]] = 1

    else:
        raise NotImplementedError('Unrecognized initname ' + initname)

    tempLP = dict(resp=resp)
    SS = SuffStatBag(K=K, D=Data.dim)
    SS = obsModel.get_global_suff_stats(Data, SS, tempLP)
    obsModel.update_global_params(SS)
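
The 'randexamplesbydist' branch above is k-means++-style seeding. A minimal standalone sketch on hypothetical 2D data, outside any bnpy objects:

import numpy as np

PRNG = np.random.RandomState(0)
X = np.vstack([PRNG.randn(20, 2), PRNG.randn(20, 2) + 10.0])  # two clusters
K = 2

objID = PRNG.choice(X.shape[0])
chosenObjIDs = [objID]
minDistVec = np.inf * np.ones(X.shape[0])
for k in range(1, K):
    curDistVec = np.sum((X - X[objID]) ** 2, axis=1)
    minDistVec = np.minimum(minDistVec, curDistVec)
    objID = PRNG.choice(X.shape[0], p=minDistVec / minDistVec.sum())
    chosenObjIDs.append(objID)

# With high probability the two seeds come from different clusters.
print(chosenObjIDs)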
Example 17
def calcSummaryStats(Dslice, SS, LP, DataAtomType='doc', **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    if 'resp' in LP:
        N = LP['resp'].shape[0]
        K = LP['resp'].shape[1]
        if LP['resp'].ndim == 2:
            CompDims = ('K', )  # typical case
        else:
            assert LP['resp'].ndim == 3
            CompDims = ('K', 'K')  # relational data
    else:
        assert 'spR' in LP
        N, K = LP['spR'].shape
        CompDims = ('K', )

    if SS is None:
        SS = SuffStatBag(K=K, D=Dslice.dim)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', np.sum(LP['resp'], axis=0), dims=CompDims)
        else:
            SS.setField('N', LP['spR'].sum(axis=0), dims=CompDims)

    if hasattr(Dslice, 'X'):
        X = Dslice.X
        if 'resp' in LP:
            # Matrix-matrix product, result is K x D (or KxKxD if relational)
            CountON = np.tensordot(LP['resp'].T, X, axes=1)
            CountOFF = np.tensordot(LP['resp'].T, 1 - X, axes=1)
        else:
            CountON = LP['spR'].T * X
            CountOFF = LP['spR'].T * (1 - X)
    elif DataAtomType == 'doc' or Dslice.nDoc == N:
        X = Dslice.getSparseDocTypeBinaryMatrix()
        if 'resp' in LP:
            # Sparse matrix product
            CountON = LP['resp'].T * X
        else:
            CountON = (LP['spR'].T * X).toarray()
        CountOFF = SS.N[:, np.newaxis] - CountON
    else:
        CountON = np.zeros((SS.K, Dslice.vocab_size))
        CountOFF = np.zeros((SS.K, Dslice.vocab_size))
        for d in range(Dslice.nDoc):
            words_d = Dslice.word_id[
                Dslice.doc_range[d]:Dslice.doc_range[d + 1]]
            rstart_d = d * Dslice.vocab_size
            rstop_d = (d + 1) * Dslice.vocab_size
            if 'resp' in LP:
                Count_d = LP['resp'][rstart_d:rstop_d, :].T
            else:
                raise NotImplementedError("TODO")
            CountOFF += Count_d
            CountON[:, words_d] += Count_d[:, words_d]
            CountOFF[:, words_d] -= Count_d[:, words_d]
    SS.setField('Count1', CountON, dims=CompDims + ('D', ))
    SS.setField('Count0', CountOFF, dims=CompDims + ('D', ))
    return SS
Example 18
def calcSummaryStats(Data, LP,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    ''' Calculate summary statistics for given data slice and local params.

    Returns
    -------
    SS : SuffStatBag
    '''
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)

    resp = LP['resp']
    K = resp.shape[1]
    startLocIDs = Data.doc_range[:-1]
    StartStateCount = np.sum(resp[startLocIDs], axis=0)
    N = np.sum(resp, axis=0)

    if 'TransCount' in LP:
        TransStateCount = np.sum(LP['TransCount'], axis=0)
    else:
        respPair = LP['respPair']
        TransStateCount = np.sum(respPair, axis=0)

    SS = SuffStatBag(K=K, D=Data.dim, M=M)
    SS.setField('StartStateCount', StartStateCount, dims=('K'))
    SS.setField('TransStateCount', TransStateCount, dims=('K', 'K'))
    SS.setField('N', N, dims=('K'))
    SS.setField('nDoc', Data.nDoc, dims=None)

    if doPrecompEntropy or 'Htable' in LP:
        # Compute entropy terms!
        # 'Htable', 'Hstart' will both be in Mdict
        Mdict = calcELBO_NonlinearTerms(Data=Data,
                                        LP=LP, returnMemoizedDict=1)
        SS.setELBOTerm('Htable', Mdict['Htable'], dims=('K', 'K'))
        SS.setELBOTerm('Hstart', Mdict['Hstart'], dims=('K'))

    if doPrecompMergeEntropy:
        subHstart, subHtable = HMMUtil.PrecompMergeEntropy_SpecificPairs(
            LP, Data, mPairIDs)
        SS.setMergeTerm('Hstart', subHstart, dims=('M'))
        SS.setMergeTerm('Htable', subHtable, dims=('M', 2, 'K'))
        SS.mPairIDs = np.asarray(mPairIDs)

    if trackDocUsage:
        # Track how often topic appears in a seq. with mass > thresh.
        DocUsage = np.zeros(K)
        for n in range(Data.nDoc):
            start = Data.doc_range[n]
            stop = Data.doc_range[n + 1]
            DocUsage += np.sum(LP['resp'][start:stop], axis=0) > 0.01
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
    return SS
Example 19
def calcSummaryStats(Dslice,
                     LP=None,
                     alpha=None,
                     alphaEbeta=None,
                     doTrackTruncationGrowth=0,
                     doPrecompEntropy=0,
                     doPrecompMergeEntropy=0,
                     mergePairSelection=None,
                     mPairIDs=None,
                     trackDocUsage=0,
                     **kwargs):
    """ Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)

    Returns
    -------
    SS : SuffStatBag with K components
        Relevant fields
        * nDoc : scalar float
            Counts total documents available in provided data.
        * sumLogPi : 1D array, size K
            Entry k equals \sum_{d in docs} E[ \log \pi_{dk} ]
        * sumLogPiRem : scalar float
            Equals sum over docs of probability of inactive topics.

        Also has optional ELBO field when precompELBO is True
        * Hvec : 1D array, size K
            Vector of entropy contributions from each comp.
            Hvec[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
    """
    if mPairIDs is None:
        M = 0
    else:
        M = len(mPairIDs)
    K = LP['DocTopicCount'].shape[1]
    if 'digammaSumTheta' not in LP:
        digammaSumTheta = digamma(LP['theta'].sum(axis=1) + LP['thetaRem'])
        LP['digammaSumTheta'] = digammaSumTheta  # Used for merges

    if 'ElogPi' not in LP:
        LP['ElogPiRem'] = digamma(LP['thetaRem']) - LP['digammaSumTheta']
        LP['ElogPi'] = digamma(LP['theta']) - \
            LP['digammaSumTheta'][:, np.newaxis]

    SS = SuffStatBag(K=K, D=Dslice.dim, M=M)
    SS.setField('nDoc', Dslice.nDoc, dims=None)
    SS.setField('sumLogPi', np.sum(LP['ElogPi'], axis=0), dims='K')
    if 'ElogPiEmptyComp' in LP:
        sumLogPiEmptyComp = np.sum(LP['ElogPiEmptyComp']) - \
            np.sum(LP['ElogPiOrigComp'])
        SS.setField('sumLogPiEmptyComp', sumLogPiEmptyComp, dims=None)
    if doTrackTruncationGrowth:
        remvec = np.zeros(K)
        remvec[K - 1] = np.sum(LP['ElogPiRem'])
        SS.setField('sumLogPiRemVec', remvec, dims='K')
    else:
        SS.setField('sumLogPiRem', np.sum(LP['ElogPiRem']), dims=None)

    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(Data=Dslice,
                                        LP=LP,
                                        returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))
        SS.setELBOTerm('slackTheta', Mdict['slackTheta'], dims='K')
        SS.setELBOTerm('gammalnTheta', Mdict['gammalnTheta'], dims='K')
        if 'ElogPiEmptyComp' in LP:
            SS.setELBOTerm('slackThetaEmptyComp', Mdict['slackThetaEmptyComp'])
            SS.setELBOTerm('gammalnThetaEmptyComp',
                           Mdict['gammalnThetaEmptyComp'])
            SS.setELBOTerm('HrespEmptyComp', Mdict['HrespEmptyComp'])

        else:
            SS.setELBOTerm('gammalnSumTheta',
                           Mdict['gammalnSumTheta'],
                           dims=None)
            SS.setELBOTerm('slackThetaRem', Mdict['slackThetaRem'], dims=None)
            SS.setELBOTerm('gammalnThetaRem',
                           Mdict['gammalnThetaRem'].sum(),
                           dims=None)

    if doPrecompMergeEntropy:
        if mPairIDs is None:
            raise NotImplementedError("TODO: all pairs for merges")
        m_Hresp = calcHrespForSpecificMergePairs(LP, Dslice, mPairIDs)
        if m_Hresp is not None:
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))

        m_sumLogPi = np.zeros(M)
        m_gammalnTheta = np.zeros(M)
        m_slackTheta = np.zeros(M)
        for m, (kA, kB) in enumerate(mPairIDs):
            theta_vec = LP['theta'][:, kA] + LP['theta'][:, kB]
            ElogPi_vec = digamma(theta_vec) - LP['digammaSumTheta']
            m_gammalnTheta[m] = np.sum(gammaln(theta_vec))
            m_sumLogPi[m] = np.sum(ElogPi_vec)
            # slack = (Ndm - theta_dm) * E[log pi_dm]
            slack_vec = ElogPi_vec
            slack_vec *= -1 * (alphaEbeta[kA] + alphaEbeta[kB])
            m_slackTheta[m] = np.sum(slack_vec)
        SS.setMergeTerm('gammalnTheta', m_gammalnTheta, dims=('M'))
        SS.setMergeTerm('sumLogPi', m_sumLogPi, dims=('M'))
        SS.setMergeTerm('slackTheta', m_slackTheta, dims=('M'))

        # Uncomment this for verification of merge calculations.
        # for (kA, kB) in mPairIDs:
        #      self.verifySSForMergePair(Data, SS, LP, kA, kB)
        # .... end merge computations

    # Selection terms (computes doc-topic correlation)
    if mergePairSelection is not None:
        if mergePairSelection.count('corr') > 0:
            Tmat = LP['DocTopicCount']
            SS.setSelectionTerm('DocTopicPairMat',
                                np.dot(Tmat.T, Tmat),
                                dims=('K', 'K'))
            SS.setSelectionTerm('DocTopicSum', np.sum(Tmat, axis=0), dims='K')

    if trackDocUsage:
        # Track num of times a topic appears nontrivially in a doc
        DocUsage = np.sum(LP['DocTopicCount'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', DocUsage, dims='K')
        Pi = LP['theta'] / LP['theta'].sum(axis=1)[:, np.newaxis]
        SumPi = np.sum(Pi, axis=0)
        SS.setSelectionTerm('SumPi', SumPi, dims='K')
    return SS
Example 20
def calcSummaryStats(Dslice,
                     LP=None,
                     alpha=None,
                     doPrecompEntropy=False,
                     cslice=(0, None),
                     **kwargs):
    """ Calculate summary from local parameters for given data slice.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
        doPrecompEntropy : boolean flag
            indicates whether to precompute ELBO terms in advance
            used for memoized learning algorithms (moVB)

    Returns
    -------
    SS : SuffStatBag with K components
        * nDoc : scalar float
            Counts total documents available in provided data.

        Also has optional ELBO field when precompELBO is True
        * Hvec : 1D array, size K
            Vector of entropy contributions from each comp.
            Hvec[k] = \sum_{n=1}^N H[q(z_n)], a function of 'resp'
    """
    K = LP['DocTopicCount'].shape[1]
    SS = SuffStatBag(K=K, D=Dslice.dim)

    if cslice[1] is None:
        SS.setField('nDoc', Dslice.nDoc, dims=None)
    else:
        SS.setField('nDoc', cslice[1] - cslice[0], dims=None)

    if doPrecompEntropy:
        assert 'theta' in LP
        Lalloc = L_alloc(Dslice, LP, alpha=alpha)
        SS.setELBOTerm('L_alloc', Lalloc, dims=None)

        if 'nnzPerRow' in LP and LP['nnzPerRow'] == 1:
            SS.setELBOTerm('Hvec', 0.0, dims=None)
        else:
            Hvec = L_entropy(Dslice, LP, returnVector=1)
            SS.setELBOTerm('Hvec', Hvec, dims='K')
    return SS
Example 21
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Returns
    --------
    SS : SuffStatBag object, with K components.
    '''
    if not hasattr(Data, 'X_NE'):
        Data.X_NE = np.hstack([Data.X, np.ones(Data.nObs)[:, np.newaxis]])

    Y_N = Data.Y
    X_NE = Data.X_NE
    E = X_NE.shape[1]

    if 'resp' in LP:
        # Dense responsibility calculations
        resp = LP['resp']
        K = resp.shape[1]
        S_yy_K = dotATB(resp, np.square(Y_N)).flatten()
        S_yx_KE = dotATB(resp, Y_N * X_NE)

        # Expected outer product
        S_xxT_KEE = np.zeros((K, E, E))
        sqrtResp_k_N = np.sqrt(resp[:, 0])
        sqrtR_X_k_NE = sqrtResp_k_N[:, np.newaxis] * X_NE
        S_xxT_KEE[0] = dotATA(sqrtR_X_k_NE)
        for k in range(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k_N)
            np.multiply(sqrtResp_k_N[:, np.newaxis], X_NE, out=sqrtR_X_k_NE)
            S_xxT_KEE[k] = dotATA(sqrtR_X_k_NE)
    else:
        raise ValueError("TODO")
        spR = LP['spR']
        K = spR.shape[1]

    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim, E=E)
    elif not hasattr(SS, 'E'):
        SS._Fields.E = E
    SS.setField('xxT_KEE', S_xxT_KEE, dims=('K', 'E', 'E'))
    SS.setField('yx_KE', S_yx_KE, dims=('K', 'E'))
    SS.setField('yy_K', S_yy_K, dims=('K'))
    # Expected count for each k
    # Usually computed by allocmodel. But just in case...
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')

    #SS.setField("N_K", SS.N, dims="K")
    return SS
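
A plain-NumPy sketch (toy data; X_NE carries an appended bias column, so E = D + 1) checking the regression statistics computed above against their definitions.

import numpy as np

rng = np.random.RandomState(0)
N, D, K = 25, 2, 3
X = rng.randn(N, D)
Y_N = rng.randn(N, 1)
X_NE = np.hstack([X, np.ones((N, 1))])     # bias column appended, E = D + 1
resp = rng.dirichlet(np.ones(K), size=N)

S_yy_K = (resp * np.square(Y_N)).sum(axis=0)   # dotATB(resp, Y^2), size K
S_yx_KE = resp.T @ (Y_N * X_NE)                # K x E
S_xxT_KEE = np.einsum('nk,ne,nf->kef', resp, X_NE, X_NE)

assert S_xxT_KEE.shape == (K, D + 1, D + 1)
assert np.allclose(S_yy_K.sum(), np.square(Y_N).sum())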
Example 22
def calcSummaryStats(Data,
                     LP,
                     doPrecompEntropy=False,
                     doPrecompMergeEntropy=False,
                     mPairIDs=None,
                     mergePairSelection=None,
                     trackDocUsage=False,
                     **kwargs):
    ''' Calculate sufficient statistics for global updates.

    Parameters
    -------
    Data : bnpy data object
    LP : local param dict with fields
        resp : Data.nObs x K array,
            where resp[n,k] = posterior resp of comp k
    doPrecompEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        used for memoized learning algorithms (moVB)
    doPrecompMergeEntropy : boolean flag
        indicates whether to precompute ELBO terms in advance
        for certain merge candidates.

    Returns
    -------
    SS : SuffStatBag with K components
        Summarizes for this mixture model, with fields
        * N : 1D array, size K
            N[k] = expected number of items assigned to comp k

        Also has optional ELBO field when precompELBO is True
        * ElogqZ : 1D array, size K
            Vector of entropy contributions from each comp.
            ElogqZ[k] = \sum_{n=1}^N resp[n,k] log resp[n,k]

        Also has optional Merge field when precompMergeELBO is True
        * ElogqZ : 2D array, size K x K
            Each term is scalar entropy of merge candidate
    '''
    if mPairIDs is not None and len(mPairIDs) > 0:
        M = len(mPairIDs)
    else:
        M = 0
    if 'resp' in LP:
        Nvec = np.sum(LP['resp'], axis=0)
        K = Nvec.size
    else:
        # Sparse assignment case
        Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        K = LP['spR'].shape[1]

    if hasattr(Data, 'dim'):
        SS = SuffStatBag(K=K, D=Data.dim, M=M)
    else:
        SS = SuffStatBag(K=K, D=Data.vocab_size, M=M)
    SS.setField('N', Nvec, dims=('K'))
    if doPrecompEntropy:
        Mdict = calcELBO_NonlinearTerms(LP=LP, returnMemoizedDict=1)
        if type(Mdict['Hresp']) == float:
            # SPARSE HARD ASSIGNMENTS
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=None)
        else:
            SS.setELBOTerm('Hresp', Mdict['Hresp'], dims=('K', ))

    if doPrecompMergeEntropy:
        m_Hresp = None
        if 'resp' in LP:
            m_Hresp = -1 * NumericUtil.calcRlogR_specificpairs(
                LP['resp'], mPairIDs)
        elif 'spR' in LP:
            if LP['nnzPerRow'] > 1:
                m_Hresp = calcSparseMergeRlogR(spR_csr=LP['spR'],
                                               nnzPerRow=LP['nnzPerRow'],
                                               mPairIDs=mPairIDs)
        else:
            raise ValueError("Need resp or spR in LP")
        if m_Hresp is not None:
            assert m_Hresp.size == len(mPairIDs)
            SS.setMergeTerm('Hresp', m_Hresp, dims=('M'))
    if trackDocUsage:
        Usage = np.sum(LP['resp'] > 0.01, axis=0)
        SS.setSelectionTerm('DocUsageCount', Usage, dims='K')

    return SS
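
A toy illustration of the cached entropy term above: Hresp[k] = -sum_n resp[n, k] * log resp[n, k] is nonnegative per component and collapses to zero for hard one-hot assignments, which is why the sparse hard-assignment case can store a plain scalar instead of a length-K vector.

import numpy as np

rng = np.random.RandomState(0)
N, K = 100, 4
resp = rng.dirichlet(np.ones(K), size=N)

Hresp = -1 * np.sum(resp * np.log(resp), axis=0)   # size K, all >= 0
assert np.all(Hresp >= 0)

hard = np.zeros((N, K))
hard[np.arange(N), rng.randint(K, size=N)] = 1.0   # one-hot rows
with np.errstate(divide='ignore', invalid='ignore'):
    H_hard = -np.nansum(hard * np.log(hard), axis=0)
assert np.allclose(H_hard, 0.0)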