Beispiel #1
0
    def step(self, d, w):
        """Resample the topic of a single token using three weight buckets.

        The token at position ``w`` of document ``d`` is removed through
        ``_remove_word``, a uniform variate scaled by ``_s + _r + q`` picks one
        of the buckets (labelled uniform, document and word below), a topic is
        drawn inside that bucket, and the token is put back under the drawn
        topic through ``_insert_word``.

        Parameters
        ----------
        d : int
            index of the document in ``self._docs``
        w : int
            position of the token inside ``self._docs[d]['w']``
        """
        word = self._docs[d]['w'][w]
        self._remove_word(d, w)

        # Total weight of the word bucket, accumulated over the topics listed
        # for this word.
        q = sum(self._q_cache[d, k]*self._n_kw[k, word]
                for k in self._word_topics[word])

        u = random.random()*(self._s + self._r + q)

        if u < self._s:
            # uniform: every topic is a candidate, weighted by 1/_cnst
            t = discrete_draw(-np.log(self._cnst), logp=True)
        else:
            if u < self._s + self._r:
                # document: candidates are the topics listed for document d
                ks = list(self._doc_topics[d])
                logps = np.array([
                    log(self._n_dk[d, k]) + log(self._beta/self._cnst[k])
                    for k in ks])
            else:
                # word: candidates are the topics listed for this word
                ks = list(self._word_topics[word])
                logps = np.array([
                    log(self._n_kw[k, word]) + log(self._q_cache[d, k])
                    for k in ks])

            # discrete_draw yields a position in ``ks``, not a topic id
            t = ks[discrete_draw(logps, logp=True)]

        self._insert_word(d, w, t)
def test_discrete_draw_exp_uniform():
    """Draw from two equally weighted outcomes given as plain probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.full(2, .5)
    draws = utils.discrete_draw(probs, n=N_DRAWS)
    expected_counts = probs*N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
Beispiel #3
0
def test_discrete_draw_log_uniform():
    """Draw from two equally weighted outcomes given as log-probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.full(2, .5)
    log_probs = np.log(probs)
    draws = utils.discrete_draw(log_probs, n=N_DRAWS, logp=True)
    expected_counts = probs * N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
Beispiel #4
0
def test_discrete_draw_exp_point():
    """Draw from a distribution with all mass on index 1.

    Checks the number of draws, the absence of NaNs, and that index 0 (which
    has probability zero) is never drawn.
    """
    probs = np.array([0., 1.])
    draws = utils.discrete_draw(probs, n=N_DRAWS)

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert np.all(draws != 0)
Beispiel #5
0
def test_discrete_draw_exp_nonuniform():
    """Draw from a 20/80 split given as plain probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.array([.2, .8])
    draws = utils.discrete_draw(probs, n=N_DRAWS)
    expected_counts = probs * N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
Beispiel #6
0
def test_discrete_draw_exp_uniform():
    """Draw from two equally weighted outcomes given as plain probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.full(2, .5)
    draws = utils.discrete_draw(probs, n=N_DRAWS)
    expected_counts = probs * N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
def _forward_sample(ldactr, n_docs, n_topics, n_words, n_words_per_doc, alpha,
                    beta):
    """Sample topic assignments first, then words conditioned on them.

    For every document a topic distribution is obtained from
    ``dirichlet.rvs([alpha] * n_topics)`` and one topic is drawn per word slot
    with ``discrete_draw``. The words are then generated by
    ``_geweke_draw_docs`` called with ``init=True`` and an all-zero
    topic/word count table.

    ``ldactr`` is accepted but not used by this function.

    Returns
    -------
    docs, z, phi
        ``docs`` and ``phi`` as returned by ``_geweke_draw_docs``; ``z`` is
        the flat list of all topic assignments in document order.
    """
    # Placeholder documents: every word slot starts as word 0 and is
    # overwritten by _geweke_draw_docs.
    docs = [{'w': [0] * n_words_per_doc} for _ in range(n_docs)]
    topic_counts = np.zeros((n_docs, n_topics))
    word_counts = np.zeros((n_topics, n_words))

    asgn = []
    z = []
    for d in range(n_docs):
        theta_d = dirichlet.rvs([alpha] * n_topics)[0]
        doc_topics = [discrete_draw(theta_d) for _ in range(n_words_per_doc)]
        for k in doc_topics:
            topic_counts[d, k] += 1
        asgn.append(doc_topics)
        z.extend(doc_topics)

    docs, phi = _geweke_draw_docs(docs, asgn, topic_counts, word_counts,
                                  alpha, beta, init=True)
    return docs, z, phi
def test_discrete_draw_exp_nonuniform():
    """Draw from a 20/80 split given as plain probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.array([.2, .8])
    draws = utils.discrete_draw(probs, n=N_DRAWS)
    expected_counts = probs*N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
def test_discrete_draw_exp_point():
    """Draw from a distribution with all mass on index 1.

    Checks the number of draws, the absence of NaNs, and that index 0 (which
    has probability zero) is never drawn.
    """
    probs = np.array([0., 1.])
    draws = utils.discrete_draw(probs, n=N_DRAWS)

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert np.all(draws != 0)
def test_discrete_draw_log_uniform():
    """Draw from two equally weighted outcomes given as log-probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.full(2, .5)
    log_probs = np.log(probs)
    draws = utils.discrete_draw(log_probs, n=N_DRAWS, logp=True)
    expected_counts = probs*N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
def test_discrete_draw_log_nonuniform():
    """Draw from a 20/80 split given as log-probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.array([.2, .8])
    log_probs = np.log(probs)
    draws = utils.discrete_draw(log_probs, n=N_DRAWS, logp=True)
    expected_counts = probs*N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
Beispiel #12
0
def test_discrete_draw_log_nonuniform():
    """Draw from a 20/80 split given as log-probabilities.

    Checks the number of draws, the absence of NaNs, and that ``binerr``
    against the expected counts stays below 0.05.
    """
    probs = np.array([.2, .8])
    log_probs = np.log(probs)
    draws = utils.discrete_draw(log_probs, n=N_DRAWS, logp=True)
    expected_counts = probs * N_DRAWS

    assert len(draws) == N_DRAWS
    assert not np.isnan(draws).any()
    assert binerr(draws, expected_counts) < .05
Beispiel #13
0
def test_discrete_draw_log_point():
    """Draw in log space from a distribution with all mass on index 1.

    Checks the number of draws, that index 0 (log-probability ``-inf``) is
    never drawn, and the absence of NaNs.
    """
    n_samples = 1000
    u = np.array([0, 1])
    # log(0) = -inf is the intended input here; silence the divide-by-zero
    # RuntimeWarning so the test also passes when warnings are errors.
    with np.errstate(divide='ignore'):
        log_u = np.log(u)
    x = utils.discrete_draw(log_u, n=n_samples, logp=True)

    assert len(x) == n_samples
    assert not np.any(x == 0)
    assert not np.any(np.isnan(x))
def test_discrete_draw_log_point():
    """Draw in log space from a distribution with all mass on index 1.

    Checks the number of draws, that index 0 (log-probability ``-inf``) is
    never drawn, and the absence of NaNs.
    """
    n_samples = 1000
    u = np.array([0, 1])
    # log(0) = -inf is the intended input here; silence the divide-by-zero
    # RuntimeWarning so the test also passes when warnings are errors.
    with np.errstate(divide='ignore'):
        log_u = np.log(u)
    x = utils.discrete_draw(log_u, n=n_samples, logp=True)

    assert len(x) == n_samples
    assert not np.any(x == 0)
    assert not np.any(np.isnan(x))
Beispiel #15
0
    def __init__(self, docs, n_topics, n_words, alpha=1.0, beta=1.0,
                 init_mode='prior', seed=None):
        """
        Build the count tables and draw an initial topic for every token.

        Parameters
        ----------
        docs : list<dict>
            list of document data structures; each document keeps its words
            under the key ``'w'`` as indices into the vocabulary
        n_topics : int
            number of topics
        n_words : int
            number of words in corpus (vocabulary)
        alpha : float (0, Inf), optional
            symmetric Dirichlet parameter; with ``init_mode='prior'`` it is
            the concentration of the per-document topic distribution from
            which the initial topics are drawn
        beta : float (0, Inf), optional
            symmetric Dirichlet parameter; only stored by this constructor
            (presumably the prior on the per-topic word distribution --
            confirm against the sampler)
        init_mode : {'prior', 'random'}, optional
            'prior' draws each document's topic distribution from the
            Dirichlet prior; 'random' uses equal weight for every topic
        seed : optional
            accepted but not used by this constructor

        Raises
        ------
        ValueError
            if `init_mode` is neither 'prior' nor 'random'
        """
        # Validate once, up front: a bad mode is reported even for an empty
        # corpus and before any state has been built.
        if init_mode not in ('prior', 'random'):
            raise ValueError("init_mode must be 'random' or 'prior'")

        self._docs = docs
        self._n_docs = len(docs)
        self._n_topics = n_topics
        self._n_words = n_words
        self._alpha = alpha
        self._beta = beta

        # number of words assigned to topic k in doc d
        self._n_dk = np.zeros((self._n_docs, self._n_topics,))
        # number of times word w is assigned to topic k
        self._n_kw = np.zeros((self._n_topics, self._n_words,))
        # number of times any word is assigned to topic k
        self._n_k = np.zeros((1, self._n_topics,))
        # Entry z[d][w] is the topic to which the w^th word in document d is
        # assigned
        self._z = []
        # One (document index, word index, position in document) triple per
        # token, in corpus order
        self._key = []

        for d, doc in enumerate(self._docs):
            self._z.append([])
            if init_mode == 'prior':
                theta_k = dirichlet.rvs([self._alpha]*self._n_topics)[0]
            else:
                theta_k = np.ones(self._n_topics)/self._n_topics
            for w, wrd in enumerate(doc['w']):
                topic = int(discrete_draw(theta_k))
                self._z[d].append(topic)

                self._n_dk[d, topic] += 1.0
                self._n_kw[topic, wrd] += 1.0
                self._n_k[0, topic] += 1.0
                self._key.append((d, wrd, w,))
Beispiel #16
0
    def step(self, d, w):
        """Resample the topic of a single token using three weight buckets.

        The token at position ``w`` of document ``d`` is removed through
        ``_remove_word``, a uniform variate scaled by ``_s + _r + q`` picks one
        of the buckets (labelled uniform, document and word below), a topic is
        drawn inside that bucket, and the token is put back under the drawn
        topic through ``_insert_word``.

        Parameters
        ----------
        d : int
            index of the document in ``self._docs``
        w : int
            position of the token inside ``self._docs[d]['w']``
        """
        word = self._docs[d]['w'][w]
        self._remove_word(d, w)

        # Total weight of the word bucket, accumulated over the topics listed
        # for this word.
        q = sum(self._q_cache[d, k] * self._n_kw[k, word]
                for k in self._word_topics[word])

        u = random.random() * (self._s + self._r + q)

        if u < self._s:
            # uniform: every topic is a candidate, weighted by 1/_cnst
            t = discrete_draw(-np.log(self._cnst), logp=True)
        else:
            if u < self._s + self._r:
                # document: candidates are the topics listed for document d
                ks = list(self._doc_topics[d])
                logps = np.array([
                    log(self._n_dk[d, k]) + log(self._beta / self._cnst[k])
                    for k in ks
                ])
            else:
                # word: candidates are the topics listed for this word
                ks = list(self._word_topics[word])
                logps = np.array([
                    log(self._n_kw[k, word]) + log(self._q_cache[d, k])
                    for k in ks
                ])

            # discrete_draw yields a position in ``ks``, not a topic id
            t = ks[discrete_draw(logps, logp=True)]

        self._insert_word(d, w, t)
Beispiel #17
0
    def step(self):
        """Run one sweep that resamples the topic of every token in ``_key``.

        For each ``(d, word, w)`` entry the token's current topic is taken out
        of the three count tables, ``log_conditional`` is evaluated for every
        topic, a new topic is drawn from those log-weights with
        ``discrete_draw`` and the counts are restored under the new topic.
        """
        def shift_counts(doc_idx, topic_idx, word_idx, delta):
            # Apply the same increment to all three count tables.
            self._n_dk[doc_idx, topic_idx] += delta
            self._n_kw[topic_idx, word_idx] += delta
            self._n_k[0, topic_idx] += delta

        for d, word, w in self._key:
            # Exclude the token itself before evaluating the conditionals.
            shift_counts(d, self._z[d][w], word, -1.0)

            logp_k = np.zeros(self._n_topics)
            for k in range(self._n_topics):
                logp_k[k] = self.log_conditional(d, k, word)

            new_topic = discrete_draw(logp_k, logp=True)
            self._z[d][w] = new_topic
            shift_counts(d, new_topic, word, 1.0)
Beispiel #18
0
    def step(self):
        """Run one sweep that resamples the topic of every token in ``_key``.

        For each ``(d, word, w)`` entry the token's current topic is taken out
        of the three count tables, ``log_conditional`` is evaluated for every
        topic, a new topic is drawn from those log-weights with
        ``discrete_draw`` and the counts are restored under the new topic.
        """
        def shift_counts(doc_idx, topic_idx, word_idx, delta):
            # Apply the same increment to all three count tables.
            self._n_dk[doc_idx, topic_idx] += delta
            self._n_kw[topic_idx, word_idx] += delta
            self._n_k[0, topic_idx] += delta

        for d, word, w in self._key:
            # Exclude the token itself before evaluating the conditionals.
            shift_counts(d, self._z[d][w], word, -1.0)

            logp_k = np.zeros(self._n_topics)
            for k in range(self._n_topics):
                logp_k[k] = self.log_conditional(d, k, word)

            new_topic = discrete_draw(logp_k, logp=True)
            self._z[d][w] = new_topic
            shift_counts(d, new_topic, word, 1.0)
def _forward_sample(ldactr, n_docs, n_topics, n_words, n_words_per_doc, alpha,
                    beta):
    """Sample topic assignments first, then words conditioned on them.

    For every document a topic distribution is obtained from
    ``dirichlet.rvs([alpha]*n_topics)`` and one topic is drawn per word slot
    with ``discrete_draw``. The words are then generated by
    ``_geweke_draw_docs`` called with ``init=True`` and an all-zero
    topic/word count table.

    ``ldactr`` is accepted but not used by this function.

    Returns
    -------
    docs, z, phi
        ``docs`` and ``phi`` as returned by ``_geweke_draw_docs``; ``z`` is
        the flat list of all topic assignments in document order.
    """
    # Placeholder documents: every word slot starts as word 0 and is
    # overwritten by _geweke_draw_docs.
    docs = [{'w': [0]*n_words_per_doc} for _ in range(n_docs)]
    topic_counts = np.zeros((n_docs, n_topics))
    word_counts = np.zeros((n_topics, n_words))

    asgn = []
    z = []
    for d in range(n_docs):
        theta_d = dirichlet.rvs([alpha]*n_topics)[0]
        doc_topics = [discrete_draw(theta_d) for _ in range(n_words_per_doc)]
        for k in doc_topics:
            topic_counts[d, k] += 1
        asgn.append(doc_topics)
        z.extend(doc_topics)

    docs, phi = _geweke_draw_docs(docs, asgn, topic_counts, word_counts,
                                  alpha, beta, init=True)
    return docs, z, phi
def _geweke_draw_docs(docs, asgn, n_dk, n_kw, alpha, beta, init=False):
    """Redraw every word of every document given its topic assignment.

    Parameters
    ----------
    docs : list<dict>
        documents; the word list under ``'w'`` is overwritten in place
    asgn : list<list>
        ``asgn[d][w]`` is the topic of the w-th word of document d
    n_dk, n_kw : array-like
        document/topic and topic/word count tables; both are copied, so the
        caller's tables are not modified
    alpha, beta : float
        smoothing terms added to the document/topic and topic/word counts
    init : bool, optional
        when True the current word is not subtracted from ``n_kw`` before
        the redraw

    Returns
    -------
    docs, phi
        the updated documents and the row-normalised ``n_kw + beta``
    """
    n_dk = np.array(n_dk)
    n_kw = np.array(n_kw)

    n_topics, n_words = len(n_kw), len(n_kw[0])

    # words per document
    n_d = n_dk.sum(axis=1)
    assert len(n_d) == len(docs)

    for d, doc in enumerate(docs):
        for w, wrd in enumerate(doc['w']):
            tpc = asgn[d][w]

            # Take the token out of the counts before scoring candidates.
            n_dk[d, tpc] -= 1
            n_d[d] -= 1
            if not init:
                n_kw[tpc, wrd] -= 1

            # One log-weight per candidate word; only the first term depends
            # on the candidate.
            logps = np.array([
                log(n_kw[tpc, widx] + beta)
                + log(n_dk[d, tpc] + alpha)
                - log(n_d[d] + alpha*n_topics)
                for widx in range(n_words)])

            new_wrd = discrete_draw(logps, logp=True)
            doc['w'][w] = new_wrd

            # Put the token back, now counted under the new word.
            n_dk[d, tpc] += 1
            n_kw[tpc, new_wrd] += 1
            n_d[d] += 1

    assert np.sum(n_dk) == np.sum(n_kw)
    assert np.sum(n_d) == np.sum(n_dk)

    phi = n_kw + beta
    phi /= phi.sum(axis=1, keepdims=True)

    return docs, phi
def _geweke_draw_docs(docs, asgn, n_dk, n_kw, alpha, beta, init=False):
    """Redraw every word of every document given its topic assignment.

    Parameters
    ----------
    docs : list<dict>
        documents; the word list under ``'w'`` is overwritten in place
    asgn : list<list>
        ``asgn[d][w]`` is the topic of the w-th word of document d
    n_dk, n_kw : array-like
        document/topic and topic/word count tables; both are copied, so the
        caller's tables are not modified
    alpha, beta : float
        smoothing terms added to the document/topic and topic/word counts
    init : bool, optional
        when True the current word is not subtracted from ``n_kw`` before
        the redraw

    Returns
    -------
    docs, phi
        the updated documents and the row-normalised ``n_kw + beta``
    """
    n_dk = np.array(n_dk)
    n_kw = np.array(n_kw)

    n_topics, n_words = len(n_kw), len(n_kw[0])

    # words per document
    n_d = n_dk.sum(axis=1)
    assert len(n_d) == len(docs)

    for d, doc in enumerate(docs):
        for w, wrd in enumerate(doc['w']):
            tpc = asgn[d][w]

            # Take the token out of the counts before scoring candidates.
            n_dk[d, tpc] -= 1
            n_d[d] -= 1
            if not init:
                n_kw[tpc, wrd] -= 1

            # One log-weight per candidate word; only the first term depends
            # on the candidate.
            logps = np.array([
                log(n_kw[tpc, widx] + beta)
                + log(n_dk[d, tpc] + alpha)
                - log(n_d[d] + alpha * n_topics)
                for widx in range(n_words)
            ])

            new_wrd = discrete_draw(logps, logp=True)
            doc['w'][w] = new_wrd

            # Put the token back, now counted under the new word.
            n_dk[d, tpc] += 1
            n_kw[tpc, new_wrd] += 1
            n_d[d] += 1

    assert np.sum(n_dk) == np.sum(n_kw)
    assert np.sum(n_d) == np.sum(n_dk)

    phi = n_kw + beta
    phi /= phi.sum(axis=1, keepdims=True)

    return docs, phi
Beispiel #22
0
    def __init__(self,
                 docs,
                 n_topics,
                 n_words,
                 alpha=1.0,
                 beta=1.0,
                 init_mode='prior',
                 seed=None):
        """
        Build the count tables and draw an initial topic for every token.

        Parameters
        ----------
        docs : list<dict>
            list of document data structures; each document keeps its words
            under the key ``'w'`` as indices into the vocabulary
        n_topics : int
            number of topics
        n_words : int
            number of words in corpus (vocabulary)
        alpha : float (0, Inf), optional
            symmetric Dirichlet parameter; with ``init_mode='prior'`` it is
            the concentration of the per-document topic distribution from
            which the initial topics are drawn
        beta : float (0, Inf), optional
            symmetric Dirichlet parameter; only stored by this constructor
            (presumably the prior on the per-topic word distribution --
            confirm against the sampler)
        init_mode : {'prior', 'random'}, optional
            'prior' draws each document's topic distribution from the
            Dirichlet prior; 'random' uses equal weight for every topic
        seed : optional
            accepted but not used by this constructor

        Raises
        ------
        ValueError
            if `init_mode` is neither 'prior' nor 'random'
        """
        # Validate once, up front: a bad mode is reported even for an empty
        # corpus and before any state has been built.
        if init_mode not in ('prior', 'random'):
            raise ValueError("init_mode must be 'random' or 'prior'")

        self._docs = docs
        self._n_docs = len(docs)
        self._n_topics = n_topics
        self._n_words = n_words
        self._alpha = alpha
        self._beta = beta

        # number of words assigned to topic k in doc d
        self._n_dk = np.zeros((
            self._n_docs,
            self._n_topics,
        ))
        # number of times word w is assigned to topic k
        self._n_kw = np.zeros((
            self._n_topics,
            self._n_words,
        ))
        # number of times any word is assigned to topic k
        self._n_k = np.zeros((
            1,
            self._n_topics,
        ))
        # Entry z[d][w] is the topic to which the w^th word in document d is
        # assigned
        self._z = []
        # One (document index, word index, position in document) triple per
        # token, in corpus order
        self._key = []

        for d, doc in enumerate(self._docs):
            self._z.append([])
            if init_mode == 'prior':
                theta_k = dirichlet.rvs([self._alpha] * self._n_topics)[0]
            else:
                theta_k = np.ones(self._n_topics) / self._n_topics
            for w, wrd in enumerate(doc['w']):
                topic = int(discrete_draw(theta_k))
                self._z[d].append(topic)

                self._n_dk[d, topic] += 1.0
                self._n_kw[topic, wrd] += 1.0
                self._n_k[0, topic] += 1.0
                self._key.append((
                    d,
                    wrd,
                    w,
                ))