Example 1
def generate_docs_with_hlda(num_docs, words_per_doc, vocab_size,
                            topic_to_word_beta, topic_dist_m, topic_dist_pi,
                            new_child_gamma):
    params = {}
    # Symmetric Dirichlet prior over the vocabulary for each topic.
    params["topic_to_word_param"] = [topic_to_word_beta] * vocab_size
    # Document lengths are Poisson-distributed around words_per_doc.
    params["words_per_doc_distribution"] = lambda: util.poisson(words_per_doc)
    # Reparameterize the Beta prior by mean (m) and concentration (pi):
    # Beta(m * pi, (1 - m) * pi) has mean m.  beta(...) is assumed to be a
    # Beta-distribution sampler (e.g. numpy.random.beta).
    pta = topic_dist_m * topic_dist_pi
    ptb = topic_dist_pi - pta
    params["parent_topic_bias_sample"] = lambda: beta(pta, ptb)
    params["new_child_gamma"] = new_child_gamma
    topic_root = Topic_node(params)
    documents, topic_stay_probs, topic_paths, topics, levels = \
        zip(*[generate_one_doc_with_hlda(topic_root, params)
              for _ in range(num_docs)])
    return documents, topic_root, topic_stay_probs, topic_paths, topics, levels
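The parent_topic_bias_sample lambda relies on the mean/concentration parameterization of the Beta distribution: for mean m and concentration pi, Beta(m*pi, (1 - m)*pi) has mean m. A minimal self-contained check with NumPy, standing in for whatever sampler beta refers to in the original module:

import numpy as np

def parent_topic_bias_sample(m, pi, rng=np.random.default_rng(0)):
    """Draw from Beta(m * pi, (1 - m) * pi), whose mean is m."""
    pta = m * pi       # matches pta in the example above
    ptb = pi - pta     # i.e. (1 - m) * pi
    return rng.beta(pta, ptb)

# The empirical mean approaches m = 0.3 as the number of draws grows.
draws = [parent_topic_bias_sample(0.3, 10.0) for _ in range(10_000)]
print(sum(draws) / len(draws))  # ~0.3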
Example 2
def generate(topics, words, words_per_doc):
    num_docs = len(topics)
    # Precompute an inverse-CDF table for each topic's word distribution.
    word_cdfs = [util.get_cdf(topic) for topic in words]

    docs = []
    doc_topics = []
    for i in range(num_docs):
        if i % 100 == 0:
            print("reached document", i)
        # Document length is Poisson-distributed around words_per_doc.
        num_words = util.poisson(words_per_doc)
        topic_dist = topics[i]
        topic_cdf = util.get_cdf(topic_dist)

        doc = []
        word_topics = []
        for _ in range(num_words):
            # Draw a topic for this position, then a word from that topic.
            topic = util.sample(topic_cdf)
            doc.append(util.sample(word_cdfs[topic]))
            word_topics.append(topic)
        docs.append(doc)
        doc_topics.append(word_topics)

    return docs, doc_topics
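The util module these examples depend on is not shown. Under the assumption that get_cdf precomputes cumulative sums and sample inverts them, minimal stand-ins might look like this (the names and signatures are guesses at the project's API, not its actual code):

import bisect
import numpy as np

_rng = np.random.default_rng()

def get_cdf(dist):
    """Cumulative sums of a discrete probability vector."""
    return np.cumsum(dist)

def sample(cdf):
    """Inverse-CDF sampling: first index whose cumulative sum covers u."""
    u = _rng.random()
    return min(bisect.bisect_left(cdf, u), len(cdf) - 1)

def poisson(lam):
    """Poisson draw, used here for document lengths."""
    return int(_rng.poisson(lam))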
Example 3
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=None, beta=None, noise=-1, plsi=False, ctm=False, 
                  pareto=False):
    """Generates documents according to plsi, ctm, or lda
    
    Args:
        num_topics: 
            the number of underlying latent topics
        num_docs: 
            the number of documents to generate
        words_per_doc:
            parameter to a Poisson distribution;
            determines the average number of words in a document
        vocab_size: 
            the number of words in the vocabulary
        DISTRIBUTION PARAMETERS
        ---------------------
        depending on which model is selected, alpha and beta parameterize
        different distributions

        LDA: assumes symmetric Dirichlet distributions (i.e. all elements in
        the parameter vector have the same value)
            alpha:
                parameter to the Dirichlet distribution over topics
            beta:
                parameter to the Dirichlet distribution over words

        PLSI:
            alpha:
                parameter to a Poisson distribution that determines the
                number of topics per document (each chosen topic has uniform
                probability; all other topics have probability 0)
            beta:
                as alpha, but the Poisson distribution instead controls the
                number of words per topic (each chosen word has uniform
                probability; all other words have probability 0)
        ---------------------
        noise:
            given as a probability; each word is replaced with a random word
            with probability noise
        plsi:
            flag to draw distributions according to PLSI (i.e. uniform
            distributions over randomly chosen subsets of topics and words)
        ctm:
            flag to draw distributions according to CTM (i.e. topic
            proportions drawn from a logistic-normal: a multivariate Gaussian
            passed through a softmax)
        pareto:
            flag to make the Dirichlet parameters Pareto-like (i.e. set each
            alpha_i = alpha / i)
            
    Returns:
        docs:
            the list of documents, each a list of words (represented by their
            indices in range(vocab_size))
        topics:
            a list of documents, each a list of topics (represented by their
            indices in range(num_topics))
        word_dist:
            the distribution over words for each topic; 
            each row is the distribution for a different topic 
        topics_dist:
            the distribution over topics for each document;
            each row is the distribution for a different document
    """
    # @TODO: integrate ctm parameters (i.e. mu and sigma) into alpha and beta
    # NOTE: assumes numpy as np plus the project's helpers (dirichlet, N,
    # rand, rsample, sample, get_cdf, util) are imported at module level.
    mu = np.zeros(num_topics)
    sigma = np.ones((num_topics, num_topics))

    if plsi and ctm:
        print("plsi and ctm flags cannot both be active (returning None)")
        return None
    
    if not plsi and not ctm:
        if pareto:
            # Pareto-style Dirichlet parameters: alpha_i = alpha / i.
            alpha = [alpha / i for i in range(1, num_topics + 1)]
            beta = [np.sqrt(beta / i) for i in range(1, vocab_size + 1)]
            #beta = [beta / i for i in range(1, vocab_size + 1)]
        else:
            # Symmetric Dirichlet parameters.
            alpha = [alpha] * num_topics
            beta = [beta] * vocab_size

    if plsi or ctm:
        # Each topic puts uniform mass on a Poisson-sized random subset of
        # "significant" words; all other words get probability 0.
        sig_words = [rsample(range(vocab_size), util.poisson(beta, vocab_size))
                     for t in range(num_topics)]
        word_dist = [np.zeros(vocab_size) for t in range(num_topics)]
        for i in range(num_topics):
            word_dist[i][sig_words[i]] = 1.0 / len(sig_words[i])
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    
    topic_cdfs = []
    docs = []
    topics = []
    topic_dists = []
    for i in range(num_docs):
        if i % 100 == 0:
            print("reached document", i)
        if plsi:
            sig_topics = rsample(range(num_topics), 
                                 util.poisson(alpha, num_topics))
            topic_dist = np.zeros(num_topics)
            topic_dist[sig_topics] = 1.0 / len(sig_topics)
        elif ctm:
            # Logistic-normal topic proportions: eta ~ N(mu, sigma), then
            # softmax.
            eta = N(mu, sigma)
            topic_dist = np.exp(eta) / np.sum(np.exp(eta))
        else:
            topic_dist = dirichlet(alpha)
        num_words = util.poisson(words_per_doc)
        doc = []
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        doc_topics = []
        for _ in range(num_words):
            if rand() < noise:
                # Noise word: drawn uniformly at random, topic marked -1.
                doc.append(rsample(range(vocab_size), 1))
                doc_topics.append(-1)
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
                doc_topics.append(topic)
        docs.append(doc)
        topics.append(doc_topics)
    return docs, topics, word_dist, topic_dists
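For comparison, the LDA branch of generate_docs can be reproduced with NumPy alone. The sketch below is a simplified stand-in, not the original: numpy.random.Generator replaces the project's util, sample, and get_cdf helpers, and only the symmetric-Dirichlet path (with the noise step) is covered:

import numpy as np

def generate_docs_lda(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                      alpha=0.1, beta=0.1, noise=-1, seed=0):
    """Minimal LDA-style corpus generator (sketch, not the original code)."""
    rng = np.random.default_rng(seed)
    # One word distribution per topic, one topic distribution per document.
    word_dist = rng.dirichlet([beta] * vocab_size, size=num_topics)
    topic_dists = rng.dirichlet([alpha] * num_topics, size=num_docs)

    docs, topics = [], []
    for d in range(num_docs):
        num_words = rng.poisson(words_per_doc)
        doc, doc_topics = [], []
        for _ in range(num_words):
            if rng.random() < noise:
                # Noise word: uniform over the vocabulary, topic marked -1.
                doc.append(int(rng.integers(vocab_size)))
                doc_topics.append(-1)
            else:
                topic = int(rng.choice(num_topics, p=topic_dists[d]))
                doc.append(int(rng.choice(vocab_size, p=word_dist[topic])))
                doc_topics.append(topic)
        docs.append(doc)
        topics.append(doc_topics)
    return docs, topics, word_dist, topic_dists

docs, topics, word_dist, topic_dists = generate_docs_lda(5, 10)
print(len(docs), len(docs[0]))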