def generate(topics, words, words_per_doc):
    """Sample a synthetic corpus, one document per entry of ``topics``.

    Args:
        topics: sequence whose d-th entry is passed to ``util.get_cdf`` and
            then sampled to pick a topic for every word of document d.
        words: iterable whose entries are each passed to ``util.get_cdf``;
            the resulting list is indexed by the sampled topic to draw a word.
        words_per_doc: argument forwarded to ``util.poisson`` to draw each
            document's length (presumably the Poisson mean -- confirm in util).

    Returns:
        A pair ``(docs, doc_topics)``. ``docs[d]`` is the list of words
        sampled for document d and ``doc_topics[d]`` is the parallel list of
        the topic that generated each of those words.
    """
    num_docs = len(topics)
    # One word CDF per entry of `words`, looked up by sampled topic below.
    word_cdfs = [util.get_cdf(word_dist) for word_dist in words]

    docs = []
    doc_topics = []
    for doc_index in range(num_docs):
        # Progress report every 100 documents. The single-argument
        # parenthesised form prints the same text as the two-item print
        # statement and is also valid on Python 3.
        if doc_index % 100 == 0:
            print("%s %s" % ("reached document", doc_index))

        # Draw the length first, then build this document's topic CDF, so the
        # sequence of calls into util is unchanged.
        doc_length = util.poisson(words_per_doc)
        topic_cdf = util.get_cdf(topics[doc_index])

        sampled_words = []
        sampled_topics = []
        for _ in range(doc_length):
            # Pick the topic, then a word from that topic's distribution.
            chosen_topic = util.sample(topic_cdf)
            sampled_words.append(util.sample(word_cdfs[chosen_topic]))
            sampled_topics.append(chosen_topic)

        docs.append(sampled_words)
        doc_topics.append(sampled_topics)

    return docs, doc_topics
def __init__(self, params):
    """Create a node with a freshly drawn topic and no documents.

    Args:
        params: mapping that must contain ``"topic_to_word_param"``; its
            value is passed to ``dirichlet`` to draw this node's word
            distribution.
    """
    # Word distribution of this node's topic, plus the CDF built from it
    # by util.get_cdf.
    topic_word_dist = dirichlet(params["topic_to_word_param"])
    self.word_dist = topic_word_dist
    self.word_cdf = util.get_cdf(topic_word_dist)

    # Count of documents that pass through this node.
    self.num_documents = 0

    # Only the children that have looked below this level are kept here.
    # A document that reached this node without looking further down is
    # not represented; that is acceptable because the Chinese Restaurant
    # Process is exchangeable (it does not depend on order).
    self.children = []

    # Count of documents that looked below this level. Invariant: equals
    # sum(c.num_documents for c in self.children).
    self.num_documents_in_children = 0
def new_customer(self):
    """Admit one more customer and seat them at a sampled table.

    Increments ``self.customers``, refreshes ``self.probabilities`` through
    ``self.update_probabilities()``, samples a table index from them, and
    records the choice: ``self.tables`` gets a new entry of 1 when the index
    has not appeared in ``self.seats`` before, otherwise the count at that
    index is incremented; the index is then appended to ``self.seats``.
    """
    self.customers += 1
    self.update_probabilities()

    # Sanity check: the refreshed seating probabilities must sum to 1.
    total_mass = sum(self.probabilities)
    assert abs(1 - total_mass) < 1e-10

    table = util.sample(util.get_cdf(self.probabilities))

    # Single-argument parenthesised prints emit the same text as the
    # two-item print statements and are also valid on Python 3.
    print("%s %s" % ("new customer has arrived!", self.customers))
    # Extra debugging output, left disabled:
    #print "customers are sitting at tables", self.sparse_seats()
    #print "customer will sit with probabilities:", self.probabilities
    print("%s %s" % ("customer has chosen to sit at table", table))

    if table in self.seats:
        # Somebody already sits at this table: bump its head-count.
        self.tables[table] += 1
    else:
        # First customer at this table: open it with a count of one.
        self.tables.append(1)
    self.seats.append(table)