def sampler(self, i, TOPICS, xcounts, ycounts, docId, different_word):
    """Run one sweep of collapsed Gibbs sampling over the whole corpus.

    For every token the current topic assignment is subtracted from the
    counts, a new topic is drawn from the smoothed conditional
    p(k) = p(word | k) * p(k | doc), and the counts are restored with the
    new assignment.

    Args:
        i: unused placeholder kept for interface compatibility (it is
           immediately shadowed by the document-loop index).
        TOPICS: number of topics K.
        xcounts: word-topic counts; holds both xcounts[(word, k)] and the
                 per-topic totals xcounts[k] (maintained by add_count).
        ycounts: topic-document counts; holds ycounts[(k, docId)] and the
                 per-document totals ycounts[docId].
        docId: document identifier keying the topic-document counts.
        different_word: set of distinct words; its size is the smoothing
                        denominator V.

    Side effects: rewrites self.ycorpus in place and prints the sweep's
    log-likelihood.
    """
    ll = 0
    adder = add_count(xcounts, ycounts)
    for i in range(len(self.xcorpus)):
        for j in range(len(self.xcorpus[i])):
            x = self.xcorpus[i][j]
            y = self.ycorpus[i][j]
            # Remove this token's current assignment before resampling.
            adder.add_counter(x, y, i, -1)
            # Fresh dict per token: the original reused one dict across
            # tokens, so stale probabilities leaked between iterations.
            probs = {}
            for k in range(TOPICS):
                # Bug fix: the original lacked parentheses, computing
                # c + a/c + a*V instead of (c + a) / (c + a*V), and used
                # the OLD topic y in the document term for every k.
                # Missing count keys default to 0 so every topic gets a
                # (smoothed, nonzero) probability.
                p_x_k = (1.0 * xcounts.get((x, k), 0) + self.alpha) / \
                        (xcounts.get(k, 0) + self.alpha * len(different_word))
                p_k_d = (1.0 * ycounts.get((k, docId), 0) + self.beta) / \
                        (ycounts.get(docId, 0) + self.beta * TOPICS)
                probs[k] = p_x_k * p_k_d
            new_y = Sampling.sampleOne(probs)
            ll = ll + log(probs[new_y])
            adder.add_counter(x, new_y, i, 1)
            self.ycorpus[i][j] = new_y
    print(ll)
def sampler(self, i, TOPICS, xcounts, ycounts, docId, different_word):
    """Run one sweep of collapsed Gibbs sampling over the whole corpus.

    Each token's topic is removed from the counts, a new topic is drawn
    from p(k) = p(word | k) * p(k | doc), and the counts are updated with
    the draw.

    Args:
        i: unused placeholder kept for interface compatibility (shadowed
           by the document-loop index below).
        TOPICS: number of topics K.
        xcounts: word-topic counts (both xcounts[(word, k)] and totals
                 xcounts[k], maintained by add_count).
        ycounts: topic-document counts (ycounts[(k, docId)] and totals
                 ycounts[docId]).
        docId: document identifier keying the topic-document counts.
        different_word: set of distinct words; len() is the smoothing
                        denominator V.

    Side effects: rewrites self.ycorpus in place and prints the sweep's
    log-likelihood.
    """
    ll = 0
    adder = add_count(xcounts, ycounts)
    for i in range(len(self.xcorpus)):
        for j in range(len(self.xcorpus[i])):
            x = self.xcorpus[i][j]
            y = self.ycorpus[i][j]
            # Take this token's assignment out of the counts first.
            adder.add_counter(x, y, i, -1)
            # Rebuilt per token; the original reused one dict and let
            # stale entries from earlier tokens survive.
            probs = {}
            for k in range(TOPICS):
                # Bug fix: missing parentheses made the original compute
                # c + a/c + a*V instead of (c + a)/(c + a*V); it also used
                # the OLD topic y instead of candidate k in the document
                # term.  get(..., 0) supplies zero counts for unseen keys.
                p_x_k = (1.0 * xcounts.get((x, k), 0) + self.alpha) / \
                        (xcounts.get(k, 0) + self.alpha * len(different_word))
                p_k_d = (1.0 * ycounts.get((k, docId), 0) + self.beta) / \
                        (ycounts.get(docId, 0) + self.beta * TOPICS)
                probs[k] = p_x_k * p_k_d
            new_y = Sampling.sampleOne(probs)
            ll = ll + log(probs[new_y])
            adder.add_counter(x, new_y, i, 1)
            self.ycorpus[i][j] = new_y
    print(ll)
def initilize(self):
    """Read the training corpus and assign every token a random topic.

    (Method name kept as-is -- callers use 'initilize'.)  Builds
    self.xcorpus (word rows) and self.ycorpus (topic rows) and seeds
    self.xcounts / self.ycounts through add_count.
    """
    first_time = 1
    adder = add_count(self.xcounts, self.ycounts)
    # NOTE(review): the file SIZE is used as a single document id shared
    # by every line -- presumably a placeholder; confirm against the
    # per-document semantics sampler() expects.
    self.docid = os.path.getsize("07-train.txt")
    with open("07-train.txt", "r") as corpus:
        for line in corpus:
            # Bug fix: the original stripped the mojibake string "¥n"
            # (yen sign + n), which never matches, so every line kept its
            # trailing newline.  Strip the real newline.
            rline = line.rstrip("\n")
            words = numpy.array(rline.split(" "))
            topics_vector = []
            self.different_word = set(words)
            for word in words:
                # Bug fix: the original drew randint(TOPICS) + 1, giving
                # labels 1..K while the sampler iterates range(TOPICS) =
                # 0..K-1.  Draw 0-based topics so the label spaces agree.
                topic = randint(self.TOPICS)
                topics_vector.append(topic)
                adder.add_counter(word, topic, self.docid, 1)
            array_topics_vector = numpy.array(topics_vector)
            if first_time == 1:
                self.xcorpus = numpy.hstack((self.xcorpus, words))
                self.ycorpus = numpy.hstack((self.ycorpus, array_topics_vector))
                first_time = first_time + 1
            else:
                self.xcorpus = numpy.vstack((self.xcorpus, words))
                self.ycorpus = numpy.vstack((self.ycorpus, array_topics_vector))
def initilize(self):
    """Read the training corpus and assign every token a random topic.

    (Method name kept as-is -- callers use 'initilize'.)  Builds
    self.xcorpus / self.ycorpus row by row and seeds the count tables
    through add_count.
    """
    first_time = 1
    adder = add_count(self.xcounts, self.ycounts)
    # NOTE(review): file size as a single shared document id looks like a
    # placeholder; verify against how sampler() keys ycounts per document.
    self.docid = os.path.getsize("07-train.txt")
    with open("07-train.txt", "r") as corpus:
        for line in corpus:
            # Bug fix: "¥n" is mojibake for "\n" (Japanese-encoded
            # backslash); the original never stripped the newline.
            rline = line.rstrip("\n")
            words = numpy.array(rline.split(" "))
            topics_vector = []
            self.different_word = set(words)
            for word in words:
                # Bug fix: drop the "+ 1" -- the sampler draws topics from
                # range(TOPICS), so initial labels must also be 0..K-1.
                topic = randint(self.TOPICS)
                topics_vector.append(topic)
                adder.add_counter(word, topic, self.docid, 1)
            array_topics_vector = numpy.array(topics_vector)
            if first_time == 1:
                self.xcorpus = numpy.hstack((self.xcorpus, words))
                self.ycorpus = numpy.hstack((self.ycorpus, array_topics_vector))
                first_time = first_time + 1
            else:
                self.xcorpus = numpy.vstack((self.xcorpus, words))
                self.ycorpus = numpy.vstack((self.ycorpus, array_topics_vector))
def test_add(self):
    # Smoke test: add_count must be constructible with no arguments.
    # NOTE(review): no assertions are made and the instance is unused --
    # presumably a stub awaiting real count-checking asserts; confirm
    # add_count's zero-argument constructor is intentional (it is called
    # with two positional args elsewhere in this file).
    adder = add_count()