def main(argv):
    """Run LDA over the bundled text collections and write the result files.

    Loads the stop-word list, tokenizes every ``*.txt`` file found in the
    three corpus directories, builds the vocabulary, fits the model and
    writes the topic-word and document-topic distributions.

    Args:
        argv: Command-line vector. ``argv[1]`` is the number of topics (int),
            ``argv[2]`` is alpha (float), ``argv[3]`` is beta (float) and
            ``argv[4]`` is the maximum number of iterations (int).

    Raises:
        IndexError: If fewer than four arguments follow the script name.
        ValueError: If an argument cannot be converted to its numeric type.
        IOError: If ``stopwords.txt`` cannot be opened.
    """
    # The usage banner is printed unconditionally, as before.
    print("Usage: python ./main.py <number_of_topics> <alpha> <beta> <maxiteration>")

    # Parse the arguments up front so that a bad command line fails
    # immediately instead of after the whole corpus has been loaded.
    number_of_topics = int(argv[1])
    alpha = float(argv[2])  # original note: alpha = 50 / iterations
    beta = float(argv[3])  # original note: beta = 0.01
    max_iterations = int(argv[4])

    # Load the stop-word list: one word per line.
    with open("stopwords.txt", "r") as stopwordsfile:
        for word in stopwordsfile:
            # Strip the line terminator whether it is "\n" or "\r\n".
            # Removing "\n" first and "\r\n" second left a stray "\r" on
            # every word of a CRLF file, so those words never matched.
            STOP_WORDS_SET.add(word.rstrip("\r\n"))

    corpus = lda.Corpus()

    # Directories whose *.txt files make up the corpus.
    document_paths = [
        './texts/grimm_fairy_tales',
        './texts/tech_blog_posts',
        './texts/nyt',
    ]
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            document = lda.Document(document_file)
            document.split(STOP_WORDS_SET)  # tokenize, dropping stop words
            corpus.add_document(document)

    corpus.build_vocabulary()
    print("Vocabulary size:" + str(len(corpus.vocabulary)))
    print("Number of documents:" + str(len(corpus.documents)))

    corpus.lda(number_of_topics, max_iterations, alpha, beta)

    print_topic_word_distribution(corpus, number_of_topics, 20, "./topic-word.txt")
    print_document_topic_distribution(corpus, number_of_topics, 10, "./document-topic.txt")
# Fragment of a synthetic-corpus script, collapsed onto one physical line.
# In order, the statements below:
#   1. Split 100 word draws across topics with a multinomial over topicDist.
#   2. For each of 10 topics, draw samples[t] words from topics[t] and
#      accumulate the counts into the 25-bin integer vector `words`; the
#      vector is also kept in inputImageSet.
#   3. Copy the non-zero bins into a dict (bin index -> count), wrap it in an
#      lda.Document and add that document to the corpus `c`.
#   4. For every vector in inputImageSet: convert it to float, rescale so the
#      largest bin becomes 255, reshape to 5x5, repeat every cell 5 times
#      along both axes (25x25), append one zero row and one zero column
#      (26x26), and collect the result in docImageSet.
# NOTE(review): steps 1-3 presumably run once per document inside a loop whose
# header is outside this view -- confirm before re-indenting this line.
# NOTE(review): the rescale divides by image.max(); that would be zero for an
# all-zero vector, which looks unreachable while 100 words are drawn per
# document -- confirm.
# NOTE(review): numpy.asfarray is no longer available in NumPy 2.0 -- check
# the NumPy version this script is expected to run against.
samples = numpy.random.multinomial(100,topicDist) # Iterate each topic and draw words from it... words = numpy.zeros(25,dtype=numpy.int_) for t in xrange(10): words += numpy.random.multinomial(samples[t],topics[t]) inputImageSet.append(words) # Convert the word counts into a dictionary... dic = dict() for i in xrange(25): if words[i]!=0: dic[i] = words[i] # Create the document and store it in the corpus... doc = lda.Document(dic) c.add(doc) # Save out the input documents for confirmation (50x20 grid)... docImageSet = [] for words in inputImageSet: image = numpy.asfarray(words) image *= 255.0/image.max() image = numpy.reshape(image,(5,5)) image = numpy.repeat(numpy.repeat(image,5,axis=0),5,axis=1) image = numpy.append(image,numpy.atleast_2d(numpy.zeros(image.shape[1])),axis=0) image = numpy.append(image,numpy.atleast_2d(numpy.zeros(image.shape[0])).T,axis=1) docImageSet.append(image)
def _flattenDoc(dic):
    """Return a copy of dic with every pair key (a, b) flattened to a * 4 + b."""
    return dict((key[0] * 4 + key[1], item) for key, item in dic.iteritems())


def _rocPoints(ranked, posCount, negCount):
    """Sweep the ranked (score, is_abnormal) pairs and return the ROC points.

    Each returned point is (false positive rate, true positive rate) after
    accepting one more entry of ranked as a detection. Raises
    ZeroDivisionError when ranked is non-empty and either count is zero.
    """
    truePos = 0
    falsePos = 0
    roc = []
    for _, isAbnormal in ranked:
        if isAbnormal:
            truePos += 1
        else:
            falsePos += 1
        # The rates are always taken over the full class sizes, so the
        # denominators are the fixed totals.
        roc.append((float(falsePos) / float(negCount),
                    float(truePos) / float(posCount)))
    return roc


def doRun(tdc):
    """Train on tdc generated documents, then score fresh ones with a ROC.

    Builds a corpus from tdc documents produced by genDoc(), fits a model,
    then generates testDocCount further documents, scores each by its
    negative log likelihood under the fitted topics and measures how well
    that score ranks the documents genDoc() flagged as abnormal.

    Reads the module-level names testDocCount and sweep. When sweep is
    falsy the ROC curve is also written to 'junction_roc.txt', one
    "fpr tpr" pair per line starting from "0.0 0.0".

    Args:
        tdc: Number of training documents to generate.

    Returns:
        The area under the ROC curve (trapezoidal rule), as a float.

    Raises:
        ZeroDivisionError: If the generated test set is non-empty but holds
            only abnormal or only normal documents.
    """
    # Create a corpus...
    c = lda.Corpus(4)  # presumably 4 is the topic count -- TODO confirm
    c.setWordCount(identCount() * 4)

    for i in xrange(tdc):
        dic, abn = genDoc()
        doc = lda.Document(_flattenDoc(dic))
        doc.abn = abn
        c.add(doc)

    # Fit a model...
    params = lda.Params()
    params.setRuns(16)

    print('Fitting model...')
    p = ProgBar()
    c.fit(params, p.callback)
    del p  # presumably finalizes the progress display -- TODO confirm

    tw = c.topicsWords()

    # Test on a bunch of documents, collecting (abnormality score, actually
    # abnormal) pairs...
    ab_gt = []
    print('Testing...')
    p = ProgBar()
    for i in xrange(testDocCount):
        p.callback(i, testDocCount)
        dic, abn = genDoc()
        doc = lda.Document(_flattenDoc(dic))
        doc.fit(tw)
        ab_gt.append((doc.negLogLikelihood(tw), abn))
    del p

    # Highest score (most abnormal-looking) first.
    ab_gt.sort(reverse=True)

    # Use the pairs to construct a roc. Positives are counted with the same
    # truthiness test the sweep uses, so the two can never disagree.
    posCount = sum(1 for _, isAbnormal in ab_gt if isAbnormal)
    negCount = len(ab_gt) - posCount
    # The double space matches the output of the former
    # `print 'positive samples = ', posCount` statement.
    print('positive samples =  %s' % posCount)
    print('negative samples =  %s' % negCount)

    roc = _rocPoints(ab_gt, posCount, negCount)

    # Save the roc to disk...
    if not sweep:
        with open('junction_roc.txt', 'w') as f:
            f.write('0.0 0.0\n')
            for pnt in roc:
                f.write('%f %f\n' % pnt)

    # Calculate and print out the area under the roc (trapezoidal rule)...
    area = 0.0
    for prev, cur in zip(roc, roc[1:]):
        area += 0.5 * (prev[1] + cur[1]) * (cur[0] - prev[0])
    print('area under roc = %s (above %s )' % (area, 1.0 - area))

    return area
# Build the synthetic documents: the first docCount go into the corpus, the
# remaining testCount are held back in testDocs.
testDocs = []
for i in xrange(docCount + testCount):
    # Chance that any single word of this document is drawn from topicA.
    probA = numpy.random.random()

    # Split the word budget between the two topics, then draw per-word
    # counts from each topic's distribution.
    tac = numpy.random.binomial(wordCount, probA)
    wfa = numpy.random.multinomial(tac, topicA)
    wfb = numpy.random.multinomial(wordCount - tac, topicB)

    # Word index -> combined count from both topics.
    d = {}
    for j in xrange(topicA.shape[0]):
        d[j] = wfa[j] + wfb[j]

    # Wrap the counts in a document and route it to the corpus or test set.
    doc = lda.Document(d)
    doc.topicA = probA  # kept on the document for verification
    if i >= docCount:
        testDocs.append(doc)
    else:
        c.add(doc)

# One unusual document: all wordCount words are word 0.
d = {0: wordCount}
doc = lda.Document(d)
doc.topicA = -1.0
c.add(doc)

# Train...
print('Trainning...')