Example 1
import glob
import os

import lda

STOP_WORDS_SET = set()


def main(argv):
    if len(argv) < 5:
        print("Usage: python ./main.py <number_of_topics> <alpha> <beta> <max_iterations>")
        return

    # Load the stop-word list from file, one word per line.
    with open("stopwords.txt", "r") as stopwordsfile:
        for word in stopwordsfile:
            STOP_WORDS_SET.add(word.strip())

    corpus = lda.Corpus()  # instantiate corpus
    # Iterate over the files in each directory.
    document_paths = [
        './texts/grimm_fairy_tales', './texts/tech_blog_posts', './texts/nyt'
    ]
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            document = lda.Document(document_file)  # instantiate document
            document.split(STOP_WORDS_SET)  # tokenize, dropping stop words
            corpus.add_document(document)  # push onto the corpus document list

    corpus.build_vocabulary()
    print("Vocabulary size: " + str(len(corpus.vocabulary)))
    print("Number of documents: " + str(len(corpus.documents)))

    number_of_topics = int(argv[1])
    alpha = float(argv[2])  # e.g. alpha = 50 / number_of_topics
    beta = float(argv[3])  # e.g. beta = 0.01
    max_iterations = int(argv[4])
    corpus.lda(number_of_topics, max_iterations, alpha, beta)

    print_topic_word_distribution(corpus, number_of_topics, 20,
                                  "./topic-word.txt")
    print_document_topic_distribution(corpus, number_of_topics, 10,
                                      "./document-topic.txt")
Example 2
  # Body of the per-document loop: decide how many of the document's
  # 100 words come from each topic, given its topic distribution...
  samples = numpy.random.multinomial(100, topicDist)

  # Iterate over each topic and draw its share of words from it...
  words = numpy.zeros(25, dtype=numpy.int_)
  for t in range(10):
    words += numpy.random.multinomial(samples[t], topics[t])
  inputImageSet.append(words)

  # Convert the word counts into a dictionary of word id -> count...
  dic = dict()
  for i in range(25):
    if words[i] != 0:
      dic[i] = words[i]

  # Create the document and store it in the corpus...
  doc = lda.Document(dic)
  c.add(doc)



# Save out the input documents for confirmation (50x20 grid)...
docImageSet = []
for words in inputImageSet:
  image = numpy.asarray(words, dtype=numpy.float64)  # float copy of the counts
  image *= 255.0 / image.max()  # normalise counts to the 0..255 range
  image = numpy.reshape(image, (5, 5))  # one pixel per vocabulary word
  image = numpy.repeat(numpy.repeat(image, 5, axis=0), 5, axis=1)  # upscale 5x
  # Append a one-pixel black border on the bottom and right, so the
  # documents stay separated when tiled into the grid...
  image = numpy.append(image, numpy.atleast_2d(numpy.zeros(image.shape[1])), axis=0)
  image = numpy.append(image, numpy.atleast_2d(numpy.zeros(image.shape[0])).T, axis=1)
  docImageSet.append(image)
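The snippet above assumes topics (per-topic word distributions over a 25-word vocabulary) and topicDist (one document's topic mixture) are defined earlier. Given the 10 topics and the 5x5 reshape used for visualisation, this looks like the classic "bars" toy data set; a minimal sketch of how those inputs might be built (an assumption, since the setup code is not shown):

import numpy

# 10 "bar" topics on a 5x5 word grid: 5 horizontal and 5 vertical bars,
# each uniform over its row or column.
topics = numpy.zeros((10, 25))
for t in range(5):
  grid = numpy.zeros((5, 5))
  grid[t, :] = 1.0 / 5.0  # horizontal bar
  topics[t] = grid.flatten()
  grid = numpy.zeros((5, 5))
  grid[:, t] = 1.0 / 5.0  # vertical bar
  topics[t + 5] = grid.flatten()

# One document's topic mixture, drawn from a symmetric Dirichlet.
topicDist = numpy.random.dirichlet(numpy.ones(10))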
Example 3
def doRun(tdc):
    # Create a corpus...
    c = lda.Corpus(4)
    c.setWordCount(identCount() * 4)

    for i in range(tdc):
        dic, abn = genDoc()

        # Flatten the 2D keys into single word identifiers...
        nDic = dict()
        for key, item in dic.items():
            nDic[key[0] * 4 + key[1]] = item

        doc = lda.Document(nDic)
        doc.abn = abn
        c.add(doc)

    # Fit a model...
    params = lda.Params()
    params.setRuns(16)

    print('Fitting model...')
    p = ProgBar()
    c.fit(params, p.callback)
    del p

    tw = c.topicsWords()

    # Test on a bunch of documents, building a list of
    # (abnormality score, actually-abnormal flag) pairs...
    ab_gt = []
    print('Testing...')
    p = ProgBar()
    for i in range(testDocCount):
        p.callback(i, testDocCount)
        dic, abn = genDoc()

        nDic = dict()
        for key, item in dic.items():
            nDic[key[0] * 4 + key[1]] = item

        doc = lda.Document(nDic)
        doc.fit(tw)
        ab_gt.append((doc.negLogLikelihood(tw), abn))
    del p

    ab_gt.sort(reverse=True)

    # Use the pairs to construct a ROC curve...
    posCount = sum(1 for pair in ab_gt if pair[1])
    negCount = len(ab_gt) - posCount
    print('positive samples =', posCount)
    print('negative samples =', negCount)

    truePos = 0
    falsePos = 0
    trueNeg = negCount
    falseNeg = posCount

    roc = []

    for p in ab_gt:
        if p[1]:
            truePos += 1
            falseNeg -= 1
        else:
            falsePos += 1
            trueNeg -= 1

        pnt = (float(falsePos) / float(falsePos + trueNeg),
               float(truePos) / float(truePos + falseNeg))
        roc.append(pnt)

    # Save the ROC to disk...
    if not sweep:
        with open('junction_roc.txt', 'w') as f:
            f.write('0.0 0.0\n')
            for pnt in roc:
                f.write('%f %f\n' % pnt)

    # Calculate and print the area under the ROC, via the trapezoid rule...
    area = 0.0
    for i in range(1, len(roc)):
        area += 0.5 * (roc[i - 1][1] + roc[i][1]) * (roc[i][0] - roc[i - 1][0])
    print('area under roc =', area, '(above', (1.0 - area), ')')

    return area
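The area computation above is a trapezoid-rule integration over the ROC points. A small self-contained check of the same formula on a toy ROC (hypothetical data, not from this example):

# Toy ROC for a perfect classifier, as (false-positive rate, true-positive
# rate) points; the trapezoid rule should report an area of exactly 1.0.
roc = [(0.0, 0.0), (0.0, 1.0), (1.0, 1.0)]

area = 0.0
for i in range(1, len(roc)):
    area += 0.5 * (roc[i - 1][1] + roc[i][1]) * (roc[i][0] - roc[i - 1][0])

print('area under roc =', area)  # 1.0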
Example 4
# Generate documents and stuff them in...
testDocs = []
for i in range(docCount + testCount):
    # Probability of a word coming from topicA...
    probA = numpy.random.random()

    # Generate a dictionary of word id -> count...
    d = dict()
    tac = numpy.random.binomial(wordCount, probA)  # words drawn from topicA
    wfa = numpy.random.multinomial(tac, topicA)
    wfb = numpy.random.multinomial(wordCount - tac, topicB)
    for j in range(topicA.shape[0]):
        d[j] = wfa[j] + wfb[j]

    # Make the document, add to corpus or test set...
    doc = lda.Document(d)
    doc.topicA = probA  # For verification
    if i < docCount:
        c.add(doc)
    else:
        testDocs.append(doc)

# Add an unusual document...
d = dict()
d[0] = wordCount
doc = lda.Document(d)
doc.topicA = -1.0
c.add(doc)

# Train...
print('Training...')
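This example leans on names defined elsewhere: the corpus c, the two topic distributions topicA and topicB, and the counts docCount, testCount and wordCount. A minimal sketch of plausible definitions, following the Corpus usage seen in Example 3 (the concrete values here are assumptions, not the original setup):

import numpy
import lda

vocabSize = 8
docCount = 64
testCount = 16
wordCount = 100

# Two topics over a shared vocabulary: topicA puts all its mass on the
# first half of the words, topicB on the second half; each sums to 1.
topicA = numpy.zeros(vocabSize)
topicA[:vocabSize // 2] = 2.0 / vocabSize
topicB = numpy.zeros(vocabSize)
topicB[vocabSize // 2:] = 2.0 / vocabSize

c = lda.Corpus(2)  # two-topic model, mirroring lda.Corpus(4) in Example 3
c.setWordCount(vocabSize)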