Esempio n. 1
0
    def compute(filename):
        """Score a random baseline summary of *filename* against its gold summary.

        Loads ``<filename>.txt`` and its gold reference ``<filename>_gold.txt``
        from ``LDATester.PATH``, draws as many random sentences from the
        document as the gold summary contains, joins their original surface
        forms, and returns the normalized BLEU score versus the gold document.
        """
        gold_doc = Document(LDATester.PATH + filename + "_gold.txt")
        doc = Document(LDATester.PATH + filename + ".txt")

        # Shuffle all sentence positions, then keep one per gold sentence.
        positions = list(range(len(doc.sentences)))
        random.shuffle(positions)
        chosen = positions[: len(gold_doc.sentences)]

        # Recover the original text of each chosen sentence and join into
        # a single candidate summary string.
        summary = " ".join(
            doc.getSentenceOrginal(doc.sentences[i]) for i in chosen
        )
        return BLEU.computeNormalize(gold_doc.document, summary)
Esempio n. 2
0
from Project import Document

# Load the sample article and run the parameterizer over every sentence
# (side effects only; parameterize() is exercised for its own sake here).
doc = Document('economist1.txt')

for sent in doc.sentences:
    doc.parameterize(sent)
Esempio n. 3
0
from Project import Document
# filename = raw_input("file name ")
filename = 'newyorker1'
doc = Document(filename + '.txt')
# doc = Document('economist1.txt')


## Test Freq Distribution
# Smoke-test the unigram frequency distribution built by Document:
# the frequency of one word, the modal word, and the total token count.
print 'Frequency Test'
print
print 'freq of \'long\'', doc.freq_dist.freq('long')
print 'most common word', doc.freq_dist.max()
print 'num words', doc.freq_dist.N()


print 'Conditional Test previous'
print
## Test Conditional Frequency Distribution Previous
# cfdistPrev maps a word to the distribution of words that FOLLOW it
# (conditioned on the previous word).
print 'most common word to follow Start', doc.cfdistPrev['Start'].max() ## most common word after Start
print 'most common word after',doc.freq_dist.max(),doc.cfdistPrev[doc.freq_dist.max()].max() ## most common word after long

print 'Conditional Test after'
print

## Test Conditional Frequency Distribution Next
# cfdistNext is the mirror image: distribution of words that PRECEDE
# the condition word.
print 'most common word to precede End', doc.cfdistNext['End'].max() ## most common word after Start
print 'most common word before',doc.freq_dist.max(),doc.cfdistNext[doc.freq_dist.max()].max() ## most common word after long

## get closest sentences to doc freq dist. WE WANT LDA DIST
# setencesByFreqCloseness (sic, project spelling) ranks sentences by how
# close their word distribution is to the whole document's; print the top hit.
sent = doc.setencesByFreqCloseness()
print '1', doc.getSentenceOrginal(sent[0])
Esempio n. 4
0
from Project import Document
from collections import Counter
import numpy as np
# filename = raw_input("file name ")
filename = 'economist1'
doc = Document(filename + '.txt')

## Get key sentence
sent = doc.setencesByFreqCloseness()
maxSent = sent[0]
print doc.getSentenceOrginal(maxSent)



doc.getLDA(5)
topicAndScore = doc.getTopicAndScore()
maxTopic, maxScore = topicAndScore[maxSent]
print topicAndScore[maxSent]

sentByTopics = {}

for key in topicAndScore:
    
    value = topicAndScore[key]
    topic = value[0]
    if topic in sentByTopics:

        sentByTopics[topic] += [key]
    else:

        sentByTopics[topic] = [key]