import random

from Project import Document
## BLEU and LDATester are assumed to be provided elsewhere in this project.

def compute(filename):
    ## Score a random baseline: sample as many sentences from the document
    ## as the gold summary has, then compare against the gold with BLEU.
    gold_doc = Document(LDATester.PATH + filename + "_gold.txt")
    doc = Document(LDATester.PATH + filename + ".txt")
    ## Get random summary
    indices = [x for x in range(len(doc.sentences))]
    random.shuffle(indices)
    indices = indices[0:len(gold_doc.sentences)]
    sentences = [doc.sentences[i] for i in indices]
    calibration = [doc.getSentenceOrginal(sentence) for sentence in sentences]
    calibration = " ".join(calibration)
    return BLEU.computeNormalize(gold_doc.document, calibration)
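## A single random draw is noisy, so a natural check is to average the
## random-baseline BLEU over repeated draws. This is a minimal sketch, not
## part of the original tester; 'economist1' is a hypothetical filename.
def averageRandomBaseline(filename, trials=10):
    scores = [compute(filename) for _ in range(trials)]
    return sum(scores) / float(trials)

# print averageRandomBaseline('economist1', trials=20)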
from Project import Document

# file = raw_input("file name ")
# doc = Document(file + '.txt')
doc = Document('economist1.txt')

# for i in range(len(doc.paragraphs)):
#     print doc.paragraphs[i]
# print doc.getParagraphLocation(doc.sentences[7])
# print doc.getWordFreqBins()
# print doc.getLengthBins()

## Compute the feature parameterization for every sentence in the document.
for sentence in doc.sentences:
    doc.parameterize(sentence)
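## Sketch (an assumption, not confirmed by this file): if parameterize()
## returns a per-sentence feature value, the same loop can be collected
## into one list for downstream scoring.
features = [doc.parameterize(sentence) for sentence in doc.sentences]
print len(features), 'sentences parameterized'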
from Project import Document

# filename = raw_input("file name ")
filename = 'newyorker1'
doc = Document(filename + '.txt')
# doc = Document('economist1.txt')

## Test Frequency Distribution
print 'Frequency Test'
print
print 'freq of \'long\'', doc.freq_dist.freq('long')
print 'most common word', doc.freq_dist.max()
print 'num words', doc.freq_dist.N()

## Test Conditional Frequency Distribution Previous
print 'Conditional Test previous'
print
print 'most common word to follow Start', doc.cfdistPrev['Start'].max()
print 'most common word after', doc.freq_dist.max(), doc.cfdistPrev[doc.freq_dist.max()].max()

## Test Conditional Frequency Distribution Next
print 'Conditional Test after'
print
print 'most common word to precede End', doc.cfdistNext['End'].max()
print 'most common word before', doc.freq_dist.max(), doc.cfdistNext[doc.freq_dist.max()].max()

## Get the sentences closest to the document frequency distribution.
## (Ideally this would use the LDA distribution instead.)
sent = doc.setencesByFreqCloseness()
print '1', doc.getSentenceOrginal(sent[0])
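## The freq_dist / cfdistPrev / cfdistNext attributes behave like NLTK's
## FreqDist and ConditionalFreqDist. A minimal sketch of building the same
## style of distributions from a token list; the 'Start'/'End' sentinels
## mirror the keys used above, but how Document actually pads its sentences
## is an assumption here.
import nltk

tokens = ['Start'] + nltk.word_tokenize('The long road was long and dusty .') + ['End']
freq_dist = nltk.FreqDist(tokens)
cfdistPrev = nltk.ConditionalFreqDist(nltk.bigrams(tokens))                     ## word -> next word
cfdistNext = nltk.ConditionalFreqDist((b, a) for a, b in nltk.bigrams(tokens))  ## word -> previous word
print freq_dist.max(), cfdistPrev['Start'].max(), cfdistNext['End'].max()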
from Project import Document
from collections import Counter
import numpy as np

# filename = raw_input("file name ")
filename = 'economist1'
doc = Document(filename + '.txt')

## Get key sentence: the one closest to the document frequency distribution.
sent = doc.setencesByFreqCloseness()
maxSent = sent[0]
print doc.getSentenceOrginal(maxSent)

## Fit LDA with 5 topics, then look up each sentence's (topic, score) pair.
doc.getLDA(5)
topicAndScore = doc.getTopicAndScore()
maxTopic, maxScore = topicAndScore[maxSent]
print topicAndScore[maxSent]

## Group sentences by their assigned topic.
sentByTopics = {}
for key in topicAndScore:
    topic = topicAndScore[key][0]
    if topic in sentByTopics:
        sentByTopics[topic] += [key]
    else:
        sentByTopics[topic] = [key]
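## A natural next step (not shown in this file) is to take the top-scoring
## sentence per topic as summary candidates. Sketch only; assumes the score
## in topicAndScore[sentence] = (topic, score) is higher-is-better.
summary = []
for topic in sentByTopics:
    best = max(sentByTopics[topic], key=lambda s: topicAndScore[s][1])
    summary.append(doc.getSentenceOrginal(best))
print ' '.join(summary)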