def main():
    # initialize the custom vectorizers with the whole document collection
    vectorizer1 = CustomVectorizer(input='fromfiles', stopwords=stopwords, encoding='iso-8859-1')
    vectorizer2 = CustomVectorizer(input='fromfiles', stopwords=stopwords, encoding='iso-8859-1',
                                   bigrams=True, nphrases=True)
    documents = ['textos-fonte/' + d for d in listdir('textos-fonte')]
    vectorizer1.fit(documents)
    vectorizer2.fit(documents)
    # switch both vectorizers to in-memory input before summarizing
    vectorizer1._input = 'content'
    vectorizer2._input = 'content'

    # print all statistics
    MAP1 = 0
    MAP2 = 0
    print('File\t\t\tPrecision Alternative\tRecall Alternative\tF1 Alternative\t'
          'Precision Improved\tRecall Improved\tF1 Improved')
    for doc in listdir('textos-fonte'):
        path = 'textos-fonte/' + doc
        stats = compareApproaches(doc, summary(vectorizer1, path, True), summary(vectorizer2, path))
        MAP1 += float(stats[1])
        MAP2 += float(stats[4])
        print(stats[0] + '\t\t' + stats[1] + '\t\t' + stats[2] + '\t' + stats[3] + '\t' +
              stats[4] + '\t\t' + stats[5] + '\t\t' + stats[6])
    # average precision over the 100 documents in the collection
    print('\nMAP Score for Alternative Approach: ' + str(MAP1 / 100))
    print('\nMAP Score for Improved Approach: ' + str(MAP2 / 100))
def summarizeDocument(file):
    ''' Parse the file into sentences '''
    doc = io.open(file, 'r', encoding='utf-8').read()
    doc = preprocessor(doc)
    sentences = sent_tokenize(doc)
    '''
    Compute vector space representations of every sentence. Each sentence is treated as a
    document, so the document-frequency values are actually sentence frequencies (which is
    what we want).
    !!! The tf-idf values computed by this vectorizer are not in accordance with what is
    requested. The documentation states that tf is simply the count of each word in each
    doc/sentence (and so is not normalized), and that 1 is added to all idf values. To meet
    the requirements we would, for each term, need to take its tf (as described above) and
    divide it by the maximal tf in that doc/sentence. We probably need to use a
    CountVectorizer first !!!
    '''
    vectorizer = CustomVectorizer(input='content', stopwords=list(stopwords.words('english')))
    vectorizer.fit(sentences)
    vectors = vectorizer.transform_tfidf(sentences)
    '''
    Treat the whole document as a single sentence and use the vectorizer to model it in the
    same feature space.
    '''
    docVector = vectorizer.transform_tfidf([doc])
    '''
    For each sentence vector, reduce the document vector to the same dimension space so the
    dot product -> similarity can be computed.
    '''
    sim = []
    for vector in vectors:
        sim.append(similarity(vector, docVector[0]))
    # keep the five most similar sentences, then restore their original order
    summary = sorted(enumerate(sim), key=lambda s: s[1], reverse=True)[:5]
    summary.sort()
    ''' Return the list of selected sentences '''
    res = []
    for s in summary:
        res.append(sentences[s[0]])
    return res
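# A minimal sketch of the normalization described in the note above, assuming
# scikit-learn and numpy are available. It uses CountVectorizer for raw counts,
# divides each count by the maximal count in its sentence, and applies a plain
# (unsmoothed) idf. This only illustrates the intended values; it is not the
# project's CustomVectorizer.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def normalized_tfidf(sentences):
    counts = CountVectorizer().fit_transform(sentences).toarray()
    # tf normalized by the largest term count in each sentence
    tf = counts / np.maximum(counts.max(axis=1, keepdims=True), 1)
    # sentence frequency of each term, with idf = log(N / sf)
    sf = (counts > 0).sum(axis=0)
    idf = np.log(len(sentences) / sf)
    return tf * idf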
def main():
    # initialize the custom vectorizer with the whole document collection
    vectorizer = CustomVectorizer(input='fromfiles', stopwords=stopwords, encoding='iso-8859-1')
    documents = ['textos-fonte/' + d for d in listdir('textos-fonte')]
    vectorizer.fit(documents)
    vectorizer._input = 'content'

    # print all statistics
    MAP = 0
    print('File\t\t\tPrecision\tRecall\t\tF1 Score')
    for doc in listdir('textos-fonte'):
        path = 'textos-fonte/' + doc
        stats = calculateStats(path, summary(vectorizer, path))
        MAP += float(stats[0])
        print(doc + '\t\t' + stats[0] + '\t' + stats[1] + '\t' + stats[2])
    # average precision over the 100 documents in the collection
    print('\nMAP Score: ' + str(MAP / 100))
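# Illustrative sketch of the sentence-overlap statistics printed above. The
# project's calculateStats is defined elsewhere; this only shows the standard
# precision/recall/F1 definitions over two lists of summary sentences.
def overlap_stats(generated, reference):
    gen, ref = set(generated), set(reference)
    common = len(gen & ref)
    precision = common / len(gen) if gen else 0.0
    recall = common / len(ref) if ref else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1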
def extract_features(source):
    def count_tags(tags, label):
        # count how many tags in the sentence belong to the requested class
        tag_counter = Counter(tags)
        counts = 0
        if label == 'nouns':
            for tag in tag_counter.keys():
                if tag in ['N', 'NPROP', 'PROPESS']:
                    counts += tag_counter.get(tag)
        elif label == 'verbs':
            for tag in tag_counter.keys():
                if tag in ['V', 'VAUX', 'ADV', 'ADV-KS', 'ADV-KS-REL']:
                    counts += tag_counter.get(tag)
        elif label == 'adjectives':
            for tag in tag_counter.keys():
                if tag in ['ADJ']:
                    counts += tag_counter.get(tag)
        return counts

    sents = filter_list(sent_tokenize(source))
    # position feature: 1-based index of each sentence in the document
    feature_pos = [i + 1 for i in range(len(sents))]
    vectorizer = CustomVectorizer()
    vectorizer.fit(sents)  # whole document must be split before fitting TODO -> filter
    # similarity feature: similarity of each sentence to the whole document (tf-idf vectors)
    tfidf_source = vectorizer.transform_tfidf([source])[0]
    tfidf_sents = vectorizer.transform_tfidf(sents)
    feature_sim = [similarity(tfidf_sent, tfidf_source) for tfidf_sent in tfidf_sents]
    # POS feature: number of noun-like tags in each sentence
    tagger = customtagger.load_tagger()
    tagged = [tagger.tag(filter_list(word_tokenize(sent))) for sent in sents]
    sent_tags = [list(map(lambda t: t[1], tags)) for tags in tagged]
    feature_nouns = [count_tags(tags, 'nouns') for tags in sent_tags]
    # feature_verbs = [count_tags(tags, 'verbs') for tags in sent_tags]
    # feature_adjectives = [count_tags(tags, 'adjectives') for tags in sent_tags]
    return sents, feature_pos, feature_sim, feature_nouns  # , feature_verbs, feature_adjectives
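# Hedged usage sketch: stacking the returned features into a matrix for a
# sentence-level classifier. numpy is assumed; build_feature_matrix is an
# illustrative helper, not part of the project's code.
import numpy as np

def build_feature_matrix(source):
    sents, feature_pos, feature_sim, feature_nouns = extract_features(source)
    X = np.column_stack([feature_pos, feature_sim, feature_nouns])
    return sents, X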
"http://www.latimes.com/world/rss2.0.xml", file) file.close() print('(done)') print('\nSummarizing...') file = open('worldnews.txt', encoding='utf-8') text = file.read() sents = sent_tokenize(text) file.close() vectorizer = CustomVectorizer(stopwords=stopwords.words()) vectorizer.fit(sents) vecs = vectorizer.transform_tfidf(sents) graph = {i: [] for i in range(len(vecs))} threshold = 0.1 for i in range(len(vecs)): for j in range(i + 1, len(vecs)): if similarity(vecs[i], vecs[j]) > threshold: graph[i].append(j) graph[j].append(i) graph = {k: list(set(graph[k])) for k in graph.keys()}
source_file = open(TeMario_originals + file, 'r', encoding='iso-8859-1')
target_file = open(TeMario_summaries + 'Ext-' + file, 'r', encoding='iso-8859-1')
source = pre_process(source_file.read())
target = pre_process(target_file.read())
source_file.close()
target_file.close()

sents = filter_list(sent_tokenize(source))
vectorizer = CustomVectorizer()
vectorizer.fit(sents)
vecs = vectorizer.transform_tfidf(sents)
source_score = vectorizer.transform_tfidf([source])[0]

graph = defaultdict(list)
weights_tfidf = defaultdict(dict)
weights_alternative = []  # TODO

# Build graph
threshold = 0.1
for i, v1 in enumerate(vecs):
    for j, v2 in enumerate(vecs[i + 1:], start=i + 1):
if __name__ == '__main__':
    print('\nTesting the adapted PageRank algorithm for sentence ranking and subsequent text summarization.\n'
          + 'A graph is built linking sentences whose similarity is greater than a certain threshold.\n'
          + 'This method is tested and evaluated on the "catalunha.txt" file, with a 0.1 threshold.\n')

    file = open('catalunha.txt', encoding='utf-8')
    source = pre_process(file.read())
    sents = filter_list(sent_tokenize(source))
    file.close()

    vectorizer = CustomVectorizer(stopwords=stopwords.words())
    vectorizer.fit(sents)  # -> fit on sentences or on the whole text?
    vecs = vectorizer.transform_tfidf(sents)

    # link sentences whose similarity exceeds the threshold
    graph = {i: [] for i in range(len(vecs))}
    threshold = 0.1
    for i in range(len(vecs)):
        for j in range(i + 1, len(vecs)):
            if similarity(vecs[i], vecs[j]) > threshold:
                graph[i].append(j)
                graph[j].append(i)
    graph = {k: list(set(graph[k])) for k in graph.keys()}
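    # Minimal sketch of a PageRank power iteration over the undirected, unweighted
    # graph built above, followed by picking the top-ranked sentences. The project's
    # adapted PageRank and summary-selection code live elsewhere; this only
    # illustrates the idea, with damping as the random-jump probability.
    def pagerank_sketch(graph, damping=0.15, iterations=50):
        n = len(graph)
        ranks = {node: 1 / n for node in graph}
        for _ in range(iterations):
            new_ranks = {}
            for node in graph:
                incoming = sum(ranks[nb] / len(graph[nb]) for nb in graph[node])
                new_ranks[node] = damping / n + (1 - damping) * incoming
            ranks = new_ranks
        return ranks

    ranks = pagerank_sketch(graph)
    # keep the five highest-ranked sentences, restoring their original order
    top = sorted(sorted(ranks, key=ranks.get, reverse=True)[:5])
    print('\n'.join(sents[i] for i in top))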