summarize.py
#!/usr/bin/env python
from gr3ybot_settings import SUMMARY_COUNT
from urldetect import valid_url
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
#from nltk.tag.stanford import StanfordPOSTagger
import operator
from newspaper import Article, Config

# Stopwords are stripped before scoring; a few reporting verbs that are
# common in news copy but carry little weight are added to the list.
ignoredWords = stopwords.words('english')
ignoredWords.extend(['said', 'could', 'want', 'told'])
ignoredWords = [x.lower() for x in ignoredWords]

# Punkt sentence tokenizer, used to split article bodies into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#TAGGER = StanfordPOSTagger('english-left3words-distsim.tagger')
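# e.g. tokenizer.tokenize("It rained. We stayed in.") returns the sentence
# list ["It rained.", "We stayed in."]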
def getWordFrequency(text, kws):
    # Flatten the cleaned sentences into one flat word list.
    words = []
    for sentence in text:
        words.extend(sentence.split(' '))
    # Keep the 16 most frequent words, breaking count ties alphabetically.
    wordFreq = nltk.FreqDist(words)
    wordFreq = sorted(wordFreq.items(), key=lambda x: (x[1], x[0]), reverse=True)
    wordFreq = dict(wordFreq[:16])
    # Collect every occurrence of a newspaper-extracted keyword.
    impWords = [w for w in words if w in kws]
    return wordFreq.keys(), impWords
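# A rough illustration with hypothetical input: given the cleaned sentences
# ["markets rallied sharply", "markets fell"] and kws ["markets"], the top of
# the frequency table is "markets" (count 2), and impWords comes back as
# ["markets", "markets"] -- one entry per keyword occurrence in the text.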
def getPropCount(sentence):
    # Count the proper nouns (POS tag "NNP") in a sentence.
    tags = pos_tag(sentence.split())
    props = [word for word, pos in tags if pos == "NNP"]
    return len(props)
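# As an illustration, pos_tag("Obama visited Berlin".split()) typically tags
# "Obama" and "Berlin" as NNP, so getPropCount would return 2 here.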
def getBaseScore(text, title, kws):
    scores = {}
    cleanBody = []
    propCounts = []
    body = tokenizer.tokenize(text)
    # Strip stopwords from each sentence and record its proper-noun count,
    # keeping cleanBody index-aligned with the original sentence list.
    for sentence in body:
        propCounts.append(getPropCount(sentence))
        cleanedSent = [w for w in sentence.split(' ') if w.lower() not in ignoredWords]
        cleanBody.append(" ".join(cleanedSent))
    wordFreq, impWrds = getWordFrequency(cleanBody, kws)
    firstSen = []
    for n, sentence in enumerate(cleanBody):
        if n == 0:
            firstSen = sentence.split(' ')
        sentence = sentence.split(' ')
        if "==" in sentence:
            continue  # Attempt to skip code blocks.
        # If the first word isn't an ASCII-based word, skip the sentence.
        if not all(ord(c) < 128 for c in sentence[0]):
            continue
        scoreBonus = -n  # Sentences earlier in the article are generally better for summaries.
        cnt = 0       # words shared with the headline or the lead sentence
        wordlen = 0   # total character count, for average word length
        num = 0       # numeric words and spelled-out numbers
        freq = 0      # words among the article's most frequent
        impWords = 0  # occurrences of extracted keywords
        for word in sentence:
            if word in wordFreq:
                freq += 1
            if word in impWrds:
                impWords += 1
            if word in title or word in firstSen:
                cnt += 1
            # Count digit strings ("1,200", "$5.3") and spelled-out numbers
            # ("five", "billion"): they tend to mark years and statistics.
            stripped = word.replace(",", "").replace(".", "").replace("$", "")
            try:
                if int(stripped) > 0:
                    num += 1
            except ValueError:
                pass
            if "illion" in word or word.lower() in ["one", "two", "three", "four", "five",
                                                    "six", "seven", "eight", "nine", "ten"]:
                num += 1
            wordlen += len(word)
        #--
        # Here is where I determine the strength of the sentence. This is
        # broken down into a few sections to make it easier to read.
        # NOTE: All of the following scores exclude stopwords.
        # -- -- -- -- -- -- -- -- -- --
        # First, how many words does the sentence share with the headline
        # or the lead sentence? Multiply that by 16.5.
        sharedWords = cnt * 16.5
        # Second, how many of the article's most frequent words does the
        # sentence contain? Multiply that by 2.5.
        frequentWords = freq * 2.5
        # Third, how many extracted keywords were used?
        # Multiply that by 7.75.
        importantWords = impWords * 7.75
        # Fourth, how many proper nouns does the sentence contain?
        # Multiply that by 7.
        propNouns = propCounts[n] * 7
        # Fifth, how many numbers are in the sentence? Numbers indicate
        # years or statistics, so those sentences should be rated higher.
        # Multiply that by 3.5.
        statWords = num * 3.5
        #--
        # Estimations
        #
        # The following equations are estimations based on common findings
        # in articles. As such, while they are still counted, they are
        # weighted much less.
        #--
        # How long is the average word in the sentence? Sentences with
        # longer-than-average words might be more important.
        # Divide that average by 3 (integer division).
        averageLen = float((wordlen // len(sentence)) // 3)
        # Finally, what fraction of the sentence is frequent words and
        # keywords? The whole score is scaled by this, so long sentences
        # with few strong words are penalised -- longer sentences don't
        # necessarily make better ones.
        sentenceLen = float(impWords + freq) / len(sentence)
        # Add that all together to get the strength score of the sentence,
        # keyed by the sentence's position in the article body.
        scores[n] = (scoreBonus + sharedWords + propNouns + frequentWords +
                     importantWords + statWords + averageLen) * sentenceLen
    return scores
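# Putting the weights together, each sentence's score is effectively:
#   (16.5*cnt + 2.5*freq + 7.75*impWords + 7*propCounts[n] + 3.5*num
#    + averageLen - n) * (impWords + freq) / len(sentence)
# Note the final factor: a sentence with no frequent words and no keywords
# always scores 0, however strong the other signals are.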
def summary(title, text, kws):
    summItUp = []
    title = title.split(' ')
    cleanedTitle = [w for w in title if w not in ignoredWords]
    scores = getBaseScore(text, cleanedTitle, kws)
    # Rank sentence indices by score, best first.
    sortedScores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    highScores = [index for index, score in sortedScores]
    body = tokenizer.tokenize(text)
    # Take the top SUMMARY_COUNT sentences; if the article has fewer scored
    # sentences than that, return as many as there are.
    for i in range(SUMMARY_COUNT):
        try:
            summItUp.append(body[highScores[i]].encode('utf-8', 'ignore'))
        except IndexError:
            break
    return summItUp
def whoHasTimeToRead(url):
    is_article = valid_url(url, verbose=True)
    config = Config()
    config.MAX_KEYWORDS = 10
    if is_article:
        b = Article(url=url, config=config)
        b.download()
        b.parse()
        b.nlp()  # populates b.keywords, used below for scoring
        sumNews = summary(b.title, b.text, b.keywords)
        sumTitle = b.title
        # First embedded video, if the article carries any.
        movies = b.movies[0] if len(b.movies) > 0 else "None"
        return sumNews, sumTitle, movies
    return "Nope"