/
scraper.py
47 lines (30 loc) · 1 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from gensim.summarization import summarize
from bs4 import BeautifulSoup
import requests
def extractText(url):
    """Fetch *url* and extract the article headline and paragraph text.

    Parameters
    ----------
    url : str
        Address of the article page to scrape.

    Returns
    -------
    dict
        {"header": text of the first <h1> tag ("" if the page has none),
         "text": all <p> paragraphs longer than three words, concatenated}
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    result = {"header": "", "text": ""}

    # find() returns None when absent, unlike find_all(...)[0] which raised
    # IndexError on pages without an <h1>.
    header = soup.find("h1")
    if header is not None:
        result["header"] = header.text
        print(header.text)

    # Keep paragraphs longer than three words.  Work on the decoded text
    # directly (the original encoded to bytes and word-counted str(bytes),
    # i.e. the b'...' repr) and join once instead of quadratic +=.
    paragraphs = [
        p_tag.text
        for p_tag in soup.find_all("p")
        if len(p_tag.text.split(" ")) > 3
    ]
    result["text"] = "".join(paragraphs)
    return result
def findSummary(text, ratio):
    """Print a banner, then return a gensim extractive summary of *text*.

    Parameters
    ----------
    text : str
        Body text to summarize.
    ratio : float
        Fraction (0-1) of the original sentences to keep.

    Returns
    -------
    str
        The summary produced by :func:`gensim.summarization.summarize`.
    """
    banner = 'Summary:'
    print(banner)
    summary = summarize(text, ratio)
    return summary
if __name__ == "__main__":
    # NOTE(review): the original URL contained "®ion=" — the HTML entity
    # "&reg" was decoded somewhere upstream, corrupting "&region=".  Restored.
    url = (
        'https://www.nytimes.com/2018/06/09/science/'
        'fish-decompression-chamber.html'
        '?action=click&contentCollection=science&region=rank'
        '&module=package&version=highlights&contentPlacement=2'
        '&pgtype=sectionfront'
    )
    result = extractText(url)["text"]
    print(result)
    # The original discarded findSummary's return value, so only the
    # "Summary:" banner was ever shown — print the actual summary.
    print(findSummary(result, 0.1))
# # body = soup.find(id="site content")
# print tag
# print soup.prettify()