/
parser.py
88 lines (75 loc) · 2.24 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 19 21:37:14 2019
@author: Carlos McNulty
"""
import urllib.request as urllib2
import lxml.html
from lxml.html import parse
from lxml.html import document_fromstring
import requests
from summarizer import summarize
import logging
from bs4 import BeautifulSoup
from lxml import etree
from lxml.html.clean import Cleaner
from lxml.html import builder as E
from urllib.parse import urlparse
import urllib
def parse_default(tree, title):
    """Fallback handler for sites that have no dedicated parser.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None
def parse_cnn(tree, title):
    """Handler reserved for CNN pages.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None
def _section_nodes(header, paragraphs):
    """Return the ``<h2>``/``<p>`` elements for one section, or ``[]`` if it has no text."""
    if not paragraphs:
        return []
    # Join with a space so the last sentence of one paragraph and the first
    # sentence of the next stay separated by whitespace.
    summary = summarize(" ".join(paragraphs), limit=2)
    # Build elements instead of concatenating markup strings so that lxml
    # escapes any "<" or "&" occurring in the article text.
    return [E.H2(header), E.P(summary)]


def parse_wiki(tree, title, output_path="summarized-roanoke.html"):
    """Summarize each section of a parsed Wikipedia article into an HTML file.

    The direct children of the first ``<div class="mw-parser-output">`` are
    walked in document order.  ``<p>`` text is accumulated until the next
    heading (``h1``-``h6``); the accumulated text is then passed to
    ``summarize(text, limit=2)`` and emitted as an ``<h2>`` (the heading that
    preceded the text, empty for the lead section) followed by a ``<p>``.
    Text following the final heading is summarized as well.

    Args:
        tree: Parsed document exposing ``getroot()`` (an lxml element tree).
        title: Text used for both the ``<title>`` and the top ``<h1>``.
        output_path: Destination file for the generated HTML.  Defaults to
            ``"summarized-roanoke.html"``.

    Raises:
        IndexError: If the document has no ``mw-parser-output`` div.
    """
    matches = tree.getroot().xpath("//div[@class='mw-parser-output']")
    if not matches:
        raise IndexError("no <div class='mw-parser-output'> found in document")
    parser_div = matches[0]

    # NOTE(review): only headings that are *direct* children of the content
    # div are recognised; if the page wraps headings in another element they
    # will be missed and everything lands in one section -- confirm against
    # the current page markup.
    heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}

    body_nodes = []
    header = ""
    paragraphs = []
    for child in parser_div:
        if child.tag == "p":
            stripped = child.text_content().strip()
            if stripped:
                paragraphs.append(stripped)
        elif child.tag in heading_tags:
            # A new heading closes the section collected so far.
            body_nodes.extend(_section_nodes(header, paragraphs))
            paragraphs = []
            # Drop trailing bracketed suffixes such as "[edit]".
            header = child.text_content().split("[")[0]
            print(header)
    # Flush the text that follows the last heading; without this the final
    # section would be silently discarded.
    body_nodes.extend(_section_nodes(header, paragraphs))

    # TODO - add style sheet
    # TODO - format text
    html_out = E.HTML(
        E.HEAD(
            E.TITLE(title)
        ),
        E.BODY(
            E.H1(E.CLASS("heading"), title),
            # Single wrapper <div>; stays valid (empty) when no section
            # produced any text.
            E.DIV(*body_nodes)
        )
    )
    html_out.getroottree().write(output_path, method="html")
def main(url="https://en.wikipedia.org/wiki/Roanoke_Colony"):
    """Fetch *url*, strip unwanted markup and hand the page to a site parser.

    The page is downloaded and parsed with lxml, its ``<title>`` text is
    captured, the tree is cleaned, and the result is dispatched on the URL's
    host name to ``parse_wiki``, ``parse_cnn`` or ``parse_default``.

    Args:
        url: Address of the page to process.
    """
    # Options are passed to the constructor (rather than assigned as
    # attributes afterwards) because the constructor rejects unknown option
    # names; the original attribute assignment ``cleaner.frame = True`` was a
    # silent no-op typo for ``frames``.
    cleaner = Cleaner(
        javascript=True,
        scripts=True,
        frames=True,
        meta=True,
        comments=True,
        links=True,
        style=True,
        kill_tags=["cite", "sup", "img", "noscript", "label", "video"],
    )

    # Close the HTTP response as soon as the document has been parsed.
    with urllib2.urlopen(url) as response:
        tree = lxml.html.parse(response)

    # Read the title before cleaning; fall back to the URL when the page has
    # no <title> (or an empty one) instead of crashing on ``None``.
    title = tree.findtext(".//title") or url
    tree = cleaner.clean_html(tree)

    # ``hostname`` is lower-cased and excludes any port or credentials.
    host = urlparse(url).hostname or ""
    if host == "en.wikipedia.org":
        parse_wiki(tree, title)
    elif host == "cnn.com" or host.endswith(".cnn.com"):
        # Also match sub-domains such as "www.cnn.com".
        parse_cnn(tree, title)
    else:
        parse_default(tree, title)


if __name__ == "__main__":
    main()