-
Notifications
You must be signed in to change notification settings - Fork 0
/
glean.py
148 lines (116 loc) · 4.77 KB
/
glean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""Use boilerpipy + LXML to clean downloaded html files."""
from html import escape as html_escape
import lxml.etree
import lxml.html as html
from lxml.html.clean import Cleaner
def remove_xpaths(elem, xpaths, parent=False):
    """Detach every node matched by *xpaths* from its tree.

    Args:
        elem: Node the XPath expressions are evaluated against; it must
            provide an ``xpath(path)`` method returning the matched nodes.
        xpaths: Iterable of XPath expression strings.
        parent: When true, the *parent* of each match is detached instead
            of the match itself.

    Nodes that have nothing to be detached from are skipped. This happens
    for a root node, and for a later match whose parent was already
    detached by an earlier match in the same run (e.g. two matching
    siblings with ``parent=True``).
    """
    # NOTE(review): lxml's remove() takes the node's tail text along with
    # it - confirm that dropping that text is acceptable for every path.
    for path in xpaths:
        for tag in elem.xpath(path):
            target = tag.getparent() if parent else tag
            if target is None:
                # parent=True on a root match: there is no parent to drop
                continue
            container = target.getparent()
            if container is None:
                # Already detached (or the root itself): nothing to do
                continue
            container.remove(target)
def _strip_site_suffix(title):
    """Return *title* without its trailing ``- <site name>`` part.

    Everything from the last ``-`` onwards is dropped and the remainder is
    stripped of surrounding whitespace. A title containing no ``-`` is
    returned whole: slicing with the ``-1`` that ``rfind`` reports for
    "not found" would silently chop off its last character.
    """
    head, dash, _ = title.rpartition('-')
    return (head if dash else title).strip()


def _append_source_link(head_doc, body_doc):
    """Append a "Source" heading and link to the post body, best effort.

    The URL is read from the ``og:url`` meta tag below *head_doc* and the
    link is appended to the first ``div.entry-content`` found through
    *body_doc*. If either is missing, or the link markup cannot be built,
    the document is left untouched.
    """
    if head_doc is None:
        return
    urls = head_doc.xpath('.//meta[@property="og:url"]/@content')
    containers = body_doc.xpath("//div[@class='entry-content']")
    if not urls or not urls[0] or not containers:
        return
    # URLs routinely contain '&'; unescaped it is not well-formed XML
    src_url = html_escape(urls[0], quote=True)
    src_link = "<p><a href='" + src_url + "' rel='tag'>" + src_url + "</a></p>"  # noqa
    try:
        link = lxml.etree.XML(src_link)
    except lxml.etree.XMLSyntaxError:
        # Still best effort: e.g. control characters in the URL
        return
    # Build both nodes before touching the tree so that a failure can
    # never leave a dangling "Source" heading behind
    heading = lxml.etree.XML("<h3>Source</h3>")
    post_content_doc = containers[0]
    post_content_doc.append(heading)
    post_content_doc.append(link)


def clean(content, title=None):
    """Strip page chrome from a downloaded article and return its body HTML.

    Args:
        content: The page HTML, either UTF-8 encoded bytes or ``str``.
        title: Optional post title. It is inserted verbatim as the leading
            ``<h1>``. When falsy, the text of the page's ``<title>`` is
            used instead, with everything from the last ``-`` dropped and
            HTML-escaped before insertion.

    Returns:
        The cleaned ``<body>`` element serialized to a ``str``.
    """
    if isinstance(content, (bytes, bytearray)):
        content = content.decode("utf-8")

    # We're parsing the content html twice!
    # TODO: This one can probably be removed
    # LXML parsing is used to get title and meta head info from HTML
    html_doc = html.fromstring(content,
                               parser=html.HTMLParser(encoding="utf-8"))
    head_doc = html_doc.find('head')
    reconstructed_body = "<html><body>" + content + "</body></html>"

    # Get title so it can be added as an H1 tag, but remove it from
    # the html itself - so that Pandoc doesn't use it
    # NOTE(review): the removal only touches html_doc (the metadata tree),
    # not the markup that is returned - confirm it is still needed.
    if not title:
        title_elem = html_doc.find('.//title')
        if title_elem is not None:
            title_parent = title_elem.getparent()
            if title_parent is not None:
                title_parent.remove(title_elem)
            # text_content() is decoded text, so escape it before it is
            # pasted back into markup
            title = html_escape(
                _strip_site_suffix(title_elem.text_content()), quote=False
            )

    # Add in the title (only once: the page itself may contain a bare
    # "<body>" tag as well, and only when there is a title to add)
    if title and "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + title + "</h1>", 1
        )

    # Remove stuff that readability didn't remove
    doc = html.fromstring(reconstructed_body)

    # Use lxml's cleaner to remove all useless tags
    # (currently, this removes styles, even when not asked to)
    cleaner = Cleaner(
        scripts=True, javascript=True, comments=True,
        links=True, forms=True, annoying_tags=True,
        style=True, inline_style=False,
    )
    doc = cleaner.clean_html(doc)
    body_doc = doc.find('body')

    bad_body_xpaths = [
        "//nav",
        "//footer",
        "//button",
        "//form[@id='interview_experience_form']",
        "//div[@id='author']",
        "//div[@id='video']",
        "//div[@id='share-buttons']",
        "//div[@id='ide_link']",
        "//div[@id='disqus_thread']",
        "//div[@id='secondary']",
        "//div[@id='personalNoteDiv']",
        "//div[@id='practiceLinkDiv']",
        "//div[@class='leftSideBarParent']",
        "//div[@class='author_info_box']",
        "//div[@class='plugins']",
        "//div[@class='no-p-tag']",
        "//div[@class='comments-main']",
        "//ins[@class='adsbygoogle']",
        "//h3",
        "//h1[@class='entry-title']",
        "//h2[not(@class='tabtitle')]",
        "//hr",
        # ends-with() would be the exact test, but it requires XPath 2.0
        "//a[contains(@href, 'sudo-gate')]",
        "//p[contains(., 'contribute@geeksforgeeks.org')]",
        "//p[starts-with(., 'Please write comments if you find')]",
    ]
    bad_parent_xpaths = [
        "//h2[starts-with(text(), 'Recommended')]",
    ]

    # This one has to be removed first, so h2's parent can die!
    remove_xpaths(body_doc, bad_parent_xpaths, parent=True)
    remove_xpaths(body_doc, bad_body_xpaths)

    # Convert all language tags to p tags
    # H1 is used only for post title
    for lang_h2 in body_doc.xpath("//h2[@class='tabtitle']"):
        # Escape the label: a bare '&' or '<' is not well-formed XML
        lang_label = html_escape(lang_h2.text_content(), quote=False)
        lang_p = '<p><strong>%s</strong></p>' % lang_label
        lang_h2.addnext(lxml.etree.XML(lang_p))
        lang_h2.getparent().remove(lang_h2)

    # Not too sure if this is needed - but at this point
    # I don't want to remove any code that works
    for pre_tag in body_doc.xpath("//pre"):
        pre_tag.attrib.pop('class', None)
        pre_tag.attrib.pop('title', None)

    # Add Source link to doc. This runs after the "//h3" removal above so
    # that the freshly added "Source" heading survives.
    _append_source_link(head_doc, body_doc)

    # Code in the HTML is in the form of a table
    # We convert the table into a single pre / code tag
    for code_tag in body_doc.xpath('//div[starts-with(@id,"highlighter")]'):
        code = str(code_tag.text_content()).replace("\n\n", "")
        code = html_escape(code)
        code = "<pre> <code>" + code + "</code> </pre>"
        code_tag.addnext(lxml.etree.XML(code))
        code_tag.getparent().remove(code_tag)

    result = html.tostring(body_doc).decode("utf-8")
    return result
if __name__ == '__main__':
    # Manual smoke test: download one article and print its cleaned body.
    import requests

    url = "https://www.geeksforgeeks.org/m-coloring-problem-backtracking-5/"
    # Without a timeout a stalled connection would hang the script forever
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of cleaning and printing an error page
    response.raise_for_status()
    print(clean(response.content))