/
html2xhtml.py
executable file
·121 lines (107 loc) · 3.42 KB
/
html2xhtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
from typing import Union, Any
import bs4
# Each entry is the (name, public identifier, system identifier) triple that
# is unpacked into bs4.Doctype.for_name_and_ids() to build the doctype node.
_XHTML_1_0_STRICT = (
    'html',
    '-//W3C//DTD XHTML 1.0 Strict//EN',
    'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd',
)
_XHTML_1_1 = (
    'html',
    '-//W3C//DTD XHTML 1.1//EN',
    'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd',
)

# Supported XHTML versions, keyed by version string. The key order is also
# the order in which the versions are offered as command-line choices.
DOCTYPES = {
    '1.0': _XHTML_1_0_STRICT,
    '1.1': _XHTML_1_1,
}
def html2xhtml(html: Union[str, bytes], version: str = '1.1') -> str:
    """Convert an HTML document into an XHTML string.

    The markup is parsed with BeautifulSoup's ``lxml`` parser and the tree
    is then rewritten in place: the doctype is replaced, the ``xmlns``
    attribute and charset ``<meta>`` are set, empty paragraphs are removed,
    anchor ``name`` attributes become ``id`` (version 1.1 only), and the
    body content is wrapped in a ``<div>``.

    Args:
        html: Source markup, as text or raw bytes.
        version: Key of ``DOCTYPES`` selecting the target XHTML version.

    Returns:
        The serialized document.

    Raises:
        ValueError: If ``version`` is not a key of ``DOCTYPES`` (raised by
            ``set_doctype``).
    """
    document = bs4.BeautifulSoup(html, "lxml")
    set_doctype(document, version)

    # Passes that apply regardless of the target version.
    for transform in (set_xml_namespace, set_charset, remove_empty_paragraphs):
        transform(document)

    if version == '1.1':
        convert_name_to_id(document)

    # NOTE(review): assumed to run for every version, not only 1.1 -- confirm.
    wrap_body(document)
    return str(document)
def set_doctype(soup: bs4.BeautifulSoup, version: str) -> None:
    """Replace any existing doctype with the one for ``version``.

    Every top-level ``Doctype`` node is removed from ``soup`` and a new
    doctype built from ``DOCTYPES[version]`` is inserted as the first node.

    Args:
        soup: Parsed document, modified in place.
        version: Key of ``DOCTYPES`` selecting the doctype to insert.

    Raises:
        ValueError: If ``version`` is not a key of ``DOCTYPES``.
    """
    if version not in DOCTYPES:
        raise ValueError('unsupported version: %s' % version)
    new_doctype = bs4.Doctype.for_name_and_ids(*DOCTYPES[version])

    # Iterate over a snapshot because extract() mutates soup.contents.
    # extract() also removes the node outright instead of leaving an empty
    # string node behind in its place.
    for item in list(soup.contents):
        if isinstance(item, bs4.Doctype):
            item.extract()

    soup.insert(0, new_doctype)
def set_xml_namespace(soup: bs4.BeautifulSoup) -> None:
    """Set the XHTML namespace on the document's ``<html>`` element.

    Args:
        soup: Parsed document, modified in place.
    """
    root = soup.html
    root['xmlns'] = 'http://www.w3.org/1999/xhtml'
def set_charset(soup: bs4.BeautifulSoup) -> None:
    """Make the document declare UTF-8 through a single ``<meta>`` element.

    Existing charset declarations in ``<head>`` (``<meta charset=...>`` and
    ``<meta http-equiv="Content-Type" ...>``) are removed, then one
    ``<meta http-equiv="Content-Type" content="text/html; charset=utf-8">``
    is appended to ``<head>``. A ``<head>`` is created as the first child of
    ``<html>`` when the parsed tree does not contain one.

    Args:
        soup: Parsed document, modified in place.
    """
    def element_is_meta_charset(element: Any) -> bool:
        """Return True for ``<meta>`` elements that declare a charset."""
        if element.name != 'meta':
            return False
        if element.has_attr('charset'):
            return True
        # Compare case-insensitively so spellings such as "content-type"
        # are also recognised and replaced rather than duplicated.
        return element.get('http-equiv', '').lower() == 'content-type'

    head = soup.html.head
    if head is None:
        # The parsed tree may lack <head> (e.g. input that is only body
        # content); create one so the declaration has somewhere to live.
        head = soup.new_tag('head')
        soup.html.insert(0, head)

    for meta in head.find_all(element_is_meta_charset):
        meta.decompose()

    meta_attrs = {
        'http-equiv': 'Content-Type',
        'content': 'text/html; charset=utf-8',
    }
    head.append(soup.new_tag('meta', **meta_attrs))
def remove_empty_paragraphs(soup: bs4.BeautifulSoup) -> None:
    """Delete every ``<p>`` that has no child tags and no non-blank text.

    A paragraph is removed when all of its direct children are string nodes
    that are empty after stripping whitespace; a paragraph with no children
    at all is removed as well.

    Args:
        soup: Parsed document, modified in place.
    """
    def is_blank_text(node: Any) -> bool:
        """Return True for a string node made up only of whitespace."""
        return (
            isinstance(node, bs4.element.NavigableString)
            and node.strip() == ''
        )

    for paragraph in soup.find_all('p'):
        if all(is_blank_text(node) for node in paragraph.children):
            paragraph.decompose()
def convert_name_to_id(soup: bs4.BeautifulSoup) -> None:
    """Move each anchor's ``name`` attribute to ``id``.

    For every ``<a>`` under ``<html>`` that has a ``name`` attribute, its
    value is assigned to ``id`` (replacing any existing ``id``) and the
    ``name`` attribute is removed.

    Args:
        soup: Parsed document, modified in place.
    """
    named_anchors = [
        link for link in soup.html.find_all('a') if link.has_attr('name')
    ]
    for link in named_anchors:
        # pop() reads and removes the old attribute in one step.
        link['id'] = link.attrs.pop('name')
def wrap_body(soup: bs4.BeautifulSoup) -> None:
    """Wrap all direct children of ``<body>`` in a single new ``<div>``.

    Args:
        soup: Parsed document, modified in place.
    """
    body = soup.body

    # Snapshot the children first: clear() detaches them from the body.
    original_children = list(body.contents)
    body.clear()

    container = soup.new_tag('div')
    body.append(container)
    for node in original_children:
        container.append(node)
def main() -> None:
    """Command-line entry point: convert one HTML file to an XHTML file.

    Options:
        -i  Path of the HTML file to read (required).
        -o  Path of the XHTML file to write (required).
        -x  Target XHTML version, one of the keys of ``DOCTYPES``
            (default ``'1.1'``).

    The input is read in text mode with the platform's default encoding;
    the output is always written as UTF-8 bytes.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i',
        dest='input_file',
        required=True,
        help='input file'
    )
    parser.add_argument(
        '-o',
        dest='output_file',
        required=True,
        help='output file'
    )
    parser.add_argument(
        '-x',
        dest='version',
        required=False,
        choices=list(DOCTYPES.keys()),
        default='1.1',
        help='XHTML version'
    )
    args = parser.parse_args()

    # Context managers guarantee both files are closed, even when the
    # conversion or the write raises.
    with open(args.input_file, 'r') as source:
        html = source.read()

    xhtml = html2xhtml(html, args.version)

    with open(args.output_file, 'wb') as target:
        target.write(xhtml.encode('utf-8'))
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()