# htmlsoup2cnxml.py
# (forked from oerpub/oerpub.rhaptoslabs.html_gdocs2cnxml)
#! /usr/bin/env python
import sys
import os
import urllib2
#from urlparse import urlparse
from urlparse import urljoin
import libxml2
import libxslt
from tidylib import tidy_document
from xhtmlpremailer import xhtmlPremailer
from lxml import etree
import magic
from readability.readability import Document
# Directory containing this module; used to locate the bundled XSL and
# catalog files under www/.
current_dir = os.path.dirname(__file__)
# XML catalog that makes the XHTML entity definitions resolvable
# (loaded via libxml2.loadCatalog before parsing).
XHTML_ENTITIES = os.path.join(current_dir, 'www', 'catalog_xhtml', 'catalog.xml')
# First-pass XSLT stylesheet (XHTML -> intermediate CNXML; applied first
# in xsl_transform).
XHTML2CNXML_XSL1 = os.path.join(current_dir, 'www', 'xhtml2cnxml_meta1.xsl')
# Second-pass XSLT stylesheet (intermediate -> final CNXML).
XHTML2CNXML_XSL2 = os.path.join(current_dir, 'www', 'xhtml2cnxml_meta2.xsl')
# HTML Tidy, HTML Soup to XHTML
# Premail XHTML
def tidy_and_premail(content):
    """Tidy HTML soup into XHTML and inline its CSS.

    content -- raw (possibly malformed) HTML string.
    Returns the tidied XHTML with all CSS inlined into the tags.  If
    premailing fails (e.g. on complicated CSS), the tidied but
    unpremailed XHTML is returned instead of raising.
    """
    # HTML Tidy: clean up the soup and convert it to XHTML.
    strTidiedXhtml, strErrors = tidy_document(content, options={
        'output-xhtml': 1,      # XHTML instead of HTML4
        'indent': 0,            # No indent: extra linespace/linefeeds are big problems downstream
        'tidy-mark': 0,         # No tidy meta tag in output
        'wrap': 0,              # No wrapping
        'alt-text': '',         # Help ensure validation
        'doctype': 'strict',    # Little sense in transitional for tool-generated markup
        'force-output': 1,      # May not get what you expect but you will get something
        'numeric-entities': 1,  # Remove HTML entities like e.g. nbsp
        'clean': 1,             # Cleaning
        'bare': 1,
        'word-2000': 1,         # Cleans Word HTML
        'drop-proprietary-attributes': 1,
        'enclose-text': 1,      # Always enclose body text with <p>...</p>
        'logical-emphasis': 1   # Transform <i>/<b> to <em>/<strong>
        })
    # XHTML Premailer: remove CSS references and place the whole CSS inside
    # tags (premailers usually do this for old email clients).  The special
    # XHTML premailer does not destroy the XML structure.
    try:
        premailer = xhtmlPremailer(strTidiedXhtml)
        return premailer.transform()
    except Exception:
        # Premailer can fail on complicated CSS; fall back to the tidied,
        # unpremailed XHTML rather than aborting the conversion.  (Was a
        # bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
        return strTidiedXhtml
# Downloads images and sets metadata for further processing
def downloadImages(xml, base_or_source_url='.'):
objects = {} # image contents will be saved here
xpathImages = etree.XPath('//cnxtra:image', namespaces={'cnxtra':'http://cnxtra'})
imageList = xpathImages(xml)
image_opener = urllib2.build_opener()
image_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
for position, image in enumerate(imageList):
strImageUrl = image.get('src')
if len(strImageUrl) > 0 and len(base_or_source_url) > 0:
if base_or_source_url != '.': # if we have a base url join this url strings
strImageUrl = urljoin(base_or_source_url, strImageUrl)
try:
# strImageContent = urllib2.urlopen(strImageUrl).read() # this does not work for websites like e.g. Wikipedia
image_request = image_opener.open(strImageUrl)
strImageContent = image_request.read()
# get Mime type from image
strImageMime = magic.whatis(strImageContent)
# only allow this three image formats
if strImageMime in ('image/png', 'image/jpeg', 'image/gif'):
image.set('mime-type', strImageMime)
strImageName = "gd-%04d" % (position + 1) # gd0001.jpg
if strImageMime == 'image/jpeg':
strImageName += '.jpg'
elif strImageMime == 'image/png':
strImageName += '.png'
elif strImageMime == 'image/gif':
strImageName += '.gif'
strAlt = image.get('alt')
if not strAlt:
image.set('alt', strImageUrl) # getNameFromUrl(strImageUrl))
image.text = strImageName
# add contents of image to object
objects[strImageName] = strImageContent
# just for debugging
#myfile = open(strImageName, "wb")
#myfile.write(strImageContent)
#myfile.close
except:
print 'Warning: ' + strImageUrl + ' could not be downloaded.' # do nothing if url could not be downloaded
else:
print 'Warning: image url or base url not valid! One image will be skipped!'
return xml, objects
def add_cnxml_title(etree_xml, new_title):
    """Set the text of the CNXML /document/title element to *new_title*.

    etree_xml -- lxml etree whose root is a CNXML <document>.
    new_title -- title string to store.
    Returns the (mutated) tree.  If the document has no title element the
    tree is returned unchanged (the old code raised IndexError here).
    """
    title = etree_xml.xpath('/cnxml:document/cnxml:title',
                            namespaces={'cnxml': 'http://cnx.rice.edu/cnxml'})
    if title:
        title[0].text = new_title
    return etree_xml
# Main method. Doing all steps for the HTMLSOUP to CNXML transformation
def xsl_transform(content, bDownloadImages, base_or_source_url='.'):
    """Main method. Doing all steps for the HTMLSOUP to CNXML transformation.

    content            -- raw HTML string to convert.
    bDownloadImages    -- when true, download images referenced by the
                          intermediate document (step 7).
    base_or_source_url -- base URL for resolving relative image URLs.
    Returns (cnxml_string, image_objects_dict, html_title).
    NOTE(review): libxml2/libxslt objects below are freed manually in a
    specific order (freeStylesheet / freeDoc); keep that order intact.
    """
    html_title = "Untitled"
    # 1 get title with readability
    # ONLY MAKES SENSE FOR AN UNKNOWN HTML, SO I COMMENTED IT OUT FOR https://github.com/Connexions/rhaptos.html2cnxml
    #try:
    #    html_title = Document(content).title()
    #except:
    #    pass
    # 2 use readabilty to get content
    # ONLY MAKES SENSE FOR AN UNKNOWN HTML, SO I COMMENTED IT OUT FOR https://github.com/Connexions/rhaptos.html2cnxml
    #readable_article = Document(content).summary()
    readable_article = content
    # 3 tidy and premail
    strTidiedHtml = tidy_and_premail(readable_article)
    # 4 Load XHTML catalog files: Makes XHTML entities readable.
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)
    # 5 XSLT transformation (first pass: XHTML -> intermediate CNXML)
    styleDoc1 = libxml2.parseFile(XHTML2CNXML_XSL1)
    style1 = libxslt.parseStylesheetDoc(styleDoc1)
    # doc1 = libxml2.parseFile(afile))
    doc1 = libxml2.parseDoc(strTidiedHtml)
    result1 = style1.applyStylesheet(doc1, None)
    #style1.saveResultToFilename(os.path.join('output', docFilename + '_meta.xml'), result1, 1)
    strResult1 = style1.saveResultToString(result1)
    # Free libxml2/libxslt resources from the first pass.
    style1.freeStylesheet()
    doc1.freeDoc()
    result1.freeDoc()
    # Parse XML with etree from lxml for TeX2MathML and image download
    etreeXml = etree.fromstring(strResult1)
    # 6 Convert TeX to MathML with Blahtex (not in XHTML)
    # etreeXml = tex2mathml(etreeXml)
    # 7 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml, base_or_source_url)
    # 8 add title from html (always "Untitled" here since readability is disabled above)
    etreeXml = add_cnxml_title(etreeXml, html_title)
    # Convert etree back to string
    strXml = etree.tostring(etreeXml) # pretty_print=True)
    # 9 Second transformation (intermediate -> final CNXML)
    styleDoc2 = libxml2.parseFile(XHTML2CNXML_XSL2)
    style2 = libxslt.parseStylesheetDoc(styleDoc2)
    doc2 = libxml2.parseDoc(strXml)
    result2 = style2.applyStylesheet(doc2, None)
    #style2.saveResultToFilename('tempresult.xml', result2, 0) # just for debugging
    strResult2 = style2.saveResultToString(result2)
    # Free libxml2/libxslt resources from the second pass.
    style2.freeStylesheet()
    doc2.freeDoc()
    result2.freeDoc()
    return strResult2, imageObjects, html_title
def htmlsoup_to_cnxml(content, bDownloadImages=False, base_or_source_url='.'):
    """Public entry point: convert an HTML soup string to CNXML.

    content            -- raw HTML string.
    bDownloadImages    -- when true, also download referenced images.
    base_or_source_url -- base URL for resolving relative image URLs.
    Returns (cnxml_string, image_objects_dict, title_string); see
    xsl_transform for the individual steps.
    """
    # The old local `objects = {}` was dead code (immediately overwritten
    # by the xsl_transform result), so the call is returned directly.
    return xsl_transform(content, bDownloadImages, base_or_source_url)
if __name__ == "__main__":
f = open(sys.argv[1])
content = f.read()
print htmlsoup_to_cnxml(content)