Exemple #1
0
 def _walkhtml(self, ip):
     '''Tidy HTML input and parse it into a DOM node.

     Arguments:
     ip -- HTML input handed directly to tidy() (described by the
           original author as an "HTML input path" -- TODO confirm
           whether tidy() expects a path or markup here)

     Returns the first child of the parsed document, or None when
     tidying/parsing raises one of the exceptions in self._rex.'''
     try:
         # Element 2 of tidy()'s result is what gets parsed as XHTML
         xhtml = tidy(ip, output_xhtml=1)[2]
         document = nr.parseString(xhtml, 'file://file')
     except self._rex:
         # Parsing failed: signal "nothing to embed" to the caller
         return None
     # Keep only the first child for cleanliness
     root = document.firstChild
     try:
         self._attdel(root, 'xmlns')
     except AttributeError:
         # Original note: avoids an issue with HTML docs made up of
         # comments (such a node presumably has no attributes)
         pass
     return root
Exemple #2
0
def htmldom(file):
    '''Read an HTML file, tidy it to XHTML and parse it into a DOM.

    Arguments:
    file -- path of the HTML file; it is opened in binary mode

    Returns the object produced by nr.parseString() after stripws()
    and normalize() have been applied to it.

    Any error raised by open()/read() propagates to the caller; the
    file handle is closed in every case.'''
    # Close the handle explicitly instead of leaving it to the garbage
    # collector. try/finally is used so the code also runs on
    # interpreters that predate the `with` statement.
    source = open(file, 'rb')
    try:
        raw = source.read()
    finally:
        source.close()
    # Element 2 of tidy()'s result is used as the tidied markup;
    # wrap=0 is passed so tidy does not re-wrap long lines
    # (NOTE(review): per tidy's option semantics -- confirm).
    htmlstring = tidy(raw, output_xhtml=1, wrap=0)[2]
    dom = nr.parseString(htmlstring, htmlns)
    # stripws is defined elsewhere; presumably removes
    # whitespace-only nodes -- verify against its definition.
    stripws(dom)
    dom.normalize()
    return dom
Exemple #3
0
import urllib2
from mx.Tidy import tidy

# Keyword options forwarded to tidy() below.
_options = dict(
    char_encoding="iso2022",
    drop_font_tags=1,
    drop_proprietary_attributes=1,
    hide_comments=1,
    language="kr",
    fix_bad_comments=1,
    output_xhtml=1,
    tidy_mark=0,
    fix_uri=1
)

# Known issue (reported by the original author): tidying the first URL
# below produces an error, while the commented-out URL and almost every
# other page work fine.

url = "http://gangdong.go.kr/pub/jpn/jpn02040101.html"
#url = "http://www.python.org"

response = urllib2.urlopen(url)
try:
    listofpage = response.readlines()
finally:
    # Release the connection even if reading fails part-way through
    response.close()
html = "".join(listofpage)

# tidy() returns (error count, warning count, tidied output, error text)
nerrors, nwarnings, outputdata, errordata = tidy(html,
                                                 output=None,
                                                 errors=None,
                                                 **_options)
# A single parenthesized argument prints identically under the
# Python 2 print statement.
print(outputdata)
import urllib2
from mx.Tidy import tidy

# Keyword options forwarded to tidy() below.
_options = dict(
    char_encoding="iso2022",
    drop_font_tags=1,
    drop_proprietary_attributes=1,
    hide_comments=1,
    language="kr",
    fix_bad_comments=1,
    output_xhtml=1,
    tidy_mark=0,
    fix_uri=1
)

# Known issue (reported by the original author): tidying the first URL
# below produces an error, while the commented-out URL and almost every
# other page work fine.

url = "http://gangdong.go.kr/pub/jpn/jpn02040101.html"
#url = "http://www.python.org"

response = urllib2.urlopen(url)
try:
    listofpage = response.readlines()
finally:
    # Release the connection even if reading fails part-way through
    response.close()
html = "".join(listofpage)

# tidy() returns (error count, warning count, tidied output, error text)
nerrors, nwarnings, outputdata, errordata = tidy(html, output=None, errors=None, **_options)
# A single parenthesized argument prints identically under the
# Python 2 print statement.
print(outputdata)