def _walkhtml(self, ip):
    '''Tidy HTML input into XHTML and hand back its DOM node.

    Arguments:
    ip -- HTML input, passed unchanged to tidy()
          NOTE(review): the earlier docstring called this an "input
          path"; confirm whether callers pass markup or a path.

    Returns the first child of the parsed document, or None when
    tidying/parsing raises self._rex.'''
    try:
        # Element 2 of tidy()'s result is what gets parsed
        xhtml = tidy(ip, output_xhtml=1)[2]
        document = nr.parseString(xhtml, 'file://file')
    except self._rex:
        # Input that cannot be parsed is not embedded
        return None

    # Keep only the first child for cleanliness
    root = document.firstChild

    try:
        self._attdel(root, 'xmlns')
    except AttributeError:
        # Tolerated on purpose: avoids an issue with HTML documents
        # made up of comments
        pass

    return root
def htmldom(file):
    '''Read an HTML file, tidy it to XHTML and return the parsed DOM.

    Arguments:
    file -- path of the HTML file; it is opened in binary mode

    The file contents are run through tidy() (XHTML output, wrap=0),
    element 2 of tidy()'s result is parsed with nr.parseString() using
    htmlns as the second argument, and the resulting document is passed
    to stripws() and then normalize()d before being returned.'''
    source = open(file, 'rb')
    try:
        raw = source.read()
    finally:
        # Close the handle explicitly instead of leaving it to the
        # garbage collector
        source.close()

    htmlstring = tidy(raw, output_xhtml=1, wrap=0)[2]
    dom = nr.parseString(htmlstring, htmlns)
    # stripws() is defined elsewhere; it is applied before normalize()
    stripws(dom)
    dom.normalize()
    return dom
import urllib2

from mx.Tidy import tidy

# Keyword options forwarded to tidy() below.
_options = dict(char_encoding="iso2022",
                drop_font_tags=1,
                drop_proprietary_attributes=1,
                hide_comments=1,
                language="kr",
                fix_bad_comments=1,
                output_xhtml=1,
                tidy_mark=0,
                fix_uri=1)

# Known problem: parsing the first URL raises an error, while the second
# URL (and almost every other URL) works fine. Swap the two assignments
# to compare.
url = "http://gangdong.go.kr/pub/jpn/jpn02040101.html"
#url = "http://www.python.org"

# Fetch the whole page in one read and always release the connection,
# even if reading fails.
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# tidy() yields four values here: error count, warning count, the
# cleaned-up output and the collected error text.
nerrors, nwarnings, outputdata, errordata = tidy(html, output=None,
                                                 errors=None, **_options)
print(outputdata)