def test_bad_option_values(self):
    """Malformed values for known options must raise OptionArgError."""
    badopts = [{"indent": "---"}, {"indent_spaces": None}]
    for opts in badopts:
        # assertRaisesRegex replaces the deprecated assertRaisesRegexp
        # alias, which was removed in Python 3.12.
        with self.assertRaisesRegex(
            tidy.OptionArgError, "missing or malformed argument"
        ):
            tidy.parseString(self.input2, **opts)
def test_bad_options(self):
    """Unknown option names must raise InvalidOptionError."""
    badopts = [{"foo": 1}]
    for opts in badopts:
        # assertRaisesRegex replaces the deprecated assertRaisesRegexp
        # alias, which was removed in Python 3.12.
        with self.assertRaisesRegex(
            tidy.InvalidOptionError, "not a valid Tidy option"
        ):
            tidy.parseString(self.input2, **opts)
def test_encodings(self):
    """Feed ascii input (with xmlcharrefreplace references) to tidy and
    check that e-acute survives as latin1 and utf8 output bytes.

    Ported from Python 2: the removed ``file()`` builtin is replaced by
    ``open()`` in a context manager, and the deprecated ``failUnless``
    by ``assertTrue``.  Output is compared as bytes via ``getvalue()``
    -- TODO confirm against the tidy binding in use.
    """
    with open('foo.htm', 'rb') as f:
        foo = f.read().decode('utf8').encode('ascii', 'xmlcharrefreplace')
    doc1u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='latin1')
    self.assertTrue(doc1u.getvalue().find(b'\xe9') >= 0)
    doc2u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='utf8')
    self.assertTrue(doc2u.getvalue().find(b'\xc3\xa9') >= 0)
def test_badOptions(self):
    """Every invalid option set must make parseString raise TidyLibError."""
    invalid_option_sets = [{'foo': 1}, {'indent': '---'}, {'indent_spaces': None}]
    for options in invalid_option_sets:
        raised = False
        try:
            tidy.parseString(self.input2, **options)
        except tidy.TidyLibError:
            raised = True
        if not raised:
            self.fail("Invalid option %s should have raised an error" % repr(options))
def test_encodings(self):
    """Parse ascii input containing character references and verify the
    e-acute byte appears in both latin1 and utf8 output.

    Fix: the file handle was opened and never closed; a context manager
    now releases it even when an assertion fails.
    """
    with open(self.test_file, "rb") as f:
        text = f.read().decode("utf8").encode("ascii", "xmlcharrefreplace")
    doc1u = tidy.parseString(text, input_encoding="ascii", output_encoding="latin1")
    self.assertTrue(doc1u.getvalue().find(b"\xe9") >= 0)
    doc2u = tidy.parseString(text, input_encoding="ascii", output_encoding="utf8")
    self.assertTrue(doc2u.getvalue().find(b"\xc3\xa9") >= 0)
def test_encodings(self):
    """Check latin1 and utf8 output encodings for ascii-encoded input.

    Fix: the file handle was never closed; it is now managed by a
    ``with`` block.
    """
    with open(self.test_file, "rb") as handle:
        text = handle.read().decode("utf8").encode("ascii", "xmlcharrefreplace")
    doc1u = tidy.parseString(text, input_encoding="ascii", output_encoding="latin1")
    self.assertTrue(doc1u.getvalue().find(b"\xe9") >= 0)
    doc2u = tidy.parseString(text, input_encoding="ascii", output_encoding="utf8")
    self.assertTrue(doc2u.getvalue().find(b"\xc3\xa9") >= 0)
def test_encodings(self):
    """Verify e-acute survives latin1 and utf8 output encodings.

    Ported from Python 2: ``file()`` (removed builtin) replaced by
    ``open()`` in a context manager, deprecated ``failUnless`` replaced
    by ``assertTrue``, and the output compared as bytes via
    ``getvalue()`` -- TODO confirm against the tidy binding in use.
    """
    with open('foo.htm', 'rb') as f:
        foo = f.read().decode('utf8').encode('ascii', 'xmlcharrefreplace')
    doc1u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='latin1')
    self.assertTrue(doc1u.getvalue().find(b'\xe9') >= 0)
    doc2u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='utf8')
    self.assertTrue(doc2u.getvalue().find(b'\xc3\xa9') >= 0)
def test_encodings(self):
    """Check latin1 and utf8 output encodings for ascii input.

    Fix: ``str(doc).find(b'...')`` mixed ``str`` with ``bytes`` and
    raises TypeError on Python 3; the output is now taken as bytes via
    ``getvalue()`` (as the sibling tests do).  The file handle is also
    closed via a context manager instead of being leaked.
    """
    with open(self.test_file, 'rb') as f:
        text = f.read().decode('utf8').encode('ascii', 'xmlcharrefreplace')
    doc1u = tidy.parseString(text, input_encoding='ascii', output_encoding='latin1')
    self.assertTrue(doc1u.getvalue().find(b'\xe9') >= 0)
    doc2u = tidy.parseString(text, input_encoding='ascii', output_encoding='utf8')
    self.assertTrue(doc2u.getvalue().find(b'\xc3\xa9') >= 0)
def test_options(self):
    """Exercise a handful of tidy options end to end."""
    shared = dict(add_xml_decl=1, show_errors=1, newline="CR", output_xhtml=1)
    first = tidy.parseString(self.input1, **shared)
    self.assertIn("CDATA", str(first))
    second = tidy.parseString("<Html>", **shared)
    # An XML declaration was requested, so output must begin with one.
    self.assertTrue(str(second).startswith("<?xml"))
    # The sloppy input is expected to produce at least one error report.
    self.assertFalse(len(second.errors) == 0)
    # newline="CR" means no LF may appear in the output.
    self.assertNotIn("\n", str(second))
    third = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
    self.assertIn('alt="foo"', third.gettext())
    self.assertIn("é", third.gettext())
def get_page_title(content):
    """Return the <title> text of an HTML page.

    Tidies the markup to XHTML, substitutes entities, and reads the
    title from the parsed tree; falls back to a regex search when
    parsing fails.

    Ported from Python 2: ``except Exception, e`` and the ``print``
    statement replaced with their Python 3 forms; commented-out debug
    code removed.
    """
    try:
        content = str(tidy.parseString(content, output_xhtml=True,
                                       add_xml_decl=True, indent=False,
                                       tidy_mark=False))
        content = ENTITY.sub(ENTITY_REP, content)
        root = etree.fromstring(content)
        head = root.find("{http://www.w3.org/1999/xhtml}head")
        title = head.find("{http://www.w3.org/1999/xhtml}title")
        titletext = title.text
        time.sleep(0.5)
        return titletext
    except Exception as e:
        print("\tHTML Parser Error:", str(e))
    # Fallback: scrape the title with a regex.
    m = R_TITLE.search(content)
    if m is not None:
        return m.group(1)
    return ""
def issue(answers_xml):
    """Validate the answers XML and return it transformed by the XSLT
    stylesheet.

    Fix: everything after the first ``return`` (a disabled tidy pass
    guarded by a bare ``except:``) was unreachable dead code and has
    been removed; behaviour is unchanged.
    """
    # Validate and parse the answers document.
    ctxt = validateAnswers(answers_xml)
    # Apply the xslt transform.
    transform = lxml.etree.XSLT(lxml.etree.parse(XSLT_SOURCE))
    result = transform.apply(ctxt)
    return transform.tostring(result)
def convert(text):
    """Strip div tags, tidy to body-only XHTML, and run the result
    through Converter.

    Ported from Python 2: ``unicode`` -> ``str`` and
    ``StringIO.StringIO`` -> ``io.BytesIO`` (tidy writes bytes).
    NOTE(review): under Python 3 ``re_div`` must be compiled from a
    bytes pattern, since it is applied to utf-8 bytes here -- confirm.
    """
    import io
    unicodeflag = False
    if isinstance(text, str):
        text = text.encode('utf-8')
        unicodeflag = True
    text = re_div.sub(b'', text)
    options = dict(output_xhtml=1, add_xml_decl=0, indent='auto',
                   tidy_mark=0, wrap=0, drop_empty_paras=1,
                   logical_emphasis=1, lower_literals=1,
                   show_body_only=1, char_encoding='utf8')
    dom = tidy.parseString(text, **options)
    buf = io.BytesIO()
    dom.write(buf)
    text = buf.getvalue()
    if unicodeflag:
        text = text.decode('utf-8')
    con = Converter()
    con.parse_string(text)
    return con.output()
def load_doc_file(filename, f):
    """Tidy one documentation file and INSERT it into the docs table.

    Ported from Python 2: ``unicode()`` and the ``print`` statement
    replaced with their Python 3 forms.
    """
    global pagecount
    tidyopts = dict(drop_proprietary_attributes=1, alt_text='',
                    hide_comments=1, output_xhtml=1, show_body_only=1,
                    clean=1, char_encoding='utf8', indent='auto')
    # assumes f was opened in binary mode and yields latin1-encoded
    # bytes -- TODO confirm against the caller
    contents = f.read().decode('latin1')
    tm = re_titlematch.search(contents)
    title = tm.group(1) if tm else ""
    if not quiet:
        print("--- file: %s (%s) ---" % (filename, title))
    s = tidy.parseString(contents.encode('utf-8'), **tidyopts)
    curs.execute(
        "INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)",
        {'f': filename, 'v': ver, 't': title, 'c': str(s)})
    pagecount += 1
def clean(txt):
    """Tidy *txt* into strict XHTML and return it as a text string.

    Ported from Python 2: ``unicode(str(doc), 'utf8')`` replaced by
    decoding the document bytes directly (``getvalue()`` returns the
    tidied output as bytes -- TODO confirm against the binding in use).
    """
    options = {'output_xhtml': 1,
               'add_xml_decl': 0,
               'indent': 0,
               'tidy_mark': 0,
               'doctype': "strict",
               'wrap': 0}
    return tidy.parseString(txt, **options).getvalue().decode('utf8')
def traverseDom(domString):
    """Blank every text node and attribute value in the document and
    return ``(tag_count, serialized_xml)``.

    Text nodes are visited but not counted as tags.
    """
    out_buffer = StringIO()
    cleaned = clean_html(str(tidy.parseString(domString)))
    tree = etree.HTML(cleaned.replace('\r', ''))
    cleaned = '\n'.join(
        [etree.tostring(subtree, pretty_print=True, method="xml")
         for subtree in tree])
    dom = xml.dom.minidom.parseString(cleaned)
    pending = Queue()
    for top_node in dom.childNodes:
        pending.put(top_node)
    tagCount = 0
    while not pending.empty():
        node = pending.get()
        tagCount += 1
        if node.nodeName == '#text':
            node.nodeValue = ""
            tagCount -= 1  # undo the count: text nodes are not tags
        if node.attributes:
            for key in node.attributes.keys():
                node.attributes[key].value = ""
        for child in node.childNodes:
            pending.put(child)
    dom.writexml(out_buffer)
    return (tagCount, out_buffer.getvalue())
def tidyhtml(html):
    """simply tidies up html code, returning xhtml

    Ported from Python 2: the ``unicode`` isinstance check becomes a
    ``str`` check; text input is encoded to utf-8 before being handed
    to tidy.
    """
    if isinstance(html, str):
        html = html.encode("utf-8")
    html = tidy.parseString(html, output_xhtml=1, tidy_mark=0,
                            input_encoding="utf8", output_encoding="utf8")
    return str(html)
def parse_html(doc, url, config):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs
    for all links we want to spider in the HTML.
    """
    begin_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
    end_marker = '<ENDCOMMENT-' + str(random.random()) + '>'
    new_urls = []
    if TIDY:
        doc = str(tidy.parseString(doc, **dict(output_xhtml=1, wrap=0)))
    # Temporarily hide comments so htmldata still finds the URLs inside
    # IE's funky "<!--[if" conditional-comment hackery.
    doc = doc.replace('<!--', begin_marker).replace('-->', end_marker)
    links = htmldata.urlextract(doc, url, 'text/html')
    for link in links:
        target = link.url
        if should_follow(url, target, config):
            # Store url locally.
            new_urls.append(target)
            link.url = url_to_relative(target, url, config)
        else:
            link.url = rewrite_external_url(link.url, config)
    newdoc = htmldata.urljoin(doc, links)
    newdoc = newdoc.replace(begin_marker, '<!--')
    newdoc = newdoc.replace(end_marker, '-->')
    newdoc = newdoc.replace('<br>', '<br/>')
    newdoc = post_html_transform(newdoc, url, config)
    return (newdoc, new_urls)
def render_partial(T, template, fragments=None, vars=None, loader=None, **kw):
    """Evaluate *template* and return its flattened output, optionally
    run through tidy.

    Ported from Python 2: ``unicode(...)`` replaced with ``str(...)``.
    The bare ``except:`` is narrowed to ``except Exception`` so
    SystemExit/KeyboardInterrupt still propagate.
    """
    try:
        result = T._evaluate(template, fragments, vars, loader, **kw)
        output = flatten(result)
    except Exception:
        # In debug mode render the traceback instead of propagating it.
        if T.debug:
            return T.debug_out(sys.exc_info()[:-1], template)
        else:
            raise
    if T.tidy and tidylib:
        options = dict(input_xml=True, output_xhtml=True,
                       add_xml_decl=False, doctype='omit', indent='auto',
                       tidy_mark=False, input_encoding='utf8')
        return str(tidylib.parseString(output.encode('utf-8'), **options))
    return output
def wiki_to_html(self, wikitext, req):
    """Render Trac wiki text to tidied XHTML (utf-8 bytes).

    Fix: the regex patterns are now raw strings, so sequences like
    ``\\[`` are regex escapes rather than invalid string escapes (a
    DeprecationWarning today, a SyntaxError in future Pythons).
    """
    self.env.log.debug('start function wiki_to_html')  # pylint: disable-msg=E1101
    # Remove some macros (TOC is better handled in ODT itself)
    for macro in self.remove_macros:
        wikitext = re.sub(r'\[\[%s(\([^)]*\))?\]\]' % macro, "", wikitext)
    # Now convert wiki to HTML
    out = StringIO()
    context = Context.from_request(req, absurls=True)
    Formatter(self.env,  # pylint: disable-msg=E1101
              context('wiki', self.page_name)).format(wikitext, out)
    html = Markup(out.getvalue())
    html = html.encode("utf-8", 'replace')
    # Clean up the HTML
    html = re.sub('<span class="icon">.</span>', '', html)  # Remove external link icon
    tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                        tidy_mark=0, input_encoding='utf8',
                        output_encoding='utf8', doctype='auto', wrap=0,
                        char_encoding='utf8')
    html = tidy.parseString(html, **tidy_options)
    # Replace nbsp with entity:
    # http://www.mail-archive.com/[email protected]/msg03670.html
    # NOTE(review): these two literals look garbled (both render as a
    # plain space); confirm the intended nbsp -> entity replacement
    # against upstream before changing them.
    html = str(html).replace(" ", " ")
    # Tidy creates newlines after <pre> (by indenting)
    html = re.sub(r'<pre([^>]*)>\n', r'<pre\1>', html)
    return html
def cleanupText(self):
    '''This function generates an ODT document from the text of a report'''
    # This should really be at the top of this file.  Leaving it here
    # for the time being so that having libtidy is not a requirement to
    # run bungeni.
    import tidy
    body_text = removeSecurityProxy(self.context.body_text)
    # utidylib options
    tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                        tidy_mark=0, char_encoding='utf8', quote_nbsp=0)
    # Strip html entities from the text, then clean up the xhtml.
    unescaped = unescape(body_text)
    tidied = tidy.parseString(unescaped.encode('utf8'), **tidy_options)
    # tidy returns a <tidy.lib._Document object>
    dom = parseString(str(tidied))
    body_nodes = dom.getElementsByTagName("body")
    text = ""
    for child in body_nodes[0].childNodes:
        text += child.toxml()
    dom.unlink()
    return text
def tidy_html(html_buffer, cleaning_lib='utidylib'):
    """ Tidy up the input HTML using one of the installed cleaning
    libraries.

    @param html_buffer: the input HTML to clean up
    @type html_buffer: string
    @param cleaning_lib: chose the preferred library to clean the HTML.
        One of:
         - utidylib
         - beautifulsoup
    @return: a cleaned version of the input HTML
    @note: requires uTidylib or BeautifulSoup to be installed. If the
        chosen library is missing, the input X{html_buffer} is returned
        I{as is}.

    Fix: the bare ``except:`` clauses are narrowed to
    ``except Exception`` so SystemExit/KeyboardInterrupt still
    propagate; cleaner failures fall back to the raw input as before.
    """
    if CFG_TIDY_INSTALLED and cleaning_lib == 'utidylib':
        options = dict(output_xhtml=1, show_body_only=1, merge_divs=0, wrap=0)
        try:
            output = str(tidy.parseString(html_buffer, **options))
        except Exception:
            output = html_buffer
    elif CFG_BEAUTIFULSOUP_INSTALLED and cleaning_lib == 'beautifulsoup':
        try:
            output = str(BeautifulSoup(html_buffer).prettify())
        except Exception:
            output = html_buffer
    else:
        output = html_buffer
    return output
def mergestore(self, inputstore, templatetext, includefuzzy):
    """converts a file to .po format"""
    merged = templatetext.replace("\n", " ")
    if isinstance(merged, str):
        # TODO: get the correct encoding
        merged = merged.decode('utf-8')
    # TODO: use the algorithm from html2po to get blocks and translate
    # them individually rather than using replace
    for unit in inputstore.units:
        if unit.isheader():
            continue
        msgid = unit.source
        if includefuzzy or not unit.isfuzzy():
            msgstr = self.wrapmessage(unit.target)
        else:
            msgstr = self.wrapmessage(unit.source)
        if msgstr.strip():
            # TODO: "msgid" is already html-encoded while "msgstr" is
            # not encoded -> thus the replace fails
            # see test_po2html.py in line 67
            merged = merged.replace(msgid, msgstr, 1)
    merged = merged.encode('utf-8')
    if self.tidy:
        merged = str(tidy.parseString(merged))
    return merged
def fix_html(htmlstr):
    """Run *htmlstr* through tidy, producing xhtml with numeric
    entities and no tidy meta-mark."""
    return str(tidy.parseString(htmlstr,
                                output_xhtml=1,
                                tidy_mark=0,
                                numeric_entities=1))
def run(self, text):
    """Pass text to Tidy and return the decoded result.

    As Tidy does not accept unicode we encode the input and decode the
    return value.  Ported from Python 2: ``unicode(doc, encoding=enc)``
    replaced by decoding the document's bytes (``getvalue()`` returns
    the tidied output as bytes -- TODO confirm against the binding in
    use).
    """
    enc = self.markdown.tidy_options.get('char_encoding', 'utf8')
    doc = tidy.parseString(text.encode(enc), **self.markdown.tidy_options)
    return doc.getvalue().decode(enc)
def test_options(self):
    """Exercise xml-decl / newline / xhtml options together.

    Modernised: the long-deprecated ``failUnless`` alias is replaced
    with ``assertTrue`` and the regex is a raw string.
    """
    options = dict(add_xml_decl=1, show_errors=1, newline='CR', output_xhtml=1)
    doc1 = tidy.parseString(self.input1, **options)
    found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1), re.MULTILINE)
    self.assertTrue(found)
    doc2 = tidy.parseString("<Html>", **options)
    self.assertTrue(str(doc2).startswith('<?xml'))
    ## self.failIf(len(doc2.errors)>1) # FIXME - tidylib doesn't
    ##                                 # support this?
    # newline='CR' -> no LF may appear in the output.
    self.assertTrue(str(doc2).find('\n') < 0)
    doc3 = tidy.parse('foo.htm', char_encoding='utf8', alt_text='foo')
    self.assertTrue(str(doc3).find('alt="foo"') >= 0)
    self.assertTrue(str(doc3).find('\xc3\xa9') >= 0)
def stripFluff(html):
    """Return a string of html content.

    Takes an auto-generated html page and strips out the fluff e.g.
    extra inline styles, extraneous spans etc. and returns a
    well-formed and plain html version.  Only captures stuff within the
    body tag.
    """
    tidy_options = dict(output_xhtml=1, indent=0, tidy_mark=0, clean=1,
                        drop_empty_paras=1, drop_font_tags=1,
                        drop_proprietary_attributes=1,
                        enclose_block_text=1, literal_attributes=1,
                        logical_emphasis=1, merge_divs=0,
                        error_file='tidyerror.log', gnu_emacs=1, bare=1)
    tidied = str(tidy.parseString(html, **tidy_options))
    # Keep only the <body> element, then drop the body tags themselves.
    body = re.findall(r'<body.*?</body>', tidied, re.DOTALL | re.I)[0]
    body = removePattern(body, r'<body.*?>')
    body = body.replace('</body>', '')
    body = cleanLi(body)
    # Strip presentational attributes.
    for fluff in (r'style=".*?"', r'target=".*?"', r'class=".*?"'):
        body = removePattern(body, fluff)
    body = body.replace('<br>', '<br />')
    return lowerTags(body)
def _tidy2(text):
    """uTidyLib's XHTML validator.

    This function is a wrapper to uTidyLib's validator.
    """
    tidied = tidy.parseString(text, output_xhtml=1, add_xml_decl=0,
                              indent=0, tidy_mark=0)
    return _in_tag(str(tidied), 'body')
def __save_response(no, outdir, rtype, url, headers=None):
    '''
    Make request and save response to file
    @param no: test number
    @param outdir
    '''
    # Ported from Python 2: ``iteritems()`` -> ``items()``; the output
    # file is a binary file, so header text is encoded before writing;
    # a context manager closes the file even on errors.
    r = requests.request(rtype, url, headers=headers)
    with open(os.path.join(outdir, '%s.%02d' % (rtype, no)), 'wb') as outfile:
        head = 'HTTP/1.1 %s %s\n' % (r.status_code, r.raw.reason)
        for k, v in r.headers.items():
            head += '%s: %s\n' % (k, v)
        # assumes header values are latin-1 encodable (the HTTP
        # default) -- TODO confirm
        outfile.write(head.encode('latin-1'))
        if r.content:
            if r.status_code == 207:
                # Pretty-print multi-status XML bodies through tidy.
                content = str(tidy.parseString(r.content, **{
                    'output_xml': 1,
                    'indent': 1,
                    'input_xml': 1
                })).encode('utf-8')
            else:
                content = r.content
            outfile.write(content)
def cleanupText(self):
    """This function generates an ODT document from the text of a report"""
    # This should really be at the top of this file.  Leaving it here
    # for the time being so that having libtidy is not a requirement to
    # run bungeni.
    import tidy
    body_text = self.report.body_text
    # utidylib options
    tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                        tidy_mark=0, char_encoding="utf8", quote_nbsp=0)
    # Strip html entities, then clean up the xhtml using tidy.
    unescaped = unescape(body_text)
    tidied = tidy.parseString(unescaped.encode("utf8"), **tidy_options)
    # tidy returns a <tidy.lib._Document object>
    dom = parseString(str(tidied))
    body_nodes = dom.getElementsByTagName("body")
    text = ""
    for child in body_nodes[0].childNodes:
        text += child.toxml()
    dom.unlink()
    return text
def convert_to_xhtml(output, xml_decl=True):
    """Tidy *output* into XHTML and comment-guard CDATA sections inside
    script tags (apostrophe comments for VBScript, // for JavaScript).
    """
    original = output
    output = str(tidy.parseString(output, **{
        "output_xhtml": 1,
        "add_xml_decl": int(xml_decl),
        "indent": 1,
        "tidy-mark": 0,
    }))
    # Drop the script-type attribute tidy adds unless the input had it.
    if "text/javascript" not in original:
        output = output.replace('type="text/javascript"', '')
    script_re = re.compile(HTML_SCRIPT_TAG_REGEX)

    def replace_tag_content(tag_content):
        comment_chars = "//"
        h = HTMLParser()
        unescaped = h.unescape(tag_content).lower()
        if "'vbs" in unescaped or '"vbs' in unescaped:
            comment_chars = "'"
        return tag_content.replace(
            "<![CDATA[", comment_chars + "<![CDATA[").replace(
            "]]>", comment_chars + "]]>")

    return script_re.sub(lambda m: replace_tag_content(m.group()), output)
def html2docbook(html):
    """Convert an HTML fragment to a DocBook string via tidy and two
    XSLT passes (heading normalisation, then xhtml->docbook)."""
    tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                        tidy_mark=0, input_encoding='utf8',
                        output_encoding='utf8', doctype='auto', wrap=0,
                        char_encoding='utf8')
    xhtml = parseString(html.encode("utf-8"), **tidy_options)
    xhtml_doc = libxml2.parseDoc(str(xhtml))
    normalized_doc = normalizedHeadingsXsl_xsldoc.applyStylesheet(xhtml_doc, None)
    # Serialised but unused downstream; kept for parity with the
    # original behaviour.
    nhstring = normalizedHeadingsXsl_xsldoc.saveResultToString(normalized_doc)
    docbook_doc = xhtml2dbXsl_xsldoc.applyStylesheet(normalized_doc, None)
    dbstring = xhtml2dbXsl_xsldoc.saveResultToString(docbook_doc)
    xhtml_doc.freeDoc()
    normalized_doc.freeDoc()
    docbook_doc.freeDoc()
    return dbstring.decode('utf-8')
def __init__(self, docid=None, *args, **kwargs):
    """Initialise an etherpad-backed document; on first sight of a pad
    id, fetch the pad's HTML export, tidy it, and cache the result."""
    self.__dict__['type'] = 'etherpad'
    match = PADRE.search(docid) if docid else None
    if match and match.group(2) and match.group(3):
        docid = ("%s/%s" % (match.group(2), match.group(3))).encode('utf8')
        kwargs['docid'] = docid
        url = "%s%s/ep/pad/export/%s/latest?format=html" % (
            match.group(1) or 'http://', match.group(2), match.group(3))
        if not Docs.find_one({"docid": docid}):
            # First time we see this pad: fetch and normalise it.
            page = urllib2.urlopen(url).read()
            soup = BeautifulSoup(page)
            self.__dict__['title'] = unescape(
                unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')
            doc = ('<html><head><title>%s</title><meta http-equiv="content-type"'
                   ' content="text/html; charset=utf-8" /></head>%s</html>') % (
                self.title, unescape(unicode(soup.body)).encode('utf8'))
            kwargs['raw'] = str(tidy.parseString(doc, **{
                'output_xhtml': 1,
                'add_xml_decl': 0,
                'indent': 0,
                'tidy_mark': 0,
                'doctype': "strict",
                'wrap': 0}))
        super(Etherpad, self).__init__(*args, **kwargs)
        if 'stems' not in self.__dict__ or not self.stems:
            # let's calculate and cache the results
            models.tfidf.add_input_document(self.termcnt.keys())
            self.save()
        return
    kwargs['docid'] = docid
    super(Etherpad, self).__init__(*args, **kwargs)
def to_xhtml(self, stylesheet_url='', settings=DEFAULT_HTML_OVERRIDES,
             tidy_settings=DEFAULT_TIDY_XHTML_OPTIONS, *args, **kwargs):
    """Render to HTML and tidy the result into XHTML.

    Returns an ``(xhtml_string, [])`` pair; any caller-supplied
    ``tidy_output`` flag is discarded because tidying happens here.
    """
    kwargs.pop('tidy_output', None)
    html_string, _ = self.to_html(stylesheet_url, tidy_output=False,
                                  *args, **kwargs)
    return str(tidy.parseString(html_string, **tidy_settings)), []
def EpozTidy(self, html, pageurl):
    """ Take html and deliver xhtml if mxTidy is installed;
        call EpozPostTidy for html-postprocessings before returning
        the result
    """
    errors = 0
    output = html
    errordata = ""
    source = html.encode("utf-8")
    source = EPOZ_SCRIPT.sub('<script ', source)
    source = source.replace('</epoz:script>', '</script>')
    if uWordUnmunger:
        source = unmungeString(source)
    if mxTidyIsAvailable:
        (errors, tidy_warnings, output, errordata) = Tidy.tidy(
            source, drop_empty_paras=1, logical_emphasis=1,
            indent_spaces=1, indent="no", output_xhtml=1, word_2000=1,
            wrap=0, alt_text='', char_encoding="utf8")
        if errors:
            # Tidy failed; fall back to the untouched input.
            output = html
    elif uTidyIsAvailable:
        parsed = tidy.parseString(
            source, drop_empty_paras=1, indent_spaces=1, indent="auto",
            output_xhtml=1, word_2000=1, wrap=79, char_encoding="utf8",
            add_xml_decl=0, doctype="omit", indent_attributes=1,
            drop_proprietary_attributes=1, bare=1, clean=1,
            enclose_text=1, tidy_mark=0)
        # Only non-warning reports count as errors.
        severe = [str(report) for report in parsed.get_errors()
                  if report.severity != 'W']
        errors = len(severe)
        errordata = '\n'.join(severe)
        if errors:
            output = html
        else:
            output = str(parsed)
    output = MSO_CLASS.sub(r"<\1>", output)
    body_match = HTML_BODY.search(output)
    if body_match:
        output = body_match.group(1)
    # Call External Method / PythonScript for postprocessing
    # The script should expect two parameters:
    #   self     = called context (=server)
    #   html     = the htmlbody to postprocess
    #   pathname = path of edited object (maybe with template!)
    # The script should return the new htmlbody
    EpozPostTidy = getattr(self, 'EpozPostTidy', None)
    if EpozPostTidy is not None:
        output = EpozPostTidy(self, output, pageurl)
    return (errors, output, errordata)
def html_tidy(instr):
    """Return *instr* tidied: body-only xhtml, indented, without an xml
    declaration."""
    return str(tidy.parseString(instr, output_xhtml=1, add_xml_decl=0,
                                indent=1, show_body_only=1))
def get_sounding_skewt(self):
    """Request the sounding page for this station/date and download the
    matching skew-T chart GIF.

    Ported from Python 2: ``urllib.urlencode`` / ``urlopen`` /
    ``urlretrieve`` moved to ``urllib.parse`` / ``urllib.request``, and
    POST data must be bytes.
    """
    import urllib.parse
    import urllib.request
    data = urllib.parse.urlencode({
        "region": "europe", "TYPE": "GIF:SKEWT", "YEAR": self.YEAR,
        "MONTH": self.MONTH, "FROM": self.FROM, "TO": self.FROM,
        "STNM": SOUNDINGSTATION})
    s = urllib.request.urlopen("http://weather.uwyo.edu/cgi-bin/sounding?",
                               data.encode('ascii'))
    o = s.read()
    s.close()
    # Parsed for parity with the original; the result is unused.
    document = tidy.parseString(o)
    urllib.request.urlretrieve(
        "http://weather.uwyo.edu/upperair/images/" + self.YEAR + self.MONTH
        + self.FROM + ".08221.skewt.gif",
        CHARTPATH + "skewt.gif")
def test_options(self):
    """Exercise xml-decl / newline / xhtml options together.

    Modernised: the long-deprecated ``failUnless`` alias is replaced
    with ``assertTrue`` and the regex is a raw string.
    """
    options = dict(add_xml_decl=1, show_errors=1, newline='CR', output_xhtml=1)
    doc1 = tidy.parseString(self.input1, **options)
    found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1), re.MULTILINE)
    self.assertTrue(found)
    doc2 = tidy.parseString("<Html>", **options)
    self.assertTrue(str(doc2).startswith('<?xml'))
    ## self.failIf(len(doc2.errors)>1) # FIXME - tidylib doesn't
    ##                                 # support this?
    # newline='CR' -> no LF may appear in the output.
    self.assertTrue(str(doc2).find('\n') < 0)
    doc3 = tidy.parse('foo.htm', char_encoding='utf8', alt_text='foo')
    self.assertTrue(str(doc3).find('alt="foo"') >= 0)
    self.assertTrue(str(doc3).find('\xc3\xa9') >= 0)
def get_errors(self, content, url=''):
    """Tidy *content* and return an error report, or None when tidy
    found nothing to complain about."""
    if isinstance(content, str):
        content = content.encode('utf8')
    import tidy
    doc = tidy.parseString(content)
    if doc.errors:
        return self.create_error_report(doc, content, url)
    return
def parse_html(html_str):
    """Tidy *html_str* to XHTML and parse it into an ElementTree.

    Fix: ``tidy.parseString`` returns a document object, not a string;
    it must be converted with ``str()`` before being wrapped in
    ``StringIO`` for ``ET.parse``.
    """
    xhtml_str = str(tidy.parseString(html_str, output_xhtml=1, indent=1,
                                     numeric_entities=1, add_xml_decl=1,
                                     tidy_mark=1))
    return ET.parse(StringIO(xhtml_str))
def clean(txt):
    """Tidy *txt* into strict XHTML, re-parse it, and return the
    serialised result."""
    tidy_options = {'output_xhtml': 1,
                    'add_xml_decl': 0,
                    'indent': 0,
                    'anchor-as-name': 0,
                    'tidy_mark': 0,
                    'doctype': "strict",
                    'wrap': 0}
    tidied = str(tidy.parseString(txt, **tidy_options))
    return tostring(xmlparse(StringIO(tidied)))
def process_response(self, request, response):
    """Tidy text/html responses in place; pass other content types
    through untouched."""
    if 'text/html' not in response['Content-Type'].lower():
        return response
    tidied = str(tidy.parseString(response.content, **options))
    response.content = tidied
    return response
def prettyXHTML(uglyXHTML):
    """Re-indent an XHTML fragment with tidy (no xml declaration, no
    doctype, no tidy meta-mark)."""
    return str(tidy.parseString(uglyXHTML,
                                input_xml=True,
                                output_xhtml=True,
                                add_xml_decl=False,
                                doctype='omit',
                                indent='auto',
                                tidy_mark=False))
def tidy2xhtml(instream, outstream):
    """Read HTML from *instream*, tidy it to indented XHTML with an xml
    declaration, and write the result to *outstream*."""
    tidied = tidy.parseString(instream.read(),
                              output_xhtml=1,
                              add_xml_decl=1,
                              indent=1)
    tidied.write(outstream)
def test_options(self):
    """Check xml declaration, error reporting, CR newlines and alt_text."""
    common = {"add_xml_decl": 1, "show_errors": 1,
              "newline": "CR", "output_xhtml": 1}
    doc1 = tidy.parseString(self.input1, **common)
    self.assertIn("CDATA", str(doc1))
    doc2 = tidy.parseString("<Html>", **common)
    rendered = str(doc2)
    self.assertTrue(rendered.startswith("<?xml"))
    # Sloppy input must produce at least one error report.
    self.assertFalse(len(doc2.errors) == 0)
    # newline="CR" -> no LF may appear in the output.
    self.assertNotIn("\n", rendered)
    doc3 = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
    text = doc3.gettext()
    self.assertIn('alt="foo"', text)
    self.assertIn("é", text)
def xtruncate(s, length=255, end='...'):
    """Truncate *s* to *length* characters and tidy the result into a
    body-only XHTML fragment.

    Fix: the *end* suffix was appended unconditionally, even when the
    input was short enough to survive intact; it is now added only when
    something was actually cut off.
    """
    import tidy
    truncated = str(s[:length])
    if len(s) > length:
        truncated += end
    options = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                   show_body_only=1, tidy_mark=0)
    return str(tidy.parseString(truncated, **options))
def load_finished_cb(view, frame):
    """WebKit load-finished callback: serialise the DOM, tidy it, print
    the result, and quit the GTK main loop.

    Ported from Python 2: the ``print`` statement is now a function
    call.
    """
    ctx = jscore.JSContext(view.get_main_frame().get_global_context())
    jsw = ctx.globalObject.window
    doc = jsw.document
    serializer = ctx.evaluateScript('new XMLSerializer()')
    html_str = serializer.serializeToString(doc)
    xml_str = tidy.parseString(html_str, **options)
    print(xml_str)
    gtk.main_quit()
def check_validates(url):
    """Raise if the stored HTML for *url* has tidy validation errors.

    Ported from Python 2: ``unicode`` / ``u''`` replaced with ``str``
    and plain string literals.
    NOTE(review): ``filter_errors`` must return a list (``insert`` is
    called on its result) -- confirm under Python 3.
    """
    if not should_validate(url):
        return
    errors = tidy.parseString(html_store[url], **TIDY_OPTIONS).errors
    errors = filter_errors(errors)
    if errors:
        errors.insert(0, 'Found following validation errors:')
        raise Exception('\n'.join(str(e) for e in errors))
def normalize(raw, debug=None):
    """Tidy *raw* HTML into strict, unwrapped XHTML and return it as a
    string.

    Ported from Python 2: ``print >> sys.stderr`` replaced with
    ``print(..., file=sys.stderr)``.
    """
    if debug:
        print('cleaning response', file=sys.stderr)
    options = dict(output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0,
                   doctype="strict", wrap=0)
    return str(tidy.parseString(raw, **options))
def _xml_format(self):
    """Pretty-print every .xml file under ``self.dirname`` in place
    using tidy with ``self._tidy_options``.

    Fix: file handles are now closed via context managers even when
    reading or parsing raises, instead of relying on explicit
    ``close()`` calls that are skipped on error.
    """
    for dirname, _, files in os.walk(self.dirname):
        for filename in files:
            full_name = os.path.join(dirname, filename)
            if not full_name.endswith('.xml'):
                continue
            with open(full_name, 'r') as infile:
                obj = tidy.parseString(infile.read(), **self._tidy_options)
            with open(full_name, 'w') as outfile:
                obj.write(outfile)
def run(self, product):
    """Tidy the value (or list of values) at ``config["from"]`` and
    store the result(s) at ``config["to"]``.

    Ported from Python 2: ``except Exception, e`` and the ``print``
    statement replaced with their Python 3 forms; the duplicated tidy
    invocation is factored into a local helper.
    """
    import tidy
    frm = product[self.config["from"]]
    try:
        input_enc = self.config["input_encoding"]
    except Exception:
        input_enc = "utf8"
    try:
        output_enc = self.config["output_encoding"]
    except Exception:
        output_enc = "utf8"

    def _tidy_one(markup):
        # Shared tidy invocation for single items and list items.
        return str(tidy.parseString(markup,
                                    input_encoding=input_enc,
                                    output_encoding=output_enc,
                                    preserve_entities="yes",
                                    accessibility_check=0,
                                    new_empty_tags="",
                                    output_html="yes",
                                    show_errors=6,
                                    force_output="yes"))

    try:
        if isinstance(frm, list):
            product[self.config["to"]] = [_tidy_one(item) for item in frm]
        else:
            product[self.config["to"]] = _tidy_one(frm)
    except Exception as e:
        print("got an exception when tidy...", e)
def _tidy_page(path):
    """Read a page, run it through tidy, and create a temporary output file.

    returns a temporary file object containing the results

    Ported from Python 2: ``print`` statements, ``unicode()`` and
    ``StringIO.StringIO`` replaced with their Python 3 forms.
    """
    import io
    if not os.path.exists(path):
        raise IOError('file %s not found!' % path)
    # set up some tidy options
    tidy_options = {
        'char-encoding': 'utf8',
        'enclose-text': 'yes',     # wrap loose text nodes in <p>
        'show-body-only': 'auto',  # do not add <html> and <body> unless present in input
        'indent': 'no',            # don't prettily indent output to make parsing easier
        'tidy-mark': 'no',         # no creator meta-tag
        'force-output': 'yes',     # some output is better than none, I hope
    }
    # unicode files make utidylib cry :( so we need to be creative
    # http://developer.berlios.de/bugs/?func=detailbug&bug_id=14186&group_id=1810
    # http://muffinresearch.co.uk/archives/2008/07/29/working-around-utidylibs-unicode-handling/
    with open(path, 'rb') as f:
        content = f.read().decode('utf-8').encode('utf8')
    try:
        parsed = tidy.parseString(content, **tidy_options)
    except tidy.error.OptionArgError:
        # show-body-only is new-ish, so emulate it
        del tidy_options['show-body-only']
        try:
            parsed = tidy.parseString(content, **tidy_options)
        except Exception as e:
            print(e)
        bodytag = re.compile("<body>(.*)</body>", re.IGNORECASE | re.DOTALL)
        # NOTE(review): content is bytes here; the str pattern is
        # applied to its decoded form to keep the Python 2 behaviour --
        # confirm.
        if not bodytag.search(content.decode('utf-8')):
            if path.find('validation') != -1:
                print(parsed)
        parsed = bodytag.search(str(parsed)).group(1)
    result = io.StringIO(str(parsed))
    result.name = os.path.basename(path)
    return result
def tidy_html(body):
    """Tidy *body* (when the tidy module is available) and return it as
    an XHTML stream string."""
    if tidy:
        tidied = tidy.parseString(to_utf8(body), indent=1,
                                  char_encoding='utf8', output_xhtml=1,
                                  word_2000=1)
        body = str(tidied)
    return stream_to_str_as_xhtml(HTMLParser(body))