Example #1
0
 def test_bad_option_values(self):
     """Check that malformed option values are rejected.

     Each keyword set is passed to ``tidy.parseString`` together with
     ``self.input2`` and is expected to raise ``tidy.OptionArgError`` with a
     message matching "missing or malformed argument".
     """
     invalid_kwargs = ({"indent": "---"}, {"indent_spaces": None})
     for kwargs in invalid_kwargs:
         with self.assertRaisesRegexp(
             tidy.OptionArgError, "missing or malformed argument"
         ):
             tidy.parseString(self.input2, **kwargs)
Example #2
0
 def test_bad_options(self):
     """Check that unknown option names are rejected.

     Passing the keyword ``foo`` to ``tidy.parseString`` is expected to raise
     ``tidy.InvalidOptionError`` with a message matching
     "not a valid Tidy option".
     """
     unknown_kwargs = ({"foo": 1},)
     for kwargs in unknown_kwargs:
         with self.assertRaisesRegexp(
             tidy.InvalidOptionError, "not a valid Tidy option"
         ):
             tidy.parseString(self.input2, **kwargs)
Example #3
0
 def test_encodings(self):
     """Check that ``output_encoding`` controls the bytes tidy emits.

     ``foo.htm`` is decoded as UTF-8 and re-encoded as pure ASCII using
     character references.  The tidied output must then contain ``'\\xe9'``
     when latin1 is requested and ``'\\xc3\\xa9'`` when utf8 is requested
     (the encodings of U+00E9 in those charsets).
     """
     # Use a context manager so the file handle is closed deterministically
     # instead of being left to the garbage collector.
     with open('foo.htm') as source:
         raw = source.read()
     foo = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')
     doc1u = tidy.parseString(foo, input_encoding='ascii',
                              output_encoding='latin1')
     self.assertTrue('\xe9' in str(doc1u))
     doc2u = tidy.parseString(foo, input_encoding='ascii',
                              output_encoding='utf8')
     self.assertTrue('\xc3\xa9' in str(doc2u))
Example #4
0
 def test_badOptions(self):
     """Every invalid option set must make tidy raise ``TidyLibError``.

     The test fails explicitly, naming the offending option set, when
     ``tidy.parseString`` accepts one of them without raising.
     """
     invalid_sets = ({'foo': 1}, {'indent': '---'}, {'indent_spaces': None})
     for kwargs in invalid_sets:
         try:
             tidy.parseString(self.input2, **kwargs)
         except tidy.TidyLibError:
             # Expected outcome: move on to the next option set.
             continue
         self.fail("Invalid option %s should have raised an error" %
                   repr(kwargs))
Example #5
0
 def test_badOptions(self):
     """Verify that bad options are reported through ``tidy.TidyLibError``.

     Each option set is tried in turn; if ``tidy.parseString`` returns
     normally for any of them the test fails with a message showing that set.
     """
     for rejected in ({'foo': 1}, {'indent': '---'}, {'indent_spaces': None}):
         raised = False
         try:
             tidy.parseString(self.input2, **rejected)
         except tidy.TidyLibError:
             raised = True
         if not raised:
             self.fail("Invalid option %s should have raised an error" %
                       repr(rejected))
Example #6
0
 def test_encodings(self):
     """Check that ``output_encoding`` controls the bytes tidy emits.

     The test file is decoded as UTF-8 and re-encoded as pure ASCII using
     character references.  The tidied output must contain ``b"\\xe9"`` for
     latin1 and ``b"\\xc3\\xa9"`` for utf8 (the encodings of U+00E9).
     """
     # Context manager guarantees the handle is closed; the original left
     # the file open until garbage collection (ResourceWarning on Python 3).
     with open(self.test_file, "rb") as handle:
         raw = handle.read()
     text = raw.decode("utf8").encode("ascii", "xmlcharrefreplace")
     doc1u = tidy.parseString(text, input_encoding="ascii", output_encoding="latin1")
     self.assertIn(b"\xe9", doc1u.getvalue())
     doc2u = tidy.parseString(text, input_encoding="ascii", output_encoding="utf8")
     self.assertIn(b"\xc3\xa9", doc2u.getvalue())
Example #7
0
 def test_encodings(self):
     """Check that ``output_encoding`` controls the bytes tidy emits.

     The test file is decoded as UTF-8 and re-encoded as pure ASCII using
     character references.  The tidied output must contain ``b"\\xe9"`` for
     latin1 and ``b"\\xc3\\xa9"`` for utf8 (the encodings of U+00E9).
     """
     # Close the file deterministically rather than leaking the handle.
     with open(self.test_file, "rb") as handle:
         raw = handle.read()
     text = raw.decode("utf8").encode("ascii", "xmlcharrefreplace")
     doc1u = tidy.parseString(text,
                              input_encoding="ascii",
                              output_encoding="latin1")
     self.assertIn(b"\xe9", doc1u.getvalue())
     doc2u = tidy.parseString(text,
                              input_encoding="ascii",
                              output_encoding="utf8")
     self.assertIn(b"\xc3\xa9", doc2u.getvalue())
Example #8
0
 def test_encodings(self):
     """Check that ``output_encoding`` controls the bytes tidy emits.

     ``foo.htm`` is decoded as UTF-8 and re-encoded as pure ASCII using
     character references.  The tidied output must contain ``'\\xe9'`` for
     latin1 and ``'\\xc3\\xa9'`` for utf8 (the encodings of U+00E9).
     """
     # Close the file deterministically rather than leaking the handle.
     with open('foo.htm') as source:
         raw = source.read()
     foo = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')
     doc1u = tidy.parseString(foo,
                              input_encoding='ascii',
                              output_encoding='latin1')
     self.assertTrue('\xe9' in str(doc1u))
     doc2u = tidy.parseString(foo,
                              input_encoding='ascii',
                              output_encoding='utf8')
     self.assertTrue('\xc3\xa9' in str(doc2u))
Example #9
0
 def test_encodings(self):
     """Check that ``output_encoding`` controls the bytes tidy emits.

     The test file is decoded as UTF-8 and re-encoded as pure ASCII using
     character references, then tidied once to latin1 and once to utf8.
     """
     # Close the file deterministically rather than leaking the handle.
     with open(self.test_file, 'rb') as handle:
         raw = handle.read()
     text = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')
     doc1u = tidy.parseString(text,
                              input_encoding='ascii',
                              output_encoding='latin1')
     # NOTE(review): searching ``str(...)`` for a bytes literal only works
     # where ``str`` is a byte string (Python 2); on Python 3 this raises
     # TypeError. Left unchanged - confirm the intended interpreter.
     self.assertTrue(str(doc1u).find(b'\xe9') >= 0)
     doc2u = tidy.parseString(text,
                              input_encoding='ascii',
                              output_encoding='utf8')
     self.assertTrue(str(doc2u).find(b'\xc3\xa9') >= 0)
Example #10
0
 def test_options(self):
     """Exercise a handful of tidy options on strings and on a file.

     With XML declaration, error reporting, CR newlines and XHTML output
     enabled: ``self.input1`` must produce a CDATA section, and the minimal
     document must start with an XML declaration, report at least one error
     and contain no line feed.  Parsing ``self.test_file`` with
     ``alt_text="foo"`` must yield that alt attribute and keep the "é".
     """
     shared = dict(add_xml_decl=1, show_errors=1, newline="CR", output_xhtml=1)

     script_doc = tidy.parseString(self.input1, **shared)
     self.assertIn("CDATA", str(script_doc))

     minimal_doc = tidy.parseString("<Html>", **shared)
     self.assertTrue(str(minimal_doc).startswith("<?xml"))
     self.assertFalse(len(minimal_doc.errors) == 0)
     self.assertNotIn("\n", str(minimal_doc))

     file_doc = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
     self.assertIn('alt="foo"', file_doc.gettext())
     self.assertIn("é", file_doc.gettext())
Example #11
0
def get_page_title(content):
  """Return the text of the <title> element of an HTML page.

  The page is converted to XHTML with tidy, entities are rewritten with
  ``ENTITY``/``ENTITY_REP``, and the result is parsed with ``etree`` to read
  ``head/title``.  If any step raises, the error is printed and the title is
  looked up with the ``R_TITLE`` regex instead.

  Returns the title text from the XML tree, or group 1 of the regex match,
  or ``""`` when neither approach finds a title.
  """
  try:
    content = str(tidy.parseString(content, output_xhtml=True, add_xml_decl=True, indent=False, tidy_mark=False))
    content = ENTITY.sub(ENTITY_REP, content)

    root = etree.fromstring(content)

    head = root.find("{http://www.w3.org/1999/xhtml}head")
    title = head.find("{http://www.w3.org/1999/xhtml}title")
    titletext = title.text

    # NOTE(review): fixed half-second pause on the success path only;
    # presumably throttling for the caller - confirm it is still wanted.
    time.sleep(0.5)

    return titletext

  # ``as`` form and a single pre-formatted print argument are valid on both
  # Python 2.6+ and Python 3 and print the same text as before.
  except Exception as e:
    print("\tHTML Parser Error: %s" % str(e))

    # ``content`` is the tidied text if tidy succeeded, else the raw input.
    m = R_TITLE.search(content)
    if m is not None:
      return m.group(1)

    return ""
Example #12
0
def issue(answers_xml):
    """Apply the XSLT stylesheet at ``XSLT_SOURCE`` to an answers document.

    ``answers_xml`` is handed to ``validateAnswers``; its return value is
    used as the input document of the transform.

    Returns the transform result serialized with ``transform.tostring``.
    """
    # validateAnswers both validates and produces the document to transform
    ctxt = validateAnswers(answers_xml)

    # apply the xslt transform
    transform = lxml.etree.XSLT(lxml.etree.parse(XSLT_SOURCE))
    result = transform.apply(ctxt)

    # NOTE(review): the original had a tidy pass after this return that could
    # never execute. The dead code is removed and behaviour is unchanged;
    # re-introduce the tidy step deliberately if it is actually wanted.
    return transform.tostring(result)
Example #13
0
def convert(text):
    """Tidy an HTML fragment and run it through ``Converter``.

    Unicode input is encoded to UTF-8 before tidy sees it and the tidied
    text is decoded again before conversion; byte strings are passed through
    unchanged.  Matches of ``re_div`` are removed first.

    Returns whatever ``Converter.output()`` produces.
    """
    was_unicode = isinstance(text, unicode)
    if was_unicode:
        text = text.encode('utf-8')
    text = re_div.sub('', text)

    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 'auto',
        'tidy_mark': 0,
        'wrap': 0,
        'drop_empty_paras': 1,
        'logical_emphasis': 1,
        'lower_literals': 1,
        'show_body_only': 1,
        'char_encoding': 'utf8',
    }
    document = tidy.parseString(text, **tidy_options)

    # The tidy document is serialized by writing it to a file-like object.
    sink = StringIO.StringIO()
    document.write(sink)
    cleaned = sink.getvalue()
    if was_unicode:
        cleaned = cleaned.decode('utf-8')

    converter = Converter()
    converter.parse_string(cleaned)
    return converter.output()
Example #14
0
def load_doc_file(filename, f):
	"""Tidy one file and insert it as a row of the ``docs`` table.

	The content of ``f`` is read as latin1, the title is taken from group 1
	of ``re_titlematch`` (empty when there is no match), and the body is
	cleaned by tidy before being stored together with ``filename``, the
	module-level ``ver`` and the title.  Increments the global ``pagecount``.
	"""
	global pagecount

	raw = unicode(f.read(), 'latin1')
	found = re_titlematch.search(raw)
	title = found.group(1) if found else ""
	if not quiet:
		print("--- file: %s (%s) ---" % (filename, title))

	cleanup = {
		'drop_proprietary_attributes': 1,
		'alt_text': '',
		'hide_comments': 1,
		'output_xhtml': 1,
		'show_body_only': 1,
		'clean': 1,
		'char_encoding': 'utf8',
		'indent': 'auto',
	}
	# tidy is given UTF-8 bytes, matching the char_encoding option above.
	tidied = tidy.parseString(raw.encode('utf-8'), **cleanup)
	row = {
		'f': filename,
		'v': ver,
		't': title,
		'c': str(tidied),
	}
	curs.execute("INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)", row)
	pagecount += 1
Example #15
0
File: fetch.py Project: dnet/f33dme
def clean(txt):
    """Return ``txt`` tidied to strict XHTML as a unicode string.

    The tidy output is decoded as UTF-8.
    """
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'doctype': "strict",
        'wrap': 0,
    }
    tidied = str(tidy.parseString(txt, **tidy_options))
    return unicode(tidied, 'utf8')
Example #16
0
def traverseDom(domString):
    """Reduce an HTML document to its bare tag skeleton.

    The markup is tidied, passed through ``clean_html``, re-serialized as XML
    and parsed with minidom.  The tree is then walked breadth-first: text
    node values and all attribute values are blanked.

    Returns ``(tagCount, s)`` where ``tagCount`` is the number of visited
    nodes that are not ``#text`` nodes and ``s`` is the serialized skeleton.
    """
    cleaned = clean_html(str(tidy.parseString(domString)))
    tree = etree.HTML(cleaned.replace('\r', ''))
    serialized = '\n'.join([etree.tostring(child, pretty_print=True, method="xml")
                            for child in tree])
    dom = xml.dom.minidom.parseString(serialized)

    pending = Queue()
    for top_level in dom.childNodes:
        pending.put(top_level)

    tagCount = 0
    while not pending.empty():
        cur = pending.get()
        if cur.nodeName == '#text':
            cur.nodeValue = ""
        else:
            tagCount += 1
        if cur.attributes:
            for key in cur.attributes.keys():
                cur.attributes[key].value = ""
        for child in cur.childNodes:
            pending.put(child)

    out = StringIO()
    dom.writexml(out)
    return (tagCount, out.getvalue())
Example #17
0
def tidyhtml(html):
    """Return ``html`` tidied to XHTML; unicode input is sent as UTF-8."""
    source = html.encode("utf-8") if isinstance(html, unicode) else html
    document = tidy.parseString(source, output_xhtml=1, tidy_mark=0, input_encoding="utf8", output_encoding="utf8")
    return str(document)
def parse_html(doc, url, config):
    """
  Rewrite the links of an HTML document for local mirroring.

  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links we want to spider in the HTML.
  """
    # Random placeholders that stand in for the comment delimiters.
    open_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
    close_marker = '<ENDCOMMENT-' + str(random.random()) + '>'

    if TIDY:
        doc = str(tidy.parseString(doc, **dict(output_xhtml=1, wrap=0)))

    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', open_marker).replace('-->', close_marker)

    links = htmldata.urlextract(doc, url, 'text/html')
    new_urls = []
    for link in links:
        target = link.url
        if not should_follow(url, target, config):
            link.url = rewrite_external_url(link.url, config)
            continue
        # Followed links are stored locally, so point at the relative copy.
        new_urls.append(target)
        link.url = url_to_relative(target, url, config)

    newdoc = htmldata.urljoin(doc, links)
    # Restore the comment delimiters hidden above.
    newdoc = newdoc.replace(open_marker, '<!--')
    newdoc = newdoc.replace(close_marker, '-->')
    newdoc = newdoc.replace('<br>', '<br/>')
    return (post_html_transform(newdoc, url, config), new_urls)
Example #19
0
    def render_partial(T,
                       template,
                       fragments=None,
                       vars=None,
                       loader=None,
                       **kw):
        """Evaluate ``template`` and return the flattened output.

        On any failure during evaluation, the debug rendering from
        ``T.debug_out`` is returned when ``T.debug`` is set; otherwise the
        exception propagates.  When ``T.tidy`` is set and ``tidylib`` is
        available the output is passed through tidy and returned as unicode.
        """
        try:
            output = flatten(T._evaluate(template, fragments, vars, loader, **kw))
        except:
            # Bare except kept on purpose: every failure is either shown in
            # debug mode or re-raised unchanged.
            if not T.debug:
                raise
            return T.debug_out(sys.exc_info()[:-1], template)

        if not (T.tidy and tidylib):
            return output

        tidy_options = {
            'input_xml': True,
            'output_xhtml': True,
            'add_xml_decl': False,
            'doctype': 'omit',
            'indent': 'auto',
            'tidy_mark': False,
            'input_encoding': 'utf8',
        }
        tidied = tidylib.parseString(output.encode('utf-8'), **tidy_options)
        return unicode(tidied)
Example #20
0
    def wiki_to_html(self, wikitext, req):
        """Render wiki text to tidied XHTML.

        Macros listed in ``self.remove_macros`` are stripped, the text is
        formatted in the context of ``self.page_name``, and the resulting
        HTML is cleaned up with tidy.

        Returns the XHTML as a UTF-8 encoded string.
        """
        self.env.log.debug('start function wiki_to_html') # pylint: disable-msg=E1101

        # Remove some macros (TOC is better handled in ODT itself)
        # Raw string: the pattern value is unchanged, but the backslashes are
        # no longer invalid string escapes.
        # NOTE(review): the macro name is interpolated unescaped, so regex
        # metacharacters in it would be interpreted - confirm this is intended.
        for macro in self.remove_macros:
            wikitext = re.sub(r'\[\[%s(\([^)]*\))?\]\]' % macro, "", wikitext)

        # Now convert wiki to HTML
        out = StringIO()
        context = Context.from_request(req, absurls=True)
        Formatter(self.env, # pylint: disable-msg=E1101
                  context('wiki', self.page_name)).format(wikitext, out)
        html = Markup(out.getvalue())
        html = html.encode("utf-8", 'replace')

        # Clean up the HTML
        html = re.sub('<span class="icon">.</span>', '', html) # Remove external link icon
        tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                            tidy_mark=0, input_encoding='utf8',
                            output_encoding='utf8', doctype='auto',
                            wrap=0, char_encoding='utf8')
        html = tidy.parseString(html, **tidy_options)
        # Replace nbsp with its numeric entity
        html = str(html).replace("&nbsp;", "&#160;")
        # Tidy creates newlines after <pre> (by indenting)
        html = re.sub('<pre([^>]*)>\n', r'<pre\1>', html)
        return html
Example #21
0
 def cleanupText(self):
     '''Return the tidied XHTML markup found inside the report's <body>.

     The body text is unescaped, cleaned with tidy, parsed as XML, and the
     serialized children of the first <body> element are concatenated.
     '''
     # Imported here rather than at module level so that libtidy is not a
     # requirement to run bungeni.
     import tidy
     body_text = removeSecurityProxy(self.context.body_text)
     # utidylib options
     tidy_options = {
         'output_xhtml': 1,
         'add_xml_decl': 1,
         'indent': 1,
         'tidy_mark': 0,
         'char_encoding': 'utf8',
         'quote_nbsp': 0,
     }
     # remove html entities from the text
     unescaped = unescape(body_text)
     # tidy returns a document object; str() yields the markup
     tidied = tidy.parseString(unescaped.encode('utf8'), **tidy_options)
     dom = parseString(str(tidied))
     body = dom.getElementsByTagName("body")[0]
     text = "".join([child.toxml() for child in body.childNodes])
     dom.unlink()
     return text
Example #22
0
def tidy_html(html_buffer, cleaning_lib='utidylib'):
    """
    Tidy up the input HTML using one of the installed cleaning
    libraries.

    @param html_buffer: the input HTML to clean up
    @type html_buffer: string
    @param cleaning_lib: chose the preferred library to clean the HTML. One of:
                         - utidylib
                         - beautifulsoup
    @return: a cleaned version of the input HTML
    @note: requires uTidylib or BeautifulSoup to be installed. If the chosen library is missing, the input X{html_buffer} is returned I{as is}.
    """

    # Cleaning is best-effort: any library error falls back to the input.
    # ``except Exception`` (rather than a bare ``except``) keeps that fallback
    # while letting KeyboardInterrupt and SystemExit propagate.
    if CFG_TIDY_INSTALLED and cleaning_lib == 'utidylib':
        options = dict(output_xhtml=1,
                       show_body_only=1,
                       merge_divs=0,
                       wrap=0)
        try:
            output = str(tidy.parseString(html_buffer, **options))
        except Exception:
            output = html_buffer
    elif CFG_BEAUTIFULSOUP_INSTALLED and cleaning_lib == 'beautifulsoup':
        try:
            output = str(BeautifulSoup(html_buffer).prettify())
        except Exception:
            output = html_buffer
    else:
        output = html_buffer

    return output
Example #23
0
 def mergestore(self, inputstore, templatetext, includefuzzy):
     """Substitute the translations of ``inputstore`` into the template.

     Newlines in the template are replaced by spaces.  For each non-header
     unit the first occurrence of its source text is replaced by the wrapped
     target (or the wrapped source for fuzzy units when ``includefuzzy`` is
     false).  Returns the result UTF-8 encoded, tidied when ``self.tidy``.
     """
     merged = templatetext.replace("\n", " ")
     if isinstance(merged, str):
         #TODO: get the correct encoding
         merged = merged.decode('utf-8')
     # TODO: use the algorithm from html2po to get blocks and translate them individually
     # rather than using replace
     for unit in inputstore.units:
         if unit.isheader():
             continue
         msgid = unit.source
         use_target = includefuzzy or not unit.isfuzzy()
         msgstr = self.wrapmessage(unit.target if use_target else unit.source)
         if not msgstr.strip():
             continue
         # TODO: "msgid" is already html-encoded ("&" -> "&amp;"), while
         #   "msgstr" is not encoded -> thus the replace fails
         #   see test_po2html.py in line 67
         merged = merged.replace(msgid, msgstr, 1)
     merged = merged.encode('utf-8')
     if self.tidy:
         merged = str(tidy.parseString(merged))
     return merged
def fix_html(htmlstr):
    """Return ``htmlstr`` tidied to XHTML with numeric entities."""
    tidy_options = {
        'output_xhtml': 1,
        'tidy_mark': 0,
        'numeric_entities': 1,
    }
    document = tidy.parseString(htmlstr, **tidy_options)
    return str(document)
Example #25
0
 def run(self, text):
     """Return ``text`` cleaned by Tidy using ``self.markdown.tidy_options``.

     Tidy does not accept unicode, so the text is encoded with the configured
     ``char_encoding`` (default ``utf8``) and the result decoded with it.
     """
     tidy_options = self.markdown.tidy_options
     enc = tidy_options.get('char_encoding', 'utf8')
     encoded = text.encode(enc)
     tidied = tidy.parseString(encoded, **tidy_options)
     return unicode(tidied, encoding=enc)
Example #26
0
    def test_options(self):
        """Exercise a handful of tidy options on strings and on a file.

        With XML declaration, error reporting, CR newlines and XHTML output
        enabled: ``self.input1`` must yield a commented CDATA section around
        ``1>2``, and the minimal document must start with an XML declaration
        and contain no line feed.  Parsing ``foo.htm`` with ``alt_text='foo'``
        must yield that alt attribute and the UTF-8 bytes ``'\\xc3\\xa9'``.
        """
        options = dict(add_xml_decl=1, show_errors=1, newline='CR',
                       output_xhtml=1)
        doc1 = tidy.parseString(self.input1, **options)
        # Raw string: same pattern value, without the invalid ``\W`` string
        # escape that newer Python versions warn about.
        found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1),
                          re.MULTILINE)
        self.assertTrue(found)
        doc2 = tidy.parseString("<Html>", **options)
        self.assertTrue(str(doc2).startswith('<?xml'))
        # FIXME - tidylib doesn't support this?
        #   self.failIf(len(doc2.errors)>1)
        self.assertTrue('\n' not in str(doc2))
        doc3 = tidy.parse('foo.htm', char_encoding='utf8',
                          alt_text='foo')
        self.assertTrue('alt="foo"' in str(doc3))
        self.assertTrue('\xc3\xa9' in str(doc3))
Example #27
0
 def run(self, text):
     """Clean ``text`` with Tidy and return it as unicode.

     Tidy works on encoded bytes, so the text is encoded before the call and
     the output decoded afterwards, both with the ``char_encoding`` entry of
     ``self.markdown.tidy_options`` (``utf8`` when absent).
     """
     enc = self.markdown.tidy_options.get('char_encoding', 'utf8')
     document = tidy.parseString(text.encode(enc),
                                 **self.markdown.tidy_options)
     return unicode(document, encoding=enc)
Example #28
0
def stripFluff(html):
    """Return the plain inner markup of the page's <body>.

    The page is tidied to XHTML, the first <body>...</body> span is
    extracted and its own tags removed, list items are cleaned with
    ``cleanLi``, ``style``/``target``/``class`` attributes are stripped,
    ``<br>`` becomes ``<br />`` and the tags are passed to ``lowerTags``.
    """
    tidy_options = {
        'output_xhtml': 1, 'indent': 0, 'tidy_mark': 0,
        'clean': 1, 'drop_empty_paras': 1, 'drop_font_tags': 1,
        'drop_proprietary_attributes': 1, 'enclose_block_text': 1,
        'literal_attributes': 1, 'logical_emphasis': 1, 'merge_divs': 0,
        'error_file': 'tidyerror.log', 'gnu_emacs': 1, 'bare': 1,
    }
    tidied = str(tidy.parseString(html, **tidy_options))

    # Only the first body element is considered.
    body = re.findall(r'<body.*?</body>', tidied, re.DOTALL|re.I)[0]
    body = removePattern(body, r'<body.*?>')
    body = body.replace('</body>', '')
    body = cleanLi(body)
    for attribute_pattern in (r'style=".*?"', r'target=".*?"', r'class=".*?"'):
        body = removePattern(body, attribute_pattern)
    body = body.replace('<br>', '<br />')
    return lowerTags(body)
Example #29
0
        def _tidy2(text):
            """Clean ``text`` with uTidyLib and return its body content.

            The text is converted to XHTML and the result of
            ``_in_tag(..., 'body')`` on the tidied markup is returned.
            """
            document = tidy.parseString(
                text, output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0)
            markup = str(document)
            return _in_tag(markup, 'body')
Example #30
0
def __save_response(no, outdir, rtype, url, headers=None):
    '''
    Make request and save response to file

    The file is named ``<rtype>.<no as two digits>`` inside ``outdir`` and
    contains a status line, the response headers and the body.  Bodies of
    207 responses are re-indented as XML with tidy before being written.

    @param no: test number
    @param outdir: directory the response file is written to
    @param rtype: request method passed to ``requests.request``
    @param url: URL to request
    @param headers: optional request headers
    '''
    r = requests.request(rtype, url, headers=headers)
    # ``with`` closes the file even if formatting or tidy raises; the
    # original leaked the handle on any exception.
    with open(os.path.join(outdir, '%s.%02d' % (rtype, no)), 'wb') as outfile:
        head = ['HTTP/1.1 %s %s\n' % (r.status_code, r.raw.reason)]
        for k, v in r.headers.iteritems():
            head.append('%s: %s\n' % (k, v))
        outfile.write(''.join(head))
        if r.content:
            if r.status_code == 207:
                content = str(
                    tidy.parseString(
                        r.content, **{
                            'output_xml': 1,
                            'indent': 1,
                            'input_xml': 1
                        }))
            else:
                content = r.content
            outfile.write(content)
Example #31
0
 def cleanupText(self):
     """Return the tidied XHTML markup found inside the report's <body>.

     ``self.report.body_text`` is unescaped, cleaned with tidy, parsed as
     XML, and the serialized children of the first <body> element are
     concatenated.
     """
     # Imported here rather than at module level so that libtidy is not a
     # requirement to run bungeni.
     import tidy
     body_text = self.report.body_text
     # utidylib options
     tidy_options = {
         "output_xhtml": 1,
         "add_xml_decl": 1,
         "indent": 1,
         "tidy_mark": 0,
         "char_encoding": "utf8",
         "quote_nbsp": 0,
     }
     # remove html entities from the text
     unescaped = unescape(body_text)
     # tidy returns a document object; str() yields the markup
     tidied = tidy.parseString(unescaped.encode("utf8"), **tidy_options)
     dom = parseString(str(tidied))
     body = dom.getElementsByTagName("body")[0]
     pieces = [child.toxml() for child in body.childNodes]
     dom.unlink()
     return "".join(pieces)
Example #32
0
def tidy_html(html_buffer, cleaning_lib='utidylib'):
    """
    Tidy up the input HTML using one of the installed cleaning
    libraries.

    @param html_buffer: the input HTML to clean up
    @type html_buffer: string
    @param cleaning_lib: chose the preferred library to clean the HTML. One of:
                         - utidylib
                         - beautifulsoup
    @return: a cleaned version of the input HTML
    @note: requires uTidylib or BeautifulSoup to be installed. If the chosen library is missing, the input X{html_buffer} is returned I{as is}.
    """

    # Cleaning is best-effort: any library error falls back to the input.
    # ``except Exception`` (rather than a bare ``except``) keeps that fallback
    # while letting KeyboardInterrupt and SystemExit propagate.
    if CFG_TIDY_INSTALLED and cleaning_lib == 'utidylib':
        options = dict(output_xhtml=1, show_body_only=1, merge_divs=0, wrap=0)
        try:
            output = str(tidy.parseString(html_buffer, **options))
        except Exception:
            output = html_buffer
    elif CFG_BEAUTIFULSOUP_INSTALLED and cleaning_lib == 'beautifulsoup':
        try:
            output = str(BeautifulSoup(html_buffer).prettify())
        except Exception:
            output = html_buffer
    else:
        output = html_buffer

    return output
Example #33
0
def convert_to_xhtml(output, xml_decl=True):
    """Tidy ``output`` to XHTML and comment out CDATA markers in scripts.

    ``type="text/javascript"`` attributes are removed again when the input
    did not contain ``text/javascript``.  Inside every match of
    ``HTML_SCRIPT_TAG_REGEX`` the CDATA delimiters are prefixed with a
    comment marker: ``'`` when the unescaped tag mentions ``'vbs`` or
    ``"vbs`` (case-insensitively), ``//`` otherwise.
    """
    original = output
    tidy_options = {
        "output_xhtml": 1,
        "add_xml_decl": int(xml_decl),
        "indent": 1,
        "tidy-mark": 0,
    }
    output = str(tidy.parseString(output, **tidy_options))

    if "text/javascript" not in original:
        output = output.replace('type="text/javascript"', '')

    def replace_tag_content(tag_content):
        lowered = HTMLParser().unescape(tag_content).lower()
        is_vbscript = "'vbs" in lowered or '"vbs' in lowered
        comment_chars = "'" if is_vbscript else "//"
        opened = tag_content.replace("<![CDATA[", comment_chars + "<![CDATA[")
        return opened.replace("]]>", comment_chars + "]]>")

    script_tag = re.compile(HTML_SCRIPT_TAG_REGEX)
    return script_tag.sub(lambda m: replace_tag_content(m.group()), output)
Example #34
0
def html2docbook(html):
    """Convert an HTML string to DocBook and return it as unicode.

    The HTML is tidied to XHTML, parsed with libxml2, run through the
    heading-normalizing stylesheet and then through the XHTML-to-DocBook
    stylesheet.  All three libxml2 documents are freed before returning.
    """
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 1,
        'indent': 1,
        'tidy_mark': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
        'doctype': 'auto',
        'wrap': 0,
        'char_encoding': 'utf8',
    }
    tidied = parseString(html.encode("utf-8"), **tidy_options)

    source_doc = libxml2.parseDoc(str(tidied))
    normalized_doc = normalizedHeadingsXsl_xsldoc.applyStylesheet(
        source_doc, None)
    # The serialized intermediate result is not used below; the call is kept
    # so the sequence of library calls stays the same.
    nhstring = normalizedHeadingsXsl_xsldoc.saveResultToString(normalized_doc)

    docbook_doc = xhtml2dbXsl_xsldoc.applyStylesheet(normalized_doc, None)
    serialized = xhtml2dbXsl_xsldoc.saveResultToString(docbook_doc)

    # libxml2 documents are released explicitly.
    for document in (source_doc, normalized_doc, docbook_doc):
        document.freeDoc()
    return serialized.decode('utf-8')
Example #35
0
 def mergestore(self, inputstore, templatetext, includefuzzy):
     """Merge translated units into the HTML template text.

     The template has its newlines replaced by spaces, then for every
     non-header unit the first occurrence of the unit's source is replaced
     by its wrapped translation; fuzzy units fall back to the wrapped source
     unless ``includefuzzy`` is set.  The UTF-8 encoded result is returned,
     passed through tidy when ``self.tidy`` is true.
     """
     result = templatetext.replace("\n", " ")
     if isinstance(result, str):
         #TODO: get the correct encoding
         result = result.decode('utf-8')
     # TODO: use the algorithm from html2po to get blocks and translate them individually
     # rather than using replace
     translatable = (u for u in inputstore.units if not u.isheader())
     for unit in translatable:
         msgid = unit.source
         if includefuzzy or not unit.isfuzzy():
             chosen = unit.target
         else:
             chosen = unit.source
         msgstr = self.wrapmessage(chosen)
         if msgstr.strip():
             # TODO: "msgid" is already html-encoded ("&" -> "&amp;"), while
             #   "msgstr" is not encoded -> thus the replace fails
             #   see test_po2html.py in line 67
             result = result.replace(msgid, msgstr, 1)
     encoded = result.encode('utf-8')
     if not self.tidy:
         return encoded
     return str(tidy.parseString(encoded))
Example #36
0
    def __init__(self, docid=None, *args,**kwargs):
        """Create an etherpad document, fetching it on first sight.

        ``docid`` is matched against ``PADRE``.  When it matches and no
        document with the resulting id is found via ``Docs.find_one``, the
        pad's HTML export is downloaded, tidied to strict XHTML and passed
        to the parent constructor as ``raw``.  In every other case the
        parent constructor is called with the (possibly normalized) id.
        """
        self.__dict__['type'] = 'etherpad'
        if docid:
            hostValidator = PADRE.search(docid)
            if hostValidator:
                # Normalize the id to "<group 2>/<group 3>" when both matched.
                if hostValidator.group(2) and hostValidator.group(3):
                    docid=("%s/%s" % (hostValidator.group(2), hostValidator.group(3))).encode('utf8')
                    kwargs['docid']=docid
                # Export URL; group 1 falls back to 'http://' when empty.
                url="%s%s/ep/pad/export/%s/latest?format=html" % (hostValidator.group(1) or 'http://', hostValidator.group(2), hostValidator.group(3))
                # Only download pads that are not stored yet.
                if not Docs.find_one({"docid": docid}):
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    # Rebuild a minimal page around the pad body, declaring UTF-8.
                    doc='<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
                    raw=str(tidy.parseString(doc, **{'output_xhtml' : 1,
                                                             'add_xml_decl' : 0,
                                                             'indent' : 0,
                                                             'tidy_mark' : 0,
                                                             'doctype' : "strict",
                                                             'wrap' : 0}))
                    kwargs['raw'] = raw
                    kwargs['docid']=docid
                    super(Etherpad,self).__init__(*args, **kwargs)
                    if not 'stems' in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    # Freshly fetched pad is fully initialized; skip the
                    # generic constructor call below.
                    return
            kwargs['docid']=docid
        super(Etherpad,self).__init__(*args, **kwargs)
Example #37
0
 def to_xhtml(self, stylesheet_url='', settings=DEFAULT_HTML_OVERRIDES,
 tidy_settings=DEFAULT_TIDY_XHTML_OPTIONS, *args, **kwargs):
     """Render to HTML via ``to_html`` and tidy the result to XHTML.

     Any caller-supplied ``tidy_output`` keyword is discarded because
     ``to_html`` is always called with ``tidy_output=False``; tidying is done
     here with ``tidy_settings``.

     Returns a ``(xhtml_string, [])`` tuple.
     """
     kwargs.pop('tidy_output', None)
     rendered = self.to_html(stylesheet_url, tidy_output=False,
         *args, **kwargs)
     html_string, _ignored = rendered
     xhtml = str(tidy.parseString(html_string, **tidy_settings))
     return xhtml, []
Example #38
0
def EpozTidy(self, html, pageurl):
    """ Take html and deliver xhtml if mxTidy or uTidylib is installed;
        call EpozPostTidy for html-postprocessings before returning the result

        Returns a tuple (errors, output, errordata).  When the tidy library
        reports errors, the original html is used as output instead of the
        tidied text.
    """

    # Defaults used when neither tidy library is available.
    errors = 0
    output = html
    errordata = ""

    # Tidy works on UTF-8 bytes; epoz-specific script tags are turned back
    # into plain <script> tags first.
    input = html.encode("utf-8")
    input = EPOZ_SCRIPT.sub('<script ', input)
    input = input.replace('</epoz:script>', '</script>')

    if uWordUnmunger:
        input = unmungeString(input)

    if mxTidyIsAvailable:
        (errors, warnings, output, errordata) = Tidy.tidy(
            input, drop_empty_paras=1, logical_emphasis=1, indent_spaces=1,
            indent="no", output_xhtml=1, word_2000=1, wrap=0, alt_text='',
            char_encoding="utf8")
#        (errors, warnings, output, errordata) = Tidy.tidy(
#            input, drop_empty_paras=1, indent_spaces=1, indent="auto",
#            output_xhtml=1, word_2000=1, wrap=79, char_encoding="utf8")
        if errors:
            output = html
    elif uTidyIsAvailable:
        parsed = tidy.parseString(
            input, drop_empty_paras=1, indent_spaces=1, indent="auto",
            output_xhtml=1, word_2000=1, wrap=79, char_encoding="utf8",
            add_xml_decl=0, doctype="omit", indent_attributes=1,
            drop_proprietary_attributes=1, bare=1, clean=1,
            enclose_text=1, tidy_mark=0)
        reports = parsed.get_errors()
        # Reports with severity 'W' are not counted as errors.
        all_errors = [str(x) for x in reports if x.severity != 'W']
        errors = len(all_errors)
        errordata = '\n'.join(all_errors)
        if errors:
            output = html
        else:
            output = str(parsed)

    output = MSO_CLASS.sub(r"<\1>", output)
    # Keep only group 1 of the HTML_BODY match when there is one.
    result = HTML_BODY.search(output)
    if result:
        output = result.group(1)

    # Call External Method / PythonScript for postprocessing
    # The script should expect two parameters:
    # self = called context (=server)
    # html = the htmlbody to postprocess
    # pathname = path of edited object (maybe with template!)
    # The script should return the new htmlbody

    EpozPostTidy = getattr(self, 'EpozPostTidy', None)
    if EpozPostTidy is not None:
        output = EpozPostTidy(self, output, pageurl)

    return (errors, output, errordata)
Example #39
0
def html_tidy(instr):
   """Return the indented XHTML body content tidy produces for ``instr``."""
   tidy_options = {
      'output_xhtml': 1,
      'add_xml_decl': 0,
      'indent': 1,
      'show_body_only': 1,
   }
   return str(tidy.parseString(instr, **tidy_options))
Example #40
0
 def get_sounding_skewt(self):
     """Request a skew-T sounding chart and download the image.

     A form for ``SOUNDINGSTATION`` and the date fields of ``self`` is posted
     to the sounding CGI, then the GIF is retrieved into
     ``CHARTPATH + "skewt.gif"``.
     """
     data=urllib.urlencode({"region":"europe","TYPE":"GIF:SKEWT","YEAR":self.YEAR,"MONTH":self.MONTH,"FROM":self.FROM,"TO":self.FROM,"STNM":SOUNDINGSTATION})
     s = urllib.urlopen("http://weather.uwyo.edu/cgi-bin/sounding?",data)
     # Close the response even when reading fails.
     try:
         o=s.read()
     finally:
         s.close()
     # NOTE(review): the parsed document was never used; the call is kept
     # (unbound) in case a parse failure is relied on - confirm and remove.
     tidy.parseString(o)
     # NOTE(review): the image URL hard-codes station "08221" while the
     # request uses SOUNDINGSTATION - verify they are meant to agree.
     urllib.urlretrieve("http://weather.uwyo.edu/upperair/images/"+self.YEAR+self.MONTH+self.FROM+".08221.skewt.gif", CHARTPATH+"skewt.gif")
Example #41
0
 def test_options(self):
     """Exercise a handful of tidy options on strings and on a file.

     With XML declaration, error reporting, CR newlines and XHTML output
     enabled: ``self.input1`` must yield a commented CDATA section around
     ``1>2``, and the minimal document must start with an XML declaration
     and contain no line feed.  Parsing ``foo.htm`` with ``alt_text='foo'``
     must yield that alt attribute and the UTF-8 bytes ``'\\xc3\\xa9'``.
     """
     options = dict(add_xml_decl=1,
                    show_errors=1,
                    newline='CR',
                    output_xhtml=1)
     doc1 = tidy.parseString(self.input1, **options)
     # Raw string: same pattern value, without the invalid ``\W`` string
     # escape that newer Python versions warn about.
     found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1),
                       re.MULTILINE)
     self.assertTrue(found)
     doc2 = tidy.parseString("<Html>", **options)
     self.assertTrue(str(doc2).startswith('<?xml'))
     # FIXME - tidylib doesn't support this?
     #   self.failIf(len(doc2.errors)>1)
     self.assertTrue('\n' not in str(doc2))
     doc3 = tidy.parse('foo.htm', char_encoding='utf8', alt_text='foo')
     self.assertTrue('alt="foo"' in str(doc3))
     self.assertTrue('\xc3\xa9' in str(doc3))
Example #42
0
 def get_errors(self, content, url=''):
     """Return an error report for ``content``, or ``None`` if tidy is happy.

     ``str`` content is encoded to UTF-8 before parsing.  The report is
     built by ``self.create_error_report`` only when tidy lists errors.
     """
     import tidy
     payload = content.encode('utf8') if isinstance(content, str) else content
     doc = tidy.parseString(payload)
     if doc.errors:
         return self.create_error_report(doc, payload, url)
     return
Example #43
0
def parse_html(html_str):
    """Tidy ``html_str`` into XHTML and parse the result with ``ET.parse``.

    Tidy is run with ``output_xhtml``, ``indent``, ``numeric_entities``,
    ``add_xml_decl`` and ``tidy_mark`` all enabled.

    Returns whatever ``ET.parse`` returns for the tidied markup.
    """
    tidied = tidy.parseString(html_str,
                              output_xhtml=1,
                              indent=1,
                              numeric_entities=1,
                              add_xml_decl=1,
                              tidy_mark=1)
    # parseString yields a document object rather than a plain string;
    # serialise it explicitly so StringIO always receives text instead of
    # relying on an implementation-specific implicit conversion.
    return ET.parse(StringIO(str(tidied)))
Example #44
0
File: fetch.py Project: stef/f33dme
def clean(txt):
    """Tidy ``txt`` into strict XHTML and re-serialise it via the XML parser.

    The tidied markup is wrapped in a ``StringIO``, parsed with ``xmlparse``
    and the resulting tree is returned through ``tostring``.
    """
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'anchor-as-name': 0,
        'tidy_mark': 0,
        'doctype': "strict",
        'wrap': 0,
    }
    xhtml = str(tidy.parseString(txt, **tidy_options))
    tree = xmlparse(StringIO(xhtml))
    return tostring(tree)
Example #45
0
 def process_response(self, request, response):
     """Tidy the body of HTML responses; pass everything else through.

     A response counts as HTML when its ``Content-Type`` header contains
     ``text/html`` (case-insensitive).  For those, ``response.content`` is
     replaced by the string form of the tidied document, using the
     module-level ``options``.  The same ``response`` object is returned
     in both cases.
     """
     is_html = 'text/html' in response['Content-Type'].lower()
     if is_html:
         tidied = tidy.parseString(response.content, **options)
         response.content = str(tidied)
     return response
Example #46
0
def prettyXHTML(uglyXHTML):
    """Return ``uglyXHTML`` re-serialised by tidy as a string.

    The input is treated as XML (``input_xml``), emitted as XHTML with
    automatic indentation, and produced without an XML declaration,
    doctype or tidy meta mark.
    """
    tidy_settings = {
        'input_xml': True,
        'output_xhtml': True,
        'add_xml_decl': False,
        'doctype': 'omit',
        'indent': 'auto',
        'tidy_mark': False,
    }
    pretty_doc = tidy.parseString(uglyXHTML, **tidy_settings)
    return str(pretty_doc)
def prettyXHTML(uglyXHTML):
    """Pretty-print ``uglyXHTML`` with tidy and return the result as a string.

    Tidy reads the input as XML and writes XHTML with ``indent='auto'``;
    the XML declaration, the doctype and the tidy mark are all left out.
    """
    formatting = {
        'input_xml': True,
        'output_xhtml': True,
        'add_xml_decl': False,
        'doctype': 'omit',
        'indent': 'auto',
        'tidy_mark': False,
    }
    return str(tidy.parseString(uglyXHTML, **formatting))
Example #48
0
def tidy2xhtml(instream, outstream):
    """Read markup from ``instream``, tidy it to XHTML, write to ``outstream``.

    The whole input is read at once; the tidied document (with XML
    declaration and indentation enabled) writes itself to ``outstream``.
    Nothing is returned.
    """
    markup = instream.read()
    document = tidy.parseString(markup,
                                output_xhtml=1,
                                add_xml_decl=1,
                                indent=1)
    document.write(outstream)
Example #49
0
 def test_options(self):
     """Check that tidy honours options given as keyword arguments.

     Verifies CDATA output for ``self.input1``, the XML declaration, error
     reporting and CR newlines for a minimal document, and the
     ``alt_text`` / ``char_encoding`` options when parsing
     ``self.test_file``.
     """
     xhtml_opts = {
         "add_xml_decl": 1,
         "show_errors": 1,
         "newline": "CR",
         "output_xhtml": 1,
     }

     cdata_doc = tidy.parseString(self.input1, **xhtml_opts)
     self.assertIn("CDATA", str(cdata_doc))

     minimal_doc = tidy.parseString("<Html>", **xhtml_opts)
     self.assertTrue(str(minimal_doc).startswith("<?xml"))
     # The bare "<Html>" input is expected to produce at least one error.
     self.assertFalse(len(minimal_doc.errors) == 0)
     # newline="CR" means no line feed may appear in the output.
     self.assertNotIn("\n", str(minimal_doc))

     file_doc = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
     self.assertIn('alt="foo"', file_doc.gettext())
     self.assertIn("é", file_doc.gettext())
Example #50
0
def xtruncate(s, length=255, end='...'):
    """Cut ``s`` to ``length`` items, append ``end`` and tidy the result.

    The truncated text is passed through tidy (XHTML output, body only,
    indented, with XML declaration option set and no tidy mark) and the
    tidied markup is returned as a string.
    """
    import tidy

    tidy_opts = {
        'output_xhtml': 1,
        'add_xml_decl': 1,
        'indent': 1,
        'show_body_only': 1,
        'tidy_mark': 0,
    }
    snippet = str(s[:length]) + end
    return str(tidy.parseString(snippet, **tidy_opts))
def load_finished_cb(view, frame):
    """Serialise the loaded page's DOM, tidy it, print it and quit GTK.

    The document is reached through the JavaScript global context of the
    view's main frame and serialised with a JavaScript ``XMLSerializer``.
    ``frame`` is not used.
    """
    global_context = view.get_main_frame().get_global_context()
    js_ctx = jscore.JSContext(global_context)
    document = js_ctx.globalObject.window.document
    xml_serializer = js_ctx.evaluateScript('new XMLSerializer()')
    markup = xml_serializer.serializeToString(document)
    # Parenthesised form prints the same text under the Python 2 print
    # statement as the bare form does.
    print(tidy.parseString(markup, **options))
    gtk.main_quit()
Example #52
0
def check_validates(url):
    """Raise if the stored HTML for ``url`` has validation errors.

    URLs for which ``should_validate`` is false are ignored.  Otherwise the
    markup in ``html_store[url]`` is tidied with ``TIDY_OPTIONS`` and the
    reported errors are passed through ``filter_errors``.

    Raises:
        Exception: when any errors remain; the message lists them one per
            line below a heading.
    """
    if not should_validate(url):
        return

    document = tidy.parseString(html_store[url], **TIDY_OPTIONS)
    problems = filter_errors(document.errors)
    if not problems:
        return

    problems.insert(0, 'Found following validation errors:')
    report_lines = [unicode(problem) for problem in problems]
    raise Exception(u'\n'.join(report_lines))
Example #53
0
def normalize(raw, debug=None):
    """Tidy ``raw`` into strict XHTML and return it as a string.

    When ``debug`` is truthy a progress message is written to stderr first.
    Output is unindented, unwrapped, and carries neither an XML declaration
    nor the tidy mark.
    """
    if debug:
        print >> sys.stderr, 'cleaning response'
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'doctype': "strict",
        'wrap': 0,
    }
    cleaned = tidy.parseString(raw, **tidy_options)
    return str(cleaned)
Example #54
0
 def _xml_format(self):
     """Rewrite every ``.xml`` file under ``self.dirname`` through tidy.

     The tree is walked with ``os.walk``; each matching file is read in
     full, parsed with ``self._tidy_options`` and then overwritten with the
     tidied document.
     """
     for dirname, _, files in os.walk(self.dirname):
         for filename in files:
             full_name = os.path.join(dirname, filename)
             if not full_name.endswith('.xml'):
                 continue
             # Read and parse before reopening for writing, so the original
             # content is fully consumed before the file is truncated.
             with open(full_name, 'r') as source:
                 obj = tidy.parseString(source.read(), **self._tidy_options)
             # The context manager guarantees the output is flushed and
             # closed; previously this handle was never closed.
             with open(full_name, 'w') as target:
                 obj.write(target)
Example #55
0
 def run(self, product):
     """Tidy the markup stored in ``product`` and store the result back.

     The input is ``product[self.config["from"]]``; it may be a single
     value or a list of values.  The tidied string (or list of strings) is
     written to ``product[self.config["to"]]``.  The ``input_encoding`` and
     ``output_encoding`` config entries are optional and default to
     ``"utf8"``.

     Any exception raised while tidying or storing the result is printed
     and swallowed, in which case ``product`` is left without the new value.
     """
     import tidy

     frm = product[self.config["from"]]

     # Only a missing entry should trigger the default; other failures are
     # genuine errors and are no longer hidden by a bare ``except``.
     try:
         input_enc = self.config["input_encoding"]
     except LookupError:
         input_enc = "utf8"
     try:
         output_enc = self.config["output_encoding"]
     except LookupError:
         output_enc = "utf8"

     # Single definition of the option set shared by the list and scalar
     # branches below.
     tidy_options = dict(input_encoding=input_enc,
                         output_encoding=output_enc,
                         preserve_entities="yes",
                         accessibility_check=0,
                         new_empty_tags="",
                         output_html="yes",
                         show_errors=6,
                         force_output="yes")

     def _tidy_to_str(markup):
         # Tidy one piece of markup and return its string form.
         return str(tidy.parseString(markup, **tidy_options))

     try:
         if isinstance(frm, list):
             # The target key is assigned only after every item succeeded.
             product[self.config["to"]] = [_tidy_to_str(it) for it in frm]
         else:
             product[self.config["to"]] = _tidy_to_str(frm)
     except Exception as e:
         # Best-effort step: report the failure and carry on.
         print("got an exception when tidy... %s" % (e,))
Example #56
0
def _tidy_page(path):
    """Read a page, run it through tidy, and wrap the output in a StringIO.

    Returns an in-memory ``StringIO.StringIO`` object holding the tidied
    markup; its ``name`` attribute is set to the base name of ``path``.

    Raises:
        IOError: if ``path`` does not exist.
        Exception: whatever tidy raises if parsing still fails after the
            ``show-body-only`` fallback.
    """

    if not os.path.exists(path):
        raise IOError('file %s not found!' % path)

    # set up some tidy options
    tidy_options = {
        'char-encoding': 'utf8',
        'enclose-text': 'yes',  # wrap loose text nodes in <p>
        'show-body-only':
        'auto',  # do not add <html> and <body> unless present in input
        'indent': 'no',  # don't prettily indent output to make parsing easier
        'tidy-mark': 'no',  # no creator meta-tag
        'force-output': 'yes',  # some output is better than none, I hope
    }

    # unicode files make utidylib cry :( so we need to be creative
    # http://developer.berlios.de/bugs/?func=detailbug&bug_id=14186&group_id=1810
    # http://muffinresearch.co.uk/archives/2008/07/29/working-around-utidylibs-unicode-handling/
    # The context manager closes the file even if reading or decoding fails.
    with open(path, 'r') as f:
        content = unicode(f.read(), 'utf-8').encode('utf8')
    try:
        parsed = tidy.parseString(content, **tidy_options)
    except tidy.error.OptionArgError:
        # show-body-only is new-ish, so emulate it
        del tidy_options['show-body-only']
        try:
            parsed = tidy.parseString(content, **tidy_options)
        except Exception as e:
            print(e)
            # Without a parsed document nothing below can work; re-raise the
            # real error instead of failing later on an unbound ``parsed``.
            raise
        bodytag = re.compile("<body>(.*)</body>", re.IGNORECASE | re.DOTALL)
        # If the input had no <body> of its own, strip the one tidy added.
        if not bodytag.search(content):
            if path.find('validation') != -1:
                print(parsed)
            # NOTE(review): this assumes tidy's output contains a plain
            # "<body>" tag; a non-match would raise AttributeError -- confirm.
            parsed = bodytag.search(str(parsed)).group(1)

    result = StringIO.StringIO(parsed)
    result.name = os.path.basename(path)
    return result
Example #57
0
def tidy_html(body):
    """Return ``body`` as XHTML text, tidying it first when tidy is usable.

    When the module-level ``tidy`` is truthy, ``body`` is converted with
    ``to_utf8`` and tidied to indented XHTML (``word_2000`` enabled).  In
    every case the markup is then fed through ``HTMLParser`` and
    ``stream_to_str_as_xhtml``.
    """
    if tidy:
        tidied = tidy.parseString(to_utf8(body),
                                  indent=1,
                                  char_encoding='utf8',
                                  output_xhtml=1,
                                  word_2000=1)
        body = str(tidied)

    return stream_to_str_as_xhtml(HTMLParser(body))
Example #58
0
        def _tidy2(text):
            """Tidy ``text`` with uTidyLib and return its ``body`` content.

            The markup is tidied to XHTML without XML declaration,
            indentation or tidy mark; the string result is then passed to
            ``_in_tag`` with ``'body'``.
            """
            tidy_opts = {
                'output_xhtml': 1,
                'add_xml_decl': 0,
                'indent': 0,
                'tidy_mark': 0,
            }
            document = tidy.parseString(text, **tidy_opts)
            return _in_tag(str(document), 'body')