Python tidy_document Examples, tidylib.tidy_document Python Examples

Example #1

0

Show file

File: DocsTest.py Project: 18600597055/hue

 def test_doc_with_entity(self):
     h = "&eacute;"
     expected = DOC % "&eacute;"
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)
     
     expected = DOC % "&#233;"
     doc, err = tidy_document(h, {'numeric-entities':1})
     self.assertEqual(doc, expected)

Example #2

0

Show file

File: helpers.py Project: VitorVRS/kuma

def _massage_diff_content(content):
    tidy_options = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        content = tidy_document(content, options=tidy_options)
    except UnicodeDecodeError:
        # In case something happens in pytidylib we'll try again with
        # a proper encoding
        content = tidy_document(content.encode('utf-8'), options=tidy_options)
        tidied, errors = content
        content = tidied.decode('utf-8'), errors
    return content

Example #3

0

Show file

File: scrape_pages.py Project: satyadevi-nyros/eracks

def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )

Example #4

0

Show file

File: parser.py Project: mhagander/pgarchivesweb

    def html_clean(self, html):
        # First we pass it through tidy
        (html, errors) = tidylib.tidy_document(html,
                                               options={
                                                   'drop-proprietary-attributes': 1,
                                                   'alt-text': '',
                                                   'hide-comments': 1,
                                                   'output-xhtml': 1,
                                                   'show-body-only': 1,
                                                   'clean': 1,
                                                   'char-encoding': 'utf8',
                                                   'show-warnings': 0,
                                                   'show-info': 0,
                                               })
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception as e:
            # Failed to parse the html, thus failed to clean it. so we must
            # give up...
            return None

Example #5

0

Show file

File: sinads.py Project: B-Rich/dealer

 def __trading_years(self, instrument):
     re = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
     document, errors = tidy_document(re.read())
     soup = BeautifulSoup(document)
     node = soup.find('select', attrs={'name':'year'})
     for option in node.findAll('option'):
         yield option.getText()

Example #6

0

Show file

File: middleware.py Project: nosamanuel/django-test-validator

    def process_response(self, request, response):
        if 'text/html' in response['Content-Type'] and response.content:
            document, errors = tidy_document(response.content)
            if errors:
                raise HTMLValidationError(errors)

        return response

Example #7

0

Show file

File: getMenu.py Project: blueskywalker/menuReview

def getMenu():
    storeFile = open("list.txt","r")
    txt = storeFile.read()
    storeFile.close()
    
    list=txt.split('\n\n\n')
    

 #   print list
    
    for store in list:    
#        print store
        rest = store.split('\n')
        if len(rest)!=3:
            break
        try:
            url=baseUrl+rest[2] +'menu'
            print url
            res=urlopen(url)
            html=res.read()    
         
            options = {'output-encoding':'utf8', 'output-xhtml':1 }
            document,errors = tidy_document(html,options)   
            
            filepath = dataDir+ (rest[2].split('/'))[2] + ".html"
            saveFile = open(filepath,"w")
            saveFile.write(document)
            saveFile.close()
            print filepath
        except :
            print "skip:"+url

Example #8

0

Show file

File: htmlutils.py Project: kollivier/brightwriter

def cleanUpHTML(html, options=None):
    import tidylib
    tidylib.BASE_OPTIONS = {}

    default_options = { 
                        "force-output" : 1,
                        "output-xhtml" : 1,
                        "doctype" : "strict",
                        "drop-empty-paras": 1,
                        "output-encoding" : "utf8",
                        "clean": 1,
                        "bare": 1
                       }
    if options:
        default_options.extend(options)

    # first fix up footnotes so that HTMLTidy won't ditch them
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    footnoteFixer(soup) #html)
    stripEmptyParagraphs(soup)
    
    html, errors = tidylib.tidy_document(soup.prettify(encoding=None), options=default_options)
    
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    addMetaTag(soup, [('http-equiv', 'Content-type'), ('content', 'text/html; charset=utf-8')])
    
    return soup.prettify(encoding=None), errors

Example #9

0

Show file

def _tidy2xhtml5(html):
    """Tidy up a html4/5 soup to a parsable valid XHTML5.
    Requires tidy-html5 from https://github.com/w3c/tidy-html5 Installation: http://goo.gl/FG27n
    """
    html = _io2string(html)
    html = _pre_tidy(html) # Pre-process
    xhtml5, errors = tidy_document(html,
        options={
            'merge-divs': 0,       # do not merge nested div elements - preserve semantic block structrues
            'output-xml': 1,       # create xml output
            'indent': 0,           # Don't use indent, add's extra linespace or linefeeds which are big problems
            'tidy-mark': 0,        # No tidy meta tag in output
            'wrap': 0,             # No wrapping
            'alt-text': '',        # Help ensure validation
            'doctype': 'strict',   # Little sense in transitional for tool-generated markup...
            'force-output': 1,     # May not get what you expect but you will get something
            'numeric-entities': 1, # remove HTML entities like e.g. nbsp
            'clean': 1,            # remove
            'bare': 1,
            'word-2000': 1,
            'drop-proprietary-attributes': 1,
            'enclose-text': 1,     # enclose text in body always with <p>...</p>
            'logical-emphasis': 1, # transforms <i> and <b> text to <em> and <strong> text
            # do not tidy all MathML elements! List of MathML 3.0 elements from http://www.w3.org/TR/MathML3/appendixi.html#index.elem
            'new-inline-tags': 'abs, and, annotation, annotation-xml, apply, approx, arccos, arccosh, arccot, arccoth, arccsc, arccsch, arcsec, arcsech, arcsin, arcsinh, arctan, arctanh, arg, bind, bvar, card, cartesianproduct, cbytes, ceiling, cerror, ci, cn, codomain, complexes, compose, condition, conjugate, cos, cosh, cot, coth, cs, csc, csch, csymbol, curl, declare, degree, determinant, diff, divergence, divide, domain, domainofapplication, el, emptyset, eq, equivalent, eulergamma, exists, exp, exponentiale, factorial, factorof, false, floor, fn, forall, gcd, geq, grad, gt, ident, image, imaginary, imaginaryi, implies, in, infinity, int, integers, intersect, interval, inverse, lambda, laplacian, lcm, leq, limit, list, ln, log, logbase, lowlimit, lt, maction, malign, maligngroup, malignmark, malignscope, math, matrix, matrixrow, max, mean, median, menclose, merror, mfenced, mfrac, mfraction, mglyph, mi, min, minus, mlabeledtr, mlongdiv, mmultiscripts, mn, mo, mode, moment, momentabout, mover, mpadded, mphantom, mprescripts, mroot, mrow, ms, mscarries, mscarry, msgroup, msline, mspace, msqrt, msrow, mstack, mstyle, msub, msubsup, msup, mtable, mtd, mtext, mtr, munder, munderover, naturalnumbers, neq, none, not, notanumber, note, notin, notprsubset, notsubset, or, otherwise, outerproduct, partialdiff, pi, piece, piecewise, plus, power, primes, product, prsubset, quotient, rationals, real, reals, reln, rem, root, scalarproduct, sdev, sec, sech, selector, semantics, sep, set, setdiff, share, sin, sinh, subset, sum, tan, tanh, tendsto, times, transpose, true, union, uplimit, variance, vector, vectorproduct, xor',
            'doctype': 'html5',
            })

    #return xhtml5
    # return the tree itself, there is another modification below to avoid
    # another parse
    return _post_tidy(xhtml5)

Example #10

0

Show file

File: middleware.py Project: pombredanne/vmware-scrapper

    def process_response(self, request, response, spider):
        if 'index-mo' in response.url:
            return response
        if 'index-do' in response.url:
            return response
        if 'index-e_types' in response.url:
            return response

        body = response.body
        
        # VERY UGLY...  need to get this done, so it's ugly for now...
        index = body.find("<table")
        while index != -1:
            inner = body.find("<table", index+6)
            endtable = body.find("</table", index+6)
            if inner != -1 and inner < endtable:
                # we have an inner table...
                if body.find("<tr", inner, endtable) != -1:
                    break  # if it's truely a table
                    
                else:
                    start = inner-1
                    end = body.find(">", inner)+1
                    body = body[:start] + body[end:]
                    
                    endtable = body.find("</table", index+6)
                    
                    start = endtable-1
                    end = body.find(">", endtable)+1
                    body = body[:start] + body[end:]
                    
            index = body.find("<table", index+6)
            
        # remove any <br> before we tidy it up
        body = self.br_re.sub('', body)
        body = self.empty_anchor_re.sub('', body)
        
        tidylib.BASE_OPTIONS = {
            "output-xhtml": 0,     # XHTML instead of HTML4
            "indent": 1,           # Pretty; not too much of a performance hit
            "tidy-mark": 0,        # No tidy meta tag in output
            "wrap": 0,             # No wrapping
            "alt-text": "",        # Help ensure validation
            "doctype": 'omit',     # Little sense in transitional for tool-generated markup...
            "force-output": 1,     # May not get what you expect but you will get something
        }
        body, _ = tidy_document(body, options={'drop-empty-paras':1,
                    'drop-font-tags':1,'enclose-text':1,'merge-divs':1,'fix-bad-comments':1})
            
        body = self.link_re.sub('\g<1>', body)
        body = self.vmodl_re.sub('\g<1>', body)
        body = self.strong_re.sub('\g<1>', body)
        body = self.script_re.sub('', body)
        body = self.ul_re.sub('', body)
        body = self.li_end_re.sub('', body)
        body = self.li_re.sub('* ', body)
        body = self.ul_end_re.sub('', body)
        
        response = response.replace(body=body)
        return response

Example #11

0

Show file

File: test_docs.py Project: GertBurger/pytidylib

 def test_xmlns_large_document_xml_corner_case(self):
     # Test for a super weird edge case in Tidy that can cause it to return
     # the wrong required buffer size.
     body = '<span><span>A</span></span>' + 'A' * 7937
     html = '<html xmlns="http://www.w3.org/1999/xhtml">' + body
     doc, err = tidy_document(html, {'output-xml': 1})
     self.assertEqual(doc.strip()[-7:], "</html>")

Example #12

0

Show file

File: DocsTest.py Project: 18600597055/hue

 def test_doc_with_unclosed_tag(self):
     h = "<p>hello"
     expected = DOC % '''<p>
   hello
 </p>'''
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)

Example #13

0

Show file

File: request_setup.py Project: mtmail/test-nominatim

def call():
    if world.results:
        return

    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()

    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        document, errors = tidy_document(page, 
                             options={'char-encoding' : 'utf8'})
        assert(len(errors) == 0), "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        if 'json_callback' in world.params:
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func)+1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)

Example #14

0

Show file

File: inventory_notifier.py Project: ralphbean/func-inventory-notifier

    def run(self):
        """ Main entry point
        
        Run FuncInventory and if there is a change, prepare and send an email.
        """
        # Run the inventory
        inventory = func_inventory.FuncInventory()
        inventory.run(
            [
                'func-inventory',
                '--tree=%s' % self.config['git_repo'],
                '--modules=%s' % ','.join(self.config['modules'])
            ])

        diff = self.git_diff()
        if not diff:
            self.log('No changes detected.  Sleeping.')
        else:
            self.log('CHANGE DETECTED in func-inventory.')

            kw = dict(dark_bg=self.config['dark_bg'],
                      font_size=self.config['font_size'])
            html = ansi2html.Ansi2HTMLConverter(**kw).convert(diff)

            html, errors = tidylib.tidy_document(html)

            html = pypremailer.Premailer(html).premail()

            self.mail(html)

            self.log('Done mailing changes.')

Example #15

0

Show file

File: pipelines.py Project: michelderu/ml-scrapy-pipeline

   def marklogic_put_xml(self, item, spider_name):
       # Set the uri and collection
       if (self.ml_transform == ''):
           params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
       else:
           params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name, 'transform': self.ml_transform}
       # Set up the XML payload
       payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
       # Decode the <> characters back again
       payload = payload.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"')
       # Run tidy in order to get wel-formed XML
       payload, errors = tidy_document(payload, options={'input-xml': 1})
 
       # Set up the header
       headers = {'Content-Type': 'application/xml'}
 
       ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
       logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])
 
       # Call the MarkLogic REST endpoint
       ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
       ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
       r = requests.put(ml_uri,
           params = params,
           auth = HTTPDigestAuth(ml_user, ml_pwd),
           data = payload,
           headers = headers)
 
       logging.info("PUT response: " + str(r.status_code) + ", " + r.text)

Example #16

0

Show file

File: test_unittest.py Project: woodenshoe/infoshopkeeper

 def dynamic_test_method(self):
     """this function name doesn't matter much, it can start with `test`,
     but we're going to rename it dynamically below"""
     reportURLstring = '/report?reportname=' + reportItem.metadata['action']
     response=self._my_app.get(reportURLstring)
     code, error=tidylib.tidy_document(response.body, options={'show-errors':1, 'show-warnings':0})
     self.assertFalse(error, '%s did not return valid html page' % reportURLstring)

Example #17

0

Show file

File: enml.py Project: shurain/archiver

def html2enml(html):
    # doc, err = tidy_fragment(

    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1
        }
    )

    root = fromstring(doc)

    # XXX dirty hack to circumvent a bug in lxml parser
    root = fromstring(etree.tostring(root))

    logging.debug(etree.tostring(root))

    # tidy_document returns a valid html document which means it usually contains html tag and proper body element
    root = root.find('body')
    if root is None:
        logging.warn("No body on this document")
        logging.warn(html)
        return "<div></div>"
    root.tag = 'div'

    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    #FIXME Skipping dtd validation because of slow DTD creation speed
    # validate_dtd(html, f):

    return etree.tostring(root)

Example #18

0

Show file

    def process_response(self, request, response):
        if not _has_tidylib or not self._is_html(request, response):
            return response

        html, errors = tidylib.tidy_document(response.content, self._options, keep_doc=True)
        if not errors:
            return response

        # Filter out what we care about
        err_list = errors.rstrip().split("\n")
        err_list = self._filter_warnings(err_list)
        if not err_list:
            return response

        try:
            fn = urlresolvers.resolve(request.path)[0]
            fn_name = "%s.%s" % (fn.__module__, fn.__name__)
        except:
            fn_name = "<unresolved_url>"

        # Write the two versions of html out for offline debugging
        filename = os.path.join(self._outdir, fn_name)

        result = (
            "HTML tidy result: %s [%s]:"
            "\n\t%s"
            "\nPlease see %s.orig %s.tidy\n-------" % (request.path, fn_name, "\n\t".join(err_list), filename, filename)
        )

        file(filename + ".orig", "w").write(i18n.smart_str(response.content))
        file(filename + ".tidy", "w").write(i18n.smart_str(html))
        file(filename + ".info", "w").write(i18n.smart_str(result))

        self._logger.error(result)
        return response

Example #19

0

Show file

File: bvb.py Project: orithena/sportswarnbot

def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime

Example #20

0

Show file

File: realtestmodel.py Project: BelloHe/Malicious_Website_Detection

def nofoutofplacefeatures(url):
	try:
	

	#	pdb.set_trace()

		if url[:4]=="http":
			r = requests.get(url)
		else:
			url="http://"+url
			r  = requests.get(url)

		#r = requests.get(url)
		data = r.text
		data2=r.content

		document, errors = tidy_document(data,
		  options={'numeric-entities':1})

		#print document
		#print errors
		#print "Number of Elements Out of Place : " + str(len(errors))
		return len(errors)
	except:
		pass

Example #21

0

Show file

File: sciebo_shares.py Project: BenediktAllendorf/snippets

def get_employees(lastname, firstname):
    payload = { 'find' : lastname }
    res = requests.get('https://www.campus.rwth-aachen.de/rwth/all/lecturerlist.asp', params=payload)
    if res.status_code == 200:
        persons = [ ]
        
        document, errors = tidy_document(res.content, options={'numeric-entities': 1, 'output_xhtml': 1})
        tree = ET.fromstring(strip_ns(document))
        
        try:
            filename = posixpath.basename(urlparse.urlsplit(res.url).path)
            if filename == 'lecturer.asp':
                fullname = tree.find('body/table[1]/tr[3]//tr[2]/td[2]').text.strip()
                unit = tree.find("body/table[2]//td[@class='h3']/a").text.strip()
            
                persons.append(fullname)

            elif filename == 'lecturerlist.asp':
                links = [ ]
                for cell in tree.findall('body/table[2]//td[3]/table[2]//td[1]/a'):
                    if cell is not None:
                        fullname = cell.text.strip()
                        persons.append(fullname)
            else:
                raise Exception
        except:
            print "===> WARNING: failed to get employee list for: %s, %s" % (firstname, lastname)
        
        return persons

Example #22

0

Show file

File: notilitus.py Project: arychj/Notilitus

def sanitize(note):
	debug('Sanitizing note content...', 2)

	if get_setting('evernote/sanitize/@applytemplate') == 'True':
		with open(get_setting('evernote/sanitize/template/text()'), 'r') as file:
			template = file.read()
			template = template.replace('{content}', note['content'])
			
		note['content'] = transform(template)
		
		preservedElements = []
		preservePattern = get_setting('evernote/sanitize/preserve/pattern/text()')
		preserves = get_setting('evernote/sanitize/preserve/elements/text()').split(',')
		for preserve in preserves:
			matches = re.findall(preservePattern.format(preserve), note['content'])
			for match in matches:
				placeholder = '{%s}' % uuid.uuid4().hex
				preservedElements.append({'placeholder': placeholder, 'element': match})
				note['content'] = note['content'].replace(match, placeholder, 1)
	
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/empty/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/prohibited/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/elements/text()'), '', note['content'])
		note['content'] = note['content'].encode('utf-8', errors='ignore')
		(note['content'], errors) = tidy_document(note['content'])

		for element in preservedElements:
			note['content'] = note['content'].replace(element['placeholder'], element['element'])
	
	if note['title'] != None:
		note['title'] = note['title'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
	else:
		note['title'] = get_setting('evernote/sanitize/defaulttitle/text()')

Example #23

0

Show file

File: main.py Project: bradmontgomery/word2html

def convert_to_html(filename):
    # Do the conversion with pandoc
    output = pypandoc.convert(filename, 'html')

    # Clean up with tidy...
    output, errors = tidy_document(output,  options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)

    # replace smart quotes.
    output = output.replace(u"\u2018", '&lsquo;').replace(u"\u2019", '&rsquo;')
    output = output.replace(u"\u201c", "&ldquo;").replace(u"\u201d", "&rdquo;")

    # write the output
    filename, ext = os.path.splitext(filename)
    filename = "{0}.html".format(filename)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)

    print("Done! Output written to: {}\n".format(filename))

Example #24

0

Show file

File: homepage.py Project: adam139/xsgs.theme

    def _tidysrc(self,data,srccode):
        """tidy scribe the html src"""

        try:
            from tidylib import tidy_document
            BASE_OPTIONS = {
    "output-xhtml": 1,     # XHTML instead of HTML4
    "indent": 1,           # Pretty; not too much of a performance hit
    "tidy-mark": 0,        # No tidy meta tag in output
    "wrap": 0,             # No wrapping
    "alt-text": "",        # Help ensure validation
    "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
    "force-output": 1,     # May not get what you expect but you will get something
    "char-encoding":'utf-8',
    "input-encoding":srccode,
    "output-encoding":'utf-8',
    }
            if not isinstance(data, unicode):                
                try:
                    data = data.decode(srccode)
                except:
                    pass
            doc, errors = tidy_document(data,options={'numeric-entities':1})
            return doc
        except:
            return data

Example #25

0

Show file

File: DocsTest.py Project: 18600597055/hue

 def test_doc_with_unicode_subclass(self):
     class MyUnicode(unicode):
         pass
     
     h = MyUnicode(u"unicode string ß")
     expected = unicode(DOC, 'utf-8') % h
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)

Example #26

0

Show file

File: templates.py Project: edmw/scripts

 def pretty(self, html):
     soup = BeautifulSoup(html, "html5lib")
     document, errors = tidy_document(soup.encode(formatter="html"), options={
         'char-encoding': 'utf8',
         'output-encoding': 'utf8',
         'doctype': 'html5'
     })
     return document

Example #27

0

Show file

File: validator.py Project: hyperweek/django-debug-toolbar

 def init_stats(self):
     self.source = self.response.content.decode('utf-8')
     if tidylib:
         document, errors = tidylib.tidy_document(self.source, options={'numeric-entities':1})
     else:
         errors = ""
     if errors:
         self.errors = [e.groupdict() for e in error_re.finditer(errors)]

Example #28

0

Show file

File: DocsTest.py Project: waylan/pytidylib

 def test_doc_with_unicode_subclass(self):
     class MyUnicode(utype):
         pass
     
     h = MyUnicode("unicode string ß")
     expected = DOC % h
     doc, err = tidy_document(h, {'output_xhtml':1})
     self.assertEqual(doc, expected)

Example #29

0

Show file

File: transform.py Project: koslab/platocdp.devoops

 def tidy(self, data):
     document, errors = tidy_document(data, {
         'input-xml': True, 'output-xml': True,
         'preserve-entities': True, 'numeric-entities': True
     })
     if errors:
         print errors
     return document

Example #30

0

Show file

File: panels.py Project: flenter/django-dtpanel-htmltidy

    def process_response(self, request, response):
        document, errors = tidy_document(response.content, options={"numeric-entities": 1})
        self.log_data = (document, errors)
        self.src_content = response.content
        errors_list = errors.split("\n")
        self.errors_count = len([err for err in errors_list if "error:" in err.lower()])
        self.warns_count = len([err for err in errors_list if "warning:" in err.lower()])

        return response

Example #31

0

Show file

def validate_html(endpoint, document):
    """
    This function can be used to make sure HTML returned is valid
    It raises an exception describing what's wrong then non-valid HTML was entered
    :param endpoint: name of the function which returned the html content
    :param document: the html content
    :return: None
    """
    tidied, errors = tidy_document(document)
    if errors:
        raise SystemError(
            "Errors were found in the following HTML returned by function {}:\n{}\n\nErrors:\n{}"
            .format(endpoint, document, errors))

Example #32

0

Show file

 def pretty(self, tidy_warnings=False) -> str:
     '''Like render() but format through tidylib'''
     txt, errors = tidy_document(
         self.render(), {
             'indent': 1,
             'output-xhtml': True,
             'force-output': 1,
             'doctype': 'strict',
             'show-warnings': tidy_warnings
         })
     if errors:
         print('HTML tidy: ' + str(errors), file=sys.stderr)
     return txt

Example #33

0

Show file

 def test_post_note(self):
     code, error = tidylib.tidy_document(
         self._my_class.post_note(**{
             "author": "test",
             "message": "test"
         }),
         options={
             "show-errors": 1,
             "show-warnings": 0
         },
     )
     self.assertFalse(error,
                      "/notes/post_note does not return valid html page")

Example #34

0

Show file

 def test_select_item_search(self):
     random_item = random.sample(list(Book.selectBy(status="STOCK")), 1)[0]
     code, error = tidylib.tidy_document(
         self._my_class.select_item_search(
             title=random_item.title.booktitle),
         options={
             "show-errors": 1,
             "show-warnings": 0
         },
     )
     self.assertFalse(
         error,
         "/register/select_item_search does not return valid html page")

Example #35

0

Show file

 def clean_html(self, htmlfile):
     try:
         reader = open(htmlfile, 'r')
         content = reader.read()
         reader.close()
         document, errors = tidy.tidy_document(content, options=tidy_options)
         if document:
             writer = open(htmlfile, 'w')
             writer.write(document)
             writer.close()
         print "Cleaned", htmlfile
     except Exception, e:
         print e

Example #36

0

Show file

def get_cvk_page(url):
    '''Повертає текст сторінки або нічого
    '''
    res = s.get(url, headers=headers, verify=False)
    # print(res.encoding)
    # print(res.url)
    res.encoding = "utf-8"
    if res.status_code != 200:
        print(f"Error <= {url}")
        return
    tidy, errors = tidy_document(res.text)
    # print(errors)
    return tidy

Example #37

0

Show file

 def test_select_item_for_isbn_search_functional(self):
     random_item = random.sample(list(Title.select()), 1)[0]
     response = self._my_app.get("/admin/select_item_for_isbn_search",
                                 {"isbn": random_item.isbn})
     code, error = tidylib.tidy_document(response.body,
                                         options={
                                             "show-errors": 1,
                                             "show-warnings": 0
                                         })
     self.assertFalse(
         error,
         "/admin/select_item_for_isbn_search did not return proper response"
     )

Example #38

0

Show file

    def parse_html(self):
        content, errors = tidy_document(self.page,
                                        options={'char-encoding': 'utf8'})
        #eq_(len(errors), 0 , "Errors found in HTML document:\n%s" % errors)

        b = content.find('nominatim_results =')
        e = content.find('</script>')
        content = content[b:e]
        b = content.find('[')
        e = content.rfind(']')

        self.result = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(
            content[b:e + 1])

Example #39

0

Show file

File: parserLattes.py Project: Webteg/crawlerlattes

    def __init__(self, idMembro, cvLattesHTML):
        HTMLParser.__init__(self)

        # inicializacao obrigatoria
        self.idMembro = idMembro
        self.sexo = 'Masculino'
        self.nomeCompleto = u'[Nome-nao-identificado]'

        self.item = ''
        self.issn = ''
        self.listaIDLattesColaboradores = []
        self.listaFormacaoAcademica = []
        self.listaAreaDeAtuacao = []
        self.listaIdioma = []

        self.listaArtigoEmPeriodico = []
        self.listaLivroPublicado = []
        self.listaCapituloDeLivroPublicado = []
        self.listaTextoEmJornalDeNoticia = []
        self.listaTrabalhoCompletoEmCongresso = []
        self.listaResumoExpandidoEmCongresso = []
        self.listaResumoEmCongresso = []
        self.listaArtigoAceito = []
        self.listaApresentacaoDeTrabalho = []
        self.listaOutroTipoDeProducaoBibliografica = []


        self.listaParticipacaoEmEvento = []
        self.listaOrganizacaoDeEvento = []


        # inicializacao para evitar a busca exaustiva de algumas palavras-chave
        self.salvarAtualizacaoCV = 1
        self.salvarFoto = 1
        self.procurarCabecalho = 0
        self.achouGrupo = 0
        self.doi = ''
        self.relevante = 0
        self.idOrientando = ''
        self.complemento = ''

        # contornamos alguns erros do HTML da Plataforma Lattes
        cvLattesHTML = cvLattesHTML.replace("<![CDATA[","")
        cvLattesHTML = cvLattesHTML.replace("]]>","")
        cvLattesHTML = cvLattesHTML.replace("<x<","&lt;x&lt;")
        cvLattesHTML = cvLattesHTML.replace("<X<","&lt;X&lt;")

        # feed it!
        cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1})

        self.feed(cvLattesHTML)

Example #40

0

Show file

    def _validateHTML(self, moFile):
        """
        This function validates the file ``moFile`` for correct html syntax.

        :param moFile: The name of a Modelica source file.
        :return: (str, str) The tidied markup [0] and warning/error
                 messages[1]. Warnings and errors are returned
                 just as tidylib returns them.

        """
        from tidylib import tidy_document

        entries = self._getInfoRevisionsHTML(moFile)

        # Document header
        header = "<?xml version='1.0' encoding='utf-8'?> \n \
        <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \
    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"> \n \
<html xmlns=\"http://www.w3.org/1999/xhtml\"> \n \
<head> \n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> \n \
<title>xxx</title> \n \
</head> \n \
<body> \n \
<!-- +++++++++++++++++++++++++++++++++++++ -->\n"

        body = ""
        for line in entries:
            body += line + '\n'
        # Replace \" with "
        body = body.replace('\\"', '"')

        # Document footer
        footer = "<!-- +++++++++++++++++++++++++++++++++++++ -->\n \
</body>\n \
</html>"

        # Validate the string
        document, errors = tidy_document(r"%s%s%s" % (header, body, footer),
                                         options={
                                             'numeric-entities': 1,
                                             'output-html': 1,
                                             'alt-text': '',
                                             'wrap': 72
                                         })
        # Write html file.
        if self._writeHTML:
            htmlName = "%s%s" % (moFile[0:-2], "html")
            with open(htmlName, mode="w", encoding="utf-8") as f:
                f.write(document)
        return (document, errors)

Example #41

0

Show file

File: fetch.py Project: Ramblurr/philosophy-of-tramping

def wrap_html(curr_chapter):
    with open(curr_chapter['local'], 'r+', encoding='utf-8') as f:
        content = f.read()
        header = """<?xml version='1.0' encoding='utf-8'?>
            <html xmlns="http://www.w3.org/1999/xhtml">
            <head><title>%s</title></head>
            <body><h1 class="chapter">%s</h1>""" % (curr_chapter['title'],
                                    curr_chapter['title'])
        footer = "</body></html>"
        doc = "%s%s%s" % (header, content, footer)
        clean = tidy_document(doc, options=TIDY_OPTS)
        f.seek(0)
        f.write(clean[0])
        f.truncate()

Example #42

0

Show file

File: crawler_test.py Project: ion-plugged/test-NAV

def test_page_should_be_valid_html(page):
    if page.response != 200:
        pytest.skip("not validating non-reachable page")
    if not page.content_type or 'html' not in page.content_type.lower():
        pytest.skip("not attempting to validate non-html page")
    if not should_validate(page.url):
        pytest.skip("skip validation of blacklisted page")
    if not page.content:
        pytest.skip("page has no content")

    document, errors = tidy_document(page.content, TIDY_OPTIONS)
    errors = filter_errors(errors)

    assert not errors, "Found following validation errors:\n" + errors

Example #43

0

Show file

File: peticiones.py Project: JaviBF92/TransporBot

def get_stations():
    try:
        web = requests.get(
            'http://horarios.renfe.com/cer/hjcer300.jsp?NUCLEO=30&CP=NO&I=s#',
            timeout=4).text
    except Timeout:
        return None
    else:
        document, errors = tidy_document(web)
        bs = BeautifulSoup(document, 'html.parser')
        estaciones = bs.find('select', {"name": "o"}).findAll('option')
        estaciones_ids = [(option.text.strip().replace(" ", "").lower(),
                           option['value']) for option in estaciones][1:]
        return {key: value for (key, value) in estaciones_ids}

Example #44

0

Show file

File: process.py Project: Keson96/ConEx

 def clean(cls, html, tidy=True, body_only=False):
     """
     clean html document
     """
     if body_only:
         cleaner = cls.cleaner_body
     else:
         cleaner = cls.cleaner
     if tidy:
         document, errors = tidy_document(html)
         cleaned = cleaner.clean_html(document)
     else:
         cleaned = cleaner.clean_html(html)
     return cleaned

Example #45

0

Show file

File: importa_leis.py Project: brunogfranca/qualcandidato

def carrega_dados_lei(lei):
    response = urllib2.urlopen(lei.get('url'))
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)

    titulo = parsed_html.find('h4').text.strip()
    descricao = parsed_html.find('td', attrs={'id': 'prim_col'}).text.strip()
    dados = {'titulo_completo': titulo, 'descricao': descricao}
    if lei.has_key('voto'):
        del (lei['voto'])
    lei.update(dados)
    return lei

Example #46

0

Show file

    def htmlCorrection(
        self,
        htmlStr: str,
        substitutions_dict: dict = {
            '"': '\\"',
            '<br>': '<br />',
            '<br/>': '<br />'
        }
    ) -> (str, str):
        """Returns cleaned html code and found errors

        Calls tidylib which will produce a clean version of the html code
        and also the errors that it has found.

        Parameters
        ----------
        htmlStr : str
                The html code as a single string.
        substitutions_dict : dict
                A dictionary with key:value pairs for old and new text.
                The html code must be escaped in Modelica. Generate properly
                escaped code we need to add the escape characters. All the
                while we can replace html errors that Dymola introduces.
                i.e. '<br>' -> '<br />'

        Returns
        -------
        str
                The tidy html code with escape characters as one string.
        str
                The error messages from tidylib.
        """
        from tidylib import tidy_document

        # Validate the string
        htmlCorrect, errors = tidy_document(f"{htmlStr}",
                                            options={
                                                'doctype': 'html5',
                                                'show-body-only': 1,
                                                'numeric-entities': 1,
                                                'output-html': 1,
                                                'wrap': 72,
                                                'alt-text': '',
                                            })

        document_corr = HTML_Tidy.make_string_replacements(
            self, theString=htmlCorrect, substitutions_dict=substitutions_dict)

        return document_corr, errors

Example #47

0

Show file

 def __parse(self,
             url: str = None,
             html: str = None,
             cookies: str = None) -> (dict, str):
     """
     Make an HTML/URL parsing by processing ALL found tags
     :param url: The url to parse (or None)
     :param html: The html page to parse as string (or None)
     :param cookies: The cookies to use on parsing
     :return: dictionary of tags, cookies
     """
     self.url = None
     self.base_url = None
     is_image = False
     if url is not None:
         self.url = url
         url_parsed = urlparse(url)
         self.url_scheme = str(url_parsed.scheme)
         self.base_url = self.url_scheme + '://' + str(url_parsed.netloc)
         r = HttpRequest.request(url, cookies=cookies)
         if r is None:
             return None
         if r.status_code >= 400 or r.headers.get(
                 'Content-Type') in HtmlParser._unacceptable_content_types:
             return None
         try:
             html = r.json()
             Log.warning('Trying to parse a json with HTML parser!')
         except ValueError:
             html = r.text
         if r.headers is not None:
             for k, v in r.headers.items():
                 if k.lower() == 'set-cookie':
                     cookies = v
         if HttpRequest.is_image(r):
             is_image = True
             xmp_start = html.find('<x:xmpmeta')
             xmp_end = html.find('</x:xmpmeta')
             xmp_str = html[xmp_start:xmp_end + 12]
             html = xmp_str
     if is_image:
         sorted_html = html
     else:
         sorted_html, errors = tidy_document(
             html)  # Sort html (and fix errors)
     self.feed(sorted_html)
     if cookies is None:
         cookies = ''
     return self.tags, cookies

Example #48

0

Show file

File: template_validations.py Project: Ali-Armstrong/Inboxer

 def validate_html_code(template, mongodb, redis_db):
     _id_key = redis_db.incr('html_log')
     document, errors = tidy_document(template,
                                      options={
                                          'numeric-entities': 1,
                                          'char-encoding': 'utf8'
                                      })
     errors = errors.split('\n')
     mongodb.html_log.insert({
         '_id': _id_key,
         'time': get_time.datetime_to_secs(),
         'tmpl': template,
         'log': errors
     })
     return document

Example #49

0

Show file

File: tidyHTML.py Project: modelica-3rdparty/ExternData

def _tidyHTML(doc):
    body = ''.join(doc)
    body = body.replace('\\"', '"')

    _, errors = tidy_document(r'{0}'.format(body), options=TIDY_OPTIONS)

    error_list = errors.rstrip().split('\n')
    errors = []
    for error in error_list:
        error = error.rstrip().split('\n')
        for err in error:
            if bool(err) and not any(id in err for id in IGNORE_IDS):
                errors.append(err)

    return errors

Example #50

0

Show file

    def validate(self):
        document, errors = tidylib.tidy_document(self.content,
                                                 options=self.TIDY_OPTIONS)

        for line in errors.splitlines():
            for marker in self.markers:
                pattern = r"line \d+ column \d+ - {}:".format(marker)
                if re.match(pattern, line, flags=re.IGNORECASE):
                    self.log.warning("Tidy report:\n%s", errors)
                    raise ValidateError("invalid HTML content")

        if self.permissive and errors:
            self.log.debug("Tidy report:\n%s", errors)
        self.log.info("Clean HTML document")
        return document  # return its XHTML version

Example #51

0

Show file

File: get_quizzes.py Project: njoroge33/py_learn

def get_quiz_content(page):
    try:
        soup = BeautifulSoup(page, 'html.parser')
        q_desc_html = soup.find(class_='story')
        x_c = [tidy_document(str(x)) for x in q_desc_html.find_all('p')]

        x_s = BeautifulSoup(x_c[0][0], 'html.parser')
        q_desc = x_s.find('p').text.strip()
        return [x for x in q_desc.split('.') if x]
    except Exception as err:
        q_c = [x for x in q_desc_html.descendants][:2]
        if isinstance(q_c[1], bs4.element.Tag):
            q_c[1] = q_c[1].text
        else:
            q_c[1] = ''
        return q_c

Example #52

0

Show file

File: scrapy_openkansas.py Project: flosokaks/scrapy_ks

    def process_item(self, item, spider):
        data = pprint.pformat(item['content'])
        self.write(item, data, "html")

        document, errors = tidy_document(data, options={'numeric-entities': 1})

        self.write(item, document, "html.tidy")
        self.write(item, errors, "html.errors")

        #        my_reporter.read(html=data)
        #        data= my_reporter.report_news()

        self.write(item, "\n".join(item['links']), "links.txt")

        #
        return item

Example #53

0

Show file

File: validate.py Project: svopper/SDAB-Validator

def validate(folder, files):
    for fileName in files:
        if fileName.lower().endswith(".html"):
            try:
                file = open(fileName, "r")
                filetext = "".join([s for s in file.readlines()])
                _, errors = tidy_document(filetext,
                                          options={"numeric-entities": 1})
                if len(errors) == 0:
                    return "HTML was successfully validated with no errors\n"
                return "HTML was validated, with following errors:\n  - " + errors.replace(
                    "\n", "\n  - ").strip("  - ")
            except Exception as _:
                return "An error occured while validating html file\n"
            finally:
                file.close()

Example #54

0

Show file

    def corrigirHTML(self, cvLattesHTML):
        extended_chars = u''.join(
            unichr(c)
            for c in xrange(127, 65536, 1))  # srange(r"[\0x80-\0x7FF]")
        special_chars = ' -' ''
        cvLattesHTML = cvLattesHTML.decode(
            'iso-8859-1', 'replace')  #+extended_chars+special_chars
        #cvLattesHTML  = cvLattesHTML.decode('ascii','replace')+extended_char+special_chars # Wed Jul 25 16:47:39 BRT 2012

        # contornamos alguns erros do HTML da Plataforma Lattes
        cvLattesHTML = cvLattesHTML.replace("<![CDATA[", "")
        cvLattesHTML = cvLattesHTML.replace("]]>", "")
        arquivoHTML, errors = tidy_document(cvLattesHTML,
                                            options={'numeric-entities': 1})
        #print errors
        return arquivoHTML

Example #55

0

Show file

def checkurl_html(url: str,
                  status_code: Optional[int] = 200,
                  mimetype: Optional[str] = 'text/html',
                  has: List[str] = [],
                  hasnot: List[str] = []) -> str:
    __tracebackhide__ = True
    document = checkurl_text(url, status_code, mimetype, has, hasnot)
    if html_validation:
        for line in tidy_document(document,
                                  options=TIDY_OPTIONS)[1].split('\n'):
            if not line:
                continue
            if 'trimming empty <span>' in line:
                continue
            pytest.fail(f'tidy error: {line}')
    return document

Example #56

0

Show file

File: importa_excelencias.py Project: brunogfranca/qualcandidato

def carrega_dados_politico(idx):
    response = urllib2.urlopen('http://www.excelencias.org.br/@parl.php?id=%s'%idx)
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)

    conteudo = parsed_html.body.find('div', attrs={'id':'conteudo'}).find_all('div', attrs={'id':'contem_parl'})
    if len(conteudo) <= 4:
        return

    bloco_principal = conteudo[0]
    bloco_votacoes = None
    for bloco in conteudo:
        if bloco.find('div', attrs={'id':'contem_titulo_parl'}).text.strip() == u'Como votou matérias no Plenário':
            bloco_votacoes = bloco
    if not bloco_votacoes:
        return

    nome = bloco_principal.find('div', attrs={'id':'contem_titulo_parl'}).text.strip()
    if not nome:
        return
    tabela_votacoes = bloco_votacoes.find('table', attrs={'class':'livre'})

    lista_votos = []
    if tabela_votacoes:
        for linha in tabela_votacoes.find_all('tr'):
            titulo_lei = linha.find('td', attrs={'id':'prim_col'}).text.strip()
            voto = linha.find('td', attrs={'class':'esq'}).text.strip()
            link_lei = linha.find('td', attrs={'id':'prim_col'}).find('a')
            url_lei = ''
            if link_lei:
                cod, num, ano, casa = link_lei.get('href').replace('javascript:parent.traz_pl(', '').split(',')
                url_lei = 'http://www.excelencias.org.br/modulos/parl_projetolei.php?cod=%s&num=%s&ano=%s&casa=%s'
                url_lei = url_lei %(cod.replace("'", ''), num, ano, casa.replace(')', ''))

            lista_votos.append({
                'titulo': titulo_lei,
                'voto': voto,
                'url': url_lei
            })
    dados_politico = {
        'idx': int(idx),
        'nome': nome,
        'votos': lista_votos
    }
    return dados_politico

Example #57

0

Show file

File: utils.py Project: Connexions/osc-stylesheet-framework

def tidy_html(html):
    if hasattr(html, 'read'):
        html = html.read()
    html5, errors = tidy_document(
        html,
        options={
            'merge-divs':
            0,  # do not merge nested div elements - preserve semantic block structures
            'output-xml': 0,
            'indent': 1,
            'tidy-mark': 0,
            'wrap': 0,
            'alt-text': '',
            'doctype': 'html5',
            'markup': 1
        })
    return html5

Example #58

0

Show file

File: validate_html.py Project: reesporte/bookwyrm

def validate_html(html):
    """run tidy on html"""
    _, errors = tidy_document(
        html.content,
        options={
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )
    # idk how else to filter out these unescape amp errs
    errors = "\n".join(
        e for e in errors.split("\n")
        if "&book" not in e and "&type" not in e and "id and name attribute"
        not in e and "illegal characters found in URI" not in e
        and "escaping malformed URI reference" not in e)
    if errors:
        raise Exception(errors)

Example #59

0

Show file

File: modules.py Project: cash2one/sitecheck

    def process(self, request, response, report):
        global _tidy_available
        #TODO: Hash errors and don't log duplicate error sets (just a reference)
        if response.is_html and _tidy_available:
            try:
                doc, err = tidy_document(response.content,
                                         options=self.options)
            except:
                report.add_error('Unable to parse response')
            else:
                l = err.splitlines()
                if len(l) > 0:
                    for e in l:
                        report.add_message('{0}'.format(
                            re.sub('^line\\b', 'Line', e)))

                    report.add_message('Total: {0}'.format(len(l)))

Example #60

0

Show file

File: sanitize.py Project: zwd1990/django-th

def sanitize(html):
    # with from __future__ import unicode_litterals
    # tidy_document does not want other options at all
    # such as div merge char-encoding and so on
    document, errors = tidy_document(
        html, options={"output-xhtml": 1, "force-output": 1})

    try:
        parsed_dom = parseString(document)
        document_element = parsed_dom.documentElement
        remove_prohibited_elements(document_element)
        remove_prohibited_attributes(document_element)
        body = document_element.getElementsByTagName("body")[0]
        body.tagName = "en-note"
        return body.toxml()
    except ExpatError:
        return ''