Example #1
0
 def test_doc_with_entity(self):
     """An entity survives tidying verbatim by default, and becomes its
     numeric form when 'numeric-entities' is enabled."""
     source = "é"

     tidied, _ = tidy_document(source)
     self.assertEqual(tidied, DOC % "é")

     tidied, _ = tidy_document(source, {'numeric-entities': 1})
     self.assertEqual(tidied, DOC % "é")
Example #2
0
def _massage_diff_content(content):
    """Run *content* through HTML tidy and return the (document, errors)
    pair produced by tidy_document.

    pytidylib can raise UnicodeDecodeError on some inputs; in that case
    retry with the text explicitly UTF-8 encoded and decode the tidied
    result back to text.
    """
    tidy_options = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        return tidy_document(content, options=tidy_options)
    except UnicodeDecodeError:
        # Retry with an explicit encoding, then translate back.
        tidied, errors = tidy_document(content.encode('utf-8'),
                                       options=tidy_options)
        return tidied.decode('utf-8'), errors
Example #3
0
def scrape(slug, url, name, title=None):
    """Fetch *url*, tidy and parse it, localize the images found under
    the page's td#content cell, and (when `dbteeth` is truthy) persist
    the resulting HTML as a QuickPage keyed by *slug*.

    NOTE(review): depends on module-level `trace`, `dbteeth`, `getimage`,
    `no_namespaces` and `create_or_update` — confirm against the module.
    """
    f = urlopen(url)
    doc = f.read()

    # Tidy first so html5lib receives well-formed markup.
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    # The page content is expected in exactly one td#content cell.
    td = jQuery("td#content")
    assert len(td) == 1

    # Localize every image: fetch it and point src at the local copy.
    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
Example #4
0
    def html_clean(self, html):
        """Tidy *html* and strip it down to plain text.

        Returns the cleaned text, or None when tidy reports errors or
        the tidied markup cannot be parsed.
        """
        tidy_options = {
            'drop-proprietary-attributes': 1,
            'alt-text': '',
            'hide-comments': 1,
            'output-xhtml': 1,
            'show-body-only': 1,
            'clean': 1,
            'char-encoding': 'utf8',
            'show-warnings': 0,
            'show-info': 0,
        }
        (html, errors) = tidylib.tidy_document(html, options=tidy_options)
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception:
            # Failed to parse the html, thus failed to clean it, so we
            # must give up.
            return None
Example #5
0
 def __trading_years(self, instrument):
     """Yield the trading years listed on Sina's market-history page for
     *instrument* (the text of each <option> of the 'year' select)."""
     url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/'
            'vMS_MarketHistory/stockid/%s.phtml' % (instrument))
     response = urllib2.urlopen(url)
     document, errors = tidy_document(response.read())
     soup = BeautifulSoup(document)
     year_select = soup.find('select', attrs={'name': 'year'})
     for option in year_select.findAll('option'):
         yield option.getText()
    def process_response(self, request, response):
        """Validate HTML responses with tidy; raise HTMLValidationError
        when tidy reports any problems.  Non-HTML and empty responses
        pass straight through."""
        is_html = 'text/html' in response['Content-Type']
        if is_html and response.content:
            _, errors = tidy_document(response.content)
            if errors:
                raise HTMLValidationError(errors)

        return response
Example #7
0
def getMenu():
    storeFile = open("list.txt","r")
    txt = storeFile.read()
    storeFile.close()
    
    list=txt.split('\n\n\n')
    

 #   print list
    
    for store in list:    
#        print store
        rest = store.split('\n')
        if len(rest)!=3:
            break
        try:
            url=baseUrl+rest[2] +'menu'
            print url
            res=urlopen(url)
            html=res.read()    
         
            options = {'output-encoding':'utf8', 'output-xhtml':1 }
            document,errors = tidy_document(html,options)   
            
            filepath = dataDir+ (rest[2].split('/'))[2] + ".html"
            saveFile = open(filepath,"w")
            saveFile.write(document)
            saveFile.close()
            print filepath
        except :
            print "skip:"+url
Example #8
0
def cleanUpHTML(html, options=None):
    """Normalise *html* with BeautifulSoup and HTML Tidy.

    :param html: raw HTML text
    :param options: optional dict of tidy options overriding the defaults
    :return: (prettified_html, tidy_errors)
    """
    import tidylib
    tidylib.BASE_OPTIONS = {}

    default_options = {
        "force-output": 1,
        "output-xhtml": 1,
        "doctype": "strict",
        "drop-empty-paras": 1,
        "output-encoding": "utf8",
        "clean": 1,
        "bare": 1,
    }
    if options:
        # BUG FIX: dict has no extend(); the original raised
        # AttributeError whenever caller options were supplied.
        default_options.update(options)

    # first fix up footnotes so that HTMLTidy won't ditch them
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    footnoteFixer(soup)
    stripEmptyParagraphs(soup)

    html, errors = tidylib.tidy_document(soup.prettify(encoding=None), options=default_options)

    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    addMetaTag(soup, [('http-equiv', 'Content-type'), ('content', 'text/html; charset=utf-8')])

    return soup.prettify(encoding=None), errors
Example #9
0
def _tidy2xhtml5(html):
    """Tidy up a html4/5 soup to a parsable valid XHTML5.
    Requires tidy-html5 from https://github.com/w3c/tidy-html5 Installation: http://goo.gl/FG27n
    """
    html = _io2string(html)
    html = _pre_tidy(html) # Pre-process
    # BUG FIX: the options dict previously listed 'doctype' twice
    # ('strict' and then 'html5'); only the last literal key wins, so
    # the dead 'strict' entry has been removed and 'html5' kept.
    xhtml5, errors = tidy_document(html,
        options={
            'merge-divs': 0,       # do not merge nested div elements - preserve semantic block structrues
            'output-xml': 1,       # create xml output
            'indent': 0,           # Don't use indent, add's extra linespace or linefeeds which are big problems
            'tidy-mark': 0,        # No tidy meta tag in output
            'wrap': 0,             # No wrapping
            'alt-text': '',        # Help ensure validation
            'force-output': 1,     # May not get what you expect but you will get something
            'numeric-entities': 1, # remove HTML entities like e.g. nbsp
            'clean': 1,            # remove presentational clutter
            'bare': 1,
            'word-2000': 1,
            'drop-proprietary-attributes': 1,
            'enclose-text': 1,     # enclose text in body always with <p>...</p>
            'logical-emphasis': 1, # transforms <i> and <b> text to <em> and <strong> text
            # do not tidy all MathML elements! List of MathML 3.0 elements from http://www.w3.org/TR/MathML3/appendixi.html#index.elem
            'new-inline-tags': 'abs, and, annotation, annotation-xml, apply, approx, arccos, arccosh, arccot, arccoth, arccsc, arccsch, arcsec, arcsech, arcsin, arcsinh, arctan, arctanh, arg, bind, bvar, card, cartesianproduct, cbytes, ceiling, cerror, ci, cn, codomain, complexes, compose, condition, conjugate, cos, cosh, cot, coth, cs, csc, csch, csymbol, curl, declare, degree, determinant, diff, divergence, divide, domain, domainofapplication, el, emptyset, eq, equivalent, eulergamma, exists, exp, exponentiale, factorial, factorof, false, floor, fn, forall, gcd, geq, grad, gt, ident, image, imaginary, imaginaryi, implies, in, infinity, int, integers, intersect, interval, inverse, lambda, laplacian, lcm, leq, limit, list, ln, log, logbase, lowlimit, lt, maction, malign, maligngroup, malignmark, malignscope, math, matrix, matrixrow, max, mean, median, menclose, merror, mfenced, mfrac, mfraction, mglyph, mi, min, minus, mlabeledtr, mlongdiv, mmultiscripts, mn, mo, mode, moment, momentabout, mover, mpadded, mphantom, mprescripts, mroot, mrow, ms, mscarries, mscarry, msgroup, msline, mspace, msqrt, msrow, mstack, mstyle, msub, msubsup, msup, mtable, mtd, mtext, mtr, munder, munderover, naturalnumbers, neq, none, not, notanumber, note, notin, notprsubset, notsubset, or, otherwise, outerproduct, partialdiff, pi, piece, piecewise, plus, power, primes, product, prsubset, quotient, rationals, real, reals, reln, rem, root, scalarproduct, sdev, sec, sech, selector, semantics, sep, set, setdiff, share, sin, sinh, subset, sum, tan, tanh, tendsto, times, transpose, true, union, uplimit, variance, vector, vectorproduct, xor',
            'doctype': 'html5',
            })

    #return xhtml5
    # return the tree itself, there is another modification below to avoid
    # another parse
    return _post_tidy(xhtml5)
Example #10
0
    def process_response(self, request, response, spider):
        """Scrapy middleware hook: strip layout-only nested tables,
        <br> tags, scripts and list markup from the response body, run
        it through HTML tidy, and return the rewritten response.

        NOTE(review): treats response.body as a text string (Python 2 /
        str) — confirm for the Scrapy version in use.
        """
        # Index pages pass through untouched.
        if 'index-mo' in response.url:
            return response
        if 'index-do' in response.url:
            return response
        if 'index-e_types' in response.url:
            return response

        body = response.body
        
        # VERY UGLY...  need to get this done, so it's ugly for now...
        # Walk every <table>; a nested <table> containing no <tr> is
        # treated as layout junk and its open/close tags are excised.
        index = body.find("<table")
        while index != -1:
            inner = body.find("<table", index+6)
            endtable = body.find("</table", index+6)
            if inner != -1 and inner < endtable:
                # we have an inner table...
                if body.find("<tr", inner, endtable) != -1:
                    break  # if it's truely a table
                    
                else:
                    # Remove the inner "<table ...>" open tag...
                    start = inner-1
                    end = body.find(">", inner)+1
                    body = body[:start] + body[end:]
                    
                    # ...then re-locate and remove its "</table>" tag
                    # (offsets shifted after the first removal).
                    endtable = body.find("</table", index+6)
                    
                    start = endtable-1
                    end = body.find(">", endtable)+1
                    body = body[:start] + body[end:]
                    
            index = body.find("<table", index+6)
            
        # remove any <br> before we tidy it up
        body = self.br_re.sub('', body)
        body = self.empty_anchor_re.sub('', body)
        
        # Global tidy defaults; merged with the per-call options below.
        tidylib.BASE_OPTIONS = {
            "output-xhtml": 0,     # XHTML instead of HTML4
            "indent": 1,           # Pretty; not too much of a performance hit
            "tidy-mark": 0,        # No tidy meta tag in output
            "wrap": 0,             # No wrapping
            "alt-text": "",        # Help ensure validation
            "doctype": 'omit',     # Little sense in transitional for tool-generated markup...
            "force-output": 1,     # May not get what you expect but you will get something
        }
        body, _ = tidy_document(body, options={'drop-empty-paras':1,
                    'drop-font-tags':1,'enclose-text':1,'merge-divs':1,'fix-bad-comments':1})
            
        # Rewrite/strip markup with the middleware's precompiled regexes.
        body = self.link_re.sub('\g<1>', body)
        body = self.vmodl_re.sub('\g<1>', body)
        body = self.strong_re.sub('\g<1>', body)
        body = self.script_re.sub('', body)
        body = self.ul_re.sub('', body)
        body = self.li_end_re.sub('', body)
        body = self.li_re.sub('* ', body)
        body = self.ul_end_re.sub('', body)
        
        response = response.replace(body=body)
        return response
Example #11
0
 def test_xmlns_large_document_xml_corner_case(self):
     """Regression test for a weird Tidy edge case where the required
     output buffer size can be under-reported for certain large XHTML
     inputs; the document must still come back complete."""
     filler = 'A' * 7937
     html = ('<html xmlns="http://www.w3.org/1999/xhtml">'
             '<span><span>A</span></span>' + filler)
     doc, err = tidy_document(html, {'output-xml': 1})
     self.assertEqual(doc.strip()[-7:], "</html>")
Example #12
0
 def test_doc_with_unclosed_tag(self):
     """Tidy closes a dangling <p> and wraps/indents its text."""
     h = "<p>hello"
     expected = DOC % '''<p>
   hello
 </p>'''
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)
Example #13
0
def call():
    """Execute the prepared request against the test server and store
    the parsed response in world.results.

    Idempotent: returns immediately when results already exist.  The
    response is validated (charset, content type, tidy-clean HTML) and
    decoded according to the requested format (html / xml / json[p]).
    """
    if world.results:
        return

    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()

    # Default format: reverse requests return xml, everything else html.
    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        # HTML responses must be tidy-clean before we accept them.
        document, errors = tidy_document(page,
                             options={'char-encoding' : 'utf8'})
        assert(len(errors) == 0), "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        # JSON, optionally wrapped in a JSONP callback to unwrap first.
        if 'json_callback' in world.params:
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func)+1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)
    def run(self):
        """Main entry point.

        Run FuncInventory and, when the git tree changed, render the
        diff as inline-styled HTML and mail it out.
        """
        inventory = func_inventory.FuncInventory()
        inventory.run([
            'func-inventory',
            '--tree=%s' % self.config['git_repo'],
            '--modules=%s' % ','.join(self.config['modules']),
        ])

        diff = self.git_diff()
        if not diff:
            self.log('No changes detected.  Sleeping.')
            return

        self.log('CHANGE DETECTED in func-inventory.')

        converter = ansi2html.Ansi2HTMLConverter(
            dark_bg=self.config['dark_bg'],
            font_size=self.config['font_size'])
        html = converter.convert(diff)

        html, errors = tidylib.tidy_document(html)
        html = pypremailer.Premailer(html).premail()

        self.mail(html)
        self.log('Done mailing changes.')
   def marklogic_put_xml(self, item, spider_name):
       """PUT *item* into MarkLogic as XML via its REST endpoint."""
       # Request parameters: uri + collection, plus the optional transform.
       params = {'uri': item['uri'],
                 'collection': self.ml_collections or spider_name}
       if self.ml_transform != '':
           params['transform'] = self.ml_transform

       # Build the XML payload and undo dicttoxml's entity escaping.
       payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
       payload = (payload.replace('&lt;', '<')
                         .replace('&gt;', '>')
                         .replace('&apos;', "'")
                         .replace('&quot;', '"'))
       # Tidy in XML mode yields well-formed output.
       payload, errors = tidy_document(payload, options={'input-xml': 1})

       headers = {'Content-Type': 'application/xml'}

       # Item-level credentials/target override the pipeline defaults.
       ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
       logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])

       ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
       ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
       r = requests.put(ml_uri,
           params = params,
           auth = HTTPDigestAuth(ml_user, ml_pwd),
           data = payload,
           headers = headers)

       logging.info("PUT response: " + str(r.status_code) + ", " + r.text)
 def dynamic_test_method(self):
     """this function name doesn't matter much, it can start with `test`,
     but we're going to rename it dynamically below"""
     report_url = '/report?reportname=' + reportItem.metadata['action']
     response = self._my_app.get(report_url)
     code, error = tidylib.tidy_document(
         response.body, options={'show-errors': 1, 'show-warnings': 0})
     self.assertFalse(error, '%s did not return valid html page' % report_url)
Example #17
0
def html2enml(html):
    """Convert raw HTML into an ENML-compatible serialized <div>.

    Falls back to an empty "<div></div>" when the tidied document has
    no <body> element.
    """
    tidied, _ = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1
        }
    )

    root = fromstring(tidied)
    # XXX dirty hack to circumvent a bug in lxml parser: reserialize
    # and reparse once.
    root = fromstring(etree.tostring(root))

    logging.debug(etree.tostring(root))

    # tidy_document emits a full document, so pull out its <body>.
    body = root.find('body')
    if body is None:
        logging.warn("No body on this document")
        logging.warn(html)
        return "<div></div>"
    body.tag = 'div'

    body = remove_prohibited_elements(body)
    body = remove_prohibited_attributes(body)
    # FIXME Skipping dtd validation because of slow DTD creation speed
    # validate_dtd(html, f):

    return etree.tostring(body)
Example #18
0
    def process_response(self, request, response):
        """Middleware hook: run outgoing HTML through HTML tidy, log and
        dump any interesting markup problems for offline debugging.

        Always returns the response unmodified.
        """
        if not _has_tidylib or not self._is_html(request, response):
            return response

        html, errors = tidylib.tidy_document(response.content, self._options, keep_doc=True)
        if not errors:
            return response

        # Filter out what we care about
        err_list = errors.rstrip().split("\n")
        err_list = self._filter_warnings(err_list)
        if not err_list:
            return response

        try:
            fn = urlresolvers.resolve(request.path)[0]
            fn_name = "%s.%s" % (fn.__module__, fn.__name__)
        except:
            fn_name = "<unresolved_url>"

        # Write the two versions of html out for offline debugging
        filename = os.path.join(self._outdir, fn_name)

        result = (
            "HTML tidy result: %s [%s]:"
            "\n\t%s"
            "\nPlease see %s.orig %s.tidy\n-------" % (request.path, fn_name, "\n\t".join(err_list), filename, filename)
        )

        # BUG FIX: the original leaked file handles (file(...).write(...)
        # without close); context managers guarantee flush + close.
        with open(filename + ".orig", "w") as f:
            f.write(i18n.smart_str(response.content))
        with open(filename + ".tidy", "w") as f:
            f.write(i18n.smart_str(html))
        with open(filename + ".info", "w") as f:
            f.write(i18n.smart_str(result))

        self._logger.error(result)
        return response
Example #19
0
def fetch_data():
    """Scrape bvb.de for the next match and return (warning_text,
    match_datetime).

    Exits the process via sys.exit(1) when no upcoming match is listed
    on the page (signalled by an IndexError while parsing).
    """
    def bvbreplace(s):
        # Normalize any "...Dortmund..." team name to the short "BVB".
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    # Placeholder kickoff far in the future until parsed below.
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        # The tournament may have its own span; fall back to the third
        # generic span when it doesn't.
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
def nofoutofplacefeatures(url):
	"""Fetch *url* and return the number of problems HTML Tidy reports
	for the page, or None when the page cannot be fetched/tidied.

	BUG FIX: tidy_document returns the whole error report as a single
	string, so len(errors) was the *character* count, not the number of
	out-of-place elements; the report lines are counted instead.
	"""
	try:
		# Default to http:// when no scheme is given.
		if url[:4] != "http":
			url = "http://" + url
		r = requests.get(url)
		data = r.text

		document, errors = tidy_document(data,
		  options={'numeric-entities':1})

		# One reported problem per line of the tidy error report.
		return len(errors.splitlines())
	except Exception:
		# Deliberate best-effort: network/tidy failures return None.
		pass
def get_employees(lastname, firstname):
    """Look up employees on the RWTH campus lecturer search and return a
    list of full names (empty or partial on parse failure).

    Returns None implicitly when the HTTP request fails (status != 200).
    """
    payload = { 'find' : lastname }
    res = requests.get('https://www.campus.rwth-aachen.de/rwth/all/lecturerlist.asp', params=payload)
    if res.status_code == 200:
        persons = [ ]
        
        # Tidy to XHTML so ElementTree can parse it (after namespaces
        # are stripped).
        document, errors = tidy_document(res.content, options={'numeric-entities': 1, 'output_xhtml': 1})
        tree = ET.fromstring(strip_ns(document))
        
        try:
            # The search may redirect straight to a single lecturer page
            # (lecturer.asp) or show a result list (lecturerlist.asp).
            filename = posixpath.basename(urlparse.urlsplit(res.url).path)
            if filename == 'lecturer.asp':
                fullname = tree.find('body/table[1]/tr[3]//tr[2]/td[2]').text.strip()
                unit = tree.find("body/table[2]//td[@class='h3']/a").text.strip()
            
                persons.append(fullname)

            elif filename == 'lecturerlist.asp':
                links = [ ]
                for cell in tree.findall('body/table[2]//td[3]/table[2]//td[1]/a'):
                    if cell is not None:
                        fullname = cell.text.strip()
                        persons.append(fullname)
            else:
                raise Exception
        except:
            print "===> WARNING: failed to get employee list for: %s, %s" % (firstname, lastname)
        
        return persons
Example #22
0
def sanitize(note):
	"""Clean up a note's HTML content and title in place.

	When the 'applytemplate' setting is on, the content is wrapped in the
	configured template, transformed, scrubbed of configured attributes
	and elements, tidied, and elements matching the 'preserve' patterns
	are protected from tidy via unique placeholders.

	Fix vs. the original: the title check uses `is not None` instead of
	the non-idiomatic `!= None`.
	"""
	debug('Sanitizing note content...', 2)

	if get_setting('evernote/sanitize/@applytemplate') == 'True':
		with open(get_setting('evernote/sanitize/template/text()'), 'r') as file:
			template = file.read()
			template = template.replace('{content}', note['content'])

		note['content'] = transform(template)

		# Swap elements matching the preserve patterns for unique
		# placeholders so tidy cannot mangle them.
		preservedElements = []
		preservePattern = get_setting('evernote/sanitize/preserve/pattern/text()')
		preserves = get_setting('evernote/sanitize/preserve/elements/text()').split(',')
		for preserve in preserves:
			matches = re.findall(preservePattern.format(preserve), note['content'])
			for match in matches:
				placeholder = '{%s}' % uuid.uuid4().hex
				preservedElements.append({'placeholder': placeholder, 'element': match})
				note['content'] = note['content'].replace(match, placeholder, 1)

		# Strip configured attributes/elements, then run tidy.
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/empty/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/prohibited/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/elements/text()'), '', note['content'])
		note['content'] = note['content'].encode('utf-8', errors='ignore')
		(note['content'], errors) = tidy_document(note['content'])

		# Restore the preserved elements.
		for element in preservedElements:
			note['content'] = note['content'].replace(element['placeholder'], element['element'])

	if note['title'] is not None:
		note['title'] = note['title'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
	else:
		note['title'] = get_setting('evernote/sanitize/defaulttitle/text()')
Example #23
0
def convert_to_html(filename):
    """Convert *filename* to HTML with pandoc, tidy the result, replace
    smart quotes with named entities, and write it out as <name>.html."""
    output = pypandoc.convert(filename, 'html')

    output, errors = tidy_document(output, options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)

    # Smart quotes -> named HTML entities.
    quote_entities = (
        (u"\u2018", '&lsquo;'), (u"\u2019", '&rsquo;'),
        (u"\u201c", "&ldquo;"), (u"\u201d", "&rdquo;"),
    )
    for char, entity in quote_entities:
        output = output.replace(char, entity)

    # write the output
    filename, ext = os.path.splitext(filename)
    filename = "{0}.html".format(filename)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)

    print("Done! Output written to: {}\n".format(filename))
Example #24
0
    def _tidysrc(self, data, srccode):
        """Tidy-scrub the html src (best effort).

        :param data: html text or bytes
        :param srccode: source encoding used to decode byte input
        :return: the tidied document, or *data* unchanged on any failure

        Fixes vs. the original: the unused BASE_OPTIONS dict (dead code,
        never passed to tidy) was removed, and the bare excepts were
        narrowed so KeyboardInterrupt/SystemExit are not swallowed.
        """
        try:
            from tidylib import tidy_document
            # tidy_document wants text; decode byte strings first.
            if not isinstance(data, unicode):
                try:
                    data = data.decode(srccode)
                except (UnicodeDecodeError, LookupError):
                    pass  # hand tidy the raw input and hope for the best
            doc, errors = tidy_document(data, options={'numeric-entities': 1})
            return doc
        except Exception:
            # Deliberate best-effort: any failure returns the input as-is.
            return data
Example #25
0
 def test_doc_with_unicode_subclass(self):
     """tidy_document must accept subclasses of unicode, not only the
     exact unicode type."""
     class MyUnicode(unicode):
         pass

     source = MyUnicode(u"unicode string ß")
     doc, err = tidy_document(source)
     self.assertEqual(doc, unicode(DOC, 'utf-8') % source)
Example #26
0
 def pretty(self, html):
     """Return *html* normalized through html5lib and pretty-printed by
     tidy as UTF-8 HTML5."""
     soup = BeautifulSoup(html, "html5lib")
     tidy_options = {
         'char-encoding': 'utf8',
         'output-encoding': 'utf8',
         'doctype': 'html5'
     }
     document, _ = tidy_document(soup.encode(formatter="html"),
                                 options=tidy_options)
     return document
 def init_stats(self):
     """Decode the response body and, when pytidylib is available,
     collect tidy's diagnostics into self.errors."""
     self.source = self.response.content.decode('utf-8')
     errors = ""
     if tidylib:
         _, errors = tidylib.tidy_document(self.source,
                                           options={'numeric-entities': 1})
     if errors:
         self.errors = [e.groupdict() for e in error_re.finditer(errors)]
Example #28
0
 def test_doc_with_unicode_subclass(self):
     """tidy_document must accept subclasses of the text type, not only
     the exact type."""
     class MyUnicode(utype):
         pass

     source = MyUnicode("unicode string ß")
     doc, err = tidy_document(source, {'output_xhtml':1})
     self.assertEqual(doc, DOC % source)
Example #29
0
 def tidy(self, data):
     document, errors = tidy_document(data, {
         'input-xml': True, 'output-xml': True,
         'preserve-entities': True, 'numeric-entities': True
     })
     if errors:
         print errors
     return document
Example #30
0
    def process_response(self, request, response):
        """Tidy the response body, remember the (document, errors) pair
        and the raw body, and count tidy errors/warnings."""
        document, errors = tidy_document(response.content, options={"numeric-entities": 1})
        self.log_data = (document, errors)
        self.src_content = response.content
        report_lines = errors.split("\n")
        self.errors_count = sum(1 for line in report_lines if "error:" in line.lower())
        self.warns_count = sum(1 for line in report_lines if "warning:" in line.lower())

        return response
Example #31
0
def validate_html(endpoint, document):
    """
    This function can be used to make sure HTML returned is valid.
    It raises an exception describing what's wrong when non-valid HTML
    was entered.
    :param endpoint: name of the function which returned the html content
    :param document: the html content
    :return: None
    """
    _, errors = tidy_document(document)
    if not errors:
        return
    raise SystemError(
        "Errors were found in the following HTML returned by function {}:\n{}\n\nErrors:\n{}"
        .format(endpoint, document, errors))
Example #32
0
 def pretty(self, tidy_warnings=False) -> str:
     '''Like render() but formatted through tidylib; tidy diagnostics
     go to stderr.'''
     tidy_options = {
         'indent': 1,
         'output-xhtml': True,
         'force-output': 1,
         'doctype': 'strict',
         'show-warnings': tidy_warnings
     }
     txt, errors = tidy_document(self.render(), tidy_options)
     if errors:
         print('HTML tidy: ' + str(errors), file=sys.stderr)
     return txt
Example #33
0
 def test_post_note(self):
     """Posting a note must produce a tidy-clean HTML page."""
     page = self._my_class.post_note(author="test", message="test")
     code, error = tidylib.tidy_document(
         page,
         options={"show-errors": 1, "show-warnings": 0},
     )
     self.assertFalse(error,
                      "/notes/post_note does not return valid html page")
Example #34
0
 def test_select_item_search(self):
     """Searching by a random stocked book's title must render valid
     HTML."""
     stocked = list(Book.selectBy(status="STOCK"))
     random_item = random.sample(stocked, 1)[0]
     page = self._my_class.select_item_search(
         title=random_item.title.booktitle)
     code, error = tidylib.tidy_document(
         page,
         options={"show-errors": 1, "show-warnings": 0},
     )
     self.assertFalse(
         error,
         "/register/select_item_search does not return valid html page")
Example #35
0
 def clean_html(self, htmlfile):
     try:
         reader = open(htmlfile, 'r')
         content = reader.read()
         reader.close()
         document, errors = tidy.tidy_document(content, options=tidy_options)
         if document:
             writer = open(htmlfile, 'w')
             writer.write(document)
             writer.close()
         print "Cleaned", htmlfile
     except Exception, e:
         print e
Example #36
0
def get_cvk_page(url):
    """Fetch a CVK page and return its tidied HTML text, or None on failure."""
    response = s.get(url, headers=headers, verify=False)
    # The site does not always declare its encoding correctly; force UTF-8.
    response.encoding = "utf-8"
    if response.status_code != 200:
        print(f"Error <= {url}")
        return
    cleaned, _errors = tidy_document(response.text)
    return cleaned
Example #37
0
 def test_select_item_for_isbn_search_functional(self):
     """The ISBN search endpoint must answer with valid HTML."""
     picked = random.sample(list(Title.select()), 1)[0]
     response = self._my_app.get("/admin/select_item_for_isbn_search",
                                 {"isbn": picked.isbn})
     code, error = tidylib.tidy_document(
         response.body,
         options={"show-errors": 1, "show-warnings": 0})
     self.assertFalse(
         error,
         "/admin/select_item_for_isbn_search did not return proper response"
     )
Example #38
0
    def parse_html(self):
        """Extract the `nominatim_results` JSON array embedded in the page."""
        markup, _errors = tidy_document(self.page,
                                        options={'char-encoding': 'utf8'})

        # Narrow down to the script fragment holding the results variable.
        start = markup.find('nominatim_results =')
        end = markup.find('</script>')
        fragment = markup[start:end]

        # The JSON payload is the outermost [...] inside that fragment;
        # preserve key order while decoding.
        lo = fragment.find('[')
        hi = fragment.rfind(']')
        decoder = json.JSONDecoder(object_pairs_hook=OrderedDict)
        self.result = decoder.decode(fragment[lo:hi + 1])
Example #39
0
    def __init__(self, idMembro, cvLattesHTML):
        """Parse a Lattes CV (HTML) for the given member id.

        Works around known quirks of the Lattes HTML, runs it through
        tidy, then feeds the result to the HTMLParser machinery.
        """
        HTMLParser.__init__(self)

        # mandatory initialization
        self.idMembro = idMembro
        self.sexo = 'Masculino'
        self.nomeCompleto = u'[Nome-nao-identificado]'

        self.item = ''
        self.issn = ''
        self.listaIDLattesColaboradores = []
        self.listaFormacaoAcademica = []
        self.listaAreaDeAtuacao = []
        self.listaIdioma = []

        # per-category publication accumulators filled during feed()
        self.listaArtigoEmPeriodico = []
        self.listaLivroPublicado = []
        self.listaCapituloDeLivroPublicado = []
        self.listaTextoEmJornalDeNoticia = []
        self.listaTrabalhoCompletoEmCongresso = []
        self.listaResumoExpandidoEmCongresso = []
        self.listaResumoEmCongresso = []
        self.listaArtigoAceito = []
        self.listaApresentacaoDeTrabalho = []
        self.listaOutroTipoDeProducaoBibliografica = []


        self.listaParticipacaoEmEvento = []
        self.listaOrganizacaoDeEvento = []


        # initialization to avoid an exhaustive search for some keywords
        self.salvarAtualizacaoCV = 1
        self.salvarFoto = 1
        self.procurarCabecalho = 0
        self.achouGrupo = 0
        self.doi = ''
        self.relevante = 0
        self.idOrientando = ''
        self.complemento = ''

        # work around some errors in the Lattes platform's HTML
        cvLattesHTML = cvLattesHTML.replace("<![CDATA[","")
        cvLattesHTML = cvLattesHTML.replace("]]>","")
        cvLattesHTML = cvLattesHTML.replace("<x<","&lt;x&lt;")
        cvLattesHTML = cvLattesHTML.replace("<X<","&lt;X&lt;")

        # feed it!  numeric-entities keeps non-ASCII chars as &#NNN; refs
        cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1})

        self.feed(cvLattesHTML)
Example #40
0
    def _validateHTML(self, moFile):
        """
        This function validates the file ``moFile`` for correct html syntax.

        :param moFile: The name of a Modelica source file.
        :return: (str, str) The tidied markup [0] and warning/error
                 messages[1]. Warnings and errors are returned
                 just as tidylib returns them.

        """
        from tidylib import tidy_document

        # HTML snippets extracted from the file's info/revisions annotations.
        entries = self._getInfoRevisionsHTML(moFile)

        # Document header: XHTML 1.0 Transitional wrapper around the snippets.
        # (One string literal built with line continuations.)
        header = "<?xml version='1.0' encoding='utf-8'?> \n \
        <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \
    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"> \n \
<html xmlns=\"http://www.w3.org/1999/xhtml\"> \n \
<head> \n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> \n \
<title>xxx</title> \n \
</head> \n \
<body> \n \
<!-- +++++++++++++++++++++++++++++++++++++ -->\n"

        body = ""
        for line in entries:
            body += line + '\n'
        # Replace \" with " (the Modelica source escapes double quotes)
        body = body.replace('\\"', '"')

        # Document footer
        footer = "<!-- +++++++++++++++++++++++++++++++++++++ -->\n \
</body>\n \
</html>"

        # Validate the string
        document, errors = tidy_document(r"%s%s%s" % (header, body, footer),
                                         options={
                                             'numeric-entities': 1,
                                             'output-html': 1,
                                             'alt-text': '',
                                             'wrap': 72
                                         })
        # Write html file.  moFile[0:-2] strips the trailing "mo" extension.
        if self._writeHTML:
            htmlName = "%s%s" % (moFile[0:-2], "html")
            with open(htmlName, mode="w", encoding="utf-8") as f:
                f.write(document)
        return (document, errors)
Example #41
0
def wrap_html(curr_chapter):
    """Wrap a chapter file in an XHTML skeleton, tidy it, and rewrite it."""
    title = curr_chapter['title']
    with open(curr_chapter['local'], 'r+', encoding='utf-8') as f:
        body = f.read()
        header = """<?xml version='1.0' encoding='utf-8'?>
            <html xmlns="http://www.w3.org/1999/xhtml">
            <head><title>%s</title></head>
            <body><h1 class="chapter">%s</h1>""" % (title, title)
        footer = "</body></html>"
        tidied = tidy_document(header + body + footer, options=TIDY_OPTS)
        # Overwrite the file in place with the cleaned document.
        f.seek(0)
        f.write(tidied[0])
        f.truncate()
Example #42
0
def test_page_should_be_valid_html(page):
    """Run every reachable, non-blacklisted HTML page through tidy."""
    if page.response != 200:
        pytest.skip("not validating non-reachable page")
    content_type = page.content_type
    if not content_type or 'html' not in content_type.lower():
        pytest.skip("not attempting to validate non-html page")
    if not should_validate(page.url):
        pytest.skip("skip validation of blacklisted page")
    if not page.content:
        pytest.skip("page has no content")

    _document, raw_errors = tidy_document(page.content, TIDY_OPTIONS)
    remaining = filter_errors(raw_errors)

    assert not remaining, "Found following validation errors:\n" + remaining
Example #43
0
def get_stations():
    """Scrape the Renfe station <select> and map station names to ids.

    Returns None when the request times out.
    """
    url = 'http://horarios.renfe.com/cer/hjcer300.jsp?NUCLEO=30&CP=NO&I=s#'
    try:
        page = requests.get(url, timeout=4).text
    except Timeout:
        return None
    cleaned, _errors = tidy_document(page)
    soup = BeautifulSoup(cleaned, 'html.parser')
    options = soup.find('select', {"name": "o"}).findAll('option')
    # Skip the first <option> (placeholder); normalize names to
    # lowercase, space-free keys.
    pairs = [(opt.text.strip().replace(" ", "").lower(), opt['value'])
             for opt in options][1:]
    return dict(pairs)
Example #44
0
 def clean(cls, html, tidy=True, body_only=False):
     """Clean an HTML document.

     When body_only is true the body-only cleaner is used; when tidy is
     true the markup is passed through tidylib before cleaning.
     """
     cleaner = cls.cleaner_body if body_only else cls.cleaner
     source = html
     if tidy:
         source, _errors = tidy_document(html)
     return cleaner.clean_html(source)
def carrega_dados_lei(lei):
    # Load the full title and description for a law from its page and
    # merge them into the `lei` dict (Python 2 / urllib2 code path).
    response = urllib2.urlopen(lei.get('url'))
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        # Abort the whole scrape on malformed HTML.
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)

    titulo = parsed_html.find('h4').text.strip()
    descricao = parsed_html.find('td', attrs={'id': 'prim_col'}).text.strip()
    dados = {'titulo_completo': titulo, 'descricao': descricao}
    if lei.has_key('voto'):
        # Drop the vote field before merging the scraped data.
        del (lei['voto'])
    lei.update(dados)
    return lei
Example #46
0
    def htmlCorrection(
        self,
        htmlStr: str,
        substitutions_dict: dict = None
    ) -> (str, str):
        """Returns cleaned html code and found errors

        Calls tidylib which will produce a clean version of the html code
        and also the errors that it has found.

        Parameters
        ----------
        htmlStr : str
                The html code as a single string.
        substitutions_dict : dict
                A dictionary with key:value pairs for old and new text.
                The html code must be escaped in Modelica. Generate properly
                escaped code we need to add the escape characters. All the
                while we can replace html errors that Dymola introduces.
                i.e. '<br>' -> '<br />'
                When None (the default), the standard escape/replacement
                table built below is used.

        Returns
        -------
        str
                The tidy html code with escape characters as one string.
        str
                The error messages from tidylib.
        """
        from tidylib import tidy_document

        # A mutable default argument would be shared across every call;
        # build the default substitution table per call instead.
        if substitutions_dict is None:
            substitutions_dict = {
                '"': '\\"',
                '<br>': '<br />',
                '<br/>': '<br />'
            }

        # Validate the string
        htmlCorrect, errors = tidy_document(f"{htmlStr}",
                                            options={
                                                'doctype': 'html5',
                                                'show-body-only': 1,
                                                'numeric-entities': 1,
                                                'output-html': 1,
                                                'wrap': 72,
                                                'alt-text': '',
                                            })

        document_corr = HTML_Tidy.make_string_replacements(
            self, theString=htmlCorrect, substitutions_dict=substitutions_dict)

        return document_corr, errors
Example #47
0
 def __parse(self,
             url: str = None,
             html: str = None,
             cookies: str = None) -> (dict, str):
     """
     Make an HTML/URL parsing by processing ALL found tags
     :param url: The url to parse (or None)
     :param html: The html page to parse as string (or None)
     :param cookies: The cookies to use on parsing
     :return: dictionary of tags, cookies

     NOTE(review): despite the (dict, str) annotation this returns None
     on failed requests or unacceptable content types — callers must
     handle that case.
     """
     self.url = None
     self.base_url = None
     is_image = False
     if url is not None:
         self.url = url
         url_parsed = urlparse(url)
         self.url_scheme = str(url_parsed.scheme)
         self.base_url = self.url_scheme + '://' + str(url_parsed.netloc)
         r = HttpRequest.request(url, cookies=cookies)
         # Bail out on failed requests or content types we refuse to parse.
         if r is None:
             return None
         if r.status_code >= 400 or r.headers.get(
                 'Content-Type') in HtmlParser._unacceptable_content_types:
             return None
         try:
             # A JSON body is not parseable HTML; warn but continue with it.
             html = r.json()
             Log.warning('Trying to parse a json with HTML parser!')
         except ValueError:
             html = r.text
         if r.headers is not None:
             # Capture any session cookie handed back by the server.
             for k, v in r.headers.items():
                 if k.lower() == 'set-cookie':
                     cookies = v
         if HttpRequest.is_image(r):
             # For images, only the embedded XMP metadata block is parsed
             # (12 == len('</x:xmpmeta>'), keeping the closing tag).
             is_image = True
             xmp_start = html.find('<x:xmpmeta')
             xmp_end = html.find('</x:xmpmeta')
             xmp_str = html[xmp_start:xmp_end + 12]
             html = xmp_str
     if is_image:
         sorted_html = html
     else:
         sorted_html, errors = tidy_document(
             html)  # Sort html (and fix errors)
     self.feed(sorted_html)
     if cookies is None:
         cookies = ''
     return self.tags, cookies
 def validate_html_code(template, mongodb, redis_db):
     """Tidy a template, log the tidy report to Mongo, return clean markup."""
     log_id = redis_db.incr('html_log')
     tidy_options = {'numeric-entities': 1, 'char-encoding': 'utf8'}
     document, report = tidy_document(template, options=tidy_options)
     # One log entry per validation run, keyed by the Redis counter.
     mongodb.html_log.insert({
         '_id': log_id,
         'time': get_time.datetime_to_secs(),
         'tmpl': template,
         'log': report.split('\n')
     })
     return document
Example #49
0
def _tidyHTML(doc):
    """Tidy the joined document fragments and return non-ignored error lines."""
    markup = ''.join(doc).replace('\\"', '"')

    _, report = tidy_document(r'{0}'.format(markup), options=TIDY_OPTIONS)

    kept = []
    for line in report.rstrip().split('\n'):
        for piece in line.rstrip().split('\n'):
            # Skip blank lines and any diagnostic id we deliberately ignore.
            if piece and not any(ignored in piece for ignored in IGNORE_IDS):
                kept.append(piece)

    return kept
Example #50
0
    def validate(self):
        """Tidy self.content and raise ValidateError on marker-flagged errors."""
        document, report = tidylib.tidy_document(self.content,
                                                 options=self.TIDY_OPTIONS)

        for marker in self.markers:
            # Tidy prefixes each message with its location and severity.
            pattern = r"line \d+ column \d+ - {}:".format(marker)
            for line in report.splitlines():
                if re.match(pattern, line, flags=re.IGNORECASE):
                    self.log.warning("Tidy report:\n%s", report)
                    raise ValidateError("invalid HTML content")

        if self.permissive and report:
            self.log.debug("Tidy report:\n%s", report)
        self.log.info("Clean HTML document")
        return document  # return its XHTML version
Example #51
0
def get_quiz_content(page):
    # Extract the quiz description sentences from the page's "story" block;
    # on any failure, fall back to the block's first two descendants.
    try:
        soup = BeautifulSoup(page, 'html.parser')
        q_desc_html = soup.find(class_='story')
        # Tidy each <p> individually; tidy_document returns (html, errors).
        x_c = [tidy_document(str(x)) for x in q_desc_html.find_all('p')]

        x_s = BeautifulSoup(x_c[0][0], 'html.parser')
        q_desc = x_s.find('p').text.strip()
        # Split into non-empty sentences.
        return [x for x in q_desc.split('.') if x]
    except Exception as err:
        # NOTE(review): if soup.find() returned None above, q_desc_html is
        # None here and this fallback raises AttributeError — confirm the
        # 'story' block is always present for pages reaching this path.
        q_c = [x for x in q_desc_html.descendants][:2]
        if isinstance(q_c[1], bs4.element.Tag):
            q_c[1] = q_c[1].text
        else:
            q_c[1] = ''
        return q_c
Example #52
0
    def process_item(self, item, spider):
        """Persist the raw, tidied, and diagnostic artifacts for a scraped item."""
        raw_html = pprint.pformat(item['content'])
        self.write(item, raw_html, "html")

        # Keep both the cleaned markup and tidy's diagnostics on disk.
        tidied, tidy_errors = tidy_document(raw_html,
                                            options={'numeric-entities': 1})
        self.write(item, tidied, "html.tidy")
        self.write(item, tidy_errors, "html.errors")

        self.write(item, "\n".join(item['links']), "links.txt")

        return item
Example #53
0
def validate(folder, files):
    """Validate the first ``.html`` file in *files* with tidylib.

    Returns a human-readable summary string for the first HTML file
    found, or None when *files* contains no ``.html`` entry.
    *folder* is accepted for interface compatibility but unused here.
    """
    for file_name in files:
        if file_name.lower().endswith(".html"):
            try:
                # `with` guarantees the handle is closed even on error.
                # (The original finally-based cleanup raised NameError
                # when open() itself failed, and shadowed builtin `file`.)
                with open(file_name, "r") as handle:
                    filetext = handle.read()
                _, errors = tidy_document(filetext,
                                          options={"numeric-entities": 1})
                if len(errors) == 0:
                    return "HTML was successfully validated with no errors\n"
                return "HTML was validated, with following errors:\n  - " + errors.replace(
                    "\n", "\n  - ").strip("  - ")
            except Exception:
                return "An error occured while validating html file\n"
    return None
Example #54
0
    def corrigirHTML(self, cvLattesHTML):
        # Fix up raw Lattes CV HTML: decode from Latin-1, strip CDATA
        # wrappers that break parsing, then run the result through tidy.
        # (Python 2 code path: unichr/xrange.)
        extended_chars = u''.join(
            unichr(c)
            for c in xrange(127, 65536, 1))  # srange(r"[\0x80-\0x7FF]")
        special_chars = ' -' ''
        cvLattesHTML = cvLattesHTML.decode(
            'iso-8859-1', 'replace')  #+extended_chars+special_chars
        #cvLattesHTML  = cvLattesHTML.decode('ascii','replace')+extended_char+special_chars # Wed Jul 25 16:47:39 BRT 2012

        # work around some errors in the Lattes platform's HTML
        cvLattesHTML = cvLattesHTML.replace("<![CDATA[", "")
        cvLattesHTML = cvLattesHTML.replace("]]>", "")
        # numeric-entities keeps non-ASCII characters as &#NNN; references
        arquivoHTML, errors = tidy_document(cvLattesHTML,
                                            options={'numeric-entities': 1})
        #print errors
        return arquivoHTML
Example #55
0
def checkurl_html(url: str,
                  status_code: Optional[int] = 200,
                  mimetype: Optional[str] = 'text/html',
                  has: Optional[List[str]] = None,
                  hasnot: Optional[List[str]] = None) -> str:
    """Fetch *url*, check its text content, and optionally tidy-validate it.

    Delegates status/mimetype/content checks to checkurl_text; when
    html_validation is enabled, any tidy diagnostic other than the
    known-benign empty-<span> trimming fails the test.
    """
    __tracebackhide__ = True
    # Mutable list defaults would be shared between calls; use None
    # sentinels and substitute fresh empty lists per call.
    document = checkurl_text(url, status_code, mimetype,
                             [] if has is None else has,
                             [] if hasnot is None else hasnot)
    if html_validation:
        for line in tidy_document(document,
                                  options=TIDY_OPTIONS)[1].split('\n'):
            if not line:
                continue
            if 'trimming empty <span>' in line:
                continue
            pytest.fail(f'tidy error: {line}')
    return document
def carrega_dados_politico(idx):
    # Scrape a politician's page (by id) from excelencias.org.br and
    # return a dict with name and plenary-vote records, or None when the
    # page lacks the expected blocks (Python 2 / urllib2 code path).
    response = urllib2.urlopen('http://www.excelencias.org.br/@parl.php?id=%s'%idx)
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        # Abort the whole scrape on malformed HTML.
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)

    conteudo = parsed_html.body.find('div', attrs={'id':'conteudo'}).find_all('div', attrs={'id':'contem_parl'})
    if len(conteudo) <= 4:
        return

    bloco_principal = conteudo[0]
    bloco_votacoes = None
    # Locate the block titled "How he/she voted on plenary matters".
    for bloco in conteudo:
        if bloco.find('div', attrs={'id':'contem_titulo_parl'}).text.strip() == u'Como votou matérias no Plenário':
            bloco_votacoes = bloco
    if not bloco_votacoes:
        return

    nome = bloco_principal.find('div', attrs={'id':'contem_titulo_parl'}).text.strip()
    if not nome:
        return
    tabela_votacoes = bloco_votacoes.find('table', attrs={'class':'livre'})

    lista_votos = []
    if tabela_votacoes:
        for linha in tabela_votacoes.find_all('tr'):
            titulo_lei = linha.find('td', attrs={'id':'prim_col'}).text.strip()
            voto = linha.find('td', attrs={'class':'esq'}).text.strip()
            link_lei = linha.find('td', attrs={'id':'prim_col'}).find('a')
            url_lei = ''
            if link_lei:
                # The link is a javascript call carrying (cod, num, ano, casa);
                # rebuild the direct project-law URL from those arguments.
                cod, num, ano, casa = link_lei.get('href').replace('javascript:parent.traz_pl(', '').split(',')
                url_lei = 'http://www.excelencias.org.br/modulos/parl_projetolei.php?cod=%s&num=%s&ano=%s&casa=%s'
                url_lei = url_lei %(cod.replace("'", ''), num, ano, casa.replace(')', ''))

            lista_votos.append({
                'titulo': titulo_lei,
                'voto': voto,
                'url': url_lei
            })
    dados_politico = {
        'idx': int(idx),
        'nome': nome,
        'votos': lista_votos
    }
    return dados_politico
def tidy_html(html):
    """Normalize markup (string or file-like object) into tidied HTML5."""
    if hasattr(html, 'read'):
        html = html.read()
    tidy_options = {
        # do not merge nested div elements - preserve semantic block structures
        'merge-divs': 0,
        'output-xml': 0,
        'indent': 1,
        'tidy-mark': 0,
        'wrap': 0,
        'alt-text': '',
        'doctype': 'html5',
        'markup': 1,
    }
    cleaned, _errors = tidy_document(html, options=tidy_options)
    return cleaned
Example #58
0
def validate_html(html):
    """Run tidy over a response body and raise on any unexpected error."""
    _, report = tidy_document(
        html.content,
        options={
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )
    # Known-noisy diagnostics (unescaped &-params and URI quirks) are dropped.
    ignorable = ("&book", "&type", "id and name attribute",
                 "illegal characters found in URI",
                 "escaping malformed URI reference")
    remaining = "\n".join(
        line for line in report.split("\n")
        if not any(token in line for token in ignorable))
    if remaining:
        raise Exception(remaining)
Example #59
0
    def process(self, request, response, report):
        """Tidy-validate an HTML response and log every diagnostic line.

        No-op when the response is not HTML or tidylib could not be
        imported (tracked by the module-level _tidy_available flag).
        """
        global _tidy_available
        #TODO: Hash errors and don't log duplicate error sets (just a reference)
        if response.is_html and _tidy_available:
            try:
                doc, err = tidy_document(response.content,
                                         options=self.options)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed here.
                report.add_error('Unable to parse response')
            else:
                l = err.splitlines()
                if len(l) > 0:
                    for e in l:
                        # Tidy messages begin "line N column M"; capitalize.
                        report.add_message('{0}'.format(
                            re.sub('^line\\b', 'Line', e)))

                    report.add_message('Total: {0}'.format(len(l)))
Example #60
0
def sanitize(html):
    """Tidy *html* into XHTML and reduce it to an Evernote <en-note> body.

    Returns an empty string when the tidied markup still cannot be
    parsed as XML.
    """
    # tidy_document does not want other options at all
    # such as div merge char-encoding and so on
    tidied, _errors = tidy_document(
        html, options={"output-xhtml": 1, "force-output": 1})

    try:
        dom = parseString(tidied)
        root = dom.documentElement
        remove_prohibited_elements(root)
        remove_prohibited_attributes(root)
        note_body = root.getElementsByTagName("body")[0]
        note_body.tagName = "en-note"
        return note_body.toxml()
    except ExpatError:
        return ''