def test_doc_with_entity(self):
    """The accented character round-trips with and without numeric-entities."""
    snippet = "é"
    expected = DOC % "é"
    # Default options.
    doc, err = tidy_document(snippet)
    self.assertEqual(doc, expected)
    # Enabling numeric-entities must produce the same document here.
    doc, err = tidy_document(snippet, {'numeric-entities': 1})
    self.assertEqual(doc, expected)
def _massage_diff_content(content):
    # Normalize HTML content through tidy before it is diffed.
    tidy_options = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        content = tidy_document(content, options=tidy_options)
    except UnicodeDecodeError:
        # In case something happens in pytidylib we'll try again with
        # a proper encoding
        content = tidy_document(content.encode('utf-8'), options=tidy_options)
    # tidy_document returns a (document, errors) pair; decode the document
    # part back to unicode and re-pack the tuple.
    tidied, errors = content
    content = tidied.decode('utf-8'), errors
    return content
def scrape(slug, url, name, title=None):
    # Fetch `url`, tidy the markup, localize images, and (optionally) store
    # the cleaned content in the database under `slug`.  Python 2 code.
    f = urlopen(url)
    doc = f.read()
    # Clean the raw HTML before handing it to html5lib.
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')
        # if src.startswith ('/image'):
        # Download the image and point the tag at the local copy.
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td = # no_fonts (td)
    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html() # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
def html_clean(self, html):
    """Tidy the given markup and reduce it to plain text; None on failure."""
    tidy_opts = {
        'drop-proprietary-attributes': 1,
        'alt-text': '',
        'hide-comments': 1,
        'output-xhtml': 1,
        'show-body-only': 1,
        'clean': 1,
        'char-encoding': 'utf8',
        'show-warnings': 0,
        'show-info': 0,
    }
    # First we pass it through tidy
    (html, errors) = tidylib.tidy_document(html, options=tidy_opts)
    if errors:
        print(("HTML tidy failed for %s!" % self.msgid))
        print(errors)
        return None
    try:
        cleaner = HTMLCleaner()
        cleaner.feed(html)
        return cleaner.get_text()
    except Exception as e:
        # Failed to parse the html, thus failed to clean it. so we must
        # give up...
        return None
def __trading_years(self, instrument):
    """Yield each year listed on the Sina market-history page (Python 2)."""
    response = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
    document, errors = tidy_document(response.read())
    soup = BeautifulSoup(document)
    # The year choices live in the <select name="year"> element.
    year_select = soup.find('select', attrs={'name': 'year'})
    for option in year_select.findAll('option'):
        yield option.getText()
def process_response(self, request, response):
    """Validate non-empty HTML responses with tidy; raise on any errors."""
    is_html = 'text/html' in response['Content-Type']
    if is_html and response.content:
        document, errors = tidy_document(response.content)
        if errors:
            raise HTMLValidationError(errors)
    return response
def getMenu(): storeFile = open("list.txt","r") txt = storeFile.read() storeFile.close() list=txt.split('\n\n\n') # print list for store in list: # print store rest = store.split('\n') if len(rest)!=3: break try: url=baseUrl+rest[2] +'menu' print url res=urlopen(url) html=res.read() options = {'output-encoding':'utf8', 'output-xhtml':1 } document,errors = tidy_document(html,options) filepath = dataDir+ (rest[2].split('/'))[2] + ".html" saveFile = open(filepath,"w") saveFile.write(document) saveFile.close() print filepath except : print "skip:"+url
def cleanUpHTML(html, options=None):
    """Tidy `html` after fixing footnotes; return (prettified_html, errors).

    :param html: raw HTML string to clean.
    :param options: optional dict of tidy options overriding the defaults.
    """
    import tidylib
    tidylib.BASE_OPTIONS = {}
    default_options = {
        "force-output": 1,
        "output-xhtml": 1,
        "doctype": "strict",
        "drop-empty-paras": 1,
        "output-encoding": "utf8",
        "clean": 1,
        "bare": 1,
    }
    if options:
        # BUG FIX: dict has no .extend(); merge caller overrides with update().
        default_options.update(options)

    # first fix up footnotes so that HTMLTidy won't ditch them
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    footnoteFixer(soup)  # html)
    stripEmptyParagraphs(soup)
    html, errors = tidylib.tidy_document(soup.prettify(encoding=None), options=default_options)
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    addMetaTag(soup, [('http-equiv', 'Content-type'), ('content', 'text/html; charset=utf-8')])
    return soup.prettify(encoding=None), errors
def _tidy2xhtml5(html):
    """Tidy up a html4/5 soup to a parsable valid XHTML5.

    Requires tidy-html5 from https://github.com/w3c/tidy-html5
    Installation: http://goo.gl/FG27n
    """
    html = _io2string(html)
    html = _pre_tidy(html)  # Pre-process
    xhtml5, errors = tidy_document(html, options={
        'merge-divs': 0,        # do not merge nested div elements - preserve semantic block structrues
        'output-xml': 1,        # create xml output
        'indent': 0,            # Don't use indent, add's extra linespace or linefeeds which are big problems
        'tidy-mark': 0,         # No tidy meta tag in output
        'wrap': 0,              # No wrapping
        'alt-text': '',         # Help ensure validation
        'force-output': 1,      # May not get what you expect but you will get something
        'numeric-entities': 1,  # remove HTML entities like e.g. nbsp
        'clean': 1,             # remove
        'bare': 1,
        'word-2000': 1,
        'drop-proprietary-attributes': 1,
        'enclose-text': 1,      # enclose text in body always with <p>...</p>
        'logical-emphasis': 1,  # transforms <i> and <b> text to <em> and <strong> text
        # do not tidy all MathML elements!
        # List of MathML 3.0 elements from http://www.w3.org/TR/MathML3/appendixi.html#index.elem
        'new-inline-tags': 'abs, and, annotation, annotation-xml, apply, approx, arccos, arccosh, arccot, arccoth, arccsc, arccsch, arcsec, arcsech, arcsin, arcsinh, arctan, arctanh, arg, bind, bvar, card, cartesianproduct, cbytes, ceiling, cerror, ci, cn, codomain, complexes, compose, condition, conjugate, cos, cosh, cot, coth, cs, csc, csch, csymbol, curl, declare, degree, determinant, diff, divergence, divide, domain, domainofapplication, el, emptyset, eq, equivalent, eulergamma, exists, exp, exponentiale, factorial, factorof, false, floor, fn, forall, gcd, geq, grad, gt, ident, image, imaginary, imaginaryi, implies, in, infinity, int, integers, intersect, interval, inverse, lambda, laplacian, lcm, leq, limit, list, ln, log, logbase, lowlimit, lt, maction, malign, maligngroup, malignmark, malignscope, math, matrix, matrixrow, max, mean, median, menclose, merror, mfenced, mfrac, mfraction, mglyph, mi, min, minus, mlabeledtr, mlongdiv, mmultiscripts, mn, mo, mode, moment, momentabout, mover, mpadded, mphantom, mprescripts, mroot, mrow, ms, mscarries, mscarry, msgroup, msline, mspace, msqrt, msrow, mstack, mstyle, msub, msubsup, msup, mtable, mtd, mtext, mtr, munder, munderover, naturalnumbers, neq, none, not, notanumber, note, notin, notprsubset, notsubset, or, otherwise, outerproduct, partialdiff, pi, piece, piecewise, plus, power, primes, product, prsubset, quotient, rationals, real, reals, reln, rem, root, scalarproduct, sdev, sec, sech, selector, semantics, sep, set, setdiff, share, sin, sinh, subset, sum, tan, tanh, tendsto, times, transpose, true, union, uplimit, variance, vector, vectorproduct, xor',
        # BUG FIX: the original dict listed 'doctype' twice ('strict' earlier,
        # 'html5' later).  Only the last entry ever took effect, so the dead
        # 'strict' entry has been removed; behaviour is unchanged.
        'doctype': 'html5',
    })
    #return xhtml5
    # return the tree itself, there is another modification below to avoid
    # another parse
    return _post_tidy(xhtml5)
def process_response(self, request, response, spider):
    # Middleware that flattens redundant nested <table> wrappers, tidies the
    # markup, and regex-converts it toward plain text before returning it.
    if 'index-mo' in response.url:
        return response
    if 'index-do' in response.url:
        return response
    if 'index-e_types' in response.url:
        return response

    body = response.body
    # VERY UGLY... need to get this done, so it's ugly for now...
    index = body.find("<table")
    while index != -1:
        inner = body.find("<table", index+6)
        endtable = body.find("</table", index+6)
        if inner != -1 and inner < endtable:
            # we have an inner table...
            if body.find("<tr", inner, endtable) != -1:
                break  # if it's truly a table
            else:
                # Wrapper-only table: splice out its opening tag...
                start = inner-1
                end = body.find(">", inner)+1
                body = body[:start] + body[end:]
                # ...and the matching closing tag (recomputed after the splice).
                endtable = body.find("</table", index+6)
                start = endtable-1
                end = body.find(">", endtable)+1
                body = body[:start] + body[end:]
        index = body.find("<table", index+6)

    # remove any <br> before we tidy it up
    body = self.br_re.sub('', body)
    body = self.empty_anchor_re.sub('', body)
    # NOTE(review): BASE_OPTIONS is set module-wide here, then extra options
    # are passed per-call below — confirm this global mutation is intended.
    tidylib.BASE_OPTIONS = {
        "output-xhtml": 0,  # XHTML instead of HTML4
        "indent": 1,        # Pretty; not too much of a performance hit
        "tidy-mark": 0,     # No tidy meta tag in output
        "wrap": 0,          # No wrapping
        "alt-text": "",     # Help ensure validation
        "doctype": 'omit',  # Little sense in transitional for tool-generated markup...
        "force-output": 1,  # May not get what you expect but you will get something
    }
    body, _ = tidy_document(body, options={'drop-empty-paras':1, 'drop-font-tags':1,'enclose-text':1,'merge-divs':1,'fix-bad-comments':1})
    # Strip/convert the remaining markup into text-ish output.
    body = self.link_re.sub('\g<1>', body)
    body = self.vmodl_re.sub('\g<1>', body)
    body = self.strong_re.sub('\g<1>', body)
    body = self.script_re.sub('', body)
    body = self.ul_re.sub('', body)
    body = self.li_end_re.sub('', body)
    body = self.li_re.sub('* ', body)
    body = self.ul_end_re.sub('', body)
    response = response.replace(body=body)
    return response
def test_xmlns_large_document_xml_corner_case(self):
    # Test for a super weird edge case in Tidy that can cause it to return
    # the wrong required buffer size.
    filler = 'A' * 7937
    body = '<span><span>A</span></span>' + filler
    html = '<html xmlns="http://www.w3.org/1999/xhtml">' + body
    doc, err = tidy_document(html, {'output-xml': 1})
    tail = doc.strip()[-7:]
    self.assertEqual(tail, "</html>")
def test_doc_with_unclosed_tag(self):
    """Tidy must close an unterminated <p> element."""
    fragment = "<p>hello"
    doc, err = tidy_document(fragment)
    self.assertEqual(doc, DOC % '''<p> hello </p>''')
def call():
    # Issue the prepared request once and cache the parsed result on `world`.
    # Python 2 code (urllib/urllib2).
    if world.results:
        return

    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()

    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        # Default format depends on which endpoint is being exercised.
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'

    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()

    if fmt == 'html':
        assert_equals('text/html', pagetype)
        # HTML replies must tidy without any reported errors.
        document, errors = tidy_document(page, options={'char-encoding' : 'utf8'})
        assert(len(errors) == 0), "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        if 'json_callback' in world.params:
            # JSONP: strip the callback wrapper before decoding the payload.
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func)+1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)
def run(self):
    """
    Main entry point

    Run FuncInventory and if there is a change, prepare and send an email.
    """
    # Refresh the inventory tree first.
    inventory = func_inventory.FuncInventory()
    inventory.run([
        'func-inventory',
        '--tree=%s' % self.config['git_repo'],
        '--modules=%s' % ','.join(self.config['modules']),
    ])

    diff = self.git_diff()
    if not diff:
        self.log('No changes detected. Sleeping.')
        return

    self.log('CHANGE DETECTED in func-inventory.')
    style = dict(dark_bg=self.config['dark_bg'],
                 font_size=self.config['font_size'])
    html = ansi2html.Ansi2HTMLConverter(**style).convert(diff)
    html, errors = tidylib.tidy_document(html)
    html = pypremailer.Premailer(html).premail()
    self.mail(html)
    self.log('Done mailing changes.')
def marklogic_put_xml(self, item, spider_name):
    # PUT one scraped item into MarkLogic as XML via its REST endpoint.
    # Set the uri and collection
    if (self.ml_transform == ''):
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
    else:
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name, 'transform': self.ml_transform}

    # Set up the XML payload
    payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
    # Decode the <> characters back again
    # NOTE(review): as written these replacements are no-ops (each string is
    # replaced by itself); they look like they originally mapped escaped
    # entities (&lt;, &gt;, &apos;, &quot;) — confirm against version history.
    payload = payload.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"')
    # Run tidy in order to get wel-formed XML
    payload, errors = tidy_document(payload, options={'input-xml': 1})

    # Set up the header
    headers = {'Content-Type': 'application/xml'}

    # Per-item credentials/URI override the spider-level defaults.
    ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
    logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])

    # Call the MarkLogic REST endpoint
    ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
    ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
    r = requests.put(ml_uri, params = params, auth = HTTPDigestAuth(ml_user, ml_pwd), data = payload, headers = headers)
    logging.info("PUT response: " + str(r.status_code) + ", " + r.text)
def dynamic_test_method(self):
    """this function name doesn't matter much, it can start with `test`,
    but we're going to rename it dynamically below"""
    reportURLstring = '/report?reportname=' + reportItem.metadata['action']
    response = self._my_app.get(reportURLstring)
    tidy_opts = {'show-errors': 1, 'show-warnings': 0}
    code, error = tidylib.tidy_document(response.body, options=tidy_opts)
    self.assertFalse(error, '%s did not return valid html page' % reportURLstring)
def html2enml(html):
    """Convert arbitrary HTML into an ENML-safe <div> tree (serialized bytes).

    :param html: raw HTML string.
    :return: the serialized, sanitized body wrapped in a <div>, or the
             literal "<div></div>" when the tidied document has no body.
    """
    # doc, err = tidy_fragment(
    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1,
        }
    )
    root = fromstring(doc)
    # XXX dirty hack to circumvent a bug in lxml parser
    root = fromstring(etree.tostring(root))
    logging.debug(etree.tostring(root))
    # tidy_document returns a valid html document which means it usually
    # contains html tag and proper body element
    root = root.find('body')
    if root is None:
        # FIX: logging.warn is a deprecated alias — use logging.warning.
        logging.warning("No body on this document")
        logging.warning(html)
        return "<div></div>"
    root.tag = 'div'
    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    #FIXME Skipping dtd validation because of slow DTD creation speed
    # validate_dtd(html, f):
    return etree.tostring(root)
def process_response(self, request, response):
    # Validate outgoing HTML with tidy; on problems, dump the original and
    # tidied versions to disk for offline debugging.  Python 2 code (file()).
    if not _has_tidylib or not self._is_html(request, response):
        return response

    html, errors = tidylib.tidy_document(response.content, self._options, keep_doc=True)
    if not errors:
        return response

    # Filter out what we care about
    err_list = errors.rstrip().split("\n")
    err_list = self._filter_warnings(err_list)
    if not err_list:
        return response

    # Resolve the view that produced this response, for the report name.
    try:
        fn = urlresolvers.resolve(request.path)[0]
        fn_name = "%s.%s" % (fn.__module__, fn.__name__)
    except:
        fn_name = "<unresolved_url>"

    # Write the two versions of html out for offline debugging
    filename = os.path.join(self._outdir, fn_name)
    result = (
        "HTML tidy result: %s [%s]:"
        "\n\t%s"
        "\nPlease see %s.orig %s.tidy\n-------" % (
            request.path, fn_name, "\n\t".join(err_list), filename, filename))
    file(filename + ".orig", "w").write(i18n.smart_str(response.content))
    file(filename + ".tidy", "w").write(i18n.smart_str(html))
    file(filename + ".info", "w").write(i18n.smart_str(result))
    self._logger.error(result)
    return response
def fetch_data():
    # Scrape bvb.de for the next match and build a (warning-string, kickoff)
    # pair.  Exits the process when no upcoming match is listed.
    def bvbreplace(s):
        # Normalize any "...Dortmund..." team name to "BVB".
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    # Fallback kickoff time: ~a day in the future, so callers sleep sanely.
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        # The tournament span is optional; fall back to the third plain span.
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()

        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)

    return out, matchtime
def nofoutofplacefeatures(url):
    """Return the length of tidy's error report for `url`, or None on failure.

    NOTE(review): `errors` is a string, so len() counts characters, not
    messages — the original behaved the same way; confirm intent before
    changing it to a line count.
    """
    try:
        # pdb.set_trace()
        # Normalize schemeless URLs before fetching (single request call,
        # replacing the duplicated get() branches of the original).
        if url[:4] != "http":
            url = "http://" + url
        r = requests.get(url)
        data = r.text
        document, errors = tidy_document(data, options={'numeric-entities':1})
        #print document
        #print errors
        #print "Number of Elements Out of Place : " + str(len(errors))
        return len(errors)
    except Exception:
        # Narrowed from a bare except; still best-effort — None on any error.
        return None
def get_employees(lastname, firstname):
    # Query the RWTH lecturer list and return the matching full names.
    # Python 2 code (print statement, urlparse module).
    payload = { 'find' : lastname }
    res = requests.get('https://www.campus.rwth-aachen.de/rwth/all/lecturerlist.asp', params=payload)
    if res.status_code == 200:
        persons = [ ]
        document, errors = tidy_document(res.content, options={'numeric-entities': 1, 'output_xhtml': 1})
        tree = ET.fromstring(strip_ns(document))
        try:
            # The server redirects to lecturer.asp for a unique match and to
            # lecturerlist.asp for multiple results.
            filename = posixpath.basename(urlparse.urlsplit(res.url).path)
            if filename == 'lecturer.asp':
                fullname = tree.find('body/table[1]/tr[3]//tr[2]/td[2]').text.strip()
                unit = tree.find("body/table[2]//td[@class='h3']/a").text.strip()
                persons.append(fullname)
            elif filename == 'lecturerlist.asp':
                links = [ ]
                for cell in tree.findall('body/table[2]//td[3]/table[2]//td[1]/a'):
                    if cell is not None:
                        fullname = cell.text.strip()
                        persons.append(fullname)
            else:
                raise Exception
        except:
            print "===> WARNING: failed to get employee list for: %s, %s" % (firstname, lastname)
        return persons
def sanitize(note):
    # Clean a note's HTML content while keeping whitelisted elements intact,
    # and normalize the note title.
    debug('Sanitizing note content...', 2)

    if get_setting('evernote/sanitize/@applytemplate') == 'True':
        with open(get_setting('evernote/sanitize/template/text()'), 'r') as file:
            template = file.read()
        template = template.replace('{content}', note['content'])
        note['content'] = transform(template)

    # Swap elements matching the preserve patterns for unique placeholders so
    # the regex scrubbing and tidy pass below cannot touch them.
    preservedElements = []
    preservePattern = get_setting('evernote/sanitize/preserve/pattern/text()')
    preserves = get_setting('evernote/sanitize/preserve/elements/text()').split(',')
    for preserve in preserves:
        matches = re.findall(preservePattern.format(preserve), note['content'])
        for match in matches:
            placeholder = '{%s}' % uuid.uuid4().hex
            preservedElements.append({'placeholder': placeholder, 'element': match})
            note['content'] = note['content'].replace(match, placeholder, 1)

    # Strip configured attributes/elements, then run tidy over the result.
    note['content'] = re.sub(get_setting('evernote/sanitize/attributes/empty/text()'), '', note['content'])
    note['content'] = re.sub(get_setting('evernote/sanitize/attributes/prohibited/text()'), '', note['content'])
    note['content'] = re.sub(get_setting('evernote/sanitize/elements/text()'), '', note['content'])
    note['content'] = note['content'].encode('utf-8', errors='ignore')
    (note['content'], errors) = tidy_document(note['content'])

    # Restore the preserved elements in place of their placeholders.
    for element in preservedElements:
        note['content'] = note['content'].replace(element['placeholder'], element['element'])

    if note['title'] != None:
        # NOTE(review): the last replace maps a space to a space — likely a
        # mangled non-breaking-space substitution; confirm against history.
        note['title'] = note['title'].replace('\n', ' ').replace('\r', '').replace(' ', ' ')
    else:
        note['title'] = get_setting('evernote/sanitize/defaulttitle/text()')
def convert_to_html(filename):
    """Convert `filename` to tidied HTML via pandoc and write it alongside."""
    # Do the conversion with pandoc
    output = pypandoc.convert(filename, 'html')

    # Clean up with tidy...
    tidy_opts = {
        'numeric-entities': 1,
        'wrap': 80,
    }
    output, errors = tidy_document(output, options=tidy_opts)
    print(errors)

    # replace smart quotes.
    output = output.replace(u"\u2018", '‘').replace(u"\u2019", '’')
    output = output.replace(u"\u201c", "“").replace(u"\u201d", "”")

    # write the output
    base, ext = os.path.splitext(filename)
    filename = "{0}.html".format(base)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)
    print("Done! Output written to: {}\n".format(filename))
def _tidysrc(self, data, srccode):
    """tidy scribe the html src

    Returns the tidied document, or the input unchanged on any failure.
    """
    try:
        from tidylib import tidy_document
        # BUG FIX: this options dict was previously built but never passed —
        # the call hard-coded {'numeric-entities': 1} and ignored it.
        base_options = {
            "output-xhtml": 1,    # XHTML instead of HTML4
            "indent": 1,          # Pretty; not too much of a performance hit
            "tidy-mark": 0,       # No tidy meta tag in output
            "wrap": 0,            # No wrapping
            "alt-text": "",       # Help ensure validation
            "doctype": 'strict',  # Little sense in transitional for tool-generated markup...
            "force-output": 1,    # May not get what you expect but you will get something
            "char-encoding": 'utf-8',
            "input-encoding": srccode,
            "output-encoding": 'utf-8',
            "numeric-entities": 1,
        }
        if not isinstance(data, unicode):
            try:
                data = data.decode(srccode)
            except Exception:
                # Best-effort decode; tidy gets the raw bytes otherwise.
                pass
        doc, errors = tidy_document(data, options=base_options)
        return doc
    except Exception:
        # Any failure (missing tidylib, tidy error) returns the input as-is.
        return data
def test_doc_with_unicode_subclass(self):
    """tidy_document accepts unicode subclasses as input (Python 2)."""
    class MyUnicode(unicode):
        pass

    h = MyUnicode(u"unicode string ß")
    doc, err = tidy_document(h)
    self.assertEqual(doc, unicode(DOC, 'utf-8') % h)
def pretty(self, html):
    """Re-serialize `html` through html5lib, then tidy it as HTML5."""
    soup = BeautifulSoup(html, "html5lib")
    tidy_opts = {
        'char-encoding': 'utf8',
        'output-encoding': 'utf8',
        'doctype': 'html5',
    }
    document, errors = tidy_document(soup.encode(formatter="html"), options=tidy_opts)
    return document
def init_stats(self):
    """Decode the response body and record tidy errors when tidylib exists."""
    self.source = self.response.content.decode('utf-8')
    if not tidylib:
        # tidylib is optional; without it there is nothing to report.
        errors = ""
    else:
        document, errors = tidylib.tidy_document(self.source, options={'numeric-entities':1})
    if errors:
        self.errors = [e.groupdict() for e in error_re.finditer(errors)]
def test_doc_with_unicode_subclass(self):
    """String subclasses are accepted just like plain strings."""
    class MyUnicode(utype):
        pass

    h = MyUnicode("unicode string ß")
    doc, err = tidy_document(h, {'output_xhtml': 1})
    self.assertEqual(doc, DOC % h)
def tidy(self, data): document, errors = tidy_document(data, { 'input-xml': True, 'output-xml': True, 'preserve-entities': True, 'numeric-entities': True }) if errors: print errors return document
def process_response(self, request, response):
    """Tidy the response body and tally tidy error/warning lines."""
    document, errors = tidy_document(response.content, options={"numeric-entities": 1})
    self.log_data = (document, errors)
    self.src_content = response.content
    # Tidy reports one message per line; classify by its severity token.
    report_lines = errors.split("\n")
    self.errors_count = sum(1 for line in report_lines if "error:" in line.lower())
    self.warns_count = sum(1 for line in report_lines if "warning:" in line.lower())
    return response
def validate_html(endpoint, document):
    """
    This function can be used to make sure HTML returned is valid
    It raises an exception describing what's wrong then non-valid HTML was entered
    :param endpoint: name of the function which returned the html content
    :param document: the html content
    :return: None
    """
    tidied, errors = tidy_document(document)
    if not errors:
        return
    message = (
        "Errors were found in the following HTML returned by function {}:\n{}\n\nErrors:\n{}"
        .format(endpoint, document, errors))
    raise SystemError(message)
def pretty(self, tidy_warnings=False) -> str:
    '''Like render() but format through tidylib'''
    options = {
        'indent': 1,
        'output-xhtml': True,
        'force-output': 1,
        'doctype': 'strict',
        'show-warnings': tidy_warnings,
    }
    txt, errors = tidy_document(self.render(), options)
    if errors:
        # Report tidy's complaints without failing the render.
        print('HTML tidy: ' + str(errors), file=sys.stderr)
    return txt
def test_post_note(self):
    """Posting a note must render a tidy-clean HTML page."""
    payload = {"author": "test", "message": "test"}
    html = self._my_class.post_note(**payload)
    code, error = tidylib.tidy_document(
        html, options={"show-errors": 1, "show-warnings": 0})
    self.assertFalse(error, "/notes/post_note does not return valid html page")
def test_select_item_search(self):
    """Searching a stocked book's title must render a valid HTML page."""
    stocked = list(Book.selectBy(status="STOCK"))
    random_item = random.sample(stocked, 1)[0]
    page = self._my_class.select_item_search(title=random_item.title.booktitle)
    code, error = tidylib.tidy_document(
        page, options={"show-errors": 1, "show-warnings": 0})
    self.assertFalse(
        error, "/register/select_item_search does not return valid html page")
def clean_html(self, htmlfile): try: reader = open(htmlfile, 'r') content = reader.read() reader.close() document, errors = tidy.tidy_document(content, options=tidy_options) if document: writer = open(htmlfile, 'w') writer.write(document) writer.close() print "Cleaned", htmlfile except Exception, e: print e
def get_cvk_page(url):
    '''Return the tidied page text, or nothing on a non-200 reply.'''
    res = s.get(url, headers=headers, verify=False)
    # print(res.encoding)
    # print(res.url)
    res.encoding = "utf-8"
    if res.status_code != 200:
        print(f"Error <= {url}")
        return
    tidied, errors = tidy_document(res.text)
    # print(errors)
    return tidied
def test_select_item_for_isbn_search_functional(self):
    """The ISBN-search admin page must come back as valid HTML."""
    random_item = random.sample(list(Title.select()), 1)[0]
    response = self._my_app.get("/admin/select_item_for_isbn_search",
                                {"isbn": random_item.isbn})
    tidy_opts = {"show-errors": 1, "show-warnings": 0}
    code, error = tidylib.tidy_document(response.body, options=tidy_opts)
    self.assertFalse(
        error,
        "/admin/select_item_for_isbn_search did not return proper response"
    )
def parse_html(self):
    """Extract the nominatim_results JSON array embedded in the HTML page."""
    content, errors = tidy_document(self.page, options={'char-encoding': 'utf8'})
    #eq_(len(errors), 0 , "Errors found in HTML document:\n%s" % errors)
    # Isolate the inline script that assigns the results variable.
    start = content.find('nominatim_results =')
    stop = content.find('</script>')
    content = content[start:stop]
    # Grab the outermost JSON array inside the assignment.
    lo = content.find('[')
    hi = content.rfind(']')
    decoder = json.JSONDecoder(object_pairs_hook=OrderedDict)
    self.result = decoder.decode(content[lo:hi + 1])
def __init__(self, idMembro, cvLattesHTML): HTMLParser.__init__(self) # inicializacao obrigatoria self.idMembro = idMembro self.sexo = 'Masculino' self.nomeCompleto = u'[Nome-nao-identificado]' self.item = '' self.issn = '' self.listaIDLattesColaboradores = [] self.listaFormacaoAcademica = [] self.listaAreaDeAtuacao = [] self.listaIdioma = [] self.listaArtigoEmPeriodico = [] self.listaLivroPublicado = [] self.listaCapituloDeLivroPublicado = [] self.listaTextoEmJornalDeNoticia = [] self.listaTrabalhoCompletoEmCongresso = [] self.listaResumoExpandidoEmCongresso = [] self.listaResumoEmCongresso = [] self.listaArtigoAceito = [] self.listaApresentacaoDeTrabalho = [] self.listaOutroTipoDeProducaoBibliografica = [] self.listaParticipacaoEmEvento = [] self.listaOrganizacaoDeEvento = [] # inicializacao para evitar a busca exaustiva de algumas palavras-chave self.salvarAtualizacaoCV = 1 self.salvarFoto = 1 self.procurarCabecalho = 0 self.achouGrupo = 0 self.doi = '' self.relevante = 0 self.idOrientando = '' self.complemento = '' # contornamos alguns erros do HTML da Plataforma Lattes cvLattesHTML = cvLattesHTML.replace("<![CDATA[","") cvLattesHTML = cvLattesHTML.replace("]]>","") cvLattesHTML = cvLattesHTML.replace("<x<","<x<") cvLattesHTML = cvLattesHTML.replace("<X<","<X<") # feed it! cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1}) self.feed(cvLattesHTML)
def _validateHTML(self, moFile):
    """ This function validates the file ``moFile`` for correct html syntax.

    :param moFile: The name of a Modelica source file.
    :return: (str, str) The tidied markup [0] and warning/error messages[1].

    Warnings and errors are returned just as tidylib returns them.
    """
    from tidylib import tidy_document

    entries = self._getInfoRevisionsHTML(moFile)

    # Document header
    header = "<?xml version='1.0' encoding='utf-8'?> \n \
<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"> \n \
<html xmlns=\"http://www.w3.org/1999/xhtml\"> \n \
<head> \n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> \n \
<title>xxx</title> \n \
</head> \n \
<body> \n \
<!-- +++++++++++++++++++++++++++++++++++++ -->\n"

    # Concatenate the extracted documentation lines into the body.
    body = ""
    for line in entries:
        body += line + '\n'
    # Replace \" with "
    body = body.replace('\\"', '"')

    # Document footer
    footer = "<!-- +++++++++++++++++++++++++++++++++++++ -->\n \
</body>\n \
</html>"

    # Validate the string
    document, errors = tidy_document(r"%s%s%s" % (header, body, footer),
                                     options={
                                         'numeric-entities': 1,
                                         'output-html': 1,
                                         'alt-text': '',
                                         'wrap': 72
                                     })
    # Write html file.
    if self._writeHTML:
        htmlName = "%s%s" % (moFile[0:-2], "html")
        with open(htmlName, mode="w", encoding="utf-8") as f:
            f.write(document)
    return (document, errors)
def wrap_html(curr_chapter):
    """Wrap the chapter file's content in an XHTML skeleton, tidy, rewrite."""
    title = curr_chapter['title']
    header = """<?xml version='1.0' encoding='utf-8'?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>%s</title></head>
<body><h1 class="chapter">%s</h1>""" % (title, title)
    footer = "</body></html>"
    with open(curr_chapter['local'], 'r+', encoding='utf-8') as f:
        content = f.read()
        tidied = tidy_document("%s%s%s" % (header, content, footer), options=TIDY_OPTS)
        # Overwrite the file in place with the tidied document.
        f.seek(0)
        f.write(tidied[0])
        f.truncate()
def test_page_should_be_valid_html(page):
    """Tidy-validate any reachable HTML page; everything else is skipped."""
    if page.response != 200:
        pytest.skip("not validating non-reachable page")
    if not page.content_type or 'html' not in page.content_type.lower():
        pytest.skip("not attempting to validate non-html page")
    if not should_validate(page.url):
        pytest.skip("skip validation of blacklisted page")
    if not page.content:
        pytest.skip("page has no content")
    tidied, messages = tidy_document(page.content, TIDY_OPTIONS)
    remaining = filter_errors(messages)
    assert not remaining, "Found following validation errors:\n" + remaining
def get_stations():
    """Scrape Renfe's station <select> into a {name: id} dict; None on timeout."""
    try:
        web = requests.get(
            'http://horarios.renfe.com/cer/hjcer300.jsp?NUCLEO=30&CP=NO&I=s#',
            timeout=4).text
    except Timeout:
        return None
    document, errors = tidy_document(web)
    bs = BeautifulSoup(document, 'html.parser')
    options = bs.find('select', {"name": "o"}).findAll('option')
    # The first option is a placeholder, so it is dropped.
    pairs = [(o.text.strip().replace(" ", "").lower(), o['value'])
             for o in options][1:]
    return dict(pairs)
def clean(cls, html, tidy=True, body_only=False):
    """ clean html document """
    # Pick the body-only or full-document cleaner.
    selected = cls.cleaner_body if body_only else cls.cleaner
    if not tidy:
        return selected.clean_html(html)
    document, errors = tidy_document(html)
    return selected.clean_html(document)
def carrega_dados_lei(lei):
    """Fill the `lei` dict with the title and description scraped from its URL.

    Python 2 code.  Exits the process if tidy reports any errors.
    """
    response = urllib2.urlopen(lei.get('url'))
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)
    titulo = parsed_html.find('h4').text.strip()
    descricao = parsed_html.find('td', attrs={'id': 'prim_col'}).text.strip()
    dados = {'titulo_completo': titulo, 'descricao': descricao}
    # FIX: dict.has_key() is deprecated (removed in Python 3) — use `in`.
    if 'voto' in lei:
        del (lei['voto'])
    lei.update(dados)
    return lei
def htmlCorrection(self, htmlStr: str,
                   substitutions_dict: dict = None) -> (str, str):
    """Returns cleaned html code and found errors

    Calls tidylib which will produce a clean version of the html code
    and also the errors that it has found.

    Parameters
    ----------
    htmlStr : str
        The html code as a single string.
    substitutions_dict : dict
        A dictionary with key:value pairs for old and new text.
        The html code must be escaped in Modelica. Generate properly
        escaped code we need to add the escape characters. All the while
        we can replace html errors that Dymola introduces.
        i.e. '<br>' -> '<br />'
        Defaults to {'"': '\\"', '<br>': '<br />', '<br/>': '<br />'}.

    Returns
    -------
    str
        The tidy html code with escape characters as one string.
    str
        The error messages from tidylib.
    """
    from tidylib import tidy_document
    # FIX: the default dict used to be a mutable default argument; build it
    # per call instead (behaviour is identical for callers).
    if substitutions_dict is None:
        substitutions_dict = {'"': '\\"', '<br>': '<br />', '<br/>': '<br />'}
    # Validate the string
    htmlCorrect, errors = tidy_document(f"{htmlStr}",
                                        options={
                                            'doctype': 'html5',
                                            'show-body-only': 1,
                                            'numeric-entities': 1,
                                            'output-html': 1,
                                            'wrap': 72,
                                            'alt-text': '',
                                        })
    document_corr = HTML_Tidy.make_string_replacements(
        self, theString=htmlCorrect, substitutions_dict=substitutions_dict)
    return document_corr, errors
def __parse(self, url: str = None, html: str = None, cookies: str = None) -> (dict, str):
    """
    Make an HTML/URL parsing by processing ALL found tags
    :param url: The url to parse (or None)
    :param html: The html page to parse as string (or None)
    :param cookies: The cookies to use on parsing
    :return: dictionary of tags, cookies
    """
    self.url = None
    self.base_url = None
    is_image = False
    if url is not None:
        self.url = url
        url_parsed = urlparse(url)
        self.url_scheme = str(url_parsed.scheme)
        self.base_url = self.url_scheme + '://' + str(url_parsed.netloc)
        r = HttpRequest.request(url, cookies=cookies)
        if r is None:
            return None
        if r.status_code >= 400 or r.headers.get(
                'Content-Type') in HtmlParser._unacceptable_content_types:
            # Do not try to parse failed or unacceptable responses.
            return None
        try:
            html = r.json()
            Log.warning('Trying to parse a json with HTML parser!')
        except ValueError:
            html = r.text
        # Capture any Set-Cookie header to hand back to the caller.
        if r.headers is not None:
            for k, v in r.headers.items():
                if k.lower() == 'set-cookie':
                    cookies = v
        if HttpRequest.is_image(r):
            # For images only the embedded XMP metadata block is parsed.
            is_image = True
            xmp_start = html.find('<x:xmpmeta')
            xmp_end = html.find('</x:xmpmeta')
            xmp_str = html[xmp_start:xmp_end + 12]
            html = xmp_str
    if is_image:
        sorted_html = html
    else:
        sorted_html, errors = tidy_document(html)  # Sort html (and fix errors)
    self.feed(sorted_html)
    if cookies is None:
        cookies = ''
    return self.tags, cookies
def validate_html_code(template, mongodb, redis_db):
    """Tidy *template*, persist the tidy report to mongo, return the tidied doc.

    The log id comes from a redis counter ('html_log'); each call stores the
    original template plus the per-line tidy error report.
    """
    log_id = redis_db.incr('html_log')
    tidied, report = tidy_document(template, options={
        'numeric-entities': 1,
        'char-encoding': 'utf8'
    })
    record = {
        '_id': log_id,
        'time': get_time.datetime_to_secs(),
        'tmpl': template,
        'log': report.split('\n'),
    }
    mongodb.html_log.insert(record)
    return tidied
def _tidyHTML(doc):
    """Run tidy over *doc* and return its error lines, minus ignored ids.

    Parameters
    ----------
    doc : iterable of str
        Document fragments; joined into one string before tidying.
        Escaped quotes ('\\"') are unescaped first.

    Returns
    -------
    list of str
        Non-empty, right-stripped tidy error lines that do not contain
        any marker from the module-level IGNORE_IDS.
    """
    body = ''.join(doc).replace('\\"', '"')
    _, report = tidy_document(body, options=TIDY_OPTIONS)
    errors = []
    # The original code split each already-split line on '\n' again (a no-op)
    # and shadowed the builtin `id` in its filter; both fixed here.
    for raw_line in report.rstrip().split('\n'):
        line = raw_line.rstrip()
        if line and not any(marker in line for marker in IGNORE_IDS):
            errors.append(line)
    return errors
def validate(self):
    """Tidy self.content; raise ValidateError if any marker matches the report.

    Every line of the tidy report is checked against each configured marker
    pattern ('line N column N - <marker>:', case-insensitive). In permissive
    mode a non-empty report is merely logged at debug level.
    """
    document, report = tidylib.tidy_document(self.content,
                                             options=self.TIDY_OPTIONS)
    for report_line in report.splitlines():
        for marker in self.markers:
            marker_pattern = r"line \d+ column \d+ - {}:".format(marker)
            if re.match(marker_pattern, report_line, flags=re.IGNORECASE):
                self.log.warning("Tidy report:\n%s", report)
                raise ValidateError("invalid HTML content")
    if self.permissive and report:
        self.log.debug("Tidy report:\n%s", report)
    self.log.info("Clean HTML document")
    # Hand back the tidied (XHTML) version of the content.
    return document
def get_quiz_content(page):
    """Extract the quiz description sentences from an HTML *page*.

    Happy path: find the element with class 'story', tidy its first <p>,
    and return its text split into non-empty sentences on '.'.
    On any failure, fall back to the first two descendants of the story
    element (tag text or '').
    """
    try:
        soup = BeautifulSoup(page, 'html.parser')
        q_desc_html = soup.find(class_='story')
        # Tidy each paragraph; tidy_document returns (document, errors),
        # so x_c[0][0] below is the tidied markup of the first <p>.
        x_c = [tidy_document(str(x)) for x in q_desc_html.find_all('p')]
        x_s = BeautifulSoup(x_c[0][0], 'html.parser')
        q_desc = x_s.find('p').text.strip()
        return [x for x in q_desc.split('.') if x]
    except Exception as err:
        # NOTE(review): if soup.find() returned None above, q_desc_html is
        # None here and .descendants raises AttributeError out of this
        # handler; likewise q_c[1] assumes at least two descendants exist.
        # TODO confirm the fallback inputs always satisfy this.
        q_c = [x for x in q_desc_html.descendants][:2]
        if isinstance(q_c[1], bs4.element.Tag):
            q_c[1] = q_c[1].text
        else:
            q_c[1] = ''
        return q_c
def process_item(self, item, spider):
    """Persist the scraped item: raw HTML, its tidied form, the tidy error
    report, and the newline-joined link list.
    """
    raw_html = pprint.pformat(item['content'])
    self.write(item, raw_html, "html")
    tidied, tidy_errors = tidy_document(raw_html,
                                        options={'numeric-entities': 1})
    self.write(item, tidied, "html.tidy")
    self.write(item, tidy_errors, "html.errors")
    # my_reporter.read(html=data)
    # data= my_reporter.report_news()
    self.write(item, "\n".join(item['links']), "links.txt")
    # return item
def validate(folder, files):
    """Tidy-validate the first '.html' file found in *files*.

    Parameters
    ----------
    folder : str
        Present in the original signature but unused; file names are
        opened as given. NOTE(review): confirm callers pass full paths.
    files : iterable of str
        Candidate file names; only the first one ending in '.html'
        (case-insensitive) is validated.

    Returns
    -------
    str or None
        A human-readable validation summary, or None if *files*
        contains no .html entry.
    """
    for file_name in files:
        if not file_name.lower().endswith(".html"):
            continue
        try:
            # `with` replaces the original try/finally close, which raised
            # NameError when open() itself failed (the handle was unbound).
            with open(file_name, "r") as html_file:
                filetext = html_file.read()
            _, errors = tidy_document(filetext,
                                      options={"numeric-entities": 1})
        except Exception as _:
            return "An error occured while validating html file\n"
        if len(errors) == 0:
            return "HTML was successfully validated with no errors\n"
        return "HTML was validated, with following errors:\n - " + errors.replace(
            "\n", "\n - ").strip(" - ")
def corrigirHTML(self, cvLattesHTML): extended_chars = u''.join( unichr(c) for c in xrange(127, 65536, 1)) # srange(r"[\0x80-\0x7FF]") special_chars = ' -' '' cvLattesHTML = cvLattesHTML.decode( 'iso-8859-1', 'replace') #+extended_chars+special_chars #cvLattesHTML = cvLattesHTML.decode('ascii','replace')+extended_char+special_chars # Wed Jul 25 16:47:39 BRT 2012 # contornamos alguns erros do HTML da Plataforma Lattes cvLattesHTML = cvLattesHTML.replace("<![CDATA[", "") cvLattesHTML = cvLattesHTML.replace("]]>", "") arquivoHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities': 1}) #print errors return arquivoHTML
def checkurl_html(url: str, status_code: Optional[int] = 200,
                  mimetype: Optional[str] = 'text/html',
                  has: List[str] = [], hasnot: List[str] = []) -> str:
    """Fetch *url* via checkurl_text and, when html_validation is enabled,
    fail the test on any tidy error line (blank lines and the noisy
    'trimming empty <span>' warning are ignored).
    """
    __tracebackhide__ = True
    document = checkurl_text(url, status_code, mimetype, has, hasnot)
    if html_validation:
        _, report = tidy_document(document, options=TIDY_OPTIONS)
        for report_line in report.split('\n'):
            if report_line and 'trimming empty <span>' not in report_line:
                pytest.fail(f'tidy error: {report_line}')
    return document
def carrega_dados_politico(idx):
    """Scrape voting data for politician *idx* from excelencias.org.br.

    Returns a dict with the politician's id, name and list of votes
    ({'titulo', 'voto', 'url'}), or None when the page has too few
    content blocks, no vote section, or no name. Exits the process if
    tidy reports errors. (Python 2 only: uses urllib2.)
    """
    response = urllib2.urlopen('http://www.excelencias.org.br/@parl.php?id=%s'%idx)
    # Tidy the fetched page before parsing; abort the whole run on errors.
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)
    conteudo = parsed_html.body.find('div', attrs={'id':'conteudo'}).find_all('div', attrs={'id':'contem_parl'})
    # Fewer than five blocks means the page lacks the expected sections.
    if len(conteudo) <= 4:
        return
    bloco_principal = conteudo[0]
    bloco_votacoes = None
    # Locate the block titled 'Como votou matérias no Plenário'
    # (plenary voting record).
    for bloco in conteudo:
        if bloco.find('div', attrs={'id':'contem_titulo_parl'}).text.strip() == u'Como votou matérias no Plenário':
            bloco_votacoes = bloco
    if not bloco_votacoes:
        return
    nome = bloco_principal.find('div', attrs={'id':'contem_titulo_parl'}).text.strip()
    if not nome:
        return
    tabela_votacoes = bloco_votacoes.find('table', attrs={'class':'livre'})
    lista_votos = []
    if tabela_votacoes:
        for linha in tabela_votacoes.find_all('tr'):
            titulo_lei = linha.find('td', attrs={'id':'prim_col'}).text.strip()
            voto = linha.find('td', attrs={'class':'esq'}).text.strip()
            link_lei = linha.find('td', attrs={'id':'prim_col'}).find('a')
            url_lei = ''
            if link_lei:
                # The bill link is a javascript call like
                # traz_pl(cod, num, ano, casa); unpack its arguments and
                # rebuild the direct URL to the bill page.
                cod, num, ano, casa = link_lei.get('href').replace('javascript:parent.traz_pl(', '').split(',')
                url_lei = 'http://www.excelencias.org.br/modulos/parl_projetolei.php?cod=%s&num=%s&ano=%s&casa=%s'
                url_lei = url_lei %(cod.replace("'", ''), num, ano, casa.replace(')', ''))
            lista_votos.append({
                'titulo': titulo_lei,
                'voto': voto,
                'url': url_lei
            })
    dados_politico = {
        'idx': int(idx),
        'nome': nome,
        'votos': lista_votos
    }
    return dados_politico
def tidy_html(html):
    """Return an HTML5-tidied version of *html* (a string or a file-like
    object with .read()); tidy errors are discarded.
    """
    if hasattr(html, 'read'):
        html = html.read()
    tidy_options = {
        'merge-divs': 0,  # do not merge nested div elements - preserve semantic block structures
        'output-xml': 0,
        'indent': 1,
        'tidy-mark': 0,
        'wrap': 0,
        'alt-text': '',
        'doctype': 'html5',
        'markup': 1,
    }
    html5, _errors = tidy_document(html, options=tidy_options)
    return html5
def validate_html(html):
    """run tidy on html"""
    _, report = tidy_document(
        html.content,
        options={
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )
    # idk how else to filter out these unescape amp errs
    ignorable = (
        "&book",
        "&type",
        "id and name attribute",
        "illegal characters found in URI",
        "escaping malformed URI reference",
    )
    kept = [line for line in report.split("\n")
            if not any(marker in line for marker in ignorable)]
    remaining = "\n".join(kept)
    if remaining:
        raise Exception(remaining)
def process(self, request, response, report):
    """Tidy an HTML response and add each tidy error line to *report*.

    Parameters
    ----------
    request : object
        Unused here; part of the processor interface.
    response : object
        Must expose .is_html and .content.
    report : object
        Receives add_error / add_message calls.
    """
    global _tidy_available
    #TODO: Hash errors and don't log duplicate error sets (just a reference)
    if response.is_html and _tidy_available:
        try:
            doc, err = tidy_document(response.content, options=self.options)
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as parse failures.
        except Exception:
            report.add_error('Unable to parse response')
        else:
            error_lines = err.splitlines()
            if len(error_lines) > 0:
                for e in error_lines:
                    # Capitalize the leading 'line N ...' for readability.
                    report.add_message('{0}'.format(
                        re.sub(r'^line\b', 'Line', e)))
                report.add_message('Total: {0}'.format(len(error_lines)))
def sanitize(html):
    """Tidy *html* to XHTML, strip prohibited elements/attributes, and
    return the document body serialized as an <en-note> element
    (Evernote note markup). Returns '' when the tidied output cannot
    be parsed as XML.
    """
    # with from __future__ import unicode_litterals
    # tidy_document does not want other options at all
    # such as div merge char-encoding and so on
    tidied, _errors = tidy_document(
        html, options={"output-xhtml": 1, "force-output": 1})
    try:
        root = parseString(tidied).documentElement
        remove_prohibited_elements(root)
        remove_prohibited_attributes(root)
        note_body = root.getElementsByTagName("body")[0]
        note_body.tagName = "en-note"
        return note_body.toxml()
    except ExpatError:
        return ''