def test_frag_with_entity(self):
    """The accented character is preserved with and without numeric entities."""
    source = "é"
    result, _ = tidy_fragment(source)
    self.assertEqual(result, "é")
    result, _ = tidy_fragment(source, {'numeric-entities': 1})
    self.assertEqual(result, "é")
def test_frag_with_entity(self):
    """Tidying keeps the accented character under both entity settings."""
    markup = "é"
    for opts in (None, {'numeric-entities': 1}):
        if opts is None:
            tidied, _ = tidy_fragment(markup)
        else:
            tidied, _ = tidy_fragment(markup, opts)
        self.assertEqual(tidied, "é")
def test_frag_with_unclosed_tag(self):
    """tidy closes an unterminated <p> and puts its text on its own line."""
    expected = '''<p>
hello
</p>'''
    result, _ = tidy_fragment("<p>hello")
    self.assertEqual(result, expected)
def handle(self, *args, **kwargs):
    """For each year/month/day/slug URL argument, look up the Story and
    re-tidy its HTML body if BeautifulSoup cannot parse it.

    Python 2 Django management command.
    """
    month_format = '%b'
    day_format = '%d'
    for url in args:
        parts = url.split('/')
        if len(parts) < 4:
            # NOTE(review): returning here aborts processing of any
            # remaining URLs; a `continue` may have been intended — confirm.
            return "URL doesn't parse into at least year/month/day/slug"
        if parts[-1] == "":
            # drop the empty component produced by a trailing slash
            empty = parts.pop()
        slug = parts.pop()
        day = parts.pop()
        month = parts.pop()
        year = parts.pop()
        try:
            # parse with the admin-style date formats (%b abbreviated month)
            tt = time.strptime(
                '%s-%s-%s' % (year, month, day),
                '%s-%s-%s' % ('%Y', month_format, day_format))
            date = datetime.date(*tt[:3])
        except ValueError:
            raise Http404
        story = Story.objects.get(publish_date=date, slug=slug)
        try:
            BeautifulSoup(story.body)  # error that happens in paginator
            print "Story HTML is valid."
        except HTMLParseError, e:
            story.body = tidy_fragment(story.body)[0]  # tidy the frag
            print "Story HTML is invalid, fixing and saving story."
            story.save()
            # re-check: did tidying actually make the body parseable?
            try:
                BeautifulSoup(
                    story.body)  # error that happens in paginator
                print "Story HTML is valid."
            except HTMLParseError, e:
                print "Story HTML was not able to be fixed. Object pk: %s" % story.id
def clean_html_fragment(self, body):
    """Tidy *body* into strict-doctype XHTML and return the cleaned markup."""
    tidied, _ = tidy_fragment(
        body, options={"output-xhtml": 1, "doctype": 'strict'})
    return tidied
def parse_book_file(href, book):
    """Parse one book's HTML page and fill in page_count, annotation and
    contents entries on the *book* dict.  Returns the updated dict.

    The page is located under the module-level ``books_dir`` and parsed
    with the module-level lxml ``parser``.
    """
    block = {}
    book_tree = lxml.html.parse(join(books_dir, href), parser)
    if not 'page_count' in book:
        # Find the table cell mentioning the book title...
        td = book_tree.xpath(
            "//td[descendant::*[contains(text(), '{}')]]".format(
                book['title'])
        )
        if len(td):
            td = td[0]
            # ...then its descendant mentioning the page count ("страниц").
            page_info = td.xpath("descendant::*[contains(text(), 'страниц')]")
            if len(page_info):
                book['page_count'] = patterns[0][1].search(
                    tostring(page_info[0], encoding='unicode')).groups()[0]
    # Tables whose text marks the start of the annotation / contents sections.
    block['annotation'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Аннотация')]]")
    block['contents'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Содержание')]]")
    for key in block:
        if len(block[key]):
            mark = block[key][-1]
            book[key] = ""
            # Accumulate sibling markup up to the next table, with links and
            # attributes stripped, then tidy the cleaned fragment.
            for element in mark.itersiblings():
                if element.tag == "table":
                    break
                drop_a(element)
                remove_attr(element)
                book[key] += tostring(element, encoding='unicode')
            book[key] = tidy_fragment(clean(book[key]))[0]
    return book
def clean(html):
    """Whitelist-bleach the markup, then run tidy over what bleach leaves."""
    if not html:
        return html
    bleached = bleach.clean(
        html,
        tags=local_config.TAG_WHITELIST,
        attributes=local_config.ATTRIBUTE_WHITELIST)
    # tidy catches some additional problems bleach does not
    tidied, _ = tidylib.tidy_fragment(bleached)
    return tidied
def html(cls, string, show_everything=False, translation=gettext.NullTranslations()):  # pylint: disable=unused-argument
    """Parses HTML"""
    tidied, _ = tidylib.tidy_fragment(string)
    return tidied
def link_title_uid_txt(i):
    """Extract (link, title, rss_uid, plain_text) from one feed-entry dict.

    Returns None when the entry has no usable snippet or the snippet
    yields no text.
    """
    if 'alternate' in i:
        link = i['alternate'][0]['href']
    else:
        link = ''
    if 'title' in i:
        title = i['title']
        title = unescape(title)
    else:
        title = '无题'  # '无题' means "untitled"
    rss_uid = i.get('id') or 1
    # prefer the summary, fall back to full content
    snippet = i.get('summary') or i.get('content') or None
    if not snippet:
        return
    if snippet:
        htm = snippet['content']
        if not htm:
            return
        htm = txttidy(htm)
        # protect <pre> blocks before <br/> -> newline conversion
        htm = txt_map('<pre', '</pre>', htm, pre_br)
        htm = tidy_fragment(htm, {'indent': 0})[0]
        htm = htm.replace('<br />', '\n')
        txt = htm2txt(htm)
        if not txt:
            return
    return link, title, rss_uid, txt
def get_article_text(self, body):
    """
    Gets the article main text.

    The original performed the same ``is not None`` check twice on the same
    node and built the HTML string with quadratic ``+=``; both are folded
    into a single guarded branch with ``str.join``.

    :param body: parsed page (soup) containing the article
    :return: tuple (tidied article HTML, zapped plain text or None)
    """
    raw_article_body = body.find("div", {"class": "article-body"})
    article_body_no_html = None
    article_body = ''
    if raw_article_body is not None:
        # plain-text variant, run through the gremlin zapper
        article_body_no_html = self.gremlin_zapper.zap_string(
            raw_article_body.get_text())
        # HTML variant: zap each contained tag, then re-serialize
        self.zap_tag_contents(raw_article_body)
        article_body = ''.join(str(item) for item in raw_article_body.contents)
    article_body, errors = tidy_fragment(
        article_body, options={'numeric-entities': 1})
    return article_body, article_body_no_html
def sanitize_html(value): from BeautifulSoup import BeautifulSoup, Comment, Tag # FIXME: 'None' should never be saved as text if value is None: return "" # allowed tags for a Vodafone Live <CONTAINER type="data" /> # this doubles up as a translation table. CKEditor does new-ish # HTML than Vodafone Live will accept. We have to translate 'em' back # to 'i', and 'strong' back to 'b'. # # NOTE: Order is important since <strong>'s can be inside <p>'s. tags = ( ("em", "i"), # when creating them in the editor they're EMs ("strong", "b"), ("i", "i"), # when loading them as I's the editor leaves them ("b", "b"), # we keep them here to prevent them from being removed ("u", "u"), ("br", "br"), ("p", "p"), ) valid_tags = [tag for tag, replacement_tag in tags] soup = BeautifulSoup(value) # remove all comments from the HTML for comment in soup.findAll(text=lambda text: isinstance(text, Comment)): comment.extract() # hide all tags that aren't in the allowed list, but keep # their contents for tag in soup.findAll(True): # Vodafone Live allows for no tag attributes tag.attrs = [] if tag.name not in valid_tags: tag.hidden = True # replace tags with Vlive equivelants for element, replacement_element in tags: if element is not replacement_element: for tag in soup.findAll(element): replacement_tag = Tag(soup, replacement_element) replacement_tag.insert(0, tag.text) tag.replaceWith(replacement_tag) xml = soup.renderContents().decode("utf8") fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"}) return ( fragment.replace(" ", " ") .replace("’", "'") .replace("‘", "'") .replace(""", '"') .replace("“", '"') .replace("”", '"') .replace("•", "- ") .replace("é", "e") .replace("É", "E") .replace("–", "-") )
def parse_book_file(href, book):
    """Parse one book's HTML page and populate page_count, annotation and
    contents entries on the *book* dict; returns the updated dict.

    Variant of the same scraper: reads from the module-level ``books_dir``
    with the module-level lxml ``parser``.
    """
    block = {}
    book_tree = lxml.html.parse(join(books_dir, href), parser)
    if not 'page_count' in book:
        # locate the table cell mentioning the book title
        td = book_tree.xpath(
            "//td[descendant::*[contains(text(), '{}')]]".format(
                book['title']))
        if len(td):
            td = td[0]
            # then the descendant mentioning the page count ("страниц")
            page_info = td.xpath("descendant::*[contains(text(), 'страниц')]")
            if len(page_info):
                book['page_count'] = patterns[0][1].search(
                    tostring(page_info[0], encoding='unicode')).groups()[0]
    # section-start markers: annotation / contents tables
    block['annotation'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Аннотация')]]")
    block['contents'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Содержание')]]")
    for key in block:
        if len(block[key]):
            mark = block[key][-1]
            book[key] = ""
            # gather sibling markup up to the next table, links and
            # attributes stripped, then tidy the accumulated fragment
            for element in mark.itersiblings():
                if element.tag == "table":
                    break
                drop_a(element)
                remove_attr(element)
                book[key] += tostring(element, encoding='unicode')
            book[key] = tidy_fragment(clean(book[key]))[0]
    return book
def remove_tags(text):
    """Unescape entities, tidy the fragment, then strip every markup tag."""
    from tidylib import tidy_fragment
    import re, html
    unescaped = html.unescape(text)
    tidied, errors = tidy_fragment(unescaped)
    return re.compile(r'<[^>]+>').sub('', tidied)
def test_frag_with_unicode_subclass(self):
    """A subclass of unicode must round-trip through tidy_fragment
    unchanged (Python 2: `unicode` is the text type)."""
    class MyUnicode(unicode):
        pass

    h = MyUnicode(u"unicode string ß")
    expected = h
    doc, err = tidy_fragment(h)
    self.assertEqual(doc, expected)
def test_frag_with_unicode_subclass(self):
    """A subclass of the text type must round-trip through tidy_fragment."""
    class Subclassed(utype):
        pass

    source = Subclassed("unicode string ß")
    result, _ = tidy_fragment(source)
    self.assertEqual(result, source)
def natural_selection(self, generation, gene, eval_place, individual_i):
    """I don't have to be the be the fittest and fastest to survive -- I just have to be fitter and faster than YOU!

    Renders the individual's payload into an evaluation HTML file, scores
    it with tidy (warnings/errors lower the score) and Selenium (a running
    script raises it by ``self.bingo_score``).

    :param generation: current generation; provides ``.genomes``
    :param gene: gene expressed via ``gene_to_str``
    :param eval_place: template slot the payload is injected into
    :param individual_i: index used to name the evaluation HTML file
    :return: (score, 0) on success, or (None, 1) when evaluation aborts
    """
    # NOTE(review): `not x or x is None` is redundant (`not x` already
    # covers None), and `not individual_i` also rejects the legitimate
    # index 0 — confirm whether index 0 should be allowed.
    if not generation or generation is None:
        raise SequencerValidationException("[!] generation is required.")
    if not gene or gene is None:
        raise SequencerValidationException("[!] gene is required.")
    if not eval_place or eval_place is None:
        raise SequencerValidationException("[!] eval_place is required.")
    if not individual_i or individual_i is None:
        raise SequencerValidationException("[!] individual_i is required.")
    sv = SeleniumValidator()
    indiv = gene_to_str(gene, generation.genomes)
    html = self.template.render({eval_place: indiv})
    eval_html_path = os.path.realpath(
        os.path.join(self.html_dir,
                     self.html_file.replace("*", str(individual_i))))
    with open(eval_html_path, "w", encoding="utf-8") as _html:
        _html.write(html)
    # HTML hygiene score: each tidy warning costs 0.2, each error 1.1
    payload, errors = tidy_fragment(html)
    warnings = len(re.findall(r"(Warning)\W", errors))
    errors = len(re.findall(r"(Error)\W", errors))
    if warnings > 0:
        warnings = float(warnings) * -0.2  # -0.1
    if errors > 0:
        errors = float(errors) * -1.1  # -1.0
    else:
        # NOTE(review): zero tidy errors aborts the evaluation with the
        # failure tuple — this looks inverted; confirm intended behavior.
        return None, 1
    int_score = warnings + errors
    # result = test_payload_with_selenium(self.web_driver, str("file://" + eval_html_path))
    result = sv.validate_payload(
        (self.web_driver, str("file://" + eval_html_path)))
    selenium_score = result["score"]
    if result["error"]:
        return None, 1
    if selenium_score > 0:
        print("[*] Found running script: \"{}\" in {}.".format(
            indiv, eval_place))
        int_score += self.bingo_score
        self.result_list.append([eval_place, generation.genomes, indiv])
    return int_score, 0
def addSection(link, title):
    """Build an ez_epub Section for one essay (Python 2).

    Local essays (no 'http' in *link*) are fetched from paulgraham.com and
    parsed with BeautifulSoup; external links are used as raw page text.
    """
    if not 'http' in link:
        page = urllib2.urlopen('http://www.paulgraham.com/' + link).read()
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
    else:
        page = urllib2.urlopen(link).read()
    section = ez_epub.Section()
    try:
        section.title = title
        print section.title
        if not 'http' in link:
            # the essay body lives in a fixed-width <table>; the width
            # differs between page layouts (435 vs 374)
            if len(soup.findAll('table', {'width': '435'})) != 0:
                font = str(
                    soup.findAll('table', {'width': '435'})[0].findAll('font')
                    [0]).strip("<font face=\"verdana\" size=\"2\">")
            elif len(soup.findAll('table', {'width': '374'})) != 0:
                font = str(
                    soup.findAll('table', {'width': '374'})[0].findAll('font')
                    [0]).strip("<font face=\"verdana\" size=\"2\">")
            # skip promo blurbs and implausibly short bodies; otherwise
            # fall back to concatenating all <p> elements
            if not 'Get funded by' in font and not 'Watch how this essay was' in font and not 'Like to build things?' in font and not len(
                    font) < 100:
                content = font
            else:
                content = ''
                for par in soup.findAll('p'):
                    content += str(par)
            # split into paragraphs and tidy each one individually
            for p in content.decode('utf-8').split("<br/><br/>"):
                p, error = tidy_fragment(p)
                if p == '</':
                    continue
                # tidy can mangle anchors into "<xa"; repair them
                if p.__contains__("<xa"):
                    p = p.replace("<xa", "<a")
                section.text.append(genshi.core.Markup(p))
        else:
            for p in str(page).replace("\n", "<br/>").split("<br/><br/>"):
                p, error = tidy_fragment(p)
                if p.__contains__("<xa"):
                    p = p.replace("<xa", "<a")
                section.text.append(genshi.core.Markup(p))
    except Exception, e:
        # NOTE(review): broad catch silences every scraping failure; the
        # section is still created but may be empty — confirm intended.
        print str(e)
        pass
def tidy_html(html):
    """
    Process an input string containing HTML and return a tuple
    (xhtml, errors, warnings) containing the output of tidylib and lists of
    validation errors and warnings.

    Input must be unicode. Output will be valid XHTML.

    Fixes over the previous version: the set of stripped control characters
    is captured *before* they are substituted away (it was computed on the
    already-cleaned string, so it was always empty), and the control-char
    warning is no longer discarded by re-initializing ``warnings`` later.
    """
    if not isinstance(html, unicode):
        raise ValueError("tidyhtml must be called with a Unicode string!")

    warnings = list()

    # First, deal with embedded control codes — record the offending
    # characters before substituting them with spaces:
    stripped_chars = set(ord(i) for i in CONTROL_CHAR_RE.findall(html))
    html, sub_count = CONTROL_CHAR_RE.subn(" ", html)
    if sub_count:
        warnings.append("Stripped %d control characters from body: %s" % (
            sub_count, stripped_chars
        ))

    html, messages = tidylib.tidy_fragment(
        html.strip(),
        {
            "char-encoding": "utf8",
            "clean": False,
            "drop-empty-paras": False,
            "drop-font-tags": True,
            "drop-proprietary-attributes": False,
            "fix-backslash": True,
            "indent": True,
            "output-xhtml": True,
        }
    )

    messages = filter(None, (l.strip() for l in messages.split("\n") if l))

    # postprocess warnings to avoid HTML fragments being reported as lacking
    # doctype and title:
    errors = list()
    for msg in messages:
        if "Warning: missing <!DOCTYPE> declaration" in msg:
            continue
        if "Warning: inserting missing 'title' element" in msg:
            continue
        if "Warning: inserting implicit <body>" in msg:
            continue

        if "Error:" in msg:
            errors.append(msg)
        else:
            warnings.append(msg)

    return html, errors, warnings
def object_for_typepad_object(tp_obj):
    """Map a TypePad asset to a local Object, creating one if needed.

    Returns (really_a_share, obj).  Python 2.
    NOTE(review): as shown, the function ends inside the reblog_of_url
    branch with no final return — this block may be truncated; confirm
    against the full source.
    """
    try:
        obj = Object.objects.get(service='typepad.com',
                                 foreign_id=tp_obj.url_id)
    except Object.DoesNotExist:
        pass
    else:
        log.debug("Reusing typepad object %r for asset %s", obj,
                  tp_obj.url_id)
        return False, obj
    log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id,
              tp_obj.author.display_name)
    author = account_for_typepad_user(tp_obj.author)
    body = tp_obj.rendered_content
    if not body and tp_obj.content:
        if tp_obj.text_format == 'html_convert_linebreaks':
            # wrap each blank-line-separated chunk in a paragraph
            body = '\n\n'.join(u'<p>%s</p>' % t
                               for t in tp_obj.content.split('\n\n'))
        else:
            body = tp_obj.content
    if body:
        body, errors = tidy_fragment(body)
    else:
        body = ''
    obj = Object(
        service='typepad.com',
        foreign_id=tp_obj.url_id,
        render_mode='mixed',
        title=tp_obj.title,
        body=body,
        time=tp_obj.published,
        permalink_url=tp_obj.permalink_url,
        author=author,
    )
    if getattr(tp_obj, 'in_reply_to', None) is not None:
        # This post is in reply, so we don't care if our referent was
        # really a share. Be transitively in reply to the shared obj.
        really_a_share, obj.in_reply_to = object_for_typepad_object(tp_obj.in_reply_to)
    elif getattr(tp_obj, 'reblog_of', None) is not None:
        # Assets are public so it's okay if we use an anonymous typd here.
        t = typd.TypePad(endpoint='http://api.typepad.com/')
        reblog_of = t.assets.get(tp_obj.reblog_of.url_id)
        really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
        remove_reblog_boilerplate_from_obj(obj)
        if not obj.body:
            return True, obj.in_reply_to
    elif getattr(tp_obj, 'reblog_of_url', None) is not None:
        reblog_url = tp_obj.reblog_of_url
        try:
            in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
        except leapfrog.poll.embedlam.RequestError, exc:
            in_reply_to = None
        except ValueError, exc:
            in_reply_to = None
            log.error("Error making object from referent %s of %s's post %s",
                      reblog_url, author.display_name, tp_obj.url_id)
            log.exception(exc)
def html(self, string):
    """Parses HTML"""
    # HTML must be explicitly enabled in the configuration
    if "allow_html" not in INGIniousConfiguration or INGIniousConfiguration["allow_html"] == False:
        raise Exception("HTML is not allowed")
    if INGIniousConfiguration["allow_html"] == "tidy":
        import tidylib
        tidied, _ = tidylib.tidy_fragment(string)
        return tidied
    return string
def cleanupText(text):
    """This method cleans up the text of the report using libtidy"""
    # tidylib configuration
    tidy_opts = dict(
        output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0,
        char_encoding="utf8", quote_nbsp=0)
    # decode html entities, then tidy the utf8-encoded text
    unescaped = unescape(text)
    tidied, _ = tidy_fragment(unescaped.encode("utf8"), tidy_opts,
                              keep_doc=False)
    # tidylib returns a <tidy.lib._Document object>; stringify it
    return str(tidied)
def html2xhtml(html, **options):
    """Run tidy over *html* with XML output forced; raise on any tidy errors."""
    options.update(doctype='omit', show_warnings=0, indent=0, output_xml=1)
    document, errors = tidy_fragment(html, options=options)
    if errors:
        raise Exception(
            "Errors while processing %s\n==========\n%s" % (html, errors))
    return document
def fix_open_tags(source):
    """
    Fixes missing tags in html fragments.
    """
    if not source:
        return source
    fixed, errors = tidy_fragment(source)
    # in DEBUG mode, surface whatever tidylib complained about
    if settings.DEBUG and errors:
        relevant = filter_tidylib_errors(errors)
        if relevant:
            log.debug('Tidylib errors:\n{}'.format(relevant))
    return fixed
def normalize(text):
    """Normalize whitespace for a string of html using tidylib."""
    tidy_options = {
        'drop_empty_paras': 0,
        'fix_backslash': 0,
        'fix_bad_comments': 0,
        'fix_uri': 0,
        'join_styles': 0,
        'lower_literals': 0,
        'merge_divs': 0,
        'output_xhtml': 1,
        'quote_ampersand': 0,
        'newline': 'LF',
    }
    result, _ = tidylib.tidy_fragment(text, options=tidy_options)
    return result
def POST(self):
    """ POST request

    EDX external-grader endpoint: decode the xqueue JSON envelope, run the
    submission through the synchronous job manager, and answer with the
    EDX grading JSON ({"correct", "score", "msg"}).  Every failure path
    returns a well-formed grading response rather than raising.
    """
    web.header('Content-Type', 'application/json')
    post_input = web.data()
    try:
        decoded_input = json.loads(post_input)
    except:
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: cannot decode POST</p>"})
    if "xqueue_body" not in decoded_input:
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: no xqueue_body in POST</p>"})
    try:
        # the xqueue body is itself JSON; grader_payload holds the task id
        edx_input = json.loads(decoded_input["xqueue_body"])
        taskid = json.loads(edx_input["grader_payload"])["tid"]
    except:
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: cannot decode JSON</p>"})
    try:
        task = course.get_task(taskid)
    except:
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: unknown task {}</p>".format(taskid)})
    if not task.input_is_consistent(edx_input):
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: input not consistent with task</p>"})
    try:
        job_return = job_manager_sync.new_job(task, edx_input, "Plugin - EDX")
    except:
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: error while grading submission</p>"})
    try:
        # assemble feedback text: overall text plus per-problem feedback
        text = ""
        if "text" in job_return:
            text = job_return["text"]
        if "problems" in job_return:
            for prob in job_return["problems"]:
                text += "<br/><h4>" + job_return["task"].get_problems()[prob].get_name() + "</h4>" + job_return["problems"][prob]
        score = (1 if job_return["result"] == "success" else 0)
        if "score" in job_return:
            score = job_return["score"]
        # tidy the feedback into XHTML so EDX renders it safely
        import tidylib
        out, dummy = tidylib.tidy_fragment(text, options={'output-xhtml': 1,
                                                          'enclose-block-text': 1,
                                                          'enclose-text': 1})
        return json.dumps({"correct": (True if (job_return["result"] == "success") else None),
                           "score": score,
                           "msg": out})
    except:
        return json.dumps({"correct": None, "score": 0,
                           "msg": "<p>Internal grader error: error converting submission result</p>"})
def cleanupText(text):
    """This method cleans up the text of the report using libtidy"""
    # configuration handed to tidylib
    options = dict(output_xhtml=1,
                   add_xml_decl=1,
                   indent=1,
                   tidy_mark=0,
                   char_encoding="utf8",
                   quote_nbsp=0)
    # strip html entities from the text, then tidy the utf8 bytes
    plain = unescape(text)
    cleaned, _ = tidy_fragment(plain.encode("utf8"), options, keep_doc=False)
    # tidylib returns a <tidy.lib._Document object>
    return str(cleaned)
def html2xhtml(html, **options):
    """Tidy *html* into XHTML and return it stripped; raise if tidy
    reports any errors."""
    options.update(doctype='omit', show_warnings=0, indent=0, output_xhtml=1)
    document, errors = tidy_fragment(html, options=options)
    if errors:
        raise Exception(
            "Errors while processing %s\n==========\n%s" % (html, errors))
    return document.strip()
def mytidy(content):
    """Tidy *content* with the house formatting options and return the markup."""
    tidy_options = {
        "output-xhtml": 0,       # XHTML instead of HTML4
        "indent": 1,             # Pretty; not too much of a performance hit
        "indent-spaces": 4,
        "tab-size": 4,
        "tidy-mark": 0,          # No tidy meta tag in output
        "wrap": 0,               # No wrapping
        "alt-text": "",          # Help ensure validation
        "doctype": 'strict',     # Little sense in transitional for tool-generated markup...
        "force-output": 1,       # May not get what you expect but you will get something
        "char-encoding": 'utf8',
        "input-encoding": 'utf8',
        "output-encoding": 'utf8',
    }
    return tidy_fragment(content, tidy_options)[0]
def tidy(soup):
    """Run the soup through HTMLTidy and post-process its spacing."""
    markup, messages = tidy_fragment(str(soup), options={
        'indent': 'auto',
        'logical-emphasis': 'yes',
        'vertical-space': 'yes',
        'fix-uri': 'no'
    })
    # Tidy inserts too much vertical space: collapse triple newlines and
    # pull <ul> back up against the preceding block.
    html = markup.replace('\n\n\n', '\n\n')
    html = re.sub(r'\n\n(\s*)<ul>', r'\n\1<ul>', html)
    # surface Tidy's error messages
    print(messages)
    return html
def __init__(self, op_html):
    """ Intializes this option with HTML. The HTML is validated before
    initializing the option. The input HTML should be a snippet and not
    contain the `html`, `head`, `title`, nor `body` tags.

    Throws an HTMLValidationException if the validation produces errors.

    :param op_html: The string representation of the option HTML.
    :return:
    """
    # wrap the snippet in a minimal valid document so tidy only reports
    # problems with the snippet itself
    document, errors = tidy_fragment("<!DOCTYPE html><html><head><title></title><body>%s</body></html>" % op_html)
    # python is stupid
    # NOTE(review): `errors` is tidy's message string; the > 1 threshold
    # presumably tolerates a lone trailing newline — confirm intended.
    if len(errors) > 1:
        print errors
        raise HTMLValidationException()
    else:
        Option.__init__(self, op_html)
def normalize(text):
    """Normalize whitespace for a string of html using tidylib."""
    output, _ = tidylib.tidy_fragment(text, options={
        "drop_empty_paras": 0, "fix_backslash": 0, "fix_bad_comments": 0,
        "fix_uri": 0, "join_styles": 0, "lower_literals": 0, "merge_divs": 0,
        "output_xhtml": 1, "quote_ampersand": 0, "newline": "LF"})
    return output
def on_message(message):
    """Persist one channel chat message, HTML-tidied, to the database."""
    try:
        channel_id = message.channel_id.pop()
    except IndexError:
        # No channel_id means it's a private message, skip for now
        return
    channel = mumble_client.channels[channel_id]
    sender = mumble_client.users[message.actor]
    linted_text, _ = tidy_fragment(message.message)
    record = Message(user_name=sender['name'],
                     channel_name=channel['name'],
                     message=linted_text,
                     timestamp=datetime.datetime.now())
    session.add(record)
    session.commit()
def process_single_node(node, context, typ, src):
    """Render one report-template node in place according to its type.

    :param node: template element (mutated in place)
    :param context: object values are resolved against
    :param typ: node type -- "text", "html", "link" or "src"
    :param src: source expression used to look up the value
    """
    if typ == "text":
        node.text = value_repr(get_element_value(context, src))
    elif typ == "html":
        raw_value = get_element_value(context, src, "")
        if raw_value:
            # tidy the markup and wrap it in a <div> so it parses as a
            # single element, then graft it next to (and into) the node
            html_element = html.fragments_fromstring(
                "<div>%s</div>" % tidy_fragment(raw_value)[0])[0]
            for (key, value) in node.attrib.iteritems():
                html_element.attrib[key] = value
            node.addnext(html_element)
            node.insert(0, html_element)
    # !+ANCHOR(mr, sep-2014) this should be type "anchor" not "link" !!
    # There is another HTML element "link" that is something else altogether...
    # e.g. <link rel="stylesheet" type="text/css" href="/browserref.css">
    elif typ == "link":
        src_url = get_attr(node, "url")
        if src_url:
            link_url = get_element_value(context, src_url)
        else:
            link_url = url.absoluteURL(context, request)
        node.attrib["href"] = link_url
        if src:
            node.text = get_element_value(context, src)
    # For outputting elements that have an @src attribute to an external
    # resource, such as <img>, <script>, ... resolves @src to the
    # designated resource base url, as per configuration. Any additional
    # attrs needed to be output are specified verbatim in the template.
    elif typ == "src":
        src_url = get_attr(node, "src")
        assert src_url is not None, \
            "Node %s attribute %r is invalid. Check report template." % (
                node, typ)
        parsed_url = urlparse.urlparse(src_url)
        if not parsed_url.path.startswith("/"):
            node.attrib["src"] = urlparse.urljoin(
                "/@@/reporting-static/", src_url)
        else:
            # absolute or external, pass on as is
            node.attrib["src"] = src_url
    clean_element(node)
def process_single_node(node, context, typ, src):
    """Render one report-template node in place according to its type
    (duplicate variant of the same routine).

    :param node: template element (mutated in place)
    :param context: object values are resolved against
    :param typ: node type -- "text", "html", "link" or "src"
    :param src: source expression used to look up the value
    """
    if typ == "text":
        node.text = value_repr(get_element_value(context, src))
    elif typ == "html":
        raw_value = get_element_value(context, src, "")
        if raw_value:
            # tidy the markup, wrap in <div> for a single parse root,
            # copy the node's attributes onto it, and graft it in
            html_element = html.fragments_fromstring("<div>%s</div>" %
                tidy_fragment(raw_value)[0]
            )[0]
            for (key, value) in node.attrib.iteritems():
                html_element.attrib[key] = value
            node.addnext(html_element)
            node.insert(0, html_element)
    # !+ANCHOR(mr, sep-2014) this should be type "anchor" not "link" !!
    # There is another HTML element "link" that is something else altogether...
    # e.g. <link rel="stylesheet" type="text/css" href="/browserref.css">
    elif typ == "link":
        src_url = get_attr(node, "url")
        if src_url:
            link_url = get_element_value(context, src_url)
        else:
            link_url = url.absoluteURL(context, request)
        node.attrib["href"] = link_url
        if src:
            node.text = get_element_value(context, src)
    # For outputting elements that have an @src attribute to an external
    # resource, such as <img>, <script>, ... resolves @src to the
    # designated resource base url, as per configuration. Any additional
    # attrs needed to be output are specified verbatim in the template.
    elif typ == "src":
        src_url = get_attr(node, "src")
        assert src_url is not None, \
            "Node %s attribute %r is invalid. Check report template." % (
                node, typ)
        parsed_url = urlparse.urlparse(src_url)
        if not parsed_url.path.startswith("/"):
            node.attrib["src"] = urlparse.urljoin("/@@/reporting-static/",
                src_url)
        else:
            # absolute or external, pass on as is
            node.attrib["src"] = src_url
    clean_element(node)
def process_single_node(node, context, typ, src):
    """Render one report-template node in place according to its type.

    :param node: template element (mutated in place)
    :param context: object values are resolved against
    :param typ: node type -- "text", "html" or "link"
    :param src: source expression used to look up the value
    """
    clean_element(node)
    if typ == "text":
        node.text = value_repr(get_element_value(context, src))
    elif typ == "html":
        raw_value = get_element_value(context, src, "")
        if raw_value:
            # tidy the markup, wrap it in a <div> so it parses as a single
            # element, copy the node's attributes onto it, and graft it in
            html_element = html.fragments_fromstring("<div>%s</div>" %
                tidy_fragment(raw_value)[0])[0]
            for (key, value) in node.attrib.iteritems():
                html_element.attrib[key] = value
            node.addnext(html_element)
            node.insert(0, html_element)
    elif typ == "link":
        # BUG FIX: was `type == "link"` — comparing the builtin `type` to a
        # string, which is always False, so the link branch never executed.
        url_src = get_attr(node, "url")
        if url_src:
            link_url = get_element_value(context, url_src)
        else:
            link_url = url.absoluteURL(context, request)
        node.attrib["href"] = link_url
        if src:
            node.text = get_element_value(context, src)
def clean(self): cleaned_data = super().clean() # clean HTML in some fields for field in ['shortdescr', 'yandexdescr', 'descr', 'spec', 'manuals', 'state', 'complect', 'stitches', 'dealertxt', 'sm_display', 'sm_software']: value = cleaned_data.get(field) if not value: continue fragment, errors = tidy_fragment(value, options={'indent': 0}) if not fragment: self.add_error(field, forms.ValidationError("Ошибка очистки HTML")) continue cleaned_data[field] = fragment code = cleaned_data.get('code') reg = re.compile(r'[-\.\w]+') # test for code presence is required for mass edit if code and not reg.fullmatch(code): self.add_error('code', forms.ValidationError("Код товара содержит недопустимые символы")) # detect import lock - do not allow save during import if cache.get("celery-single-instance-import1c") is not None: self.add_error(None, forms.ValidationError("Сохранение невозможно во время импорта склада, попробуйте позже.")) return cleaned_data
def process_single_node(node, context, typ, src):
    """Render one report-template node in place according to its type
    (etree variant).

    :param node: template element (mutated in place)
    :param context: object values are resolved against
    :param typ: node type -- "text", "html" or "link"
    :param src: source expression used to look up the value
    """
    clean_element(node)
    if typ == "text":
        node.text = get_element_value(context, src)
    elif typ == "html":
        raw_value = get_element_value(context, src, "")
        if raw_value:
            # tidy the markup and wrap it in a <div> so it parses as a
            # single element, then graft it in next to the node
            html_element = etree.fromstring(
                "<div>%s</div>" % tidy_fragment(raw_value)[0])
            for (key, value) in node.attrib.iteritems():
                html_element.attrib[key] = value
            node.addnext(html_element)
            node.insert(0, html_element)
    elif typ == "link":
        # BUG FIX: was `type == "link"` — comparing the builtin `type` to a
        # string, which is always False, so the link branch never executed.
        url_src = get_attr(node, "url")
        if url_src:
            link_url = get_element_value(context, url_src)
        else:
            link_url = url.absoluteURL(context, request)
        node.attrib["href"] = link_url
        if src:
            node.text = get_element_value(context, src)
def cmd_tidy(root, **kwargs):
    """Serialize *root*, tidy it with conservative defaults (overridable
    via **kwargs), and parse the tidied markup back into an element."""
    default_options = {
        'clean': 0, 'drop-empty-elements': 0, 'drop-empty-paras': 0,
        'drop-proprietary-attributes': 1, 'logical-emphasis': 0,
        'merge-divs': 0, 'merge-spans': 0, 'anchor-as-name': 1,
        'coerce-endtags': 1, 'custom-tags': 'blocklevel',
        'enclose-block-text': 0, 'enclose-text': 0, 'escape-scripts': 1,
        'fix-backslash': 1, 'fix-style-tags': 1, 'fix-uri': 1,
        'literal-attributes': 0, 'uppercase-attributes': 0,
        'uppercase-tags': 0, 'hide-comments': 1, 'join-classes': 1,
        'join-styles': 1, 'merge-emphasis': 0, 'replace-color': 0,
        'break-before-br': 0, 'indent': 0, 'indent-attributes': 0,
        'keep-tabs': 0, 'omit-optional-tags': 0, 'tidy-mark': 0,
        'vertical-space': 0,
    }
    # caller-supplied options win over the defaults
    merged = dict(default_options, **kwargs)
    tidied_markup, _ = tidy_fragment(tostring(root), options=merged)
    return fromstring(tidied_markup)
def handle(self, *args, **options):
    """Management command: re-tidy the HTML fields of every Product and
    save only the products whose content actually changed."""
    num = 0
    for product in Product.objects.all():
        changed = False
        for field in [
                'shortdescr', 'yandexdescr', 'descr', 'spec', 'state',
                'complect', 'stitches', 'sm_display', 'sm_software'
        ]:
            value = product.__dict__[field]
            if not value:
                continue
            fragment, errors = tidy_fragment(value, options={'indent': 0})
            if not fragment:
                # tidy returned nothing — log the product/field and skip
                self.stdout.write('{}: {}'.format(str(product), field))
                continue
            # compare md5 digests to detect whether tidying changed anything
            if md5(value.encode('utf-8')).hexdigest() != md5(
                    fragment.encode('utf-8')).hexdigest():
                product.__dict__[field] = fragment
                changed = True
        if changed:
            product.save()
            num = num + 1
    self.stdout.write('Successfully updated %d products' % num)
def tidy(fragment):
    """Return the tidied form of *fragment*, discarding tidy's messages."""
    result, _ = tidy_fragment(fragment)
    return result
def render_get_caught_up():
    ''' Render the prose for the get-caught-up info box

    The Google Sheet that powers this will be regularly re-downloaded
    '''
    copy = copytext.Copy(app_config.CALENDAR_PATH)
    sheet = copy['get_caught_up']
    serialized_data = json.loads(sheet.json())

    is_valid = True
    markup_fields = [
        'intro_1', 'intro_2', 'bullet_1', 'bullet_2', 'bullet_3', 'bullet_4',
        'bullet_5'
    ]
    markup_errors_found = None

    # Note that despite its name, tidy_fragment() requires a valid html document or else
    # it will throw markup validation errors. The documentation at http://countergram.github.io/pytidylib/
    # did not address this seeming discrepancy.
    for field in markup_fields:
        # wrap each field in a minimal document so only real markup
        # problems in the field itself are reported
        document, errors = tidy_fragment(
            '<!DOCTYPE html><html><head><title>test</title></head><body>%s</body></html>'
            % serialized_data[field])
        if errors:
            is_valid = False
            markup_errors_found = errors
            break

    # Don't publish if that option is off, or if a syntax error is found
    if serialized_data.get('published', '').lower() == 'yes' and is_valid:
        meta = {
            'is_valid_markup': is_valid,
            'published': serialized_data['published'],
            'last_updated': datetime.utcnow()
        }
        content = {
            k: v.strip()
            for k, v in serialized_data.items() if k in markup_fields
        }
        _write_json_file({
            'meta': meta,
            'content': content
        }, 'get-caught-up.json')

    # Publish a debug version to help editors gauge length of content
    # If there are no markup errors and `published` is `True`, the contents
    # of this file will be identical to that of the main GCU file
    meta = {
        'is_valid_markup': is_valid,
        'published': serialized_data['published'],
        'last_updated': datetime.utcnow()
    }
    content = {
        k: v.strip()
        for k, v in serialized_data.items() if k in markup_fields
    } if is_valid else "The HTML markup is invalid. Errors:\n{}".format(
        markup_errors_found)
    _write_json_file({
        'meta': meta,
        'content': content
    }, 'get-caught-up-debug.json')
# Build an Evernote ENML note (a <note> wrapper with the body embedded in a
# CDATA <en-note> document) from one feed entry `s`, tag the subject with the
# notebook name, and print the assembled note.  Python 2 (str/bytes mixing).
subject = subject + " @" + notebook
msg_body = ""
msg_body = msg_body + '<note><title>'+subject+'</title><content><![CDATA[<?xml version="1.0" encoding="'+char_encoding+'" standalone="no"?> <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"> <en-note>'
msg_url = ""
# NOTE(review): these are two independent `if`s, so an 'alternate' link
# silently overwrites a 'canonical' one when both exist — confirm intended.
if 'canonical' in s.keys():
    d = s["canonical"][0]
    msg_url = d["href"].encode(char_encoding, 'replace')
if 'alternate' in s.keys():
    d = s["alternate"][0]
    msg_url = d["href"].encode(char_encoding, 'replace')
# append the tidied summary and/or content bodies
if 'summary' in s.keys():
    d = s["summary"]
    dirtyHtml = d["content"]
    cleanHtml, errors = tidy_fragment(dirtyHtml)
    msg_body = msg_body + cleanHtml.encode(char_encoding, 'replace')
if 'content' in s.keys():
    d = s["content"]
    dirtyHtml = d["content"]
    cleanHtml, errors = tidy_fragment(dirtyHtml)
    msg_body = msg_body + cleanHtml.encode(char_encoding, 'replace')
msg_body = msg_body + "</en-note>]]>\r\n</content>\r\n"
# optional created/updated timestamps
if published_datetime:
    msg_body = msg_body + "<created>" + published_datetime + "</created>"
if updated_datetime:
    msg_body = msg_body + "<updated>" + updated_datetime + "</updated>"
msg_body = msg_body + "<note-attributes><source>web.clip</source><source-url>" + escape(msg_url) + "</source-url></note-attributes>"
msg_body = msg_body + "</note>\r\n"
print(msg_body)
def object_for_typepad_object(tp_obj):
    """Map a TypePad asset to a local Object, creating one if needed
    (duplicate variant).  Returns (really_a_share, obj).  Python 2.

    NOTE(review): as shown, the function ends inside the reblog_of_url
    branch with no final return — this block may be truncated; confirm
    against the full source.
    """
    try:
        obj = Object.objects.get(service='typepad.com',
                                 foreign_id=tp_obj.url_id)
    except Object.DoesNotExist:
        pass
    else:
        log.debug("Reusing typepad object %r for asset %s", obj,
                  tp_obj.url_id)
        return False, obj
    log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id,
              tp_obj.author.display_name)
    author = account_for_typepad_user(tp_obj.author)
    body = tp_obj.rendered_content
    if not body and tp_obj.content:
        if tp_obj.text_format == 'html_convert_linebreaks':
            # wrap each blank-line-separated chunk in a paragraph
            body = '\n\n'.join(u'<p>%s</p>' % t
                               for t in tp_obj.content.split('\n\n'))
        else:
            body = tp_obj.content
    if body:
        body, errors = tidy_fragment(body)
    else:
        body = ''
    obj = Object(
        service='typepad.com',
        foreign_id=tp_obj.url_id,
        render_mode='mixed',
        title=tp_obj.title,
        body=body,
        time=tp_obj.published,
        permalink_url=tp_obj.permalink_url,
        author=author,
    )
    if getattr(tp_obj, 'in_reply_to', None) is not None:
        # This post is in reply, so we don't care if our referent was
        # really a share. Be transitively in reply to the shared obj.
        really_a_share, obj.in_reply_to = object_for_typepad_object(
            tp_obj.in_reply_to)
    elif getattr(tp_obj, 'reblog_of', None) is not None:
        # Assets are public so it's okay if we use an anonymous typd here.
        t = typd.TypePad(endpoint='http://api.typepad.com/')
        reblog_of = t.assets.get(tp_obj.reblog_of.url_id)
        really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
        remove_reblog_boilerplate_from_obj(obj)
        if not obj.body:
            return True, obj.in_reply_to
    elif getattr(tp_obj, 'reblog_of_url', None) is not None:
        reblog_url = tp_obj.reblog_of_url
        try:
            in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
        except leapfrog.poll.embedlam.RequestError, exc:
            in_reply_to = None
        except ValueError, exc:
            in_reply_to = None
            log.error("Error making object from referent %s of %s's post %s",
                      reblog_url, author.display_name, tp_obj.url_id)
            log.exception(exc)
def check_html(text):
    """Return 1 if tidy reports no missing end tags in *text*, else 0.

    The fragment is tidied with numeric entities enabled; any
    "missing </..." line in tidy's error report marks the input as broken.
    The previous version ran ``re.search`` per line on a pattern with no
    metacharacters — a plain substring test is equivalent and simpler.
    """
    document, err = tidy_fragment(text, options={'numeric-entities': 1})
    for line in err.split("\n"):
        if "missing </" in line:
            return 0
    return 1
def html(cls, string, show_everything=False):
    """Parses HTML"""
    tidied, _ = tidylib.tidy_fragment(string)
    return tidied
def test_frag_with_incomplete_img_tag(self):
    """tidy adds the required alt attribute and self-closes the img tag."""
    result, _ = tidy_fragment("<img src='foo'>")
    self.assertEqual(result, '''<img src='foo' alt="" />''')
def tidy_html(html):
    """Tidy *html* without indentation and return the cleaned markup."""
    cleaned, _ = tidylib.tidy_fragment(html, options={'indent': 0})
    return cleaned
def test_frag_with_unicode(self):
    """A non-ASCII string passes through tidy_fragment unchanged."""
    source = "unicode string ß"
    result, _ = tidy_fragment(source)
    self.assertEqual(result, source)
def test_tidy_fragment(self):
    """Repeated tidy calls must not leak error-sink objects
    (Python 2: xrange)."""
    h = "<p>hello"
    for i in xrange(100):
        doc, err = tidy_fragment(h)
    # the module-level sink registry must be empty after every call
    self.assertEqual(sink.sinks, {})
def html(cls, string):
    """Parses HTML"""
    fragment, _ = tidylib.tidy_fragment(string)
    return fragment
def test_tidy_fragment(self):
    """Repeated tidy calls must not leak entries in the sink registry."""
    fragment = "<p>hello"
    for _ in range(100):
        doc, err = tidy_fragment(fragment)
    self.assertEqual(sink.sinks, {})
"""Tidy every *.<ext> file in the current directory in place.

The extension defaults to "html" and may be overridden by the first
command-line argument.  Fixes over the previous version: the bare
``except`` now catches only the missing-argument IndexError, both file
handles are managed with ``with`` (the output file was previously opened
and never closed), and the loop no longer shadows its own variables.
"""
import glob
import sys

from tidylib import tidy_document, tidy_fragment

options = {
    "indent": "auto",
    "indent-spaces": 4,
    "markup": True,
    "output-xml": False,
    "input-xml": False,
    "show-warnings": True,
    "numeric-entities": True,
    "quote-marks": True,
    "quote-nbsp": True,
    "quote-ampersand": False,
    "break-before-br": False,
    "uppercase-tags": False,
    "uppercase-attributes": False,
}

try:
    file_type = sys.argv[1]
except IndexError:
    file_type = "html"

for path in glob.glob("*." + file_type):
    with open(path) as source:
        tidied, errors = tidy_fragment(source.read(), options)
    with open(path, "w") as target:
        target.write(tidied)
def test_frag_with_unicode(self):
    """A unicode literal survives tidy_fragment unchanged."""
    source = u"unicode string ß"
    result, _ = tidy_fragment(source)
    self.assertEqual(result, source)
def clean_html_fragment(self, body):
    """Return *body* tidied as strict-doctype XHTML."""
    tidied, errors = tidy_fragment(body,
                                   options={"output-xhtml": 1,
                                            "doctype": 'strict'})
    return tidied