def _process_html_tree(elt):
    """Convert one parsed element, and its subtree, into safe_dom nodes.

    If the element's tag has an entry in ``tag_bindings``, the bound tag class
    is instantiated and asked to render the element (with ``handler``) before
    it is converted. Rendering happens inside the ``try`` block so that a
    failing custom tag is reported in place, exactly like any other conversion
    error, instead of aborting the whole conversion.

    Args:
        elt: a parsed element exposing ``tag``, ``attrib``, ``text``, ``tail``
            and iteration over its children.

    Returns:
        A safe_dom.NodeList holding the converted element (or, on any error,
        a ``span`` with className ``gcb-error-tag`` containing
        INVALID_HTML_TAG_MESSAGE), followed by a safe_dom.Text node for the
        original element's tail text when that text is non-empty.
    """
    node_list = safe_dom.NodeList()

    # Read the tail before ``elt`` can be rebound to the rendered custom tag;
    # the text following the tag in the source belongs to the original node.
    tail = elt.tail

    try:
        if elt.tag in tag_bindings:
            elt = tag_bindings[elt.tag]().render(elt, handler)

        if elt.tag.lower() == 'script':
            out_elt = safe_dom.ScriptElement()
        else:
            out_elt = safe_dom.Element(elt.tag)
        out_elt.add_attribute(**elt.attrib)
        if elt.text:
            out_elt.add_text(elt.text)
        for child in elt:
            out_elt.add_children(_process_html_tree(child))
    except Exception as e:  # pylint: disable-msg=broad-except
        # Best effort: log the problem and substitute a visible error marker
        # so the rest of the document still renders.
        logging.error('Invalid HTML tag: %s. %s', elt, e)
        out_elt = safe_dom.Element('span')
        out_elt.add_attribute(className='gcb-error-tag')
        out_elt.add_text(INVALID_HTML_TAG_MESSAGE)

    node_list.append(out_elt)
    if tail:
        node_list.append(safe_dom.Text(tail))
    return node_list
def _process_html_tree(elt):
    """Recursively convert ``elt`` and its children to a safe_dom.NodeList.

    An element whose ``instanceid`` attribute was already seen yields an error
    node list immediately. Otherwise the element is optionally rendered
    through its bound custom tag, converted to the matching safe_dom node, and
    returned together with its tail text. Any exception raised along the way
    is logged and turned into an error node list for the original element.
    """
    # A repeated instanceid short-circuits into an error; a new one is
    # remembered so that later duplicates can be detected.
    attributes = elt.attrib
    if 'instanceid' in attributes:
        instance_id = attributes['instanceid']
        if instance_id in used_instance_ids:
            return _generate_error_message_node_list(
                elt, DUPLICATE_INSTANCE_ID_MESSAGE)
        used_instance_ids.add(instance_id)

    # ``elt`` may be rebound to the rendered custom tag below; the tail text
    # and any error report must still refer to the incoming element.
    original_elt = elt
    try:
        if render_custom_tags and elt.tag in tag_bindings:
            bound_name = elt.tag
            tag = tag_bindings[bound_name]()
            if not isinstance(tag, ContextAwareTag):
                # Plain tags are rendered with the handler directly.
                elt = tag.render(elt, handler)
            else:
                # Context-aware tags share one context per tag type; it is
                # created lazily the first time that tag type is seen.
                context = tag_contexts.get(bound_name)
                if context is None:
                    context = ContextAwareTag.Context(handler, {})
                    tag_contexts[bound_name] = context
                elt = tag.render(elt, context)

        # Choose the safe_dom node type matching the (possibly rendered) tag.
        if elt.tag == cElementTree.Comment:
            out_elt = safe_dom.Comment()
        elif elt.tag.lower() == 'script':
            out_elt = safe_dom.ScriptElement()
        else:
            out_elt = safe_dom.Element(_remove_namespace(elt.tag))

        out_elt.add_attribute(**elt.attrib)
        if elt.text:
            out_elt.add_text(elt.text)
        for child in elt:
            out_elt.add_children(_process_html_tree(child))

        converted = safe_dom.NodeList()
        converted.append(out_elt)
        trailing_text = original_elt.tail
        if trailing_text:
            converted.append(safe_dom.Text(trailing_text))
        return converted

    except Exception as e:  # pylint: disable=broad-except
        logging.exception('Error handling tag: %s', elt.tag)
        message = '%s: %s' % (INVALID_HTML_TAG_MESSAGE, e)
        return _generate_error_message_node_list(original_elt, message)
def _generate_error_message_node_list(elt, error_message):
    """Build the node list shown in place of an element that failed.

    The failure is logged with the element's tag and attributes. The returned
    safe_dom.NodeList contains a ``span`` (className ``gcb-error-tag``) with
    ``error_message`` as its text, followed by the element's tail text when
    that text is non-empty.
    """
    attributes = dict(**elt.attrib)
    logging.error('[%s, %s]: %s.', elt.tag, attributes, error_message)

    error_nodes = safe_dom.NodeList()
    error_span = safe_dom.Element('span', className='gcb-error-tag')
    # The value returned by add_text() is what gets appended.
    error_nodes.append(error_span.add_text(error_message))

    # Preserve the text that followed the failing element in the source.
    if elt.tail:
        error_nodes.append(safe_dom.Text(elt.tail))
    return error_nodes
def _process_html_tree(elt):
    """Convert ``elt`` and its descendants into a safe_dom.NodeList.

    An element whose tag is present in ``tag_bindings`` is first replaced by
    the result of rendering it through the bound tag class. The returned list
    holds the converted element followed by a Text node for the original
    element's tail, when that tail is non-empty.
    """
    result = safe_dom.NodeList()

    # Read the tail up front: rendering may rebind ``elt`` to another element.
    trailing_text = elt.tail

    if elt.tag in tag_bindings:
        renderer = tag_bindings[elt.tag]()
        elt = renderer.render(elt)

    converted = safe_dom.Element(elt.tag)
    converted.add_attribute(**elt.attrib)
    if elt.text:
        converted.add_text(elt.text)
    for child in elt:
        child_nodes = _process_html_tree(child)
        converted.add_children(child_nodes)

    result.append(converted)
    if trailing_text:
        result.append(safe_dom.Text(trailing_text))
    return result
def html_to_safe_dom(html_string):
    """Parse ``html_string`` and return it as a safe_dom.NodeList.

    The string is wrapped in a ``div`` and parsed as a fragment with html5lib;
    the wrapper's leading text and each of its children are converted in
    turn. Elements whose tag is present in the tag bindings are rendered
    through the bound tag class first. An empty or missing string yields an
    empty NodeList.
    """
    tag_bindings = get_tag_bindings()

    result = safe_dom.NodeList()
    if not html_string:
        return result

    def _process_html_tree(elt):
        """Convert one element, its subtree and its tail text."""
        converted_nodes = safe_dom.NodeList()

        # Read the tail before ``elt`` can be rebound to a rendered tag.
        trailing_text = elt.tail

        if elt.tag in tag_bindings:
            renderer = tag_bindings[elt.tag]()
            elt = renderer.render(elt)

        # Script tags get their dedicated safe_dom element type.
        if elt.tag.lower() == 'script':
            out_elt = safe_dom.ScriptElement()
        else:
            out_elt = safe_dom.Element(elt.tag)
        out_elt.add_attribute(**elt.attrib)

        if elt.text:
            out_elt.add_text(elt.text)
        for child in elt:
            out_elt.add_children(_process_html_tree(child))

        converted_nodes.append(out_elt)
        if trailing_text:
            converted_nodes.append(safe_dom.Text(trailing_text))
        return converted_nodes

    tree_builder = html5lib.treebuilders.getTreeBuilder('etree', cElementTree)
    parser = html5lib.HTMLParser(
        tree=tree_builder, namespaceHTMLElements=False)

    # Wrap the input in a single div so that the first item of the parsed
    # fragment is one root element holding all of the content.
    wrapped_html = '<div>%s</div>' % html_string
    root = parser.parseFragment(wrapped_html)[0]

    if root.text:
        result.append(safe_dom.Text(root.text))
    for top_level_elt in root:
        result.append(_process_html_tree(top_level_elt))

    return result
def _process_html_tree(elt, used_instance_ids):
    """Convert ``elt`` and its subtree, rejecting duplicate instance ids.

    ``used_instance_ids`` is a set that is updated in place with every new
    ``instanceid`` attribute encountered; meeting one a second time produces
    an error node list instead of the element. Any exception raised while
    rendering or converting the element is likewise turned into an error node
    list for the original element.
    """
    attributes = elt.attrib
    if 'instanceid' in attributes:
        instance_id = attributes['instanceid']
        if instance_id in used_instance_ids:
            # Duplicate id: report it and do not descend any further.
            return _generate_error_message_node_list(
                elt, DUPLICATE_INSTANCE_ID_MESSAGE)
        used_instance_ids.add(instance_id)

    # ``elt`` may be rebound by a custom tag; remember the incoming element
    # for its tail text and for error reporting.
    original_elt = elt
    try:
        if elt.tag in tag_bindings:
            renderer = tag_bindings[elt.tag]()
            elt = renderer.render(elt, handler)

        if elt.tag.lower() == 'script':
            out_elt = safe_dom.ScriptElement()
        else:
            out_elt = safe_dom.Element(elt.tag)
        out_elt.add_attribute(**elt.attrib)

        if elt.text:
            out_elt.add_text(elt.text)
        for child in elt:
            child_nodes = _process_html_tree(child, used_instance_ids)
            out_elt.add_children(child_nodes)

        converted = safe_dom.NodeList()
        converted.append(out_elt)
        trailing_text = original_elt.tail
        if trailing_text:
            converted.append(safe_dom.Text(trailing_text))
        return converted

    except Exception as e:  # pylint: disable-msg=broad-except
        message = '%s: %s' % (INVALID_HTML_TAG_MESSAGE, e)
        return _generate_error_message_node_list(original_elt, message)
def html_to_safe_dom(html_string, handler, render_custom_tags=True):
    """Parse ``html_string`` and return it as a safe_dom.NodeList.

    When ``render_custom_tags`` is true, elements whose tag is present in the
    tag bindings are rendered through the bound tag class before conversion.
    Context-aware tags share one context per tag type; after the document is
    processed, each such tag type contributes a header (inserted at the
    front) and a footer (appended at the end). An empty or missing string
    yields an empty NodeList.
    """
    tag_bindings = get_tag_bindings()

    result = safe_dom.NodeList()
    if not html_string:
        return result

    # Instance ids seen so far in this document, used to spot duplicates.
    used_instance_ids = set()

    # One shared context per context-aware tag type found in the document.
    tag_contexts = {}

    def _generate_error_message_node_list(elt, error_message):
        """Log a failure and build the node list displayed in its place."""
        attributes = dict(**elt.attrib)
        logging.error('[%s, %s]: %s.', elt.tag, attributes, error_message)

        error_nodes = safe_dom.NodeList()
        error_span = safe_dom.Element('span', className='gcb-error-tag')
        error_nodes.append(error_span.add_text(error_message))
        if elt.tail:
            error_nodes.append(safe_dom.Text(elt.tail))
        return error_nodes

    def _remove_namespace(tag_name):
        """Strip a leading ``{namespace}`` prefix from ``tag_name``."""
        # html5lib may qualify tag names with a namespace, e.g.
        # {http://www.w3.org/2000/svg}svg
        namespace_prefix = r'^\{[^\}]+\}'
        return re.sub(namespace_prefix, '', tag_name, count=1)

    def _process_html_tree(elt):
        """Recursively convert ``elt`` and its children to a NodeList."""
        # A repeated instanceid is reported as an error straight away.
        attributes = elt.attrib
        if 'instanceid' in attributes:
            instance_id = attributes['instanceid']
            if instance_id in used_instance_ids:
                return _generate_error_message_node_list(
                    elt, DUPLICATE_INSTANCE_ID_MESSAGE)
            used_instance_ids.add(instance_id)

        # ``elt`` may be rebound to the rendered tag; keep the original for
        # its tail text and for error reporting.
        original_elt = elt
        try:
            if render_custom_tags and elt.tag in tag_bindings:
                bound_name = elt.tag
                tag = tag_bindings[bound_name]()
                if not isinstance(tag, ContextAwareTag):
                    elt = tag.render(elt, handler)
                else:
                    # Create this tag type's shared context on first use.
                    context = tag_contexts.get(bound_name)
                    if context is None:
                        context = ContextAwareTag.Context(handler, {})
                        tag_contexts[bound_name] = context
                    elt = tag.render(elt, context)

            if elt.tag == cElementTree.Comment:
                out_elt = safe_dom.Comment()
            elif elt.tag.lower() == 'script':
                out_elt = safe_dom.ScriptElement()
            else:
                out_elt = safe_dom.Element(_remove_namespace(elt.tag))

            out_elt.add_attribute(**elt.attrib)
            if elt.text:
                out_elt.add_text(elt.text)
            for child in elt:
                out_elt.add_children(_process_html_tree(child))

            converted = safe_dom.NodeList()
            converted.append(out_elt)
            trailing_text = original_elt.tail
            if trailing_text:
                converted.append(safe_dom.Text(trailing_text))
            return converted

        except Exception as e:  # pylint: disable=broad-except
            logging.exception('Error handling tag: %s', elt.tag)
            message = '%s: %s' % (INVALID_HTML_TAG_MESSAGE, e)
            return _generate_error_message_node_list(original_elt, message)

    root = html_string_to_element_tree(html_string)

    if root.text:
        result.append(safe_dom.Text(root.text))
    for child_elt in root:
        result.append(_process_html_tree(child_elt))

    # With the whole document processed, let every context-aware tag type
    # roll up the header/footer data accumulated in its shared context.
    for tag_name, context in tag_contexts.items():
        rollup = tag_bindings[tag_name]().rollup_header_footer(context)
        header, footer = rollup
        result.insert(0, _process_html_tree(header))
        result.append(_process_html_tree(footer))

    return result
import re
from xml.etree import cElementTree

import html5lib
import safe_dom
import webapp2

import appengine_config
from common import schema_fields
from models import config


# Boolean configuration property, enabled by default, that controls whether
# lesson content may use custom HTML tags.
CAN_USE_DYNAMIC_TAGS = config.ConfigProperty(
    'gcb_can_use_dynamic_tags', bool, safe_dom.Text(
        'Whether lesson content can make use of custom HTML tags such as '
        '<gcb-youtube videoid="...">. If this is enabled some legacy content '
        'may be rendered differently. '),
    default_value=True)


# Error text for a custom tag whose instance id duplicates an earlier one.
DUPLICATE_INSTANCE_ID_MESSAGE = (
    'Error processing custom HTML tag: duplicate tag id')
# Error text for a tag that could not be processed.
INVALID_HTML_TAG_MESSAGE = 'Invalid HTML tag'


class BaseTag(object):
    """Base class for the custom HTML tags."""

    @classmethod
    def name(cls):
        """Return the name of the tag class itself."""
        return cls.__name__

    @classmethod
from common import caching
from models import config
from models import models
from models.counters import PerfCounter

import gae_mini_profiler.profiler
import gae_mini_profiler.templatetags

# Maximum size, in bytes (8 MiB), of the in-process jinja template cache.
MAX_GLOBAL_CACHE_SIZE_BYTES = 8 * 1024 * 1024

# Boolean configuration property, enabled by default. The cache it controls
# used to be memcache based; it is now in-process.
CAN_USE_JINJA2_TEMPLATE_CACHE = config.ConfigProperty(
    'gcb_can_use_jinja2_template_cache', bool, safe_dom.Text(
        'Whether jinja2 can cache bytecode of compiled templates in-process.'),
    default_value=True)


def finalize(x):
    """A finalize method which will correctly handle safe_dom elements.

    safe_dom Node and NodeList values are returned as jinja2 Markup built
    from their ``sanitized`` attribute; every other value is returned
    unchanged.
    """
    if isinstance(x, safe_dom.Node) or isinstance(x, safe_dom.NodeList):
        return jinja2.utils.Markup(x.sanitized)
    return x


def js_string_raw(data):
    """Escape a string so that it can be put in a JS quote."""
    # Values that are not strings are handed back untouched.
    if not isinstance(data, basestring):
        return data
    # Double every backslash.
    data = data.replace('\\', '\\\\')
__author__ = 'John Orr ([email protected])'

import jinja2
from models import config
from models import models
from webapp2_extras import i18n
from models.models import MemcacheManager
import safe_dom
import tags
from jinja2.bccache import BytecodeCache

# Boolean configuration property, enabled by default, that controls whether
# jinja2 may cache the bytecode of compiled templates in memcache.
CAN_USE_JINJA2_TEMPLATE_CACHE = config.ConfigProperty(
    'gcb_can_use_jinja2_template_cache', bool, safe_dom.Text(
        'Whether jinja2 can cache bytecode of compiled templates in memcache.'
    ), default_value=True)


def finalize(x):
    """A finalize method which will correctly handle safe_dom elements.

    safe_dom Node and NodeList values are returned as jinja2 Markup built
    from their ``sanitized`` attribute; every other value is returned
    unchanged.
    """
    if isinstance(x, safe_dom.Node) or isinstance(x, safe_dom.NodeList):
        return jinja2.utils.Markup(x.sanitized)
    return x


def js_string_raw(data):
    """Escape a string so that it can be put in a JS quote."""
    # Values that are not strings are handed back untouched.
    if not isinstance(data, basestring):
        return data
from webapp2_extras import i18n

import appengine_config
from common import caching
from models import config
from models import models
from models.counters import PerfCounter

# Maximum size, in bytes (8 MiB), of the in-process jinja template cache.
MAX_GLOBAL_CACHE_SIZE_BYTES = 8 * 1024 * 1024

# Boolean configuration property, enabled by default. The cache it controls
# used to be memcache based; it is now in-process.
CAN_USE_JINJA2_TEMPLATE_CACHE = config.ConfigProperty(
    'gcb_can_use_jinja2_template_cache', bool, safe_dom.Text(
        'Whether jinja2 can cache bytecode of compiled templates in-process.'),
    default_value=True)


def finalize(x):
    """A finalize method which will correctly handle safe_dom elements.

    safe_dom Node and NodeList values are returned as jinja2 Markup built
    from their ``sanitized`` attribute; every other value is returned
    unchanged.
    """
    if isinstance(x, safe_dom.Node) or isinstance(x, safe_dom.NodeList):
        return jinja2.utils.Markup(x.sanitized)
    return x


def js_string_raw(data):
    """Escape a string so that it can be put in a JS quote."""
    # Values that are not strings are handed back untouched.
    if not isinstance(data, basestring):
        return data
    # Double every backslash.
    data = data.replace('\\', '\\\\')
def html_to_safe_dom(html_string, handler):
    """Parse ``html_string`` and return it as a safe_dom.NodeList.

    Elements whose tag is present in the tag bindings are rendered through
    the bound tag class (with ``handler``) before conversion. Elements that
    repeat an ``instanceid`` already seen, or that raise while being
    processed, are replaced by an error span. An empty or missing string
    yields an empty NodeList.
    """
    tag_bindings = get_tag_bindings()

    result = safe_dom.NodeList()
    if not html_string:
        return result

    def _generate_error_message_node_list(elt, error_message):
        """Log a failure and build the node list displayed in its place."""
        attributes = dict(**elt.attrib)
        logging.error('[%s, %s]: %s.', elt.tag, attributes, error_message)

        error_nodes = safe_dom.NodeList()
        error_span = safe_dom.Element('span', className='gcb-error-tag')
        error_nodes.append(error_span.add_text(error_message))
        if elt.tail:
            error_nodes.append(safe_dom.Text(elt.tail))
        return error_nodes

    def _process_html_tree(elt, used_instance_ids):
        """Convert ``elt`` and its subtree, rejecting duplicate ids."""
        attributes = elt.attrib
        if 'instanceid' in attributes:
            instance_id = attributes['instanceid']
            if instance_id in used_instance_ids:
                # Duplicate id: report it and do not descend any further.
                return _generate_error_message_node_list(
                    elt, DUPLICATE_INSTANCE_ID_MESSAGE)
            used_instance_ids.add(instance_id)

        # ``elt`` may be rebound by a custom tag; keep the original for its
        # tail text and for error reporting.
        original_elt = elt
        try:
            if elt.tag in tag_bindings:
                renderer = tag_bindings[elt.tag]()
                elt = renderer.render(elt, handler)

            if elt.tag.lower() == 'script':
                out_elt = safe_dom.ScriptElement()
            else:
                out_elt = safe_dom.Element(elt.tag)
            out_elt.add_attribute(**elt.attrib)

            if elt.text:
                out_elt.add_text(elt.text)
            for child in elt:
                child_nodes = _process_html_tree(child, used_instance_ids)
                out_elt.add_children(child_nodes)

            converted = safe_dom.NodeList()
            converted.append(out_elt)
            trailing_text = original_elt.tail
            if trailing_text:
                converted.append(safe_dom.Text(trailing_text))
            return converted

        except Exception as e:  # pylint: disable-msg=broad-except
            message = '%s: %s' % (INVALID_HTML_TAG_MESSAGE, e)
            return _generate_error_message_node_list(original_elt, message)

    root = html_string_to_element_tree(html_string)
    if root.text:
        result.append(safe_dom.Text(root.text))

    # A single set is shared by the whole document so that duplicates are
    # detected across all top-level elements and their descendants.
    seen_instance_ids = set()
    for top_level_elt in root:
        result.append(_process_html_tree(top_level_elt, seen_instance_ids))

    return result