def __init__(self, *args, **kw):
    """Set up the underlying ``HTMLParser`` and this parser's own state.

    All positional and keyword arguments are forwarded untouched to
    ``HTMLParser.__init__``.
    """
    # The base initialiser runs first so that anything it sets up cannot
    # overwrite the attributes assigned below.
    HTMLParser.__init__(self, *args, **kw)

    # Text attributes all start out as the empty unicode string.
    self.sami = self.line = self.last_element = u''

    # Containers start out empty.
    self.styles = {}
    self.langs = {}
    self.queue = deque()

    # Work on a private copy of the entity table so that registering
    # ``apos`` does not mutate the shared module-level mapping.
    entity_table = name2codepoint.copy()
    entity_table[u'apos'] = 0x0027
    self.name2codepoint = entity_table
def __init__(self, *args, **kw):
    """Set up the underlying ``HTMLParser`` and this parser's own state.

    All positional and keyword arguments are forwarded untouched to
    ``HTMLParser.__init__``.
    """
    HTMLParser.__init__(self, *args, **kw)

    # Assigned after the base initialiser on purpose: on Python 3 the base
    # class sets ``convert_charrefs`` itself, so setting it earlier would be
    # overwritten.
    self.convert_charrefs = False

    # Text attributes all start out as the empty unicode string.
    self.sami = self.line = self.last_element = u""

    # Containers start out empty.
    self.styles = {}
    self.langs = set()
    self.queue = deque()

    # Work on a private copy of the entity table so that registering
    # ``apos`` does not mutate the shared module-level mapping.
    entity_table = name2codepoint.copy()
    entity_table[u"apos"] = 0x0027
    self.name2codepoint = entity_table
def unescaper(match):
    """Custom un-escape function.

    Suitable as a replacement callback for a regular expression with three
    capture groups:

    1. the digits of a decimal character reference,
    2. the digits of a hexadecimal character reference,
    3. the name of a named entity.

    :param match: match object exposing the three groups described above.
    :returns: the referenced character as a unicode string, or the matched
        text unchanged (``match.group(0)``) when the reference cannot be
        resolved -- an unknown entity name, or a numeric reference that does
        not designate a valid code point.
    """
    # Groups are read lazily, in order, so a later group is only consulted
    # when the earlier ones did not participate in the match.
    digits = match.group(1)
    base = 10
    if not digits:
        digits = match.group(2)
        base = 16

    if digits:
        try:
            return unichr(int(digits, base))
        except (ValueError, OverflowError):
            # Out-of-range reference such as ``&#x110000;``: leave the text
            # alone instead of aborting the whole substitution.
            return match.group(0)

    name = match.group(3)
    # ``apos`` is not part of the HTML 4 entity table; it is handled here
    # directly so the table does not have to be copied on every call.
    if name == 'apos':
        return unichr(ord("'"))
    if name in name2codepoint:
        return unichr(name2codepoint[name])
    return match.group(0)
from exe.webui.blockfactory import g_blockFactory from exe.engine.error import Error from exe.engine.path import Path, TempDirPath from exe.engine.version import release from exe.export.pages import Page, uniquifyNames from exe import globals as G from BeautifulSoup import BeautifulSoup from htmlentitydefs import name2codepoint from helper import exportMinFileJS from helper import exportMinFileCSS from exe.webui.common import getFilesCSSToMinify from exe.webui.common import getFilesJSToMinify log = logging.getLogger(__name__) entitymap = name2codepoint.copy() entitymap.pop("amp") entitymap.pop("lt") entitymap.pop("gt") entitymap.pop("quot") def htmlentitydecode(s): return re.sub("&(%s);" % "|".join(entitymap), lambda m: unichr(entitymap[m.group(1)]), s) # =========================================================================== class PublicationEpub3(object): """ EPUB Publications 3.0, defines publication-level semantics and conformance requirements for EPUB 3, including the format of the Package Document and rules for how this document and other
from collections import deque from copy import deepcopy from itertools import imap _word_split_re = re.compile(r'(\s+)') _punctuation_re = re.compile( '^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % ( '|'.join(imap(re.escape, ('(', '<', '<'))), '|'.join(imap(re.escape, ('.', ',', ')', '>', '\n', '>'))) ) ) _simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') _striptags_re = re.compile(r'(<!--.*?-->|<[^>]*>)') _entity_re = re.compile(r'&([^;]+);') _entities = name2codepoint.copy() _entities['apos'] = 39 # special singleton representing missing values for the runtime missing = type('MissingType', (), {'__repr__': lambda x: 'missing'})() # concatenate a list of strings and convert them to unicode. # unfortunately there is a bug in python 2.4 and lower that causes # unicode.join trash the traceback. _concat = u''.join try: def _test_gen_bug(): raise TypeError(_test_gen_bug) yield None _concat(_test_gen_bug())
import uuid from cgi import escape from zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED from exe.webui import common from exe.webui.blockfactory import g_blockFactory from exe.engine.error import Error from exe.engine.path import Path, TempDirPath from exe.engine.version import release from exe.export.pages import Page, uniquifyNames from exe import globals as G from BeautifulSoup import BeautifulSoup from htmlentitydefs import name2codepoint log = logging.getLogger(__name__) entitymap = name2codepoint.copy() entitymap.pop('amp') entitymap.pop('lt') entitymap.pop('gt') entitymap.pop('quot') def htmlentitydecode(s): return re.sub('&(%s);' % '|'.join(entitymap), lambda m: unichr(entitymap[m.group(1)]), s) # =========================================================================== class PublicationEpub3(object): """ EPUB Publications 3.0, defines publication-level semantics and conformance requirements for EPUB 3,
class HTMLBuilder(object):
    """Helper object for HTML generation.

    Per default there are two instances of that class.  The `html` one, and
    the `xhtml` one for those two dialects.  The class uses keyword
    parameters and positional parameters to generate small snippets of HTML.

    Keyword parameters are converted to XML/SGML attributes, positional
    arguments are used as children.  Because Python accepts positional
    arguments before keyword arguments it's a good idea to use a list with
    the star-syntax for some children:

    >>> html.p(class_='foo', *[html.a('foo', href='foo.html'), ' ',
    ...                        html.a('bar', href='bar.html')])
    u'<p class="foo"><a href="foo.html">foo</a> <a href="bar.html">bar</a></p>'

    This class works around some browser limitations and can not be used
    for arbitrary SGML/XML generation.  For that purpose lxml and similar
    libraries exist.

    Calling the builder escapes the string passed:

    >>> html.p(html("<foo>"))
    u'<p>&lt;foo&gt;</p>'
    """

    # The entity table is imported into the class namespace only long enough
    # to copy it; the temporary name is removed again straight away.
    from htmlentitydefs import name2codepoint
    _entity_re = re.compile(r'&([^;]+);')
    _entities = name2codepoint.copy()
    _entities['apos'] = 39
    del name2codepoint

    # Elements rendered without a closing tag when they have no children.
    _empty_elements = set((
        'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
        'hr', 'img', 'input', 'keygen', 'isindex', 'link', 'meta', 'param',
        'source', 'wbr',
    ))
    # Attributes whose presence alone carries the meaning.
    _boolean_attributes = set((
        'selected', 'checked', 'compact', 'declare', 'defer', 'disabled',
        'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap',
    ))
    # Elements whose children are escaped before being emitted.
    _plaintext_elements = set(('textarea',))
    # Elements whose content is wrapped in a CDATA section for XHTML.
    _c_like_cdata = set(('script', 'style'))

    def __init__(self, dialect):
        self._dialect = dialect

    def __call__(self, s):
        """Return `s` passed through ``escape``."""
        return escape(s)

    def __getattr__(self, tag):
        """Return a callable that renders the element named `tag`.

        Names starting with two underscores raise ``AttributeError`` so that
        special-method lookups are not mistaken for tags.
        """
        if tag[:2] == '__':
            raise AttributeError(tag)

        def proxy(*children, **arguments):
            is_xhtml = self._dialect == 'xhtml'
            markup = '<' + tag

            for name, value in arguments.iteritems():
                if value is None:
                    continue
                # A trailing underscore lets callers spell reserved words,
                # e.g. ``class_``.
                if name[-1] == '_':
                    name = name[:-1]
                if name not in self._boolean_attributes:
                    rendered = '="' + escape(value, True) + '"'
                elif not value:
                    # A false boolean attribute is omitted altogether.
                    continue
                elif is_xhtml:
                    rendered = '="' + name + '"'
                else:
                    rendered = ''
                markup += ' ' + name + rendered

            if tag in self._empty_elements and not children:
                if is_xhtml:
                    return markup + ' />'
                return markup + '>'

            body = ''.join(
                [unicode(child) for child in children if child is not None])
            if body:
                if tag in self._plaintext_elements:
                    body = escape(body)
                elif is_xhtml and tag in self._c_like_cdata:
                    body = '/*<![CDATA[*/' + body + '/*]]>*/'
            return markup + '>' + body + '</' + tag + '>'

        return proxy

    def __repr__(self):
        return '<%s for %r>' % (type(self).__name__, self._dialect)
from Products.CMFCore.utils import getToolByName import re from urlparse import urlsplit, urljoin, urlunsplit from urllib import unquote_plus from Acquisition import aq_base from htmlentitydefs import name2codepoint from zope.component import getAdapters from zope.interface import implements from zope.app.component.hooks import getSite from plone.outputfilters import apply_filters from plone.outputfilters.interfaces import IFilter from plone.outputfilters.browser.resolveuid import uuidToObject, uuidFor from plone.outputfilters.filters.resolveuid_and_caption import IImageCaptioningEnabler from plone.outputfilters.filters.resolveuid_and_caption import IResolveUidsEnabler name2codepoint = name2codepoint.copy() name2codepoint['apos']=ord("'") __revision__ = '$Id$' # IMAGE_PATTERN matches an image tag on its own, or an image tag # enclosed in a simple <p> or <div>. In the latter case we strip out # the enclosing tag since we are going to insert our own. PATIMG = '\\<img[^>]+class\s*=[^=>]*captioned[^>]+\\>' PATA = '(?:(?P<atag0>\\<a[^>]*\\>)'+PATIMG+'\\</a\\>)' + '|' + PATIMG PAT0 = '(?P<pat0>'+PATA+')' PAT1 = '<(?:p|div)[^>]*>'+PAT0 + '</(?:p|div)>' + '|' + PAT0.replace('0>','1>') IMAGE_PATTERN = re.compile(PAT1, re.IGNORECASE) # Regex to match stupid IE attributes. In IE generated HTML an # attribute may not be enclosed by quotes if it doesn't contain
except ImportError: from dummy_thread import allocate_lock from htmlentitydefs import name2codepoint from collections import deque from copy import deepcopy from itertools import imap _word_split_re = re.compile(r'(\s+)') _punctuation_re = re.compile( '^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % ('|'.join(imap(re.escape, ('(', '<', '<'))), '|'.join( imap(re.escape, ('.', ',', ')', '>', '\n', '>'))))) _simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') _striptags_re = re.compile(r'(<!--.*?-->|<[^>]*>)') _entity_re = re.compile(r'&([^;]+);') _entities = name2codepoint.copy() _entities['apos'] = 39 # special singleton representing missing values for the runtime missing = type('MissingType', (), {'__repr__': lambda x: 'missing'})() # concatenate a list of strings and convert them to unicode. # unfortunately there is a bug in python 2.4 and lower that causes # unicode.join trash the traceback. _concat = u''.join try: def _test_gen_bug(): raise TypeError(_test_gen_bug) yield None
class HTMLBuilder(object):
    """Generate small HTML or XHTML snippets through attribute access.

    Looking up any attribute that does not start with two underscores
    returns a callable for the element of that name.  Positional arguments
    of that callable become the element's children, keyword arguments become
    its attributes.  Calling the builder itself passes a string through
    ``escape``.
    """

    # The entity table is imported into the class namespace only long enough
    # to copy it; the temporary name is removed again straight away.
    from htmlentitydefs import name2codepoint
    _entity_re = re.compile('&([^;]+);')
    _entities = name2codepoint.copy()
    _entities['apos'] = 39
    del name2codepoint

    # Elements rendered without a closing tag when they have no children.
    _empty_elements = set((
        'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img',
        'input', 'isindex', 'link', 'meta', 'param',
    ))
    # Attributes whose presence alone carries the meaning.
    _boolean_attributes = set((
        'selected', 'checked', 'compact', 'declare', 'defer', 'disabled',
        'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap',
    ))
    # Elements whose children are escaped before being emitted.
    _plaintext_elements = set(('textarea',))
    # Elements whose content is wrapped in a CDATA section for XHTML.
    _c_like_cdata = set(('script', 'style'))

    def __init__(self, dialect):
        self._dialect = dialect

    def __call__(self, s):
        """Return `s` passed through ``escape``."""
        return escape(s)

    def __getattr__(self, tag):
        """Return a callable that renders the element named `tag`.

        Names starting with two underscores raise ``AttributeError`` so that
        special-method lookups are not mistaken for tags.
        """
        if tag[:2] == '__':
            raise AttributeError(tag)

        def proxy(*children, **arguments):
            is_xhtml = self._dialect == 'xhtml'
            pieces = ['<' + tag]

            for name, value in arguments.iteritems():
                if value is None:
                    continue
                # A trailing underscore lets callers spell reserved words,
                # e.g. ``class_``.
                if name[-1:] == '_':
                    name = name[:-1]
                if name in self._boolean_attributes:
                    if not value:
                        # A false boolean attribute is omitted altogether.
                        continue
                    if is_xhtml:
                        rendered = '="%s"' % name
                    else:
                        rendered = ''
                else:
                    rendered = '="%s"' % escape(value, True)
                pieces.append(' ' + name + rendered)

            if not children and tag in self._empty_elements:
                if is_xhtml:
                    pieces.append(' />')
                else:
                    pieces.append('>')
                return ''.join(pieces)

            pieces.append('>')
            body = ''.join(
                (unicode(child) for child in children if child is not None))
            if body:
                if tag in self._plaintext_elements:
                    body = escape(body)
                elif tag in self._c_like_cdata and is_xhtml:
                    body = '/*<![CDATA[*/%s/*]]>*/' % body
            pieces.append(body)
            pieces.append('</%s>' % tag)
            return ''.join(pieces)

        return proxy

    def __repr__(self):
        return '<%s for %r>' % (type(self).__name__, self._dialect)
if code: return unichr(int(code, 10)) else: code = match.group(2) if code: return unichr(int(code, 16)) else: code = match.group(3) if code in name2codepoint: return unichr(name2codepoint[code]) return match.group(0) s = format_entity.entity_re.sub(unescape, s) s = format_entity.tag_re.sub('', s) return s format_entity.name2codepoint = name2codepoint.copy() format_entity.name2codepoint['apos'] = ord("'") format_entity.entity_re = re.compile('&(?:#(\d+)|(?:#x([\da-fA-F]+))|([a-zA-Z]+));') format_entity.tag_re = re.compile('\</?span[^\>]*>') def extract_correct_title(fragment, title, contentdb): # fragment: text starting with an instance of the 記事名の制約 template # extract 'title' parameter by intercepting template expansion te = Expander(fragment, pagename=title, wikidb=contentdb) found = False node_list = [te.parsed] while True: node = node_list.pop(0) if isinstance(node, Template): found = True
""" Transform """ import re from htmlentitydefs import name2codepoint from Products.PortalTransforms.interfaces import itransform from Products.PortalTransforms.interfaces import ITransform from DocumentTemplate.DT_Util import html_quote from DocumentTemplate.DT_Var import newline_to_br from zope.interface import implements name2codepoint = name2codepoint.copy() name2codepoint['apos'] = ord("'") # pylint: disable=W1401 # IMAGE_PATTERN matches an image tag on its own, or an image tag # enclosed in a simple <p> or <div>. In the latter case we strip out # the enclosing tag since we are going to insert our own. PATIMG = '\\<img[^>]+class\s*=[^=>]*captioned[^>]+\\>' PATA = '(?:(?P<atag0>\\<a[^>]*\\>)'+PATIMG+'\\</a\\>)' + '|' + PATIMG PAT0 = '(?P<pat0>'+PATA+')' PAT1 = '<(?:p|div)[^>]*>'+PAT0 + '</(?:p|div)>' + '|' + PAT0.replace('0>', '1>') IMAGE_PATTERN = re.compile(PAT1, re.IGNORECASE) # Copied from Products.kupu.plone.config UID_PATTERN = re.compile('(?P<tag><(?:a|area|img|object|param)' '\\s[^>]*(?:src|href|data|value)\s*=\s' '*")(?P<url>[^"]*resolveuid/(?P<uid>[^/"#? ]*))', re.DOTALL | re.IGNORECASE) # Regex to match stupid IE attributes. In IE generated HTML an # attribute may not be enclosed by quotes if it doesn't contain # certain punctuation.