コード例 #1
0
ファイル: transforms.py プロジェクト: vedantc98/Plone-test
    def __call__(self):
        """Render ``self.orig`` as HTML and return it as UTF-8 bytes.

        ``None`` is treated as an empty string. The text is decoded with
        ``safe_decode`` (undecodable input is replaced), HTML-escaped, run
        through the URL, e-mail and indentation substitutions, and finally
        has its line breaks turned into ``<br />`` tags.
        """
        source = self.orig
        text = safe_decode('' if source is None else source, errors='replace')

        # The ampersand has to be escaped before anything else: every entity
        # inserted below starts with '&', and escaping afterwards would turn
        # '&lt;' into '&amp;lt;'.
        text = text.replace('&', '&amp;')
        # Swap each character that has a named HTML entity for that entity.
        for entity, codepoint in name2codepoint.items():
            if entity == 'amp':
                # already handled above
                continue
            text = text.replace(unichr(codepoint), '&%s;' % entity)

        # subn() returns (new_text, count); only the text is of interest.
        text, _ = self.urlRegexp.subn(self.replaceURL, text)
        text, _ = self.emailRegexp.subn(self.replaceEmail, text)
        text, _ = self.indentRegexp.subn(self.indentWhitespace, text)

        # Normalise Windows line endings first, then emit one <br /> per
        # remaining newline.
        text = text.replace('\r\n', '\n').replace('\n', '<br />')

        return text.encode('utf-8')
コード例 #2
0
ファイル: transforms.py プロジェクト: CGTIC/Plone_SP
    def __call__(self):
        """Convert ``self.orig`` to an HTML fragment encoded as UTF-8.

        A ``None`` source is handled as ``''``. After decoding through
        ``safe_decode`` (with ``errors='replace'``) the text is entity-escaped,
        passed through the URL / e-mail / indentation regex substitutions and
        its newlines are replaced by ``<br />``.
        """
        raw = self.orig
        if raw is None:
            raw = ''
        text = safe_decode(raw, errors='replace')

        # Escape '&' on its own and first; otherwise the '&' of an entity
        # inserted by the loop below would be escaped again
        # ('<' -> '&lt;' -> '&amp;lt;').
        text = text.replace('&', '&amp;')
        # Replace every character that has a named entity by '&name;'.
        for entity, codepoint in name2codepoint.items():
            if entity == 'amp':
                continue
            text = text.replace(unichr(codepoint), '&%s;' % entity)

        # Each subn() call yields (text, number_of_substitutions).
        text, _ = self.urlRegexp.subn(self.replaceURL, text)
        text, _ = self.emailRegexp.subn(self.replaceEmail, text)
        text, _ = self.indentRegexp.subn(self.indentWhitespace, text)

        # Collapse CRLF to LF, then turn every LF into a <br /> tag.
        text = text.replace('\r\n', '\n').replace('\n', '<br />')

        return text.encode('utf-8')
コード例 #3
0
def get_xml_tree(html_string):
    """Parse ``html_string`` with ``ET.fromstring`` and return the result.

    The first ``<!DOCTYPE html>`` occurrence is replaced by a doctype whose
    internal subset declares every name in ``name2codepoint`` as a numeric
    character reference, so named HTML entities can be resolved by the parser.

    On any failure the (possibly already rewritten) markup is handed to
    ``dump_html`` and the exception is re-raised unchanged.
    """
    try:
        declarations = '\n'.join(
            '  <!ENTITY {x} "&#{i};">'.format(x=name, i=codepoint)
            for name, codepoint in name2codepoint.items()
        )
        doctype = "<!DOCTYPE html [\n{}\n]>".format(declarations)
        # Rebind before parsing so that a parse failure dumps the rewritten
        # markup, not the input.
        html_string = html_string.replace("<!DOCTYPE html>", doctype, 1)
        return ET.fromstring(html_string)
    except:  # noqa FIXME: figure out what we expect this to throw.
        dump_html(html_string)
        raise
コード例 #4
0
ファイル: paraparser.py プロジェクト: anantram/mycommunity
    'uuml': u'\xfc',
    'weierp': u'\u2118',
    'Xi': u'\u039e',
    'xi': u'\u03be',
    'Yacute': u'\xdd',
    'yacute': u'\xfd',
    'yen': u'\xa5',
    'yuml': u'\xff',
    'Yuml': u'\u0178',
    'Zeta': u'\u0396',
    'zeta': u'\u03b6',
    'zwj': u'\u200d',
    'zwnj': u'\u200c',
}

# Build the entity lookup table: every name in name2codepoint mapped to the
# value uniChr() returns for its code point.
known_entities = dict([(k, uniChr(v)) for k, v in name2codepoint.items()])
# Add the names from `greeks` that name2codepoint does not define; entries
# already present are left untouched.
for k in greeks:
    if k not in known_entities:
        known_entities[k] = greeks[k]
# Pick the key converter: asBytes when isPy3 is truthy, otherwise asUnicode
# (and/or idiom -- relies on asBytes itself being truthy).
f = isPy3 and asBytes or asUnicode
# Snapshot the keys first, because the loop below inserts into the dict.
K = list(known_entities.keys())
# Register every entity under its converted key as well as the original one.
# NOTE(review): presumably so both str and bytes names resolve -- confirm.
for k in K:
    known_entities[f(k)] = known_entities[k]
# Remove the temporaries from the module namespace.
del k, f, K


#------------------------------------------------------------------------
class ParaFrag(ABag):
    """class ParaFrag contains the intermediate representation of string
    segments as they are being parsed by the ParaParser.
    fontname, fontSize, rise, textColor, cbDefn
コード例 #5
0
        return "{0}(style={1})".format(self.__class__.__name__, self.style)

    @property
    def width(self):
        """Placeholder for the `width` property; always raises
        :class:`NotImplementedError`.

        NOTE(review): presumably meant to be overridden by subclasses --
        confirm against the enclosing class."""
        raise NotImplementedError

    def height(self, document):
        """Placeholder taking a `document` argument; always raises
        :class:`NotImplementedError`.

        NOTE(review): presumably meant to be overridden by subclasses --
        confirm against the enclosing class."""
        raise NotImplementedError

    def render(self):
        """Placeholder; always raises :class:`NotImplementedError`.

        NOTE(review): presumably meant to be overridden by subclasses --
        confirm against the enclosing class."""
        raise NotImplementedError


# HTML entity name -> the single character it stands for.
NAME2CHAR = dict(
    (entity, chr(number)) for entity, number in name2codepoint.items()
)


class StyledText(InlineStyled, AcceptNoneAttributeType):
    """Base class for text that has a :class:`TextStyle` associated with it."""
    def __add__(self, other):
        """Concatenate this styled text with `other`.

        Returns a :class:`MixedStyledText` holding ``[self, other]``; when
        `other` is `None`, this styled text is returned as-is."""
        if other is None:
            return self
        return MixedStyledText([self, other])

    def __radd__(self, other):
        """Concatenate `other` with this styled text (reflected addition).

        Returns a :class:`MixedStyledText` holding ``[other, self]``; when
        `other` is `None`, this styled text is returned as-is."""
        if other is None:
            return self
        return MixedStyledText([other, self])
コード例 #6
0
ファイル: html.py プロジェクト: kelvindecosta/picharsso
"""This module defines a formatter for [HTML](../../formats/html.md)."""

from html.entities import name2codepoint

import numpy as np

from .base import BaseFormatter

HTML_ENTITY_MAP: dict = dict(
    (chr(codepoint), entity_name)
    for (entity_name, codepoint) in name2codepoint.items()
)
"""A dictionary mapping a unicode character to the name of its HTML entity."""

# NOTE(review): unlike every other value in the map, this one carries a
# trailing ';' -- confirm against the code that consumes the map.
HTML_ENTITY_MAP[" "] = "nbsp;"


class HtmlFormatter(BaseFormatter):
    """
    A formatter for [HTML](../../formats/html.md).

    Inherits [`BaseFormatter`][picharsso.format.base.BaseFormatter].
    """
    @staticmethod
    def color(text, color):
        return f'<span style="color : rgb{tuple(color)};">{text}</span>'

    @staticmethod
    def translate(text_matrix):
        unique_chars = np.unique(text_matrix)
コード例 #7
0
#
# tweetokenize: Regular expression based tokenizer for Twitter
# Copyright: (c) 2013, Jared Suttles. All rights reserved.
# License: BSD, see LICENSE for details.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import re
from os import path
#from itertools import imap
try:
    from itertools import imap
except ImportError:
    # Python 3...
    imap=map
from html.entities import name2codepoint

# HTML entity name -> the character it represents.
html_entities = {
    name: chr(codepoint) for name, codepoint in name2codepoint.items()
}
# Matches named (&amp;) as well as numeric (&#38; / &#x26;) references.
html_entities_re = re.compile(r"&#?\w+;")
# (first, last) character pairs delimiting emoji blocks.
# NOTE(review): presumably inclusive bounds -- confirm against the consumer.
emoji_ranges = (
    (u'\U0001f300', u'\U0001f5ff'),
    (u'\U0001f600', u'\U0001f64f'),
    (u'\U0001f680', u'\U0001f6c5'),
    (u'\u2600', u'\u26ff'),
    (u'\U0001f170', u'\U0001f19a'),
)
# Two-character regional-indicator sequences (flag emoji).
emoji_flags = {
    u'\U0001f1ef\U0001f1f5',
    u'\U0001f1f0\U0001f1f7',
    u'\U0001f1e9\U0001f1ea',
    u'\U0001f1e8\U0001f1f3',
    u'\U0001f1fa\U0001f1f8',
    u'\U0001f1eb\U0001f1f7',
    u'\U0001f1ea\U0001f1f8',
    u'\U0001f1ee\U0001f1f9',
    u'\U0001f1f7\U0001f1fa',
    u'\U0001f1ec\U0001f1e7',
}


def _converthtmlentities(msg):
    def replace_entities(s):
        s = s.group(0)[1:-1] # remove & and ;
        if s[0] == '#':
            try:
                return chr(int(s[2:],16) if s[1] in 'xX' else int(s[1:]))
コード例 #8
0
    'uuml': u'\xfc',
    'weierp': u'\u2118',
    'Xi': u'\u039e',
    'xi': u'\u03be',
    'Yacute': u'\xdd',
    'yacute': u'\xfd',
    'yen': u'\xa5',
    'yuml': u'\xff',
    'Yuml': u'\u0178',
    'Zeta': u'\u0396',
    'zeta': u'\u03b6',
    'zwj': u'\u200d',
    'zwnj': u'\u200c',
    }

# Build the entity lookup table: every name in name2codepoint mapped to the
# value uniChr() returns for its code point.
known_entities = dict([(k,uniChr(v)) for k,v in name2codepoint.items()])
# Add the names from `greeks` that name2codepoint does not define; entries
# already present are left untouched.
for k in greeks:
    if k not in known_entities:
        known_entities[k] = greeks[k]
# Pick the key converter: asBytes when isPy3 is truthy, otherwise asUnicode
# (and/or idiom -- relies on asBytes itself being truthy).
f = isPy3 and asBytes or asUnicode
# Snapshot the keys first, because the loop below inserts into the dict.
K = list(known_entities.keys())
# Register every entity under its converted key as well as the original one.
# NOTE(review): presumably so both str and bytes names resolve -- confirm.
for k in K:
    known_entities[f(k)] = known_entities[k]
# Remove the temporaries from the module namespace.
del k, f, K

#------------------------------------------------------------------------
class ParaFrag(ABag):
    """class ParaFrag contains the intermediate representation of string
    segments as they are being parsed by the ParaParser.
    fontname, fontSize, rise, textColor, cbDefn
    """
コード例 #9
0
ファイル: transforms.py プロジェクト: vedantc98/Plone-test
def convertHtmlToWebIntelligentPlainText(orig):
    """Converts text/html to text/x-web-intelligent.

    ``None`` is handled like an empty string. The contents of ``<pre>``
    elements are set aside before any other transformation and put back,
    surrounded by blank lines, at the very end.
    """
    preRegex = re.compile(r'<\s*pre[^>]*>(.*?)<\s*/pre\s*>', re.I | re.S)
    tagWhitespaceRegex = re.compile(r'\s+((<[^>]+>)\s+)+')
    whitespaceRegex = re.compile(r'\s+')

    # Tag rewrites, applied strictly in this order; the catch-all that strips
    # every remaining tag has to stay last.
    tagRewrites = (
        # td's become tabs
        (re.compile(r'<\s*(td)([^>])*>', re.I), '\t'),
        # br's become newlines
        (re.compile(r'<\s*(br)\s*/?>', re.I), '\n'),
        # the start of a dt becomes a paragraph break
        (re.compile(r'<\s*(dt)[^>]*>', re.I), '\n\n'),
        # the close of p, div, tr, ul, ol and dl becomes a paragraph break
        (re.compile(r'<\s*/\s*(p|div|tr|ul|ol|dl)[^>]*>', re.I), '\n\n'),
        # blockquote and dd blocks are indented
        (re.compile(r'<\s*(blockquote|dd)[^>]*>', re.I), '\n\n  '),
        # list items are indented and prefixed with -
        (re.compile(r'<\s*(li)[^>]*>', re.I), '\n\n  - '),
        # anything else that looks like a tag is dropped
        (re.compile(r'<[^>]+>', re.I | re.M), ''),
    )

    # marker -> saved <pre> content
    stashed = {}

    def stashPre(match):
        key = '__pre_marker__%d__' % len(stashed)
        stashed[key] = match.group(1)
        return key

    def squeezeAroundTags(match):
        """Turn whitespace-tag-whitespace into whitespace-tag, keeping
        exactly one leading space (also covers directly nested tags)."""
        return ' ' + whitespaceRegex.sub('', match.group(0))

    text = preRegex.sub(stashPre, '' if orig is None else orig)
    text = tagWhitespaceRegex.sub(squeezeAroundTags, text)

    # Every run of whitespace collapses to a single space
    text = whitespaceRegex.sub(' ', text)

    # Named entities become numeric ones. &lt;, &gt; and &amp; are left for
    # later, otherwise we could create what looks like tags.
    text = text.replace('&nbsp;', ' ')
    deferred = ('lt', 'gt', 'amp')
    for entity, codepoint in name2codepoint.items():
        if entity in deferred:
            continue
        text = text.replace('&%s;' % entity, '&#%d;' % codepoint)

    # XXX: Remove <head>, <script>, <style> ?

    for pattern, replacement in tagRewrites:
        text = pattern.sub(replacement, text)

    # Now that the tags are gone the deferred entities can be resolved;
    # &amp; goes last so that '&amp;lt;' ends up as '&lt;'.
    for escaped, literal in (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')):
        text = text.replace(escaped, literal)

    # Put the saved <pre> sections back
    for key, section in stashed.items():
        text = text.replace(key, '\n\n' + section + '\n\n')

    return text
コード例 #10
0
ファイル: tokenizer.py プロジェクト: vamaq/tweetokenize
#!/usr/bin/env python
#
# tweetokenize: Regular expression based tokenizer for Twitter
# Copyright: (c) 2013, Jared Suttles. All rights reserved.
# License: BSD, see LICENSE for details.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import re
from os import path
from html.entities import name2codepoint

# HTML entity name -> the character it represents.
html_entities = {
    name: chr(codepoint) for name, codepoint in name2codepoint.items()
}
# Matches named (&amp;) as well as numeric (&#38; / &#x26;) references.
html_entities_re = re.compile(r"&#?\w+;")
# (first, last) character pairs delimiting emoji blocks.
# NOTE(review): presumably inclusive bounds -- confirm against the consumer.
emoji_ranges = (
    ('\U0001f300', '\U0001f5ff'),
    ('\U0001f600', '\U0001f64f'),
    ('\U0001f680', '\U0001f6c5'),
    ('\u2600', '\u26ff'),
    ('\U0001f170', '\U0001f19a'),
)
# Two-character regional-indicator sequences (flag emoji).
emoji_flags = {
    '\U0001f1ef\U0001f1f5',
    '\U0001f1f0\U0001f1f7',
    '\U0001f1e9\U0001f1ea',
    '\U0001f1e8\U0001f1f3',
    '\U0001f1fa\U0001f1f8',
    '\U0001f1eb\U0001f1f7',
    '\U0001f1ea\U0001f1f8',
    '\U0001f1ee\U0001f1f9',
    '\U0001f1f7\U0001f1fa',
    '\U0001f1ec\U0001f1e7',
}


def _converthtmlentities(msg):
    def replace_entities(s):
        s = s.group(0)[1:-1]  # remove & and ;
        if s[0] == '#':
            try:
                return chr(int(s[2:],16) if s[1] in 'xX' else int(s[1:]))
            except ValueError:
                return '&#' + s + ';'
        else:
            try:
                return html_entities[s]
コード例 #11
0
ファイル: text.py プロジェクト: brechtm/rinohtype
    def __repr__(self):
        """Return ``ClassName(style=<style>)`` for this object."""
        class_name = self.__class__.__name__
        return "{0}(style={1})".format(class_name, self.style)

    @property
    def width(self):
        """Placeholder for the `width` property; always raises
        :class:`NotImplementedError`.

        NOTE(review): presumably meant to be overridden by subclasses --
        confirm against the enclosing class."""
        raise NotImplementedError

    def height(self, document):
        """Placeholder taking a `document` argument; always raises
        :class:`NotImplementedError`.

        NOTE(review): presumably meant to be overridden by subclasses --
        confirm against the enclosing class."""
        raise NotImplementedError

    def render(self):
        """Placeholder; always raises :class:`NotImplementedError`.

        NOTE(review): presumably meant to be overridden by subclasses --
        confirm against the enclosing class."""
        raise NotImplementedError


# HTML entity name -> the single character it stands for.
NAME2CHAR = dict(
    (entity, chr(number)) for entity, number in name2codepoint.items()
)


class StyledText(Styled, AcceptNoneAttributeType):
    """Base class for text that has a :class:`TextStyle` associated with it."""

    style_class = TextStyle

    def __add__(self, other):
        """Concatenate this styled text with `other`.

        Returns a :class:`MixedStyledText` holding ``[self, other]``; when
        `other` is `None`, this styled text is returned as-is."""
        if other is None:
            return self
        return MixedStyledText([self, other])

    def __radd__(self, other):
        """Return the concatenation of `other` and this styled text. If `other`
        is `None`, this styled text itself is returned."""
コード例 #12
0
ファイル: transforms.py プロジェクト: CGTIC/Plone_SP
def convertHtmlToWebIntelligentPlainText(orig):
    """Converts text/html to text/x-web-intelligent.

    A ``None`` input is treated as ``''``. ``<pre>`` bodies are swapped for
    markers up front so that no later step touches them; they are restored,
    wrapped in blank lines, as the final step.
    """
    preRegex = re.compile(r'<\s*pre[^>]*>(.*?)<\s*/pre\s*>', re.I | re.S)
    tagWhitespaceRegex = re.compile(r'\s+((<[^>]+>)\s+)+')
    whitespaceRegex = re.compile(r'\s+')

    # (pattern, replacement) pairs, applied top to bottom; the generic
    # "strip any tag" rule must remain the last entry.
    tagRewrites = (
        # td's become tabs
        (re.compile(r'<\s*(td)([^>])*>', re.I), '\t'),
        # br's become newlines
        (re.compile(r'<\s*(br)\s*/?>', re.I), '\n'),
        # the start of a dt becomes a paragraph break
        (re.compile(r'<\s*(dt)[^>]*>', re.I), '\n\n'),
        # the close of p, div, tr, ul, ol and dl becomes a paragraph break
        (re.compile(r'<\s*/\s*(p|div|tr|ul|ol|dl)[^>]*>', re.I), '\n\n'),
        # blockquote and dd blocks are indented
        (re.compile(r'<\s*(blockquote|dd)[^>]*>', re.I), '\n\n  '),
        # list items are indented and prefixed with -
        (re.compile(r'<\s*(li)[^>]*>', re.I), '\n\n  - '),
        # anything else that looks like a tag is dropped
        (re.compile(r'<[^>]+>', re.I | re.M), ''),
    )

    # marker -> saved <pre> content
    savedPres = {}

    def rememberPre(match):
        token = '__pre_marker__%d__' % len(savedPres)
        savedPres[token] = match.group(1)
        return token

    def tightenTagWhitespace(match):
        """Reduce whitespace-tag-whitespace to whitespace-tag while keeping
        a single leading space; handles directly nested tags too."""
        return ' ' + whitespaceRegex.sub('', match.group(0))

    text = preRegex.sub(rememberPre, '' if orig is None else orig)
    text = tagWhitespaceRegex.sub(tightenTagWhitespace, text)

    # Every run of whitespace collapses to a single space
    text = whitespaceRegex.sub(' ', text)

    # Rewrite named entities as numeric ones, except &lt;, &gt; and &amp;
    # which wait until the tags are gone so no fake tags are created.
    text = text.replace('&nbsp;', ' ')
    postponed = ('lt', 'gt', 'amp')
    for entity, codepoint in name2codepoint.items():
        if entity in postponed:
            continue
        text = text.replace('&%s;' % entity, '&#%d;' % codepoint)

    # XXX: Remove <head>, <script>, <style> ?

    for pattern, replacement in tagRewrites:
        text = pattern.sub(replacement, text)

    # Resolve the postponed entities; &amp; is handled last so that
    # '&amp;lt;' yields '&lt;' and not '<'.
    for escaped, literal in (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')):
        text = text.replace(escaped, literal)

    # Put the saved <pre> sections back
    for token, section in savedPres.items():
        text = text.replace(token, '\n\n' + section + '\n\n')

    return text