Beispiel #1
0
def convertSpecialCharToHTML(text):
    """Replace every character that has an HTML 4 named entity with ``&name;``.

    Args:
        text: The string to escape.

    Returns:
        A copy of ``text`` in which each character listed in
        ``html.entities.codepoint2name`` is replaced by its named entity;
        all other characters are left untouched.
    """
    from html.entities import codepoint2name

    # A single translate() pass escapes *all* special characters at once.
    # Replacing them one by one would either lose earlier replacements or
    # re-escape the "&" of entities that were already inserted.
    entity_table = {
        codepoint: "&" + name + ";"
        for codepoint, name in codepoint2name.items()
    }
    return text.translate(entity_table)
Beispiel #2
0
    def _populate_class_variables():
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos is an XHTML entity and an HTML 5, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # " or the single quote into ', unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
Beispiel #3
0
def convertSpecialCharToHTML(text):
    """Escape all characters of ``text`` that have an HTML 4 named entity.

    Args:
        text: The string to escape.

    Returns:
        ``text`` with every character found in
        ``html.entities.codepoint2name`` rewritten as ``&name;``. Characters
        without a named entity are returned unchanged.
    """
    from html.entities import codepoint2name

    # Build the whole substitution table first and apply it in one pass, so
    # that every special character is escaped (not only the last one found)
    # and the ampersands of freshly inserted entities are never escaped again.
    substitutions = {}
    for codepoint, name in codepoint2name.items():
        substitutions[codepoint] = "&" + name + ";"
    return text.translate(substitutions)
Beispiel #4
0
 def _populate_class_variables():
     lookup = {}
     reverse_lookup = {}
     characters_for_re = []
     for codepoint, name in list(codepoint2name.items()):
         character = chr(codepoint)
         if codepoint != 34:
             # There's no point in turning the quotation mark into
             # ", unless it happens within an attribute value, which
             # is handled elsewhere.
             characters_for_re.append(character)
             lookup[character] = name
         # But we do want to turn " into the quotation mark.
         reverse_lookup[name] = character
     re_definition = "[%s]" % "".join(characters_for_re)
     return lookup, reverse_lookup, re.compile(re_definition)
Beispiel #5
0
 def _populate_class_variables():
     lookup = {}
     reverse_lookup = {}
     characters_for_re = []
     for codepoint, name in list(codepoint2name.items()):
         character = chr(codepoint)
         if codepoint != 34:
             # There's no point in turning the quotation mark into
             # ", unless it happens within an attribute value, which
             # is handled elsewhere.
             characters_for_re.append(character)
             lookup[character] = name
         # But we do want to turn " into the quotation mark.
         reverse_lookup[name] = character
     re_definition = "[%s]" % "".join(characters_for_re)
     return lookup, reverse_lookup, re.compile(re_definition)
Beispiel #6
0
# -*- coding: utf-8 -*-
import re
from html import _escape_map_full
from html.entities import codepoint2name

# Translation table for str.translate: code point -> "&name;".
html_entities = dict(
    (codepoint, '&%s;' % entity_name)
    for codepoint, entity_name in codepoint2name.items()
)
# Entries imported from html._escape_map_full override the named entities
# above for the code points they share.
html_entities.update(_escape_map_full)
# Inverse table: entity text -> code point.
entities_html = dict(zip(html_entities.values(), html_entities.keys()))


def encode(string):
    """Replace characters with their HTML entities.

    Goes further than html.escape: every character that has an entry in
    the module-level ``html_entities`` table (built from codepoint2name)
    is replaced.

    Returns:
        the string with those characters replaced by entities.
    """
    encoded = string.translate(html_entities)
    return encoded


def decode(string):
    """Decodes html entities.

    Returns:
        string with html entities decoded.
    """
Beispiel #7
0
 def __init__(self, codepoint2name, name2codepoint):
     self.codepoint2entity = {
         c: str("&%s;" % n)
         for c, n in codepoint2name.items()
     }
     self.name2codepoint = name2codepoint
Beispiel #8
0
# -*- coding: utf-8 -*-
import re
from html import _escape_map_full
from html.entities import codepoint2name

# code point -> "&name;", usable directly with str.translate.
html_entities = {}
for codepoint, entity_name in codepoint2name.items():
    html_entities[codepoint] = '&' + entity_name + ';'
del codepoint, entity_name  # keep the module namespace as it was
# Entries imported from html._escape_map_full take precedence over the
# named entities for the code points they share.
html_entities.update(_escape_map_full)
# entity text -> code point (inverse of the table above).
entities_html = {entity: codepoint for codepoint, entity in html_entities.items()}


def encode(string):
    """Encode characters as HTML entities.

    More complete than html.escape: all characters present in the
    module-level ``html_entities`` table (derived from codepoint2name)
    are replaced.

    Returns:
        the string with entities substituted for those characters.
    """
    table = html_entities
    return string.translate(table)


def decode(string):
    """Decodes html entities.

    Returns:
        string with html entities decoded.
    """
    return (
        re.sub(
Beispiel #9
0
def entity_parse(entity_pattern):
	"""Collect the characters whose entity name matches and pass them on.

	`entity_pattern` must offer a ``search`` method (e.g. a compiled regex);
	every character of codepoint2name whose name it finds a match in is
	handed, as a list, to print_char_table.
	"""
	matching_chars = []
	for codepoint, entity_name in codepoint2name.items():
		if entity_pattern.search(entity_name):
			matching_chars.append(chr(codepoint))
	print_char_table(matching_chars)
Beispiel #10
0
from . import Graph

import re
from collections import defaultdict
from html.entities import codepoint2name

# Matches one N-Triples statement: <subject> <predicate> followed by either an
# <object> IRI or a "literal" with an optional @lang tag, then the terminating
# full stop. The pieces are joined with optional whitespace. The terminator is
# an escaped dot so that only a literal "." is accepted at the end of the line.
NTRIPLES_PATTERN = re.compile(r'\s*'.join([
    r'^', r'<(?P<subj>[^>]*)>', r'<(?P<pred>[^>]*)>',
    r'(<(?P<obj>[^>]*)>|"(?P<lit>[^"]*)"(@[a-z]+)?)', r'\.', r'$'
]))
# Translation table for str.translate: code point -> "&name;" for every
# entry of codepoint2name.
ENTITIES = str.maketrans(dict(
    (codepoint, '&' + name + ';')
    for codepoint, name in codepoint2name.items()
))


def read_edge_list_graph(infile):
    """Parse an edge-list formatted graph from `infile`.

    The first line holds the number of vertices; each following line holds
    two integers separated by a single space, the endpoints of one edge.
    Returns a Graph over ``range(num_vertices)`` with those edges added.
    """
    vertex_count = int(infile.readline())
    result = Graph(range(vertex_count))

    for row in infile:
        endpoints = row.split(' ')
        # Unpacking enforces exactly two fields per line.
        src, dst = endpoints
        result.add_edge(int(src), int(dst))

    return result


def read_metis_graph(infile):
    "parse a METIS formatted graph from `infile`"
    vertices_edges = infile.readline().split(' ')
Beispiel #11
0
# -*- coding: utf-8 -*-
import re
from html import _escape_map_full
from html.entities import codepoint2name, name2codepoint

# code point -> "&name;" for every HTML 4 named entity; the entries imported
# from html._escape_map_full then override those for the code points they share.
html_entities = {
    codepoint: '&' + entity_name + ';'
    for codepoint, entity_name in codepoint2name.items()
}
html_entities.update(_escape_map_full)
# Inverse mapping: entity text -> code point.
entities_html = dict(
    (entity, codepoint) for codepoint, entity in html_entities.items()
)


def encode(string):
    """Turn characters into HTML entities.

    Covers more than html.escape does: any character with an entry in the
    module-level ``html_entities`` table (built from codepoint2name) is
    replaced.

    Returns:
        the string with entities in place of those characters.
    """
    result = string.translate(html_entities)
    return result


def decode(string):
    """Decodes html entities.

    Named entities are resolved through the module-level ``entities_html``
    table (entity text -> code point). Decimal (``&#233;``) and hexadecimal
    (``&#xE9;``) character references are decoded numerically. Anything that
    looks like an entity but cannot be resolved is left in the text as is
    instead of raising.

    Returns:
        string with html entities decoded.
    """
    def _replace(match):
        entity = match.group()
        codepoint = entities_html.get(entity)
        if codepoint is None and entity.startswith('&#'):
            digits = entity[2:-1]
            try:
                if digits[:1] in ('x', 'X'):
                    codepoint = int(digits[1:], 16)
                else:
                    codepoint = int(digits)
            except ValueError:
                codepoint = None
        if codepoint is None:
            # Unknown name or malformed number: keep the original text.
            return entity
        try:
            return chr(codepoint)
        except (ValueError, OverflowError):
            # Number outside the Unicode range.
            return entity

    # Entity names are case sensitive (e.g. "Eacute" vs "eacute"), so both
    # cases must be accepted by the pattern.
    return re.sub(
        '&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]+);',
        _replace,
        string,
    )
    return tweets_df_column.map( hastag_converter )


# Unit test, executed at import time: hastag_converter is expected to strip
# the leading "#" of each hashtag and to insert spaces at the word and number
# boundaries inside it (see the expected string below).
# NOTE(review): hastag_converter is not defined in the visible part of this
# file -- confirm it is defined before this point.
str_ = '#tousAvecLesBleus #AllezLesBleus #Mettre3-0 #viveLePSG à 17h30 JKL Algériens U victor ; ' + \
    'MNOP AB étienne #Titi127Tata3titi Didier BBCNews. AhAh!'
assert hastag_converter(str_) == \
    'tous Avec Les Bleus Allez Les Bleus Mettre 3-0 vive Le PSG à 17h30 JKL Algériens U victor ; MNOP AB étienne Titi 127 Tata 3 titi Didier BBC News. Ah Ah !' \
    , 'hastag_converter flawed'


#/////////////////////////////////////////////////////////////////////////////////////


from html.entities import codepoint2name
# Translation table for str.translate: code point -> "&name;".
html_escape_table = dict(
    (codepoint, '&' + entity_name + ';')
    for codepoint, entity_name in codepoint2name.items()
)


def html_escape(string: str) -> str:
    """Return `string` with every character that has an HTML 4 named entity
    replaced by that entity."""
    escaped = string.translate(html_escape_table)
    return escaped


#/////////////////////////////////////////////////////////////////////////////////////


def my_log_formatter_fun(y, pos):
    """
    inspired from 'https://stackoverflow.com/questions/21920233#33213196'
    (matplotlib-log-scale-tick-label-number-formatting)
    
    to be used as a "matplotlib.ticker.FuncFormatter"
    for axis label formatting in log scale.
Beispiel #13
0
    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to 
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)
        
        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity.  As an example, \u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))
        
        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)
                
        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)