def convertSpecialCharToHTML(text):
    """Replace characters that have an HTML 4 named entity with ``&name;``.

    Every character of ``text`` whose code point appears in
    ``html.entities.codepoint2name`` (``&``, ``<``, ``>``, ``"``, accented
    Latin letters, Greek letters, ...) is replaced by its entity reference.
    All other characters are copied through unchanged.

    Args:
        text: The string to escape.

    Returns:
        A new string with every mappable character replaced.
    """
    from html.entities import codepoint2name

    # str.translate walks the input exactly once.  That matters for two
    # reasons: every mappable character gets replaced (not just the last one
    # found), and the "&" that starts an entity we have just written is never
    # itself re-escaped into "&amp;".
    entity_table = {
        codepoint: "&" + name + ";"
        for codepoint, name in codepoint2name.items()
    }
    return text.translate(entity_table)
def _populate_class_variables():
    """Build the tables used to convert between characters and entities.

    Returns a 3-tuple:

     * a dict mapping a character to the entity name used when writing it
       out (the double and single quote are deliberately absent),
     * a dict mapping an entity name to its character, used when reading
       entities in (this one does include ``quot`` and ``apos``),
     * a compiled regular expression matching any single character that is
       a key of the first dict.
    """
    # &apos; is an XHTML and HTML 5 entity but not an HTML 4 one, so
    # codepoint2name does not list it.  It is never emitted, but it must be
    # recognised on input, so it is appended to the known pairs here.
    #
    # TODO: Ideally all HTML 5 named entities would be recognised, but
    # that's a little tricky.
    known_pairs = list(codepoint2name.items())
    known_pairs.append((39, 'apos'))

    char_to_name = {}
    name_to_char = {}
    escapable = []
    for number, entity in known_pairs:
        char = chr(number)

        # Every entity is recognised on the way in and converted to its
        # Unicode character.
        name_to_char[entity] = char

        # There's no point in turning the quotation mark into &quot; or the
        # single quote into &apos; on the way out, unless it happens within
        # an attribute value, which is handled elsewhere.
        if number in (34, 39):
            continue
        escapable.append(char)
        char_to_name[char] = entity

    char_class = "[%s]" % "".join(escapable)
    return char_to_name, name_to_char, re.compile(char_class)
def convertSpecialCharToHTML(text):
    """Replace characters that have an HTML 4 named entity with ``&name;``.

    Every character of ``text`` whose code point appears in
    ``html.entities.codepoint2name`` (``&``, ``<``, ``>``, ``"``, accented
    Latin letters, Greek letters, ...) is replaced by its entity reference.
    All other characters are copied through unchanged.

    Args:
        text: The string to escape.

    Returns:
        A new string with every mappable character replaced.
    """
    from html.entities import codepoint2name

    # A single translate pass replaces every mappable character, and it
    # cannot double-escape: the "&" of an entity that was just written is
    # output, not input, so it is never turned into "&amp;" afterwards.
    entity_table = {
        codepoint: "&" + name + ";"
        for codepoint, name in codepoint2name.items()
    }
    return text.translate(entity_table)
def _populate_class_variables():
    """Build the tables used to convert between characters and entities.

    Returns a 3-tuple:

     * a dict mapping a character to its entity name, for output (the
       double quote is deliberately absent),
     * a dict mapping an entity name to its character, for input (this one
       does include ``quot``),
     * a compiled regular expression matching any single character that is
       a key of the first dict.
    """
    char_to_name = {}
    name_to_char = {}
    escapable = []
    for number, entity in codepoint2name.items():
        char = chr(number)

        # On the way in, &quot; must still be turned into the quotation mark.
        name_to_char[entity] = char

        # There's no point in turning the quotation mark into &quot; on the
        # way out, unless it happens within an attribute value, which is
        # handled elsewhere.
        if number == 34:
            continue
        escapable.append(char)
        char_to_name[char] = entity

    char_class = "[%s]" % "".join(escapable)
    return char_to_name, name_to_char, re.compile(char_class)
def _populate_class_variables():
    """Build the character/entity lookup tables and the matching regex.

    Returns a 3-tuple of: a character -> entity-name dict used for output
    (without the double quote), an entity-name -> character dict used for
    input (with ``quot``), and a compiled regular expression that matches
    any one character present in the first dict.
    """
    pairs = [(chr(number), entity) for number, entity in codepoint2name.items()]

    # Input direction: every known entity name, &quot; included, maps back
    # to its character.
    name_to_char = {entity: char for char, entity in pairs}

    # Output direction: there's no point in turning the quotation mark into
    # &quot;, unless it happens within an attribute value, which is handled
    # elsewhere -- so code point 34 is left out here.
    outgoing = [(char, entity) for char, entity in pairs if char != chr(34)]
    char_to_name = dict(outgoing)

    char_class = "[%s]" % "".join([char for char, _ in outgoing])
    return char_to_name, name_to_char, re.compile(char_class)
# -*- coding: utf-8 -*- import re from html import _escape_map_full from html.entities import codepoint2name html_entities = { _ord: '&{0};'.format(value) for _ord, value in codepoint2name.items() } html_entities.update(_escape_map_full) entities_html = {value: _ord for _ord, value in html_entities.items()} def encode(string): """Encodes html entities. This is a little more full featured than html.escape, as it will replace all charactes from codepoint2name. Returns: string with replaced html entities. """ return string.translate(html_entities) def decode(string): """Decodes html entities. Returns: string with html entities decoded. """
def __init__(self, codepoint2name, name2codepoint):
    """Store the entity tables on the instance.

    Args:
        codepoint2name: Mapping whose ``items()`` yields
            ``(code point, entity name)`` pairs.
        name2codepoint: Mapping kept as-is on ``self.name2codepoint``.
    """
    # Pre-render each name as a complete "&name;" reference, keyed by the
    # same code point, so the text only has to be built once.
    rendered = {}
    for codepoint, name in codepoint2name.items():
        rendered[codepoint] = str("&%s;" % name)
    self.codepoint2entity = rendered
    self.name2codepoint = name2codepoint
# -*- coding: utf-8 -*- import re from html import _escape_map_full from html.entities import codepoint2name html_entities = {_ord: '&{0};'.format(value) for _ord, value in codepoint2name.items()} html_entities.update(_escape_map_full) entities_html = {value: _ord for _ord, value in html_entities.items()} def encode(string): """Encodes html entities. This is a little more full featured than html.escape, as it will replace all charactes from codepoint2name. Returns: string with replaced html entities. """ return string.translate(html_entities) def decode(string): """Decodes html entities. Returns: string with html entities decoded. """ return ( re.sub(
def entity_parse(entity_pattern):
    """Collect the characters whose entity name matches ``entity_pattern``.

    Each entity name in ``codepoint2name`` is tested with
    ``entity_pattern.search``; the characters of the names that match are
    gathered into a list and passed to ``print_char_table``.

    Args:
        entity_pattern: An object with a ``search(name)`` method, such as a
            compiled regular expression.
    """
    matching = []
    for codepoint, entity_name in codepoint2name.items():
        if entity_pattern.search(entity_name):
            matching.append(chr(codepoint))
    print_char_table(matching)
from . import Graph import re from collections import defaultdict from html.entities import codepoint2name NTRIPLES_PATTERN = re.compile(r'\s*'.join([ r'^', r'<(?P<subj>[^>]*)>', r'<(?P<pred>[^>]*)>', r'(<(?P<obj>[^>]*)>|"(?P<lit>[^"]*)"(@[a-z]+)?)', r'.', r'$' ])) ENTITIES = str.maketrans({ codepoint: '&{};'.format(name) for codepoint, name in codepoint2name.items() }) def read_edge_list_graph(infile): "parse an edge-list formatted graph from `infile`" num_vertices = int(infile.readline()) graph = Graph(range(num_vertices)) for line in infile: source, target = line.split(' ') graph.add_edge(int(source), int(target)) return graph def read_metis_graph(infile): "parse a METIS formatted graph from `infile`" vertices_edges = infile.readline().split(' ')
# -*- coding: utf-8 -*-
import re
from html.entities import codepoint2name, name2codepoint

try:
    from html import _escape_map_full
except ImportError:
    # ``_escape_map_full`` is a private table that the standard library's
    # ``html`` module no longer provides.  Rebuild the same five entries
    # here so this module still imports.
    _escape_map_full = {
        ord('&'): '&amp;',
        ord('<'): '&lt;',
        ord('>'): '&gt;',
        ord('"'): '&quot;',
        ord('\''): '&#x27;',
    }

# code point -> "&name;" for every HTML 4 named entity, plus the escapes
# html.escape uses (which adds "&#x27;" for the single quote).
html_entities = {_ord: '&{0};'.format(value) for _ord, value in codepoint2name.items()}
html_entities.update(_escape_map_full)

# "&name;" -> code point: the inverse of ``html_entities``.
entities_html = {value: _ord for _ord, value in html_entities.items()}

# Entity names are case-sensitive ("&Eacute;" and "&eacute;" are different
# characters), so the pattern has to accept upper-case letters as well.
_ENTITY_RE = re.compile('&[#A-Za-z][A-Za-z0-9]+;')


def encode(string):
    """Encodes html entities.

    This is a little more full featured than html.escape, as it will
    replace all characters from codepoint2name.

    Returns:
        string with replaced html entities.
    """
    return string.translate(html_entities)


def decode(string):
    """Decodes html entities.

    Only the entities that ``encode`` can produce (the keys of
    ``entities_html``) are decoded.  Anything else that merely looks like
    an entity is left in the text unchanged rather than raising.

    Returns:
        string with html entities decoded.
    """
    def _replace(match):
        entity = match.group()
        codepoint = entities_html.get(entity)
        return entity if codepoint is None else chr(codepoint)

    return _ENTITY_RE.sub(_replace, string)
return tweets_df_column.map( hastag_converter ) # unit_test str_ = '#tousAvecLesBleus #AllezLesBleus #Mettre3-0 #viveLePSG à 17h30 JKL Algériens U victor ; ' + \ 'MNOP AB étienne #Titi127Tata3titi Didier BBCNews. AhAh!' assert hastag_converter(str_) == \ 'tous Avec Les Bleus Allez Les Bleus Mettre 3-0 vive Le PSG à 17h30 JKL Algériens U victor ; MNOP AB étienne Titi 127 Tata 3 titi Didier BBC News. Ah Ah !' \ , 'hastag_converter flawed' #///////////////////////////////////////////////////////////////////////////////////// from html.entities import codepoint2name html_escape_table = {k: '&{};'.format(v) for k, v in codepoint2name.items()} def html_escape(string: str) -> str : return string.translate(html_escape_table) #///////////////////////////////////////////////////////////////////////////////////// def my_log_formatter_fun(y, pos): """ inspired from 'https://stackoverflow.com/questions/21920233#33213196' (matplotlib-log-scale-tick-label-number-formatting) to be used as a "matplotlib.ticker.FuncFormatter" for axis label formatting in log scale.
def _populate_class_variables():
    """Build the tables and regular expression used to handle the many
    HTML5 named entities.

    Returns a 3-tuple of two dictionaries and a regular expression:

     unicode_to_name - Maps a Unicode string such as "⦨" to an entity
      name such as "angmsdaa". Where one string has several entity names,
      the names in codepoint2name are preferred.

     name_to_unicode - Maps an entity name such as "angmsdaa" to a
      Unicode string such as "⦨".

     named_entity_re - Matches (almost) any Unicode string that
      corresponds to an HTML5 named entity.
    """
    char_to_entity = {}
    entity_to_char = {}

    # Candidates for the regular expression, split by length. Multi-character
    # strings are grouped under their first character so that single
    # characters which are a prefix of a longer entity can be found later.
    single_chars = set()
    multi_chars_by_head = defaultdict(set)

    for raw_name, text in sorted(html5.items()):
        # The html5 table lists many names both with and without the
        # trailing semicolon (kept by the spec for legacy compatibility:
        # https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references).
        # Handling a missing semicolon is the parser's job, so the
        # semicolon is dropped here whenever it is present.
        entity = raw_name[:-1] if raw_name.endswith(';') else raw_name

        # Parsing direction: any known name must be recognised. The first
        # value recorded for a name is the one that is kept.
        entity_to_char.setdefault(entity, text)

        # Generating direction: remember a name for this string. Later
        # names (in sorted order) overwrite earlier ones.
        char_to_entity[text] = entity

        # Decide whether this string belongs in the regular expression.
        if len(text) == 1:
            # Turning a lone ASCII character like | into &verbar; would
            # just be annoying. The exceptions are <, > and &, which
            # _must_ become entities to produce valid HTML.
            if ord(text) < 128 and text not in '<>&':
                continue
            single_chars.add(text)
            continue

        # Likewise, _combinations_ of ASCII characters like 'fj' are not
        # turned into entities like &fjlig;, though that's more debatable.
        if len(text) > 1 and all(ord(ch) < 128 for ch in text):
            continue

        # Some strings extend another entity's string: '\u2267' is
        # &GreaterFullEqual; but '\u2267\u0338' is
        # &NotGreaterFullEqual;. The expression cannot be finalised until
        # every entity has been seen, so just file the long form for now.
        multi_chars_by_head[text[0]].add(text)

    # Every entity has been seen; assemble the alternatives.
    alternatives = set()
    for single in single_chars:
        extended = multi_chars_by_head[single]
        if extended:
            # Match e.g. \u2267 only when it is _not_ followed by \u0338,
            # so the longer entity gets the chance to match instead.
            followers = "".join([seq[1] for seq in extended])
            alternatives.add("%s(?![%s])" % (single, followers))
        else:
            alternatives.add(single)

    for extended in list(multi_chars_by_head.values()):
        alternatives.update(extended)

    pattern = "(%s)" % "|".join(alternatives)

    # When a character is in both html5 and codepoint2name, HTML5 probably
    # gives it several names, such as 'rsquo' and 'rsquor'. For output the
    # codepoint2name spelling takes precedence, since that's the more
    # easily recognizable one.
    for codepoint, entity in list(codepoint2name.items()):
        char_to_entity[chr(codepoint)] = entity

    return char_to_entity, entity_to_char, re.compile(pattern)