def __init__(self):
    """Compile the regular expressions used to recognise HTML entities.

    Every pattern is case-insensitive and allows one optional space between
    the individual tokens of an entity (for example ``& amp ;``).

    Attributes set on the instance:
        single:     a named entity, the name is captured in group 1.
        double:     a named entity preceded by ``&amp;`` (double escaped).
        singleNum:  a decimal character reference such as ``&#160;``.
        doubleNum:  a decimal reference preceded by ``&amp;``.
        singleXNum: a hexadecimal reference such as ``&#xA0;``.
        doubleXNum: a hexadecimal reference preceded by ``&amp;``.
        nbsp:       ``&nb...;`` (optionally prefixed with ``x`` and followed
                    by up to six letters) or ``&osp;``.
        shy:        ``&shy;`` together with one optional space on each side.
        bpe:        initialised to ``None``.
    """
    # Union of the HTML 4 names and the HTML5 names; HTML5 keys may carry a
    # trailing ';' which must not end up inside the alternation.
    names = set(name2codepoint)
    names.update(key.strip(';') for key in html5)
    # Sorting keeps the generated pattern identical from run to run.
    symbols = '|'.join(sorted(names))

    flags = re.IGNORECASE
    self.single = re.compile('&[ ]?(' + symbols + ')[ ]?;', flags)
    self.double = re.compile('&[ ]?amp[ ]?;[ ]?(' + symbols + ')[ ]?;', flags)
    self.singleNum = re.compile('&[ ]?#[ ]?([0-9]+)[ ]?;', flags)
    self.doubleNum = re.compile('&[ ]?amp[ ]?;[ ]?#[ ]?([0-9]+)[ ]?;', flags)
    self.singleXNum = re.compile('&[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;', flags)
    self.doubleXNum = re.compile(
        '&[ ]?amp[ ]?;[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;', flags)
    # The space inside the class after 'n' is required: written as '[]?b[ ]'
    # the regex engine reads one optional character class containing
    # ']', '?', 'b', '[' and ' ', which made the 'b' optional and let
    # entities such as '&not;' or '&ne;' match as a non-breaking space.
    self.nbsp = re.compile(
        '(&[ ]?x?[ ]?n[ ]?b[ ]?([a-z][ ]?){0,6}[ ]?;)|(&[ ]?o[ ]?s[ ]?p[ ]?;)',
        flags)
    self.shy = re.compile('[ ]?&[ ]?s[ ]?h[ ]?y[ ]?;[ ]?', flags)
    self.bpe = None
def deentity(data, mode=0):
    """Replace HTML entities in a string with the characters they denote.

    Modes:
        0: Fast / common entities only (default)
        1: Comprehensive (slow): every HTML 4 named entity plus the decimal
           references ``&#0;`` .. ``&#255;``
        2: Syntax-critical escapes only
        3: Whole-file mode (skip syntax-critical escapes): ``&lt;``,
           ``&gt;`` and ``&amp;`` are left untouched

    Args:
        data: the text to decode.
        mode: one of the integers listed above.

    Returns:
        The decoded string.

    Raises:
        ValueError: if *mode* is not one of 0, 1, 2 or 3.
    """
    # TODO: the chained str.replace calls rescan the text once per entity;
    # a single compiled regex would be much cheaper.
    from html.entities import name2codepoint

    if mode == 0:
        foci = ('lt', 'gt', 'quot', 'nbsp', 'lsquo', 'rsquo', 'ldquo',
                'rdquo', 'ndash', 'hellip', 'eacute')
    elif mode in (1, 3):
        foci = list(name2codepoint)
    elif mode == 2:
        foci = ('lt', 'gt')
    else:
        raise ValueError("unsupported deentity mode: %r" % (mode,))

    keep_markup = mode == 3
    for name in foci:
        # '&amp;' is handled last so that '&amp;lt;' decodes to '&lt;'
        # instead of being decoded twice.
        if name == "amp":
            continue
        if keep_markup and name in ('lt', 'gt'):
            continue
        data = data.replace("&" + name + ";", chr(name2codepoint[name]))

    if mode in (0, 2):
        # The apostrophe reference; the search string must be the entity
        # itself, replacing "'" with chr(39) would do nothing.
        data = data.replace("&#39;", chr(39))
    elif mode == 1:
        for number in range(0x100):
            data = data.replace("&#" + str(number) + ";", chr(number))

    if mode != 3:
        data = data.replace("&amp;", "&")
    return data
def html2unicode(s):
    """Replace named HTML entities in *s* with their unicode characters.

    The entities ``&amp;``, ``&gt;`` and ``&lt;`` are deliberately left as
    they are; every other name in ``name2codepoint`` is substituted.
    """
    protected = ('amp', 'gt', 'lt')
    for entity, codepoint in name2codepoint.items():
        if entity in protected:
            continue
        s = s.replace('&' + entity + ';', chr(codepoint))
    return s
def charent_occurrences(filename):
    """Count how often each named character entity appears in a file.

    Arguments:
        filename: name of the file

    Returns a ``collections.defaultdict(int)`` mapping the entity name
    (without the surrounding ``&`` and ``;``) to its number of occurrences.
    Only names present in ``name2codepoint`` are recognised.
    """
    entity_re = re.compile('&(' + '|'.join(name2codepoint.keys()) + ');')
    occurrences = collections.defaultdict(int)
    with open(filename, 'r') as f:
        for line in f:
            # With a single capture group, findall yields the bare names.
            for entity_name in entity_re.findall(line):
                occurrences[entity_name] += 1
    return occurrences
def __init__(self, bpe_code_file):
    """Compile the entity patterns and load the BPE codes.

    Every pattern is case-insensitive and allows one optional space between
    the individual tokens of an entity (for example ``& amp ;``).

    Args:
        bpe_code_file: path of the BPE code file. It is opened as UTF-8
            text and handed to ``BPE``. When the value is falsy an error is
            logged and ``self.bpe`` stays ``None``.

    Attributes set on the instance:
        single / double:         named entity, plain or behind ``&amp;``.
        singleNum / doubleNum:   decimal reference, plain or behind ``&amp;``.
        singleXNum / doubleXNum: hexadecimal reference, plain or behind
                                 ``&amp;``.
        nbsp: ``&nb...;`` (optionally prefixed with ``x`` and followed by up
              to six letters) or ``&osp;``.
        shy:  ``&shy;`` together with one optional space on each side.
        bpe:  the ``BPE`` object, or ``None`` when no code file was given.
    """
    super(Preprocessor, self).__init__()

    # Union of the HTML 4 names and the HTML5 names; HTML5 keys may carry a
    # trailing ';' which must not end up inside the alternation.
    names = set(name2codepoint)
    names.update(key.strip(';') for key in html5)
    # Sorting keeps the generated pattern identical from run to run.
    symbols = '|'.join(sorted(names))

    flags = re.IGNORECASE
    self.single = re.compile('&[ ]?(' + symbols + ')[ ]?;', flags)
    self.double = re.compile('&[ ]?amp[ ]?;[ ]?(' + symbols + ')[ ]?;', flags)
    self.singleNum = re.compile('&[ ]?#[ ]?([0-9]+)[ ]?;', flags)
    self.doubleNum = re.compile('&[ ]?amp[ ]?;[ ]?#[ ]?([0-9]+)[ ]?;', flags)
    self.singleXNum = re.compile('&[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;', flags)
    self.doubleXNum = re.compile(
        '&[ ]?amp[ ]?;[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;', flags)
    # The space inside the class after 'n' is required: written as '[]?b[ ]'
    # the regex engine reads one optional character class containing
    # ']', '?', 'b', '[' and ' ', which made the 'b' optional and let
    # entities such as '&not;' or '&ne;' match as a non-breaking space.
    self.nbsp = re.compile(
        '(&[ ]?x?[ ]?n[ ]?b[ ]?([a-z][ ]?){0,6}[ ]?;)|(&[ ]?o[ ]?s[ ]?p[ ]?;)',
        flags)
    self.shy = re.compile('[ ]?&[ ]?s[ ]?h[ ]?y[ ]?;[ ]?', flags)

    self.bpe = None
    if bpe_code_file:
        with open(bpe_code_file, mode='r', encoding='utf-8') as f:
            self.bpe = BPE(f)
    else:
        logging.error('No BPE code file specified')
def substitute_char_entities(text):
    """Substitute html character entities in text for their corresponding
    unicode code points.

    Arguments:
        text - a text iterable; its items are concatenated before processing

    Returns the result of substituting every recognised entity through a
    ``CharEntitySubstituter`` built from the entities found in the text.
    """
    entity_re = re.compile('&(' + '|'.join(name2codepoint.keys()) + ');')
    joined = ''.join(text)

    # Names of the entities that actually occur in the text.
    found = {hit.group(1) for hit in entity_re.finditer(joined)}

    entdefs = {name: name2codepoint[name] for name in found}
    entdefs['nbsp'] = ord(' ')  # treat nbsp as a plain space

    # NOTE(review): CharEntitySubstituter is defined elsewhere; presumably it
    # is a callable that maps a match object to its replacement - confirm.
    return entity_re.sub(CharEntitySubstituter(entdefs), joined)
def html2unicode(s):
    """Replace named HTML entities in *s* with their unicode characters.

    Every name in ``name2codepoint`` is substituted. ``&amp;`` is decoded
    last: doing it in dictionary order turned ``&amp;lt;`` into ``<`` (the
    freshly produced ``&lt;`` was decoded a second time), whereas the text
    actually encodes the literal string ``&lt;``.
    """
    for name, codepoint in name2codepoint.items():
        if name == 'amp':
            continue
        s = s.replace('&%s;' % name, chr(codepoint))
    # No other replacement can produce an '&', so decoding it at the end
    # guarantees exactly one level of unescaping.
    return s.replace('&amp;', '&')