def __init__(self):
    """Compile the regular expressions used to spot HTML character entities.

    Every pattern is case-insensitive and tolerates a single optional space
    between the individual pieces of an entity (e.g. ``& amp ;``).

    Attributes set on ``self``:
        single / double:         named entities, plain (``&lt;``) and
                                 ampersand-escaped (``&amp;lt;``); group 1 is
                                 the entity name.
        singleNum / doubleNum:   decimal references (``&#160;`` and
                                 ``&amp;#160;``); group 1 holds the digits.
        singleXNum / doubleXNum: hexadecimal references (``&#xA0;`` and
                                 ``&amp;#xA0;``); group 1 holds the hex digits.
        nbsp:                    ``&nbsp;``-like entities, including variants
                                 with an ``x`` prefix or up to six extra
                                 letters, plus ``&osp;``.
        shy:                     ``&shy;`` together with one optional space
                                 on either side.
        bpe:                     always ``None`` here.
    """
    # Union of the HTML 4 names and the HTML5 names; HTML5 keys may carry a
    # trailing ';', which must not end up inside the alternation.
    symbol_set = set(name2codepoint)
    symbol_set.update(k.strip(';') for k in html5)

    # Sorted so that the generated pattern text is the same on every run
    # (set iteration order is not stable across interpreter invocations).
    symbols = '|'.join(sorted(symbol_set))

    self.single = re.compile('&[ ]?(' + symbols + ')[ ]?;', re.IGNORECASE)
    self.double = re.compile('&[ ]?amp[ ]?;[ ]?(' + symbols + ')[ ]?;',
                             re.IGNORECASE)

    self.singleNum = re.compile('&[ ]?#[ ]?([0-9]+)[ ]?;', re.IGNORECASE)
    self.doubleNum = re.compile('&[ ]?amp[ ]?;[ ]?#[ ]?([0-9]+)[ ]?;',
                                re.IGNORECASE)

    self.singleXNum = re.compile('&[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;',
                                 re.IGNORECASE)
    self.doubleXNum = re.compile(
        '&[ ]?amp[ ]?;[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;', re.IGNORECASE)

    # The previous pattern read 'n[]?b[ ]?', which the regex engine parses as
    # a single optional character class (']', '?', 'b', '[', ' ').  That made
    # the 'b' optional, so unrelated entities such as '&ne;' or '&not;' were
    # matched as nbsp.  'n[ ]?b[ ]?' is the form used everywhere else here.
    self.nbsp = re.compile(
        '(&[ ]?x?[ ]?n[ ]?b[ ]?([a-z][ ]?){0,6}[ ]?;)|(&[ ]?o[ ]?s[ ]?p[ ]?;)',
        re.IGNORECASE)

    self.shy = re.compile('[ ]?&[ ]?s[ ]?h[ ]?y[ ]?;[ ]?', re.IGNORECASE)

    self.bpe = None
Beispiel #2
0
def deentity(data, mode=0):
    """Replace HTML entities in a string with the characters they denote.

    Modes:

    0: Fast / common entities only (default)
    1: Comprehensive (slow)
    2: Syntax-critical escapes only
    3: Whole-file mode (skip syntax-critical escapes)

    Arguments:
    data: the string to decode
    mode: one of the integers listed above

    Returns the decoded string.

    Raises ValueError if mode is not 0, 1, 2 or 3.
    """
    # The level of overhead which results from inefficiencies in this function is phenomenal.
    # TODO: convert this entire function to do it properly or at least using a compiled regex.
    from html.entities import name2codepoint

    if mode == 0:
        foci = ('lt', 'gt', 'quot', 'nbsp', 'lsquo', 'rsquo', 'ldquo', 'rdquo', 'ndash', 'hellip', 'eacute')
    elif mode in (1, 3):
        foci = list(name2codepoint)
    elif mode == 2:
        foci = ('lt', 'gt')
    else:
        # Previously fell through and failed later on the unbound 'foci'.
        raise ValueError("unknown deentity mode: %r" % (mode,))

    for name in foci:
        # '&amp;' is handled last so that '&amp;lt;' decodes to '&lt;'
        # rather than being decoded twice.
        if name == "amp":
            continue
        # Whole-file mode leaves the markup-significant '<' and '>' escaped.
        if mode == 3 and name in ('lt', 'gt'):
            continue
        data = data.replace("&" + name + ";", chr(name2codepoint[name]))

    if mode in (0, 2):
        # Apostrophe escapes.  The entity text had been lost from this call
        # (it replaced a bare quote with itself); both common spellings are
        # decoded since the original literal cannot be recovered.
        data = data.replace("&#39;", chr(39))
        data = data.replace("&apos;", chr(39))
    elif mode in (1, ):#3):
        # Decimal character references for code points 0..255.
        for number in range(0x100):
            data = data.replace("&#" + str(number) + ";", chr(number))

    if mode != 3:
        # Same lost-literal problem as above: this used to replace '&' with
        # '&', which did nothing.
        data = data.replace("&amp;", "&")
    return data
Beispiel #3
0
def html2unicode(s):
    """Return *s* with named HTML entities replaced by their characters.

    The entities ``&amp;``, ``&gt;`` and ``&lt;`` are left untouched; every
    other name in ``name2codepoint`` is substituted.
    """
    untouched = ('amp', 'gt', 'lt')
    for entity_name, codepoint in name2codepoint.items():
        if entity_name in untouched:
            continue
        s = s.replace('&%s;' % entity_name, chr(codepoint))
    return s
Beispiel #4
0
def charent_occurrences(filename):
    """Count the named character entities that appear in a file.

    Arguments:
    filename: name of the file

    Returns a ``collections.defaultdict(int)`` mapping each entity name
    (without the surrounding ``&`` and ``;``) to the number of times it was
    found. Only names present in ``name2codepoint`` are recognised.
    """
    entity_re = re.compile('&(' + '|'.join(name2codepoint.keys()) + ');')
    counts = collections.defaultdict(int)

    with open(filename, 'r') as handle:
        for text_line in handle:
            for hit in entity_re.finditer(text_line):
                # Group 1 is the name between '&' and ';'.
                counts[hit.group(1)] += 1

    return counts
Beispiel #5
0
    def __init__(self, bpe_code_file):
        """Compile the entity-matching patterns and load the BPE codes.

        Every pattern is case-insensitive and tolerates a single optional
        space between the individual pieces of an entity (e.g. ``& amp ;``).

        Arguments:
            bpe_code_file: path of the BPE code file, opened as UTF-8 text
                and handed to ``BPE``. When falsy, an error is logged and
                ``self.bpe`` stays ``None``.

        Attributes set on ``self``:
            single / double:         named entities, plain (``&lt;``) and
                                     ampersand-escaped (``&amp;lt;``); group 1
                                     is the entity name.
            singleNum / doubleNum:   decimal references; group 1 holds the
                                     digits.
            singleXNum / doubleXNum: hexadecimal references; group 1 holds
                                     the hex digits.
            nbsp:                    ``&nbsp;``-like entities, including
                                     variants with an ``x`` prefix or up to
                                     six extra letters, plus ``&osp;``.
            shy:                     ``&shy;`` together with one optional
                                     space on either side.
            bpe:                     the ``BPE`` object, or ``None``.
        """
        super(Preprocessor, self).__init__()

        # Union of the HTML 4 names and the HTML5 names; HTML5 keys may carry
        # a trailing ';', which must not end up inside the alternation.
        symbol_set = set(name2codepoint)
        symbol_set.update(k.strip(';') for k in html5)

        # Sorted so that the generated pattern text is the same on every run
        # (set iteration order is not stable across interpreter invocations).
        symbols = '|'.join(sorted(symbol_set))

        self.single = re.compile('&[ ]?(' + symbols + ')[ ]?;', re.IGNORECASE)
        self.double = re.compile('&[ ]?amp[ ]?;[ ]?(' + symbols + ')[ ]?;',
                                 re.IGNORECASE)

        self.singleNum = re.compile('&[ ]?#[ ]?([0-9]+)[ ]?;', re.IGNORECASE)
        self.doubleNum = re.compile('&[ ]?amp[ ]?;[ ]?#[ ]?([0-9]+)[ ]?;',
                                    re.IGNORECASE)

        self.singleXNum = re.compile('&[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;',
                                     re.IGNORECASE)
        self.doubleXNum = re.compile(
            '&[ ]?amp[ ]?;[ ]?#[ ]?x[ ]?([a-f0-9]+)[ ]?;', re.IGNORECASE)

        # The previous pattern read 'n[]?b[ ]?', which the regex engine parses
        # as a single optional character class (']', '?', 'b', '[', ' ').
        # That made the 'b' optional, so unrelated entities such as '&ne;' or
        # '&not;' were matched as nbsp.
        self.nbsp = re.compile(
            '(&[ ]?x?[ ]?n[ ]?b[ ]?([a-z][ ]?){0,6}[ ]?;)|(&[ ]?o[ ]?s[ ]?p[ ]?;)',
            re.IGNORECASE)

        self.shy = re.compile('[ ]?&[ ]?s[ ]?h[ ]?y[ ]?;[ ]?', re.IGNORECASE)

        self.bpe = None
        if bpe_code_file:
            with open(bpe_code_file, mode='r', encoding='utf-8') as f:
                self.bpe = BPE(f)
        else:
            logging.error('No BPE code file specified')
Beispiel #6
0
def substitute_char_entities(text):
    """Substitute html character entities in text for their corresponding unicode code points.

    Arguments:
    text - a text iterable; its items are concatenated before processing

    Returns the result of ``pattern.sub`` over the concatenated text, using a
    ``CharEntitySubstituter`` as the replacement.
    """
    entity_re = re.compile('&(' + '|'.join(name2codepoint.keys()) + ');')
    joined = ''.join(text)

    # Names (without '&' and ';') of the entities that actually occur.
    found = set()
    for hit in entity_re.finditer(joined):
        found.add(hit.group(1))

    # Only the entities present in the text are handed to the substituter.
    codepoints = {name: name2codepoint[name] for name in found}
    codepoints['nbsp'] = ord(' ')  # treat nbsp as an ordinary space

    # NOTE(review): CharEntitySubstituter is defined elsewhere; presumably it
    # is a callable mapping each match to its replacement text - confirm.
    replacer = CharEntitySubstituter(codepoints)
    return entity_re.sub(replacer, joined)
Beispiel #7
0
def html2unicode(s):
    """Return *s* with every named HTML entity replaced by its character.

    All names in ``name2codepoint`` are substituted. ``&amp;`` is decoded
    last: decoding it in the middle of the loop turned an escaped entity
    such as ``&amp;lt;`` into ``&lt;`` and then, a few iterations later,
    into ``<`` - i.e. the text was unescaped twice.
    """
    for name, codepoint in name2codepoint.items():
        if name != 'amp':
            s = s.replace('&%s;' % name, chr(codepoint))
    # No other replacement produces '&', so doing this last is safe.
    return s.replace('&amp;', chr(name2codepoint['amp']))