def sanitize_characters(self, token): """Handles Characters tokens Our overridden tokenizer doesn't do anything with entities. However, that means that the serializer will convert all ``&`` in Characters tokens to ``&``. Since we don't want that, we extract entities here and convert them to Entity tokens so the serializer will let them be. :arg token: the Characters token to work on :returns: a list of tokens """ data = token.get('data', '') if not data: return token data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data) token['data'] = data # If there isn't a & in the data, we can return now if '&' not in data: return token new_tokens = [] # For each possible entity that starts with a "&", we try to extract an # actual entity and re-tokenize accordingly for part in html5lib_shim.next_possible_entity(data): if not part: continue if part.startswith('&'): entity = html5lib_shim.match_entity(part) if entity is not None: if entity == 'amp': # LinkifyFilter can't match urls across token boundaries # which is problematic with & since that shows up in # querystrings all the time. This special-cases & # and converts it to a & and sticks it in as a # Characters token. It'll get merged with surrounding # tokens in the BleachSanitizerfilter.__iter__ and # escaped in the serializer. new_tokens.append({'type': 'Characters', 'data': '&'}) else: new_tokens.append({'type': 'Entity', 'name': entity}) # Length of the entity plus 2--one for & at the beginning # and one for ; at the end remainder = part[len(entity) + 2:] if remainder: new_tokens.append({'type': 'Characters', 'data': remainder}) continue new_tokens.append({'type': 'Characters', 'data': part}) return new_tokens
def sanitize_characters(self, token): """Handles Characters tokens Our overridden tokenizer doesn't do anything with entities. However, that means that the serializer will convert all ``&`` in Characters tokens to ``&``. Since we don't want that, we extract entities here and convert them to Entity tokens so the serializer will let them be. :arg token: the Characters token to work on :returns: a list of tokens """ data = token.get('data', '') if not data: return token data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data) token['data'] = data # If there isn't a & in the data, we can return now if '&' not in data: return token new_tokens = [] # For each possible entity that starts with a "&", we try to extract an # actual entity and re-tokenize accordingly for part in html5lib_shim.next_possible_entity(data): if not part: continue if part.startswith('&'): entity = html5lib_shim.match_entity(part) if entity is not None: if entity == 'amp': # LinkifyFilter can't match urls across token boundaries # which is problematic with & since that shows up in # querystrings all the time. This special-cases & # and converts it to a & and sticks it in as a # Characters token. It'll get merged with surrounding # tokens in the BleachSanitizerfilter.__iter__ and # escaped in the serializer. new_tokens.append({'type': 'Characters', 'data': '&'}) else: new_tokens.append({'type': 'Entity', 'name': entity}) # Length of the entity plus 2--one for & at the beginning # and and one for ; at the end remainder = part[len(entity) + 2:] if remainder: new_tokens.append({'type': 'Characters', 'data': remainder}) continue new_tokens.append({'type': 'Characters', 'data': part}) return new_tokens
def sanitize_characters(self, token): """Handles Characters tokens Our overridden tokenizer doesn't do anything with entities. However, that means that the serializer will convert all ``&`` in Characters tokens to ``&``. Since we don't want that, we extract entities here and convert them to Entity tokens so the serializer will let them be. :arg token: the Characters token to work on :returns: a list of tokens """ data = token.get('data', '') if not data: return token data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data) token['data'] = data # If there isn't a & in the data, we can return now if '&' not in data: return token new_tokens = [] # For each possible entity that starts with a "&", we try to extract an # actual entity and re-tokenize accordingly for part in html5lib_shim.next_possible_entity(data): if not part: continue if part.startswith('&'): entity = html5lib_shim.match_entity(part) if entity is not None: new_tokens.append({'type': 'Entity', 'name': entity}) # Length of the entity plus 2--one for & at the beginning # and and one for ; at the end remainder = part[len(entity) + 2:] if remainder: new_tokens.append({ 'type': 'Characters', 'data': remainder }) continue new_tokens.append({'type': 'Characters', 'data': part}) return new_tokens
def sanitize_characters(self, token): """Handles Characters tokens Our overridden tokenizer doesn't do anything with entities. However, that means that the serializer will convert all ``&`` in Characters tokens to ``&``. Since we don't want that, we extract entities here and convert them to Entity tokens so the serializer will let them be. :arg token: the Characters token to work on :returns: a list of tokens """ data = token.get('data', '') if not data: return token data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data) token['data'] = data # If there isn't a & in the data, we can return now if '&' not in data: return token new_tokens = [] # For each possible entity that starts with a "&", we try to extract an # actual entity and re-tokenize accordingly for part in html5lib_shim.next_possible_entity(data): if not part: continue if part.startswith('&'): entity = html5lib_shim.match_entity(part) if entity is not None: new_tokens.append({'type': 'Entity', 'name': entity}) # Length of the entity plus 2--one for & at the beginning # and and one for ; at the end remainder = part[len(entity) + 2:] if remainder: new_tokens.append({'type': 'Characters', 'data': remainder}) continue new_tokens.append({'type': 'Characters', 'data': part}) return new_tokens