Esempio n. 1
0
def check_phrase(text):
    eq_(fix_text_encoding(text), text)
    eq_(fix_text_encoding(text.encode('utf-8').decode('latin-1')), text)
    # make sure that the opening punctuation is not the only thing that makes
    # it work
    eq_(fix_text_encoding(text[1:]), text[1:])
    eq_(fix_text_encoding(text[1:].encode('utf-8').decode('latin-1')), text[1:])
Esempio n. 2
0
def test_all_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            garble = char.encode('utf-8').decode('latin-1')
            garble2 = char.encode('utf-8').decode('latin-1').encode('utf-8').decode('latin-1')
            eq_(char_names(fix_text_encoding(garble)), char_names(char))
            eq_(char_names(fix_text_encoding(garble2)), char_names(char))
Esempio n. 3
0
def fix_bad_encoding(text):
    """
    Kept for compatibility with previous versions of ftfy.
    """
    warnings.warn('fix_bad_encoding is now known as fix_text_encoding',
                  DeprecationWarning)
    return fix_text_encoding(text)
Esempio n. 4
0
 def check_ftfy(self, text):
     check_text = remove_unsafe_private_use(text).lower()
     if not possible_encoding(text, 'ascii') and 'unfollow' not in check_text:
         fixed = fix_text_encoding(text)
         if text != fixed:
             if not check_text.startswith('http://t.co/'):
                 print(u'Text:\t{text}\nFixed:\t{fixed}\n'.format(text=text, fixed=fixed))
             self.num_fixed += 1
Esempio n. 5
0
def fix_bad_encoding(text):
    """
    Kept for compatibility with previous versions of ftfy.
    """
    warnings.warn(
        'fix_bad_encoding is now known as fix_text_encoding',
        DeprecationWarning
    )
    return fix_text_encoding(text)
Esempio n. 6
0
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
Esempio n. 7
0
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
Esempio n. 8
0
def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities=True,
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    unsafe_entities = ('<' in text and '>' in text)
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities and not unsafe_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
Esempio n. 9
0
    def check_ftfy(self, text):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.
        """
        self.count += 1
        if not possible_encoding(text, 'ascii'):
            fixed = fix_text_encoding(text)
            if text != fixed:
                # possibly filter common bots before printing
                print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                    text=text, fixed=fixed))
                self.num_fixed += 1

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
Esempio n. 10
0
def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities=True,
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    unsafe_entities = ('<' in text and '>' in text)
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities and not unsafe_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
Esempio n. 11
0
    def check_ftfy(self, text):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.
        """
        self.count += 1
        if not possible_encoding(text, 'ascii'):
            fixed = fix_text_encoding(text)
            if text != fixed:
                # possibly filter common bots before printing
                print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                    text=text, fixed=fixed
                ))
                self.num_fixed += 1

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
Esempio n. 12
0
def check_phrase(text):
    eq_(fix_text_encoding(text), text)
    eq_(fix_text_encoding(text.encode('utf-8').decode('latin-1')), text)
Esempio n. 13
0
def test_fix_with_backslash():
    eq_(fix_text_encoding(u"<40\\% vs \xe2\x89\xa540\\%"), u"<40\\% vs ≥40\\%")
Esempio n. 14
0
def test_fix_with_backslash():
    eq_(fix_text_encoding(u"<40\\% vs \xe2\x89\xa540\\%"), u"<40\\% vs ≥40\\%")