Python remove_bomの例、ftfy.fixes.remove_bom Pythonの例

コード例 #1

0

ファイルを表示

def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text

コード例 #2

0

ファイルを表示

ファイル: __init__.py プロジェクト: rlugojr/python-ftfy

def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text

コード例 #3

0

ファイルを表示

def TestInput(data):
    if len(data) < 1:
        return

    fdp = atheris.FuzzedDataProvider(data)

    ftfy.character_width(chr(fdp.ConsumeIntInRange(1,1114110)))
    ftfy.monospaced_width(fdp.ConsumeString(1000))
    ftfy.monospaced_width(fdp.ConsumeUnicode(1000))

    choice = fdp.ConsumeIntInRange(1,3)
    if choice == 1:
        ftfy.display_ljust(fdp.ConsumeString(1000),fdp.ConsumeIntInRange(1,2000))
        ftfy.display_ljust(fdp.ConsumeUnicode(1000),fdp.ConsumeIntInRange(1,2000))
    if choice == 2:
        ftfy.display_rjust(fdp.ConsumeString(1000),fdp.ConsumeIntInRange(1,2000))
        ftfy.display_rjust(fdp.ConsumeUnicode(1000),fdp.ConsumeIntInRange(1,2000))
    if choice == 3:
        ftfy.display_center(fdp.ConsumeString(1000),fdp.ConsumeIntInRange(1,2000))
        ftfy.display_center(fdp.ConsumeUnicode(1000),fdp.ConsumeIntInRange(1,2000))

    fixes.remove_bom(fdp.ConsumeString(1000))
    fixes.remove_bom(fdp.ConsumeUnicode(1000))

コード例 #4

0

ファイルを表示

ファイル: __init__.py プロジェクト: d-mittal/python-ftfy

def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text

コード例 #5

0

ファイルを表示

def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text

コード例 #6

0

ファイルを表示

ファイル: general.py プロジェクト: danielapoliveira/iswc-annotation-challenge

def clean_string(s):
    s = str(s)
    if isnull(s):
        return None
    elif re.search('[a-zA-Z]', s) is None:
        return None
    else:
        s = remove_bom(s)
        s = remove_control_chars(s)
        s = fix_encoding(s)
        s = fix_text(s)
        s = fix_partial_utf8_punct_in_1252(s)
        s = decode_escapes(s)
        s = fix_latin_ligatures(s)
        s = uncurl_quotes(s)
        s = s.replace("Äu0087", "ć")
        s = s.replace("Äu0090", "Đ")
        s = s.replace("Ãu0096", "Ö")
        s = s.replace("Åu008D", "ō")

        s = s.replace("\\", " ")
        s = s.replace("/", " ")
        s = s.replace("Ã¶", "ö")

        p = re.compile("^\w+[A-Z]{1}\w*$")
        if p.search(s):
            # From: https://stackoverflow.com/a/37697078
            s = re.sub('(?!^)([A-Z][a-z]+)', r'\1', s)

        new_string = ""
        p = False
        for letter in s:
            if letter in "([":
                p = True
            elif letter in ")]":
                p = False
                continue
            if not p:
                new_string += letter
        return new_string.strip()