Example 1
def preprocess_twitter(infile, outfile):
    """
    Read Twitter text from the format we collected it in, and produce language-tagged
    lines.

    In this format, each line might come with some metadata, such as the tweet ID,
    which appears before the text, separated from the text by a tab character. Or it
    might not contain any such data. We weren't very consistent about it over the years.

    This function reads just the text (the part after the tab, if there is a tab). It
    removes URLs and Twitter handles from the text. It then language-detects the
    text, and if it is confident about the language, it outputs a new tab-separated
    file containing the language code and the processed text.

    This format could be read again by the same function, because the language code
    is now the metadata, but we have no reason to actually do this.
    """
    for line in infile:
        if "\t" in line:
            line = line.split("\t", 1)[1]
        text = line.rstrip()
        text = TWITTER_HANDLE_RE.sub("", text)
        text = TCO_RE.sub("", text)
        text = fix_surrogates(unescape_html(text)).replace("\n", " ")
        lang, _confidence = detect_language_checked(text)
        if lang != 'und':
            print(f"{lang}\t{text}", file=outfile)
Example 2
def tokenize_file(infile,
                  outfile,
                  language,
                  check_language=False,
                  punctuation=False,
                  ftfy=False):
    """
    Take in a file of plain text, tokenize it as the given language, and write
    the result as lines of space-separated tokens.
    """
    for line in infile:
        if ftfy:
            # Run all ftfy fixes, but don't let it introduce line breaks
            line = fix_text(line.rstrip()).replace('\n', ' ')
        else:
            # Run only specific quick fixes from ftfy
            line = fix_surrogates(unescape_html(line.rstrip()))
        tokens = tokenize(line,
                          language,
                          include_punctuation=punctuation,
                          external_wordlist=True)
        if check_language:
            # Keep the line only if the detected language is close enough to
            # the requested one (langcodes.tag_distance below 10).
            checked_lang, _confidence = detect_language_checked(line.rstrip())
            if langcodes.tag_distance(checked_lang, language) < 10:
                print(' '.join(tokens), file=outfile)
        else:
            print(' '.join(tokens), file=outfile)
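The check_language gate relies on langcodes.tag_distance, which scores how far apart two language tags are: 0 for identical tags, small values for close regional variants, large values for unrelated languages. A small sketch of how that threshold behaves, assuming the langcodes package is installed.

import langcodes

# The tokenizer above keeps a line only when the distance is below 10.
for desired, supported in [("en", "en"), ("pt", "pt-BR"), ("en", "ja")]:
    print(desired, supported, langcodes.tag_distance(desired, supported))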
Example 3
def preprocess_reddit(infile, outfile):
    """
    Read Reddit text from a JSON-lines file, parse the Markdown, and tag
    what language each post is in.

    Filter the posts to enforce _some_ standard of quality:

    - Posts in English should have score >= 2 (they should have net upvotes)
    - Other posts should have score >= 1 (no net downvotes)
    - Posts from subreddits that were banned as of 2018 are skipped
    """
    for line in infile:
        data = json.loads(line)
        if ('score' in data and 'body' in data and data["score"] is not None
                and data["score"] >= 1 and data["body"] != "[deleted]"):
            subreddit = data["subreddit"]
            subreddit_hash = mmh3.hash(subreddit)
            if subreddit_hash not in BANNED_SUBREDDITS:
                md = fix_surrogates(
                    unescape_html(fix_line_breaks(data["body"])))
                text = strip_markdown(md)
                text = text.replace("\n", " ").replace("\u200b", "")
                text = URL_RE.sub("", text)
                if text:
                    lang, confident = detect_language(text)
                    if confident:
                        # There are more English posts than we need, so filter them
                        # for score >= 2
                        if lang != "en" or data["score"] > 1:
                            print(f"{lang}\t{text}", file=outfile)
Example 4
def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
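The function keeps re-applying the fixes until one pass leaves the text unchanged, which handles text that was mangled more than once. A minimal usage sketch, assuming an ftfy version that still exposes fix_text_segment at the top level, as the definition above suggests; exact outputs depend on the installed version.

from ftfy import fix_text_segment

# '<' and '>' are both present, so fix_entities='auto' switches itself off
# and the HTML entity is left alone:
print(fix_text_segment("<b>&amp;</b>"))

# No angle brackets here, so the entities are unescaped and the resulting
# curly quotes are straightened by uncurl_quotes:
print(fix_text_segment("&ldquo;weird&rdquo; quotes"))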
Example 5
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
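Example 5 differs from example 4 only in its signature: the fix flags are ordinary positional-or-keyword parameters, whereas example 4 puts a bare * before them and makes them keyword-only. The practical difference, assuming either definition above is in scope:

fix_text_segment("some text", uncurl_quotes=False)   # fine with either signature
try:
    fix_text_segment("some text", False)              # flag passed positionally
except TypeError:
    # The signature in example 4 rejects this, because everything after the
    # bare * is keyword-only; the signature in example 5 would accept it and
    # silently treat False as fix_entities.
    pass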
Example 6
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
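Example 6 appears to come from an older ftfy release: encoding repair is called fix_text_encoding, there is a remove_unsafe_private_use step, and the default normalization is NFKC rather than NFC. The practical difference between those two normalization defaults can be seen directly with the standard library; this is unicodedata behavior, not anything ftfy-specific.

import unicodedata

s = "ﬁle №5"   # 'fi' ligature (U+FB01) and NUMERO SIGN (U+2116)
print(unicodedata.normalize("NFC", s))    # compatibility characters preserved
print(unicodedata.normalize("NFKC", s))   # folded to plain "fi" and "No"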
Example 8
def preprocess_reddit_lines(input_lines):
    for line in input_lines:
        data = json.loads(line)
        if ('score' in data and 'body' in data and data["score"] is not None
                and data["score"] >= 2 and data["body"] != "[deleted]"
                and data["body"] != "[removed]"):
            subreddit = data["subreddit"].casefold()
            subreddit_hash = mmh3.hash(subreddit)
            if subreddit_hash not in BANNED_SUBREDDITS:
                md = fix_surrogates(
                    unescape_html(fix_line_breaks(data["body"])))
                text = strip_markdown(md)
                text = text.replace("\n", " ").replace("\u200b", "")
                text = URL_RE.sub("", text)
                if text:
                    lang, _confidence = detect_language_checked(text)
                    if lang != 'und':
                        # There are more English posts than we need, so filter them
                        # for score >= 3
                        if lang != "en" or data["score"] > 2:
                            yield (lang, text)
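Unlike example 3, this version is a generator, so it can be dropped into any line-oriented pipeline. A hypothetical driver, assuming preprocess_reddit_lines and its helpers (BANNED_SUBREDDITS, URL_RE, detect_language_checked, the ftfy fixes) are in scope as above:

import sys

# Read Reddit JSON lines on stdin and write one "lang<TAB>text" line per kept post.
for lang, text in preprocess_reddit_lines(sys.stdin):
    print(f"{lang}\t{text}")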
Example 9
def test_surrogates():
    eq_(fix_surrogates('\udbff\udfff'), '\U0010ffff')
    eq_(fix_surrogates('\ud800\udc00'), '\U00010000')
Example 11
def test_surrogates():
    assert fix_surrogates('\udbff\udfff') == '\U0010ffff'
    assert fix_surrogates('\ud800\udc00') == '\U00010000'
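Both tests exercise the boundary cases of UTF-16 surrogate decoding: a high surrogate in U+D800..U+DBFF and a low surrogate in U+DC00..U+DFFF combine into a single code point above U+FFFF. The arithmetic the expected values rely on can be checked by hand:

def combine_surrogates(high, low):
    # Standard UTF-16 decoding: each surrogate carries 10 bits of the
    # code point, offset by 0x10000.
    return 0x10000 + ((ord(high) - 0xD800) << 10) + (ord(low) - 0xDC00)

print(hex(combine_surrogates('\udbff', '\udfff')))  # 0x10ffff, the last code point
print(hex(combine_surrogates('\ud800', '\udc00')))  # 0x10000, the first astral code point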