Example #1
def count_files_to_freqs(input_filenames, output_filename):
    """
    Take in multiple files of word counts by their filename, and produce a
    frequency list in the named output file. The counts should be in the format
    we produce that has a __total__ at the top. We merge them into a single
    frequency list using the 'figure skating average' defined above.
    """
    freq_dicts = []
    for input_filename in input_filenames:
        freq_dict = defaultdict(float)
        with open(input_filename, encoding='utf-8') as infile:
            total = None
            for line in infile:
                word, strcount = line.rstrip().split('\t', 1)
                # Correct for earlier steps that might not have handled curly
                # apostrophes consistently
                word = uncurl_quotes(word).strip("' ")
                if word:
                    count = int(strcount)
                    if word == '__total__':
                        total = count
                    else:
                        freq = count / total
                        if freq < 1e-9:
                            break
                        freq_dict[word] += freq
        freq_dicts.append(freq_dict)

    merged_dict = merge_freqs(freq_dicts)
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        _write_frequency_file(merged_dict, outfile)
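The parsing above expects tab-separated "word<TAB>count" lines with a __total__ row first (merge_freqs and _write_frequency_file are helpers from the same module and are not shown here). A minimal sketch of the quote-cleanup step applied to each word, assuming only that ftfy is installed:

from ftfy.fixes import uncurl_quotes

line = "’tis\t42"                          # one word line from a count file
word, strcount = line.rstrip().split('\t', 1)
word = uncurl_quotes(word).strip("' ")     # curly apostrophe -> straight quote, then stripped
print(word, int(strcount))                 # tis 42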
Example #2
def count_tokenized(infile, outfile):
    """
    Take in a file that's been tokenized (such as with 'xc tokenize'), count
    its tokens, and write the ones with a count of at least 2.
    """
    counts = Counter()
    total = 0
    for line in infile:
        line = uncurl_quotes(line.rstrip())
        if line:
            toks = [
                t.strip("'") for t in line.split(' ')
                if not t.startswith('__') and t not in BAD_TOKENS
            ]
            counts.update(toks)
            total += len(toks)

    # adjusted_counts drops the items that only occurred once
    one_each = Counter(counts.keys())
    adjusted_counts = counts - one_each

    # Write the counted tokens to outfile
    print('__total__\t{}'.format(total), file=outfile)
    for token, adjcount in adjusted_counts.most_common():
        if TOKEN_RE.match(token):
            print('{}\t{}'.format(token, adjcount + 1), file=outfile)
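The singleton-dropping step works because Counter subtraction discards entries whose count falls to zero or below; the later `adjcount + 1` puts the subtracted occurrence back when writing. A minimal sketch:

from collections import Counter

counts = Counter({'hello': 3, 'world': 1})
one_each = Counter(counts.keys())   # one count per distinct token
adjusted = counts - one_each        # subtraction drops non-positive entries
print(adjusted)                     # Counter({'hello': 2}) -- 'world' only occurred once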
Example #3
    def _encode(self, texts, labels=None):
        """
        Convert a batch of raw text to a batch of byte-pair encoded token indices.
        """
        self._lazy_init()
        batch_tokens = []
        batch_token_idxs = []
        batch_label_idxs = []
        batch_character_locs = []
        label = None
        for i, text in enumerate(texts):
            if labels is not None:
                label = labels[i]
            raw_text = text.lower()
            # Only fine to apply this fix because it preserves character locations
            ftfy_text = uncurl_quotes(raw_text)
            tokens = NLP(_text_standardize(text))
            subtokens = []
            subtoken_idxs = []
            tok_pos = []
            token_start = 0

            for j, token in enumerate(tokens):
                bpe_toks = self.bpe(token.text).split(' ')

                try:
                    if token.text.strip():
                        token_start = ftfy_text.index((token.text.strip()), token_start)
                except ValueError:
                    warnings.warn("Failed to find token `{}` in text.".format(token.text))
                    continue

                subtokens.extend(bpe_toks)
                subtoken_idxs.extend([
                    self.encoder.get(SUBS.get(t, t), self.UNK_IDX)
                    for t in bpe_toks
                ])

                assert len("".join(bpe_toks).replace("</w>", "")) == len(token.text.replace(' ', ''))
                subtoken_positions = np.cumsum([len(tok.replace("</w>", '')) for tok in bpe_toks]) + token_start

                token_start += len(token.text.strip())

                tok_pos.extend(subtoken_positions)

            batch_tokens.append(subtokens)
            batch_token_idxs.append(subtoken_idxs)
            batch_character_locs.append(tok_pos)
            if labels is not None:
                batch_label_idxs.append([label] * len(subtoken_idxs))

        return EncodedOutput(
            token_ids=batch_token_idxs,
            tokens=batch_tokens,
            labels=batch_label_idxs,
            char_locs=batch_character_locs,
        )
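The comment above relies on uncurl_quotes being a one-to-one character substitution, so positions found with str.index on the uncurled text still line up with the raw text; a quick check:

from ftfy.fixes import uncurl_quotes

raw_text = 'she said “hi”'.lower()
ftfy_text = uncurl_quotes(raw_text)
assert len(ftfy_text) == len(raw_text)               # same length, character for character
print(ftfy_text.index('hi'), raw_text.index('hi'))   # 10 10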
Example #4
def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
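A hedged usage sketch of the function above, which ftfy exports as ftfy.fix_text_segment (the exact keyword arguments vary between ftfy releases):

from ftfy import fix_text_segment

# Curly quotes are straightened and the Latin ligature expanded by the
# default fixes in the signature above.
print(fix_text_segment('“ﬁxed” text'))
# "fixed" text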
Example #5
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
Example #6
def _text_standardize(text):
    """
    Fix some issues the spaCy tokenizer had on the books corpus,
    and standardize whitespace.
    """
    text = re.sub(
        r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""",
        r" \1 ",
        text,
    )
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return uncurl_quotes(text.strip().lower())
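With the definition above in scope, punctuation is padded with spaces, newlines are kept but padded, remaining whitespace runs collapse, and the result is lowercased with its quotes uncurled. A small usage illustration:

print(repr(_text_standardize('Hello,\n“World”')))
# 'hello , \n "world"'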
Example #7
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
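Unlike the newer versions above, this older signature defaults to NFKC normalization, which also folds compatibility characters; a stdlib-only sketch of the difference:

import unicodedata

s = 'ﬁle ①'
print(unicodedata.normalize('NFC', s))    # ﬁle ①   (canonical forms only; unchanged here)
print(unicodedata.normalize('NFKC', s))   # file 1  (compatibility characters folded)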
Example #8
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
Example #9
def clean_string(s):
    s = str(s)
    if isnull(s):
        return None
    elif re.search('[a-zA-Z]', s) is None:
        return None
    else:
        s = remove_bom(s)
        s = remove_control_chars(s)
        s = fix_encoding(s)
        s = fix_text(s)
        s = fix_partial_utf8_punct_in_1252(s)
        s = decode_escapes(s)
        s = fix_latin_ligatures(s)
        s = uncurl_quotes(s)
        s = s.replace("Äu0087", "ć")
        s = s.replace("Äu0090", "Đ")
        s = s.replace("Ãu0096", "Ö")
        s = s.replace("Åu008D", "ō")

        s = s.replace("\\", " ")
        s = s.replace("/", " ")
        s = s.replace("ö", "ö")

        p = re.compile(r"^\w+[A-Z]\w*$")
        if p.search(s):
            # From: https://stackoverflow.com/a/37697078 -- insert a space
            # before interior capitalized words to split camelCase runs
            s = re.sub(r'(?!^)([A-Z][a-z]+)', r' \1', s)

        new_string = ""
        p = False
        for letter in s:
            if letter in "([":
                p = True
            elif letter in ")]":
                p = False
                continue
            if not p:
                new_string += letter
        return new_string.strip()
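The loop at the end of clean_string drops anything inside parentheses or square brackets; a standalone sketch of just that step (strip_bracketed is a hypothetical name, not part of the original code):

def strip_bracketed(s):
    out, inside = "", False
    for ch in s:
        if ch in "([":
            inside = True
        elif ch in ")]":
            inside = False
            continue               # drop the closing bracket itself
        if not inside:
            out += ch
    return out.strip()

print(strip_bracketed('Some "Name" (noise) [more]'))   # Some "Name"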
Example #10
    def _encode(self, texts, labels=None):
        """
        Convert a batch of raw text to a batch of byte-pair encoded token indices.
        """

        self._lazy_init()
        batch_tokens = []
        batch_token_idxs = []
        batch_label_idxs = []
        # Character end offsets are tracked separately because some BPE pieces
        # have different lengths than their original tokens (e.g. special
        # characters such as bullets).
        batch_char_ends = []
        batch_char_starts = []
        label = None
        skipped = 0
        for i, text in enumerate(texts):
            if labels is not None:
                label = labels[i]

            raw_text = text.lower()
            
            # Only fine to apply this fix because it preserves character locations
            ftfy_text = uncurl_quotes(raw_text)
            tokens = NLP(_text_standardize(text))
            if not tokens:
                skipped += 1
                continue
            i -= skipped
            subtokens = []
            subtoken_idxs = []
            char_starts = []
            char_ends = []
            token_start = 0

            for j, token in enumerate(tokens):
                bpe_toks = self.bpe(token.text).split(" ")

                try:
                    if token.text.strip():
                        token_start = ftfy_text.index((token.text.strip()), token_start)
                except ValueError:
                    warnings.warn(
                        "Failed to find token `{}` in text.".format(token.text)
                    )
                    continue

                subtokens.extend(bpe_toks)
                subtoken_idxs.extend(
                    [self.encoder.get(SUBS.get(t, t), self.UNK_IDX) for t in bpe_toks]
                )

                assert len("".join(bpe_toks).replace("</w>", "")) == len(
                    token.text.replace(" ", "")
                )

                if np.sum([len(tok.replace("</w>", "")) for tok in bpe_toks]) > len(
                    token
                ):  # the BPEs comprising a token are longer than the token itself
                    token_char_ends = (
                        np.asarray([len(token.text.strip()) for tok in bpe_toks])
                        + token_start
                    )
                else:
                    token_char_ends = (
                        np.cumsum([len(tok.replace("</w>", "")) for tok in bpe_toks])
                        + token_start
                    )
                
                token_char_starts = [token_start] + token_char_ends[:-1].tolist()
                token_start += len(token.text.strip())
                char_ends.extend(token_char_ends)
                char_starts.extend(token_char_starts)

            batch_tokens.append(subtokens)
            batch_token_idxs.append(subtoken_idxs)
            batch_char_ends.append(char_ends)
            batch_char_starts.append(char_starts)
            if labels is not None:
                batch_label_idxs.append([label] * len(subtoken_idxs))

        return EncodedOutput(
            token_ids=batch_token_idxs,
            tokens=batch_tokens,
            labels=batch_label_idxs,
            char_locs=batch_char_ends,
            char_starts=batch_char_starts,
        )
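A small numpy-only sketch of the offset bookkeeping above: the character end of each BPE piece of a token is a running sum of piece lengths (with the '</w>' marker removed) added to the token's start offset, and each piece starts where the previous one ends.

import numpy as np

bpe_toks = ['hel', 'lo</w>']         # BPE pieces of one token
token_start = 6                      # where that token starts in the text
char_ends = np.cumsum([len(t.replace('</w>', '')) for t in bpe_toks]) + token_start
char_starts = [token_start] + char_ends[:-1].tolist()
print(char_starts, char_ends.tolist())   # [6, 9] [9, 11]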
Example #11
from ftfy import fix_text, explain_unicode

print(fix_text('ünicode'))

print(fix_text('&lt;3'))

print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))

len(fix_text(''))

explain_unicode('ノ( º _ ºノ) 테스트')

from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes

print(fix_encoding('â\x81”.'))

print(unescape_html('&lt;hr&gt;'))

print(uncurl_quotes('\u201ctest\u201d'))

print(fix_line_breaks("1. hello\u2028" "2. world"))

factoid = '\\u20a2'
print(decode_escapes(factoid))

from ftfy.formatting import character_width, display_center

print(character_width('A'))
print(character_width('가'))

lines = ['Display center', 'center']
for line in lines:
    print(display_center(line, 20, '▒'))