def count_files_to_freqs(input_filenames, output_filename):
    """
    Take in multiple files of word counts by their filename, and produce a
    frequency list in the named output file. The counts should be in the
    format we produce that has a __total__ at the top. We merge them into a
    single frequency list using the 'figure skating average' defined above.
    """
    freq_dicts = []
    for input_filename in input_filenames:
        freq_dict = defaultdict(float)
        with open(input_filename, encoding='utf-8') as infile:
            total = None
            for line in infile:
                word, strcount = line.rstrip().split('\t', 1)
                # Correct for earlier steps that might not have handled curly
                # apostrophes consistently
                word = uncurl_quotes(word).strip("' ")
                if word:
                    count = int(strcount)
                    if word == '__total__':
                        total = count
                    else:
                        freq = count / total
                        if freq < 1e-9:
                            break
                        freq_dict[word] += freq
        freq_dicts.append(freq_dict)

    merged_dict = merge_freqs(freq_dicts)
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        _write_frequency_file(merged_dict, outfile)
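# A minimal usage sketch for count_files_to_freqs (the file names here are
# hypothetical). Each input is tab-separated "word<TAB>count" lines with a
# "__total__" count at the top, such as the output of count_tokenized below.
# Reading a file stops once a word's relative frequency drops below 1e-9,
# which assumes counts are sorted in decreasing order; the per-file frequency
# dictionaries are then merged by merge_freqs (the 'figure skating average'
# referenced in the docstring, defined elsewhere in the module).
count_files_to_freqs(
    ['counts_twitter.txt', 'counts_wikipedia.txt'],   # hypothetical inputs
    'freqs_merged.txt',                               # hypothetical output
)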
def count_tokenized(infile, outfile):
    """
    Take in a file that's been tokenized (such as with 'xc tokenize'), count
    its tokens, and write the ones with a count of at least 2.
    """
    counts = Counter()
    total = 0
    for line in infile:
        line = uncurl_quotes(line.rstrip())
        if line:
            toks = [
                t.strip("'") for t in line.split(' ')
                if not t.startswith('__') and t not in BAD_TOKENS
            ]
            counts.update(toks)
            total += len(toks)

    # adjusted_counts drops the items that only occurred once
    one_each = Counter(counts.keys())
    adjusted_counts = counts - one_each

    # Write the counted tokens to outfile
    print('__total__\t{}'.format(total), file=outfile)
    for token, adjcount in adjusted_counts.most_common():
        if TOKEN_RE.match(token):
            print('{}\t{}'.format(token, adjcount + 1), file=outfile)
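# A small sketch of driving count_tokenized with in-memory file objects.
# TOKEN_RE and BAD_TOKENS are module-level globals in the real code; the
# stand-ins below are assumptions for illustration only, and Counter and
# uncurl_quotes are assumed to be imported at module level as above.
import io
import re

TOKEN_RE = re.compile(r"\w+", re.UNICODE)   # hypothetical stand-in
BAD_TOKENS = set()                          # hypothetical stand-in

infile = io.StringIO("the cat sat\nthe cat\nthe\n")
outfile = io.StringIO()
count_tokenized(infile, outfile)
print(outfile.getvalue())
# __total__	6
# the	3
# cat	2
# ("sat" is dropped because it only occurred once)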
def _encode(self, texts, labels=None):
    """
    Convert a batch of raw text to a batch of byte-pair encoded token indices.
    """
    self._lazy_init()
    batch_tokens = []
    batch_token_idxs = []
    batch_label_idxs = []
    batch_character_locs = []
    label = None
    for i, text in enumerate(texts):
        if labels is not None:
            label = labels[i]
        raw_text = text.lower()
        # Only fine to apply this fix because it preserves character locations
        ftfy_text = uncurl_quotes(raw_text)
        tokens = NLP(_text_standardize(text))
        subtokens = []
        subtoken_idxs = []
        tok_pos = []
        token_start = 0
        for j, token in enumerate(tokens):
            bpe_toks = self.bpe(token.text).split(' ')
            try:
                if token.text.strip():
                    token_start = ftfy_text.index(token.text.strip(), token_start)
            except ValueError:
                warnings.warn("Failed to find token `{}` in text.".format(token.text))
                continue
            subtokens.extend(bpe_toks)
            subtoken_idxs.extend([
                self.encoder.get(SUBS.get(t, t), self.UNK_IDX)
                for t in bpe_toks
            ])
            assert len("".join(bpe_toks).replace("</w>", "")) == len(token.text.replace(' ', ''))
            subtoken_positions = np.cumsum([len(tok.replace("</w>", '')) for tok in bpe_toks]) + token_start
            token_start += len(token.text.strip())
            tok_pos.extend(subtoken_positions)
        batch_tokens.append(subtokens)
        batch_token_idxs.append(subtoken_idxs)
        batch_character_locs.append(tok_pos)
        if labels is not None:
            batch_label_idxs.append([label] * len(subtoken_idxs))
    return EncodedOutput(
        token_ids=batch_token_idxs,
        tokens=batch_tokens,
        labels=batch_label_idxs,
        char_locs=batch_character_locs,
    )
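# Why uncurl_quotes is safe to apply before computing character offsets: it
# swaps each curly quote for a straight one character-for-character, so string
# length and the positions found by str.index are unchanged. A quick,
# self-contained check (independent of the _encode method above):
from ftfy.fixes import uncurl_quotes

raw = "it\u2019s a \u201ctest\u201d"
fixed = uncurl_quotes(raw)
assert fixed == "it's a \"test\""
assert len(fixed) == len(raw)
assert fixed.index("test") == raw.index("test")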
def fix_text_segment(text,
                     *,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of text
    that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
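# Usage sketch for fix_text_segment. The keyword-only signature above matches
# newer ftfy releases; the packaged function can be imported directly, and
# each fix can be switched off independently:
from ftfy import fix_text_segment

print(fix_text_segment('\u201ccurly quotes\u201d'))                        # "curly quotes"
print(fix_text_segment('\u201ccurly quotes\u201d', uncurl_quotes=False))   # “curly quotes”
print(fix_text_segment('ﬁle'))                                             # file (ligature fixed)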
def _text_standardize(text):
    """
    Fixes some issues the spacy tokenizer had on books corpus.
    Also handles whitespace standardization.
    """
    # Raw strings so the backslash escapes reach the regex engine intact
    text = re.sub(
        r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""",
        r" \1 ",
        text,
    )
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return uncurl_quotes(text.strip().lower())
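# A quick illustration of _text_standardize, assuming re and uncurl_quotes are
# imported as in the surrounding module: punctuation runs get padded with
# spaces, newlines are isolated, other whitespace collapses to single spaces,
# and the result is lower-cased with its quotes uncurled.
out = _text_standardize('Hello -- \u201cWorld!\u201d\nBye')
assert out == 'hello -- "world ! " \n bye'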
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of text
    that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
def clean_string(s):
    s = str(s)
    if isnull(s):
        return None
    elif re.search('[a-zA-Z]', s) is None:
        return None
    else:
        s = remove_bom(s)
        s = remove_control_chars(s)
        s = fix_encoding(s)
        s = fix_text(s)
        s = fix_partial_utf8_punct_in_1252(s)
        s = decode_escapes(s)
        s = fix_latin_ligatures(s)
        s = uncurl_quotes(s)
        # Repair a few specific mojibake sequences (UTF-8 bytes read as cp1252)
        s = s.replace("Ä\u0087", "ć")
        s = s.replace("Ä\u0090", "Đ")
        s = s.replace("Ã\u0096", "Ö")
        s = s.replace("Å\u008D", "ō")
        s = s.replace("\\", " ")
        s = s.replace("/", " ")
        s = s.replace("ö", "ö")
        p = re.compile(r"^\w+[A-Z]\w*$")
        if p.search(s):
            # Split CamelCase by inserting spaces before capitalized words
            # From: https://stackoverflow.com/a/37697078
            s = re.sub('(?!^)([A-Z][a-z]+)', r' \1', s)
        # Drop anything inside parentheses or square brackets
        new_string = ""
        p = False
        for letter in s:
            if letter in "([":
                p = True
            elif letter in ")]":
                p = False
                continue
            if not p:
                new_string += letter
        return new_string.strip()
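# A rough sketch of clean_string in action, assuming the module imports isnull
# from pandas and the fix_*/remove_*/decode_* helpers from ftfy.fixes (as the
# calls above imply). Bracketed text is dropped and CamelCase is split by the
# space-inserting substitution:
print(clean_string('Some \u201cBand\u201d (live)'))   # Some "Band"
print(clean_string('someBandName'))                   # some Band Name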
def _encode(self, texts, labels=None):
    """
    Convert a batch of raw text to a batch of byte-pair encoded token indices.
    """
    self._lazy_init()
    batch_tokens = []
    batch_token_idxs = []
    batch_label_idxs = []
    # Track character offsets separately to account for the fact that some
    # BPEs have different lengths than their original tokens (e.g. special
    # characters such as bullets)
    batch_char_ends = []
    batch_char_starts = []
    label = None
    skipped = 0
    for i, text in enumerate(texts):
        if labels is not None:
            label = labels[i]
        raw_text = text.lower()
        # Only fine to apply this fix because it preserves character locations
        ftfy_text = uncurl_quotes(raw_text)
        tokens = NLP(_text_standardize(text))
        if not tokens:
            skipped += 1
            continue
        i -= skipped
        subtokens = []
        subtoken_idxs = []
        char_starts = []
        char_ends = []
        token_start = 0
        for j, token in enumerate(tokens):
            bpe_toks = self.bpe(token.text).split(" ")
            try:
                if token.text.strip():
                    token_start = ftfy_text.index(token.text.strip(), token_start)
            except ValueError:
                warnings.warn(
                    "Failed to find token `{}` in text.".format(token.text)
                )
                continue
            subtokens.extend(bpe_toks)
            subtoken_idxs.extend(
                [self.encoder.get(SUBS.get(t, t), self.UNK_IDX) for t in bpe_toks]
            )
            assert len("".join(bpe_toks).replace("</w>", "")) == len(
                token.text.replace(" ", "")
            )
            if np.sum([len(tok.replace("</w>", "")) for tok in bpe_toks]) > len(token):
                # the BPEs comprising a token are longer than the token itself
                token_char_ends = (
                    np.asarray([len(token.text.strip()) for tok in bpe_toks])
                    + token_start
                )
            else:
                token_char_ends = (
                    np.cumsum([len(tok.replace("</w>", "")) for tok in bpe_toks])
                    + token_start
                )
            token_char_starts = [token_start] + token_char_ends[:-1].tolist()
            token_start += len(token.text.strip())
            char_ends.extend(token_char_ends)
            char_starts.extend(token_char_starts)
        batch_tokens.append(subtokens)
        batch_token_idxs.append(subtoken_idxs)
        batch_char_ends.append(char_ends)
        batch_char_starts.append(char_starts)
        if labels is not None:
            batch_label_idxs.append([label] * len(subtoken_idxs))
    return EncodedOutput(
        token_ids=batch_token_idxs,
        tokens=batch_tokens,
        labels=batch_label_idxs,
        char_locs=batch_char_ends,
        char_starts=batch_char_starts,
    )
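# How the character offsets are derived for a single token, shown outside the
# class above with made-up BPE pieces: cumulative sub-token lengths (with the
# "</w>" end-of-word marker stripped) are shifted by the token's start
# position to get end offsets, and each piece starts where the previous ended.
import numpy as np

bpe_toks = ["un", "believ", "able</w>"]
token_start = 10
char_ends = np.cumsum([len(t.replace("</w>", "")) for t in bpe_toks]) + token_start
char_starts = [token_start] + char_ends[:-1].tolist()
print(char_ends.tolist())   # [12, 18, 22]
print(char_starts)          # [10, 12, 18]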
from ftfy import fix_text, explain_unicode

print(fix_text('ünicode'))
print(fix_text('&lt;3'))
print(fix_text("¯\\_(ã\x83\x84)_/¯"))
print(len(fix_text('')))

explain_unicode('ノ( º _ ºノ) 테스트')

from ftfy.fixes import (
    fix_encoding,
    unescape_html,
    uncurl_quotes,
    fix_line_breaks,
    decode_escapes,
)

print(fix_encoding('â\x81”.'))
print(unescape_html('&lt;hr&gt;'))
print(uncurl_quotes('\u201ctest\u201d'))
print(fix_line_breaks("1. hello\u2028" "2. world"))

factoid = '\\u20a2'
print(decode_escapes(factoid))

from ftfy.formatting import character_width, display_center

print(character_width('A'))
print(character_width('가'))

lines = ['Display center', 'center']
for line in lines:
    print(display_center(line, 20, '▒'))