def get_char_counts(corpus_reader: CorpusReader) -> Dict[str, int]:
    """
    Get a frequency distribution of characters in a corpus.

    Only words for which ``isalpha()`` is true contribute; every character
    of such a word is counted once per occurrence.

    :param corpus_reader: reader exposing ``fileids()`` and ``words(fileid)``
    :return: a ``Counter`` mapping each character to its occurrence count
    """
    char_counter = Counter()  # type: Dict[str, int]
    files = corpus_reader.fileids()
    for file in tqdm(files, total=len(files), unit="files"):
        for word in corpus_reader.words(file):
            if word.isalpha():
                # Counter.update counts every element of an iterable, so the
                # whole word is tallied in one call rather than building a
                # one-entry dict per character.
                char_counter.update(word)
    return char_counter
def get_word_lengths(corpus_reader: CorpusReader,
                     max_word_length: int = 100) -> Dict[int, int]:
    """
    Get the word length/frequency distribution of a corpus.

    Only words for which ``isalpha()`` is true and whose length does not
    exceed ``max_word_length`` are counted.

    :param corpus_reader: reader exposing ``fileids()`` and ``words(fileid)``
    :param max_word_length: longest word length (inclusive) to include
    :return: a ``Counter`` mapping word length to the number of words seen
        with that length
    """
    word_lengths = Counter()  # type: Dict[int, int]
    files = corpus_reader.fileids()
    for file in tqdm(files, total=len(files), unit='files'):
        for word in corpus_reader.words(file):
            if not word.isalpha():
                continue
            word_length = len(word)
            if word_length <= max_word_length:
                # Missing keys read as 0 on a Counter, so a plain increment
                # is enough; no temporary dict is needed.
                word_lengths[word_length] += 1
    return word_lengths
def get_samples_for_lengths(corpus_reader: CorpusReader,
                            num_samples: int = 5) -> Dict[int, List[str]]:
    """
    Get a number of sample words for each word length; good for sanity checking.

    For every word length, the first ``num_samples`` alphabetic words of that
    length are kept, in the order they are encountered (duplicates are not
    removed).

    :param corpus_reader: reader exposing ``fileids()`` and ``words(fileid)``
    :param num_samples: maximum number of sample words kept per word length
    :return: a ``defaultdict`` mapping word length to its list of sample words
    """
    samples_lengths = defaultdict(list)  # type: Dict[int, List[str]]
    files = corpus_reader.fileids()
    for file in tqdm(files, total=len(files), unit="files"):
        for word in corpus_reader.words(file):
            if word.isalpha():
                # Looking the bucket up first creates the key for every
                # length seen, even when nothing is stored (num_samples <= 0).
                samples = samples_lengths[len(word)]
                # Stop appending once the bucket is full instead of appending
                # and re-slicing a fresh list for every word in the corpus.
                if len(samples) < num_samples:
                    samples.append(word)
    return samples_lengths
def get_split_words(corpus_reader: CorpusReader,
                    word_trie: WordTrie,
                    max_word_length: int = 15) -> Dict[str, List[str]]:
    """
    Search a corpus for improperly joined words, as judged by a discrete trie model.

    A word is examined only when it is longer than ``max_word_length`` and
    ``word_trie.has_word`` does not recognise it. Such a word is handed to
    ``word_trie.extract_word_pair``; when exactly two items come back they
    are recorded for the current file.

    :param corpus_reader: reader exposing ``fileids()`` and ``words(fileid)``
    :param word_trie: trie model providing ``has_word`` and ``extract_word_pair``
    :param max_word_length: words of this length or shorter are never examined
    :return: a ``defaultdict`` keyed by file id; each value is a flat list in
        which the two parts of every split word appear consecutively
    """
    pairs_by_file = defaultdict(list)  # type: Dict[str, List[str]]
    file_ids = corpus_reader.fileids()
    for file_id in tqdm(file_ids, total=len(file_ids), unit="files"):
        for token in corpus_reader.words(file_id):
            # Short words and words the trie already knows are not candidates.
            if len(token) <= max_word_length or word_trie.has_word(token):
                continue
            parts = word_trie.extract_word_pair(token)
            if len(parts) != 2:
                continue
            pairs_by_file[file_id].extend(parts)
    return pairs_by_file