def get_char_counts(corpus_reader: CorpusReader) -> Dict[str, int]:
    """
    Count how often each character occurs in the alphabetic words of a corpus.

    Only words for which ``isalpha()`` is true contribute to the counts;
    every other token is skipped entirely.

    :param corpus_reader: reader providing ``fileids()`` and ``words(fileid)``
    :return: a Counter mapping each character to its number of occurrences
    """
    tally = Counter()  # type: Dict[str, int]
    fileids = corpus_reader.fileids()
    for fileid in tqdm(fileids, total=len(fileids), unit="files"):
        for token in corpus_reader.words(fileid):
            if not token.isalpha():
                continue
            # Feeding the word itself to the Counter adds one per character.
            tally.update(token)
    return tally
def get_word_lengths(corpus_reader: CorpusReader,
                     max_word_length: int = 100) -> Dict[int, int]:
    """
    Build the distribution of word lengths found in a corpus.

    A word is counted only when it is purely alphabetic (``isalpha()``) and
    its length does not exceed ``max_word_length``.

    :param corpus_reader: reader providing ``fileids()`` and ``words(fileid)``
    :param max_word_length: longest word length (inclusive) that is counted
    :return: a Counter mapping word length to the number of words of that length
    """
    length_tally = Counter()  # type: Dict[int, int]
    fileids = corpus_reader.fileids()
    for fileid in tqdm(fileids, total=len(fileids), unit='files'):
        for token in corpus_reader.words(fileid):
            size = len(token)
            if not token.isalpha() or size > max_word_length:
                continue
            length_tally[size] += 1
    return length_tally
def assemble_corpus(
    corpus_reader: CorpusReader,
    types_requested: List[str],
    type_dirs: Dict[str, List[str]] = None,
    type_files: Dict[str, List[str]] = None
) -> Tuple[CorpusReader, List[str], Set[str]]:
    """
    Create a filtered corpus.

    The reader's ``_fileids`` attribute is overwritten with the file ids that
    belong to one of the requested types, sorted by file id. A file id matches
    when it is listed under a requested type in ``type_files`` or when it
    starts with one of the directories listed under a requested type in
    ``type_dirs``. If nothing matches, the reader is narrowed to an empty file
    list and a warning is logged.

    If building the corpus raises, the exception is logged and the reader is
    returned unmodified together with an empty list and an empty set.

    :param corpus_reader: the reader to filter; it is mutated in place
    :param types_requested: a list of string types, which are to be found in
        the type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories
    :param type_files: a dict of corpus types to files
    :return: a tuple of (the mutated CorpusReader, the list of matching file
        ids, the set of types that contributed at least one file)
    """
    fileid_names = []  # type: List[str]
    categories_found = set()  # type: Set[str]
    try:
        all_file_ids = list(corpus_reader.fileids())
        known_ids = set(all_file_ids)  # O(1) membership for explicit files
        clean_ids_types = []  # type: List[Tuple[str, str]]
        if type_files:
            for key, valuelist in type_files.items():
                # Empty type names are ignored for explicit file mappings.
                if key and key in types_requested:
                    clean_ids_types.extend(
                        (value, key) for value in valuelist
                        if value in known_ids)
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    # Normalise './dir' style entries to a 'dir/' prefix.
                    corrected_dir = '{}/'.format(value.replace('./', ''))
                    clean_ids_types.extend(
                        (name, key) for name in all_file_ids
                        if name and name.startswith(corrected_dir))
        clean_ids_types.sort(key=lambda x: x[0])
        if not clean_ids_types:
            # zip(*[]) cannot be unpacked, so the empty case is handled
            # explicitly instead of surfacing as a generic failure.
            LOG.warning('no corpus files matched the requested types: %s',
                        types_requested)
        matched_names = [name for name, _ in clean_ids_types]
        matched_categories = {category for _, category in clean_ids_types}
        # The reader gets its own copy so callers mutating the returned list
        # do not silently change the reader.
        corpus_reader._fileids = list(matched_names)
        fileid_names = matched_names
        categories_found = matched_categories
    except Exception:
        LOG.exception('failure in corpus building')
    return (corpus_reader, fileid_names, categories_found)
def get_samples_for_lengths(corpus_reader: CorpusReader,
                            num_samples: int = 5) -> Dict[int, List[str]]:
    """
    Get a number of sample words for each word length; good for sanity
    checking.

    For every word length, the first ``num_samples`` alphabetic words of that
    length (in corpus order) are kept. A length that occurs in the corpus
    always gets an entry, even when ``num_samples`` is zero or negative, in
    which case its list is empty.

    :param corpus_reader: reader providing ``fileids()`` and ``words(fileid)``
    :param num_samples: maximum number of sample words kept per word length
    :return: a defaultdict mapping word length to its list of sample words
    """
    samples_lengths = defaultdict(list)  # type: Dict[int, List[str]]
    fileids = corpus_reader.fileids()
    for fileid in tqdm(fileids, total=len(fileids), unit="files"):
        for word in corpus_reader.words(fileid):
            if not word.isalpha():
                continue
            # Looking the bucket up registers the length even if it is full.
            bucket = samples_lengths[len(word)]
            # Append only while there is room instead of appending and then
            # re-slicing a fresh list for every single word.
            if len(bucket) < num_samples:
                bucket.append(word)
    return samples_lengths
def fileids(self, channels=None, domains=None, categories=None):
    """
    Return file ids, optionally restricted by exactly one filter.

    With no filter the result of ``CorpusReader.fileids(self)`` is returned.
    Otherwise the single supplied filter is forwarded to
    ``self._list_morph_files_by`` with the tag ``'channel'``, ``'domain'`` or
    ``'keyTerm'`` (categories additionally pass ``map=self._map_category``).
    A filter given as a single string is wrapped in a one-element list.

    :param channels: a channel name or a list of channel names
    :param domains: a domain name or a list of domain names
    :param categories: a category name or a list of category names
    :return: whatever the delegated lookup returns
    :raises ValueError: if more than one of the filters is supplied
    """
    # The filters are mutually exclusive: reject any combination of two or
    # more, not only the case where all three are given.
    supplied = sum(arg is not None for arg in (channels, domains, categories))
    if supplied > 1:
        raise ValueError('You can specify only one of channels, domains '
                         'and categories parameter at once')
    if supplied == 0:
        return CorpusReader.fileids(self)
    if isinstance(channels, string_types):
        channels = [channels]
    if isinstance(domains, string_types):
        domains = [domains]
    if isinstance(categories, string_types):
        categories = [categories]
    # Dispatch on "was supplied" rather than truthiness so that an empty
    # list is looked up under its own tag instead of falling through to the
    # categories branch with categories=None.
    if channels is not None:
        return self._list_morph_files_by('channel', channels)
    if domains is not None:
        return self._list_morph_files_by('domain', domains)
    return self._list_morph_files_by('keyTerm', categories,
                                     map=self._map_category)
def fileids(self, channels=None, domains=None, categories=None):
    """
    Return file ids, optionally restricted by exactly one filter.

    With no filter the result of ``CorpusReader.fileids(self)`` is returned.
    Otherwise the single supplied filter is forwarded to
    ``self._list_morph_files_by`` with the tag ``"channel"``, ``"domain"`` or
    ``"keyTerm"`` (categories additionally pass ``map=self._map_category``).
    A filter given as a single string is wrapped in a one-element list.

    :param channels: a channel name or a list of channel names
    :param domains: a domain name or a list of domain names
    :param categories: a category name or a list of category names
    :return: whatever the delegated lookup returns
    :raises ValueError: if more than one of the filters is supplied
    """
    # The filters are mutually exclusive: reject any combination of two or
    # more, not only the case where all three are given.
    supplied = sum(arg is not None for arg in (channels, domains, categories))
    if supplied > 1:
        raise ValueError("You can specify only one of channels, domains "
                         "and categories parameter at once")
    if supplied == 0:
        return CorpusReader.fileids(self)
    if isinstance(channels, str):
        channels = [channels]
    if isinstance(domains, str):
        domains = [domains]
    if isinstance(categories, str):
        categories = [categories]
    # Dispatch on "was supplied" rather than truthiness so that an empty
    # list is looked up under its own tag instead of falling through to the
    # categories branch with categories=None.
    if channels is not None:
        return self._list_morph_files_by("channel", channels)
    if domains is not None:
        return self._list_morph_files_by("domain", domains)
    return self._list_morph_files_by("keyTerm", categories,
                                     map=self._map_category)
def assemble_corpus(
    corpus_reader: CorpusReader,
    types_requested: List[str],
    type_dirs: Dict[str, List[str]] = None,
    type_files: Dict[str, List[str]] = None,
) -> CorpusReader:
    """
    Create a filtered corpus.

    The reader's ``_fileids`` attribute is overwritten with the file ids that
    belong to one of the requested types, sorted by file id. A file id matches
    when it is listed under a requested type in ``type_files`` or when it
    starts with one of the directories listed under a requested type in
    ``type_dirs``. If nothing matches, the reader is narrowed to an empty file
    list and a warning is logged.

    The reader is always returned. If building the corpus raises, the
    exception is logged and the reader is returned unmodified, rather than
    the function implicitly returning ``None``.

    :param corpus_reader: the reader to filter; it is mutated in place
    :param types_requested: a list of string types, which are to be found in
        the type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories
    :param type_files: a dict of corpus types to files
    :return: the same CorpusReader object, narrowed to the matching file ids
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        known_ids = set(all_file_ids)  # O(1) membership for explicit files
        matched_ids = []  # type: List[str]
        if type_files:
            for key, valuelist in type_files.items():
                # Empty type names are ignored for explicit file mappings.
                if key and key in types_requested:
                    matched_ids.extend(
                        value for value in valuelist if value in known_ids
                    )
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    # Normalise "./dir" style entries to a "dir/" prefix.
                    corrected_dir = "{}/".format(value.replace("./", ""))
                    matched_ids.extend(
                        name
                        for name in all_file_ids
                        if name and name.startswith(corrected_dir)
                    )
        matched_ids.sort()
        if not matched_ids:
            # Previously this case failed while unpacking zip(*[]) and made
            # the function return None.
            LOG.warning(
                "no corpus files matched the requested types: %s", types_requested
            )
        corpus_reader._fileids = matched_ids
    except Exception:
        LOG.exception("failure in corpus building")
    return corpus_reader
def get_split_words(corpus_reader: CorpusReader,
                    word_trie: WordTrie,
                    max_word_length: int = 15) -> Dict[str, List[str]]:
    """
    Search a corpus for improperly joined words using a word trie.

    A word is examined when it is longer than ``max_word_length`` and the trie
    does not contain it. If ``word_trie.extract_word_pair`` yields exactly two
    items for such a word, both are appended to the list for the current file.
    Each file's value is therefore a flat list in which every detected pair
    occupies two consecutive positions.

    :param corpus_reader: reader providing ``fileids()`` and ``words(fileid)``
    :param word_trie: trie providing ``has_word`` and ``extract_word_pair``
    :param max_word_length: words of this length or shorter are never examined
    :return: a defaultdict mapping file id to the flat list of split words
    """
    joined_by_file = defaultdict(list)  # type: Dict[str, List[str]]
    fileids = corpus_reader.fileids()
    for fileid in tqdm(fileids, total=len(fileids), unit="files"):
        for token in corpus_reader.words(fileid):
            if len(token) <= max_word_length or word_trie.has_word(token):
                continue
            parts = word_trie.extract_word_pair(token)
            if len(parts) == 2:
                joined_by_file[fileid].extend(parts)
    return joined_by_file
def fileids(self, channels=None, domains=None, categories=None):
    """
    Return file ids, optionally restricted by exactly one filter.

    With no filter the result of ``CorpusReader.fileids(self)`` is returned.
    Otherwise the single supplied filter is forwarded to
    ``self._list_morph_files_by`` with the tag ``'channel'``, ``'domain'`` or
    ``'keyTerm'`` (categories additionally pass ``map=self._map_category``).
    A filter given as a single string is wrapped in a one-element list.

    :param channels: a channel name or a list of channel names
    :param domains: a domain name or a list of domain names
    :param categories: a category name or a list of category names
    :return: whatever the delegated lookup returns
    :raises ValueError: if more than one of the filters is supplied
    """
    # The filters are mutually exclusive: reject any combination of two or
    # more, not only the case where all three are given.
    supplied = sum(arg is not None for arg in (channels, domains, categories))
    if supplied > 1:
        raise ValueError('You can specify only one of channels, domains '
                         'and categories parameter at once')
    if supplied == 0:
        return CorpusReader.fileids(self)
    if isinstance(channels, basestring):
        channels = [channels]
    if isinstance(domains, basestring):
        domains = [domains]
    if isinstance(categories, basestring):
        categories = [categories]
    # Dispatch on "was supplied" rather than truthiness so that an empty
    # list is looked up under its own tag instead of falling through to the
    # categories branch with categories=None.
    if channels is not None:
        return self._list_morph_files_by('channel', channels)
    if domains is not None:
        return self._list_morph_files_by('domain', domains)
    return self._list_morph_files_by('keyTerm', categories,
                                     map=self._map_category)
def assemble_corpus(corpus_reader: CorpusReader,
                    types_requested: List[str],
                    type_dirs: Dict[str, List[str]] = None,
                    type_files: Dict[str, List[str]] = None) -> CorpusReader:
    """
    Create a filtered corpus.

    The reader's ``_fileids`` attribute is overwritten with the file ids that
    belong to one of the requested types, sorted by file id. A file id matches
    when it is listed under a requested type in ``type_files`` or when it
    starts with one of the directories listed under a requested type in
    ``type_dirs``. If nothing matches, the reader is narrowed to an empty file
    list and a warning is logged.

    The reader is always returned. If building the corpus raises, the
    exception is logged and the reader is returned unmodified, rather than
    the function implicitly returning ``None``.

    :param corpus_reader: the reader to filter; it is mutated in place
    :param types_requested: a list of string types, which are to be found in
        the type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories
    :param type_files: a dict of corpus types to files
    :return: the same CorpusReader object, narrowed to the matching file ids
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        known_ids = set(all_file_ids)  # O(1) membership for explicit files
        matched_ids = []  # type: List[str]
        if type_files:
            for key, valuelist in type_files.items():
                # Empty type names are ignored for explicit file mappings.
                if key and key in types_requested:
                    matched_ids.extend(
                        value for value in valuelist if value in known_ids)
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    # Normalise './dir' style entries to a 'dir/' prefix.
                    corrected_dir = '{}/'.format(value.replace('./', ''))
                    matched_ids.extend(
                        name for name in all_file_ids
                        if name and name.startswith(corrected_dir))
        matched_ids.sort()
        if not matched_ids:
            # Previously this case failed while unpacking zip(*[]) and made
            # the function return None.
            LOG.warning('no corpus files matched the requested types: %s',
                        types_requested)
        corpus_reader._fileids = matched_ids
    except Exception:
        LOG.exception('failure in corpus building')
    return corpus_reader