def assemble_corpus(
    corpus_reader: CorpusReader,
    types_requested: List[str],
    type_dirs: Dict[str, List[str]] = None,
    type_files: Dict[str, List[str]] = None
) -> Tuple[CorpusReader, List[str], Set[str]]:
    """
    Create a filtered corpus.

    :param corpus_reader: the reader to filter; it is mutated in place (its ``_fileids``
        attribute is replaced with the matching file ids)
    :param types_requested: a list of string types, which are to be found in the
        type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories; a leading ``./`` and a
        trailing ``/`` on a directory are ignored
    :param type_files: a dict of corpus types to files
    :return: a Tuple(the same CorpusReader object, restricted to the matching files;
        fileid_names: a sorted list of file ids of the matching corpus files;
        categories_found: the set of requested types that matched at least one file).
        When nothing matches or an error occurs, the reader is left unchanged and the
        list and set are empty.
    """
    fileid_names = []  # type: List[str]
    categories_found = set()  # type: Set[str]
    try:
        all_file_ids = list(corpus_reader.fileids())
        # Set for O(1) membership tests; the list keeps the reader's original order.
        known_ids = set(all_file_ids)
        clean_ids_types = []  # type: List[Tuple[str, str]]
        if type_files:
            for key, valuelist in type_files.items():
                # Explicit files are never recorded under an empty type name.
                if not key or key not in types_requested:
                    continue
                for value in valuelist:
                    if value in known_ids:
                        clean_ids_types.append((value, key))
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    # Normalise './dir' and './dir/' to 'dir/' so that a trailing
                    # slash does not turn into 'dir//' and match nothing. The final
                    # '/' keeps 'dir' from matching a sibling such as 'dir2/...'.
                    corrected_dir = '{}/'.format(value.replace('./', '').rstrip('/'))
                    for name in all_file_ids:
                        if name and name.startswith(corrected_dir):
                            clean_ids_types.append((name, key))
        if clean_ids_types:
            # NOTE(review): a file matched by several types/mappings is listed once
            # per match, as before - confirm whether duplicates are intended.
            clean_ids_types.sort(key=lambda x: x[0])
            fileid_names = [name for name, _ in clean_ids_types]
            categories_found = {category for _, category in clean_ids_types}
            # The reader gets its own list so callers may mutate the returned one.
            corpus_reader._fileids = list(fileid_names)
        else:
            # Previously this case surfaced as a ValueError from unpacking zip(*[]).
            LOG.error('failure in corpus building: no files matched types %s',
                      types_requested)
    except Exception:
        LOG.exception('failure in corpus building')
    return (corpus_reader, fileid_names, categories_found)
def assemble_corpus(
    corpus_reader: CorpusReader,
    types_requested: List[str],
    type_dirs: Dict[str, List[str]] = None,
    type_files: Dict[str, List[str]] = None,
) -> CorpusReader:
    """
    Create a filtered corpus.

    :param corpus_reader: the reader to filter; it is mutated in place (its ``_fileids``
        attribute is replaced with the sorted matching file ids)
    :param types_requested: a list of string types, which are to be found in the
        type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories; a leading ``./`` and a
        trailing ``/`` on a directory are ignored
    :param type_files: a dict of corpus types to files
    :return: the same CorpusReader object containing only the mappings desired, or
        ``None`` when nothing matched or an error occurred (the failure is logged)
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        # Set for O(1) membership tests; the list keeps the reader's original order.
        known_ids = set(all_file_ids)
        fileid_names = []  # type: List[str]
        if type_files:
            for key, valuelist in type_files.items():
                # Explicit files are never recorded under an empty type name.
                if not key or key not in types_requested:
                    continue
                fileid_names.extend(value for value in valuelist if value in known_ids)
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    # Normalise "./dir" and "./dir/" to "dir/" so that a trailing
                    # slash does not turn into "dir//" and match nothing. The final
                    # "/" keeps "dir" from matching a sibling such as "dir2/...".
                    corrected_dir = "{}/".format(value.replace("./", "").rstrip("/"))
                    fileid_names.extend(
                        name
                        for name in all_file_ids
                        if name and name.startswith(corrected_dir)
                    )
        if not fileid_names:
            # Previously this case surfaced as a ValueError from unpacking zip(*[]).
            LOG.error(
                "failure in corpus building: no files matched types %s",
                types_requested,
            )
            return None
        # NOTE(review): a file matched by several types/mappings is listed once per
        # match, as before - confirm whether duplicates are intended.
        fileid_names.sort()
        corpus_reader._fileids = fileid_names
        return corpus_reader
    except Exception:
        LOG.exception("failure in corpus building")
    # Failure is signalled to the caller by returning None, as before.
    return None
def assemble_corpus(corpus_reader: CorpusReader,
                    types_requested: List[str],
                    type_dirs: Dict[str, List[str]] = None,
                    type_files: Dict[str, List[str]] = None) -> CorpusReader:
    """
    Create a filtered corpus.

    :param corpus_reader: the reader to filter; it is mutated in place (its ``_fileids``
        attribute is replaced with the sorted matching file ids)
    :param types_requested: a list of string types, which are to be found in the
        type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories; a leading ``./`` and a
        trailing ``/`` on a directory are ignored
    :param type_files: a dict of corpus types to files
    :return: the same CorpusReader object containing only the mappings desired, or
        ``None`` when nothing matched or an error occurred (the failure is logged)
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        # Set for O(1) membership tests; the list keeps the reader's original order.
        known_ids = set(all_file_ids)
        fileid_names = []  # type: List[str]
        if type_files:
            for key, valuelist in type_files.items():
                # Explicit files are never recorded under an empty type name.
                if not key or key not in types_requested:
                    continue
                for value in valuelist:
                    if value in known_ids:
                        fileid_names.append(value)
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    # Normalise './dir' and './dir/' to 'dir/' so that a trailing
                    # slash does not turn into 'dir//' and match nothing. The final
                    # '/' keeps 'dir' from matching a sibling such as 'dir2/...'.
                    corrected_dir = '{}/'.format(value.replace('./', '').rstrip('/'))
                    for name in all_file_ids:
                        if name and name.startswith(corrected_dir):
                            fileid_names.append(name)
        if not fileid_names:
            # Previously this case surfaced as a ValueError from unpacking zip(*[]).
            LOG.error('failure in corpus building: no files matched types %s',
                      types_requested)
            return None
        # NOTE(review): a file matched by several types/mappings is listed once per
        # match, as before - confirm whether duplicates are intended.
        fileid_names.sort()
        corpus_reader._fileids = fileid_names
        return corpus_reader
    except Exception:
        LOG.exception('failure in corpus building')
    # Failure is signalled to the caller by returning None, as before.
    return None