def _check_distributed_corpora_file(self):
    """Collect the user's custom, distributed corpora for ``self.language``.

    Reads ``~/cltk_data/distributed_corpora.yaml`` (or
    ``~/cltk_data/test_distributed_corpora.yaml`` when ``self.testing`` is
    true). The file is expected to hold a mapping of corpus name to a
    mapping with ``language``, ``origin`` and ``type`` keys.

    The lookup is best-effort: a missing file (``open`` also raises
    ``FileNotFoundError`` when the ``cltk_data`` directory is absent),
    unparseable YAML, or a file without a top-level mapping (for example
    an empty file, for which ``yaml.safe_load`` returns ``None``) all
    produce an empty list instead of an exception.

    Returns:
        list: One dict per corpus whose ``language`` equals
        ``self.language`` after lower-casing, with the keys ``origin``,
        ``type`` and ``name``.

    Raises:
        KeyError: If an entry lacks ``language``, or a matching entry
            lacks ``origin`` or ``type``.
    """
    if self.testing:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/test_distributed_corpora.yaml')
    else:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/distributed_corpora.yaml')

    try:
        with open(distributed_corpora_fp) as file_open:
            corpora_dict = yaml.safe_load(file_open)
    except FileNotFoundError:
        logger.info('Distributed_corpora.yaml file not found.')
        return []
    except yaml.YAMLError as parse_err:
        # YAMLError is the common base of parser *and* scanner errors, so
        # every kind of malformed file is treated the same way.
        logger.debug('Yaml parsing error: %s', parse_err)
        return []

    # An empty file loads as None and a scalar/list file has no corpus
    # names to iterate; neither defines any corpora.
    if not isinstance(corpora_dict, dict):
        logger.debug('No corpus mapping found in %s', distributed_corpora_fp)
        return []

    return [
        {
            'origin': about['origin'],
            'type': about['type'],
            'name': corpus_name,
        }
        for corpus_name, about in corpora_dict.items()
        if about['language'].lower() == self.language
    ]
def _check_distributed_corpora_file(self):
    """Collect the user's custom, distributed corpora for ``self.language``.

    Reads ``~/cltk_data/distributed_corpora.yaml`` (or
    ``~/cltk_data/test_distributed_corpora.yaml`` when ``self.testing`` is
    true). The file is expected to hold a mapping of corpus name to a
    mapping with ``language``, ``git_remote`` and ``type`` keys.

    The lookup is best-effort: a missing file (``open`` also raises
    ``FileNotFoundError`` when the ``cltk_data`` directory is absent),
    unparseable YAML, or a file without a top-level mapping (for example
    an empty file, for which ``yaml.safe_load`` returns ``None``) all
    produce an empty list instead of an exception.

    Returns:
        list: One dict per corpus whose ``language`` equals
        ``self.language`` after lower-casing, with the keys
        ``git_remote``, ``name`` and ``type``.

    Raises:
        KeyError: If an entry lacks ``language``, or a matching entry
            lacks ``git_remote`` or ``type``.
    """
    if self.testing:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/test_distributed_corpora.yaml')
    else:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/distributed_corpora.yaml')

    try:
        with open(distributed_corpora_fp) as file_open:
            corpora_dict = yaml.safe_load(file_open)
    except FileNotFoundError:
        logger.info('Distributed_corpora.yaml file not found.')
        return []
    except yaml.YAMLError as parse_err:
        # YAMLError is the common base of parser *and* scanner errors, so
        # every kind of malformed file is treated the same way.
        logger.debug('Yaml parsing error: %s', parse_err)
        return []

    # An empty file loads as None and a scalar/list file has no corpus
    # names to iterate; neither defines any corpora.
    if not isinstance(corpora_dict, dict):
        logger.debug('No corpus mapping found in %s', distributed_corpora_fp)
        return []

    return [
        {
            'git_remote': about['git_remote'],
            'name': corpus_name,
            'type': about['type'],
        }
        for corpus_name, about in corpora_dict.items()
        if about['language'].lower() == self.language
    ]
def __init__(self, language, testing=False):
    """Set up corpus importing for one language.

    Populates ``self.all_corpora`` from the core ``LANGUAGE_CORPORA`` table
    and whatever ``self._setup_language_variables()`` returns as the
    user-defined corpora. When both exist they are combined into a new
    list, so the shared ``LANGUAGE_CORPORA`` entry is never modified.

    Args:
        language: Language name; stored lower-cased in ``self.language``.
        testing: A hack that makes the importer read a temporary test
            .yaml file instead of the user's local one, so the local file
            is not overwritten. Must be a ``bool``.

    Raises:
        AssertionError: If ``testing`` is not a ``bool``.
        KeyError: If there are no user-defined corpora and ``language`` is
            not a key of ``LANGUAGE_CORPORA``.
    """
    self.language = language.lower()
    # Kept as an assert so callers keep seeing AssertionError.
    assert isinstance(testing, bool), '`testing` parameter must be boolean type'
    self.testing = testing
    self.user_defined_corpora = self._setup_language_variables()

    if self.user_defined_corpora:
        logger.info('User-defined corpus found for "{}" language'.format(self.language))
        try:
            self.official_corpora = LANGUAGE_CORPORA[self.language]
        except KeyError:
            logger.debug('Nothing in the official repos '
                         'for "{}" language. Make the all_corpora solely '
                         'from the .yaml'.format(self.language))
            self.all_corpora = list(self.user_defined_corpora)
        else:
            # Only report core corpora once the lookup has succeeded.
            logger.debug('Core corpora also found for "{}" language'.format(self.language))
            logger.debug('Combining the user-defined and the core corpora')
            # Copy before extending: appending to the list stored in
            # LANGUAGE_CORPORA would leak user corpora into the module-level
            # table and duplicate them on every new instance.
            self.all_corpora = list(self.official_corpora)
            self.all_corpora.extend(self.user_defined_corpora)
    else:
        logger.info('No user-defined corpora found for "{}" language'.format(self.language))
        self.all_corpora = LANGUAGE_CORPORA[self.language]
def __init__(self, language, testing=False):
    """Set up corpus importing for one language.

    Populates ``self.all_corpora`` from the core ``LANGUAGE_CORPORA`` table
    and whatever ``self._setup_language_variables()`` returns as the
    user-defined corpora. When both exist they are combined into a new
    list, so the shared ``LANGUAGE_CORPORA`` entry is never modified.

    Args:
        language: Language name; stored lower-cased in ``self.language``.
        testing: A hack that makes the importer read a temporary test
            .yaml file instead of the user's local one, so the local file
            is not overwritten. Must be a ``bool``.

    Raises:
        AssertionError: If ``testing`` is not a ``bool``.
        KeyError: If there are no user-defined corpora and ``language`` is
            not a key of ``LANGUAGE_CORPORA``.
    """
    self.language = language.lower()
    # Kept as an assert so callers keep seeing AssertionError.
    assert isinstance(testing, bool), '`testing` parameter must be boolean type'
    self.testing = testing
    self.user_defined_corpora = self._setup_language_variables()

    if self.user_defined_corpora:
        logger.info('User-defined corpus found for "{}" language'.format(self.language))
        try:
            self.official_corpora = LANGUAGE_CORPORA[self.language]
        except KeyError:
            logger.debug('Nothing in the official repos '
                         'for "{}" language. Make the all_corpora solely '
                         'from the .yaml'.format(self.language))
            self.all_corpora = list(self.user_defined_corpora)
        else:
            # Only report core corpora once the lookup has succeeded.
            logger.debug('Core corpora also found for "{}" language'.format(self.language))
            logger.debug('Combining the user-defined and the core corpora')
            # Copy before extending: appending to the list stored in
            # LANGUAGE_CORPORA would leak user corpora into the module-level
            # table and duplicate them on every new instance.
            self.all_corpora = list(self.official_corpora)
            self.all_corpora.extend(self.user_defined_corpora)
    else:
        logger.info('No user-defined corpora found for "{}" language'.format(self.language))
        self.all_corpora = LANGUAGE_CORPORA[self.language]