Example #1
0
    def _check_distributed_corpora_file(self):
        """Collect user-defined corpora for ``self.language`` from the YAML file.

        Reads ``~/cltk_data/distributed_corpora.yaml``, or
        ``~/cltk_data/test_distributed_corpora.yaml`` when ``self.testing``
        is true. Each top-level key is taken as a corpus name whose value is
        a mapping with ``language``, ``origin`` and ``type`` entries.

        Returns:
            A list of dicts with the keys ``origin``, ``type`` and ``name``,
            one per corpus whose lower-cased ``language`` equals
            ``self.language``. An empty list is returned when the file (or
            the ``cltk_data`` directory) does not exist, when the file is not
            valid YAML, or when it does not contain a top-level mapping
            (e.g. the file is empty).

        Raises:
            KeyError: If a corpus entry lacks ``language``, or a matching
                entry lacks ``origin`` or ``type``.
        """
        file_name = 'test_distributed_corpora.yaml' if self.testing else 'distributed_corpora.yaml'
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/' + file_name)

        try:
            with open(distributed_corpora_fp) as file_open:
                corpora_dict = yaml.safe_load(file_open)
        except FileNotFoundError:
            # Also raised by open() when the parent `cltk_data` dir is absent.
            logger.info('Distributed_corpora.yaml file not found.')
            return []
        except yaml.YAMLError as parse_err:
            # YAMLError is the base class of ParserError, ScannerError, etc.,
            # so every kind of malformed YAML is reported the same way.
            logger.debug('Yaml parsing error: %s', parse_err)
            return []

        # safe_load() returns None for an empty document and may return a
        # list or scalar for other content; only a mapping can be processed.
        if not isinstance(corpora_dict, dict):
            logger.debug('No corpus mapping found in %s', distributed_corpora_fp)
            return []

        return [
            {
                'origin': about['origin'],
                'type': about['type'],
                'name': corpus_name,
            }
            for corpus_name, about in corpora_dict.items()
            if about['language'].lower() == self.language
        ]
Example #2
0
    def _check_distributed_corpora_file(self):
        """Collect user-defined corpora for ``self.language`` from the YAML file.

        Reads ``~/cltk_data/distributed_corpora.yaml``, or
        ``~/cltk_data/test_distributed_corpora.yaml`` when ``self.testing``
        is true. Each top-level key is taken as a corpus name whose value is
        a mapping with ``language``, ``git_remote`` and ``type`` entries.

        Returns:
            A list of dicts with the keys ``git_remote``, ``name`` and
            ``type``, one per corpus whose lower-cased ``language`` equals
            ``self.language``. An empty list is returned when the file (or
            the ``cltk_data`` directory) does not exist, when the file is not
            valid YAML, or when it does not contain a top-level mapping
            (e.g. the file is empty).

        Raises:
            KeyError: If a corpus entry lacks ``language``, or a matching
                entry lacks ``git_remote`` or ``type``.
        """
        file_name = 'test_distributed_corpora.yaml' if self.testing else 'distributed_corpora.yaml'
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/' + file_name)

        try:
            with open(distributed_corpora_fp) as file_open:
                corpora_dict = yaml.safe_load(file_open)
        except FileNotFoundError:
            # Also raised by open() when the parent `cltk_data` dir is absent.
            logger.info('Distributed_corpora.yaml file not found.')
            return []
        except yaml.YAMLError as parse_err:
            # YAMLError is the base class of ParserError, ScannerError, etc.,
            # so every kind of malformed YAML is reported the same way.
            logger.debug('Yaml parsing error: %s', parse_err)
            return []

        # safe_load() returns None for an empty document and may return a
        # list or scalar for other content; only a mapping can be processed.
        if not isinstance(corpora_dict, dict):
            logger.debug('No corpus mapping found in %s', distributed_corpora_fp)
            return []

        return [
            {
                'git_remote': about['git_remote'],
                'name': corpus_name,
                'type': about['type'],
            }
            for corpus_name, about in corpora_dict.items()
            if about['language'].lower() == self.language
        ]
Example #3
0
    def __init__(self, language, testing=False):
        """Set up corpus importing for one language.

        Stores the lower-cased language, records the ``testing`` flag and
        builds ``self.all_corpora`` from the core ``LANGUAGE_CORPORA`` entry
        for the language plus any user-defined corpora returned by
        ``self._setup_language_variables()``.

        The shared ``LANGUAGE_CORPORA`` list is never modified: when both
        sources exist, ``self.all_corpora`` is a new list, so creating
        several importers does not accumulate duplicate user-defined entries
        in the module-level data. ``self.official_corpora`` (set only when
        both sources exist) refers to the core list alone.

        Args:
            language: Name of the language; compared case-insensitively.
            testing: A hack that makes the importer look at a temporary
                .yaml file instead of the user's local one, which keeps the
                local file from being overwritten. Must be a ``bool``.

        Raises:
            AssertionError: If ``testing`` is not a ``bool``.
            KeyError: If there are no user-defined corpora and the language
                has no entry in ``LANGUAGE_CORPORA``.
        """
        self.language = language.lower()

        assert isinstance(testing, bool), '`testing` parameter must be boolean type'
        self.testing = testing

        self.user_defined_corpora = self._setup_language_variables()

        if not self.user_defined_corpora:
            logger.info('No user-defined corpora found for "%s" language', self.language)
            self.all_corpora = LANGUAGE_CORPORA[self.language]
            return

        # User-defined corpora exist, so they must be added to the core ones.
        logger.info('User-defined corpus found for "%s" language', self.language)
        try:
            self.official_corpora = LANGUAGE_CORPORA[self.language]
        except KeyError:
            logger.debug('Nothing in the official repos '
                         'for "%s" language. Make the all_corpora solely '
                         'from the .yaml', self.language)
            self.all_corpora = list(self.user_defined_corpora)
        else:
            # Logged only after the lookup succeeded, so the message is true.
            logger.debug('Core corpora also found for "%s" language', self.language)
            logger.debug('Combining the user-defined and the core corpora')
            # Build a fresh list; appending to official_corpora would mutate
            # the module-level LANGUAGE_CORPORA entry shared by all instances.
            self.all_corpora = list(self.official_corpora) + list(self.user_defined_corpora)
Example #4
0
    def __init__(self, language, testing=False):
        """Set up corpus importing for one language.

        Stores the lower-cased language, records the ``testing`` flag and
        builds ``self.all_corpora`` from the core ``LANGUAGE_CORPORA`` entry
        for the language plus any user-defined corpora returned by
        ``self._setup_language_variables()``.

        The shared ``LANGUAGE_CORPORA`` list is never modified: when both
        sources exist, ``self.all_corpora`` is a new list, so creating
        several importers does not accumulate duplicate user-defined entries
        in the module-level data. ``self.official_corpora`` (set only when
        both sources exist) refers to the core list alone.

        Args:
            language: Name of the language; compared case-insensitively.
            testing: A hack that makes the importer look at a temporary
                .yaml file instead of the user's local one, which keeps the
                local file from being overwritten. Must be a ``bool``.

        Raises:
            AssertionError: If ``testing`` is not a ``bool``.
            KeyError: If there are no user-defined corpora and the language
                has no entry in ``LANGUAGE_CORPORA``.
        """
        self.language = language.lower()

        assert isinstance(testing, bool), '`testing` parameter must be boolean type'
        self.testing = testing

        self.user_defined_corpora = self._setup_language_variables()

        if not self.user_defined_corpora:
            logger.info('No user-defined corpora found for "%s" language', self.language)
            self.all_corpora = LANGUAGE_CORPORA[self.language]
            return

        # User-defined corpora exist, so they must be added to the core ones.
        logger.info('User-defined corpus found for "%s" language', self.language)
        try:
            self.official_corpora = LANGUAGE_CORPORA[self.language]
        except KeyError:
            logger.debug('Nothing in the official repos '
                         'for "%s" language. Make the all_corpora solely '
                         'from the .yaml', self.language)
            self.all_corpora = list(self.user_defined_corpora)
        else:
            # Logged only after the lookup succeeded, so the message is true.
            logger.debug('Core corpora also found for "%s" language', self.language)
            logger.debug('Combining the user-defined and the core corpora')
            # Build a fresh list; appending to official_corpora would mutate
            # the module-level LANGUAGE_CORPORA entry shared by all instances.
            self.all_corpora = list(self.official_corpora) + list(self.user_defined_corpora)