Ejemplo n.º 1
0
    def read_corpus_by_lang(
            cls,
            folder: str = CORPUS_ROOT,
            ignore_cached: bool = False,
            read_cached_only: bool = False) -> List[CorpusFeatures]:
        """
        Load or build one corpus per language directory.

        Almost the same as read_corpus_by_text, but combines all texts by
        language into a single corpus. Every sub-directory of
        ``<folder>/raw`` is taken as a language code, and its features are
        cached in ``<folder>/features/<language>.json``.

        :param folder: corpus root that contains the ``raw`` directory
        :param ignore_cached: when true, cached feature files are never loaded
        :param read_cached_only: when true, nothing is built from raw texts;
            languages without a usable cache file are left out of the result
        :return: loaded or built corpora, ordered by directory name
        """
        raw_path = os.path.join(folder, 'raw')
        features_path = os.path.join(folder, 'features')

        # List the raw folder first: a wrong ``folder`` then raises before any
        # directory gets created. Sorting makes the result order deterministic
        # (os.listdir returns entries in arbitrary order).
        dir_names = sorted(os.listdir(raw_path))
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(features_path, exist_ok=True)

        data = []  # type: List[CorpusFeatures]
        for dir_name in dir_names:
            sub_path = os.path.join(raw_path, dir_name)
            if not os.path.isdir(sub_path):
                continue

            language = dir_name
            features_name = f'{language}.json'  # '.../features/fr.json'
            feature_path = os.path.join(features_path, features_name)
            corpus = None

            if not ignore_cached and os.path.isfile(feature_path):
                # Best effort: an unreadable or outdated cache file is only
                # reported, the corpus is then rebuilt (or skipped) below
                try:
                    cached = CorpusFeatures.load_from_file(feature_path)
                    if cached.version != CorpusFeatures.ACTUAL_VERSION:
                        print(
                            f'File "{feature_path}" has version '
                            f'"{cached.version}"'
                        )
                    else:
                        corpus = cached
                except Exception as e:
                    print(f'Error loading "{feature_path}": {e}')

            if corpus is None and not read_cached_only:
                # build corpus; a failed lookup in alphabet_by_code propagates
                alph = alphabet_by_code[language]
                corpus = CorpusFeatures(language, alph, sub_path)
                dictionary = DetailedDictionary.read_from_folder(sub_path)
                corpus.build(dictionary)
                # cache corpus
                corpus.save_to_file(feature_path)

            if corpus is not None:
                corpus.multifile = True
                corpus.cache_file_path = feature_path
                data.append(corpus)
        return data
Ejemplo n.º 2
0
 def test_feed(self):
     # Reading the 'en' folder of the raw corpus must give a dictionary
     # whose `words` collection holds more than 100 items.
     dictionary = DetailedDictionary.read_from_folder(
         os.path.join(RAW_CORPUS_ROOT, 'en'))
     word_count = len(dictionary.words)
     self.assertGreater(word_count, 100)