def test_build_corpus_features_en(self):
        """Build CorpusFeatures from the first entry of the raw 'en' folder.

        Passes when the built corpus ends up with more than 5 suffixes in its
        n-gram collector.
        """
        english_dir = os.path.join(RAW_CORPUS_ROOT, 'en')
        # whichever entry os.listdir() happens to return first is used
        first_entry = os.listdir(english_dir)[0]
        sample_path = os.path.join(english_dir, first_entry)

        dictionary = DetailedDictionary.read_from_file(sample_path)
        features = CorpusFeatures('en', EnAlphabet, sample_path)
        features.build(dictionary)

        suffix_count = len(features.ngrams_collector.suffixes)
        self.assertGreater(suffix_count, 5)
Ejemplo n.º 2
0
 def load_from_file(cls, path: str):  # CorpusFeatures
     """Restore a CorpusFeatures instance from the JSON file at ``path``.

     The JSON object is read for the keys 'alphabet', 'language', 'path',
     'version', 'dictionary' and 'ngrams_collector'.

     :param path: location of the JSON file to read
     :return: the restored CorpusFeatures
     """
     with codecs.open(path, 'rb') as source:
         payload = json.load(source)

     # NOTE(review): the alphabet is handed over as a dotted-path string,
     # while other visible call sites pass an alphabet object - confirm that
     # the CorpusFeatures constructor accepts this form.
     alphabet_ref = f'apps.vnlp.training.alphabet.{payload["alphabet"]}'
     restored = CorpusFeatures(
         payload['language'], alphabet_ref, payload['path'])
     restored.version = payload['version']
     restored.dictionary = DetailedDictionary.json_deserialize(
         payload['dictionary'])
     restored.ngrams_collector = MarginNgramsCollector.json_deserialize(
         payload['ngrams_collector'])
     return restored
Ejemplo n.º 3
0
    def read_corpus_by_lang(
            cls,
            folder: str = CORPUS_ROOT,
            ignore_cached: bool = False,
            read_cached_only: bool = False) -> List[CorpusFeatures]:
        """
        Build (or load from cache) one CorpusFeatures per language folder.

        Almost the same as read_corpus_by_text, but combines all texts by language
        into single corpus.

        Every sub-folder of "<folder>/raw" is treated as a language; its name
        is used as the language code. The cached features for a language are
        kept in "<folder>/features/<language>.json".

        :param folder: root folder holding the "raw" and "features" sub-folders
        :param ignore_cached: when True, existing cache files are not read
        :param read_cached_only: when True, a corpus that could not be taken
            from the cache is skipped instead of being built
        :return: the corpora that were loaded or built, each one with
            "multifile" set to True and "cache_file_path" set to its cache file
        """
        data = []  # type: List[CorpusFeatures]

        raw_path = os.path.join(folder, 'raw')
        features_path = os.path.join(folder, 'features')
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(features_path, exist_ok=True)

        for dir_name in os.listdir(raw_path):
            sub_path = os.path.join(raw_path, dir_name)
            if not os.path.isdir(sub_path):
                continue

            language = dir_name
            features_name = f'{language}.json'  # '.../features/fr.json'
            feature_path = os.path.join(features_path, features_name)
            corpus = None  # type: Optional[CorpusFeatures]

            if not ignore_cached and os.path.isfile(feature_path):
                # Best effort: an unreadable or outdated cache file is only
                # reported, the corpus is then rebuilt below (if allowed).
                try:
                    cf = CorpusFeatures.load_from_file(feature_path)
                    if cf.version != CorpusFeatures.ACTUAL_VERSION:
                        print(
                            f'File "{feature_path}" has version "{cf.version}"'
                        )
                    else:
                        corpus = cf
                except Exception as e:
                    print(f'Error loading "{feature_path}": {e}')

            if not corpus and not read_cached_only:
                # build corpus
                alph = alphabet_by_code[language]
                corpus = CorpusFeatures(language, alph, sub_path)
                # named "dictionary" so the builtin "dict" is not shadowed
                dictionary = DetailedDictionary.read_from_folder(sub_path)
                corpus.build(dictionary)
                # cache corpus
                corpus.save_to_file(feature_path)

            if corpus:
                corpus.multifile = True
                corpus.cache_file_path = feature_path
                data.append(corpus)
        return data
 def test_find_morphs(self):
     """Run find_dict_morphs() on a three-word dictionary.

     Passes when the first word card ('deprived') has a non-empty root
     after the call.
     """
     features = CorpusFeatures('en', EnAlphabet, '')
     features.dictionary = DetailedDictionary()

     cards = [
         WordCard('deprived', 10),
         WordCard('prived', 6),
         WordCard('deprive', 5),
     ]
     features.dictionary.words = cards
     features.dictionary.words_total = len(cards)
     features.all_words = {card.word for card in cards}

     collector = MarginNgramsCollector(features.alphabet, features.dictionary)
     features.ngrams_collector = collector
     # hand-made margin n-grams, registered in the same order as before
     for prefix_args in (('de', 1, 3, 1), ('in', 1, 2, 1)):
         collector.prefixes.append(MarginNgram(*prefix_args))
     for suffix_args in (('ion', -1, 3, 1), ('d', -1, 4, 1)):
         collector.suffixes.append(MarginNgram(*suffix_args))

     features.find_dict_morphs()

     first_card = features.dictionary.words[0]
     self.assertGreater(len(first_card.root), 0)
Ejemplo n.º 5
0
    def test_serialize_deserialize(self):
        """Round-trip a DetailedDictionary through its JSON (de)serializers.

        The restored dictionary must report the same counters and the same
        number of words and word-grams as the one that was serialized.
        """
        source = DetailedDictionary()
        source.files_processed = 1
        source.words_processed = 4

        source.words.append(WordCard('detail', 12, 'tail'))
        source.words[0].prefix = 'de'
        for card_args in (('corpus', 2, ''), ('plural', 1, ''), ('omnis', 1, '')):
            source.words.append(WordCard(*card_args))
        source.words_total = len(source.words)

        source.word_grams = {
            (2, 'corpus omins'): 24,
            (3, 'plural corpus omins'): 21,
        }

        serialized = source.json_serialize()
        self.assertGreater(len(serialized), 10)

        restored = DetailedDictionary.json_deserialize(serialized)
        # plain counters are compared by value
        for counter in ('files_processed', 'words_processed', 'words_total'):
            self.assertEqual(getattr(source, counter),
                             getattr(restored, counter))
        # collections are compared by size only
        for collection in ('words', 'word_grams'):
            self.assertEqual(len(getattr(source, collection)),
                             len(getattr(restored, collection)))
Ejemplo n.º 6
0
 def test_feed(self):
     """Read the raw 'en' folder and expect more than 100 words from it."""
     english_dir = os.path.join(RAW_CORPUS_ROOT, 'en')
     word_count = len(DetailedDictionary.read_from_folder(english_dir).words)
     self.assertGreater(word_count, 100)
Ejemplo n.º 7
0
    def read_corpus_by_text(cls,
                            folder: str = CORPUS_ROOT,
                            ignore_cached: bool = False,
                            read_cached_only: bool = False,
                            file_name_only: str = '') -> List[CorpusFeatures]:
        """
        Build (or load from cache) one CorpusFeatures per ".txt" source file.

        "folder" should have the following structure:
         - raw
           - <lang_1>
             - <file_1_1>.txt  # source text - words in lowercase, space-separated
             ..
           - <lang_N>
         - features
           - <lang_1>
             - <file_1_1>.json  # JSON-encoded CorpusFeatures for file_1_1.txt "corpus"
             ..

        :param folder: root folder holding the "raw" and "features" sub-folders
        :param ignore_cached: when True, existing cache files are not read
        :param read_cached_only: when True, a corpus that could not be taken
            from the cache is skipped instead of being built
        :param file_name_only: when not empty, only source files with exactly
            this file name are processed
        :return: the corpora that were loaded or built, each one with
            "cache_file_path" set to its cache file
        """
        data = []  # type: List[CorpusFeatures]

        raw_path = os.path.join(folder, 'raw')
        features_path = os.path.join(folder, 'features')
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(features_path, exist_ok=True)

        for dir_name in os.listdir(raw_path):
            sub_path = os.path.join(raw_path, dir_name)
            if not os.path.isdir(sub_path):
                continue

            language = dir_name  # now we are somewhere like '.../raw/fr/'
            # '.../features/fr/' - created lazily, only when a corpus is cached
            feature_subfolder = os.path.join(features_path, dir_name)
            for file_name in os.listdir(sub_path):
                if file_name_only and file_name != file_name_only:
                    continue
                full_path = os.path.join(sub_path,
                                         file_name)  # '.../raw/fr/file01.txt'
                if not os.path.isfile(full_path) or not file_name.endswith(
                        '.txt'):
                    continue
                # try "cached" feature file
                features_name = os.path.splitext(file_name)[0] + '.json'
                feature_path = os.path.join(feature_subfolder, features_name)
                corpus = None  # type: Optional[CorpusFeatures]
                if not ignore_cached and os.path.isfile(feature_path):
                    # Best effort: an unreadable or outdated cache file is only
                    # reported, the corpus is then rebuilt below (if allowed).
                    try:
                        cf = CorpusFeatures.load_from_file(feature_path)
                        if cf.version != CorpusFeatures.ACTUAL_VERSION:
                            print(
                                f'File "{feature_path}" has version "{cf.version}"'
                            )
                        else:
                            corpus = cf
                    except Exception as e:
                        print(f'Error loading "{feature_path}": {e}')
                if not corpus and not read_cached_only:
                    # build corpus
                    alph = alphabet_by_code[language]
                    corpus = CorpusFeatures(language, alph, full_path)
                    # named "dictionary" so the builtin "dict" is not shadowed
                    dictionary = DetailedDictionary.read_from_file(full_path)
                    corpus.build(dictionary)
                    # cache corpus
                    os.makedirs(feature_subfolder, exist_ok=True)
                    corpus.save_to_file(feature_path)

                if corpus:
                    corpus.cache_file_path = feature_path
                    data.append(corpus)
        return data