def test_build_corpus_features_en(self):
    """Build CorpusFeatures from the first raw English corpus file and
    verify that suffix n-grams were actually collected."""
    src_dir = os.path.join(RAW_CORPUS_ROOT, 'en')
    first_name = os.listdir(src_dir)[0]
    src_file = os.path.join(src_dir, first_name)
    detailed = DetailedDictionary.read_from_file(src_file)
    features = CorpusFeatures('en', EnAlphabet, src_file)
    features.build(detailed)
    # a real corpus should yield more than a handful of suffixes
    self.assertGreater(len(features.ngrams_collector.suffixes), 5)
def load_from_file(cls, path: str):  # -> CorpusFeatures
    """Deserialize a CorpusFeatures object from the JSON file at *path*."""
    with codecs.open(path, 'rb') as file_obj:
        payload = json.load(file_obj)
    # NOTE(review): this builds a dotted module-path *string* for the
    # alphabet, while other call sites pass the alphabet class itself —
    # confirm downstream code accepts both forms.
    alphabet = f'apps.vnlp.training.alphabet.{payload["alphabet"]}'
    restored = CorpusFeatures(payload['language'], alphabet, payload['path'])
    restored.version = payload['version']
    restored.dictionary = DetailedDictionary.json_deserialize(
        payload['dictionary'])
    restored.ngrams_collector = MarginNgramsCollector.json_deserialize(
        payload['ngrams_collector'])
    return restored
def read_corpus_by_lang(
        cls,
        folder: str = CORPUS_ROOT,
        ignore_cached: bool = False,
        read_cached_only: bool = False) -> List[CorpusFeatures]:
    """
    Almost the same as read_corpus_by_text, but combines all texts
    by language into single corpus.

    :param folder: root folder containing 'raw/<lang>/' source dirs;
        a sibling 'features/' cache dir is created if missing
    :param ignore_cached: when True, always rebuild instead of loading
        the cached '<lang>.json' feature file
    :param read_cached_only: when True, never build — languages without
        a valid cache are skipped
    :return: list of per-language CorpusFeatures
    """
    data = []  # type: List[CorpusFeatures]
    raw_path = os.path.join(folder, 'raw')
    features_path = os.path.join(folder, 'features')
    if not os.path.isdir(features_path):
        os.mkdir(features_path)
    for dir_name in os.listdir(raw_path):
        sub_path = os.path.join(raw_path, dir_name)
        if not os.path.isdir(sub_path):
            continue
        language = dir_name
        features_name = f'{language}.json'
        # cache file lives under features/, e.g. '.../features/fr.json'
        # (the old comment wrongly said '.../raw/fr.json')
        feature_path = os.path.join(features_path, features_name)
        corpus = None
        if not ignore_cached and os.path.isfile(feature_path):
            try:
                cf = CorpusFeatures.load_from_file(feature_path)
                if cf.version != CorpusFeatures.ACTUAL_VERSION:
                    # stale cache format — fall through and rebuild
                    print(
                        f'File "{feature_path}" has version "{cf.version}"'
                    )
                else:
                    corpus = cf
            except Exception as e:
                # best-effort cache read: log and rebuild below
                print(f'Error loading "{feature_path}": {e}')
        if not corpus and not read_cached_only:
            # build corpus from the raw text folder
            alph = alphabet_by_code[language]
            corpus = CorpusFeatures(language, alph, sub_path)
            # renamed from "dict", which shadowed the builtin
            dd = DetailedDictionary.read_from_folder(sub_path)
            corpus.build(dd)
            # cache corpus for subsequent runs
            corpus.save_to_file(feature_path)
        if corpus:
            corpus.multifile = True
            corpus.cache_file_path = feature_path
            data.append(corpus)
    return data
def test_find_morphs(self):
    """find_dict_morphs should derive a non-empty root for a word whose
    prefixes/suffixes are present in the n-gram collectors."""
    features = CorpusFeatures('en', EnAlphabet, '')
    features.dictionary = DetailedDictionary()
    features.dictionary.words = [
        WordCard('deprived', 10),
        WordCard('prived', 6),
        WordCard('deprive', 5),
    ]
    features.dictionary.words_total = len(features.dictionary.words)
    features.all_words = {card.word for card in features.dictionary.words}
    collector = MarginNgramsCollector(features.alphabet, features.dictionary)
    # seed prefix/suffix margin n-grams by hand instead of scanning a corpus
    for gram in (MarginNgram('de', 1, 3, 1), MarginNgram('in', 1, 2, 1)):
        collector.prefixes.append(gram)
    for gram in (MarginNgram('ion', -1, 3, 1), MarginNgram('d', -1, 4, 1)):
        collector.suffixes.append(gram)
    features.ngrams_collector = collector
    features.find_dict_morphs()
    first_word = features.dictionary.words[0]
    self.assertGreater(len(first_word.root), 0)
def test_serialize_deserialize(self):
    """Round-trip a DetailedDictionary through JSON and check the
    counters and collection sizes survive."""
    original = DetailedDictionary()
    original.files_processed = 1
    original.words_processed = 4
    card = WordCard('detail', 12, 'tail')
    card.prefix = 'de'
    original.words = [
        card,
        WordCard('corpus', 2, ''),
        WordCard('plural', 1, ''),
        WordCard('omnis', 1, ''),
    ]
    original.words_total = len(original.words)
    original.word_grams = {
        (2, 'corpus omins'): 24,
        (3, 'plural corpus omins'): 21,
    }
    encoded = original.json_serialize()
    self.assertGreater(len(encoded), 10)
    restored = DetailedDictionary.json_deserialize(encoded)
    # scalar counters
    self.assertEqual(original.files_processed, restored.files_processed)
    self.assertEqual(original.words_processed, restored.words_processed)
    self.assertEqual(original.words_total, restored.words_total)
    # collection sizes
    self.assertEqual(len(original.words), len(restored.words))
    self.assertEqual(len(original.word_grams), len(restored.word_grams))
def test_feed(self):
    """Reading the English raw-corpus folder should produce a sizable
    dictionary."""
    english_dir = os.path.join(RAW_CORPUS_ROOT, 'en')
    dictionary = DetailedDictionary.read_from_folder(english_dir)
    self.assertGreater(len(dictionary.words), 100)
def read_corpus_by_text(cls,
                        folder: str = CORPUS_ROOT,
                        ignore_cached: bool = False,
                        read_cached_only: bool = False,
                        file_name_only: str = '') -> List[CorpusFeatures]:
    """
    Build (or load cached) per-file CorpusFeatures.

    "folder" should have the following structure:
    - raw
      - <lang_1>
        - <file_1_1>.txt  # source text - words in lowercase, space-separated
      ..
      - <lang_N>
    - features
      - <file_1_1>.json  # JSON-encoded CorpusFeatures for file_1_1.txt "corpus"
      ..

    :param folder: corpus root with 'raw/' and 'features/' subfolders
    :param ignore_cached: when True, always rebuild features
    :param read_cached_only: when True, never build — files without a
        valid cache are skipped
    :param file_name_only: when non-empty, process only this file name
    :return: list of per-file CorpusFeatures
    """
    data = []  # type: List[CorpusFeatures]
    raw_path = os.path.join(folder, 'raw')
    features_path = os.path.join(folder, 'features')
    if not os.path.isdir(features_path):
        os.mkdir(features_path)
    for dir_name in os.listdir(raw_path):
        sub_path = os.path.join(raw_path, dir_name)
        if not os.path.isdir(sub_path):
            continue
        language = dir_name
        # now we are somewhere like '.../raw/fr/'
        for file_name in os.listdir(sub_path):
            if file_name_only and file_name != file_name_only:
                continue
            full_path = os.path.join(sub_path, file_name)  # '.../raw/fr/file01.txt'
            if not os.path.isfile(full_path) or not file_name.endswith(
                    '.txt'):
                continue
            # try "cached" feature file
            features_name = os.path.splitext(file_name)[0] + '.json'
            feature_path = os.path.join(features_path, dir_name,
                                        features_name)
            corpus = None  # type: Optional[CorpusFeatures]
            if not ignore_cached and os.path.isfile(feature_path):
                try:
                    cf = CorpusFeatures.load_from_file(feature_path)
                    if cf.version != CorpusFeatures.ACTUAL_VERSION:
                        # stale cache format — fall through and rebuild
                        print(
                            f'File "{feature_path}" has version "{cf.version}"'
                        )
                    else:
                        corpus = cf
                except Exception as e:
                    # best-effort cache read: log and rebuild below
                    print(f'Error loading "{feature_path}": {e}')
            if not corpus and not read_cached_only:
                # build corpus from the source text file
                alph = alphabet_by_code[language]
                corpus = CorpusFeatures(language, alph, full_path)
                # renamed from "dict", which shadowed the builtin
                dd = DetailedDictionary.read_from_file(full_path)
                corpus.build(dd)
                # cache corpus under features/<lang>/
                feature_subfolder = os.path.join(features_path, dir_name)
                if not os.path.isdir(feature_subfolder):
                    os.mkdir(feature_subfolder)
                corpus.save_to_file(feature_path)
            if corpus:
                corpus.cache_file_path = feature_path
                data.append(corpus)
    return data