def test_char_map(self):
    """Converting an IDMap to characters should yield the expected character vocabulary size."""
    word_map = IDMap()
    word_map.read(self.id_path)
    # NOTE(review): UNKOWN_WORD is a (misspelled) project constant — kept as-is.
    subword_map = SubwordMap(special_list=[END_OF_SENTENCE_MARKER, UNKOWN_WORD])
    char_level_map = subword_map.convert_to_characters(word_map)
    # 46 distinct characters expected for the fixture at self.id_path.
    self.assertEqual(len(char_level_map.word_to_count), 46)
def test_morph_map(self):
    """Morpheme conversion should shrink the vocabulary relative to the word-level map.

    Trains the morph parser on the word-level IDMap, converts it to a
    morpheme-level map, and asserts the morpheme vocabulary is strictly
    smaller than the original word vocabulary.
    """
    id_map = IDMap()
    id_map.read(self.id_path)
    # len(dict) instead of len(dict.keys()); debug print() calls removed.
    original_vocab_size = len(id_map.word_to_count)
    morph_map = SubwordMap(special_list=[END_OF_SENTENCE_MARKER, UNKOWN_WORD])
    morph_map.train_morph_parser(id_map)
    morph_id_map = morph_map.convert_to_morphemes(id_map)
    new_vocab_size = len(morph_id_map.word_to_count)
    self.assertGreater(original_vocab_size, new_vocab_size)
def test_read_write(self):
    """Round-trip a trained SubwordMap through write/read and verify it behaves identically.

    Bug fix: the original computed new_vocab_size from the SAME morph_id_map
    produced before serialization, so the assertion compared a value to itself
    and never exercised the reloaded map. We now re-run the conversion with
    the freshly loaded SubwordMap and compare vocabulary sizes.
    """
    id_map = IDMap()
    id_map.read(self.id_path)
    morph_map = SubwordMap(special_list=[END_OF_SENTENCE_MARKER, UNKOWN_WORD])
    morph_map.train_morph_parser(id_map)
    morph_id_map = morph_map.convert_to_morphemes(id_map)
    old_vocab_size = len(morph_id_map.word_to_count)

    # Serialize both the subword map and the trained morph model.
    morph_map.write(self.write_out + 'morph_map.txt')
    morph_map.write_morph_model(self.write_out + 'morph_model.model')

    # Reload into a fresh instance.
    morph_map = SubwordMap(special_list=[END_OF_SENTENCE_MARKER, UNKOWN_WORD])
    morph_map.read(self.write_out + 'morph_map.txt')
    morph_map.read_morph_model(self.write_out + 'morph_model.model')

    # Re-convert with the reloaded map so the assertion actually tests the round-trip.
    reloaded_id_map = morph_map.convert_to_morphemes(id_map)
    new_vocab_size = len(reloaded_id_map.word_to_count)
    self.assertEqual(old_vocab_size, new_vocab_size)