Beispiel #1
0
    def test_property_types(self):
        """Verify the exact Python type of each field of dictionary entries.

        Entries are taken from a SystemDictionary (including its extra
        lookup and its unknown-word table), from an MMapSystemDictionary,
        and from a UserDictionary built from ``tests/user_ipadic.csv``.
        """

        def check_types(record, layout):
            # Exact type match on purpose: subclasses are not accepted.
            for index, expected in layout:
                self.assertTrue(type(record[index]) is expected)

        # (field index, expected type) pairs in the order they are verified.
        entry_layout = ((1, str), (0, int), (2, int), (3, int), (4, int))
        extra_layout = tuple((index, str) for index in range(6))
        unknown_layout = ((3, str), (0, int), (1, int), (2, int))

        surface = 'すもも'.encode('utf8')

        # entry in the system dictionary
        sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)
        sys_entry = sys_dic.lookup(surface)[0]
        check_types(sys_entry, entry_layout)
        check_types(sys_dic.lookup_extra(sys_entry[0]), extra_layout)

        # unknown entry
        check_types(sys_dic.unknowns.get(u'HIRAGANA')[0], unknown_layout)

        # mmap dict entry
        mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(),
                                        connections, chardef.DATA,
                                        unknowns.DATA)
        mmap_entry = mmap_dic.lookup(surface)[0]
        check_types(mmap_entry, entry_layout)
        check_types(mmap_dic.lookup_extra(mmap_entry[0]), extra_layout)

        # entry in the user defined dictionary
        csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
        user_dic = UserDictionary(user_dict=csv_path,
                                  enc='utf8',
                                  type='ipadic',
                                  connections=connections)
        user_entry = user_dic.lookup('東京スカイツリー'.encode('utf8'))[0]
        check_types(user_entry, entry_layout)
Beispiel #2
0
    def test_simplified_user_dictionary_with_progress(self):
        """Build a 'simpledic' user dictionary with a progress handler.

        Checks that the captured log output holds one line per entry plus
        one completion line for each of the two phases (reading the CSV,
        then running create_minimum_transducer), and that the resulting
        dictionary still resolves the expected surface form.
        """
        indicator = SimpleProgressIndicator(update_frequency=1.0)
        csv_path = os.path.join(parent_dir, 'tests/user_simpledic.csv')

        with self.assertLogs(logger=p_logger) as cm:
            # create user dictionary from csv, reporting progress as it goes
            large_user_dic = UserDictionary(user_dict=csv_path,
                                            enc='utf8',
                                            type='simpledic',
                                            connections=connections,
                                            progress_handler=indicator)

            entry_count = len(large_user_dic.entries)
            # two phases, each logging every entry plus a completion line
            self.assertEqual((entry_count + 1) * 2, len(cm.output))
            # value is reset after completion
            self.assertIsNone(indicator.value)

            # The length check above guarantees these slices and indexes
            # cover the whole output exactly once, in order.
            reading = cm.output[:entry_count]
            reading_done = cm.output[entry_count]
            building = cm.output[entry_count + 1:entry_count * 2 + 1]
            building_done = cm.output[entry_count * 2 + 1]
            finished = f'{entry_count}/{entry_count}'

            # progress for reading csv
            for step, line in enumerate(reading, start=1):
                self.assertIn('Reading user dictionary from CSV', line)
                self.assertIn(f'{step}/{entry_count}', line)

            # on completing the csv load
            self.assertIn(finished, reading_done)

            # progress for create_minimum_transducer
            for step, line in enumerate(building, start=1):
                self.assertIn('Running create_minimum_transducer', line)
                self.assertIn(f'{step}/{entry_count}', line)

            # on completing create_minimum_transducer
            self.assertIn(finished, building_done)

        # same result as without progress indicator
        matches = large_user_dic.lookup('東京スカイツリー'.encode('utf8'))
        self.assertEqual(1, len(matches))
Beispiel #3
0
    def test_simplified_user_dictionary(self):
        """Round-trip a 'simpledic' user dictionary through save and load.

        Builds the dictionary from ``tests/user_simpledic.csv``, saves it to
        ``tests/userdic_simple``, checks that both data files exist, and
        confirms the reloaded CompiledUserDictionary answers a lookup.
        """
        # create user dictionary from csv
        csv_path = os.path.join(parent_dir, 'tests/user_simpledic.csv')
        user_dic = UserDictionary(user_dict=csv_path,
                                  enc='utf8',
                                  type='simpledic',
                                  connections=connections)
        found = user_dic.lookup('東京スカイツリー'.encode('utf8'))
        self.assertEqual(1, len(found))

        # save compiled dictionary
        dic_dir = os.path.join(parent_dir, 'tests/userdic_simple')
        user_dic.save(to_dir=dic_dir)
        for file_name in (FILE_USER_FST_DATA, FILE_USER_ENTRIES_DATA):
            saved_path = os.path.join(dic_dir, file_name)
            self.assertTrue(os.path.exists(saved_path))

        # load compiled dictionary
        compiled_user_dic = CompiledUserDictionary(dic_dir,
                                                   connections=connections)
        found = compiled_user_dic.lookup('とうきょうスカイツリー駅'.encode('utf8'))
        self.assertEqual(1, len(found))
# -*- coding: utf-8 -*-

from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
import sysdic

# Sentence tokenized with each compiled user dictionary.
text = u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便 利です。'

# One row per demo: (banner, source CSV, dictionary format, output directory).
demos = [
    ('Compile user dictionary (MeCab IPADIC format)',
     "user_ipadic.csv", "ipadic", "/tmp/userdic"),
    ('Compile user dictionary (simplified format)',
     "user_simpledic.csv", "simpledic", "/tmp/userdic_simple"),
]

for position, (banner, csv_file, dict_format, out_dir) in enumerate(demos):
    if position > 0:
        # blank line between the output of consecutive demos
        print('')
    print(banner)

    # Build the dictionary from CSV and save it to out_dir.
    user_dict = UserDictionary(csv_file, "utf8", dict_format,
                               sysdic.connections)
    user_dict.save(out_dir)

    # Tokenize the sample sentence using the dictionary saved above.
    t = Tokenizer(out_dir)
    for token in t.tokenize(text):
        print(token)
Beispiel #5
0
from janome.dic import UserDictionary
from janome import sysdic
# Build a user dictionary from 'neologd.csv' (UTF-8, 'ipadic' format) and
# save the result under 'neologd'.
user_dict = UserDictionary(
    user_dict='neologd.csv',
    enc='utf8',
    type='ipadic',
    connections=sysdic.connections,
)
user_dict.save(to_dir='neologd')
# -*- coding: utf-8 -*-

from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
from janome import sysdic

# Sentence tokenized with each compiled user dictionary.
text = u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便 利です。'

# One row per demo: (banner, source CSV, dictionary format, output directory).
demos = [
    ('Compile user dictionary (MeCab IPADIC format)',
     "user_ipadic.csv", "ipadic", "/tmp/userdic"),
    ('Compile user dictionary (simplified format)',
     "user_simpledic.csv", "simpledic", "/tmp/userdic_simple"),
]

for position, (banner, csv_file, dict_format, out_dir) in enumerate(demos):
    if position > 0:
        # blank line between the output of consecutive demos
        print('')
    print(banner)

    # Build the dictionary from CSV and save it to out_dir.
    user_dict = UserDictionary(csv_file, "utf8", dict_format,
                               sysdic.connections)
    user_dict.save(out_dir)

    # Tokenize the sample sentence using the dictionary saved above.
    t = Tokenizer(out_dir)
    for token in t.tokenize(text):
        print(token)