def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic',
             max_unknown_length=1024, wakati=False, mmap=False, dotfile=''):
    """
    Initialize Tokenizer object with optional arguments.

    :param udic: (Optional) user dictionary file (CSV format) or directory path to compiled dictionary data
    :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf-8'
    :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'.
                      default is 'ipadic'
    :param max_unknown_length: (Optional) max unknown word length. default is 1024.
    :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
    :param mmap: (Optional) if given True use memory-mapped file for dictionary data.
    :param dotfile: (Optional) accepted for interface compatibility; it is not referenced
                    anywhere in this initializer.

    .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
                 for details for user dictionary.
    """
    self.wakati = wakati

    # Both system dictionary flavours take the same FST data; only the
    # entries source (in-memory vs. memory-mapped) differs.
    fst_data = all_fstdata()
    if not mmap:
        self.sys_dic = SystemDictionary(fst_data, entries(wakati), connections,
                                        chardef.DATA, unknowns.DATA)
    else:
        self.sys_dic = MMapSystemDictionary(fst_data, mmap_entries(wakati), connections,
                                            chardef.DATA, unknowns.DATA)

    # The user dictionary stays None unless `udic` names a '.csv' file or an
    # existing directory; any other non-empty value is ignored.
    user_dic = None
    if udic and udic.endswith('.csv'):
        # build user dictionary from CSV
        user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
    elif udic and os.path.isdir(udic):
        # load compiled user dictionary
        user_dic = CompiledUserDictionary(udic, connections)
    self.user_dic = user_dic

    self.max_unknown_length = max_unknown_length
def test_property_types(self):
    # Checks the exact Python type of each field in entries returned by the
    # system dictionary, the mmap-backed system dictionary and a CSV user
    # dictionary, plus the "extra" records and an unknown-word entry.

    def check_types(record, layout):
        # `layout` lists (index, expected type) pairs in the order they are
        # checked. Exact type identity is required, so subclasses fail.
        for position, wanted in layout:
            self.assertTrue(type(record[position]) is wanted)

    # Field 1 must be str; fields 0, 2, 3 and 4 must be int.
    entry_layout = ((1, str), (0, int), (2, int), (3, int), (4, int))
    # Fields 0..5 of an extra record must all be str.
    extra_layout = tuple((position, str) for position in range(6))
    # Unknown-word entries: field 3 is str; fields 0, 1 and 2 are int.
    unknown_layout = ((3, str), (0, int), (1, int), (2, int))

    # entry in the system dictionary
    sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                               chardef.DATA, unknowns.DATA)
    entry = sys_dic.lookup('すもも'.encode('utf8'))[0]
    check_types(entry, entry_layout)
    check_types(sys_dic.lookup_extra(entry[0]), extra_layout)

    # unknown entry
    check_types(sys_dic.unknowns.get('HIRAGANA')[0], unknown_layout)

    # mmap dict entry
    mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections,
                                    chardef.DATA, unknowns.DATA)
    entry = mmap_dic.lookup('すもも'.encode('utf8'))[0]
    check_types(entry, entry_layout)
    check_types(mmap_dic.lookup_extra(entry[0]), extra_layout)

    # entry in the user defined dictionary
    csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
    user_dic = UserDictionary(user_dict=csv_path, enc='utf8', type='ipadic',
                              connections=connections)
    entry = user_dic.lookup('東京スカイツリー'.encode('utf8'))[0]
    check_types(entry, entry_layout)
def __init__(self,
             udic: str = '', *,
             udic_enc: str = 'utf8',
             udic_type: str = 'ipadic',
             max_unknown_length: int = 1024,
             wakati: bool = False,
             mmap: bool = DEFAULT_MMAP_MODE,
             dotfile: str = ''):
    """
    Initialize Tokenizer object with optional arguments.

    :param udic: (Optional) user dictionary file (CSV format) or directory path to compiled dictionary data
    :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf-8'
    :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'.
                      default is 'ipadic'
    :param max_unknown_length: (Optional) max unknown word length. default is 1024.
    :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
    :param mmap: (Optional) if given False, memory-mapped file mode is disabled.
                 Set this option to False on any environments that do not support mmap.
                 Default is True on 64bit architecture; otherwise False.
    :param dotfile: (Optional) accepted for interface compatibility; it is not referenced
                    anywhere in this initializer.

    .. seealso:: http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
    """
    self.sys_dic: Union[SystemDictionary, MMapSystemDictionary]
    self.user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]]

    self.wakati = wakati

    # Both system dictionary flavours take the same FST data; only the
    # entries source (in-memory vs. memory-mapped) differs.
    fst_data = all_fstdata()
    if not mmap:
        self.sys_dic = SystemDictionary(fst_data, entries(wakati), connections,
                                        chardef.DATA, unknowns.DATA)
    else:
        self.sys_dic = MMapSystemDictionary(fst_data, mmap_entries(wakati), connections,
                                            chardef.DATA, unknowns.DATA)

    # The user dictionary stays None unless `udic` names a '.csv' file or an
    # existing directory; any other non-empty value is ignored.
    user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]] = None
    if udic and udic.endswith('.csv'):
        # build user dictionary from CSV
        user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
    elif udic and os.path.isdir(udic):
        # load compiled user dictionary
        user_dic = CompiledUserDictionary(udic, connections)
    self.user_dic = user_dic

    self.max_unknown_length = max_unknown_length
# limitations under the License.

import os
import sys
import unittest

from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns
from janome.dic import SystemDictionary, MMapSystemDictionary
from janome.lattice import Lattice, BOS, EOS, SurfaceNode

# TODO: better way to find package...
# parent_dir is the directory one level above the one containing this file.
# NOTE: this sys.path tweak runs after the janome imports above, so it does
# not influence how those imports were resolved.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

# Dictionaries built once at import time and shared by the tests below.
SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA)


class TestLattice(unittest.TestCase):
    # Tests for Lattice construction and node insertion.

    def test_initialize_lattice(self):
        # A lattice created with size 5 is expected to expose 7 start-node
        # slots and 8 end-node slots, with a BOS node at snodes[0][0] and
        # at enodes[1][0].
        lattice = Lattice(5, SYS_DIC)

        start_slots = lattice.snodes
        self.assertEqual(7, len(start_slots))
        self.assertTrue(isinstance(start_slots[0][0], BOS))

        end_slots = lattice.enodes
        self.assertEqual(8, len(end_slots))
        self.assertTrue(isinstance(end_slots[1][0], BOS))

    def test_add_forward_end(self):
        # NOTE(review): only the opening statements of this test are visible
        # in this chunk; they are reproduced unchanged.
        # The local `entries` shadows the `entries` function imported from
        # janome.sysdic for the remainder of this method.
        s = 'すもも'
        lattice = Lattice(len(s), SYS_DIC)
        entries = SYS_DIC.lookup(s.encode('utf8'))
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys

# TODO: better way to find package...
# parent_dir is the directory one level above the one containing this file;
# it is put first on sys.path before the janome imports below are resolved.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns

import unittest

# Dictionaries built once at import time and shared by the tests below.
SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA)


class TestLattice(unittest.TestCase):
    # Tests for Lattice construction and node insertion.

    def test_initialize_lattice(self):
        # A lattice created with size 5 is expected to expose 7 start-node
        # slots and 8 end-node slots, with a BOS node at snodes[0][0] and
        # at enodes[1][0].
        lattice = Lattice(5, SYS_DIC)

        start_slots = lattice.snodes
        self.assertEqual(7, len(start_slots))
        self.assertTrue(isinstance(start_slots[0][0], BOS))

        end_slots = lattice.enodes
        self.assertEqual(8, len(end_slots))
        self.assertTrue(isinstance(end_slots[1][0], BOS))

    def test_add_forward_end(self):
        # NOTE(review): only the opening statements of this test are visible
        # in this chunk; they are reproduced unchanged.
        # The local `entries` shadows the `entries` function imported from
        # janome.sysdic for the remainder of this method.
        s = u'すもも'
        lattice = Lattice(len(s), SYS_DIC)
        entries = SYS_DIC.lookup(s.encode('utf8'))
        # Wrap every dictionary hit for the surface in a SurfaceNode and add it to the lattice.
        for entry in entries:
            lattice.add(SurfaceNode(entry))