def test_system_dictionary_ipadic(self):
    """Exercise the system dictionary built from the bundled data.

    Checks, in order: the number of entries returned for a surface
    lookup, one transition cost, the categories resolved for a series of
    single characters, and the per-category unknown-word settings.
    """
    sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)

    self.assertEqual(7, len(sys_dic.lookup(u'形態素'.encode('utf-8'))))
    self.assertEqual(1, sys_dic.get_trans_cost(0, 1))

    # (character, expected categories), asserted in exactly this order.
    # NOTE(review): several characters are listed twice in a row; possibly
    # different width variants were intended -- confirm against upstream.
    category_cases = (
        (u'は', {'HIRAGANA': []}),
        (u'ハ', {'KATAKANA': []}),
        (u'ハ', {'KATAKANA': []}),
        (u'葉', {'KANJI': []}),
        (u'C', {'ALPHA': []}),
        (u'C', {'ALPHA': []}),
        (u'#', {'SYMBOL': []}),
        (u'#', {'SYMBOL': []}),
        (u'5', {'NUMERIC': []}),
        (u'5', {'NUMERIC': []}),
        (u'五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
        (u'Γ', {'GREEK': []}),
        (u'Б', {'CYRILLIC': []}),
        (u'𠮷', {'DEFAULT': []}),
        (u'한', {'DEFAULT': []}),
    )
    for char, expected in category_cases:
        self.assertEqual(expected, sys_dic.get_char_categories(char))

    # Unknown-word settings looked up by category name.
    self.assertTrue(sys_dic.unknown_invoked_always('ALPHA'))
    self.assertFalse(sys_dic.unknown_invoked_always('KANJI'))
    self.assertTrue(sys_dic.unknown_grouping('NUMERIC'))
    self.assertFalse(sys_dic.unknown_grouping('KANJI'))
    self.assertEqual(2, sys_dic.unknown_length('HIRAGANA'))
def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024, wakati=False, mmap=False):
    """
    Initialize Tokenizer object with optional arguments.

    :param udic: (Optional) user dictionary file (CSV format) or directory path to compiled dictionary data
    :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf8'
    :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'. default is 'ipadic'
    :param max_unknown_length: (Optional) max unknown word length. default is 1024.
    :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
    :param mmap: (Optional) if given True use memory-mapped file for dictionary data.

    .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary.
    """
    self.wakati = wakati

    # Choose the dictionary class together with its matching entries loader.
    if mmap:
        dic_class, load_entries = MMapSystemDictionary, mmap_entries
    else:
        dic_class, load_entries = SystemDictionary, entries
    self.sys_dic = dic_class(load_entries(wakati), connections, chardef.DATA, unknowns.DATA)

    # A user dictionary is only attached for a '.csv' path or an existing
    # directory; any other value (including the empty default) leaves it None.
    user_dic = None
    if udic:
        if udic.endswith('.csv'):
            # build user dictionary from CSV
            user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
        elif os.path.isdir(udic):
            # load compiled user dictionary
            user_dic = CompiledUserDictionary(udic, connections)
    self.user_dic = user_dic

    self.max_unknown_length = max_unknown_length
def test_property_types(self):
    """Check the Python types of the fields of dictionary entries.

    Inspects the first entry found in the system dictionary, the first
    unknown-word entry registered for 'HIRAGANA', and the first entry
    found in a user dictionary built from ``tests/user_ipadic.csv``.
    """
    def check_types(record, text_fields, int_fields):
        # Text fields must be ``str`` when PY3 is set and ``unicode`` otherwise.
        text_type = str if PY3 else unicode
        for idx in text_fields:
            self.assertTrue(type(record[idx]) is text_type)
        for idx in int_fields:
            self.assertTrue(type(record[idx]) is int)

    # Field positions shared by system and user dictionary entries.
    entry_text_fields = (0, 4, 5, 6, 7, 8, 9)
    entry_int_fields = (1, 2, 3)

    sys_dic = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

    # entry in the system dictionary
    check_types(sys_dic.lookup(u'すもも')[0], entry_text_fields, entry_int_fields)

    # unknown entry
    check_types(sys_dic.unknowns.get(u'HIRAGANA')[0], (3,), (0, 1, 2))

    # entry in the user defined dictionary
    csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
    user_dic = UserDictionary(user_dict=csv_path, enc='utf8', type='ipadic', connections=connections())
    check_types(user_dic.lookup(u'東京スカイツリー')[0], entry_text_fields, entry_int_fields)
def test_system_dictionary_cache(self):
    """Look up the same surfaces repeatedly and check the entry counts.

    NOTE(review): the test name suggests the repeated lookups are meant to
    hit a lookup cache; the cache itself is not visible from this test.
    """
    sys_dic = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

    # (expected number of entries, surface), asserted in this order.
    lookup_cases = (
        (11, u'小書き'),
        (11, u'小書き'),
        (11, u'小書きにしました'),
        (10, u'みんなと'),
        (10, u'みんなと'),
        (2, u'叩く'),
        (2, u'叩く'),
    )
    for expected_count, surface in lookup_cases:
        self.assertEqual(expected_count, len(sys_dic.lookup(surface)))
def test_load_all_fst_data_from_package(self):
    """Build the system dictionary with the package-based FST loader.

    ``janome.dic.load_all_fstdata`` is temporarily replaced with
    ``janome.dic.load_all_fstdata_from_package``; a SystemDictionary is
    then constructed and one lookup is checked. The original loader is
    restored afterwards whether the test passes or fails, so the patch
    cannot leak into tests that run later.
    """
    # Py2.7 doesn't have unittest.mock, so manually replaced the method
    store = janome.dic.load_all_fstdata
    janome.dic.load_all_fstdata = janome.dic.load_all_fstdata_from_package
    try:
        sys_dic = SystemDictionary(entries(), connections, chardef.DATA, unknowns.DATA)
        self.assertEqual(11, len(sys_dic.lookup(u'小書き'.encode('utf8'))))
    finally:
        # Restore on success as well as on failure.
        janome.dic.load_all_fstdata = store
def test_system_dictionary_cache(self):
    """Look up the same UTF-8 encoded surfaces repeatedly and check the counts.

    NOTE(review): the test name suggests the repeated lookups are meant to
    hit a lookup cache; the cache itself is not visible from this test.
    """
    sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)

    # (expected number of entries, surface), asserted in this order.
    lookup_cases = (
        (11, u'小書き'),
        (11, u'小書き'),
        (11, u'小書きにしました'),
        (10, u'みんなと'),
        (10, u'みんなと'),
        (2, u'叩く'),
        (2, u'叩く'),
    )
    for expected_count, surface in lookup_cases:
        # The dictionary is queried with the encoded bytes of the surface.
        self.assertEqual(expected_count, len(sys_dic.lookup(surface.encode('utf8'))))
def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024, wakati=False, mmap=False, dotfile=''):
    """
    Initialize Tokenizer object with optional arguments.

    :param udic: (Optional) user dictionary file (CSV format) or directory path to compiled dictionary data
    :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf8'
    :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'. default is 'ipadic'
    :param max_unknown_length: (Optional) max unknown word length. default is 1024.
    :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
    :param mmap: (Optional) if given True use memory-mapped file for dictionary data.
    :param dotfile: (Optional) not read anywhere in this constructor. NOTE(review): confirm whether it is still needed.

    .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary.
    """
    self.wakati = wakati

    # Choose the dictionary class together with its matching entries loader.
    if mmap:
        dic_class, load_entries = MMapSystemDictionary, mmap_entries
    else:
        dic_class, load_entries = SystemDictionary, entries
    self.sys_dic = dic_class(load_entries(wakati), connections, chardef.DATA, unknowns.DATA)

    # A user dictionary is only attached for a '.csv' path or an existing
    # directory; any other value (including the empty default) leaves it None.
    user_dic = None
    if udic:
        if udic.endswith('.csv'):
            # build user dictionary from CSV
            user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
        elif os.path.isdir(udic):
            # load compiled user dictionary
            user_dic = CompiledUserDictionary(udic, connections)
    self.user_dic = user_dic

    self.max_unknown_length = max_unknown_length
def test_system_dictionary_ipadic(self):
    """Exercise the system dictionary built from the bundled data.

    Checks, in order: the number of entries returned for a surface
    lookup, one transition cost, the categories resolved for a series of
    single characters, and the per-category unknown-word settings.
    """
    sys_dic = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

    self.assertEqual(7, len(sys_dic.lookup(u'形態素')))
    self.assertEqual(1, sys_dic.get_trans_cost(0, 1))

    # (character, expected categories), asserted in exactly this order.
    # NOTE(review): several characters are listed twice in a row; possibly
    # different width variants were intended -- confirm against upstream.
    category_cases = (
        (u'は', {'HIRAGANA': []}),
        (u'ハ', {'KATAKANA': []}),
        (u'ハ', {'KATAKANA': []}),
        (u'葉', {'KANJI': []}),
        (u'C', {'ALPHA': []}),
        (u'C', {'ALPHA': []}),
        (u'#', {'SYMBOL': []}),
        (u'#', {'SYMBOL': []}),
        (u'5', {'NUMERIC': []}),
        (u'5', {'NUMERIC': []}),
        (u'五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
        (u'Γ', {'GREEK': []}),
        (u'Б', {'CYRILLIC': []}),
    )
    for char, expected in category_cases:
        self.assertEqual(expected, sys_dic.get_char_categories(char))

    # Unknown-word settings looked up by category name.
    # NOTE(review): the method is spelled ``unkown_invoked_always`` here;
    # confirm this matches the name exposed by SystemDictionary.
    self.assertTrue(sys_dic.unkown_invoked_always('ALPHA'))
    self.assertFalse(sys_dic.unkown_invoked_always('KANJI'))
    self.assertTrue(sys_dic.unknown_grouping('NUMERIC'))
    self.assertFalse(sys_dic.unknown_grouping('KANJI'))
    self.assertEqual(2, sys_dic.unknown_length('HIRAGANA'))
def test_property_types(self):
    """Check the Python types of the fields of dictionary entries.

    Inspects the first entry (and its ``lookup_extra`` record) from the
    system dictionary and from the memory-mapped dictionary, the first
    unknown-word entry registered for 'HIRAGANA', and the first entry
    found in a user dictionary built from ``tests/user_ipadic.csv``.
    """
    def check_types(record, text_fields, int_fields):
        # Text fields must be ``str`` when PY3 is set and ``unicode`` otherwise.
        text_type = str if PY3 else unicode
        for idx in text_fields:
            self.assertTrue(type(record[idx]) is text_type)
        for idx in int_fields:
            self.assertTrue(type(record[idx]) is int)

    # Field positions shared by the lookup results checked below.
    entry_text_fields = (1,)
    entry_int_fields = (0, 2, 3, 4)
    extra_text_fields = (0, 1, 2, 3, 4, 5)

    sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)

    # entry in the system dictionary
    entry = sys_dic.lookup(u'すもも'.encode('utf8'))[0]
    check_types(entry, entry_text_fields, entry_int_fields)
    # The first field of the entry is the key passed to lookup_extra().
    check_types(sys_dic.lookup_extra(entry[0]), extra_text_fields, ())

    # unknown entry
    check_types(sys_dic.unknowns.get(u'HIRAGANA')[0], (3,), (0, 1, 2))

    # mmap dict entry
    mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA)
    entry = mmap_dic.lookup(u'すもも'.encode('utf8'))[0]
    check_types(entry, entry_text_fields, entry_int_fields)
    check_types(mmap_dic.lookup_extra(entry[0]), extra_text_fields, ())

    # entry in the user defined dictionary
    csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
    user_dic = UserDictionary(user_dict=csv_path, enc='utf8', type='ipadic', connections=connections)
    entry = user_dic.lookup(u'東京スカイツリー'.encode('utf8'))[0]
    check_types(entry, entry_text_fields, entry_int_fields)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os, sys # TODO: better way to find package... parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, parent_dir) from janome.lattice import * from janome.dic import SystemDictionary, MMapSystemDictionary from sysdic import entries, mmap_entries, connections, chardef, unknowns import unittest SYS_DIC = SystemDictionary(entries(), connections, chardef.DATA, unknowns.DATA) MMAP_SYS_DIC = MMapSystemDictionary(mmap_entries(), connections, chardef.DATA, unknowns.DATA) class TestLattice(unittest.TestCase): def test_initialize_lattice(self): lattice = Lattice(5, SYS_DIC) self.assertEqual(7, len(lattice.snodes)) self.assertTrue(isinstance(lattice.snodes[0][0], BOS)) self.assertEqual(8, len(lattice.enodes)) self.assertTrue(isinstance(lattice.enodes[1][0], BOS)) def test_add_forward_end(self): s = u'すもも' lattice = Lattice(len(s), SYS_DIC)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os, sys # TODO: better way to find package... parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, parent_dir) from janome.lattice import * from janome.dic import SystemDictionary, MMapSystemDictionary from sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns import unittest SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA) MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA) class TestLattice(unittest.TestCase): def test_initialize_lattice(self): lattice = Lattice(5, SYS_DIC) self.assertEqual(7, len(lattice.snodes)) self.assertTrue(isinstance(lattice.snodes[0][0], BOS)) self.assertEqual(8, len(lattice.enodes)) self.assertTrue(isinstance(lattice.enodes[1][0], BOS)) def test_add_forward_end(self): s = u'すもも'
def test_property_types(self):
    """Check the Python types of the fields of dictionary entries.

    Inspects the first entry found in the system dictionary, the first
    unknown-word entry registered for 'HIRAGANA', and the first entry
    found in a user dictionary built from ``tests/user_ipadic.csv``.
    """
    def check_types(record, text_fields, int_fields):
        # Text fields must be ``str`` when PY3 is set and ``unicode`` otherwise.
        text_type = str if PY3 else unicode
        for idx in text_fields:
            self.assertTrue(type(record[idx]) is text_type)
        for idx in int_fields:
            self.assertTrue(type(record[idx]) is int)

    # Field positions shared by system and user dictionary entries.
    entry_text_fields = (0, 4, 5, 6, 7, 8, 9)
    entry_int_fields = (1, 2, 3)

    sys_dic = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

    # entry in the system dictionary
    check_types(sys_dic.lookup(u'すもも')[0], entry_text_fields, entry_int_fields)

    # unknown entry
    check_types(sys_dic.unknowns.get(u'HIRAGANA')[0], (3,), (0, 1, 2))

    # entry in the user defined dictionary
    csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
    user_dic = UserDictionary(user_dict=csv_path, enc='utf8', type='ipadic', connections=connections())
    check_types(user_dic.lookup(u'東京スカイツリー')[0], entry_text_fields, entry_int_fields)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os, sys # TODO: better way to find package... parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, parent_dir) from janome.lattice import * from janome.dic import SystemDictionary, MMapSystemDictionary from sysdic import entries, mmap_entries, connections, chardef, unknowns import unittest SYS_DIC = SystemDictionary(entries(), connections, chardef.DATA, unknowns.DATA) MMAP_SYS_DIC = MMapSystemDictionary(mmap_entries(), connections, chardef.DATA, unknowns.DATA) class TestLattice(unittest.TestCase): def test_initialize_lattice(self): lattice = Lattice(5, SYS_DIC) self.assertEqual(7, len(lattice.snodes)) self.assertTrue(isinstance(lattice.snodes[0][0], BOS)) self.assertEqual(9, len(lattice.enodes)) self.assertTrue(isinstance(lattice.enodes[1][0], BOS)) def test_add_forward_end(self): s = u'すもも' lattice = Lattice(len(s), SYS_DIC) entries = SYS_DIC.lookup(s.encode('utf8')) for entry in entries: