def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024,
             wakati=False, mmap=False, dotfile=''):
    """
    Set up the system dictionary and, when requested, a user dictionary.

    :param udic: (Optional) user dictionary file (CSV format, file name ending with '.csv')
        or directory path to compiled dictionary data. Any other non-empty value is
        ignored and no user dictionary is loaded.
    :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf8'
    :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and
        'simpledic'. default is 'ipadic'
    :param max_unknown_length: (Optional) max unknown word length. default is 1024.
    :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
    :param mmap: (Optional) if given True use memory-mapped file for dictionary data.
    :param dotfile: (Optional) accepted for interface compatibility; not used by this
        initializer.

    .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
        for details for user dictionary.
    """
    self.wakati = wakati

    # Pick the dictionary implementation together with its matching entry loader.
    dictionary_cls, load_entries = (
        (MMapSystemDictionary, mmap_entries) if mmap else (SystemDictionary, entries)
    )
    self.sys_dic = dictionary_cls(
        all_fstdata(), load_entries(wakati), connections, chardef.DATA, unknowns.DATA)

    # The user dictionary stays None unless `udic` names a CSV file or a directory.
    user_dic = None
    if udic:
        if udic.endswith('.csv'):
            # build user dictionary from CSV
            user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
        elif os.path.isdir(udic):
            # load compiled user dictionary
            user_dic = CompiledUserDictionary(udic, connections)
    self.user_dic = user_dic

    self.max_unknown_length = max_unknown_length
def test_property_types(self):
    """Check the exact Python types of the fields held by dictionary entries.

    Covers entries looked up from the in-memory system dictionary, its
    unknown-word table, the mmap-backed system dictionary and a user
    dictionary built from ``tests/user_ipadic.csv``.
    """

    def assert_field_types(record, str_positions, int_positions):
        # Exact type identity is checked on purpose (not isinstance).
        for pos in str_positions:
            self.assertTrue(type(record[pos]) is str)
        for pos in int_positions:
            self.assertTrue(type(record[pos]) is int)

    sumomo = 'すもも'.encode('utf8')

    # entry in the system dictionary
    sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)
    entry = sys_dic.lookup(sumomo)[0]
    assert_field_types(entry, str_positions=(1,), int_positions=(0, 2, 3, 4))
    assert_field_types(sys_dic.lookup_extra(entry[0]), str_positions=range(6), int_positions=())

    # unknown entry
    entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
    assert_field_types(entry, str_positions=(3,), int_positions=(0, 1, 2))

    # mmap dict entry
    mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA)
    entry = mmap_dic.lookup(sumomo)[0]
    assert_field_types(entry, str_positions=(1,), int_positions=(0, 2, 3, 4))
    assert_field_types(mmap_dic.lookup_extra(entry[0]), str_positions=range(6), int_positions=())

    # entry in the user defined dictionary
    user_dic = UserDictionary(
        user_dict=os.path.join(parent_dir, 'tests/user_ipadic.csv'),
        enc='utf8',
        type='ipadic',
        connections=connections,
    )
    entry = user_dic.lookup('東京スカイツリー'.encode('utf8'))[0]
    assert_field_types(entry, str_positions=(1,), int_positions=(0, 2, 3, 4))
def __init__(self,
             udic: str = '',
             *,
             udic_enc: str = 'utf8',
             udic_type: str = 'ipadic',
             max_unknown_length: int = 1024,
             wakati: bool = False,
             mmap: bool = DEFAULT_MMAP_MODE,
             dotfile: str = ''):
    """
    Create a Tokenizer backed by the system dictionary and an optional user dictionary.

    :param udic: (Optional) user dictionary file (CSV format, file name ending with '.csv')
        or directory path to compiled dictionary data. Any other non-empty value is
        ignored and no user dictionary is loaded.
    :param udic_enc: (Optional) character encoding for user dictionary. default is 'utf8'
    :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and
        'simpledic'. default is 'ipadic'
    :param max_unknown_length: (Optional) max unknown word length. default is 1024.
    :param wakati: (Optional) if given True load minimum sysdic data for 'wakati' mode.
    :param mmap: (Optional) if given False, memory-mapped file mode is disabled. Set this
        option to False on any environments that do not support mmap. Defaults to
        DEFAULT_MMAP_MODE (documented as True on 64bit architecture; otherwise False).
    :param dotfile: (Optional) accepted for interface compatibility; not used by this
        initializer.

    .. seealso:: http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
    """
    self.sys_dic: Union[SystemDictionary, MMapSystemDictionary]
    self.user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]]

    self.wakati = wakati

    # System dictionary: memory-mapped or fully in-memory, depending on `mmap`.
    fst_data = all_fstdata()
    if mmap:
        self.sys_dic = MMapSystemDictionary(
            fst_data, mmap_entries(wakati), connections, chardef.DATA, unknowns.DATA)
    else:
        self.sys_dic = SystemDictionary(
            fst_data, entries(wakati), connections, chardef.DATA, unknowns.DATA)

    # User dictionary: only a CSV path or a directory of compiled data is recognised.
    if not udic:
        self.user_dic = None
    elif udic.endswith('.csv'):
        # build user dictionary from CSV
        self.user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
    elif os.path.isdir(udic):
        # load compiled user dictionary
        self.user_dic = CompiledUserDictionary(udic, connections)
    else:
        self.user_dic = None

    self.max_unknown_length = max_unknown_length
def test_system_dictionary_ipadic(self):
    """Exercise lookup, transition cost, character categories and unknown-word
    settings of the in-memory system dictionary."""
    sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)

    self.assertEqual(7, len(sys_dic.lookup('形態素'.encode('utf-8'))))
    self.assertEqual(1, sys_dic.get_trans_cost(0, 1))

    # (character, expected category mapping), checked in this order.
    # NOTE(review): several characters appear twice in a row; the second of each
    # pair may originally have been a half-/full-width variant -- confirm.
    category_cases = [
        ('は', {'HIRAGANA': []}),
        ('ハ', {'KATAKANA': []}),
        ('ハ', {'KATAKANA': []}),
        ('葉', {'KANJI': []}),
        ('C', {'ALPHA': []}),
        ('C', {'ALPHA': []}),
        ('#', {'SYMBOL': []}),
        ('#', {'SYMBOL': []}),
        ('5', {'NUMERIC': []}),
        ('5', {'NUMERIC': []}),
        ('五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
        ('Γ', {'GREEK': []}),
        ('Б', {'CYRILLIC': []}),
        ('𠮷', {'DEFAULT': []}),
        ('한', {'DEFAULT': []}),
    ]
    for char, expected in category_cases:
        self.assertEqual(expected, sys_dic.get_char_categories(char))

    # unknown-word handling flags and length per category
    self.assertTrue(sys_dic.unknown_invoked_always('ALPHA'))
    self.assertFalse(sys_dic.unknown_invoked_always('KANJI'))
    self.assertTrue(sys_dic.unknown_grouping('NUMERIC'))
    self.assertFalse(sys_dic.unknown_grouping('KANJI'))
    self.assertEqual(2, sys_dic.unknown_length('HIRAGANA'))
def test_system_dictionary_cache(self):
    """Look up the same surfaces repeatedly and check the result counts stay the same."""
    sys_dic = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)

    # (expected number of entries, surface); repeated rows are intentional so
    # that the same key is looked up more than once on one dictionary instance.
    lookups = [
        (11, u'小書き'),
        (11, u'小書き'),
        (11, u'小書きにしました'),
        (10, u'みんなと'),
        (10, u'みんなと'),
        (2, u'叩く'),
        (2, u'叩く'),
    ]
    for expected_count, surface in lookups:
        self.assertEqual(expected_count, len(sys_dic.lookup(surface.encode('utf8'))))
def generate_abc_dic(
    sysdic: typing.Optional[typing.Iterable[typing.Iterable[typing.Any]]] = None
) -> typing.Iterator[JanomeLexEntry]:
    """
    Generate custom Janome lexical entries for this parser.

    Parameters
    ----------
    sysdic : internal list of lexical entries in janome.dic.SystemDictionary, optional
        An iterable of internal representation of Janome lexical entries.
        If ``None`` (the default), this function builds a
        ``janome.dic.SystemDictionary`` itself and uses its entries.
        An explicitly supplied iterable is always used as given, even when it
        is empty.
        Giving a reference to the system lexical entries is recommended
        for performance reasons whenever you have obtained a relevant instance
        which contains a Janome system dictionary.

    Returns
    -------
    abc_entries : iterable of JanomeLexEntry
        Our custom lexical entries. When `sysdic` is given, the result of
        ``_gen_abc_dic(sysdic)`` is returned as is; otherwise the entries
        generated from Janome's own system dictionary are collected into a set.

    Notes
    -----
    The authors choose a set, rather than a generator, for the returning result
    since this subroutine is intended to be externally cached (not implemented yet).

    Examples
    --------
    >>> import janome.tokenizer as jt  # doctest: +SKIP
    >>> tokenizer = jt.Tokenizer()  # doctest: +SKIP
    >>> abc_entries = dic.generate_abc_dic(
    ...     sysdic = tokenizer.sys_dic.entries.values()
    ... )  # doctest: +SKIP
    >>> next(iter(abc_entries)).surface  # doctest: +SKIP
    '筈もあれ'
    """
    # Compare against None rather than relying on truthiness: an empty
    # iterable passed by the caller must not trigger loading the whole
    # Janome system dictionary.
    if sysdic is not None:
        return _gen_abc_dic(sysdic)

    # Imported here so the janome modules are only pulled in when the caller
    # did not supply the entries.
    import janome.dic
    from janome.sysdic import all_fstdata, entries, connections, chardef, unknowns

    janome_sys_dic = janome.dic.SystemDictionary(
        all_fstdata(),
        entries(None),
        connections,
        chardef.DATA,
        unknowns.DATA,
    )
    # NOTE(review): this branch returns a set although the annotation says
    # Iterator; kept as is so existing callers see the same object type.
    return set(_gen_abc_dic(janome_sys_dic.entries.values()))
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import unittest from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns from janome.dic import SystemDictionary, MMapSystemDictionary from janome.lattice import Lattice, BOS, EOS, SurfaceNode # TODO: better way to find package... parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, parent_dir) SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA) MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA) class TestLattice(unittest.TestCase): def test_initialize_lattice(self): lattice = Lattice(5, SYS_DIC) self.assertEqual(7, len(lattice.snodes)) self.assertTrue(isinstance(lattice.snodes[0][0], BOS)) self.assertEqual(8, len(lattice.enodes)) self.assertTrue(isinstance(lattice.enodes[1][0], BOS)) def test_add_forward_end(self): s = 'すもも'