Example #1
0
    def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024, wakati=False, mmap=False, dotfile=''):
        """
        Set up the system dictionary and, when requested, a user dictionary.

        :param udic: (Optional) user dictionary: either a CSV file (path ending in '.csv') or a
                     directory holding compiled dictionary data. Any other value, including the
                     empty default, leaves ``self.user_dic`` as None.
        :param udic_enc: (Optional) character encoding for the user dictionary. default is 'utf8'
        :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and 'simpledic'.
                          default is 'ipadic'
        :param max_unknown_length: (Optional) max unknown word length. default is 1024.
        :param wakati: (Optional) if True, load minimum sysdic data for 'wakati' mode.
        :param mmap: (Optional) if True, use memory-mapped files for the dictionary data.
        :param dotfile: (Optional) accepted for call compatibility; not read by this constructor.

        .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary.
        """
        self.wakati = wakati

        # The two system dictionary flavours take the same arguments; only the
        # class and the entry loader differ between mmap and in-memory mode.
        if mmap:
            dic_class, load_entries = MMapSystemDictionary, mmap_entries
        else:
            dic_class, load_entries = SystemDictionary, entries
        self.sys_dic = dic_class(all_fstdata(), load_entries(wakati), connections, chardef.DATA, unknowns.DATA)

        # Resolve the user dictionary into a local first; it stays None unless
        # `udic` names a CSV file or an existing directory.
        user_dic = None
        if udic:
            if udic.endswith('.csv'):
                # build user dictionary from CSV
                user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
            elif os.path.isdir(udic):
                # load compiled user dictionary
                user_dic = CompiledUserDictionary(udic, connections)
        self.user_dic = user_dic

        self.max_unknown_length = max_unknown_length
Example #2
0
    def test_property_types(self):
        """Verify the exact Python types of the fields returned by each dictionary.

        Covers lookups against the in-memory system dictionary, its unknown-word
        table, the memory-mapped system dictionary, and a user dictionary built
        from ``tests/user_ipadic.csv``. Types are compared by identity
        (``type(x) is T``), so subclasses do not satisfy the checks.
        """
        def assert_entry_types(entry):
            # Looked-up entry: index 1 must be exactly str; 0, 2, 3, 4 exactly int.
            self.assertIs(type(entry[1]), str)
            for idx in (0, 2, 3, 4):
                self.assertIs(type(entry[idx]), int)

        def assert_extra_types(extra):
            # Extra entry data: indices 0..5 must all be exactly str.
            for idx in range(6):
                self.assertIs(type(extra[idx]), str)

        surface = 'すもも'.encode('utf8')

        # entry in the system dictionary
        sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)
        entry = sys_dic.lookup(surface)[0]
        assert_entry_types(entry)
        assert_extra_types(sys_dic.lookup_extra(entry[0]))

        # unknown entry: index 3 is str; 0, 1, 2 are int
        unknown = sys_dic.unknowns.get(u'HIRAGANA')[0]
        self.assertIs(type(unknown[3]), str)
        for idx in (0, 1, 2):
            self.assertIs(type(unknown[idx]), int)

        # mmap dict entry
        mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(),
                                        connections, chardef.DATA,
                                        unknowns.DATA)
        entry = mmap_dic.lookup(surface)[0]
        assert_entry_types(entry)
        assert_extra_types(mmap_dic.lookup_extra(entry[0]))

        # entry in the user defined dictionary
        user_dic = UserDictionary(
            user_dict=os.path.join(parent_dir, 'tests/user_ipadic.csv'),
            enc='utf8',
            type='ipadic',
            connections=connections)
        entry = user_dic.lookup('東京スカイツリー'.encode('utf8'))[0]
        assert_entry_types(entry)
    def __init__(self,
                 udic: str = '',
                 *,
                 udic_enc: str = 'utf8',
                 udic_type: str = 'ipadic',
                 max_unknown_length: int = 1024,
                 wakati: bool = False,
                 mmap: bool = DEFAULT_MMAP_MODE,
                 dotfile: str = ''):
        """
        Set up the system dictionary and, when requested, a user dictionary.

        :param udic: (Optional) user dictionary: either a CSV file (path ending in '.csv') or a
                     directory holding compiled dictionary data. Any other value, including
                     the empty default, leaves ``self.user_dic`` as None.
        :param udic_enc: (Optional) character encoding for the user dictionary. default is 'utf8'
        :param udic_type: (Optional) user dictionary type. supported types are 'ipadic' and
                          'simpledic'. default is 'ipadic'
        :param max_unknown_length: (Optional) max unknown word length. default is 1024.
        :param wakati: (Optional) if True, load minimum sysdic data for 'wakati' mode.
        :param mmap: (Optional) if False, memory-mapped file mode is disabled.
                     Set this option to False on any environments that do not support mmap.
                     Defaults to DEFAULT_MMAP_MODE (True on 64bit architecture; otherwise False).
        :param dotfile: (Optional) accepted for call compatibility; not read by this constructor.

        .. seealso:: http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary
        """
        self.sys_dic: Union[SystemDictionary, MMapSystemDictionary]
        self.user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]]
        self.wakati = wakati

        # Both dictionary flavours share the FST data and the connection /
        # character / unknown-word tables; only the entry loader differs.
        fst_data = all_fstdata()
        if mmap:
            self.sys_dic = MMapSystemDictionary(
                fst_data, mmap_entries(wakati),
                connections, chardef.DATA, unknowns.DATA)
        else:
            self.sys_dic = SystemDictionary(
                fst_data, entries(wakati),
                connections, chardef.DATA, unknowns.DATA)

        # Stays None unless `udic` names a CSV file or an existing directory.
        loaded_dic: Optional[Union[UserDictionary, CompiledUserDictionary]] = None
        if udic and udic.endswith('.csv'):
            # build user dictionary from CSV
            loaded_dic = UserDictionary(udic, udic_enc, udic_type,
                                        connections)
        elif udic and os.path.isdir(udic):
            # load compiled user dictionary
            loaded_dic = CompiledUserDictionary(udic, connections)
        self.user_dic = loaded_dic

        self.max_unknown_length = max_unknown_length
Example #4
0
# limitations under the License.

import os
import sys
import unittest
from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns
from janome.dic import SystemDictionary, MMapSystemDictionary
from janome.lattice import Lattice, BOS, EOS, SurfaceNode

# TODO: better way to find package...
# Put the directory two levels above this file at the front of sys.path.
# NOTE(review): the `janome` imports above have already executed by the time
# this insertion runs, so it cannot influence how they were resolved --
# confirm whether the path tweak is still needed here.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

# Module-level dictionary instances, constructed once at import time from the
# data imported from janome.sysdic: an in-memory SystemDictionary and its
# memory-mapped counterpart (which uses mmap_entries() instead of entries()).
SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA,
                           unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections,
                                    chardef.DATA, unknowns.DATA)


class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """A fresh Lattice of size 5 has 7 snodes and 8 enodes slots, seeded with BOS.

        Checks that ``snodes[0][0]`` and ``enodes[1][0]`` are both BOS instances
        immediately after construction.
        """
        lattice = Lattice(5, SYS_DIC)
        self.assertEqual(7, len(lattice.snodes))
        # assertIsInstance reports the offending object and expected class on failure.
        self.assertIsInstance(lattice.snodes[0][0], BOS)
        self.assertEqual(8, len(lattice.enodes))
        self.assertIsInstance(lattice.enodes[1][0], BOS)

    def test_add_forward_end(self):
        s = 'すもも'
        lattice = Lattice(len(s), SYS_DIC)
        entries = SYS_DIC.lookup(s.encode('utf8'))
Example #5
0
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys

# TODO: better way to find package...
# Put the directory two levels above this file at the front of sys.path
# before the `janome` imports below run -- presumably so the in-tree package
# is picked up ahead of any installed copy (verify against the repo layout).
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from janome.sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns
import unittest

# Module-level dictionary instances, constructed once at import time: an
# in-memory SystemDictionary and its memory-mapped counterpart (which uses
# mmap_entries() instead of entries()).
SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA, unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections, chardef.DATA, unknowns.DATA)

class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """A fresh Lattice of size 5 has 7 snodes and 8 enodes slots, seeded with BOS.

        Checks that ``snodes[0][0]`` and ``enodes[1][0]`` are both BOS instances
        immediately after construction.
        """
        lattice = Lattice(5, SYS_DIC)
        self.assertEqual(7, len(lattice.snodes))
        # assertIsInstance reports the offending object and expected class on failure.
        self.assertIsInstance(lattice.snodes[0][0], BOS)
        self.assertEqual(8, len(lattice.enodes))
        self.assertIsInstance(lattice.enodes[1][0], BOS)

    def test_add_forward_end(self):
        s = u'すもも'
        lattice = Lattice(len(s), SYS_DIC)
        entries = SYS_DIC.lookup(s.encode('utf8'))
        for entry in entries:
            lattice.add(SurfaceNode(entry))