Esempio n. 1
0
 def test_system_dictionary_ipadic(self):
     """Check lookup, transition cost, character categories and the
     unknown-word settings exposed by the system dictionary."""
     dictionary = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)

     # The surface form is passed to lookup() as UTF-8 encoded bytes.
     matches = dictionary.lookup(u'形態素'.encode('utf-8'))
     self.assertEqual(7, len(matches))
     self.assertEqual(1, dictionary.get_trans_cost(0, 1))

     # (character, expected category mapping), asserted in this order.
     # NOTE(review): several rows are exact repeats of the previous row;
     # they are kept as found. They may once have covered half-/full-width
     # variants of the same character -- confirm against the upstream test.
     category_cases = [
         (u'は', {'HIRAGANA': []}),
         (u'ハ', {'KATAKANA': []}),
         (u'ハ', {'KATAKANA': []}),
         (u'葉', {'KANJI': []}),
         (u'C', {'ALPHA': []}),
         (u'C', {'ALPHA': []}),
         (u'#', {'SYMBOL': []}),
         (u'#', {'SYMBOL': []}),
         (u'5', {'NUMERIC': []}),
         (u'5', {'NUMERIC': []}),
         (u'五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
         (u'Γ', {'GREEK': []}),
         (u'Б', {'CYRILLIC': []}),
         (u'𠮷', {'DEFAULT': []}),
         (u'한', {'DEFAULT': []}),
     ]
     for char, expected in category_cases:
         self.assertEqual(expected, dictionary.get_char_categories(char))

     # Unknown-word handling flags and lengths per character category.
     self.assertTrue(dictionary.unknown_invoked_always('ALPHA'))
     self.assertFalse(dictionary.unknown_invoked_always('KANJI'))
     self.assertTrue(dictionary.unknown_grouping('NUMERIC'))
     self.assertFalse(dictionary.unknown_grouping('KANJI'))
     self.assertEqual(2, dictionary.unknown_length('HIRAGANA'))
Esempio n. 2
0
    def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_length=1024, wakati=False, mmap=False):
        """
        Set up the system dictionary and, optionally, a user dictionary.

        :param udic: (Optional) user dictionary: a path ending in '.csv' is built as a CSV user dictionary,
                     an existing directory is loaded as a compiled dictionary; anything else is ignored.
        :param udic_enc: (Optional) character encoding for the user dictionary. default is 'utf8'
        :param udic_type: (Optional) user dictionary type, passed to UserDictionary. default is 'ipadic'
        :param max_unknown_length: (Optional) max unknown word length. default is 1024.
        :param wakati: (Optional) stored on the instance and forwarded to the system dictionary entry loader.
        :param mmap: (Optional) if True use MMapSystemDictionary instead of SystemDictionary.

        .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary.
        """
        self.wakati = wakati

        # Pick the dictionary class together with its matching entry loader.
        if mmap:
            dic_class, load_entries = MMapSystemDictionary, mmap_entries
        else:
            dic_class, load_entries = SystemDictionary, entries
        self.sys_dic = dic_class(load_entries(wakati), connections, chardef.DATA, unknowns.DATA)

        # No user dictionary unless `udic` names a CSV file or a directory.
        user_dic = None
        if udic and udic.endswith('.csv'):
            # build user dictionary from CSV
            user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
        elif udic and os.path.isdir(udic):
            # load compiled user dictionary
            user_dic = CompiledUserDictionary(udic, connections)
        self.user_dic = user_dic

        self.max_unknown_length = max_unknown_length
Esempio n. 3
0
    def test_property_types(self):
        """Verify the Python types of the fields of system, unknown and
        user dictionary entries."""
        # `unicode` exists only on Python 2; the conditional expression keeps
        # the name from being evaluated on Python 3.
        text_type = str if PY3 else unicode
        text_fields = (0, 4, 5, 6, 7, 8, 9)
        int_fields = (1, 2, 3)

        sys_dic = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

        # entry in the system dictionary
        entry = sys_dic.lookup(u'すもも')[0]
        for idx in text_fields:
            self.assertTrue(type(entry[idx]) is text_type)
        for idx in int_fields:
            self.assertTrue(type(entry[idx]) is int)

        # unknown entry: field 3 is text, fields 0-2 are integers
        entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
        self.assertTrue(type(entry[3]) is text_type)
        for idx in (0, 1, 2):
            self.assertTrue(type(entry[idx]) is int)

        # entry in the user defined dictionary
        csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
        user_dic = UserDictionary(user_dict=csv_path,
                                  enc='utf8', type='ipadic', connections=connections())
        entry = user_dic.lookup(u'東京スカイツリー')[0]
        for idx in text_fields:
            self.assertTrue(type(entry[idx]) is text_type)
        for idx in int_fields:
            self.assertTrue(type(entry[idx]) is int)
Esempio n. 4
0
    def test_system_dictionary_cache(self):
        """Look up the same surface forms repeatedly and check that the
        number of results stays the same."""
        dictionary = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

        # (expected number of entries, text), looked up in this order.
        # Repeated rows are intentional: the same text is queried again on
        # the same dictionary instance (presumably served from its cache).
        lookups = [
            (11, u'小書き'),
            (11, u'小書き'),
            (11, u'小書きにしました'),
            (10, u'みんなと'),
            (10, u'みんなと'),
            (2, u'叩く'),
            (2, u'叩く'),
        ]
        for expected_count, text in lookups:
            self.assertEqual(expected_count, len(dictionary.lookup(text)))
Esempio n. 5
0
 def test_load_all_fst_data_from_package(self):
     """Build the system dictionary with the package-based FST loader
     swapped in for ``janome.dic.load_all_fstdata`` and run one lookup.

     The original loader is always put back, whether the test passes or
     fails, so the patch cannot leak into other tests.
     """
     # Py2.7 doesn't have unittest.mock, so manually replace the function
     original_loader = janome.dic.load_all_fstdata
     janome.dic.load_all_fstdata = janome.dic.load_all_fstdata_from_package
     try:
         sys_dic = SystemDictionary(entries(), connections, chardef.DATA,
                                    unknowns.DATA)
         self.assertEqual(11, len(sys_dic.lookup(u'小書き'.encode('utf8'))))
     finally:
         # Restore on success as well as on failure; restoring only in an
         # except clause left the module patched after a passing run.
         janome.dic.load_all_fstdata = original_loader
Esempio n. 6
0
    def test_system_dictionary_cache(self):
        """Look up the same UTF-8 encoded surface forms repeatedly and check
        that the number of results stays the same."""
        dictionary = SystemDictionary(all_fstdata(), entries(), connections,
                                      chardef.DATA, unknowns.DATA)

        # (expected number of entries, text), looked up in this order.
        # Repeated rows are intentional: the same text is queried again on
        # the same dictionary instance (presumably served from its cache).
        lookups = [
            (11, u'小書き'),
            (11, u'小書き'),
            (11, u'小書きにしました'),
            (10, u'みんなと'),
            (10, u'みんなと'),
            (2, u'叩く'),
            (2, u'叩く'),
        ]
        for expected_count, text in lookups:
            found = dictionary.lookup(text.encode('utf8'))
            self.assertEqual(expected_count, len(found))
Esempio n. 7
0
    def __init__(self,
                 udic='',
                 udic_enc='utf8',
                 udic_type='ipadic',
                 max_unknown_length=1024,
                 wakati=False,
                 mmap=False,
                 dotfile=''):
        """
        Set up the system dictionary and, optionally, a user dictionary.

        :param udic: (Optional) user dictionary: a path ending in '.csv' is built as a CSV user dictionary,
                     an existing directory is loaded as a compiled dictionary; anything else is ignored.
        :param udic_enc: (Optional) character encoding for the user dictionary. default is 'utf8'
        :param udic_type: (Optional) user dictionary type, passed to UserDictionary. default is 'ipadic'
        :param max_unknown_length: (Optional) max unknown word length. default is 1024.
        :param wakati: (Optional) stored on the instance and forwarded to the system dictionary entry loader.
        :param mmap: (Optional) if True use MMapSystemDictionary instead of SystemDictionary.
        :param dotfile: (Optional) accepted but not used anywhere in this constructor.

        .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary.
        """
        self.wakati = wakati

        # Pick the dictionary class together with its matching entry loader.
        if mmap:
            dic_class, load_entries = MMapSystemDictionary, mmap_entries
        else:
            dic_class, load_entries = SystemDictionary, entries
        self.sys_dic = dic_class(load_entries(wakati), connections,
                                 chardef.DATA, unknowns.DATA)

        # No user dictionary unless `udic` names a CSV file or a directory.
        user_dic = None
        if udic and udic.endswith('.csv'):
            # build user dictionary from CSV
            user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
        elif udic and os.path.isdir(udic):
            # load compiled user dictionary
            user_dic = CompiledUserDictionary(udic, connections)
        self.user_dic = user_dic

        self.max_unknown_length = max_unknown_length
Esempio n. 8
0
 def test_system_dictionary_ipadic(self):
     """Check lookup, transition cost, character categories and the
     unknown-word settings exposed by the system dictionary."""
     dictionary = SystemDictionary(entries(), connections(), chardef.DATA, unknowns.DATA)

     matches = dictionary.lookup(u'形態素')
     self.assertEqual(7, len(matches))
     self.assertEqual(1, dictionary.get_trans_cost(0, 1))

     # (character, expected category mapping), asserted in this order.
     # NOTE(review): several rows are exact repeats of the previous row;
     # they are kept as found. They may once have covered half-/full-width
     # variants of the same character -- confirm against the upstream test.
     category_cases = [
         (u'は', {'HIRAGANA': []}),
         (u'ハ', {'KATAKANA': []}),
         (u'ハ', {'KATAKANA': []}),
         (u'葉', {'KANJI': []}),
         (u'C', {'ALPHA': []}),
         (u'C', {'ALPHA': []}),
         (u'#', {'SYMBOL': []}),
         (u'#', {'SYMBOL': []}),
         (u'5', {'NUMERIC': []}),
         (u'5', {'NUMERIC': []}),
         (u'五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
         (u'Γ', {'GREEK': []}),
         (u'Б', {'CYRILLIC': []}),
     ]
     for char, expected in category_cases:
         self.assertEqual(expected, dictionary.get_char_categories(char))

     # NOTE(review): `unkown_invoked_always` (sic) is the method name this
     # test calls; the spelling is left as is -- confirm it matches the
     # SystemDictionary API in use.
     self.assertTrue(dictionary.unkown_invoked_always('ALPHA'))
     self.assertFalse(dictionary.unkown_invoked_always('KANJI'))
     self.assertTrue(dictionary.unknown_grouping('NUMERIC'))
     self.assertFalse(dictionary.unknown_grouping('KANJI'))
     self.assertEqual(2, dictionary.unknown_length('HIRAGANA'))
Esempio n. 9
0
    def test_property_types(self):
        """Verify the Python types of the fields of system, unknown,
        memory-mapped and user dictionary entries."""
        # `unicode` exists only on Python 2; the conditional expression keeps
        # the name from being evaluated on Python 3.
        text_type = str if PY3 else unicode

        def check_types(record, text_fields, int_fields=()):
            # Text fields are asserted first, then the integer fields.
            for idx in text_fields:
                self.assertTrue(type(record[idx]) is text_type)
            for idx in int_fields:
                self.assertTrue(type(record[idx]) is int)

        entry_text_fields = (1,)
        entry_int_fields = (0, 2, 3, 4)
        extra_text_fields = (0, 1, 2, 3, 4, 5)

        sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)

        # entry in the system dictionary
        entry = sys_dic.lookup(u'すもも'.encode('utf8'))[0]
        check_types(entry, entry_text_fields, entry_int_fields)

        # extra fields are fetched using the first field of the entry
        check_types(sys_dic.lookup_extra(entry[0]), extra_text_fields)

        # unknown entry: field 3 is text, fields 0-2 are integers
        entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
        check_types(entry, (3,), (0, 1, 2))

        # mmap dict entry
        mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(),
                                        connections, chardef.DATA,
                                        unknowns.DATA)
        entry = mmap_dic.lookup(u'すもも'.encode('utf8'))[0]
        check_types(entry, entry_text_fields, entry_int_fields)
        check_types(mmap_dic.lookup_extra(entry[0]), extra_text_fields)

        # entry in the user defined dictionary
        csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
        user_dic = UserDictionary(user_dict=csv_path,
                                  enc='utf8',
                                  type='ipadic',
                                  connections=connections)
        entry = user_dic.lookup(u'東京スカイツリー'.encode('utf8'))[0]
        check_types(entry, entry_text_fields, entry_int_fields)
Esempio n. 10
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys

# TODO: better way to find package...
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from sysdic import entries, mmap_entries, connections, chardef, unknowns
import unittest

# Dictionaries shared by every test in this module: one backed by the
# regular entry loader and one backed by the memory-mapped entry loader.
SYS_DIC = SystemDictionary(
    entries(),
    connections,
    chardef.DATA,
    unknowns.DATA,
)
MMAP_SYS_DIC = MMapSystemDictionary(
    mmap_entries(),
    connections,
    chardef.DATA,
    unknowns.DATA,
)


class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """A fresh lattice of size 5 has 7 start-node slots and 8 end-node
        slots, with BOS at snodes[0][0] and enodes[1][0]."""
        lattice = Lattice(5, SYS_DIC)

        start_nodes = lattice.snodes
        self.assertEqual(7, len(start_nodes))
        self.assertIsInstance(start_nodes[0][0], BOS)

        end_nodes = lattice.enodes
        self.assertEqual(8, len(end_nodes))
        self.assertIsInstance(end_nodes[1][0], BOS)

    def test_add_forward_end(self):
        s = u'すもも'
        lattice = Lattice(len(s), SYS_DIC)
Esempio n. 11
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys

# TODO: better way to find package...
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns
import unittest

# Dictionaries shared by every test in this module: one backed by the
# regular entry loader and one backed by the memory-mapped entry loader.
SYS_DIC = SystemDictionary(
    all_fstdata(),
    entries(),
    connections,
    chardef.DATA,
    unknowns.DATA,
)
MMAP_SYS_DIC = MMapSystemDictionary(
    all_fstdata(),
    mmap_entries(),
    connections,
    chardef.DATA,
    unknowns.DATA,
)


class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """A fresh lattice of size 5 has 7 start-node slots and 8 end-node
        slots, with BOS at snodes[0][0] and enodes[1][0]."""
        lattice = Lattice(5, SYS_DIC)

        start_nodes = lattice.snodes
        self.assertEqual(7, len(start_nodes))
        self.assertIsInstance(start_nodes[0][0], BOS)

        end_nodes = lattice.enodes
        self.assertEqual(8, len(end_nodes))
        self.assertIsInstance(end_nodes[1][0], BOS)

    def test_add_forward_end(self):
        s = u'すもも'
Esempio n. 12
0
    def test_property_types(self):
        """Verify the Python types of the fields of system, unknown and
        user dictionary entries."""
        # `unicode` exists only on Python 2; the conditional expression keeps
        # the name from being evaluated on Python 3.
        text_type = str if PY3 else unicode

        def check_types(record, text_fields, int_fields):
            # Text fields are asserted first, then the integer fields.
            for idx in text_fields:
                self.assertTrue(type(record[idx]) is text_type)
            for idx in int_fields:
                self.assertTrue(type(record[idx]) is int)

        entry_text_fields = (0, 4, 5, 6, 7, 8, 9)
        entry_int_fields = (1, 2, 3)

        sys_dic = SystemDictionary(entries(), connections(), chardef.DATA,
                                   unknowns.DATA)

        # entry in the system dictionary
        entry = sys_dic.lookup(u'すもも')[0]
        check_types(entry, entry_text_fields, entry_int_fields)

        # unknown entry: field 3 is text, fields 0-2 are integers
        entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
        check_types(entry, (3,), (0, 1, 2))

        # entry in the user defined dictionary
        csv_path = os.path.join(parent_dir, 'tests/user_ipadic.csv')
        user_dic = UserDictionary(user_dict=csv_path,
                                  enc='utf8',
                                  type='ipadic',
                                  connections=connections())
        entry = user_dic.lookup(u'東京スカイツリー')[0]
        check_types(entry, entry_text_fields, entry_int_fields)
Esempio n. 13
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys

# TODO: better way to find package...
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from sysdic import entries, mmap_entries, connections, chardef, unknowns
import unittest

# Dictionaries shared by every test in this module: one backed by the
# regular entry loader and one backed by the memory-mapped entry loader.
SYS_DIC = SystemDictionary(
    entries(),
    connections,
    chardef.DATA,
    unknowns.DATA,
)
MMAP_SYS_DIC = MMapSystemDictionary(
    mmap_entries(),
    connections,
    chardef.DATA,
    unknowns.DATA,
)

class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """A fresh lattice of size 5 has 7 start-node slots and 8 end-node
        slots, with BOS at snodes[0][0] and enodes[1][0]."""
        lattice = Lattice(5, SYS_DIC)
        self.assertEqual(7, len(lattice.snodes))
        self.assertTrue(isinstance(lattice.snodes[0][0], BOS))
        # The expected end-node count is 8, in line with the other copies of
        # this test that use the same Lattice(5, SYS_DIC) setup.
        # NOTE(review): this assertion previously expected 9 -- confirm
        # against Lattice.__init__.
        self.assertEqual(8, len(lattice.enodes))
        self.assertTrue(isinstance(lattice.enodes[1][0], BOS))

    def test_add_forward_end(self):
        s = u'すもも'
        lattice = Lattice(len(s), SYS_DIC)
        entries = SYS_DIC.lookup(s.encode('utf8'))
        for entry in entries: