Beispiel #1
0
    def test_property_types(self):
        """Check the exact Python types of dictionary entry fields.

        Covers the first entry returned by ``lookup`` on a
        ``SystemDictionary``, a ``MMapSystemDictionary`` and a
        ``UserDictionary``, the record returned by ``lookup_extra``, and the
        first ``HIRAGANA`` entry of ``sys_dic.unknowns``.

        Types are compared by identity (``type(x) is T``), so an instance of
        a subclass does not satisfy a check.
        """
        # (index, expected type) pairs, listed in the order they are checked.
        entry_types = ((1, str), (0, int), (2, int), (3, int), (4, int))
        extra_types = tuple((index, str) for index in range(6))
        unknown_types = ((3, str), (0, int), (1, int), (2, int))

        def assert_field_types(record, expected_types):
            # assertIs reports both type objects on failure, whereas
            # assertTrue(type(x) is T) only reports "False is not true".
            for index, expected in expected_types:
                self.assertIs(type(record[index]), expected,
                              'unexpected type at index %d' % index)

        sys_dic = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)
        # entry in the system dictionary
        entry = sys_dic.lookup('すもも'.encode('utf8'))[0]
        assert_field_types(entry, entry_types)

        # entry[0] is the key passed to lookup_extra
        entry_extra = sys_dic.lookup_extra(entry[0])
        assert_field_types(entry_extra, extra_types)

        # unknown entry
        entry = sys_dic.unknowns.get(u'HIRAGANA')[0]
        assert_field_types(entry, unknown_types)

        # mmap dict entry
        mmap_dic = MMapSystemDictionary(all_fstdata(), mmap_entries(),
                                        connections, chardef.DATA,
                                        unknowns.DATA)
        entry = mmap_dic.lookup(u'すもも'.encode('utf8'))[0]
        assert_field_types(entry, entry_types)

        entry_extra = mmap_dic.lookup_extra(entry[0])
        assert_field_types(entry_extra, extra_types)

        # entry in the user defined dictionary
        user_dic = UserDictionary(user_dict=os.path.join(
            parent_dir, 'tests/user_ipadic.csv'),
                                  enc='utf8',
                                  type='ipadic',
                                  connections=connections)
        entry = user_dic.lookup('東京スカイツリー'.encode('utf8'))[0]
        assert_field_types(entry, entry_types)
Beispiel #2
0
    def test_system_dictionary_cache(self):
        """Check that repeated lookups return the same number of entries.

        All lookups run against a single ``SystemDictionary``; most surfaces
        are looked up twice in a row (presumably so the second call is
        answered from the dictionary's lookup cache -- confirm against
        ``SystemDictionary.lookup``).
        """
        dictionary = SystemDictionary(all_fstdata(), entries(), connections,
                                      chardef.DATA, unknowns.DATA)

        # (expected number of entries, surface), in the order they are queried
        lookups = [
            (11, '小書き'),
            (11, '小書き'),
            (11, '小書きにしました'),
            (10, 'みんなと'),
            (10, 'みんなと'),
            (2, '叩く'),
            (2, '叩く'),
        ]
        for expected_count, surface in lookups:
            found = dictionary.lookup(surface.encode('utf8'))
            self.assertEqual(expected_count, len(found))
Beispiel #3
0
 def test_system_dictionary_ipadic(self):
     """Exercise lookup, transition cost, character categories and
     unknown-word settings of a ``SystemDictionary``.
     """
     dictionary = SystemDictionary(all_fstdata(), entries(), connections,
                                   chardef.DATA, unknowns.DATA)

     matches = dictionary.lookup('形態素'.encode('utf-8'))
     self.assertEqual(7, len(matches))
     self.assertEqual(1, dictionary.get_trans_cost(0, 1))

     # (character, expected category mapping), checked in this order.
     # NOTE(review): several characters appear twice in a row; the second
     # of each pair may have been meant as a half-/full-width variant --
     # confirm against the upstream test before de-duplicating.
     category_cases = [
         ('は', {'HIRAGANA': []}),
         ('ハ', {'KATAKANA': []}),
         ('ハ', {'KATAKANA': []}),
         ('葉', {'KANJI': []}),
         ('C', {'ALPHA': []}),
         ('C', {'ALPHA': []}),
         ('#', {'SYMBOL': []}),
         ('#', {'SYMBOL': []}),
         ('5', {'NUMERIC': []}),
         ('5', {'NUMERIC': []}),
         ('五', {'KANJI': [], 'KANJINUMERIC': ['KANJI']}),
         ('Γ', {'GREEK': []}),
         ('Б', {'CYRILLIC': []}),
         ('𠮷', {'DEFAULT': []}),
         ('한', {'DEFAULT': []}),
     ]
     for char, expected in category_cases:
         self.assertEqual(expected, dictionary.get_char_categories(char))

     # unknown-word handling flags and length per category
     self.assertTrue(dictionary.unknown_invoked_always('ALPHA'))
     self.assertFalse(dictionary.unknown_invoked_always('KANJI'))
     self.assertTrue(dictionary.unknown_grouping('NUMERIC'))
     self.assertFalse(dictionary.unknown_grouping('KANJI'))
     self.assertEqual(2, dictionary.unknown_length('HIRAGANA'))
Beispiel #4
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys

# TODO: better way to find package...
# parent_dir is the directory two levels above this file; it is put at the
# front of sys.path so the imports below are resolved against it first.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from sysdic import entries, mmap_entries, connections, chardef, unknowns
import unittest

# Dictionaries built once at import time; SYS_DIC is shared by the tests
# below.
SYS_DIC = SystemDictionary(entries(), connections, chardef.DATA, unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(mmap_entries(), connections, chardef.DATA,
                                    unknowns.DATA)


class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """Check the node lists of a freshly built ``Lattice(5, SYS_DIC)``.

        Expects ``snodes`` to have length 7 and ``enodes`` length 8, with a
        ``BOS`` node at ``snodes[0][0]`` and at ``enodes[1][0]``.
        """
        lattice = Lattice(5, SYS_DIC)
        self.assertEqual(7, len(lattice.snodes))
        # assertIsInstance names the offending object on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(lattice.snodes[0][0], BOS)
        self.assertEqual(8, len(lattice.enodes))
        self.assertIsInstance(lattice.enodes[1][0], BOS)

    def test_add_forward_end(self):
        s = u'すもも'
        lattice = Lattice(len(s), SYS_DIC)
Beispiel #5
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys

# TODO: better way to find package...
# parent_dir is the directory two levels above this file; it is put at the
# front of sys.path so the imports below are resolved against it first.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)

from janome.lattice import *
from janome.dic import SystemDictionary, MMapSystemDictionary
from sysdic import all_fstdata, entries, mmap_entries, connections, chardef, unknowns
import unittest

# Dictionaries built once at import time; SYS_DIC is shared by the tests
# below.
SYS_DIC = SystemDictionary(all_fstdata(), entries(), connections, chardef.DATA,
                           unknowns.DATA)
MMAP_SYS_DIC = MMapSystemDictionary(all_fstdata(), mmap_entries(), connections,
                                    chardef.DATA, unknowns.DATA)


class TestLattice(unittest.TestCase):
    def test_initialize_lattice(self):
        """Check the node lists of a freshly built ``Lattice(5, SYS_DIC)``.

        Expects ``snodes`` to have length 7 and ``enodes`` length 8, with a
        ``BOS`` node at ``snodes[0][0]`` and at ``enodes[1][0]``.
        """
        lattice = Lattice(5, SYS_DIC)
        self.assertEqual(7, len(lattice.snodes))
        # assertIsInstance names the offending object on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(lattice.snodes[0][0], BOS)
        self.assertEqual(8, len(lattice.enodes))
        self.assertIsInstance(lattice.enodes[1][0], BOS)

    def test_add_forward_end(self):
        s = u'すもも'
        lattice = Lattice(len(s), SYS_DIC)