Ejemplo n.º 1
0
 def test_parse_splitinfo_invalid_wordid_userdict(self):
     """Check that ``parse_splitinfo('0/U1')`` raises ``ValueError('invalid word ID')``.

     The builder is a ``UserDictionaryBuilder`` with no grammar and a mocked
     lexicon whose ``size()`` returns 1; no entries are added to the builder.
     """
     lexicon_stub = mock.Mock(spec=Lexicon)
     lexicon_stub.size.return_value = 1
     user_builder = UserDictionaryBuilder(None, lexicon_stub)

     with self.assertRaises(ValueError) as raised:
         user_builder.parse_splitinfo('0/U1')

     message = raised.exception.args[0]
     self.assertEqual('invalid word ID', message)
 def test_parseline_with_userdefined_POS(self):
     """Parse a single lexicon row and check the builder's POS table size.

     After ``parse_line`` has consumed the row, ``pos_table.get_list()`` is
     expected to contain exactly one element.
     """
     row = '田中,0,0,0,田中,存在,しない,品詞,*,*,*,タナカ,田中,*,A,*,*,*\n'
     user_builder = UserDictionaryBuilder(
         self.grammar, self.lexicon_set, logger=self.logger)

     # parse_line receives the already-split columns, not the raw string.
     user_builder.parse_line(row.split(','))

     registered_pos = user_builder.pos_table.get_list()
     self.assertEqual(1, len(registered_pos))
Ejemplo n.º 3
0
 def test_parse_splitinfo_invalid_system_wordid_in_userdict(self):
     """Check that ``parse_splitinfo('1/U0')`` raises ``ValueError('invalid word id')``.

     The mocked lexicon reports a size of 1 and the builder holds a single
     placeholder entry before the split info is parsed.
     """
     lexicon_stub = mock.Mock(spec=Lexicon)
     lexicon_stub.size.return_value = 1
     user_builder = UserDictionaryBuilder(None, lexicon_stub)
     user_builder.entries.append(None)

     with self.assertRaises(ValueError) as raised:
         user_builder.parse_splitinfo('1/U0')

     message = raised.exception.args[0]
     self.assertEqual('invalid word id', message)
Ejemplo n.º 4
0
    def test_parse_splitinfo(self):
        """Exercise ``parse_splitinfo`` on both builder classes.

        A ``DictionaryBuilder`` with four placeholder entries must return an
        empty list for ``'*'``, the plain ids for ``'1/2/3'``, and ``2`` as the
        second id of ``'1/U2/3'``. A ``UserDictionaryBuilder`` with three
        placeholder entries and a mocked lexicon of size 4 must return
        ``2 | (1 << 28)`` for the ``U2`` component of the same string.
        """
        system_builder = DictionaryBuilder(logger=self.logger)
        system_builder.entries.extend([None, None, None, None])
        self.assertEqual([], system_builder.parse_splitinfo('*'))
        self.assertEqual([1, 2, 3], system_builder.parse_splitinfo('1/2/3'))
        second_id = system_builder.parse_splitinfo('1/U2/3')[1]
        self.assertEqual(2, second_id)

        lexicon_stub = mock.Mock(spec=Lexicon)
        lexicon_stub.size.return_value = 4
        user_builder = UserDictionaryBuilder(None, lexicon_stub)
        user_builder.entries += [None] * 3
        # Expected encoding of "U2" from the user builder: id 2 with bit 28 set.
        flagged_id = 2 | (1 << 28)
        self.assertEqual([1, flagged_id, 3],
                         user_builder.parse_splitinfo("1/U2/3"))
    def test_build(self):
        """Build a user dictionary from a two-line lexicon and read it back.

        The lexicon file is written into ``self.test_dir`` and built on top of
        the grammar and lexicon set loaded from ``self.dict_filename``. The
        resulting binary is then reloaded with
        ``TestDictionaryBuilder.read_system_dictionary`` so that the header
        and both word entries can be checked field by field.
        """
        out_path = os.path.join(self.test_dir, 'output.txt')
        in_path = os.path.join(self.test_dir, 'input.txt')

        with open(in_path, 'w', encoding='utf-8') as wf:
            wf.write(
                "東京都市,0,0,0,東京都市,名詞,固有名詞,地名,一般,*,*,ヒガシキョウトシ,東京都市,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/3/U1\",*,\"4/3/市,名詞,普通名詞,一般,*,*,*,シ\"\n"
            )
            wf.write('市,-1,-1,0,市,名詞,普通名詞,一般,*,*,*,シ,市,*,A,*,*,*\n')

        _, _, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(
            self.dict_filename)
        header = DictionaryHeader(SYSTEM_DICT_VERSION, int(time.time()),
                                  'test')
        builder = UserDictionaryBuilder(grammar,
                                        lexicon_set,
                                        logger=self.logger)
        # The context manager closes the output file even when writing the
        # header or building raises, so a failing run does not leak the handle.
        with open(out_path, 'wb') as out_stream:
            out_stream.write(header.to_bytes())
            builder.build([in_path], None, out_stream)

        buffers, header, grammar, lexicon_set = TestDictionaryBuilder.read_system_dictionary(
            out_path)
        lexicon = lexicon_set.lexicons[0]

        # header
        self.assertEqual(SYSTEM_DICT_VERSION, header.version)
        self.assertEqual('test', header.description)

        # lexicon: first entry (東京都市)
        self.assertEqual(0, lexicon.get_left_id(0))
        self.assertEqual(0, lexicon.get_cost(0))
        wi = lexicon.get_word_info(0)
        self.assertEqual('東京都市', wi.surface)
        self.assertEqual('東京都市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('ヒガシキョウトシ', wi.reading_form)
        self.assertEqual(3, wi.pos_id)
        # The "U1" reference in the input is expected back as 1 | (1 << 28).
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        self.assertEqual([4, 3, 1 | (1 << 28)], wi.word_structure)
        # Looking up the full surface must yield exactly one hit: word id 0
        # paired with the UTF-8 byte length of the surface.
        lst = lexicon.lookup('東京都市'.encode('utf-8'), 0)
        self.assertEqual((0, len('東京都市'.encode('utf-8'))), next(lst))
        with self.assertRaises(StopIteration):
            next(lst)

        # lexicon: second entry (市)
        self.assertEqual(-1, lexicon.get_left_id(1))
        self.assertEqual(0, lexicon.get_cost(1))
        wi = lexicon.get_word_info(1)
        self.assertEqual('市', wi.surface)
        self.assertEqual('市', wi.normalized_form)
        self.assertEqual(-1, wi.dictionary_form_word_id)
        self.assertEqual('シ', wi.reading_form)
        self.assertEqual(4, wi.pos_id)
        self.assertEqual([], wi.a_unit_split)
        self.assertEqual([], wi.b_unit_split)
        # '東' is not a surface in the built lexicon, so the lookup is empty.
        lst = lexicon.lookup('東'.encode('utf-8'), 0)
        with self.assertRaises(StopIteration):
            next(lst)