def gen_word(self, text):
    """Run the segmentation pipeline on ``text`` and keep every stage on ``self``.

    Sets ``self.text``, ``self.word_net`` (the coarse word net filled by
    ``gen_word_net``), ``self.vertexs`` (the result of ``viterbi`` over the
    coarse net's vertexs) and ``self.word_net_optimum`` (a second ``WordNet``
    built from those vertexs).
    """
    self.text = text

    # Coarse segmentation word net.
    coarse_net = WordNet(text)
    self.word_net = coarse_net
    gen_word_net(text, coarse_net)

    # Viterbi pass over the coarse net.
    best_path = viterbi(coarse_net.vertexs)
    self.vertexs = best_path

    self.word_net_optimum = WordNet(text, vertexs=best_path)
def setUp(self):
    """Prepare the shared fixture: segment a fixed sentence once per test.

    Stores the sentence, the coarse word net, the vertexs returned by
    ``viterbi`` and a ``WordNet`` rebuilt from those vertexs on ``self``.
    """
    sentence = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
    self.text = sentence

    # Coarse segmentation word net.
    coarse_net = WordNet(sentence)
    self.word_net = coarse_net
    gen_word_net(sentence, coarse_net)

    # Viterbi pass over the coarse net.
    best_path = viterbi(coarse_net.vertexs)
    self.vertexs = best_path

    self.word_net_optimum = WordNet(sentence, vertexs=best_path)
def setUp(self):
    """Prepare the shared fixture, including the custom-dictionary merge.

    Segments a fixed sentence, runs ``viterbi`` over the coarse word net,
    passes the result through ``combine_by_custom_dict`` with the trie of a
    fresh ``CustomDict``, and stores each stage on ``self``.
    """
    sentence = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
    self.text = sentence

    # Coarse segmentation word net.
    coarse_net = WordNet(sentence)
    self.word_net = coarse_net
    gen_word_net(sentence, coarse_net)

    # Viterbi pass, then merge using the custom dictionary trie.
    self.vertexs = viterbi(coarse_net.vertexs)
    custom_trie = CustomDict().trie
    self.vertexs = combine_by_custom_dict(self.vertexs, custom_trie)

    self.word_net_optimum = WordNet(sentence, vertexs=self.vertexs)
def test_combin_by_dict(self):
    """Check that ``combine_by_custom_dict`` merges single-character vertexs.

    A trie holding the four single characters is used to build the word net;
    a second trie holding the whole four-character word is then used to
    combine the per-character vertexs.
    """
    dat = DoubleArrayTrie()
    # Every key must be a unicode literal: a bare "..." literal is a byte
    # string on Python 2 (without unicode_literals) and would not compare
    # equal to the characters of the unicode text below.
    dat.build([u"江", u"河", u"湖", u"海"])

    text = u"江河湖海"
    word_net = WordNet(text)
    gen_word_net(text, word_net, dat)

    # Take the first candidate of every row of the word net.
    vertexs = [v[0] for v in word_net.vertexs]
    self.assertEqual(len(word_net), 6, u"自定义字典分词")

    combin_dat = DoubleArrayTrie()
    combin_dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
    vertexs = combine_by_custom_dict(vertexs, combin_dat)

    # After merging, only three vertexs are expected to remain.
    self.assertEqual(len(vertexs), 3, u"合并完成后应该只有前尾加中间词")
def test_recognition(self):
    """Check that person-name recognition yields the expected ``nr`` vertexs.

    The sentence is segmented, ``person_recognition.recognition`` is applied
    to the optimum word net, and ``viterbi`` is run again over that net; each
    expected name must then be present in the final vertexs.
    """
    sentence = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂、张晓辉等一同会见了参加签约的企业家。"

    # Coarse segmentation word net.
    coarse_net = WordNet(sentence)
    gen_word_net(sentence, coarse_net)

    # First Viterbi pass, then person-name recognition on the optimum net.
    first_pass = viterbi(coarse_net.vertexs)
    optimum_net = WordNet(sentence, vertexs=first_pass)
    person_recognition.recognition(first_pass, optimum_net, coarse_net)

    # Second Viterbi pass over the optimum net after recognition.
    final_path = viterbi(optimum_net.vertexs)

    expected_names = (u"秦光荣", u"李纪恒", u"仇和", u"王春桂", u"张晓辉")
    for person in expected_names:
        self.assertIn(Vertex(person, attribute=u"nr 1"), final_path)

    print(final_path)
def test_gen_word_net(self):
    """Check the shape of the coarse word net built for a known sentence.

    The net must hold ``len(text) + 2`` rows, and rows 1..7 must hold the
    expected number of candidate words.
    """
    text = u"一举成名天下知"
    word_net = WordNet(text)
    gen_word_net(text, word_net)

    # Two rows more than there are characters; presumably begin/end
    # sentinel rows -- confirm against WordNet.
    self.assertEqual(len(word_net.vertexs), len(text) + 2)

    # Expected candidates per row (rows are 1-based here):
    #   1: 一举 一举成名
    #   2: 举
    #   3: 成 成名
    #   4: 名
    #   5: 天 天下
    #   6: 下
    #   7: 知
    expected_counts = [2, 1, 2, 1, 2, 1, 1]
    for row, expected in enumerate(expected_counts, 1):
        actual = len(word_net.vertexs[row])
        self.assertEqual(
            actual,
            expected,
            "row %d: expected %d candidates, got %d" % (row, expected, actual),
        )
def seg_to_vertexs(text):
    """Segment ``text`` and return the resulting vertexs.

    Steps, each optional one gated by a ``Config`` flag:
    coarse word net -> ``viterbi`` -> custom-dictionary merge
    (``use_custom_dict``) -> person (``name_recognize``) and place
    (``place_recognize``) recognition on the optimum net -> ``viterbi``
    again -> organization recognition (``org_recognize``).
    When ``Config.debug`` is set, the intermediate word nets are printed.
    """

    def _debug_dump(title, net):
        # Print a titled dump of a word net, only when debugging is enabled.
        if Config.debug:
            print(title)
            print(unicode(net))

    # Coarse segmentation word net.
    coarse_net = WordNet(text)
    gen_word_net(text, coarse_net)
    _debug_dump(u"打印粗分词网:", coarse_net)

    # Viterbi pass, optionally followed by the custom-dictionary merge.
    path = viterbi(coarse_net.vertexs)
    if Config.use_custom_dict:
        path = combine_by_custom_dict(path)

    # Person / place recognition work on the optimum net built from the path.
    optimum_net = WordNet(text, vertexs=path)
    if Config.name_recognize:
        person_recognition.recognition(path, optimum_net, coarse_net)
    if Config.place_recognize:
        place_recognition.recognition(path, optimum_net, coarse_net)
    _debug_dump(u"打印人名、地名识别词网:", optimum_net)

    path = viterbi(optimum_net.vertexs)
    if not Config.org_recognize:
        return path

    # Organization recognition runs on a fresh net built from the latest path
    # and returns the vertexs to use.
    optimum_net = WordNet(text, vertexs=path)
    path = organization_recognition.recognition(path, optimum_net, coarse_net)
    _debug_dump(u"打印人组织识别词网:", optimum_net)
    return path
def test_gen_word_net_include_num(self):
    """Check the coarse word net built for an all-digit text.

    The net must hold ``len(text) + 2`` rows and none of them may be empty.
    """
    text = u"123456"
    word_net = WordNet(text)
    gen_word_net(text, word_net)

    # Two rows more than there are characters; presumably begin/end
    # sentinel rows -- confirm against WordNet.
    self.assertEqual(len(word_net.vertexs), len(text) + 2)

    # No row of the raw word net may be an empty list.
    self.assertNotIn([], word_net.vertexs, u"原始词网,不能可能有空节点")