def setUp(self):
    """Build the fixtures shared by the tests: text, word nets and vertices.

    The sample sentence is turned into a word net, run through ``viterbi``,
    post-processed with the custom dictionary trie, and the resulting
    vertices are wrapped in a second ``WordNet``.
    """
    self.text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
    self.word_net = WordNet(self.text)
    # Coarse segmentation word net.
    gen_word_net(self.text, self.word_net)
    # Viterbi pass over the coarse net.
    viterbi_path = viterbi(self.word_net.vertexs)
    self.vertexs = viterbi_path
    # Merge vertices using the custom dictionary trie.
    self.vertexs = combine_by_custom_dict(self.vertexs, CustomDict().trie)
    self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs)
def test_combin_by_dict(self):
    """Check that ``combine_by_custom_dict`` merges vertices via a trie.

    A trie holding the four single characters is used to generate the word
    net for the text; a second trie holding the whole four-character word
    is then handed to ``combine_by_custom_dict`` and the length of the
    returned vertex list is asserted.
    """
    # NOTE(review): an identically named test is defined next to this one in
    # the file; if both live in the same class the later definition shadows
    # the earlier one -- confirm and remove the duplicate.
    dat = DoubleArrayTrie()
    # Every key is a unicode literal. A bare "..." literal would be a byte
    # ``str`` on Python 2 (unless unicode_literals is in effect), which
    # would mix byte and unicode keys in the same trie.
    dat.build([u"江", u"河", u"湖", u"海"])
    text = u"江河湖海"
    word_net = WordNet(text)
    gen_word_net(text, word_net, dat)
    # Keep the first vertex of every row of the net.
    vertexs = [v[0] for v in word_net.vertexs]
    # Presumably four characters plus a start and an end row -- TODO confirm.
    self.assertEqual(len(word_net), 6, u"自定义字典分词")

    combin_dat = DoubleArrayTrie()
    combin_dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
    vertexs = combine_by_custom_dict(vertexs, combin_dat)
    # Per the assertion message: only head, tail and the merged word remain.
    self.assertEqual(len(vertexs), 3, u"合并完成后应该只有前尾加中间词")
def test_combin_by_dict(self):
    """Check that ``combine_by_custom_dict`` merges vertices via a trie.

    The word net for the text is generated from a trie of the four single
    characters; the vertices are then combined using a second trie that
    contains the full four-character word, and the resulting vertex count
    is asserted.
    """
    # NOTE(review): the same test name is defined immediately before this
    # one in the file; if both belong to one class, this later definition
    # replaces the earlier one -- confirm and drop the duplicate.
    dat = DoubleArrayTrie()
    # Use unicode literals for all keys: on Python 2 a bare "..." literal is
    # a byte ``str`` (unless unicode_literals is in effect), so the original
    # list mixed byte and unicode keys.
    dat.build([u"江", u"河", u"湖", u"海"])
    text = u"江河湖海"
    word_net = WordNet(text)
    gen_word_net(text, word_net, dat)
    # Take the first vertex from each row of the net.
    vertexs = [v[0] for v in word_net.vertexs]
    # Presumably the four characters plus start/end rows -- TODO confirm.
    self.assertEqual(len(word_net), 6, u"自定义字典分词")

    combin_dat = DoubleArrayTrie()
    combin_dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
    vertexs = combine_by_custom_dict(vertexs, combin_dat)
    # Per the assertion message: head, tail and the single merged word.
    self.assertEqual(len(vertexs), 3, u"合并完成后应该只有前尾加中间词")
def _debug_dump_net(title, net):
    """Print *title* followed by the unicode form of *net* when debugging."""
    if Config.debug:
        print(title)
        print(unicode(net))


def seg_to_vertexs(text):
    """Segment *text* and return the resulting list of vertices.

    Steps, each gated by the matching ``Config`` flag where applicable:
    generate the coarse word net, run ``viterbi``, merge with the custom
    dictionary, run person and place recognition followed by a second
    ``viterbi`` pass, and finally run organization recognition.

    :param text: the text to segment.
    :return: the vertex list from the last ``viterbi`` pass, or from
        ``organization_recognition.recognition`` when that step is enabled.
    """
    # Coarse segmentation word net.
    full_net = WordNet(text)
    gen_word_net(text, full_net)
    _debug_dump_net(u"打印粗分词网:", full_net)

    # Viterbi pass over the coarse net.
    path = viterbi(full_net.vertexs)
    if Config.use_custom_dict:
        path = combine_by_custom_dict(path)
    best_net = WordNet(text, vertexs=path)

    # Person / place recognition; their return values are not used here.
    if Config.name_recognize:
        person_recognition.recognition(path, best_net, full_net)
    if Config.place_recognize:
        place_recognition.recognition(path, best_net, full_net)
    _debug_dump_net(u"打印人名、地名识别词网:", best_net)
    path = viterbi(best_net.vertexs)

    if Config.org_recognize:
        # Rebuild the net from the latest path before organization
        # recognition, whose return value becomes the final result.
        best_net = WordNet(text, vertexs=path)
        path = organization_recognition.recognition(path, best_net, full_net)
        _debug_dump_net(u"打印人组织识别词网:", best_net)
    return path