def parse_pattern(tag_str, vertexs, wordnet_optimum, wordnet_all):
    """Expand the compound role tags U and V into two tags / two vertices each.

    ``tag_str`` and ``vertexs`` are walked in parallel (``vertexs[i]`` belongs
    to ``tag_str[i]``):

    * a U tag becomes K + B; the vertex word is split into everything but the
      last character (K) and the last character (B);
    * a V tag becomes E + L when the preceding tag is B, otherwise D + L; the
      vertex word is split the same way (all but last character / last
      character);
    * every other tag and its vertex are copied through unchanged.

    :param tag_str: sequence of role tags, one per vertex; each element is
        compared against ``str(NR.<role>)``.
    :param vertexs: vertices aligned with ``tag_str``; only ``real_word`` is read.
    :param wordnet_optimum: unused; kept so existing callers keep working.
    :param wordnet_all: unused; kept so existing callers keep working.
    :return: ``(pattern, new_vertexs)`` where ``pattern`` is the expanded tags
        joined into one string and ``new_vertexs`` is the matching vertex list.
    """
    tag_u = str(NR.U)
    tag_v = str(NR.V)
    tag_b = str(NR.B)

    new_tag_list = []
    new_vertexs = []
    for i, t in enumerate(tag_str):
        if t == tag_u:
            word = vertexs[i].real_word
            new_tag_list.extend((str(NR.K), tag_b))
            new_vertexs.extend((Vertex(word[:-1]), Vertex(word[-1])))
        elif t == tag_v:
            word = vertexs[i].real_word
            # Guard i > 0: without it, tag_str[-1] (the LAST tag) would be
            # consulted for the first element, which is never its predecessor.
            follows_b = i > 0 and tag_str[i - 1] == tag_b
            new_tag_list.extend((str(NR.E) if follows_b else str(NR.D), str(NR.L)))
            # NOTE(review): this peels off the last character as L, exactly
            # like the U branch. For words longer than two characters, confirm
            # against the reference algorithm that the split point is intended.
            new_vertexs.extend((Vertex(word[:-1]), Vertex(word[-1])))
        else:
            new_tag_list.append(t)
            new_vertexs.append(vertexs[i])
    return "".join(new_tag_list), new_vertexs
def test_vector(self):
    """Vertices compare equal only when word and attribute both match."""
    reference = Vertex("test", attribute="nr 1")
    same_attr = Vertex("test", attribute="nr 1")
    other_attr = Vertex("test", attribute="nr1 1")

    # Direct comparison.
    self.assertEqual(reference, same_attr)
    self.assertNotEqual(reference, other_attr)

    # Membership checks must agree with the direct comparison.
    self.assertIn(reference, [same_attr])
    self.assertNotIn(reference, [other_attr])
def test_recognition(self):
    """After place recognition the best path contains the expected places."""
    place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
    best_path = viterbi(self.word_net_optimum.vertexs)

    # The first two are looked up without an explicit attribute; the rest
    # must carry the "ns 1" attribute.
    expected_vertices = [
        Vertex(u"宁夏"),
        Vertex(u"固原市"),
        Vertex(u"彭阳县", attribute=u"ns 1"),
        Vertex(u"红河镇", attribute=u"ns 1"),
        Vertex(u"黑牛沟村", attribute=u"ns 1"),
    ]
    for expected in expected_vertices:
        self.assertIn(expected, best_path)
def test_recognition(self):
    """Person recognition finds every name in a comma-separated list."""
    text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂、张晓辉等一同会见了参加签约的企业家。"

    # Coarse segmentation: build the full word net for the sentence.
    word_net = WordNet(text)
    gen_word_net(text, word_net)

    # First Viterbi pass over the coarse net, then seed the optimum net
    # with the resulting path.
    coarse_path = viterbi(word_net.vertexs)
    word_net_optimum = WordNet(text, vertexs=coarse_path)

    person_recognition.recognition(coarse_path, word_net_optimum, word_net)
    vertexs = viterbi(word_net_optimum.vertexs)

    expected_names = (u"秦光荣", u"李纪恒", u"仇和", u"王春桂", u"张晓辉")
    for name in expected_names:
        self.assertIn(Vertex(name, attribute=u"nr 1"), vertexs)
    print(vertexs)
def test_recognition_1_level(self):
    """Organization recognition alone finds the full company name."""
    self.gen_word(u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业")

    # Only organization recognition runs here; no person or place
    # recognition pass precedes it.
    organization_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
    best_path = viterbi(self.word_net_optimum.vertexs)

    company = Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1")
    self.assertIn(company, best_path)
def test_role_tag(self):
    """Role tagging plus Viterbi decoding yields the tags A, B, Z, L, A."""
    noun = u'n 1'
    segments = [new_tag_vertex(TAG_BIGIN)]
    segments.append(Vertex(u"秦", attribute=Attribute(noun)))
    segments.append(Vertex(u"光荣", attribute=Attribute(noun)))
    segments.append(Vertex(u"同志", attribute=Attribute(noun)))
    segments.append(new_tag_vertex(TAG_END))

    # Role observations for each segment.
    taglist = role_tag(segments)
    self.assertTrue(isinstance(taglist, list))
    self.assertEqual(taglist[2].to_tuple(), (NR.Z, 29, NR.L, 2))

    # Decode the most likely role sequence with the person transition matrix.
    tag_index_list = viterbi_roletag(taglist, PersonTranMatrix().hmm)
    self.assertEqual(tag_index_list[0], NR.A, u"人名识别,第一个标识应该为TAG_BAGIN")
    for position, role in ((1, NR.B), (2, NR.Z), (3, NR.L), (4, NR.A)):
        self.assertEqual(tag_index_list[position], role)
def test_organization_recognition(self):
    """Organization recognition after person and place passes finds the unit name."""
    simplified = traditional_to_simplified(u"馬總統上午前往陸軍航空601旅,")
    Config.debug = True
    self.gen_word(simplified)

    # Run the person and place passes first, in that order.
    for recognizer in (person_recognition, place_recognition):
        recognizer.recognition(self.vertexs, self.word_net_optimum, self.word_net)

    # Organization recognition gets a fresh optimum net built from the
    # current vertices.
    fresh_optimum = WordNet(self.text, vertexs=self.vertexs)
    recognized = organization_recognition.recognition(self.vertexs, fresh_optimum, self.word_net)
    dump_vertexs(recognized)

    self.assertIn(Vertex(u"陆军航空601旅", attribute=u"nt 1"), recognized)
def test_recognition_2_level(self):
    """Organization recognition after person and place passes finds the company."""
    self.gen_word(u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业")

    # Run the person and place passes first, in that order.
    for recognizer in (person_recognition, place_recognition):
        recognizer.recognition(self.vertexs, self.word_net_optimum, self.word_net)

    # The organization pass works on a fresh optimum net; its return value
    # is checked directly, without a further viterbi() call.
    fresh_optimum = WordNet(self.text, vertexs=self.vertexs)
    recognized = organization_recognition.recognition(self.vertexs, fresh_optimum, self.word_net)
    dump_vertexs(recognized)

    self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), recognized)
def role_viterbi(vertexs, wordnet_optimum, hmm, trie, recognition_attr, tag_func, viterbi_fun=viterbi_template):
    """Tag vertex roles, match role patterns and add the matches to the net.

    Steps:

    1. ``tag_func(vertexs)`` produces the role observations, indexed in
       parallel with ``vertexs``.
    2. ``viterbi_fun(observations, hmm)`` reduces them to one tag per vertex.
    3. The tags are stringified and concatenated, then scanned with
       ``Searcher(trie, tag_str)``.
    4. For each match, the ``real_word`` of the covered vertices are joined
       into a new ``Vertex`` carrying ``recognition_attr``, which is added to
       ``wordnet_optimum`` at the offset of the first covered vertex.
    5. ``viterbi`` is run on ``wordnet_optimum.vertexs`` and its result returned.

    When ``Config.debug`` is truthy the observations and the chosen tags are
    printed.

    :param vertexs: vertex sequence; the first and last entries are skipped
        when computing offsets.
    :param wordnet_optimum: net that receives the recognised vertices.
    :param hmm: model passed through to ``viterbi_fun``.
    :param trie: pattern store passed to ``Searcher``.
    :param recognition_attr: attribute given to every recognised vertex.
    :param tag_func: callable producing role observations from ``vertexs``.
    :param viterbi_fun: decoder called as ``viterbi_fun(observations, hmm)``.
    :return: result of ``viterbi(wordnet_optimum.vertexs)``.
    """
    tag_list = tag_func(vertexs)
    if Config.debug:
        observed = [u"[ %s %s ]" % (vertexs[i].real_word, tag) for i, tag in enumerate(tag_list)]
        # Parenthesised form: a valid print in both Python 2 and Python 3.
        print(u"角色观察: %s" % u"".join(observed))

    tag_list = viterbi_fun(tag_list, hmm)
    if Config.debug:
        labelled = [u"%s/%s" % (vertexs[i].real_word, tag) for i, tag in enumerate(tag_list)]
        print(u"角色标注:[%s]" % u", ".join(labelled))

    tag_str = ''.join([str(x) for x in tag_list])
    search = Searcher(trie, tag_str)

    # Offset of each vertex: the running sum of the preceding real_word
    # lengths, starting at 1. The head and tail vertices are skipped and
    # keep offset 0.
    vertexs_offset = [0] * len(vertexs)
    offset = 1
    for i in range(1, len(vertexs) - 1):
        vertexs_offset[i] = offset
        offset += len(vertexs[i].real_word)

    while search.next():
        # len(search.key) is used as the number of vertices covered by the
        # match -- presumably one character of tag_str per vertex; TODO confirm.
        matched = range(search.begin, search.begin + len(search.key))
        name_str = "".join([vertexs[i].real_word for i in matched])
        # Add the recognised entity to the word net.
        vertex = Vertex(name_str, attribute=recognition_attr)
        wordnet_optimum.add(vertexs_offset[search.begin], vertex)

    vertexs = viterbi(wordnet_optimum.vertexs)
    return vertexs
def test_word_net_insert(self):
    """Insert one single-character vertex per character of the text."""
    text = u"1234567890"
    word_net_all = WordNet(text)
    # Positions passed to add() are 1-based: character k goes to position k + 1.
    for position, char in enumerate(text, 1):
        word_net_all.add(position, Vertex(char))