def combine_comm(self, words):
    """Merge adjacent ordinary (non-named) entities based on POS tags.

    Two neighbouring words are merged into one when both are entities
    (per ``self.is_entity``) and at least one of them is tagged 'nz'
    (other proper noun) or 'j' (abbreviation).

    Args:
        words: WordUnit list, words after named-entity merging
    Returns:
        words_combine: WordUnit list, words after ordinary-entity merging
    """
    if not words:
        # Guard: the accumulator below seeds from words[0]; an empty
        # input would otherwise raise IndexError.
        return []
    words_combine = []          # merged result
    newword = words[0].lemma    # lemma being accumulated, seeded with the first word
    n = 1                       # running 1-based ID for merged words
    for i in range(1, len(words)):
        word = words[i]
        prev = words[i - 1]
        # Merge condition: both neighbours are entities AND at least one
        # of them is tagged 'nz' or 'j'.
        if (self.is_entity(word.postag) and self.is_entity(prev.postag)
                and (word.postag in ('nz', 'j') or prev.postag in ('nz', 'j'))):
            newword += word.lemma
        else:
            # Close off the accumulated word; its POS tag is taken from
            # its last component (the previous word).
            words_combine.append(WordUnit(n, newword, prev.postag))
            n += 1
            newword = word.lemma  # current word starts a new accumulator
    # Flush the final accumulated word, tagged with the last word's POS.
    words_combine.append(WordUnit(n, newword, words[-1].postag))
    return words_combine
def postag(self, lemmas):
    """Run part-of-speech tagging over a segmented sentence.

    Parameters
    ----------
    lemmas : List, word-segmentation result (one lemma per word)

    Returns
    -------
    words : WordUnit List, word units carrying both the lemma and its
        POS tag, numbered from 1
    """
    tags = self.postagger.postag(lemmas)
    # Pair every lemma with its tag; WordUnit IDs are 1-based.
    return [WordUnit(idx, lemma, tag)
            for idx, (lemma, tag) in enumerate(zip(lemmas, tags), start=1)]
def combine(self, words, netags):
    """Merge words according to B-I-E named-entity tags.

    Tag scheme: B- marks the first word of an entity, I- a word inside
    it, E- its last word, and 'O' a word outside any entity.

    Args:
        words: WordUnit list, result of segmentation + POS tagging
        netags: list, named-entity recognition tags, parallel to words
    Returns:
        words_combine: WordUnit list, words after named-entity merging,
            additionally passed through combine_comm for ordinary-entity
            merging
    """
    words_combine = []  # merged result
    length = len(netags)
    n = 1  # entity/word numbering, starts at 1
    i = 0
    while i < length:
        if 'B-' in netags[i]:
            # Start of an entity: absorb following I-/E- words.
            newword = words[i].lemma
            j = i + 1
            while j < length:
                if 'I-' in netags[j]:
                    newword += words[j].lemma
                elif 'E-' in netags[j]:
                    newword += words[j].lemma
                    # BUGFIX: step past the E- word before breaking.
                    # Previously j stayed on the E- word and `i = j`
                    # made the outer loop emit it a second time as a
                    # standalone word.
                    j += 1
                    break
                elif 'O' == netags[j] or (j + 1) == length:
                    break
                j += 1
            # POS of the merged entity is derived from the NE tag of
            # its last merged word.
            words_combine.append(
                WordUnit(n, newword, self.judge_postag(netags[j - 1])))
            n += 1
            i = j
        else:
            # Not part of a multi-word entity: renumber and keep as-is.
            words[i].ID = n
            n += 1
            words_combine.append(words[i])
            i += 1
    return self.combine_comm(words_combine)
words_str += word.to_string()+'\n'  # tail of a to_string() loop; the method header lies outside this chunk
return words_str.rstrip('\n')  # drop the trailing newline added by the loop

def get_lemmas(self):
    """Return the sentence's word-segmentation result.

    Returns:
        lemmas: str, tab-separated lemmas of this sentence's words
    """
    lemmas = ''
    for word in self.words:
        lemmas += word.lemma+'\t'
    return lemmas.rstrip('\t')

if __name__ =='__main__':
    # Demo sentence: "中国首都北京" (China's capital, Beijing)
    word3 = WordUnit(3,'北京','ns',0,None,'HED')
    word2 = WordUnit(2,'首都','ns',3,word3,'ATT')
    word1 = WordUnit(1,'中国','ns',2,word2,'ATT')
    words = []  # word units of the sentence
    words.append(word1)
    words.append(word2)
    words.append(word3)
    sentence = SentenceUnit(words)
    print(sentence.to_string())
    print('句子分词结果:'+sentence.get_lemmas())
    print('"首都"的中心词lemma:'+sentence.words[1].head_word.lemma)
    print('句子的中心词:'+sentence.get_head_word().to_string())