Example #1
 def combine_comm(self, words):
     """Merge ordinary entities according to their POS tags.
     Args:
         words: WordUnit list, words after named-entity merging
     Returns:
         words_combine: WordUnit list, words after ordinary-entity merging
     """
     newword = words[0].lemma  # the first word seeds the current merged word
     words_combine = []  # holds the merged result
     n = 1
     i = 1  # index of the current word
     while i < len(words):
         word = words[i]
         # Merge when both adjacent words are entities and they either share
         # the same POS tag or one of them is tagged 'nz' or 'j'
         if (self.is_entity(word.postag)
                 and self.is_entity(words[i - 1].postag)
                 and (word.postag == words[i - 1].postag
                      or word.postag in ['nz', 'j']
                      or words[i - 1].postag in ['nz', 'j'])):
             newword += word.lemma
         else:
             words_combine.append(WordUnit(n, newword,
                                           words[i - 1].postag))  # flush the previous merged word
             n += 1
             newword = word.lemma  # the current word starts a new merged word
         i += 1
     # append the last merged word
     words_combine.append(WordUnit(n, newword,
                                   words[len(words) - 1].postag))
     return words_combine
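For readers who want to trace the merge rule without the surrounding class, here is a minimal standalone sketch of the same loop. The WordUnit and is_entity stand-ins below are simplified assumptions, not the project's actual definitions, and the entity-tag set is an assumed subset of the LTP tag names used above.

    # Standalone sketch of the merging rule above; WordUnit and is_entity are
    # simplified stand-ins (assumptions), not the project's real classes.
    class WordUnit:
        def __init__(self, ID, lemma, postag):
            self.ID, self.lemma, self.postag = ID, lemma, postag

    def is_entity(postag):
        # Entity-like tags of the LTP tag set (assumed subset).
        return postag in ('nh', 'ni', 'ns', 'nz', 'j')

    def combine_comm(words):
        newword = words[0].lemma
        merged, n = [], 1
        for prev, word in zip(words, words[1:]):
            if (is_entity(word.postag) and is_entity(prev.postag)
                    and (word.postag == prev.postag
                         or word.postag in ('nz', 'j')
                         or prev.postag in ('nz', 'j'))):
                newword += word.lemma           # extend the current entity
            else:
                merged.append(WordUnit(n, newword, prev.postag))
                n += 1
                newword = word.lemma            # start a new word
        merged.append(WordUnit(n, newword, words[-1].postag))
        return merged

    tokens = [WordUnit(1, '哈尔滨', 'ns'), WordUnit(2, '工业', 'nz'),
              WordUnit(3, '大学', 'j'), WordUnit(4, '成立', 'v')]
    print([w.lemma for w in combine_comm(tokens)])  # ['哈尔滨工业大学', '成立']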
Example #2
 def postag(self, lemmas):
     """
     Parameters
     ----------
     lemmas : List, the word-segmentation result
     Returns
     -------
     words : WordUnit List, the words with both segmentation and POS-tag information
     """
     words = []
     # POS tagging
     postags = self.postagger.postag(lemmas)
     for i in range(len(lemmas)):
         # wrap each segmented, POS-tagged token in a WordUnit; IDs start at 1
         word = WordUnit(i + 1, lemmas[i], postags[i])
         words.append(word)
     # self.postagger.release()  # release the tagger model
     return words
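The commented-out release() call suggests that self.postagger is an LTP part-of-speech tagger. The sketch below shows one plausible way to set it up, assuming the pyltp bindings and a local model directory; both the package choice and the path are assumptions, not taken from the example.

    # Minimal sketch, assuming pyltp; the model path is illustrative only.
    import os
    from pyltp import Postagger

    LTP_DATA_DIR = 'ltp_data_v3.4.0'              # assumed model directory
    postagger = Postagger()
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

    lemmas = ['中国', '首都', '北京']              # output of a prior segmentation step
    postags = list(postagger.postag(lemmas))      # POS tags, one per lemma
    print(list(zip(lemmas, postags)))

    postagger.release()                           # free the model when done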
Example #3
 def combine(self, words, netags):
     """
     Merge words according to their B-I-E named-entity tags (B: the word begins
     a named entity, I: the word is inside a named entity, E: the word ends a
     named entity, O: the word is not part of any named entity).
     Args:
         words: WordUnit list, words obtained from segmentation and POS tagging
         netags: list, named-entity recognition results
     Returns:
         words_combine: WordUnit list, the merged result
     """
     words_combine = []  # holds the merged result
     length = len(netags)
     n = 1  # entity counter, starting at 1
     i = 0
     while i < length:
         if 'B-' in netags[i]:
             newword = words[i].lemma
             j = i + 1
             while j < length:
                 if 'I-' in netags[j]:
                     newword += words[j].lemma
                 elif 'E-' in netags[j]:
                     newword += words[j].lemma
                     break
                 elif 'O' == netags[j] or (j + 1) == length:
                     break
                 j += 1
             words_combine.append(
                 WordUnit(n, newword, self.judge_postag(netags[j - 1])))
             n += 1
             i = j
         else:
             words[i].ID = n
             n += 1
             words_combine.append(words[i])
         i += 1
     return self.combine_comm(words_combine)
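To make the B-I-E walk concrete, the standalone snippet below groups a toy LTP-style tag sequence into entity spans. It only mirrors the common B…I…E case (the method above additionally handles entities cut off by an 'O' tag or the end of the sentence), and the words and tag strings are illustrative.

    # Standalone illustration of the B-I-E grouping performed by combine().
    words  = ['中华', '人民', '共和国', '成立', '了']
    netags = ['B-Ns', 'I-Ns', 'E-Ns', 'O', 'O']    # LTP-style NE tags (illustrative)

    spans, i = [], 0
    while i < len(netags):
        if netags[i].startswith('B-'):
            j = i + 1
            while j < len(netags) and not netags[j].startswith('E-'):
                j += 1                             # swallow the I- tags
            spans.append(''.join(words[i:j + 1]))  # B..E collapses into one entity
            i = j + 1
        else:
            spans.append(words[i])
            i += 1
    print(spans)  # ['中华人民共和国', '成立', '了']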
Example #4
            words_str += word.to_string()+'\n'
        return words_str.rstrip('\n')
    
    def get_lemmas(self):
        """获得句子的分词结果
        Returns:
            lemmas:Str,该句子的分词结果
        """
        lemmas = ''
        for word in self.words:
            lemmas += word.lemma+'\t'
        return lemmas.rstrip('\t')
    
if __name__ == '__main__':
    # test sentence: 中国首都北京 ("Beijing, the capital of China")
    word3 = WordUnit(3,'北京','ns',0,None,'HED')
    word2 = WordUnit(2,'首都','ns',3,word3,'ATT')
    word1 = WordUnit(1,'中国','ns',2,word2,'ATT')
    
    words = []  # the sentence's word units
    words.append(word1)
    words.append(word2)
    words.append(word3)
    
    sentence = SentenceUnit(words)
    print(sentence.to_string())
    
    print('Segmentation result: ' + sentence.get_lemmas())
    print('Head-word lemma of "首都": ' + sentence.words[1].head_word.lemma)
    print('Head word of the sentence: ' + sentence.get_head_word().to_string())