# Example 1
 def named_entity_recognition(self, sent, standard_name=False):
     """Recognize named entities in *sent* with pyhanlp.

     HarvestText first links already-known entities, then HanLP's standard
     tokenizer tags the remaining tokens and four POS prefixes are mapped
     to entity types.

     :param sent: str, the input sentence.
     :param standard_name: bool, whether linked entities are converted to
         their standard names (stored on ``self.standard_name`` for use by
         ``entity_linking`` / ``decoref``).
     :return: dict {entity name: entity type}, where the type is one of the
         Chinese labels for person, place, organization or other proper noun.
     """
     # HanLP itself is unused here; only JClass is needed.
     from pyhanlp import JClass
     if not self.hanlp_prepared:
         self.hanlp_prepare()
     self.standard_name = standard_name
     entities_info = self.entity_linking(sent)
     # Replace linked entity mentions before running HanLP's tagger.
     sent2 = self.decoref(sent, entities_info)
     StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
     StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
     # Four POS prefixes: nr = person (人名), ns = place (地名),
     # nt = organization (机构名), nz = other proper noun (其他专名).
     type_by_prefix = (("nr", "人名"), ("ns", "地名"), ("nt", "机构名"), ("nz", "其他专名"))
     entity_type_dict = {}
     try:
         for term in StandardTokenizer.segment(sent2):
             tag = str(term.nature)
             for prefix, entity_type in type_by_prefix:
                 if tag.startswith(prefix):
                     entity_type_dict[term.word] = entity_type
                     break
     except Exception:
         # Best effort: JVM/HanLP failures must not break the caller;
         # narrowed from a bare except so KeyboardInterrupt etc. propagate.
         pass
     return entity_type_dict
# Example 2
    def named_entity_recognition(self, sent, standard_name=False, return_posseg=False):
        """Recognize named entities in *sent* with pyhanlp.

        HarvestText first links already-known entities, then HanLP's standard
        tokenizer tags the remaining tokens and four POS prefixes are mapped
        to entity types.

        :param sent: str, the input sentence.
        :param standard_name: bool, whether linked entities are converted to
            their standard names (stored on ``self.standard_name``).
        :param return_posseg: bool, whether to also return the full
            POS-tagged segmentation.
        :return: entity_type_dict: dict {entity name: entity type};
            when ``return_posseg=True`` also possegs: list of (word, POS tag).
        """
        # HanLP itself is unused here; only JClass is needed.
        from pyhanlp import JClass
        if not self.hanlp_prepared:
            self.hanlp_prepare()
        self.standard_name = standard_name
        entities_info = self.entity_linking(sent)
        # Replace linked entity mentions before running HanLP's tagger.
        sent2 = self.decoref(sent, entities_info)
        StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
        StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
        # Four POS prefixes: nr = person (人名), ns = place (地名),
        # nt = organization (机构名), nz = other proper noun (其他专名).
        type_by_prefix = (("nr", "人名"), ("ns", "地名"), ("nt", "机构名"), ("nz", "其他专名"))
        entity_type_dict = {}
        possegs = []
        try:
            for term in StandardTokenizer.segment(sent2):
                tag = str(term.nature)
                for prefix, entity_type in type_by_prefix:
                    if tag.startswith(prefix):
                        entity_type_dict[term.word] = entity_type
                        break
                possegs.append((term.word, tag))
        except Exception:
            # Best effort: JVM/HanLP failures must not break the caller;
            # narrowed from a bare except so KeyboardInterrupt etc. propagate.
            pass
        if return_posseg:
            return entity_type_dict, possegs
        else:
            return entity_type_dict
# Example 3
def hanlp_cut(text):
    """Segment *text* with HanLP's NLPTokenizer and join the tokens with spaces."""
    segmenter = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    words = (term.word for term in segmenter.segment(text))
    return " ".join(words)