Example #1
def name_recognize_one():
    import sys, os
    import pyltp
    from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

    paragraph = '叙利亚东古塔地区。7日发生疑似化学武器袭击事件,导致70余人丧生。报道一出,叙利亚反对派、美国、英国、法国等纷纷指责叙政府军使用化学武器袭击无辜平民。但叙利亚坚决否认,并指责西方和叙反对派造谣,目的是保护被围困的恐怖分子。俄外交部则认为,该谣言旨在袒护恐怖分子,并为外部势力发动打击寻找借口。'

    sentence = SentenceSplitter.split(paragraph)[1]
    print('split {}'.format(sentence))
    # sentence splitting
    #     for i in sentence:
    #         print(i)
    #         print()
    segmentor = Segmentor()
    segmentor.load(sg_model_path)  # sg_model_path etc. are assumed to be module-level paths to the LTP models
    words = segmentor.segment(sentence)
    print('|'.join(words))

    postagger = Postagger()
    postagger.load(ps_model_path)
    postags = postagger.postag(words)
    for k, v in zip(words, postags):  # iterate pairs directly; dict(zip(...)) would drop duplicate words
        print(k, v)

    # print(' ## '.join(postags))
    parser = Parser()
    parser.load(pr_model_path)
    arcs = parser.parse(words, postags)
    print(' '.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))

    print('#' * 8)
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netag = recognizer.recognize(words, postags)
    for word, ntag in zip(words, netag):
        if ntag != 'O':
            print(word + ' / ' + ntag)
    print(' / '.join(netag))

    # named entity recognition on a pre-tokenized example
    word_list = ['欧几里得', '是', '西元前', '三', '世纪', '的', '希腊', '数学家', '。']
    postags_list = ['nh', 'v', 'nt', 'm', 'n', 'u', 'ns', 'n', 'wp']
    nertags = recognizer.recognize(word_list, postags_list)
    for word, ntag in zip(word_list, nertags):
        if ntag != 'O':
            print(word + '/' + ntag)
    #print (" ".join(word_list))
    print(' '.join(nertags))

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
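Several of the later examples rebuild multi-token entities from LTP's B-/I-/E-/S- tag scheme by hand (see Examples #27 and #29). Below is a reusable sketch of that merging step, assuming the standard LTP tag set (`O`, or PREFIX-TYPE with PREFIX in B/I/E/S and TYPE in Nh/Ni/Ns); the helper name is our own:

def merge_entities(words, netags):
    """Merge LTP B/I/E/S NE tags into (entity_text, entity_type) pairs."""
    entities, buf, cur_type = [], [], None
    for word, tag in zip(words, netags):
        if tag == 'O' or '-' not in tag:
            buf, cur_type = [], None  # not inside an entity
            continue
        prefix, etype = tag.split('-', 1)
        if prefix == 'S':  # single-token entity
            entities.append((word, etype))
            buf, cur_type = [], None
        elif prefix == 'B':  # entity starts
            buf, cur_type = [word], etype
        elif prefix in ('I', 'E') and cur_type == etype:
            buf.append(word)
            if prefix == 'E':  # entity ends
                entities.append((''.join(buf), etype))
                buf, cur_type = [], None
    return entities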
Example #2
class pyltp_model():
    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(
            LTP_DATA_DIR, 'ner.model')  # path to the NER model; the file is named `ner.model`
        self.segmentor = Segmentor()  # initialize instance
        self.postagger = Postagger()  # initialize instance
        self.recognizer = NamedEntityRecognizer()  # initialize instance

        self.segmentor.load(cws_model_path)  # load model
        self.postagger.load(pos_model_path)  # load model
        self.recognizer.load(ner_model_path)  # load model

    def token(self, sentence):
        words = self.segmentor.segment(sentence)  # word segmentation
        words = list(words)
        postags = self.postagger.postag(words)  # POS tagging
        postags = list(postags)
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        netags = list(netags)
        result = []
        for i, j in zip(words, netags):
            if j in ['S-Nh', 'S-Ni', 'S-Ns']:
                result.append(j)
                continue
            result.append(i)
        return result

    def close(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()  # release the models
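A minimal usage sketch for the class above, assuming os and the pyltp imports are available at module scope (the model directory below is a hypothetical placeholder):

model = pyltp_model(LTP_DATA_DIR='/path/to/ltp_data_v3.4.0')
print(model.token('李明在北京大学读书。'))  # single-token entities come back as their S-Nh / S-Ni / S-Ns labels
model.close()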
Example #3
def ltp_ner_data():
    """使用 LTP 进行命名实体识别"""
    LTP_DATA_DIR = 'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # ltp模型目录的路径
    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # 命名实体识别模型路径,模型名称为`pos.model`

    from pyltp import NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model

    result = []
    file = [(const.qc_train_pos, const.qc_train_ner),
            (const.qc_test_pos, const.qc_test_ner)]
    for i in range(2):
        with open(file[i][0], 'r', encoding='utf-8') as f:
            for line in f.readlines():
                attr = line.strip().split('\t')
                words_pos = attr[1].split(" ")
                words = [word.split('/_')[0] for word in words_pos]
                postags = [word.split('/_')[1] for word in words_pos]
                netags = recognizer.recognize(words, postags)  # named entity recognition
                res = ' '.join([
                    "{}/_{}".format(words[i], netags[i])
                    for i in range(len(words))
                ])
                result.append("{}\t{}\n".format(attr[0], res))
        with open(file[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    recognizer.release()  # release the model
Example #4
def segmentsentence(sentence):
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    recognizer = NamedEntityRecognizer()

    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    #############
    word_list = segmentor.segment(sentence)
    postags_list = postagger.postag(word_list)
    nertags = recognizer.recognize(word_list, postags_list)
    ############
    entity_list = []  # collected person names
    for word, ntag in zip(word_list, nertags):
        if ntag.endswith('Nh'):  # LTP person tags are B-Nh/I-Nh/E-Nh/S-Nh; a bare 'Nh' never occurs
            entity_list.append(word)
    print(" ".join(word_list))
    print(' '.join(nertags))
    ############
    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list
Example #5
def ner_data():
    # segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    # named entity recognition model
    recognizer = NamedEntityRecognizer()
    recognizer.load('ner.model')
    # load the data to be segmented
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']

    util = Utils()
    data_processed = open('./data_processed_recognizer.csv', 'w', encoding='utf-8')
    for data in datas:
        words = segmentor.segment(data)
        postags = postagger.postag(words)
        word_split = ' '.join(words).split(' ')
        netags = recognizer.recognize(words, postags)
        netag_split = ' '.join(netags).split(' ')
        concat_word = util.concat(word_split, netag_split, tag='netags')
        data_processed.write(concat_word + '\n')
    data_processed.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
Example #6
class pyltp_impl(Seg):
    def __init__(self, dictpath, mode='seg'):
        super().__init__(mode)

        from pyltp import Segmentor
        from pyltp import Postagger
        from pyltp import NamedEntityRecognizer
        self.ltp_seg = Segmentor()
        self.ltp_pos = Postagger()
        self.ltp_ner = NamedEntityRecognizer()

        self.ltp_seg.load(os.path.join(dictpath, 'cws.model'))

        if mode != 'seg':
            self.ltp_pos.load(os.path.join(dictpath, 'pos.model'))

        if mode == 'ner':
            self.ltp_ner.load(os.path.join(dictpath, 'ner.model'))

    def impl_func(self, sentence):
        seg_res = self.ltp_seg.segment(sentence)
        if self.mode == 'seg':
            return seg_res

        pos_res = self.ltp_pos.postag(seg_res)
        if self.mode == 'postag':
            return [(word, tag) for (word, tag) in zip(seg_res, pos_res)]

        ner_res = self.ltp_ner.recognize(seg_res, pos_res)
        return [(word, tag) for (word, tag) in zip(seg_res, ner_res)]
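A minimal usage sketch for the three modes, assuming a Seg base class that stores mode as self.mode and a hypothetical model directory:

seg = pyltp_impl('/path/to/ltp_data', mode='seg')     # tokens only
pos = pyltp_impl('/path/to/ltp_data', mode='postag')  # [(word, postag), ...]
ner = pyltp_impl('/path/to/ltp_data', mode='ner')     # [(word, netag), ...]
print(ner.impl_func('李明在北京工作。'))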
Example #7
class Ltp(NerModel):
    def __init__(self):
        super(Ltp, self).__init__()
        self._model_path = "./model/ltp/"
        self._seg = Segmentor()
        self._pos = Postagger()
        self._recognizer = NamedEntityRecognizer()
        self._load_model()
        self._object_str = "[INFO] This is ltp object!"
        print("[INFO] All model is load!")

    def __repr__(self):
        return self._object_str

    def _load_model(self):
        self._seg.load(self._model_path + "cws.model")
        self._pos.load(self._model_path + "pos.model")
        self._recognizer.load(self._model_path + "ner.model")

    def get_entity(self, sentence):
        words = self._seg.segment(sentence)
        pos = self._pos.postag(words)
        ner = self._recognizer.recognize(words, pos)
        entity = [w for w, s in zip(words, ner) if s != 'O']
        if entity:
            return "".join(entity) if len(entity) > 1 else entity[0]
Example #8
def ner(words, postags):
    recognizer = NamedEntityRecognizer()
    recognizer.load('D:\\ltp_data\\ner.model')  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    nerttags = list(netags)
    nerwords = list(words)
    recognizer.release()  # release the model

    return nerttags, nerwords
Example #9
def get_ner_list(words_list, postag_list):

    ner = NamedEntityRecognizer()
    ner.load(ner_model_path)
    ner_list = list(ner.recognize(words_list, postag_list))
    ner.release()
    return ner_list
Example #10
def name_recognition(words, postags):
    '''
    Named entity recognition
    :param words: segmentation result
    :param postags: POS tagging result
    :return:
    '''
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load('E:\\NLP-homework\\ltp-data-v3.3.1\\ltp_data\\ner.model')  # load model
    netags = recognizer.recognize(words, postags)  # recognize named entities

    result = ''
    for i in range(len(netags)):
        if i < len(words) - 2:
            if 's' in netags[i]:
                if 'O' in netags[i + 1] and words[i + 1] != '' and words[i + 1] != ',':
                    if 's' in netags[i + 2]:
                        result += words[i] + words[i + 1] + words[i + 2]
    print(result)
    # for word, ntag in zip(words, netags):
    #     print(word + '/' + ntag)
    recognizer.release()
    return netags
Example #11
class Parse_Util(object):
    def __init__(self, lexicon_path='./data/lexicon'):
        # word segmentation
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba segmentation
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # parse a sentence
    def parse_sentence(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)

        return words, postags, netags, arcs
Example #12
class LTP_word():
    """docstring for parser_word
    deal处理文本,返回词表、词性及依存关系,语义,命名实体五个值
    release释放缓存"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load model
        self.recognizer = NamedEntityRecognizer()  # named entity recognition instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labelling instance
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal(self, text):  # run the full pipeline and return everything
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labelling
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
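A minimal usage sketch for LTP_word, assuming model_path points at an LTP 3.x data directory that also contains dictionary_kfc.txt and the srl model (the path below is a placeholder):

ltp = LTP_word('/path/to/ltp_data')
words, postags, arcs, roles, netags = ltp.deal('李明在北京大学读书。')
print(list(zip(words, netags)))
ltp.release()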
Example #13
    def ltp_word(self):
        """创建一个方法,用来进行句子的分词、词性分析等处理。"""
        # 分词
        segmentor = Segmentor()
        segmentor.load(os.path.join(MODELDIR, "cws.model"))
        words = segmentor.segment(self.content)
        #print("*************分词*****************")
        #print("\t".join(words))

        # POS tagging
        postagger = Postagger()
        postagger.load(os.path.join(MODELDIR, "pos.model"))
        postags = postagger.postag(words)
        #print("*************词性标注*************")
        #print(type(postags))
        #print("\t".join(postags))

        # dependency parsing
        parser = Parser()
        parser.load(os.path.join(MODELDIR, "parser.model"))
        arcs = parser.parse(words, postags)
        #print("*************依存句法分析*************")
        #print(type(arcs))
        #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        # split the parse result into head indices and relation labels
        arcs_head = []
        arcs_relation = []
        for arc in arcs:
            arcs_head.append(arc.head)
            arcs_relation.append(arc.relation)

        # named entity recognition
        recognizer = NamedEntityRecognizer()
        recognizer.load(os.path.join(MODELDIR, "ner.model"))
        netags = recognizer.recognize(words, postags)
        #print("*************命名实体识别*************")
        #print("\t".join(netags))
        """
        # semantic role labelling
        labeller = SementicRoleLabeller()
        labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        roles = labeller.label(words, postags, arcs)
        print("*************语义角色标注*************")
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        """

        segmentor.release()
        postagger.release()
        parser.release()
        recognizer.release()
        #labeller.release()

        # convert the results to lists via list_conversion
        words_result = list_conversion(words, postags, netags, arcs_head,
                                       arcs_relation)

        return words_result
Example #14
def get_all_name(r_filename,w_file):
    # global nlp
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory
    # segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load model
    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load model
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # instantiate
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        lines = line.strip("\n").replace(r"\n", "")
    #    print("----------"+lines)
        words = segmentor.segment(lines)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        sen = get_some_idea(line, netags, words)
        print(sen)
        if sen:
            for key in sen:
                sens="\t".join(list(set([data[1] for data in sen[key]])))
                f_w.write(key +"\t"+sens +"\n")
    # nlp.close()
    f_r.close()
    f_w.close()
Example #15
def locationNER(text):
    # segment first
    segmentor = Segmentor()  # initialize instance
    segmentor.load(cws_model_path)  # load model
    words = segmentor.segment(text)  # segmentation
    #print('\t'.join(words))
    segmentor.release()

    # then POS tagging
    postagger = Postagger()  # initialize instance
    postagger.load(pos_model_path)  # load model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()  # release model

    # finally, recognize place / organization entities
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    results = []
    for i in range(len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            results.append(words[i - 1] + words[i] + words[i + 1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    recognizer.release()
    return results
Example #16
def namedEntityRecognize(sentence):
    '''
    Named entity recognition with pyltp.
    Returns: 1) a list of (entity, tag) tuples, 2) a list of entity tags
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # package as (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))

    neTagList = '\t'.join(netags).split('\t')

    return namedEntityTagTupleList, neTagList
Example #17
def name_entity_recognize(req):
    if req.method == 'POST':
        #print '-----------word_class_analyse START -----\r\n'
        intext = req.POST["intext"].encode('utf-8', 'ignore')

        words = segmentor(intext)
        tags = posttagger(words)

        recognizer = NamedEntityRecognizer()
        # recognizer.load('/usr/local/src/ltp_data/ner.model')
        recognizer.load(ner_model_path)
        #recognizer = settings.RECOGNIZER
        netags = recognizer.recognize(words, tags)  # named entity recognition

        outtext = '{"result":['
        for word, tag in zip(words, netags):
            # print word+'/'+tag + '\r\n'
            outtext += '{"tag":"' + "%s" % tag + '",'
            outtext += '"content"' + ':"' + word + '"},'

        outtext = outtext.rstrip(',') + ']}'

        response = HttpResponse(outtext)
        response["Access-Control-Allow-Origin"] = "*"
        response["Access-Control-Allow-Methods"] = "POST"

        response["Access-Control-Max-Age"] = "1000"
        response["Access-Control-Allow-Headers"] = "*"
        return response
Example #18
def name_recognition(words, postags):
    """
    命名实体识别
    :param words:分词
    :param postags:标注
    :return:
    """
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(
        'D:/Program Files/ltp-models/3.3.1/ltp-data-v3.3.1/ltp_data/ner.model'
    )  # 加载模型
    netags = recognizer.recognize(words, postags)  # 命名实体识别

    # the place-name POS tag is ns
    result = ''
    for i in range(len(netags)):
        if i < len(words) - 2:
            if 's' in netags[i]:
                if 'O' in netags[i + 1] and words[i + 1] != ',' and words[i + 1] != '，':
                    if 's' in netags[i + 2]:
                        result += words[i] + words[i + 1] + words[i + 2] + " "
    print(result)
    # for word, ntag in zip(words, netags):
    #     print(word + '/' + ntag)
    recognizer.release()  # release model
    return netags
Example #19
def e_recognize(words, postags):
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    # for word, ntag in zip(words, netags):
    #     print(word + '/' + ntag)
    recognizer.release()  # release model
    return netags
Example #20
def ner(words, postags):
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    for word, ntag in zip(words, netags):
        print(word + '/' + ntag)
    recognizer.release()  # release model
    return netags
Example #21
def get_ner(words, postags):
    """ ltp 命名实体识别 """
    ner_model_path = os.path.join(LTP_TOP_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netags = recognizer.recognize(words, postags)
    recognizer.release()
    return list(netags)
Example #22
def ltp_name_entity_recognizer(LTP_DATA_DIR, words, postags):
    # path to the NER model; the file is named `ner.model`
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    recognizer.release()  # release model
    return netags
Example #23
def ner(words, postags):
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load('/Users/chenming/Spyder/3.3.1/ltp_data/ner.model')  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    for word, ntag in zip(words, netags):
        print(word + '/' + ntag)
    recognizer.release()  # release model
    return netags
Example #24
 def entity_recognize(cutting_list, tagging_list):
     ner_model_path = os.path.join(LtpParser.ltp_path, 'ner.model')
     from pyltp import NamedEntityRecognizer
     recognizer = NamedEntityRecognizer()
     recognizer.load(ner_model_path)
     ne_tags = recognizer.recognize(cutting_list, tagging_list)
     recognizer.release()
     return ne_tags
Example #25
def ner(words, postags):
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load('../ltp_data/ner.model')  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    # for word, tag in zip(words, netags):
    #     print(word + '/' + tag)
    recognizer.release()  # release model
    return netags
Example #26
 def get_ner(self, word_list, postag_list, model):
     recognizer = NamedEntityRecognizer()
     recognizer.load(model)
     netags = recognizer.recognize(word_list, postag_list)  # named entity recognition
     # for word, ntag in zip(word_list, netags):
     #     print(word + '/' + ntag)
     recognizer.release()  # release the model
     return list(netags)
Example #27
 def get_length(self, filename):
     segmentor = Segmentor()
     segmentor.load('cws.model')
     postagger = Postagger()  # initialize instance
     postagger.load('pos.model')  # load model
     recognizer = NamedEntityRecognizer()  # initialize instance
     recognizer.load('ner.model')  # load model
     f = open(filename, 'r')
     l_dict = {}
     doc = []
     q_ner = []
     all_sum = 0
     for i in f:
         doc.append(i)
     all_doc_n = len(doc)
     for k in range(all_doc_n):
         sum = 0
         doc_list = doc[k].replace('{"pid": ',
                                   '').replace('"document": [',
                                               '').replace(']}',
                                                           '').split(',')
         # print(doc_list)
         doc_n = int(doc_list[0])
         sentense_num = len(doc_list)
         # for d in doc_list[1:]:
         #     d = d[2:-2]
         for i in range(1, sentense_num):
             d = doc_list[i][2:-1]
             words = []
             cut_words = '\t'.join(segmentor.segment(d))
             words_list = cut_words.split('\t')
             if words_list == ['']:
                 continue
             postags = postagger.postag(words_list)  # POS tagging
             pos_line = '\t'.join(postags)
             q_pos_list = pos_line.split('\t')
             netags = recognizer.recognize(words_list, postags)  # named entity recognition
             ner_line = '\t'.join(netags)
             ner_list = ner_line.split('\t')
             sum += len(words_list)
             ner_str = ''
             # print(ner_list)
             for nr in range(len(ner_list)):
                 if ner_list[nr][0] != 'O':
                     if ner_list[nr][0] == 'S' or ner_list[nr][0] == 'E':
                         ner_str += words_list[nr]
                         q_ner.append(ner_str)
                         ner_str = ''
                     else:
                         ner_str += words_list[nr]
         all_sum += sum
         l_dict[doc_n] = sum
         # print(q_ner)
     q_ner = list(set(q_ner))
     with open('ner_word.txt', 'w') as f:
         for qn in q_ner:
             f.write(qn)
             f.write('\n')
     segmentor.release()
     postagger.release()
     recognizer.release()
Example #28
    def segment(self, texts, use_tag_filter=True):
        # initialize instances
        # global word_list, netags, postags, relation, heads
        words = []
        pos = []
        ner = []
        rel = []
        hea = []

        segmentor = Segmentor()
        segmentor.load_with_lexicon(self.cws_model_path, './dict/user_recg.dic')  # load model; the second argument is the path to a custom lexicon

        postagger = Postagger()
        postagger.load(self.pos_model_path)

        recognizer = NamedEntityRecognizer()
        recognizer.load(self.ner_model_path)

        parser = Parser()
        parser.load(self.pas_model_path)

        for text in texts:
            text = text.lower()

            word_list = segmentor.segment(text)
            word_list = [word for word in word_list if len(word) > 1]
            # word_list = [word for word in word_list if re.match("[\u0041-\u005a\u4e00-\u9fa5]+", word) != None]  # keep only Chinese/English tokens
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]  # drop stop words

            # POS tagging
            posttags = postagger.postag(word_list)
            postags = list(posttags)

            # named entity recognition
            netags = recognizer.recognize(word_list, postags)

            # dependency parsing
            arcs = parser.parse(word_list, postags)
            rely_id = [arc.head for arc in arcs]  # dependency head ids
            relation = [arc.relation for arc in arcs]  # dependency relations
            heads = ['Root' if id == 0 else word_list[id - 1] for id in rely_id]  # head words

            if use_tag_filter:
                dic = dict(zip(word_list, postags))
                word_list = [x for x in dic.keys() if dic[x] in self.tags_filter]

            words.append(word_list)
            pos.append(postags)
            ner.append(netags)
            rel.append(relation)
            hea.append(heads)

        segmentor.release()
        postagger.release()
        recognizer.release()
        parser.release()

        return words, pos, ner, rel, hea
Example #29
def pyltp_ner(text):  # extract organization names with pyltp
    LTP_DATA_DIR = Path.cwd().parent / 'ltp_model'  # path to the LTP models
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    # segmentation
    segmentor = Segmentor()  # initialize instance
    segmentor.load(cws_model_path)  # load model
    words = segmentor.segment(text)  # segment
    words_list = list(words)  # segmentation result as a list
    segmentor.release()  # release model

    # POS tagging
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # path to the POS model; the file is named `pos.model`
    postagger = Postagger()  # initialize instance
    postagger.load(pos_model_path)  # load model
    postags = postagger.postag(words)  # POS tagging
    postags_list = list(postags)  # POS tags as a list
    postagger.release()  # release model

    # named entity recognition
    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # path to the NER model; the file is named `ner.model`
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    netags_list = list(netags)  # NE tags as a list
    data = {"reg": netags, "words": words, "tags": postags}
    # print(data)
    recognizer.release()  # release model

    # drop non-entity tokens
    a = len(words_list)
    words_list_1 = []
    postags_list_1 = []
    netags_list_1 = []
    for i in range(a):
        if netags_list[i] != 'O':
            words_list_1.append(words_list[i])
            postags_list_1.append(postags_list[i])
            netags_list_1.append(netags_list[i])

    # extract organization names
    a1 = len(words_list_1)
    organizations = []
    for i in range(a1):
        if netags_list_1[i] == 'S-Ni':
            organizations.append(words_list_1[i])
        elif netags_list_1[i] == 'B-Ni':
            temp_s = ""
            temp_s += words_list_1[i]
            j = i + 1
            while j < a1 and (netags_list_1[j] == 'I-Ni'
                              or netags_list_1[j] == 'E-Ni'):
                temp_s += words_list_1[j]
                j = j + 1
            organizations.append(temp_s)
    organizations = list(set(organizations))  # deduplicate organization names
    return organizations
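A minimal usage sketch for the extractor above, assuming an ltp_model directory in the parent of the working directory, as the function expects:

orgs = pyltp_ner('清华大学与北京大学签署了合作协议。')
print(orgs)  # deduplicated organization names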
Example #30
def ner(words, postags):
    global recognizer
    if recognizer is None:
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # path to the NER model; the file is named `ner.model`
        recognizer = NamedEntityRecognizer()  # initialize instance
        recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    # print(list(zip(list(words), list(postags), list(netags))))
    return list(netags)
Example #31
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path; the file is named `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS model path; the file is named `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model path; the file is named `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)  # note: data is assumed to be defined at module scope
        # print("|".join(words))
        segmentor.release()


        postagger = Postagger()  # initialize instance
        postagger.load(self.pos_model_path)  # load model
        self.postags = postagger.postag(self.words)  # POS tagging
        # print('\t'.join(postags))
        postagger.release()  # release model


        recognizer = NamedEntityRecognizer()  # initialize instance
        recognizer.load(self.ner_model_path)  # load model
        self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
        # print('\t'.join(netags))
        recognizer.release()  # release model
Example #32
def mingming_shiti(words, postags):
    """Named entities: organizations (Ni), persons (Nh), places (Ns)."""
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
    recognizer.release()
Example #33
# (fragment: `segmentor` and `words` are defined earlier in the original script)
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-strings parameters are supported since pyltp 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()