Beispiel #1
0
def segmentation(filename, output_filename):
    """Segment and POS-tag a text file line by line with LTP.

    Every input line is written to ``output_filename`` as tab-terminated
    ``word/postag`` items followed by a newline.  A companion file, named
    from the part of ``output_filename`` before its first ``.`` plus
    ``_ner.txt``, is created as well; it stays empty because the NER output
    code is currently disabled.

    Args:
        filename: Path of the text file to read.
        output_filename: Path of the tagged file to write.
    """
    # Single-argument call form works under both Python 2 and Python 3.
    print("segmenting '%s' to '%s'" % (filename, output_filename))

    with open(filename, "r") as src:
        lines = src.readlines()

    MODELDIR = "./ltp_data/"

    # Word segmentation
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # Part-of-speech tagging
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))

    # Named entity recognition (loaded, but its use below is disabled)
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))

    # Dependency parsing (loaded, but its use below is disabled)
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    ner_filename = output_filename.split(".")[0] + "_ner.txt"
    try:
        # The NER file is opened only so that it is created/truncated, and
        # the context manager guarantees it is closed again.
        with open(output_filename, "w") as out, open(ner_filename, "w"):
            for raw_line in lines:
                # Strip only the line terminator.  Slicing off the last
                # character unconditionally dropped real text from an
                # unterminated final line and raised IndexError on blank
                # lines.
                line = raw_line.rstrip("\r\n")

                words = segmentor.segment(line)
                postags = postagger.postag(words)

                for word, postag in zip(words, postags):
                    out.write("%s/%s\t" % (word, postag))
                out.write("\n")
    finally:
        # Free the native models even when tagging or writing fails.
        segmentor.release()
        postagger.release()
        recognizer.release()
        parser.release()
Beispiel #2
0
def segmentsentence(sentence):
    """Segment a sentence with LTP and collect the words tagged as person names.

    The segmented words and their NER tags are printed, and every word
    whose NER tag is a person-name tag is appended to ``entity_list``.

    NOTE(review): ``entity_list`` is not defined in this function; it is
    presumably a module-level list -- confirm against the rest of the file.

    Args:
        sentence: The text to segment.

    Returns:
        The word sequence produced by the segmentor.
    """
    segmentor = Segmentor()
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()

    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    try:
        word_list = segmentor.segment(sentence)
        postags_list = postagger.postag(word_list)
        nertags = recognizer.recognize(word_list, postags_list)

        for word, ntag in zip(word_list, nertags):
            # LTP NER tags carry a position prefix (S-Nh, B-Nh, I-Nh, E-Nh),
            # so comparing against the bare 'Nh' never matched anything.
            if ntag.endswith('-Nh'):
                entity_list.append(word)
        print(" ".join(word_list))
        print(' '.join(nertags))
    finally:
        # Release the models even if one of the analysis steps fails.
        segmentor.release()
        postagger.release()
        recognizer.release()
    return word_list
Beispiel #3
0
def namedEntityRecognize(sentence):
    '''
        Run named-entity recognition on a sentence with the pyltp module.
        Returns: 1) a list of (word, entity tag) tuples, 2) the list of entity tags
    '''
    # Word segmentation, using the user dictionary as an extra lexicon
    cws = Segmentor()
    cws.load_with_lexicon(inout.getLTPPath(index.CWS),
                          inout.getResourcePath('userDic.txt'))
    words = cws.segment(sentence)
    cws.release()

    # Part-of-speech tagging
    tagger = Postagger()
    tagger.load(inout.getLTPPath(index.POS))
    postags = tagger.postag(words)
    tagger.release()

    # Named-entity recognition
    ner = NamedEntityRecognizer()
    ner.load(inout.getLTPPath(index.NER))
    netags = ner.recognize(words, postags)
    ner.release()

    # Pair every word with its entity tag
    namedEntityTagTupleList = list(zip(words, netags))

    # Round-trip through a tab-joined string to obtain a plain list of tags
    joined_tags = '\t'.join(netags)
    neTagList = joined_tags.split('\t')

    return namedEntityTagTupleList, neTagList
def ltp_ner_data():
    """Run LTP named-entity recognition over the POS-tagged QC data files.

    Two files are processed: ``const.qc_train_pos`` is converted into
    ``const.qc_train_ner`` and ``const.qc_test_pos`` into
    ``const.qc_test_ner``.  Each input line is split on tabs; the first
    field is copied through unchanged and the second field is read as
    space-separated ``word/_postag`` items.  The output line holds the first
    field, a tab, and the space-separated ``word/_netag`` items.
    """
    # Raw string: the Windows path contains backslashes that would otherwise
    # be parsed as (invalid) escape sequences.  The value is unchanged.
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # LTP model directory
    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # NER model file, named `ner.model`

    from pyltp import NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()  # create the instance
    recognizer.load(ner_model_path)  # load the model

    file_pairs = [(const.qc_train_pos, const.qc_train_ner),
                  (const.qc_test_pos, const.qc_test_ner)]
    try:
        for pos_path, ner_path in file_pairs:
            result = []
            with open(pos_path, 'r', encoding='utf-8') as f:
                for line in f:
                    attr = line.strip().split('\t')
                    word_pos_pairs = [item.split('/_')
                                      for item in attr[1].split(" ")]
                    words = [pair[0] for pair in word_pos_pairs]
                    postags = [pair[1] for pair in word_pos_pairs]
                    netags = recognizer.recognize(words, postags)
                    res = ' '.join(
                        "{}/_{}".format(word, netag)
                        for word, netag in zip(words, netags))
                    result.append("{}\t{}\n".format(attr[0], res))
            with open(ner_path, 'w', encoding='utf-8') as f:
                f.writelines(result)
    finally:
        # Release the model even if reading, tagging or writing fails.
        recognizer.release()
Beispiel #5
0
    def __init__(self):
        """Create the LAC, DDParser, keyword, jieba and LTP components.

        The LTP segmentor, POS tagger, parser and recognizer are constructed
        directly from model paths under ``./ltp_data``.
        """
        ltp_root = "./ltp_data"

        # Lexical analysis with user customization
        lac = LAC(mode='lac')
        self.lac = lac
        lac.load_customization('data/custom.txt', sep=None)

        self.ddparser = DDParser(encoding_model='transformer')
        # Stored as-is: the class object itself, not an instance
        self.fine_info = FineGrainedInfo
        self.keyword = Keyword()

        # Keep handles on the jieba module and its POS-tagging submodule
        self.jieba = jieba
        self.posseg = jieba.posseg

        # LTP components, each built from its model file
        cws_path = os.path.join(ltp_root, "cws.model")
        self.segmentor = Segmentor(cws_path)

        pos_path = os.path.join(ltp_root, "pos.model")
        self.postagger = Postagger(model_path=pos_path)

        parser_path = os.path.join(ltp_root, "parser.model")
        self.parser = Parser(parser_path)

        ner_path = os.path.join(ltp_root, "ner.model")
        self.recognizer = NamedEntityRecognizer(ner_path)
Beispiel #6
0
    def restart(self):
        """Release all five LTP models, then create and load fresh ones.

        Model files are read from ``self.MODELDIR``.
        """
        # Phase 1: drop every currently loaded model
        for attr in ("segmentor", "postagger", "recognizer", "parser",
                     "srler"):
            getattr(self, attr).release()

        # Phase 2: rebuild each component in the same order
        components = (
            ("segmentor", Segmentor, "cws.model"),
            ("postagger", Postagger, "pos.model"),
            ("recognizer", NamedEntityRecognizer, "ner.model"),
            ("parser", Parser, "parser.model"),
            ("srler", SementicRoleLabeller, "pisrl.model"),
        )
        for attr, factory, model_file in components:
            setattr(self, attr, factory())
            getattr(self, attr).load(os.path.join(self.MODELDIR, model_file))
Beispiel #7
0
    def __init__(self):
        """Load the five LTP models from ``./ltp_data``.

        The segmentor is loaded together with the lexicon ``./dict.txt``.
        """
        model_root = "./ltp_data"

        # Word segmentation with an extra lexicon
        cws_model = os.path.join(model_root, "cws.model")
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model, './dict.txt')

        # Part-of-speech tagging
        pos_model = os.path.join(model_root, "pos.model")
        self.postagger = Postagger()
        self.postagger.load(pos_model)

        # Dependency parsing
        parser_model = os.path.join(model_root, "parser.model")
        self.parser = Parser()
        self.parser.load(parser_model)

        # Named-entity recognition
        ner_model = os.path.join(model_root, "ner.model")
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model)

        # Semantic role labelling
        srl_model = os.path.join(model_root, 'pisrl.model')
        self.labeller = SementicRoleLabeller()
        self.labeller.load(srl_model)
Beispiel #8
0
    def __init__(self):
        """Load the LTP segmentor, POS tagger, parser, recognizer and SRL models."""
        LTP_PATH = '/root/tmp/pycharm_project_96/pyltp_test/ltp_data'

        # Word segmentation
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_PATH, 'cws.model'))
        # Part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_PATH, 'pos.model'))
        # Dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_PATH, 'parser.model'))
        # Named-entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_PATH, 'ner.model'))
        # Semantic role labelling.  The model must be passed to load(), like
        # every other component here; label() is the inference call and was
        # being handed the model path by mistake.
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_PATH, 'pisrl.model'))
Beispiel #9
0
    def __init__(self):
        """Load the five LTP models from the Windows model directory.

        A marker value is printed before and after loading.
        """
        print(111)
        model_root = "D:\\ltp_data\\ltp_data_v3.4.0"

        # Word segmentation
        segmentor = Segmentor()
        segmentor.load(os.path.join(model_root, "cws.model"))
        self.segmentor = segmentor

        # Part-of-speech tagging
        postagger = Postagger()
        postagger.load(os.path.join(model_root, "pos.model"))
        self.postagger = postagger

        # Dependency parsing
        parser = Parser()
        parser.load(os.path.join(model_root, "parser.model"))
        self.parser = parser

        # Named-entity recognition
        recognizer = NamedEntityRecognizer()
        recognizer.load(os.path.join(model_root, "ner.model"))
        self.recognizer = recognizer

        # Semantic role labelling (Windows build of the model)
        labeller = SementicRoleLabeller()
        labeller.load(os.path.join(model_root, 'pisrl_win.model'))
        self.labeller = labeller
        print(111)
Beispiel #10
0
 def getNameRecognizer(self):
     """Return the recognizer cached on ``Config``, creating and loading it on first use."""
     if not Config.c_namerecognizer:
         # Nothing cached yet: build the recognizer and load the NER model
         model_path = os.path.join(Config.ltp_data_dir, Config.ner_model)
         Config.c_namerecognizer = NamedEntityRecognizer()
         Config.c_namerecognizer.load(model_path)
     return Config.c_namerecognizer
 def __init__(self):
     """Load the LTP models found under ``SETTINGS.LTP_DATA_DIR``.

     The segmentor and the POS tagger are loaded with the extra lexicons
     ``./segName`` and ``./postagName`` respectively.
     """
     model_dir = SETTINGS.LTP_DATA_DIR  # LTP model directory

     def model_path(name):
         # Build the path of one model inside the LTP model directory
         return os.path.join(model_dir, name)

     cws_model_path = model_path('cws.model')     # segmentation model
     pos_model_path = model_path('pos.model')     # POS tagging model
     par_model_path = model_path('parser.model')  # dependency parsing model
     srl_model_path = model_path('srl')           # SRL model; a directory, not a file
     ner_model_path = model_path('ner.model')     # named-entity recognition model

     segmentor = Segmentor()
     segmentor.load_with_lexicon(cws_model_path, './segName')
     self.segmentor = segmentor

     postagger = Postagger()
     postagger.load_with_lexicon(pos_model_path, './postagName')
     self.postagger = postagger

     parser = Parser()
     parser.load(par_model_path)
     self.parser = parser

     labeller = SementicRoleLabeller()
     labeller.load(srl_model_path)
     self.labeller = labeller

     recognizer = NamedEntityRecognizer()
     recognizer.load(ner_model_path)
     self.recognizer = recognizer
    def __init__(self):
        """Load the five LTP models from ``LTP_DIR``, printing a message before and after.

        NOTE(review): ``LTP_DIR`` is not assigned in this method; it is
        presumably defined at module level -- confirm.
        """
        print("加载模型路径", LTP_DIR)

        # (attribute name, component class, model file), in load order
        specs = (
            ("segmentor", Segmentor, "cws.model"),
            ("postagger", Postagger, "pos.model"),
            ("parser", Parser, "parser.model"),
            ("recognizer", NamedEntityRecognizer, "ner.model"),
            ("labeller", SementicRoleLabeller, 'pisrl.model'),
        )
        for attr, factory, model_file in specs:
            setattr(self, attr, factory())
            getattr(self, attr).load(os.path.join(LTP_DIR, model_file))

        print("加载完毕")
Beispiel #13
0
    def __init__(self, **kwargs):
        """
        Create the LTP components and load only the models that are needed.

        The segmentation model is always loaded.  The POS model is loaded
        when either ``pos`` or ``ner`` is requested, and the NER model only
        when ``ner`` is requested.

        Args:
            annotators: set that can include pos and ner.
            model: ltp model to use (path).
                NOTE(review): this key is not read here; the model paths
                come from names defined outside this method.
        """
        self.segmentor = Segmentor()  # segmentor instance
        self.recognizer = NamedEntityRecognizer()  # named-entity recognizer instance
        self.postagger = Postagger()  # POS tagger instance

        self.segmentor.load(cws_model_path)  # load the segmentation model

        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        # The POS model is needed for both annotators.  Load it exactly once;
        # previously it was loaded twice when both 'pos' and 'ner' were set.
        if {'pos', 'ner'} & self.annotators:
            self.postagger.load(pos_model_path)
        if {'ner'} & self.annotators:
            self.recognizer.load(ner_model_path)
def dependency_parsing(ltp_model_path, sents, postags, said):
    """Extract (subject, verb, remainder) triples from pre-tagged sentences.

    A sentence is considered only if its NER tags contain at least one of
    ``S-Ns``, ``S-Ni`` or ``S-Nh``.  For every ``SBV`` dependency arc whose
    head word is a member of ``said``, a triple is collected consisting of
    the dependent word, the head word, and all words after the head word
    joined together.

    Args:
        ltp_model_path: Directory containing ``parser.model`` and
            ``ner.model``.
        sents: Sentences as whitespace-separated word strings.
        postags: Per-sentence POS tag sequences, indexed like ``sents``.
        said: Container of verbs to accept as the head of an ``SBV`` arc.

    Returns:
        A list of ``(subject_word, verb_word, remainder_text)`` tuples.
    """
    LTP_DATA_DIR = ltp_model_path  # LTP model directory
    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # dependency parsing model, `parser.model`
    ner_model_path = os.path.join(
        LTP_DATA_DIR, 'ner.model')  # named-entity recognition model, `ner.model`

    from pyltp import Parser, NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()  # create the instance
    recognizer.load(ner_model_path)  # load the model

    parser = Parser()  # create the instance
    parser.load(par_model_path)  # load the model

    contents = []
    try:
        for index, sent in enumerate(sents):
            wo = sent.split()
            po = postags[index]

            netags = list(recognizer.recognize(wo, po))
            # Skip sentences without a single-word place, organization or
            # person entity.
            if not any(tag in netags for tag in ('S-Ns', 'S-Ni', 'S-Nh')):
                continue

            for subject, arc in enumerate(parser.parse(wo, po)):
                if arc.relation != 'SBV':
                    continue
                # arc.head is used as a 1-based word position, so the head
                # word itself sits at index head - 1.
                verb = arc.head
                if wo[verb - 1] not in said:
                    continue

                contents.append(
                    (wo[subject], wo[verb - 1], ''.join(wo[verb:])))
    finally:
        # Only plain Python strings are returned, so the native models can
        # be freed here; previously they were never released.
        parser.release()
        recognizer.release()
    return contents
Beispiel #15
0
def name_recognize_one():
    """Demonstrate the pyltp pipeline on a fixed paragraph, printing each stage.

    The second sentence of the paragraph is segmented, POS-tagged, parsed
    and NER-tagged; a fixed, pre-segmented word list is then NER-tagged as
    well.  Model paths are taken from ``sg_model_path``, ``ps_model_path``,
    ``pr_model_path`` and ``ner_model_path``, which are defined outside this
    function.
    """
    from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

    paragraph = '叙利亚东古塔地区。7日发生疑似化学武器袭击事件,导致70余人丧生。报道一出,叙利亚反对派、美国、英国、法国等纷纷指责叙政府军使用化学武器袭击无辜平民。但叙利亚坚决否认,并指责西方和叙反对派造谣,目的是保护被围困的恐怖分子。俄外交部则认为,该谣言旨在袒护恐怖分子,并为外部势力发动打击寻找借口。'

    # Sentence splitting: work on the second sentence only
    sentence = SentenceSplitter.split(paragraph)[1]
    print('split {}'.format(sentence))

    # Word segmentation
    segmentor = Segmentor()
    segmentor.load(sg_model_path)
    words = segmentor.segment(sentence)
    print('|'.join(words))

    # Part-of-speech tagging.  Going through a dict means a word that occurs
    # several times is printed once, with its last tag.
    postagger = Postagger()
    postagger.load(ps_model_path)
    postags = postagger.postag(words)
    for k, v in dict(zip(words, postags)).items():
        print(k, v)

    # Dependency parsing
    parser = Parser()
    parser.load(pr_model_path)
    arcs = parser.parse(words, postags)
    print(' '.join('%d:%s ' % (arc.head, arc.relation) for arc in arcs))

    print('#' * 8)
    # Named-entity recognition on the segmented sentence
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netag = recognizer.recognize(words, postags)
    for word, ntag in zip(words, netag):
        if ntag != 'O':
            # Print the tag of this word; the whole tag sequence `netag` was
            # being concatenated here by mistake, which raises TypeError.
            print(word + ' / ' + ntag)
    print(' / '.join(netag))

    # Named-entity recognition on a fixed, pre-segmented example
    word_list = ['欧几里得', '是', '西元前', '三', '世纪', '的', '希腊', '数学家', '。']
    postags_list = ['nh', 'v', 'nt', 'm', 'n', 'u', 'ns', 'n', 'wp']
    nertags = recognizer.recognize(word_list, postags_list)
    for word, ntag in zip(word_list, nertags):
        if ntag != 'O':
            print(word + '/' + ntag)
    print(' '.join(nertags))

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
Beispiel #16
0
def get_all_name(r_filename, w_file):
    """Extract names from each line of a file and write the related text per name.

    Every line of ``r_filename`` is cleaned, segmented, POS-tagged and
    NER-tagged.  When ``get_name`` finds names, ``get_some_idea`` is called
    with the line and the names; for each key of its result one line is
    written to ``w_file``: the key, a tab, and the distinct second elements
    of that key's entries joined by tabs.

    ``raplace_line_feed``, ``more_space_to_one``, ``get_name`` and
    ``get_some_idea`` are helpers defined outside this function.

    Args:
        r_filename: Path of the UTF-8 input file.
        w_file: Path of the UTF-8 output file.
    """
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # LTP model directory

    # Word segmentation
    segmentor = Segmentor()
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))

    # Part-of-speech tagging
    postagger = Postagger()
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

    # Named-entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))

    try:
        with open(r_filename, "r", encoding="utf-8") as f_r, \
                open(w_file, "w", encoding="utf-8") as f_w:
            for line in f_r:
                # Strip real newlines.  The raw string r"\n" stripped
                # backslash and 'n' characters from both ends instead.
                line = line.strip("\n")
                line = raplace_line_feed(line)
                line = more_space_to_one(line)
                print(line)
                words = segmentor.segment(line)
                postags = postagger.postag(words)
                netags = recognizer.recognize(words, postags)
                name_list = get_name(netags, words)
                if name_list != []:
                    print(name_list)
                    sen = get_some_idea(line, name_list)
                    print(sen)
                    if sen:
                        for key in sen:
                            # De-duplicate the second element of each entry
                            sens = "\t".join({data[1] for data in sen[key]})
                            f_w.write(key + "\t" + sens + "\n")
    finally:
        # Free the native models even if processing fails part-way.
        segmentor.release()
        postagger.release()
        recognizer.release()
Beispiel #17
0
def ner(words, postags):
    """Tag ``words`` with named-entity labels, print each word/tag pair and return the tags.

    The model is loaded from ``ner.model`` under ``LTP_DATA_DIR`` and
    released before returning.
    """
    model_file = os.path.join(LTP_DATA_DIR, 'ner.model')
    tagger = NamedEntityRecognizer()
    tagger.load(model_file)

    netags = tagger.recognize(words, postags)
    for token, label in zip(words, netags):
        print(token + '/' + label)

    tagger.release()
    return netags
Beispiel #18
0
def get_ner(words, postags):
    """Return the LTP named-entity tags for ``words`` as a list.

    The model is loaded from ``ner.model`` under ``LTP_TOP_DIR`` and
    released before the result is converted.
    """
    tagger = NamedEntityRecognizer()
    tagger.load(os.path.join(LTP_TOP_DIR, 'ner.model'))
    tags = tagger.recognize(words, postags)
    tagger.release()
    return list(tags)
def ltp_name_entity_recognizer(LTP_DATA_DIR, words, postags):
    """Run LTP named-entity recognition with the model in ``LTP_DATA_DIR``.

    Args:
        LTP_DATA_DIR: Directory that contains ``ner.model``.
        words: Segmented words.
        postags: POS tags matching ``words``.

    Returns:
        The tag sequence produced by the recognizer.
    """
    tagger = NamedEntityRecognizer()
    tagger.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    tags = tagger.recognize(words, postags)
    tagger.release()
    return tags
Beispiel #20
0
def ner(words, postags):
    """Tag ``words`` with named-entity labels, print each word/tag pair and return the tags."""
    tagger = NamedEntityRecognizer()
    tagger.load('/Users/chenming/Spyder/3.3.1/ltp_data/ner.model')

    netags = tagger.recognize(words, postags)
    for token, label in zip(words, netags):
        print(token + '/' + label)

    # Free the model before handing back the tags
    tagger.release()
    return netags
Beispiel #21
0
 def entity_recognize(cutting_list, tagging_list):
     """Return the LTP named-entity tags for pre-segmented, POS-tagged input.

     Args:
         cutting_list: Segmented words.
         tagging_list: POS tags matching ``cutting_list``.
     """
     model_file = os.path.join(LtpParser.ltp_path, 'ner.model')
     from pyltp import NamedEntityRecognizer
     tagger = NamedEntityRecognizer()
     tagger.load(model_file)
     result = tagger.recognize(cutting_list, tagging_list)
     tagger.release()
     return result
def e_recognize(words, postags):
    """Return the LTP named-entity tags for ``words``.

    The model path comes from ``ner_model_path``, which is defined outside
    this function; the model is released before returning.
    """
    tagger = NamedEntityRecognizer()
    tagger.load(ner_model_path)
    tags = tagger.recognize(words, postags)
    tagger.release()
    return tags
Beispiel #23
0
def ner(words, postags):
    """Return the LTP named-entity tags for ``words``, using ``../ltp_data/ner.model``."""
    tagger = NamedEntityRecognizer()
    tagger.load('../ltp_data/ner.model')
    tags = tagger.recognize(words, postags)
    # Free the model before handing back the tags
    tagger.release()
    return tags
Beispiel #24
0
 def get_ner(self, word_list, postag_list, model):
     """Return the LTP named-entity tags for ``word_list`` as a list.

     Args:
         word_list: Segmented words.
         postag_list: POS tags matching ``word_list``.
         model: Path of the NER model to load.
     """
     tagger = NamedEntityRecognizer()
     tagger.load(model)
     tags = tagger.recognize(word_list, postag_list)
     tagger.release()
     return list(tags)
Beispiel #25
0
class LtpLanguageAnalysis(object):
    """Wrapper around the LTP segmentor, POS tagger, parser and recognizer."""

    def __init__(self, model_dir="D:/ltp_data_v3.4.0"):
        """Load the four LTP models from ``model_dir``."""
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(model_dir, "ner.model"))  # NER model

    def analyze(self, text):
        """Segment ``text`` and print the words separated by tabs."""
        words = self.segmentor.segment(text)
        print('\t'.join(words))

    def postags(self, words):
        """Return the POS tags of ``words`` as a list."""
        postags = self.postagger.postag(words)
        return list(postags)

    def parse(self, words, postags):
        """Return the dependency relation of every word, joined by tabs."""
        arcs = self.parser.parse(words, postags)
        return "\t".join(arc.relation for arc in arcs)

    def ner(self, words, postags):
        """Print each word with a non-``O`` entity tag, then all tags joined by tabs."""
        netag = self.recognizer.recognize(words, postags)
        for word, ntag in zip(words, netag):
            if ntag != 'O':
                print(word + '/' + ntag)
        print("\t".join(netag))

    def release_model(self):
        """Release every model loaded in ``__init__``."""
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        # The recognizer is loaded in __init__ as well, but was never
        # released here.
        self.recognizer.release()
    def __init__(self, data_dir):
        """Load the segmentation, POS, NER and parser models from ``data_dir``.

        Args:
            data_dir: LTP model directory; also stored as ``self.LTP_DATA_DIR``.
        """
        self.LTP_DATA_DIR = data_dir

        # Word segmentation model, `cws.model`
        segmentor = Segmentor()
        segmentor.load(os.path.join(self.LTP_DATA_DIR, 'cws.model'))
        self.segmentor = segmentor

        # POS tagging model, `pos.model`
        postagger = Postagger()
        postagger.load(os.path.join(self.LTP_DATA_DIR, 'pos.model'))
        self.postagger = postagger

        # Named-entity recognition model, `ner.model`
        recognizer = NamedEntityRecognizer()
        recognizer.load(os.path.join(self.LTP_DATA_DIR, 'ner.model'))
        self.recognizer = recognizer

        # Dependency parsing model, `parser.model`
        parser = Parser()
        parser.load(os.path.join(self.LTP_DATA_DIR, 'parser.model'))
        self.parser = parser
Beispiel #27
0
    def __init__(self):
        """Load the five LTP models; the segmentor and POS tagger also load a user dictionary."""
        model_root = "../../res/ltp/ltp_data_v3.4.0"
        user_root = "******"

        # Word segmentation with the user dictionary
        cws_model = os.path.join(model_root, "cws.model")
        cws_lexicon = os.path.join(user_root, "fulluserdict.txt")
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model, cws_lexicon)

        # POS tagging with the same user dictionary
        pos_model = os.path.join(model_root, "pos.model")
        pos_lexicon = os.path.join(user_root, "fulluserdict.txt")
        self.postagger = Postagger()
        self.postagger.load_with_lexicon(pos_model, pos_lexicon)

        # Dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(model_root, "parser.model"))

        # Named-entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(model_root, "ner.model"))

        # Semantic role labelling (Windows build of the model)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(model_root, 'pisrl_win.model'))
    def segment(self, texts, use_tag_filter=True):
        """Run segmentation, POS tagging, NER and dependency parsing on each text.

        Each text is lower-cased and segmented; words of length one and
        words found in ``self.stop_words`` are dropped before tagging.

        Args:
            texts: Iterable of strings to analyse.
            use_tag_filter: When true, the returned word list of each text
                is reduced to the distinct words whose POS tag is in
                ``self.tags_filter``.  The other returned lists are not
                filtered, so they no longer line up with the words.

        Returns:
            A tuple ``(words, pos, ner, rel, hea)`` of lists with one entry
            per text: the words, POS tags, NER tags, dependency relations
            and head words (``'Root'`` for a head index of 0).
        """
        words = []
        pos = []
        ner = []
        rel = []
        hea = []

        segmentor = Segmentor()
        # The second argument is the path of the custom dictionary
        segmentor.load_with_lexicon(self.cws_model_path, './dict/user_recg.dic')

        postagger = Postagger()
        postagger.load(self.pos_model_path)

        recognizer = NamedEntityRecognizer()
        recognizer.load(self.ner_model_path)

        parser = Parser()
        parser.load(self.pas_model_path)

        try:
            for text in texts:
                text = text.lower()

                word_list = segmentor.segment(text)
                word_list = [word for word in word_list if len(word) > 1]
                # Drop stop words
                word_list = [word.strip() for word in word_list
                             if word.strip() not in self.stop_words]

                # POS tagging
                postags = list(postagger.postag(word_list))

                # Named-entity recognition
                netags = recognizer.recognize(word_list, postags)

                # Dependency parsing
                arcs = parser.parse(word_list, postags)
                relation = [arc.relation for arc in arcs]
                # Head index 0 denotes the root; otherwise it is a 1-based
                # position in word_list.
                heads = ['Root' if arc.head == 0 else word_list[arc.head - 1]
                         for arc in arcs]

                if use_tag_filter:
                    tag_by_word = dict(zip(word_list, postags))
                    word_list = [word for word, tag in tag_by_word.items()
                                 if tag in self.tags_filter]

                words.append(word_list)
                pos.append(postags)
                ner.append(netags)
                rel.append(relation)
                hea.append(heads)
        finally:
            # Free the native models even when a text fails to process.
            segmentor.release()
            postagger.release()
            recognizer.release()
            parser.release()

        return words, pos, ner, rel, hea
Beispiel #29
0
    def __init__(self):
        """Load the five LTP models; the segmentor also loads a custom dictionary."""
        # Raw strings: the backslashes in these Windows-style paths are
        # literal.  Without the prefix, "\l" and "\D" are invalid escape
        # sequences that newer Python versions warn about.  The values are
        # unchanged.
        LTP_DIR = r"data\ltp_data"
        cws_model_path = os.path.join(LTP_DIR, 'cws.model')  # segmentation model, `cws.model`
        lexicon_path = r"dictionary\Dir1.txt"  # path of the custom dictionary
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)

        # Part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        # Dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        # Named-entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        # Semantic role labelling (Windows build of the model)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
Beispiel #30
0
class LTP:
    """Bundle of an LTP segmentor, POS tagger, NER tagger and dependency parser."""

    def __init__(self, base_dir, is_custom_seg_dict=False):
        """Load the models from ``base_dir``, or from ``lib/ltp_data_v3.4.0`` when it is None."""
        model_dir = 'lib/ltp_data_v3.4.0' if base_dir is None else base_dir
        self.init(model_dir, is_custom_seg_dict)

    def init(self, base_dir, is_custom_seg_dict):
        """Create and load every component.

        Args:
            base_dir: Directory containing the LTP model files.
            is_custom_seg_dict: When true, the segmentor is loaded together
                with ``vertical_domain_baike_dict.txt`` from ``dict_dir``
                (a name defined outside this class).
        """
        def model(name):
            # Path of one model file inside base_dir
            return os.path.join(base_dir, name)

        cws_path = model('cws.model')
        pos_path = model('pos.model')
        ner_path = model('ner.model')
        dep_path = model('parser.model')
        lexicon_path = os.path.join(dict_dir,
                                    'vertical_domain_baike_dict.txt')

        segmentor = Segmentor()
        self.segmentor = segmentor
        if not is_custom_seg_dict:
            segmentor.load(cws_path)
        else:
            segmentor.load_with_lexicon(cws_path, lexicon_path)

        tagger = Postagger()
        self.tagger = tagger
        tagger.load(pos_path)

        nertagger = NamedEntityRecognizer()
        self.nertagger = nertagger
        nertagger.load(ner_path)

        parser = Parser()
        self.parser = parser
        parser.load(dep_path)

    def parse(self, sentence, parse_tree=True):
        """Analyse ``sentence`` and wrap the results in an ``LTPResult``.

        The dependency arcs are computed only when ``parse_tree`` is true;
        otherwise ``None`` is passed in their place.
        """
        words = list(self.segmentor.segment(sentence))
        tags = list(self.tagger.postag(words))
        ner_tags = list(self.nertagger.recognize(words, tags))
        arcs = list(self.parser.parse(words, tags)) if parse_tree else None
        return LTPResult(words, tags, ner_tags, arcs, sentence)

    def cut(self, sentence):
        """Return the segmentor output for ``sentence``."""
        return self.segmentor.segment(sentence)
Beispiel #31
0
def test(sentence):
    """Analyse ``sentence`` with LTP and print one line per word.

    Each printed line holds the word, its POS tag and its NER tag joined by
    commas, followed by ``head:relation`` of its dependency arc.  The
    Stanford parser environment variables are set and a ``StanfordParser``
    is constructed first; that parser is not used afterwards in this
    function.
    """
    # Stanford parser configuration
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER_PATH
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS_PATH
    os.environ['JAVAHOME'] = JAVA_HOME
    s_parser = stanford.StanfordParser(model_path=CHINESE_MODEL_PATH)

    # Dependency parsing model, `parser.model`
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

    from pyltp import Parser, Segmentor, Postagger, NamedEntityRecognizer

    parser = Parser()
    parser.load(par_model_path)

    # Word segmentation model, `cws.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor = Segmentor()
    segmentor.load(cws_model_path)

    # POS tagging model, `pos.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger = Postagger()
    postagger.load(pos_model_path)

    # Named-entity recognition model, `ner.model`
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)

    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)
    arcs = parser.parse(words, postags)

    for word, postag, netag, arc in zip(words, postags, netags, arcs):
        print(','.join((word, postag, netag)),
              str(arc.head) + ':' + arc.relation)
Beispiel #32
0
    def __init__(self):
        """Segment, POS-tag and NER-tag ``data``, storing the results on the instance.

        Each model is loaded, used once and released straight away.  The
        model paths are built from ``self.LTP_DATA_DIR``.

        NOTE(review): ``data`` is not defined in this method; it is
        presumably a module-level value -- confirm.
        """
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model

        # Word segmentation
        cws = Segmentor()
        cws.load(self.cws_model_path)
        self.words = cws.segment(data)
        cws.release()

        # Part-of-speech tagging
        tagger = Postagger()
        tagger.load(self.pos_model_path)
        self.postags = tagger.postag(self.words)
        tagger.release()

        # Named-entity recognition
        ner = NamedEntityRecognizer()
        ner.load(self.ner_model_path)
        self.netags = ner.recognize(self.words, self.postags)
        ner.release()
Beispiel #33
0
# The print calls below use the single-argument call form, which produces the
# same output under Python 2 and Python 3.
print("\t".join(words))

# Part-of-speech tagging
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# A plain list of strings is supported as input since 0.1.5, e.g.
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

# Dependency parsing
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

# Named-entity recognition
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

# Semantic role labelling
labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print("%s %s" % (role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])))

# Release every model that was loaded; the recognizer and the labeller were
# previously left loaded.
segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
Beispiel #34
0

import sys, os

# Make the sibling "lib" directory (one level above this file) importable.
# This has to run before the pyltp import below.
ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path.append(os.path.join(ROOTDIR, "lib"))
# Set your own model path
MODELDIR = os.path.join("/home/fish/", "ltp_data")
from pyltp import Segmentor, Postagger, NamedEntityRecognizer  # @UnresolvedImport

# Module-level models, loaded once at import time: word segmentation,
# part-of-speech tagging and named-entity recognition.
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))


def ltp(sentence):
    words = segmentor.segment(sentence)
    # 词性标注功能
    postags = postagger.postag(words)
    # 实体识别
    netags = recognizer.recognize(words, postags)
    l = []
    li = zip(list(words), list(postags), list(netags))
    for a, b, c in li:
        # 去掉命名实体
        if c == "O":
            #             去掉所有名词
Beispiel #35
0
def mingming_shiti(words,postags):
    """Named entities: organization name (Ni), person name (Nh), place name (Ns).

    Loads ``ner.model`` from ``MODELDIR``, tags ``words`` using ``postags``
    and prints the resulting tags separated by tabs.
    """
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    # NOTE(review): the recognizer is not released in the lines visible here.
    print ("\t".join(netags))