Code Example #1
File: ltp.py Project: deepwindlee/NLP_Course
    def role_label(self, words, postags, arcs):
        """
        语义角色标注
        :param words:
        :param postags:
        :param arcs:
        :return:
        """
        srl_model = os.path.join(self.MODEL_PATH, 'pisrl_win.model')

        labeller = SementicRoleLabeller()  # initialize the instance
        labeller.load(srl_model)  # load the model

        roles = labeller.label(words, postags, arcs)  # semantic role labeling

        for role in roles:
            print(
                role.index, "".join([
                    "{0}:({1},{2})".format(arg.name, arg.range.start,
                                           arg.range.end)
                    for arg in role.arguments
                ]))
        labeller.release()

        return "roles{}".format(roles)
Code Example #2
class LTP_word():
    """docstring for parser_word
    deal处理文本,返回词表、词性及依存关系,语义,命名实体五个值
    release释放缓存"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # initialize the segmentor instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # initialize the POS tagger instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the named entity recognizer instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # initialize the dependency parser instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # initialize the semantic role labeller instance
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal(self, text):  # extract everything that will be needed
        words = self.segmentor.segment(text)  # word segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labeling
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
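A possible way to drive the LTP_word class; the model directory contents (cws.model, pos.model, ner.model, parser.model, the srl model and dictionary_kfc.txt) and the sentence are assumptions, not part of the snippet above.

# Hypothetical usage of LTP_word (model_path contents are assumptions).
ltp = LTP_word('/path/to/ltp_data')
words, postags, arcs, roles, netags = ltp.deal('我爱北京天安门。')
print(list(words))
print(list(postags))
print(list(netags))
ltp.release()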
Code Example #3
File: Text.py Project: burgeon26/neg-classifier
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # segmentor
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load the model
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """
        资源释放
        """
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.labeller.release()
Code Example #4
def role(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    # labeller.load('/usr/local/src/ltp_data/srl')  # load the model
    labeller.load(srl_model_path)  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    """
    # arg.name is the semantic role type
    # arg.range.start is the index of the argument's first word
    # arg.range.end is the index of the argument's last word
    roletype = {'C-A0':'施事','A0':'施事','A1':'受事','A2':'间接对象','A3':'直接目标','A4':'直接方法','A5':'其它','ADV':'附词','BNE':'受益人'
        , 'CND': '条件','DIR':'方向','DGR':'程度','EXT':'扩展','FRQ':'频率','LOC':'地点','MNR':'方式','PRP':'目的或原因'
        , 'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有','DIS': '转折'}
    postype = {'A0':'施事','A1':'受事','A2':'间接对象','A3':'直接目标','A4':'直接方法','A5':'其它','ADV':'附词','BNE':'受益人'
        , 'CND': '条件','DIR':'方向','DGR':'程度','EXT':'扩展','FRQ':'频率','LOC':'地点','MNR':'方式','PRP':'目的或原因'
        , 'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有'}
    for role in roles:
        #print role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

        outstr = ""
        for arg in role.arguments:
            block = ''

            for num in range(arg.range.start, arg.range.end+1):
                block = block + words[num]+'[%d-%s]'%(num,postags[num])
            outstr = outstr + roletype[arg.name] + "(%s);" % block
        print '%d-%s'%(role.index,words[role.index])+ ":"+outstr
    """
    labeller.release()  # release the model
    return roles
Code Example #5
    def semantic_role_label(self):
        # dependency parsing
        parser = Parser()
        parser.load('ltp_data/parser.model')
        arcs = parser.parse(self.words, self.postags)
        parser.release()

        labeller = SementicRoleLabeller()
        labeller.load('ltp_data/srl')
        roles = labeller.label(self.words, self.postags, self.netags, arcs)

        Label_AX = []  # list of arguments labelled A0 or A1
        for role in roles:
            Label_AX.extend([
                arg for arg in role.arguments
                if arg.name == "A0" or arg.name == "A1"
            ])
        for label in Label_AX:
            # skip A0/A1 agents or patients whose span length looks abnormal
            if 0 < label.range.end - label.range.start < 10:
                for i in range(label.range.start, label.range.end + 1):
                    # keep common nouns, person names, place names and organization names
                    # inside the agent/patient span as entities
                    if self.postags[i] in ("n", "ns", "nh", "ni"):
                        self.entity.append(self.words[i])
        labeller.release()
Code Example #6
File: yu06.py Project: Minggggggggg/nlp
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load('/Users/chenming/Spyder/3.3.1/ltp_data/srl/')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print (role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
Code Example #7
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    #for role in roles:
    #   print (role.index, "".join(   ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
    return roles
Code Example #8
File: CJNLP.py Project: denggaoshan/18cmdick
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    model = "srl"
    labeller.load(os.path.join(modelPath, model))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
Code Example #9
def ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs):
    # Path to the semantic role labelling model inside the `srl` directory;
    # when developing on Windows, the pisrl_win.model file is used
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl/pisrl_win.model')
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    labeller.release()  # release the model
    return roles
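A sketch of wiring ltp_sementic_role_labeller into the rest of the pipeline; the directory, the sentence and the upstream steps are assumptions. Note that, depending on the pyltp version, the roles object may need to be copied into plain Python values before the labeller is released inside the function.

# Hypothetical end-to-end call (directory and sentence are assumptions).
import os
from pyltp import Segmentor, Postagger, Parser

LTP_DATA_DIR = '/path/to/ltp_data_v3.4.0'
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
postagger = Postagger()
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
parser = Parser()
parser.load(os.path.join(LTP_DATA_DIR, 'parser.model'))

words = list(segmentor.segment('元芳你怎么看?'))
postags = list(postagger.postag(words))
arcs = parser.parse(words, postags)

roles = ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs)
for role in roles:
    print(role.index, [(a.name, a.range.start, a.range.end) for a in role.arguments])

parser.release()
postagger.release()
segmentor.release()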
Code Example #10
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load('../ltp_data/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    # print '----------------'
    # for role in roles:
    #     print role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
    # print '----------------'
    labeller.release()  # release the model
    return roles
Code Example #11
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(os.path.join(LTP_DATA_DIR, 'srl'))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Code Example #12
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load('/Users/zhangqinyuan/Downloads/ltp_data_v3.4.0/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Code Example #13
def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load(cws_model_path)  # load the model
    segmentor.load_with_lexicon(cws_model_path,
                                'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # word segmentation
    k = 1
    for word in words:
        print(word + str(k) + '  ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    segmentor.release()  # release the model
    wordslist = list(words)

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path,
                                'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)
    print('\t'.join(postags))
    postagger.release()

    # wordslist = ['人力资源社会保障局','主管','医疗保险','工作']
    # postags = ['n','v','n','v']

    from pyltp import Parser
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency parsing result
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labeling

    # print the result
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Code Example #14
    def get_role_list(self, words, postags):
        parser = Parser()
        parser.load(Dependency.par_model)

        rolelabel = SementicRoleLabeller()
        rolelabel.load(Dependency.pisrl_model)
        try:
            parsers = parser.parse(words, postags)
            roles = rolelabel.label(words, postags, parsers)
        except Exception as e:
            roles = [[]]
        finally:
            parser.release()
            rolelabel.release()
            return roles
Code Example #15
 def get_roles_by_pyltp(self, words_list, postags_list, arcs_list):
     roles_list = list()
     # path to the semantic role labelling model; the model file is named 'pisrl.model'
     srl_model_path = os.path.join(self.ltp_dir_path, "pisrl.model")
     labeller = SementicRoleLabeller()
     labeller.load(srl_model_path)
     roles = labeller.label(words_list, postags_list, arcs_list)
     labeller.release()
     # attempt to free memory
     # import gc
     # del labeller
     # gc.collect()
     # never mind, that doesn't work
     roles_list = list(roles)
     return roles_list
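The snippet above builds roles_list only after labeller.release(). A slightly more defensive variant (an assumption about the lifetime of pyltp's native objects, not something the snippet confirms) copies the primitive fields out before the model is released:

# Sketch of a copy-before-release variant; field names follow pyltp's role/argument API.
def get_role_tuples(labeller, words_list, postags_list, arcs_list):
    roles = labeller.label(words_list, postags_list, arcs_list)
    plain = [(role.index,
              [(arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
             for role in roles]
    return plain  # only plain ints/strings survive, so releasing the model afterwards is safe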
Code Example #16
def get_srl(sentence):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    words = list(pyltp_cut(sentence))  # pyltp word segmentation
    postags = list(postagger.postag(words))  # POS tagging
    arcs = get_parsing(sentence)
    # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling

    # print the result
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model (outside the loop, so it is released only once)
Code Example #17
File: pyltp_eg1.py Project: shamrock222/nlp-learning
    def get_srl(self, words):
        # semantic role labeling
        labeller = SementicRoleLabeller()  # initialize the instance
        labeller.load(self.srl_model_path)  # load the model
        # arcs is the dependency parsing result
        postags = self.get_postags(words)
        arcs = self.get_dependency(words)
        roles = labeller.label(words, postags, arcs)  # 语义角色标注

        # print the result
        for role in roles:
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))
        labeller.release()  # release the model
        return roles
Code Example #18
File: parser.py Project: KDD2018/Machine-Learning
def labeller(word_tag, arcs, srl_model_path):
    '''
    Desc: semantic role labeling
    Args: word_tag(dict)  mapping of words to POS tags
          arcs            dependency arcs
          srl_model_path  path to the semantic role labelling model
    '''

    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(list(word_tag.keys()), list(word_tag.values()),
                           arcs)
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()
Code Example #19
def sentence_label(parse_result):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    i = 0
    final_result = []

    for key, value in parse_result.items():
        i += 1
        if i % 50 == 0:
            print('taking a short break')
            time.sleep(5)
        words = value[0]
        postags = value[1]
        arcs = value[2]
        roles = labeller.label(words, postags, arcs)
        final_result.append(roles)  # collect each sentence's labelling result so it is not discarded

    print('done')
    print(final_result)
    labeller.release()
Code Example #20
File: nlp_tools.py Project: DeepDarkOdyssey/exalt
class LtpParser(object):
    def __init__(self, data_dir: str):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(data_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(data_dir, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(data_dir, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(data_dir, "parser.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(data_dir, "pisrl.model"))

    def parse(self, text: str) -> dict:
        tokens = self.segmentor.segment(text)
        postags = self.postagger.postag(tokens)
        netags = self.recognizer.recognize(tokens, postags)
        arcs = self.parser.parse(tokens, postags)
        roles = self.labeller.label(tokens, postags, arcs)
        srlabels = {}
        for role in roles:
            srlabels[role.index] = {
                arg.name: {
                    "start": arg.range.start,
                    "end": arg.range.end
                }
                for arg in role.arguments
            }
        return {
            "tokens": list(tokens),
            "postags": list(postags),
            "netags": list(netags),
            "srlabels": srlabels,
        }

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
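A possible driver for LtpParser; the data_dir layout (cws.model, pos.model, ner.model, parser.model, pisrl.model) and the sentence are assumptions.

# Hypothetical usage of LtpParser (data_dir layout and sentence are assumptions).
ltp_parser = LtpParser('/path/to/ltp_data_v3.4.0')
result = ltp_parser.parse('国务院总理李克强调研上海外高桥时提出,支持上海积极探索新机制。')
print(result['tokens'])
print(result['postags'])
print(result['srlabels'])   # {predicate index: {role name: {'start': ..., 'end': ...}}}
ltp_parser.release()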
Code Example #21
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
Code Example #22
class Extractor():

    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list


    def split(self, words, postags):
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j-1 )
                self.__clause_list.append(clause)
                start = j + 1

        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []
    
    def resolve_conference(self, entity):
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref
    
    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        sents = SentenceSplitter.split(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i,x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0,root)

        prev_e1 = None
        e1      = None
        for rel in relations:

            left_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']

            if len(left_arc) > 1:
                pass
                #raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)], offset + leftmost)


            prev_e1 = e1

            right_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']

            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])

                items = [i for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items

                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)


                    e2 = None

                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)], offset+leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])

                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
Code Example #23
class SentenceParser:
    def __init__(self):
        # LTP_DIR = './ltp_data_v3.4.0'
        print("加载模型路径", LTP_DIR)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print("加载完毕")

    '''Dependency parsing --- maintain, for every word in the sentence, a dict of its dependency children (the word's out-degree).'''
    '''
        In the dependency parse, each word has exactly one incoming arc (its head) but may have several outgoing arcs.
        To present the analysis in a structured way and make information extraction easier,
        we build a child-node dict for every word:
            1) if the word's out-degree is 0, the dict is empty
            2) if the word's out-degree is n, the dict holds n entries
    '''

    def build_parse_child_dict(self, words, postags, arcs):
        """
        格式化句法分析结果
        :param words: 分词结果
        :param postags: 词性标注结果
        :param arcs: 句法分析结果
        :return: child_dict_list, format_parse_list
        """
        '''
        arcs是一个列表:
            列表元素当前单词,每个元素arc包含arc.head, arc.relation信息,
            head为指向该词(词的父节点)的下标(从1开始),relation为父节点和该词的句法关系
            *** 因为每个词只有 一个入度, 这个arc信息就表示入度信息
            
        LTP句法分析模型输出arcs:表示每个词的入度信息,父节点信息,只有一个
        返回:
            child_dict_list:是表示每个词的出度信息,就是子节点信息
            format_parse_list:每个词信息格式化:  与父节点句法关系,该词,该词下标,该词词性,父节点词,父词下标,父词词性
        '''

        child_dict_list = []
        format_parse_list = []

        # build the child-node info for every word
        for index in range(len(words)):
            child_dict = dict()
            # scan all arcs looking for children of this word
            for arc_index in range(len(arcs)):
                # if an arc points to this word, record it in child_dict
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = [arc_index]

            child_dict_list.append(child_dict)

        # build the per-word record
        # fields: [relation, word, index, POS, parent word, parent index, parent POS]  # NER info could be added too
        rely_id = [arc.head for arc in arcs]  # each word's parent id (id 0 is the Root)
        relation = [arc.relation for arc in arcs]  # each word's dependency relation
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # look up the parent word
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Semantic role labeling'''
    '''
        Only the predicates in the sentence are analysed: their arguments are extracted and
        the relation between each argument and the predicate is labelled.
    '''

    def format_labelrole(self, words, postags):
        """
        Format the semantic role labelling result
        :param words: segmentation result
        :param postags: POS tagging result
        :return: roles_dict
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        '''
        roles contains several role objects; each role corresponds to one predicate in the sentence.
            role.index is the index of the predicate,
            role.arguments are the semantic arguments of that predicate (an argument may span more than one word):
                arg.name is the semantic role type,
                arg.range.start is the index of the argument's first word (0-based),
                arg.range.end is the index of the argument's last word.
        Conceptually:
        roles = {
            'r1': {
                'args1': {'name': role type, 'range': {'start': first-word index, 'end': last-word index}},
                'args2': {'name': role type, 'range': {'start': first-word index, 'end': last-word index}},
                ...
            },
            'r2': {
                'args1': {'name': role type, 'range': {'start': first-word index, 'end': last-word index}},
                'args2': {'name': role type, 'range': {'start': first-word index, 'end': last-word index}},
                ...
            },
            ...
        }
        '''
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def close(self):
        """关闭与释放模型"""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()

    '''Main parser entry point'''
    '''
    Post-process the model outputs so that downstream data handling is easier.
        Model outputs: words, postags, ners, arcs, roles
        Processed information:
            child_dict_list: dependency parse, child-node info for every word
            format_parse_list: dependency parse, per-word info plus its (unique) parent info
            roles_dict: semantic role labelling result keyed by predicate index
    '''

    def parser_main(self, sentence):
        '''words, postags, ners, arcs are the raw LTP model outputs'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        ners = list(self.recognizer.recognize(words, postags))
        arcs = self.parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        """
        arcs中有多个arc
            arc.head 表示依存弧的父节点词的索引。ROOT节点的索引是0,第一个词开始的索引依次为1、2、3…
            arc.relation 表示依存弧的关系。
            注意:一个词最多只有一个弧指向它(即只有一个入度),但是一个词可以指向多个词(即有多个出度)
        """
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)

        return words, postags, ners, child_dict_list, format_parse_list, roles_dict
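A possible way to exercise SentenceParser; LTP_DIR is assumed to point at a directory containing the five models loaded in __init__, and the sentence is an assumption.

# Hypothetical usage of SentenceParser (LTP_DIR contents and sentence are assumptions).
sp = SentenceParser()
sentence = '李克强总理今天来我家了。'
words, postags, ners, child_dict_list, format_parse_list, roles_dict = sp.parser_main(sentence)
print(format_parse_list)   # per-word dependency records
print(roles_dict)          # {predicate index: {role name: [name, start, end]}}
sp.close()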
Code Example #24
class RequestHandler():
    def __init__(self):
        self.intents = [
            'translation', 'app', 'calc', 'match', 'radio', 'health', 'novel',
            'video', 'cinemas', 'music', 'stock', 'train', 'news', 'message',
            'map', 'weather', 'cookbook', 'tvchannel', 'flight', 'schedule',
            'riddle', 'email', 'contacts', 'bus', 'website', 'datetime',
            'poetry', 'lottery', 'chat', 'epg', 'telephone'
        ]

        self.segmentor = Segmentor()  # initialize the CWS (segmentor) instance
        self.segmentor.load(configs.cws_path)  # load the model
        self.postagger = Postagger()  # initialize the POS tagger instance
        self.postagger.load(configs.pos_path)  # load the model
        self.labeller = SementicRoleLabeller()  # initialize the SRL instance
        self.labeller.load(configs.srl_path)  # load the model
        self.parser = Parser()  # initialize the parser instance
        self.parser.load(configs.parser_path)  # load the model

        self.ac = ACAutomatons()

        self.clf_31 = NBSVM()

        self.char_vectorizer_31 = joblib.load(configs.models_path +
                                              '/nbsvm-vocab-ch.pkl')
        self.word_vectorizer_31 = joblib.load(configs.models_path +
                                              '/nbsvm-vocab-wd.pkl')
        self.clf_31 = joblib.load(configs.models_path + '/nbsvm_31.pkl')
        self.ch2_ = joblib.load(configs.models_path +
                                '/nbsvm-feature_selector.pkl')
        self.word_vectorizer_tv = joblib.load(configs.models_path +
                                              '/vocab-wd_epg-tvchannel.pkl')
        self.char_vectorizer_tv = joblib.load(configs.models_path +
                                              '/vocab-ch_epg-tvchannel.pkl')
        self.clf_tv = joblib.load(configs.models_path +
                                  '/svm_epg-tvchannel.pkl')
        self.word_vectorizer_movie = joblib.load(configs.models_path +
                                                 '/vocab-wd_video-cinemas.pkl')

        self.char_vectorizer_movie = joblib.load(configs.models_path +
                                                 '/vocab-ch_video-cinemas.pkl')
        self.clf_movie = joblib.load(configs.models_path +
                                     '/svm_video-cinemas.pkl')
        self.char_vectorizer_internet = joblib.load(
            configs.models_path + '/vocab-ch_website-app.pkl')
        self.word_vectorizer_internet = joblib.load(
            configs.models_path + '/vocab-wd_website-app.pkl')
        self.clf_internet = joblib.load(configs.models_path +
                                        '/svm_website-app.pkl')
        self.char_vectorizer_star = joblib.load(configs.models_path +
                                                '/vocab-ch_video-music.pkl')
        self.clf_star = joblib.load(configs.models_path +
                                    '/svm_video-music.pkl')

        self.word_vectorizer_star = joblib.load(configs.models_path +
                                                '/vocab-wd_video-music.pkl')
        self.char_vectorizer_video = joblib.load(configs.models_path +
                                                 '/vocab-ch_video-epg.pkl')
        self.word_vectorizer_video = joblib.load(configs.models_path +
                                                 '/vocab-wd_video-epg.pkl')
        self.clf_video = joblib.load(configs.models_path +
                                     '/svm_video-epg.pkl')

    def getResult(self, sentence):
        """1. Complete the classification in this function.

        Args:
            sentence: A string of sentence.

        Returns:
            classification: A string of the result of classification.
        """
        processed = self.preprocess(sentence)

        return self.pipeline(processed)

    def getBatchResults(self, sentencesList):
        """2. You can also complete the classification in this function,
                if you want to classify the sentences in batch.

        Args:
            sentencesList: A List of Dictionaries of ids and sentences,
                like:
                [{'id':331, 'content':'帮我打电话给张三' }, 
                 {'id':332, 'content':'帮我订一张机票!' },
                 ... ]

        Returns:
            resultsList: A List of Dictionaries of ids and results.
                The order of the list must be the same as the input list,
                like:
                [{'id':331, 'result':'telephone' }, 
                 {'id':332, 'result':'flight' },
                 ... ]
        """
        resultsList = []
        for sentence in sentencesList:
            resultDict = {}
            resultDict['id'] = sentence['id']
            resultDict['result'] = self.getResult(sentence['content'])
            resultsList.append(resultDict)

        return resultsList

    def pattern_match(self, sample):
        srl_res = self.sRLMatch(sample)
        if srl_res != None:

            return srl_res
        else:
            rul_res = self.ruleMatch(sample)
            if rul_res != None:

                return rul_res
            else:
                return None

    def ruleMatch(self, sample):
        domains = get_rule(sample['query'], self.ac)

        if len(domains) < 1:
            return None
        else:
            sorted_domains = aggregate_domains(domains)

            for each in sorted_domains:
                if each[0] == 'datetime':
                    nouns = get_nouns(sample['query'], 'festival', self.ac)

                    if len(nouns) > 0:
                        return 'datetime'
                    else:
                        continue

                elif each[0] == 'email':
                    if len(
                            set(sample['word'])
                            & set(['写', '回复', '转发', '打开', '查收', '查看', '答复'])
                    ) > 0:
                        return 'email'
                    else:
                        continue

            else:
                return None

    def sRLMatch(self, sample):
        srl_res = getSRL(sample['query'], self.segmentor, self.postagger,
                         self.parser, self.labeller)
        if len(srl_res) == 0:  # no predicate in the query, or a single entity only
            return None
        else:
            for res in srl_res:
                predicate_domains = get_predicate(res[0], self.ac)
                if len(predicate_domains) < 1:
                    continue  # no such predicate in the database
                else:
                    sorted_domains = aggregate_domains(predicate_domains)
                    for each in sorted_domains:
                        if each[0] == 'app':
                            nouns = get_nouns(res[1], 'app', self.ac)
                            if len(nouns) > 0:

                                return 'app'
                            else:
                                continue

                        elif each[0] == 'cinemas':
                            nouns = get_nouns(res[1], 'film', self.ac)
                            if len(nouns) > 0:
                                return 'Movie_stuff'
                            else:
                                continue
                        elif each[0] == 'contacts':
                            # 'nr' by POS-tagger indicates a person's name
                            if 'nr' in sample['tag']:
                                return 'contacts'
                            else:
                                continue

                        elif each[0] == 'cookbook':
                            nouns = get_nouns(res[1], 'food', self.ac)
                            if len(nouns) > 0:  # if any proper noun matches, classify into this intent

                                return 'cookbook'
                            else:
                                continue

                        elif each[0] == 'tvchannel':
                            nouns = get_nouns(res[1], 'tvchannel', self.ac)
                            if len(nouns) > 0:
                                return 'TV_stuff'
                            else:
                                continue

                        elif each[0] == 'video':
                            nouns = get_nouns(res[1], 'video', self.ac)
                            if len(nouns) > 0:
                                return 'Video_stuff'
                            else:
                                continue

                        elif each[0] == 'health':
                            nouns = get_nouns(res[1], 'disease', self.ac)
                            nouns.extend(get_nouns(res[1], 'drug', self.ac))
                            if len(nouns) > 0:
                                return 'health'
                            else:
                                continue

                        elif each[0] == 'music':
                            nouns_song = get_nouns(res[1], 'song', self.ac)
                            nouns_singer = get_nouns(res[1], 'singer', self.ac)
                            if len(nouns_song) > 0:

                                return 'music'
                            elif len(nouns_singer) > 0:
                                return 'Star_stuff'
                            else:
                                continue

                        elif each[0] == 'novel':
                            nouns = get_nouns(res[1], 'novel', self.ac)
                            if '小说' in res[1] or len(nouns) > 0:

                                return 'novel'
                            else:
                                continue

                        elif each[0] == 'poetry':
                            nouns = get_nouns(res[1], 'poet', self.ac)
                            if len(nouns) > 0:

                                return 'poetry'
                            else:
                                continue

                        elif each[0] == 'radio':
                            if len(get_nouns(res[1], 'radio', self.ac)) > 0:

                                return 'radio'
                            else:
                                continue

                        elif each[0] == 'stock':
                            nouns = get_nouns(res[1], 'stock', self.ac)
                            if len(nouns) > 0:

                                return 'stock'
                            else:
                                continue

                        elif each[0] == 'website':
                            nouns = get_nouns(res[1], 'website', self.ac)
                            if len(nouns) > 0:

                                return 'Internet_stuff'
                            else:
                                continue

    def retrieval(self, sample):
        """
        To find proper nouns, handling a single-entity query
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent, or None
        """
        pn_res = doRetrieval(sample['query'],
                             self.ac)  #look up single instance
        sorted_domains = aggregate_domains(pn_res)
        if len(sorted_domains) == 1:  #one instance
            domain = sorted_domains[0][0]
            if len(max(sorted_domains[0][1],
                       key=len)) > len(sample['query']) / 2:
                if domain == 'airline': return 'flight'
                if domain in ['railwaystation', 'airport']: return 'map'
                if domain == 'app': return 'app'
                if domain == 'contacts': return 'contacts'
                if domain in ['drug', 'disease']: return 'health'
                if domain == 'festival': return 'datetime'
                if domain in ['moviestar', 'film', 'video']: return 'video'
                if domain == 'food': return 'cookbook'
                if domain == 'novel': return 'novel'
                if domain == 'place': return 'map'
                if domain == 'poet': return 'poetry'
                if domain == 'radio': return 'radio'
                if domain in ['singer', 'song']: return 'music'
                if domain == 'sports': return 'match'
                if domain == 'stock': return 'stock'
                if domain == 'tvchannel': return 'tvchannel'
                if domain == 'website': return 'website'
            return None
        else:
            return None

    def classifyAllIntents(self, sample):
        """
        A classifier for the 31 intents, including chitchat
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_31.transform(text)
        test_wd = self.word_vectorizer_31.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        test_vec = self.ch2_.transform(test_vec)
        pred = self.clf_31.predict(test_vec)
        return pred.tolist()[0]

    def epgOrTvchannel(self, sample):
        """
        A classifier to label an instance with 'epg' or 'tvchannel'
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_tv.transform(text)
        test_wd = self.word_vectorizer_tv.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_tv.predict(test_vec)
        return pred.tolist()[0]

    def videoOrCinemas(self, sample):
        """
        A classifier to label an instance with 'video' or 'cinemas'
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_movie.transform(text)
        test_wd = self.word_vectorizer_movie.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_movie.predict(test_vec)
        return pred.tolist()[0]

    def websiteOrApp(self, sample):
        """
        A classifier to label an instance with 'website' or 'app'
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_internet.transform(text)
        test_wd = self.word_vectorizer_internet.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_internet.predict(test_vec)
        return pred.tolist()[0]

    def videoOrMusic(self, sample):
        """
        A classifier to label an instance with 'video' or 'music'
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_star.transform(text)
        test_wd = self.word_vectorizer_star.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_star.predict(test_vec)
        return pred.tolist()[0]

    def videoOrEpg(self, sample):
        """
        A classifier to label an instance with 'epg' or 'video'
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_video.transform(text)
        test_wd = self.word_vectorizer_video.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_video.predict(test_vec)
        return pred.tolist()[0]

    def pipeline(self, sample, use_pse=True, use_retrieval=False):
        """
        A pipeline to label an instance with one of 31 possible intents
        :param sample: a dict with the query and its POS tags
        :return: a string naming one intent
        """
        if use_pse:
            ps_res = prettySureExpression(sample['query'], self.ac)

            if len(list(set([_[1][0] for _ in ps_res]))) == 1:
                return ps_res[0][1][0]
        pm_res = self.pattern_match(sample)

        if pm_res == 'TV_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # a ML classifier to label 31 intentions
            if clf_res in ['epg', 'tvchannel']:
                return clf_res
            else:
                return self.epgOrTvchannel(
                    sample)  #a ML classifier to label epg or tvchannel

        elif pm_res == 'Movie_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # a ML classifier to label 31 intentions
            if clf_res in ['video', 'cinemas']:
                return clf_res
            else:
                return self.videoOrCinemas(sample)

        elif pm_res == 'Internet_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # a ML classifier to label 31 intentions
            if clf_res in ['website', 'app']:
                return clf_res
            else:
                return self.websiteOrApp(sample)

        elif pm_res == 'Star_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # a ML classifier to label 31 intentions
            if clf_res in ['video', 'music']:
                return clf_res
            else:
                return self.videoOrMusic(sample)

        elif pm_res == 'Video_stuff':
            clf_res = self.classifyAllIntents(
                sample)  # a ML classifier to label 31 intentions
            if clf_res in ['video', 'epg']:
                return clf_res
            else:
                return self.videoOrEpg(sample)

        elif pm_res == None:

            if use_retrieval:
                ret_res = self.retrieval(sample)  # retrieval() already has access to self.ac
                if ret_res == None:
                    return self.classifyAllIntents(
                        sample
                    )  # no pattern matched, so that classify it using ML
                else:
                    return ret_res
            else:
                return self.classifyAllIntents(sample)
        else:
            return pm_res

    def preprocess(self, raw_query):
        """
        To segment a raw user query into words and POS-tags it
        :param raw_query: a string generated by a user
        :return: a dict with the segmented query, the raw query and the POS tags
        """
        tmp = pseg.cut(raw_query)
        words = []
        pos = []
        for word, flag in tmp:
            words.append(word)
            pos.append(flag)
        inst = {}
        inst['tag'] = pos
        inst['word'] = words
        del words
        del pos
        inst['query'] = raw_query
        return inst

    def close(self):
        """
        To release relevant models
        """
        self.postagger.release()  # release the model
        self.segmentor.release()  # release the model
        self.labeller.release()  # release the model
        self.parser.release()  # release the model
        del self.ac
        gc.collect()
Code Example #25
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path  # e.g. '/data1/research/matt/ltp/exwords.txt'
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:
            # whether to load an extra lexicon
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)

        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labelling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # POS tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # semantic role labelling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start,
                            arg.range.end) for arg in role.arguments])
        return output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        self.output['role'] = self.ltp_labeller(self.words, self.postags,
                                                self.arcs)

        # populate the output dict
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
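A possible driver for ltp_api; MODELDIR is assumed to contain cws.model, pos.model, parser.model, ner.model and pisrl.model, and the sentence is an assumption.

# Hypothetical usage of ltp_api (MODELDIR contents and sentence are assumptions).
api = ltp_api('/path/to/ltp_data_v3.4.0')
api.get_result('中国进出口银行与中国银行加强合作。')
print(api.output['words'])
print(api.output['arcs'])
print(api.output['role'])
api.release()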
Code Example #26
class myLTP:
    def __init__(self, LTP_DATA_DIR, pattern_dir='pattern.txt'):
        self.LTP_DATA_DIR = LTP_DATA_DIR
        self.ne_pattern = self._read_ne_pattern(pattern_dir)

    def _read_ne_pattern(self, filename):
        ne_pattern = []
        with open(filename, encoding='utf8') as filein:
            for line in filein:
                if line[0] != '#':
                    np = line.split()[:2]
                    ne_pattern.append(np)
        return ne_pattern

    def find_ne_by_pattern(self, text):
        ne_dic = defaultdict(list)
        for ne_type, pattern in self.ne_pattern:
            nes = re.findall(pattern, text)
            text = re.sub(pattern, ne_type, text)
            ne_dic[ne_type].extend(nes)
        return text, ne_dic

    def load(self, index=[1, 1, 1, 1, 1]):
        """分词 词性标注 命名实体识别 句法分析 语义角色分析"""
        if index[0]:
            cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
            self.segmentor = Segmentor()
            self.segmentor.load(cws_model_path)

        if index[1]:
            pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
            self.postagger = Postagger()
            self.postagger.load(pos_model_path)

        if index[2]:
            ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
            self.recognizer = NamedEntityRecognizer()
            self.recognizer.load(ner_model_path)

        if index[3]:
            par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
            self.parser = Parser()
            self.parser.load(par_model_path)

        if index[4]:
            srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl_win.model')
            self.labeller = SementicRoleLabeller()
            self.labeller.load(srl_model_path)

    def release(self):
        try:
            self.segmentor.release()
        except:
            pass
        try:
            self.postagger.release()
        except:
            pass
        try:
            self.recognizer.release()
        except:
            pass
        try:
            self.parser.release()
        except:
            pass
        try:
            self.labeller.release()
        except:
            pass

    def split_sentence(self, text):
        """分句"""
        return SentenceSplitter.split(text)

    def word_segment(self, sentence):
        """使用结巴分词"""
        # words = self.segmentor.segment(sentence)
        words = jieba.cut(sentence)
        return list(words)

    def pos_tag(self, words):
        """词性标注"""
        postags = self.postagger.postag(words)
        return postags

    def named_entity_recognize(self, words, postags):
        """命名实体识别"""
        netags = self.recognizer.recognize(words, postags)
        return netags

    def parse(self, words, postags):
        """句法分析"""
        arcs = self.parser.parse(words, postags)  # (arc.head, arc.relation)
        return arcs

    def sementic_role_label(self, words, postags, arcs):
        """语义角色分析"""
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def _get_ne_for_sentence(self, sentence):
        """获取实体,包括通过正则表达式定义的一些实体"""

        sentence, ne_dic = self.find_ne_by_pattern(sentence)
        words = list(self.word_segment(sentence))
        postags = self.postagger.postag(words)
        ners = self.named_entity_recognize(words, postags)
        res = {}
        res['words'] = words
        res['ners'] = []
        for index, ner in enumerate(ners):
            if ner != 'O':
                if ner[0] in ('S', 'B'):
                    res['ners'].append([ner[2:], index, index + 1])
                else:
                    res['ners'][-1][-1] += 1
        for ner_type, v in ne_dic.items():
            v = iter(v)
            if v:
                for index, word in enumerate(words):
                    if word == ner_type:
                        words[index] = v.__next__()
                        res['ners'].append([ner_type, index, index + 1])
        return res

    def _get_dne_for_sentence(self, sentence):
        res = []
        s = self._get_ne_for_sentence(sentence)
        ners = s['ners']
        words = s['words']
        for entity1, entity2 in combinations(ners, 2):
            res.append((entity1, entity2, words))
        return res

    def get_dne(self, text):
        """获取实体对,人名(Nh)地名(Ns)机构名(Ni)"""
        res = []
        sentences = self.split_sentence(text)
        for sentence in sentences:
            r = self._get_dne_for_sentence(sentence)
            res.extend(r)
        return res
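A possible way to use myLTP; the pattern.txt file (lines of "NE_TYPE regex") and the LTP model directory are assumptions, and the SRL model is skipped here since get_dne does not need it.

# Hypothetical usage of myLTP (model directory and pattern.txt are assumptions).
my_ltp = myLTP('/path/to/ltp_data_v3.4.0', pattern_dir='pattern.txt')
my_ltp.load(index=[1, 1, 1, 1, 0])   # skip the SRL model; get_dne only needs segmentation, POS and NER
pairs = my_ltp.get_dne('李克强总理今天在北京会见了比尔·盖茨。')
for entity1, entity2, words in pairs:
    print(entity1, entity2)
my_ltp.release()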
Code Example #27
class OpinionExtractor(object):
    def __init__(self):
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__parser = Parser()  # 初始化实例
        self.__labeller = SementicRoleLabeller()  # 初始化实例

        self.__segmentor.load_with_lexicon(
            os.path.join(LTP_MODEL_DIR, "cws.model"),
            os.path.join(DICTIONARY_DIR, "custom_lexicon.model"))
        self.__postagger.load(os.path.join(LTP_MODEL_DIR, "pos.model"))
        self.__parser.load(os.path.join(LTP_MODEL_DIR, "parser.model"))  # 加载模型
        self.__labeller.load(os.path.join(LTP_MODEL_DIR,
                                          "pisrl.model"))  # 加载模型

        self.__adv_dict_list = self.__load_adverb_dictionary()
        self.__adv_list = self.__adv_dict_list.get("范围副词") + self.__adv_dict_list.get("频率副词") \
                          + self.__adv_dict_list.get("程度副词") + self.__adv_dict_list.get("时间副词") \
                          + self.__adv_dict_list.get("肯否副词") + self.__adv_dict_list.get("语气副词") \
                          + self.__adv_dict_list.get("情态副词")

        self.__pronoun_list = self.__load_pronoun_words()
        self.__vi_list = self.__load_intransitive_verb()
        self.__auxiliary_dict_list = self.__load_auxiliary_dictionary()
        self.__auxiliary_list = self.__auxiliary_dict_list.get(
            "语气助词") + self.__auxiliary_dict_list.get(
                "结构助词") + self.__auxiliary_dict_list.get("时态助词")

        self.__special_prefix_list = self.__load_special_prefix_words()
        self.__stopwords_list = self.__load_stopwords("之前", "是因为", "已经")

    def release(self):
        self.__labeller.release()
        self.__parser.release()
        self.__postagger.release()
        self.__segmentor.release()

    @classmethod
    def __load_stopwords(cls, *self_define_stopwords):
        """
        get stopwords list
        :param self_define_stopwords: add self define stop word to stopwords list
        :return: stopwords_list
        """
        with open(os.path.join(DICTIONARY_DIR, "stopwords.txt"),
                  "r") as stopwords_file:
            stopwords_list = [word.strip() for word in stopwords_file.readlines()]
        for stopword in self_define_stopwords:
            stopwords_list.append(stopword)
        return stopwords_list

    @classmethod
    def __load_special_prefix_words(cls):
        """
        加载特别开始词
        :return:
        """
        special_prefix_words = []
        with open(os.path.join(DICTIONARY_DIR, "special_prefix.txt"),
                  "r") as sp_file:
            for word in sp_file.readlines():
                special_prefix_words.append(word.strip())
        return special_prefix_words

    @classmethod
    def __load_intransitive_verb(cls):
        """
        加载不及物动词
        :return:
        """
        intransitive_verb = []
        with open(os.path.join(DICTIONARY_DIR, "intransitive_verb.txt"),
                  "r") as vi_file:
            for word in vi_file.readlines():
                intransitive_verb.append(word.strip())
        return intransitive_verb

    @classmethod
    def __load_pronoun_words(cls):
        """
        加载代词
        :return:
        """
        pronoun_words = []
        with open(os.path.join(DICTIONARY_DIR, "pronoun.txt"),
                  "r") as pronoun_file:
            for word in pronoun_file.readlines():
                pronoun_words.append(word.strip())
        return pronoun_words

    @classmethod
    def __load_adverb_dictionary(cls):
        """
        加载副词
        :return:
        """
        dictionary = {}
        with open(os.path.join(DICTIONARY_DIR, "adv.txt"), "r") as adv_file:
            for line in adv_file.readlines():
                index = line.index(":")
                key = line[0:index].strip()
                value = line[index + 1:].strip()
                dictionary.update({key: value.split(" ")})
        return dictionary

    @classmethod
    def __load_auxiliary_dictionary(cls):
        """
        加载助词
        :return:
        """
        dictionary = {}
        with open(os.path.join(DICTIONARY_DIR, "auxiliary.txt"),
                  "r") as adv_file:
            for line in adv_file.readlines():
                index = line.index(":")
                key = line[0:index].strip()
                value = line[index + 1:].strip()
                dictionary.update({key: value.split(" ")})
        return dictionary

    @classmethod
    def __smart_split_sentence(cls, comment):
        """
        拆分句子
        :param comment:
        :return:
        """
        # 替换空格为","
        comment = re.sub(re.compile(r"(\s+)", re.S), ",", comment.strip())
        # Split the comment into sub-clauses on Chinese/ASCII sentence delimiters
        # (a plain character class: no escaping or "|" separators are needed)
        subcomments = re.split(r'[。!,、?.!,?]', comment)
        return subcomments

    def sentence_segment_add_space(self, comment, stopwords_list={}):
        """
        使用空格间隔分词
        如:
        我们 喜欢 吃 冰激凌
        :param comment: 一条语料
        :param stopwords_list: 停用词列表
        :return:
        """
        segment = self.__segmentor.segment(self.__remove_special_word(comment))
        return segment, " ".join(segment)

    def __word_self_attention(self, parent_pos, parent_word,
                              current_arc_relation, current_arc_pos,
                              current_word):
        """
        判断词性与依存关系组合的有效性

        词注意力机制
        :param parent_pos: 父节点的词性
        :param parent_word: 父节点的词
        :param current_arc_relation: 当前节点的依存关系
        :param current_arc_pos: 当前节点的词词性
        :param current_word: 当前节点的词
        :return:
        """
        if parent_pos == Pos.v.value:
            if current_arc_relation == Dependency.SBV.value:
                return True
            if current_arc_relation == Dependency.VOB.value:
                return True
            if current_arc_relation == Dependency.FOB.value:
                return True
            if current_arc_relation == Dependency.ADV.value:
                if current_arc_pos == Pos.d.value:
                    if current_word in self.__adv_dict_list.get("肯否副词"):
                        return True
                if current_arc_pos == Pos.p.value and current_word in [
                        "由", "用"
                ]:  # 由关晓彤代言
                    return True
                if current_arc_pos == Pos.v.value:
                    return True
            if current_arc_relation == Dependency.ATT.value:
                return True
            if current_arc_relation == Dependency.CMP.value:
                return True
            # if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get("语气助词") + self.__auxiliary_dict_list.get("时态助词"):
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_list:
                return True
        elif parent_pos == Pos.a.value:
            if current_arc_relation == Dependency.SBV.value and current_word not in self.__pronoun_list:  # e.g.:材料新鲜  它很方便
                return True
            if current_arc_relation == Dependency.ADV.value and (
                    current_word not in self.__adv_dict_list.get("程度副词") +
                    self.__adv_dict_list.get("范围副词") or
                (current_arc_pos == Pos.p.value
                 and current_word in ["比"])):  # 比别家好
                return True
            if current_arc_relation == Dependency.ATT.value:
                return True
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get(
                    "语气助词") + self.__auxiliary_dict_list.get("结构助词"):
                return True
        elif parent_pos in [
                Pos.n.value, Pos.nd.value, Pos.nh.value, Pos.ni.value,
                Pos.nl.value, Pos.ns.value, Pos.nt.value, Pos.nz.value
        ]:
            if current_arc_relation == Dependency.ADV.value:
                return True
            if current_arc_relation == Dependency.ATT.value:  # 属性语义修饰名词
                return True
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get(
                    "语气助词") + self.__auxiliary_dict_list.get("结构助词"):  # 美丽的
                return True
        elif parent_pos == Pos.p.value:
            if current_arc_relation == Dependency.SBV.value:  # 他给我感觉
                return True
            if current_arc_relation == Dependency.VOB.value:  # 给我感觉
                return True
            if current_arc_relation == Dependency.POB.value:  # 比别家好
                return True
        elif parent_pos == Pos.d.value:
            if current_arc_relation == Dependency.SBV.value:
                return True
            if current_arc_relation == Dependency.VOB.value:  # 没有|d  4|过于|d  5|甜腻
                return True
        elif parent_pos in [Pos.i.value, Pos.r.value, Pos.q.value
                            ] or current_arc_relation == Dependency.CMP.value:
            return True
        return False

    def __parse_opinion(self, core_word_index, arcs, words, postags):
        """

        :param core_word_index:
        :param arcs:
        :param words:
        :param postags:
        :return: opinion_word_list
        """
        has_vob = False
        sbv_word = ()
        sbv_att_word_list = []
        available_word_idx_list = [core_word_index]
        opinion_word_list = []

        def word_root_index(core_word_idx, index):
            """
            查找词的root index
            :return:
            """
            arc = arcs[index]
            idx = index if arc.relation == Dependency.HED.value else arc.head - 1
            if idx == core_word_idx or idx == index:
                return idx
            else:
                return word_root_index(core_word_idx, idx)

        def do_parse_opinion(core_word_idx):
            """
            提取以动词为核心的观点,提取的主要结构主谓结构(SBV)、动宾结构(VOB)、状中结构(ADV)、动补结构(CMP)、介宾结构(POB)
            :return:
            """
            nonlocal has_vob
            nonlocal sbv_word
            nonlocal sbv_att_word_list
            nonlocal available_word_idx_list

            for m, arc in enumerate(arcs):
                # tuple格式:(index, 句法依存关系, 词性, 词)
                current_word_tuple = (m, arc.relation, postags[m], words[m])

                parent_word_index = arc.head - 1
                parent_word_tuple = (parent_word_index,
                                     arcs[parent_word_index].relation,
                                     postags[parent_word_index],
                                     words[parent_word_index])

                if arc.head == core_word_idx + 1 \
                        and (current_word_tuple[2] not in [Pos.wp.value, Pos.o.value, Pos.c.value, Pos.r.value, Pos.e.value] or (current_word_tuple[2] == Pos.r.value and current_word_tuple[3] not in self.__pronoun_list)) \
                        and self.__word_self_attention(parent_word_tuple[2], parent_word_tuple[3], current_word_tuple[1], current_word_tuple[2], current_word_tuple[3]):

                    # 计算词的root词是否等于关键词
                    root_core_index = word_root_index(core_word_index, m)
                    if root_core_index == core_word_index:
                        if arc.relation == Dependency.VOB.value or (
                                arc.relation == Dependency.CMP.value and
                                postags[current_word_tuple[0]] == Pos.a.value):
                            has_vob = True
                            available_word_idx_list.append(m)
                            opinion_word_list.append(current_word_tuple)
                        else:
                            if arc.head - 1 in available_word_idx_list:
                                available_word_idx_list.append(m)
                                # 若是主谓结构先暂存,不加入观点词list
                                if arc.relation == Dependency.SBV.value:
                                    if len(sbv_word) == 0:
                                        sbv_word = current_word_tuple
                                else:
                                    # 计算词的root词是否等于sbv关键词
                                    sbv_index = sbv_word[0] if len(
                                        sbv_word) > 0 else -1
                                    root_sbv_index = word_root_index(
                                        sbv_index, current_word_tuple[0])
                                    if root_sbv_index == sbv_index:
                                        # 若是主谓结构的其他属性词,暂存在主谓属性词列表
                                        sbv_att_word_list.append(
                                            current_word_tuple)
                                    else:
                                        opinion_word_list.append(
                                            current_word_tuple)
                    do_parse_opinion(m)

        do_parse_opinion(core_word_index)

        def need_sbv():
            """
            判断是否需要主谓结构
            :return:
            """
            # 三元组判断,只有包含了动宾结构才把主谓结构加入
            if has_vob:
                return True
            # An adjective core word can take its SBV subject directly
            if postags[core_word_index] == Pos.a.value:
                return True
            # An intransitive verb core word can also take its SBV subject directly
            if words[core_word_index] in self.__vi_list:
                return True
            return False

        if need_sbv() and len(sbv_word) > 0:
            opinion_word_list.append(sbv_word)
            opinion_word_list += sbv_att_word_list

        return opinion_word_list

    def extract_opinion(self,
                        comment,
                        distinct_opinion=True,
                        show_core_word=False,
                        show_detail=False):
        """
        抽取观点
        :param comment:
        :param distinct_opinion: 是否去重观点
        :param show_core_word: 是否展示观点核心词
        :param show_detail: 是否展示分词等详细信息
        :return:
        """
        subcomments = self.__smart_split_sentence(comment)
        opinion_list = []
        for subcomment in subcomments:
            words, sentence_with_space = self.sentence_segment_add_space(
                subcomment)
            opinions = self.__parse_segment(words, show_detail)
            if len(opinions) > 0:
                opinion_list += opinions
        if distinct_opinion:
            opinion_list = self.__distinct_opinion(opinion_list)
        if not show_core_word:
            opinion_list = [opinion[2] for opinion in opinion_list]
        return opinion_list

    @classmethod
    def __distinct_opinion(cls, opinions):
        """
        观点去重
        :param opinions:
        :return:
        """
        index = 2
        distinct_opinion_list = []
        for n in range(1, len(opinions)):
            for m in range(n, 0, -1):
                opi_1 = opinions[m][index]
                opi_2 = opinions[m - 1][index]
                if len(opi_1) > len(opi_2):
                    tmp = opinions[m - 1]
                    opinions[m - 1] = opinions[m]
                    opinions[m] = tmp

        for opinion in opinions:
            opi = opinion[index]
            if len(distinct_opinion_list) == 0:
                distinct_opinion_list.append(opinion)
            else:
                include = False
                for idx in range(0, len(distinct_opinion_list)):
                    try:
                        include |= distinct_opinion_list[idx][index].index(
                            opi) > -1
                    except ValueError:
                        pass
                if not include:
                    distinct_opinion_list.append(opinion)

        return distinct_opinion_list

    def __parse_segment(self, words, show_detail=False):
        postags = self.__postagger.postag(words)

        word_tag_tuple_list = []
        for i in range(len(words)):
            word_tag_tuple_list.append((str(i), words[i], postags[i]))
        arcs = self.__parser.parse(words, postags)

        # arcs 使用依存句法分析的结果
        labels = self.__labeller.label(words, postags, arcs)  # 语义角色标注

        if show_detail:
            logger.info("|".join(words))
            logger.info("  ".join('|'.join(tpl)
                                  for tpl in word_tag_tuple_list))
            logger.info("  ".join("%d|%d:%s" % (n, arc.head, arc.relation)
                                  for n, arc in enumerate(arcs)))
            for label in labels:
                logger.info(
                    str(label.index) + ":" + ",".join([
                        "%s:(%d,%d)" %
                        (arg.name, arg.range.start, arg.range.end)
                        for arg in label.arguments
                    ]))

        # opinions = self.__parse_main_opinion(arcs, words, postags)
        opinions = self.__parse_opinions(arcs, words, postags)
        return opinions

    def __parse_opinions(self, arcs, words, postags):
        """
        给出核心词性,解释所有该词性的短语观点
        :param arcs:
        :param words:
        :param postags:
        :return:
        """
        opinions = []
        for n, arc in enumerate(arcs):
            postag = postags[n]
            word = words[n]
            if postag in [Pos.v.value, Pos.a.value, Pos.i.value] or \
                    (postag == Pos.a.value and word not in self.__adv_list) or \
                    (arc.relation in [Dependency.HED.value, Dependency.COO.value] and postag not in [Pos.v.value, Pos.a.value, Pos.i.value, Pos.m.value, Pos.c.value]):
                opinion_word_list = self.__parse_opinion(
                    n, arcs, words, postags)
                if self.__check_opinion(postag, word, opinion_word_list):
                    opinion_str = self.__opinion_to_str(
                        n, words, opinion_word_list)
                    opinions.append((postag, words[n], opinion_str))

        return opinions

    def __parse_main_opinion(self, arcs, words, postags):
        """

        :param arcs:
        :param words:
        :param postags:
        :return:
        """
        for n, arc in enumerate(arcs):
            if arc.relation == Dependency.HED.value:
                core_index = n
        core_pos = postags[core_index]
        opinion_word_list = self.__parse_opinion(core_index, arcs, words,
                                                 postags)
        return core_pos, words[core_index], self.__opinion_to_str(
            core_index, words, opinion_word_list)

    @classmethod
    def __check_opinion(cls, core_word_pos, core_word, opinion_word_list):
        """
        检测opinion有效性
        :param core_word_pos:
        :param core_word:
        :param opinion_word_list:
        :return:
        """
        if len(opinion_word_list) > 0:
            return True
        if len(opinion_word_list) == 0 and core_word_pos not in [
                Pos.v.value, Pos.d.value
        ]:
            return True
        if len(opinion_word_list
               ) == 0 and core_word_pos == Pos.v.value and len(
                   core_word) > 1:  # 入口即化|v
            return True
        return False

    def __opinion_to_str(self, core_word_index, words, opinion_word_list):
        """
        输出观点字符串
        :param core_word_index:
        :param words:
        :param opinion_word_list:
        :return:
        """
        index_list = [core_word_index]
        if self.__remove_core_word(words[core_word_index]):
            index_list = []

        for opinion_word in opinion_word_list:
            index = opinion_word[0]
            index_list.append(index)
        index_list.sort()

        opinion = ""
        for index in index_list:
            opinion += words[index]

        return self.__remove_special_word(opinion)

    @classmethod
    def __remove_core_word(cls, word):
        if word == "是":
            return True
        return False

    def __remove_special_word(self, opinion):
        new_opinion = opinion
        for sp_word in self.__special_prefix_list:
            if opinion.rfind(sp_word) == 0:
                new_opinion = opinion[len(sp_word):]
                return self.__remove_special_word(new_opinion)
        return new_opinion
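
# --- Hedged usage sketch (not part of the original snippet) ---
# Shows how OpinionExtractor is intended to be driven end to end: one comment
# string in, a list of de-duplicated opinion phrases out. LTP_MODEL_DIR and
# DICTIONARY_DIR must already point at the pyltp models and dictionaries the
# constructor loads; the sample comment below is illustrative only.
if __name__ == "__main__":
    extractor = OpinionExtractor()
    try:
        opinions = extractor.extract_opinion("材料很新鲜,服务态度也不错,就是排队时间有点长")
        for opinion in opinions:
            print(opinion)
    finally:
        extractor.release()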
コード例 #28
0
class Semantic_Parser(object):
    def __init__(self):
        self.cws_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/cws.model'
        self.pos_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/pos.model'
        self.parser_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/parser.model'
        self.ner_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/ner.model'
        self.srl_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/srl/'

    def load(self):
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)

        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)

        self.parser = Parser()
        self.parser.load(self.parser_model_path)

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)

        self.labeller = SementicRoleLabeller()
        self.labeller.load(self.srl_model_path)

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_cws(self, sentence):
        try:
            cws = self.segmentor.segment(sentence)
        except Exception:
            # fall back for byte-string input (Python 2 style str)
            cws = self.segmentor.segment(sentence.decode('utf8'))
        print(" ".join(cws))
        return cws

    def get_pos(self, cws):
        postags = self.postagger.postag(cws)
        print(" ".join(postags))
        return postags

    def get_arcs(self, cws, postags):
        arcs = self.parser.parse(cws, postags)
        label = " ".join("%s:%d:%s" % (word, arc.head, arc.relation)
                         for word, arc in zip(cws, arcs))
        print(label)
        return arcs

    def get_role(self, cws, postags, arcs):
        netags = self.recognizer.recognize(cws, postags)
        roles = self.labeller.label(cws, postags, netags, arcs)
        for role in roles:
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))

    def get_query(self, cws, arcs):
        '''
        对问句做句法分析后,提取其中的主干部分
        先取HED,然后分别取SBV和VOB

        :param cws:
        :param arcs:
        :return:
        '''
        words = [word for word in cws]
        head = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]
        print(words)
        print(head)
        print(relation)
        # position (1-based) of the HED word, i.e. the word whose head is ROOT (0)
        hed_index = head.index(0) + 1
        # indices of the words directly governed by the HED word
        import_index = [i for i, h in enumerate(head) if h == hed_index]
        print(import_index)
        sbv = [words[i] for i in import_index if relation[i] == 'SBV']
        vob = [words[i] for i in import_index if relation[i] == 'VOB']
        print(''.join(sbv))
        print(''.join(vob))
        return ''.join(sbv), ''.join(vob)
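
# --- Hedged usage sketch (not part of the original snippet) ---
# Typical call order for Semantic_Parser: load() once, then feed each question
# through segmentation -> POS tagging -> dependency parsing before get_query()
# pulls out the SBV/VOB backbone. Model paths are hard-coded in __init__ above;
# the sample question is illustrative only.
if __name__ == "__main__":
    sp = Semantic_Parser()
    sp.load()
    try:
        cws = sp.get_cws("姚明的妻子是谁")
        postags = sp.get_pos(cws)
        arcs = sp.get_arcs(cws, postags)
        subject, obj = sp.get_query(cws, arcs)
        print(subject, obj)
    finally:
        sp.release()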
コード例 #29
0
        continue
    print(words[i])
print('\t'.join(postags))
postagger.release()  # 释放模型

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer()  # 初始化实例
recognizer.load('/home/curtank/Documents/ltp_data/ner.model')  # 加载模型
netags = recognizer.recognize(words, postags)  # 命名实体识别
print('\t'.join(netags))
recognizer.release()  # 释放模型

from pyltp import Parser
parser = Parser()
parser.load('/home/curtank/Documents/ltp_data/parser.model')
arcs = parser.parse(words, postags)  # 句法分析
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
parser.release()  # 释放模型

from pyltp import SementicRoleLabeller
labeller = SementicRoleLabeller()  # 初始化实例
labeller.load('/home/curtank/Documents/ltp_data/srl')  # 加载模型
roles = labeller.label(words, postags, netags, arcs)  # 语义角色标注
for role in roles:
    print(
        role.index, "  ".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
labeller.release()  # 释放模型
コード例 #30
0
class ltpTools():
    def __init__(self):

        #initialize every ltp tool
        LTP_DIR = "/home/demo1/support_ltp"

        #分词器
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        #词性标注
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        #依存句法分析
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        #命名实体识别
        #self.recognizer = NamedEntityRecognizer()
        #self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        #语义角色标注模块
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print('模型已全部加载')

    def __del__(self):

        self.segmentor.release()
        self.labeller.release()
        self.postagger.release()
        self.parser.release()
        print('模型已全部释放')

    def segANDpos(self, sen):
        '''
        分词加词性标注,同时返回词列表和词性列表,一一对应
        '''

        words = self.segmentor.segment(sen)

        postags = self.postagger.postag(words)

        return list(words), list(postags)

    '''语义角色标注'''

    def format_labelrole(self, words, postags):

        #依赖于词性的标注,做依存句法的分析
        #解释:
        #依存句法分析是基于词性标注的。
        arcs = self.parser.parse(words, postags)

        #根据依存句法的分析,标注语义角色
        roles = self.labeller.label(words, postags, arcs)

        #以字典储存,key为编号,value为列表
        #而且是嵌套字典,以arg.name作为key
        #这个字典的含义就是:每个角色的索引是一级key,二级字典以语义角色类型为key
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }

        print(roles_dict)
        return roles_dict

    '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典'''

    def build_parse_child_dict(self, words, postags, arcs):

        #其数据结构是:
        #这个list底下是一个个字典,每个字典的key是关系名称,每个字典的value是这个关系所对应的词语,这样就得到了父节点们所拥有的关系及有这种关系的孩子
        child_dict_list = []

        #这个list的意义就是展示每个词的依存关系
        format_parse_list = []

        #一级循环:对每个词分析
        for index in range(len(words)):

            #预设孩子字典
            child_dict = dict()

            #二级循环:查每个词的语义角色
            for arc_index in range(len(arcs)):

                #这里无非就是查一下我到底有没有成为谁的爸爸,如果有的话就登记一下
                if arcs[arc_index].head == index + 1:  #arcs的索引从1开始
                    child_dict.setdefault(arcs[arc_index].relation,
                                          []).append(arc_index)
            child_dict_list.append(child_dict)

        rely_id = [arc.head for arc in arcs]  # 提取依存父节点id
        relation = [arc.relation for arc in arcs]  # 提取依存关系
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # 匹配依存父节点词语
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''parser主函数'''

    def parser_main(self, sentence):
        '''显然这是一个类的主函数'''

        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
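
# --- Hedged usage sketch (not part of the original snippet) ---
# parser_main() bundles segmentation, POS tagging, dependency parsing and
# semantic role labelling in one call; its return tuple is usually unpacked as
# shown here. The sample sentence is illustrative only.
if __name__ == "__main__":
    tools = ltpTools()
    words, postags, child_dict_list, roles_dict, format_parse_list = tools.parser_main(
        "李克强总理今天来到山东济南考察。")
    print(words)
    print(postags)
    print(format_parse_list)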