Example #1
import os

from pyltp import Segmentor, Postagger, NamedEntityRecognizer, Parser


def segmentation(filename, output_filename):

    print("segmenting '%s' to '%s'" % (filename, output_filename))

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))

    # named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))

    # parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")

    for _line in lines:
        # strip trailing newline characters; skip empty lines
        line = _line.rstrip("\r\n")
        if not line:
            continue

        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
#            if netags[i] != 'O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()
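
For reference, a minimal driver for this helper might look like the sketch below; the file names are placeholders and not part of the original snippet.

if __name__ == "__main__":
    # hypothetical input/output paths; replace with real files
    segmentation("input.txt", "output.txt")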
Example #2
    def __init__(self, data):
        # LTP_DATA_DIR is assumed to be defined on the class and to point at the LTP model directory;
        # `data` is the raw text to be processed.
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # word segmentation model, file name `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named entity recognition model, file name `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)
        # print("|".join(words))
        segmentor.release()

        postagger = Postagger()  # initialize the instance
        postagger.load(self.pos_model_path)  # load the model
        self.postags = postagger.postag(self.words)  # POS tagging
        # print('\t'.join(postags))
        postagger.release()  # release the model

        recognizer = NamedEntityRecognizer()  # initialize the instance
        recognizer.load(self.ner_model_path)  # load the model
        self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
        # print('\t'.join(netags))
        recognizer.release()  # release the model


def mingming_shiti(words, postags):
    """Named entity recognition: organizations (Ni), person names (Nh), place names (Ns)."""
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
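
As elsewhere on this page, MODELDIR is assumed to point at the LTP data directory; a hypothetical call, reusing the toy tokens from Example #4, might be:

words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
mingming_shiti(words, postags)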
Example #4
postagger = Postagger()
# load the model (pos_model_path, ner_model_path and par_model_path are assumed
# to point at the corresponding files in the LTP data directory)
postagger.load(pos_model_path)
# segmented words
words = ['元芳', '你', '怎么', '看']
# POS tagging
postags = postagger.postag(words)
print('\t'.join(postags))
# release the model
postagger.release()

# 4. Named entity recognition
# initialize the instance
recognizer = NamedEntityRecognizer()
# load the model
recognizer.load(ner_model_path)
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
# named entity recognition
netags = recognizer.recognize(words, postags)
print('\t'.join(netags))
# release the model
recognizer.release()

# 5. Dependency parsing
# initialize the instance
parser = Parser()
# load the model
parser.load(par_model_path)
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
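
The snippet breaks off just before the parse call; a minimal continuation, using the same arc fields (head, relation) that appear in the later examples on this page, might be:

arcs = parser.parse(words, postags)
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
# release the model
parser.release()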
Example #5
comma_punc = re.compile(r"[,:: ]")
period_punc = re.compile(r"[。?!;?!;]")
del_punc = re.compile(r"[‘’“” ]")
sub_punc = re.compile(r"[,]")

### load models
segmentor = Segmentor()
segmentor.load(model_dir + "cws.model")
postagger = Postagger()
postagger.load(model_dir + "pos.model")
recognizer = NamedEntityRecognizer()
recognizer.load(model_dir + "ner.model")
parser = Parser()
parser.load(model_dir + "parser.model")
labeller = SementicRoleLabeller()
labeller.load(model_dir + "srl")


def parse(sent):
    # this function detects the structure of a sentence
    if len(sent) < 12:
        return "el"
    if len(sent) > 60:
        return "el"
    words = segmentor.segment(sent.strip())  # word segmentation
    postags = postagger.postag(words)  # POS tagging
    netags = recognizer.recognize(words, postags)  # entity recognition
Example #6
class Char_Feature_Pipeline():
    def __init__(self, char_voca, freq_dict, pyltp_path):
        self.char_voca = char_voca
        self.freq_dict = freq_dict
        self.pyltp_path = pyltp_path
        PYLTP_PATH = self.pyltp_path
        self.segmentor = Segmentor()
        self.segmentor.load(PYLTP_PATH + '/cws.model')
        self.postagger = Postagger()  # initialize the instance
        self.postagger.load(PYLTP_PATH + '/pos.model')  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the instance
        self.recognizer.load(PYLTP_PATH + '/ner.model')  # load the model

    def pipeline(self, raw_iter, label_bool=False):
        for Id, sample in enumerate(raw_iter):
            dct = self.gene_pyltp_feature(
                sample, Id)  # use pyltp to get new seg, pos and ner
            dct = self.replace_raw_text(dct)
            dct = self.gene_ner_feature(dct)  # generate the NER feature
            if label_bool:
                dct = self.gene_ner_label(dct)  # gene_ner_label
                dct = self.gene_ner_weight(dct)  # gene_label_weight
            yield dct

    def release_pyltp_model(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()

    @add_to_input
    def gene_pyltp_feature(self, sample, Id):
        words = self.segmentor.segment(sample['text'])
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        res = [{
            'word': word,
            'postag': postag,
            'netag': netag
        } for word, postag, netag in zip(words, postags, netags)]
        res = {'_id': sample.get('_id', Id), 'pyltp_tags': res}
        return res

    @add_to_input
    def replace_raw_text(self, sample):
        def mask_books(text):
            parts = []
            last_tail = 0
            for match in re.finditer('《[^》]*》', text):
                parts.append(text[last_tail:match.span()[0]])
                last_tail = match.span()[1]
                parts.append(''.join(
                    ['《', 'X' * (match.span()[1] - match.span()[0] - 2), '》']))
            parts.append(text[last_tail:])
            return ''.join(parts)

        text = sample['text']
        new_text = re.sub('[A-Z]', 'B', text)  # map every uppercase letter to B
        new_text = re.sub('[a-z]', 'b', new_text)  # map every lowercase letter to b
        new_text = re.sub('[0-9]', 'x', new_text)  # map every digit to x
        new_text = mask_books(new_text)  # mask everything inside Chinese book-title marks 《》 with X
        res = {'raw_text': text, 'text': new_text, '_id': sample['_id']}
        return res

    @add_to_input
    def gene_ner_feature(self, sample):
        char_index = [
            self.char_voca.loadWord2idAndId2Word(char)
            for char in sample['text']
        ]
        char_pos, char_bmes = postag2char_pos_bmes(sample['postag'])

        ltp_other, ltp_char_bmes = postag2char_pos_bmes(
            sample['pyltp_tags'],
            word_key='word',
            other_keys=['postag', 'netag'],
            check_length=True,
            text=sample['text'])

        pos_index = [POS_VOCA[pos] for pos in char_pos]
        bmes_index = [BMES_VOCA[k] for k in char_bmes]
        char_freq = [self.freq_dict[s] for s in sample['text']]

        ltp_bmes_index = [BMES_VOCA[k] for k in ltp_char_bmes]
        ltp_pos_index = [POS_VOCA[pos] for pos in ltp_other[0]]
        ltp_ner_index = [NER_VOCA[k] for k in ltp_other[1]]

        if not len(char_pos) == len(char_index):
            pos_index = ltp_pos_index
            bmes_index = ltp_bmes_index

        assert len(char_index) == len(pos_index)

        res = {
            '_id': sample['_id'],
            'char_index': char_index,
            'char_size': len(sample['text']),
            'pos_index': pos_index,
            'bmes_index': bmes_index,
            'char_freq': char_freq,
            'ltp_pos_index': ltp_pos_index,
            'ltp_bmes_index': ltp_bmes_index,
            'ltp_ner_index': ltp_ner_index
        }
        return res

    @add_to_input
    def gene_ner_label(self, sample):
        text = sample['text']
        char_length = len(text)
        subjects = set([spo['subject'] for spo in sample['spo_list']])
        objects = set([spo['object'] for spo in sample['spo_list']])
        locates = np.zeros(char_length, dtype=int)
        for bject in subjects:
            for span in my_finditer(bject, text):
                locates[span[0]:span[1]] = 1
        sub_locates = locates.tolist()
        locates = np.zeros(char_length, dtype=int)
        for bject in objects:
            for span in my_finditer(bject, text):
                locates[span[0]:span[1]] = 1
        ob_locates = locates.tolist()
        res = {
            '_id': sample['_id'],
            'sub_label': sub_locates,
            'ob_label': ob_locates
        }
        return res

    @add_to_input
    def gene_ner_weight(self, sample):
        sub_weight = calculate_weight(sample['sub_label'])
        ob_weight = calculate_weight(sample['ob_label'])
        res = {
            '_id': sample['_id'],
            'sub_weight': sub_weight,
            'ob_weight': ob_weight
        }
        return res
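
A minimal, hypothetical driver for this pipeline is sketched below; char_voca, freq_dict and the raw sample dictionaries are assumptions about the surrounding project, not part of the snippet.

# hypothetical usage sketch: char_voca / freq_dict come from the surrounding project,
# and each raw sample is a dict carrying at least the 'text' and 'postag' fields used above
pipe = Char_Feature_Pipeline(char_voca, freq_dict, './ltp_data')
features = list(pipe.pipeline(raw_samples, label_bool=False))
pipe.release_pyltp_model()  # release the pyltp models when finished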
Example #7
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP model file directory
    location_entity = [
        "中和殿", "太庙", "人文地理", "亚运村", "九龙壁", "圆明园", "古典建筑", "庑殿顶", "天井", "无量殿",
        "慈宁宫", "三希堂", "居庸关", "延寿寺", "排云殿", "东桥", "圜丘", "南天门", "垂花门", "西六宫",
        "配楼", "柳荫街", "中国四大名园", "午门", "乾东五所", "建筑管理", "世界博物馆", "西什库教堂", "晚清",
        "万泉河", "东暖阁", "储秀宫", "西华门", "院落", "地安门东大街", "御路", "知鱼桥", "清宁宫", "金水河",
        "景山前街", "司马台长城", "景山公园", "乐寿堂", "东六宫", "延陵", "宜芸馆", "芍药居", "承乾宫",
        "琉璃瓦", "湘江", "敌楼", "安定门外大街", "三音石", "崇文门", "天坛路", "台基", "东城区", "外朝",
        "武备", "全国重点文物保护单位", "房山石", "静园", "香山", "中东", "坤宁宫", "彩画", "江南园林",
        "北河沿大街", "岳阳楼", "丽景轩", "巴黎圣母院", "钟表馆", "戏楼", "白银", "红海", "中原", "明长城",
        "神乐署", "瀛洲", "码头", "百度地图", "旋子彩画", "乾西五所", "天圆地方", "琉璃厂文化街", "广岛",
        "御沟", "井亭", "古柏林", "石坊", "北京故宫", "宝云阁", "甬道", "熙和门", "乾清门", "北京城",
        "暖温带", "沥粉贴金", "安定路", "北齐长城", "减柱造", "宅园", "清华园", "天坛东门站", "西苑", "土山",
        "温带季风气候", "宫古", "东直门", "美国国务卿", "北海", "中华梦石城", "东门站", "天坛公园", "江山",
        "谐趣园", "修宅", "苏堤", "玉泉", "牌坊", "蓟镇", "高速公路", "钟粹宫", "无梁殿", "政治家", "牌楼",
        "波斯", "西内", "老龙头", "阴阳石", "三神山", "丹陛桥", "中国第一历史档案馆", "建筑艺术", "四川",
        "护城河", "文华殿", "静宜园", "乐峰", "永和宫", "金砖", "清漪园", "安定门", "宫殿", "梵华楼",
        "龙井", "水街", "东华门", "歇山式顶", "斋宫", "渤海镇", "仁和", "白浮村", "建筑风格", "买卖街",
        "藻鉴堂", "寿安宫", "奉先殿", "后海", "宋", "承德避暑山庄", "前门站", "寿安山", "八达岭", "棂星门",
        "经幢", "泰山", "后三宫", "天桥商场", "维新派", "拙政园", "北京十六景", "南湖岛", "山寨", "东海",
        "寺庙", "图书馆", "西山", "延禧宫", "九土", "十七孔桥", "鹊桥", "石鼓", "样式雷", "礼乐", "圆石",
        "动物园", "西湖", "齐长城遗址", "京畿", "正脊", "神武门", "洛神赋图", "绿地面积", "暖阁", "多宝塔",
        "磨砖对缝", "湖心亭", "崇楼", "五谷丰登", "养性殿", "关山", "砖雕", "北境", "凤凰墩", "金殿",
        "永定路", "世界遗产", "古柏", "郡王府", "慕田峪", "皇舆全览图", "古典园林", "坐北朝南", "皇极殿",
        "皇家园林", "东四十条", "京西", "黄花镇", "通惠河", "宁寿宫", "旅游局", "大角楼", "昆明湖", "后溪",
        "东堤", "汉白玉石", "皇史宬", "湖心岛", "长春宫", "玉澜堂", "紫檀", "玉泉山", "玉山", "茶楼",
        "敌台", "乾清宫", "巴县", "藕香榭", "斗拱", "苏州街", "紫禁城", "颐和轩", "皇穹宇", "南方",
        "智慧海", "八小部洲", "拱券", "门楣", "太和殿", "銮仪卫", "法门寺地宫", "清音阁", "龙王庙", "城岛",
        "皇陵", "筒瓦", "天地坛", "张古", "建筑史", "武英殿", "北长街", "天坛", "云山", "大石桥", "北平",
        "宫殿建筑", "山东", "博物馆", "昆明池", "交道口南大街", "平流村", "聊城", "三大殿", "清晏舫", "墀头",
        "养心殿", "御道", "百花园", "翊坤宫", "神道", "落地罩", "渔村", "丹陛", "歇山顶", "畅音阁",
        "漱芳斋", "黄鹤楼", "柱础", "嘉乐堂", "庆长", "档案", "保定", "上海", "佛香阁", "望柱", "德和园",
        "天桥", "北京旅游网", "祈年殿", "颐和园", "攒尖顶", "香岩宗印之阁", "分界线", "大杂院", "交泰殿",
        "太和门", "南郊", "健翔桥", "瓮山", "勤政殿", "云南", "景仁宫", "小山村", "金水桥", "保和殿",
        "寄畅园", "珍妃井", "德和园大戏楼", "正房", "第一批全国重点文物保护单位", "三合院", "万寿山", "厉家菜",
        "玉峰塔", "藻井", "恭王府花园", "文昌阁", "景山", "前门东大街", "端门", "代王府", "万寿亭", "景阳宫",
        "东四环", "景明楼", "祈谷坛", "大戏楼", "安佑宫", "石舫", "流杯亭", "行宫", "法华寺", "圜丘坛",
        "正义路", "居庸关长城", "箭扣长城", "石牌坊", "回音壁", "和玺彩画", "二龙戏珠", "北四环", "玉龙",
        "广州", "盛京", "四合院", "曲尺", "谷仓", "永定门", "宝顶", "苏式彩画", "皇宫", "寿康宫"
    ]

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # load the LTP models
        self.segmentor = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag = self.segmentor.load_with_lexicon(
            os.path.join(self.default_model_dir, 'cws.model'), user_dict)
        # self.segmentor2 = Segmentor()
        # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model'))
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag:  # this check may be unreliable
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        words = self.segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # 词性标注
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release() #释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns
            *:sentenceUnit 句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        关闭与释放
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def splitSentence(self, text):
        pattern = r'。|!|?|;|='
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        #    print(result_list)
        return result_list

    def splitSentenceByComma(self, text):
        pattern = r','
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        final_list = []
        for sentence in result_list:
            if len(sentence) <= 40:
                final_list.append(sentence)
        return final_list

    def not_empty(self, s):
        return s and "".join(s.split())

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        allTripes = []
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        location_position_list = [
            '主席', '总统', '总理', '主任', '内', '东门', '西门', '南门', '北门', '大门', '外',
            '国家主席', '尚书'
        ]
        if self.dsfnConstraints3(sentence, item1, item2) and (
                item1.dependency == "ATT" and item1.head_word.postag != 'v'
                and item1.head_word.postag != 'a'):
            AttWord = item1.head_word
            AttWordDict = dict()
            AttWordStr = ""
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                # AttWordStr += AttWord.lemma
                if (AttWord.dependency == "ATT"
                        and AttWord.head_word.postag != 'v'
                        and AttWord.head_word.postag != 'a'):
                    AttWord = AttWord.head_word
                else:
                    break

            if (AttWord.ID == item2.ID):
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList and (
                                item.dependency == "ATT" and item.postag != 'v'
                                and item.postag != 'a'):
                            AttWordDict[item.ID] = item.lemma
                    if len1 == len(AttWordDict):
                        flag = False
                    else:
                        flag = True
                AttWordDict = sorted(AttWordDict.items(),
                                     key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")")
                if AttWordStr in location_position_list:
                    allTripes.append([item1.lemma, AttWordStr, item2.lemma])
        """
        考虑DSFN2的情况
        """
        if item1.dependency == "SBV" and item1.head_word.postag == "v":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma

            if item2.dependency == "VOB" and item2.head_word.postag == "v":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if (len(predDict) == 1):
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                    """
                    新加,为了考虑“习近平视察和访问上海”的情况
                    """
                if len(predDict) == 2:
                    num = self.get_entity_num_between(pred1, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if (word.dependency == "VOB" and word.head_word.ID == pred1.ID)  or (word.dependency == "POB" \
                                and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred1.ID):
                            flagVOB = False
                    flagCMP = True
                    if pred1 != None and pred1.dependency == "CMP" and pred1.head_word.ID == pred2.ID:
                        flagCMP = False
                    if pred2 != None and pred2.dependency == "CMP" and pred2.head_word.ID == pred1.ID:
                        flagCMP = False
                    flagCOO = True
                    if pred1 != None and pred1.dependency == "COO" and pred1.head_word.ID == pred2.ID:
                        flagCOO = False
                    if pred2 != None and pred2.dependency == "COO" and pred2.head_word.ID == pred1.ID:
                        flagCOO = False

                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        if flagCMP == False:
                            if flagVOB == True and flagSBV == True:
                                allTripes.append([
                                    item1.lemma,
                                    pred1.lemma + "" + pred2.lemma, item2.lemma
                                ])
                        if flagCOO == False:
                            if flagVOB == True and flagSBV == True:
                                allTripes.append([
                                    item1.lemma,
                                    pred1.lemma + "" + pred2.lemma, item2.lemma
                                ])
                        else:
                            if flagVOB == True:
                                allTripes.append(
                                    [item1.lemma, pred1.lemma, item2.lemma])
                            if flagSBV == True:
                                allTripes.append(
                                    [item1.lemma, pred2.lemma, item2.lemma])
        """
        DSFN3.0
        """
        pred = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":  # handle the passive preposition "被", e.g. "小王被小明所陷害"
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    pred2 = None
                    object = None
                    objectForPred2 = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)

                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID in objectDict:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            allTripes.append([
                                item1.lemma, pred.lemma + "" + objectStr,
                                item2.lemma
                            ])

                    if object == None:
                        hasPOB = False
                        for i in range(pred.ID + 1, len(sentence.words) + 1):
                            item = sentence.get_word_by_id(i)
                            if item.dependency == "POB" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID == pred.ID:
                                hasPOB = True
                                allTripes.append([
                                    item1.lemma, pred.lemma + "" +
                                    item.head_word.lemma + "" + item.lemma,
                                    item2.lemma
                                ])
                        # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                        if hasPOB == False:
                            allTripes.append(
                                [item1.lemma, pred.lemma, item2.lemma])
        """
        DSFN4
        """
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([
                        item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma
                    ])
                else:
                    num = self.get_entity_num_between(pred, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if (word.dependency == "VOB" and word.head_word.ID == pred.ID) or (word.dependency == "POB" \
                                and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred.ID):
                            flagVOB = False
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        flag = True
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1 != None:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred.lemma + "" + prep1.lemma,
                                    item2.lemma
                                ])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            if flagSBV == True:
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
                        else:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                                allTripes.append(
                                    [item1.lemma, pred.lemma, item2.lemma])
                            if flagSBV == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
        """
        DSFN5
        """
        # self.dsfn5and6(rawSentence,sentence,item1,item2)
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """
        获得两个动词之间的实体数量
        Parameters
        ----------
        entity1 : WordUnit,动词1
        entity2 : WordUnit,动词2
        Returns:
            num:int,两动词间的实体数量
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """判断词单元是否是实体
        Args:
            entry:WordUnit,词单元
        Returns:
            *:bool,实体(True),非实体(False)
        """
        #候选实体词性列表
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'm']
        # print(entry.lemma+" : "+entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False

    def dsfnAttCOO(self, sentence, item1, item2):
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word

        allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))
    def dsfn6COO(self, sentence, item1, item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[
                            2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[
                            0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStart(self, rawSentence, entity1, entity2, all_entity):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = dsfn.segment(rawSentence)
        words = dsfn.postag(lemmas)
        words_netag = dsfn.netag(words)
        sentence = dsfn.parse(words_netag)
        # print(sentence.to_string())
        Rawitem1 = None
        Rawitem2 = None
        item1 = None
        item2 = None
        Rawitem1Index = -1
        Rawitem2Index = -1
        indexList = [-1, -1]
        for item in sentence.words:
            if (item.lemma == entity1):
                Rawitem1 = item
            if (item.lemma == entity2):
                Rawitem2 = item
            if Rawitem1 != None and Rawitem2 != None and (
                    Rawitem1.ID != Rawitem1Index
                    or Rawitem2.ID != Rawitem2Index):
                Rawitem1Index = Rawitem1.ID
                Rawitem2Index = Rawitem2.ID
                # if item1 == None or item2 == None:
                #     return None
                item1 = Rawitem1
                item2 = Rawitem2
                if item1.ID > item2.ID:
                    c = item1
                    item1 = item2
                    item2 = c
                # print(str(item1.ID) + "   " + str(item2.ID))
                itemCopy1 = item1
                itemCopy2 = item2
                # print(item1.lemma)
                # print(item2.lemma)
                # print(self.dsfnConstraints2(sentence,item1,item2,all_entity))
                if self.dsfnConstraints2(sentence, item1, item2,
                                         all_entity) == False:

                    continue
                allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                # print("111"+item2.lemma)
                # print(allTripes)
                if allTripes == None or (allTripes != None
                                         and len(allTripes) == 0):
                    # print("我要走ATT的部分了")
                    while item1.dependency == "ATT":
                        item1 = item1.head_word
                    while item2.dependency == "ATT":
                        item2 = item2.head_word
                    allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                    if len(allTripes) != 0:
                        for tripe in allTripes:
                            if tripe[1] != "":
                                if tripe[0] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[
                                            0] = item1.lemma + "" + itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[
                                            0] = itemCopy1.lemma + "" + item1.lemma
                                    else:
                                        tripe[0] = itemCopy1.lemma

                                elif tripe[2] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[
                                            2] = item1.lemma + "" + itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[
                                            2] = itemCopy1.lemma + "" + item1.lemma
                                    else:
                                        tripe[2] = itemCopy1.lemma
                                    # tripe[2] = itemCopy1.lemma

                                if tripe[0] == item2.lemma:
                                    if item2.ID < itemCopy2.ID:
                                        tripe[
                                            0] = item2.lemma + "" + itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[
                                            0] = itemCopy2.lemma + "" + item2.lemma
                                    else:
                                        tripe[0] = itemCopy2.lemma
                                elif tripe[2] == item2.lemma:
                                    # print(item2.lemma)
                                    if item2.ID < itemCopy2.ID:
                                        tripe[
                                            2] = item2.lemma + "" + itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[
                                            2] = itemCopy2.lemma + "" + item2.lemma
                                    else:
                                        tripe[2] = itemCopy2.lemma
                                # print("12345")
                                resultList.append(tripe)
                else:
                    for tripe in allTripes:
                        if tripe[1] != "":
                            resultList.append(tripe)
                    # if len(resultList) > 0:
                    #     return np.array(set([tuple(t) for t in resultList]))
        if item1 == None or item2 == None:
            return None
        if len(resultList) > 0:
            # return np.array(set([tuple(t) for t in resultList]))
            # print("输出结果1"+str(resultList))
            return resultList

    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        """
        来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV]
        """
        # print(item1.lemma)
        # print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            # print("11111111")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                # print("2222222")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    # print("3333333")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     print("44444444444")
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("第一次"+str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # print("第二次")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo == None or (
                                subForCoo != None and subForCoo.ID
                                == word.ID):  # handle verb COO: the coordinated verb must have no subject of its own,
                            # e.g. "习近平主席视察厦门,李克强总理访问香港"
                            word.head_word = item
                            # print(sentence.to_string())
                            # print(item1.lemma)
                            # print(item2.lemma)
                            allTripes = self.dsfn1_2_3_4COO(
                                sentence, item1, item2)
                            if len(allTripes) == 0:
                                # print("11111111")
                                allTripes = self.dsfn5COO(
                                    sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    # print("2222222")
                                    allTripes = self.dsfn6COO(
                                        sentence, item1, item2)
                                    if allTripes == None or len(
                                            allTripes) == 0:
                                        # print("3333333")
                                        allTripes = self.dsfn5and6COO(
                                            sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("第二次"+str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        # print(np.array(set([tuple(t) for t in resultList])))
        return resultList

    def dsfnConstraints1(self, rawSentence, maxLength):
        """
        :param rawSentence: 原句子
        :param maxLength: 句子的最大长度
        :return: 小于maxLength的长度
        """
        newSentence = []

        if len(rawSentence) <= maxLength:
            newSentence.append(rawSentence)
            return newSentence
        else:
            newSentence = self.splitSentenceByComma(rawSentence)
            return newSentence

    def dsfnConstraints2(self, sentence, item1, item2, allEntities):
        countEntity = 0
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
            if word.lemma in allEntities:
                countEntity += 1
        # print(countEntity)
        # print(countChar)
        if countEntity > 3:
            return False
        elif countChar > 12:
            # print(countChar)
            return False
        else:
            return True

    def dsfnConstraints3(self, sentence, item1, item2):
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
        if countChar > 5:
            return False
        else:
            return True

    def getSPO(self, sentence):
        all_result = []
        raw_sentence = []
        RawSentence = sentence
        lemmas = self.segment(sentence)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        # print(sentence.to_string())
        for itemWord in sentence.words:
            # find a verb that is either the HED of the sentence or in a COO dependency with the HED
            if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and
                                                                  itemWord.dependency == "COO" and itemWord.head_word.head_word == None)\
                     or (itemWord.postag == "v") :
                relation_verb = itemWord  # take this verb as the relation verb
                relationString = relation_verb.lemma
                # print(relationString)
                if itemWord.head_word == None:
                    # print("1")
                    verbId = itemWord.ID  # ID of the relation verb
                    verbId2 = None
                elif itemWord.head_word.head_word == None:
                    # print("2")

                    verbId = itemWord.ID  # ID of the relation verb
                    if itemWord.dependency == "COO" or self.get_entity_num_between(
                            itemWord, itemWord.head_word, sentence) == 0:
                        verbId2 = itemWord.head_word.ID  # the HED of the sentence, used to find the subject
                    else:
                        verbId2 = None
                else:
                    # print("3")
                    verbId = itemWord.ID  # ID of the relation verb
                    verbId2 = None
                O_dict = dict()  # stores all objects
                S_dict = dict()  # stores all subjects
                verb_dict = dict()  # stores all verbs, mainly for cases like "习近平主席在北京大学发表演讲"
                OBJ = None
                SUB = None
                DSFN3 = dict()
                for item in sentence.words:
                    if item.dependency == "SBV" and item.head_word.ID == verbId:  #寻找这个动词的主语
                        # if SUB == None or SUB.lemma != entity:
                        SUB = item  #找到主语
                        S_dict[SUB.ID] = SUB.lemma  #将主语加入到字典中

                    if (item.dependency == "VOB"
                            and item.head_word.ID == verbId
                            and item.postag != "v"):
                        # find the object of this verb: a direct object, or a prepositional object (the object depends via POB ----> preposition (POS p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma
                        verb_dict[OBJ.ID] = relationString
                    if (item.dependency == "POB"
                            and item.head_word.postag == "p"
                            and item.head_word.dependency == "CMP"
                            and item.head_word.head_word.ID == verbId):
                        # find the object of this verb: a direct object, or a prepositional object (the object depends via POB ----> preposition (POS p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma + "" + item.head_word.lemma
                        verb_dict[OBJ.ID] = relationString

                    if (item.dependency == "POB" and (item.head_word.postag == "p" or item.head_word.postag == 'd')\
                            and item.head_word.dependency == "ADV" and item.head_word.head_word.ID == verbId \
                            and item.postag!='v'):
                        # find the object of this verb: a direct object, or a prepositional object (the object depends via POB ----> preposition (POS p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        verbObj = None
                        DSFN3[OBJ.ID] = True
                        objectDict = dict()
                        relationString = relation_verb.lemma
                        for eachWord in sentence.words:
                            if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID:
                                # relationString = relation_verb.lemma + "" + eachWord.lemma
                                verbObj = eachWord
                                objectDict[verbObj.ID] = verbObj
                        if verbObj != None:
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == verbObj.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            relationString = relation_verb.lemma + "" + objectStr

                        else:
                            for eachWord in sentence.words:
                                if eachWord.dependency == "POB" and eachWord.head_word.dependency == "CMP" and\
                                    eachWord.head_word.head_word.ID == relation_verb.ID:
                                    relationString = relation_verb.lemma + "" + eachWord.head_word.lemma + "" + eachWord.lemma

                        verb_dict[OBJ.ID] = relationString

                if SUB == None:  # if no subject was found, look for the subject of verbId2, the verb coordinated with this one
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId2:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma
                # print(verbId2)
                if OBJ == None:
                    verb_coo = None
                    for item in sentence.words:
                        if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId:
                            verb_coo = item
                            break
                    flag = True
                    if verb_coo != None and self.get_entity_num_between(
                            relation_verb, verb_coo, sentence) == 0:

                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID:
                                flag = False
                        if flag != False:
                            for item in sentence.words:
                                if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\
                                        or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\
                        and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID):

                                    OBJ = item
                                    O_dict[OBJ.ID] = OBJ.lemma
                # print(S_dict)
                # print(verb_dict)
                # print(O_dict)
                SUB_COO = None
                OBJ_COO = None
                for item in sentence.words:
                    if item.head_word != None:
                        if SUB != None and item.dependency == "COO" and item.head_word.ID in S_dict:  # get the COO of the subject
                            SUB_COO = item
                            S_dict[SUB_COO.ID] = SUB_COO.lemma
                    if item.head_word != None and OBJ != None:
                        if item.dependency == "COO" and item.head_word.ID in O_dict:  #获得宾语的COO
                            OBJ_COO = item
                            O_dict[OBJ_COO.ID] = OBJ_COO.lemma
                S_new = []

                for sub in S_dict:
                    # if sentence.get_word_by_id(sub).postag == 'r':
                    #     continue
                    S_dict2 = dict()  # collects the ATT modifiers of the subject
                    S_dict2[sub] = S_dict[sub]
                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        for item in sentence.words:
                            if item.head_word != None:
                                SUBList = S_dict2.keys()
                                if item.head_word.ID in SUBList and (
                                        item.dependency == "ATT"
                                        or item.dependency == "ADV"):
                                    SUBATT = item
                                    S_dict2[SUBATT.ID] = SUBATT.lemma

                            if len(S_dict2) != len1:
                                flag = True
                            else:
                                flag = False
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                    Subject = ""
                    for i in S_dict2:
                        Subject += i[1]
                    S_new.append(Subject)

                O_new = []
                V_new = []
                for obj in O_dict:
                    # if sentence.get_word_by_id(obj).postag == 'r':
                    #     continue
                    O_dict2 = dict()  # collects the ATT modifiers of the object
                    O_dict2[obj] = O_dict[obj]
                    if verb_dict != None:
                        if obj in verb_dict:
                            relationString2 = verb_dict[obj]
                        else:
                            relationString2 = relation_verb.lemma
                    else:
                        relationString2 = relation_verb.lemma
                    V_new.append(relationString2)
                    flag = True
                    while flag == True:
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if item.head_word != None:
                                OBJList = O_dict2.keys()
                                if item.head_word.ID in OBJList and (
                                        item.dependency == "ADV"
                                        or item.dependency == "ATT"
                                        or item.dependency == "VOB" or
                                    (item.dependency == "COO"
                                     and item.head_word.ID != obj)):
                                    if item.dependency == "ATT" and item.postag == "v":
                                        if self.get_entity_num_between(
                                                item,
                                                sentence.get_word_by_id(obj),
                                                sentence) > 0:
                                            continue
                                        else:
                                            OBJATT = item
                                            O_dict2[OBJATT.ID] = OBJATT.lemma
                                    else:
                                        OBJATT = item
                                        O_dict2[OBJATT.ID] = OBJATT.lemma
                                        # print(OBJATT.lemma)

                            if len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False  # loop until no new modifiers are found
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                    Object = ""
                    for i in O_dict2:
                        Object += i[1]
                    flag = False
                    # if obj in DSFN3:
                    #     for location in self.location_entity:
                    #         if location in Object :
                    #             flag = True
                    #     if flag == True:
                    #         O_new.append(Object)
                    #     if flag == False:
                    #         O_new.append("")
                    # else:
                    O_new.append(Object)
                # print(O_dict)
                # print(O_new)

                for sub in S_new:
                    for i in range(0, len(O_new)):
                        obj = O_new[i]
                        relationWord = V_new[i]
                        if obj != "":
                            # print(RawSentence)
                            # print((sub, relationWord, obj))
                            all_result.append([sub, relationWord, obj])
                            raw_sentence.append(RawSentence)

        return all_result, raw_sentence

    def hasEntity(self, word, allEntity):
        for entity in allEntity:
            if entity in word:
                # print(entity)
                return True
        return False

    def PostProcessSPO(self, rawSentence, allTripes, allEntity):
        output_list = []
        for i in range(0, len(allTripes)):
            tripe = allTripes[i]
            sub = tripe[0]
            obj = tripe[2]
            # print(sub)
            # print(obj)
            if self.hasEntity(sub, allEntity) and self.hasEntity(
                    obj, allEntity):
                output_list.append(tripe)
        return output_list
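
A minimal, hypothetical driver for this class is sketched below; it assumes the LTP model directory and the helper modules (WordUnit, SentenceUnit, EntityCombine, entity_verb_new) are available, and reuses the example sentence mentioned in the comments above.

dsfn = DSFN()
triples, sentences = dsfn.getSPO("习近平主席在北京大学发表演讲")
for triple, sent in zip(triples, sentences):
    print(triple, sent)
dsfn.close()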
Example #8
class LtpParser():
    def __init__(self):
        LTP_DIR = "./ltp_data"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

    '''basic LTP operations'''
    def basic_parser(self, words):
        postags = list(self.postagger.postag(words))
        netags = self.recognizer.recognize(words, postags)
        return postags, netags

    '''get POS tags with LTP'''
    def get_postag(self, words):
        return list(self.postagger.postag(words))

    '''organize and output the entity lists based on the NER result'''
    def format_entity(self, words, netags, postags):
        name_entity_dist = {}
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        index = 0
        for item in zip(words, netags):
            word = item[0]
            ntag = item[1]
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word+'_%s ' % index)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word+'_%s ' % index)
                    else:
                        place_entity_list.append(word + '_%s ' % index)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""
            index += 1
        name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words, postags, 'nh')
        name_entity_dist['nis'] = self.modify_entity(organization_entity_list, words, postags, 'ni')
        name_entity_dist['nss'] = self.modify_entity(place_entity_list,words, postags, 'ns')
        return name_entity_dist

    '''entity修正,为rebuild_wordspostags做准备'''
    def modify_entity(self, entity_list, words, postags, tag):
        entity_modify = []
        if entity_list:
            for entity in entity_list:
                entity_dict = {}
                subs = entity.split(' ')[:-1]
                start_index = subs[0].split('_')[1]
                end_index = subs[-1].split('_')[1]
                entity_dict['start_index'] = start_index
                entity_dict['end_index'] = end_index
                if start_index == entity_dict['end_index']:
                    consist = [words[int(start_index)] + '/' + postags[int(start_index)]]
                else:
                    consist = [words[index] + '/' + postags[index] for index in range(int(start_index), int(end_index)+1)]
                entity_dict['consist'] = consist
                entity_dict['name'] = ''.join(tmp.split('_')[0] for tmp in subs) + '/' + tag
                entity_modify.append(entity_dict)
        return entity_modify

    '''基于命名实体识别,修正words,postags'''
    def rebuild_wordspostags(self, name_entity_dist, words, postags):
        pre = ' '.join([item[0] + '/' + item[1] for item in zip(words, postags)])
        post = pre
        for et, infos in name_entity_dist.items():
            if infos:
                for info in infos:
                    post = post.replace(' '.join(info['consist']), info['name'])
        post = [word for word in post.split(' ') if len(word.split('/')) == 2 and word.split('/')[0]]
        words = [tmp.split('/')[0] for tmp in post]
        postags = [tmp.split('/')[1] for tmp in post]

        return words, postags

    '''依存关系格式化'''
    def syntax_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        words = ['Root'] + words
        postags = ['w'] + postags
        tuples = list()
        for index in range(len(words)-1):
            arc_index = arcs[index].head
            arc_relation = arcs[index].relation
            tuples.append([index+1, words[index+1], postags[index+1], words[arc_index], postags[arc_index], arc_index, arc_relation])

        return tuples

    '''为句子中的每个词语维护一个保存句法依存儿子节点的字典'''
    def build_parse_child_dict(self, words, postags, tuples):
        child_dict_list = list()
        for index, word in enumerate(words):
            child_dict = dict()
            for arc in tuples:
                if arc[3] == word:
                    if arc[-1] in child_dict:
                        child_dict[arc[-1]].append(arc)
                    else:
                        child_dict[arc[-1]] = []
                        child_dict[arc[-1]].append(arc)
            child_dict_list.append([word, postags[index], index, child_dict])

        return child_dict_list

    '''parser主函数'''
    def parser_main(self, words, postags):
        tuples = self.syntax_parser(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, tuples)
        return tuples, child_dict_list

    '''基础语言分析'''
    def basic_process(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags, netags = self.basic_parser(words)
        name_entity_dist = self.format_entity(words, netags, postags)
        words, postags = self.rebuild_wordspostags(name_entity_dist, words, postags)
        return words, postags
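
A possible driver for this LtpParser, assuming the LTP 3.x model files really are under ./ltp_data as the constructor expects; it is a sketch, not part of the original example:

if __name__ == '__main__':
    ltp = LtpParser()                                   # loads cws/pos/parser/ner models from ./ltp_data
    words, postags = ltp.basic_process('李克强总理今天来我家了。')
    tuples, child_dict_list = ltp.parser_main(words, postags)
    for t in tuples:                                    # [idx, word, pos, head word, head pos, head idx, relation]
        print(t)
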
Beispiel #9
0
class Model:
    def __init__(self):
        self.name_says = defaultdict(
            list)  #定义成全局变量有可能从sentence_process()中写入,也可能从single_sentence()写入
        self.model = Word2Vec.load(path)
        self.word_total_count = self.model.corpus_total_words
        self.word_dict = self.model.wv.vocab
        self.dim = 256

        self.postagger = Postagger()  # 初始化实例
        self.postagger.load(pos_model_path)  # 加载模型

        self.say_sim = [
            '诊断', '交代', '说', '说道', '指出', '报道', '报道说', '称', '警告', '所说', '告诉',
            '声称', '表示', '时说', '地说', '却说', '问道', '写道', '答道', '感叹', '谈到', '说出',
            '认为', '提到', '强调', '宣称', '表明', '明确指出', '所言', '所述', '所称', '所指', '常说',
            '断言', '名言', '告知', '询问', '知道', '得知', '质问', '问', '告诫', '坚称', '辩称',
            '否认', '还称', '指责', '透露', '坦言', '表达', '中说', '中称', '他称', '地问', '地称',
            '地用', '地指', '脱口而出', '一脸', '直说', '说好', '反问', '责怪', '放过', '慨叹', '问起',
            '喊道', '写到', '如是说', '何况', '答', '叹道', '岂能', '感慨', '叹', '赞叹', '叹息',
            '自叹', '自言', '谈及', '谈起', '谈论', '特别强调', '提及', '坦白', '相信', '看来', '觉得',
            '并不认为', '确信', '提过', '引用', '详细描述', '详述', '重申', '阐述', '阐释', '承认',
            '说明', '证实', '揭示', '自述', '直言', '深信', '断定', '获知', '知悉', '得悉', '透漏',
            '追问', '明白', '知晓', '发觉', '察觉到', '察觉', '怒斥', '斥责', '痛斥', '指摘', '回答',
            '请问', '坚信', '一再强调', '矢口否认', '反指', '坦承', '指证', '供称', '驳斥', '反驳',
            '指控', '澄清', '谴责', '批评', '抨击', '严厉批评', '诋毁', '责难', '忍不住', '大骂',
            '痛骂', '问及', '阐明'
        ]
        self.valid_sentence = []

        self.parser = Parser()
        self.parser.load(par_model_path)

        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)

    # @functools.lru_cache()
    # @fn_timer
    def get_count(self, word):
        """
        O(1)
        """
        # word_count = 0 #定义默认值
        vector = np.zeros(1)  #定义默认值

        if word in self.word_dict:
            wf = self.word_dict[word].count
            wv = self.model.wv[word]
        else:
            wf = 1
            wv = np.zeros(self.dim)
        return wf / self.word_total_count, wv

        # keys = self.model.wv.vocab.keys()
        # 获取词频及词向量
        # total_words_count = sum([v.count for k,v in self.model.wv.vocab.items()]) #单词总数
        # if word in keys:
        #    word_count = self.model.wv.vocab[word].count
        #    vector = self.model.wv[word] # 单词词语数量
        # word_frequency=word_count/total_words_count # 词频
        # return word_frequency,vector

    #获取句子向量
    #TODO: 计算P(w)的过程可以优化
    def sentence_embedding(self, sentence):
        # 按照论文算法Vs=1/|s|*∑a/(a+p(w))*Vw
        sentences = self.process_content(sentence).replace(' ', '')
        a = 1e-3  #0.001
        # words = list(self.pyltp_cut(sentences))
        # sentence_length = len(words) #句子长度
        # sum_vector = sum([a/(a+float(self.get_count(w)[0]))*self.get_count(w)[1] for w in words])

        words = list(self.pyltp_cut(sentences))
        sum_vector = np.zeros(self.dim)
        for w in words:
            wf, wv = self.get_count(w)
            sum_vector += a / (a + wf) * wv

        # sentence_vector = sum_vector/sentence_length
        return sum_vector / max(len(words), 1)  # guard against an empty sentence

    # 欧式距离
    def euclidSimilar(self, inA, inB):
        return 1.0 / (1.0 + la.norm(inA - inB))

    # 皮尔逊相关系数
    def pearsonSimilar(self, inA, inB):
        if len(inA) != len(inB):
            return 0.0
        if len(inA) < 3:
            return 1.0
        return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=0)[0][1]

    # 余弦相似度
    def cosSimilar(self, inA, inB):
        inA = np.mat(inA)
        inB = np.mat(inB)
        num = float(inA * inB.T)
        denom = la.norm(inA) * la.norm(inB)
        return 0.5 + 0.5 * (num / denom)

    # 句子依存分析
    def parsing(self, sentence):
        words = self.pyltp_cut(sentence)  # pyltp分词
        # words=list(jieba.cut(sentence)) #结巴分词
        postags = self.postagger.postag(words)  # 词性标注
        # tmp=[str(k+1)+'-'+v for k,v in enumerate(words)]
        # print('\t'.join(tmp))
        # parser = Parser() # 初始化实例
        # parser.load(par_model_path)  # 加载模型
        arcs = self.parser.parse(words, postags)  # 句法分析
        # parser.release()  # 释放模型
        return arcs

    # 命名实体
    # @functools.lru_cache()
    def get_name_entity(self, strs):
        sentence = ''.join(strs)
        # recognizer = NamedEntityRecognizer()  # 初始化实例
        # recognizer.load(ner_model_path)  # 加载模型
        # words = list(jieba.cut(sentence))  # 结巴分词
        words = self.pyltp_cut(sentence)  #pyltp分词更合理
        postags = self.postagger.postag(words)  # 词性标注
        netags = self.recognizer.recognize(words, postags)  # 命名实体识别
        # tmp=[str(k+1)+'-'+v for k,v in enumerate(netags)]
        # print('\t'.join(tmp))
        # recognizer.release()  # 释放模型
        return netags

    # 输入单个段落句子数组
    def valid_sentences_(self, sentences, res):
        expect = 0.76

        tmp = ""  # 储存前一个言论
        while sentences:
            curr = sentences.pop(0)
            if curr[0] == '“':  # 当前句子或为 “言论在发言人前的直接引用”。
                print(curr)
                people = re.search('”(.+)“|”(.+)', curr)  # 提取发言人所在句段
                if people:
                    people = [i for i in people.groups() if i][0]
                elif res:
                    res[-1][1] += '。' + curr
                    continue
                else:
                    continue

                saying = curr.replace(people, '')  # 剩余部分被假设为“言论”
                if res and self.judge_pronoun(people):
                    res[-1][1] += '。' + saying
                else:
                    comb = self.single_sentence(people)
                    if comb:
                        saying += comb[1] if comb[1] else ''
                        res.append([comb[0], saying])
                continue

            # 尝试提取新闻 发言人,言论内容
            combi = self.single_sentence(curr)

            # 无发言人: 当前句子属于上一个发言人的言论 或 不属于言论
            if not combi:
                if res and tmp and self.compare_sentence(
                        tmp, curr) > expect:  #基于句子相似度判断
                    print('{} - {} : {}'.format(
                        tmp, curr, self.compare_sentence(tmp, curr)))
                    res[-1][1] += '。' + curr
                    tmp = curr
                continue

            # 有发言人: 提取 发言人 和 言论。
            name, saying = combi
            if res and self.judge_pronoun(curr) and saying:
                res[-1][1] += '。' + saying
            elif saying:
                res.append([name, saying])
            tmp = saying
        return res

    # 输入单个段落句子数组(deprecated)
    #TODO: deprecated
    #def valid_sentences(self, sentences):
    #    expect = 0.75 #近似语句期望系数,本人根据测试估算值
    #    # n_s=defaultdict(list)  #用于返回人物:言论
    #    first = ''  #第一个句子
    #    if len(sentences) == 1:
    #        if self.single_sentence(sentences[0]):
    #            self.name_says[self.single_sentence(sentences[0])[0]].append(self.single_sentence(sentences[0])[1])
    #        return self.name_says
    #    while sentences:
    #        if len(sentences) == 1:
    #            second = sentences.pop(0)  # 第二个句子
    #        else:
    #            first = first + ',' + sentences.pop(0)  # 第一个句子与上一个叠加
    #            second = sentences.pop(0)  # 第二个句子

    #        if self.compare_sentence(first, second) > expect or (self.judge_pronoun(second) and self.single_sentence(second)) or (re.findall(r'^“(.+?)$”', second) and self.single_sentence(first)): #语句近似或者second为代词表达的句子
    #            first = first+','+second
    #        elif self.single_sentence(second) and self.single_sentence(first):
    #            self.name_says[self.single_sentence(first)[0]].append(self.single_sentence(first)[1]) #将第一个语句到此,解析后存入字典中
    #            first=second #第二语句赋值到第一语句
    #        else:
    #            first = first + ',' + second

    #    if self.single_sentence(first):#while循环后遗留的first句子
    #        self.name_says[self.single_sentence(first)[0]].append(self.single_sentence(first)[1])

    #    return self.name_says

    # 输入一个句子,若为包含‘说’或近似词则提取人物、言论,否则返回空
    # just_name:仅进行返回名字操作 ws:整句分析不进行多个“说判断”
    @functools.lru_cache()
    def single_sentence(self, sentence, just_name=False, ws=False):
        sentence = ','.join([x for x in sentence.split(',') if x])
        cuts = list(self.pyltp_cut(sentence))  # pyltp分词更合理
        # mixed = list(set(self.pyltp_cut(sentence)) & set(self.say_sim))
        # mixed.sort(key=cuts.index)

        # if not mixed: return False

        # 判断是否有‘说’相关词:
        mixed = [word for word in cuts if word in self.say_sim]
        if not mixed: return False

        ne = self.get_name_entity(tuple(sentence))  #命名实体
        wp = self.parsing(sentence)  #依存分析
        wp_relation = [w.relation for w in wp]
        postags = list(self.postagger.postag(cuts))
        name = ''

        stack = []
        for k, v in enumerate(wp):
            # save the most recent Noun
            if postags[k] in ['nh', 'ni', 'ns']:
                stack.append(cuts[k])

            if v.relation == 'SBV' and (cuts[v.head - 1] in mixed):  #确定第一个主谓句
                name = self.get_name(cuts[k], cuts[v.head - 1], cuts,
                                     wp_relation, ne)

                if just_name == True: return name  #仅返回名字
                says = self.get_says(cuts, wp_relation, [i.head for i in wp],
                                     v.head)
                if not says:
                    quotations = re.findall(r'“(.+?)”', sentence)
                    if quotations: says = quotations[-1]
                return name, says
            # 若找到‘:’后面必定为言论。
            if cuts[k] == ':':
                name = stack.pop()
                says = ''.join(cuts[k + 1:])
                return name, says
        return False

    # 输入主语第一个词语、谓语、词语数组、依存关系数组、命名实体标注,查找完整主语
    def get_name(self, name, predic, words, property, ne):
        index = words.index(name)
        cut_property = property[index + 1:]  #截取到name后第一个词语
        pre = words[:index]  #前半部分
        pos = words[index + 1:]  #后半部分
        #向前拼接主语的定语
        while pre:
            w = pre.pop(-1)
            w_index = words.index(w)

            if property[w_index] == 'ADV': continue
            if property[w_index] in ['WP', 'ATT', 'SVB'] and (w not in [
                    ',', '。', '、', ')', '('
            ]):
                name = w + name
            else:
                pre = False

        while pos:
            w = pos.pop(0)
            p = cut_property.pop(0)
            if p in ['WP', 'LAD', 'COO', 'RAD'] and w != predic and (w not in [
                    ',', '。', '、', ')', '('
            ]):
                name = name + w  # 向后拼接
            else:  #中断拼接直接返回
                return name
        return name

    # 获取谓语之后的言论
    def get_says(self, sentence, property, heads, pos):
        # word = sentence.pop(0) #谓语
        if ':' in sentence:
            return ''.join(sentence[sentence.index(':') + 1:])
        while pos < len(sentence):
            w = sentence[pos]
            p = property[pos]
            h = heads[pos]
            # 谓语尚未结束
            if p in ['DBL', 'CMP', 'RAD']:
                pos += 1
                continue
            # 定语
            if p == 'ATT' and property[h - 1] != 'SBV':
                pos = h
                continue
            # 宾语
            if p == 'VOB':
                pos += 1
                continue
            # if p in ['ATT', 'VOB', 'DBL', 'CMP']:  # 遇到此性质代表谓语未结束,continue
            #    continue
            else:
                if w == ',':
                    return ''.join(sentence[pos + 1:])
                else:
                    return ''.join(sentence[pos:])

    #解析处理语句并返回给接口
    def sentence_process(self, sentence):
        # 文章 -->清除空行
        # 文章 -->句号分割:如果句号分割A.B, 若B存在‘说’,对B独立解析,否则判断A | B是否相似,确定A是否抛弃B句。
        # 句子 -->确定主谓宾: 依存分析、命名实体识别 -->首先要找到宾语,然后确定宾语是否与说近似,若存在多个与‘说’近似,确定第一个为陈述。在说前找命名实体,说后面到本句结尾为宾语
        # 命名实体 -->通过命名实体识别,若S - NE, NE = S - NE。若B - NE / I - NE / E - NE,NE = B - NE + I - NE + E - NE

        self.name_says = defaultdict(list)
        sentence = sentence.replace('\r\n', '\n')
        sections = sentence.split('\n')  #首先切割成段落
        sections = [s for s in sections if s.strip()]
        valids = ''

        res = []
        for sec in sections:  #段落
            # sec = sec.replace('。”', '”。')  #当做纠正语法错误...
            # sentence_list = sec.split('。')  # 段落拆分成句子
            sentence_list = split(sec)
            sentence_list = [s.strip() for s in sentence_list if s.strip()]
            self.cut_sententce_for_name = [s for s in sentence_list if s]
            # valids = self.valid_sentences(sentence_list)
            res += self.valid_sentences_(sentence_list, [])
        #     print(valids)
        # print("*****************")
        # print(self.cut_sententce_for_name)
        # print(self.valid_sentences)
        # print(self.valid_sentence)
        # print("%%%%%%%%%%%%%")
        if res:
            self.name_says = defaultdict()
            for name, saying in res:
                if name and saying:
                    self.name_says[name] = self.name_says.get(
                        name, '') + saying + ' | '
        return self.name_says

    # 判断是否为代词结构句子“他认为...,他表示....”
    #@fn_timer
    def judge_pronoun(self, sentence):
        subsentence = re.search('(.+)“|”(.+)', sentence)
        if subsentence:
            sentence = subsentence.group(1)
        cuts = list(self.pyltp_cut(sentence))  # 确定分词
        wp = self.parsing(sentence)  # 依存分析
        postags = list(self.postagger.postag(cuts))
        for k, v in enumerate(wp):
            if v.relation == 'SBV' and postags[k] == 'r':  # 确定第一个主谓句
                return True
        return False

    # #获取人物及人物观点中的命名实体
    # def get_name_saywords(self,content):
    #     name_says=self.sentence_process(content)
    #     result=[]
    #     says_list=[]
    #     if name_says:
    #         for name,says in name_says.items():
    #             print(name)
    #             print(says)
    #             says_str = ''.join([''.join(s) for s in says])
    #             # name,says=name_says[0],name_says[1]
    #             name_entity=self.get_name_entity(tuple(says_str))
    #             name_entity=' '.join(name_entity)
    #             result.append((name,name_entity))
    #     else:
    #         return None

    #获取整个新闻文章中的命名实体
    #TODO: This function hasn't been used.
    # def get_news_ne(self,sentence):
    #     self.name_says = defaultdict(list)
    #     sections=sentence.split('\r\n') #首先切割成段落
    #     sections = [s for s in sections if s.strip()]
    #     ne_list = []
    #     for sec in sections: #段落
    #         words = list(self.pyltp_cut(sentence))
    #         nes = self.get_name_entity(tuple(sec))
    #         for k, v in enumerate(nes):
    #             if v != 'O':
    #                 ne_list.append(words[k])
    #     ne_list=list(set(ne_list))
    #     return ' '.join(ne_list)

    # #获取文章中关键词
    # def get_news_keywords(self,news,totalnews):
    #     print(news)
    #     print("*******************")
    #     print(totalnews)
    #句子比对皮尔逊系数
    def compare_sentence(self, inA, inB):
        inC = self.sentence_embedding(inA)
        inD = self.sentence_embedding(inB)
        return self.pearsonSimilar(inC, inD)  #皮尔逊
        # print(self.euclidSimilar(inC,inD))
        # print(self.pearsonSimilar(inC,inD))
        # print(self.cosSimilar(inC,inD))
        # print('------------------------')

    #pyltp中文分词
    #@fn_timer
    # @functools.lru_cache()
    def pyltp_cut(self, sentence):
        # segmentor = Segmentor()  # 初始化实例
        # segmentor.load(cws_model_path)  # 加载模型
        words = self.segmentor.segment(sentence)  # 分词
        # segmentor.release()  # 释放模型
        return words

    #结巴词性标注
    def jieba_pseg(self, sentence):
        return pseg.cut(sentence)

    #结巴与哈理工词性标注比较
    #TODO: function hasn't been used
    # def jieba_compare_pyltp(self,sentence):
    #     sentence = sentence.replace('\r\n', '\n')
    #     sections = sentence.split('\n')  # 首先切割成段落
    #     sections = [s for s in sections if s.strip()]
    #     for sec in sections:  # 段落
    #         sentence_list = sec.split('。')  # 段落拆分成句子
    #         sentence_list = [s for s in sentence_list if s]
    #         for sl in sentence_list:
    #             jieba_cut = list(jieba.cut(sl))
    #             jieba_pseg = list(self.jieba_pseg(sl))
    #             print("pyltp 分词:")
    #             pyltp=list(self.pyltp_cut(sl)) #pyltp分词
    #             print(pyltp)
    #             print("结巴分词:")
    #             print(jieba_cut)
    #             print("pyltp词性标注:")
    #             pyltp_pseg=list(self.postagger.postag(jieba_cut))
    #             print(pyltp_pseg)
    #             print("结巴词性标注:")
    #             print(jieba_pseg)
    #             parsed=[(x.head,x.relation) for x in list(self.parsing(sl))]
    #             print(parsed)

    def document_frequency(self, word, document):
        if sum(1 for n in document if word in n) == 0:
            print(word)
            print(type(document))
            print(len(document))
            print(document[0])
        return sum(1 for n in document if word in n)

    def idf(self, word, content, document):
        """Gets the inversed document frequency"""
        return math.log10(
            len(content) / self.document_frequency(word, document))

    def tf(self, word, document):
        """
        Gets the term frequemcy of a @word in a @document.
        """
        words = document.split()

        return sum(1 for w in words if w == word)

    #TODO: The function hasn't been used
    # def get_keywords_of_a_ducment(self,content,document):
    #     content=self.process_content(content)
    #     documents=[self.process_content(x) for x in document]
    #     words = set(content.split())
    #     tfidf = [(w, self.tf(w,content) * self.idf(w,content,documents)) for w in words]
    #     tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True)
    #     tfidf=' '.join([w for w,t in tfidf[:5]]) #取前5为关键词
    #     return tfidf

    def process_content(self, content):
        # print(type(content))
        # content=''.join(content)
        content = re.sub('[+——() ? 【】“”!,:。?、~@#¥%……&*()《 》]+', '', content)
        content = ' '.join(jieba.cut(content))
        return content

    def release_all(self):
        self.segmentor.release()
        self.recognizer.release()
        self.parser.release()
        self.postagger.release()
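
A sketch of how this Model might be used to pull speaker/quotation pairs out of a news paragraph. It assumes the module-level model paths, the Word2Vec model at `path` and the sentence-splitting helper referenced in sentence_process() are all available, so it is illustrative only:

if __name__ == '__main__':
    m = Model()                                    # loads word2vec + LTP models once
    news = '外交部发言人华春莹表示,中方对此表示强烈不满。\n她说:“我们敦促有关方面停止干涉。”'
    name_says = m.sentence_process(news)           # dict of speaker -> concatenated quotes
    for name, saying in name_says.items():
        print(name, '->', saying)
    m.release_all()                                # release all LTP models
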
Beispiel #10
0
class NERTagger(object):

    def __init__(self, model_dir_path, com_blacklist):
        # 初始化相关模型文件路径
        self.model_dir_path = model_dir_path
        self.cws_model_path = os.path.join(self.model_dir_path, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
        self.pos_model_path = os.path.join(self.model_dir_path, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self.ner_model_path = os.path.join(self.model_dir_path, 'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`

        # 初始化分词模型
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)

        # 初始化词性标注模型
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)

        # 初始化NER模型
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)

        # 初始化公司名黑名单
        self.com_blacklist = set()
        with open(com_blacklist, 'r', encoding='UTF-8') as f_com_blacklist:
            for line in f_com_blacklist:
                if len(line.strip()) > 0:
                    self.com_blacklist.add(line.strip())


    def ner(self, text, entity_dict):
        words = self.segmentor.segment(text)  # 分词
        post_tags = self.postagger.postag(words)
        ner_tags = self.recognizer.recognize(words, post_tags)  # 命名实体识别
        entity_list = []
        entity = ""
        for word, post_tag, ner_tag in zip(words, post_tags, ner_tags):
            tag = ner_tag[0]
            entity_type = ner_tag[2:]
            if tag == 'S' :  # 单独成实体
                entity_list.append((word, entity_type))
            elif tag in 'BIE':  # 实体的开始词、中间词、结束词
                entity += word
                if tag == 'E':
                    #判断公司名黑名单
                    if entity in self.com_blacklist:  # 黑名单公司名称,在配置文件中
                        entity_list.append((entity, "n"))
                    else:
                        entity_list.append((entity, entity_type))
                    entity = ""
            elif tag == 'O':  # 不构成命名实体
                if post_tag == 'nt':  # 如果词性是机构团体
                    entity += word
                else:
                    if entity != "":  # 这时候已经把刚才漏掉的机构团体名称重新赋值给entity了
                        entity_list.append((entity, 'nt'))
                        entity = ""
                    # 排除错误数字识别,例如“大宗”
                    if post_tag == 'm' and not re.match("[0-9]+.*",word):
                        post_tag = 'n'
                    # 识别数字中的百分数
                    if post_tag == 'm' and re.match("[0-9.]+%",word):
                        post_tag = 'mp'
                    entity_list.append((word, post_tag))
        entity_list = self.ner_tag_by_dict(entity_dict, entity_list)  # entity_dict空字典.抽取出命名实体中漏掉的公司简称实体
        return NERTaggedText(text, entity_list)

    def ner_tag_by_dict(self, entity_dict, entity_list):
        i = 0
        while i < len(entity_list) - 1:
            has_entity = False
            for entity_len in range(4,1,-1):
                segment = "".join([ x[0] for x in entity_list[i:i+entity_len]])  # 对多个相邻的词进行组合
                segment_uni = segment
                if segment_uni in entity_dict:  # 查看词是否在公司简称的字典里
                    has_entity = True
                    entity_list[i] = (segment, entity_dict[segment_uni])
                    del entity_list[i+1:i+entity_len]
                    i = i + entity_len
                    break
            if not has_entity:
                i += 1
        return entity_list


    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
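
A hedged usage sketch for NERTagger: the model directory, the blacklist file and the NERTaggedText wrapper returned by ner() are defined outside this snippet, so the paths below are placeholders:

if __name__ == '__main__':
    tagger = NERTagger('./ltp_data', './com_blacklist.txt')            # placeholder paths
    tagged = tagger.ner('阿里巴巴集团今天发布了第三季度财报。', {})      # empty entity_dict
    print(tagged)   # a NERTaggedText built from the (word, tag) pairs
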
Beispiel #11
0
                        cache = []
                        curid += 1
                    else:
                        tmp = tmp[tmp.find("|||") + 4:]
                        cache.append(tmp.split())
    if cache:
        rs.append(" ".join([
            "<qid_" + str(curid) + ">", "|||",
            getans(cache, frdt.readline(), mapd, postagger, recognizer)
        ]))
        cache = []
    rs = "\n".join(rs)
    with open(rsf, "w") as fwrt:
        fwrt.write(rs.encode("utf-8"))


if __name__ == "__main__":
    ltpdata = "/media/Storage/data/ltp_data/"
    postagger = Postagger()
    postagger.load(os.path.join(ltpdata, "pos.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(ltpdata, "ner.model"))

    if len(sys.argv) < 6:
        handle(sys.argv[1].decode("utf-8"), sys.argv[2].decode("utf-8"),
               sys.argv[3].decode("utf-8"), sys.argv[4].decode("utf-8"))
    else:
        handle(sys.argv[1].decode("utf-8"), sys.argv[2].decode("utf-8"),
               sys.argv[3].decode("utf-8"), sys.argv[4].decode("utf-8"),
               int(sys.argv[5].decode("utf-8")))
Beispiel #12
0
class Tools:
    def __init__(self):
        path = 'ltp_data_v3.4.0'
        self.par_model_path = os.path.join(path, 'parser.model')
        self.cws_model_path = os.path.join(path, 'cws.model')
        self.pos_model_path = os.path.join(path, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self.ner_model_path = os.path.join(path, 'ner.model')
        self.srl_model_path = os.path.join(path, 'pisrl_win.model')
        self.recognizer = NamedEntityRecognizer() # 初始化实例
        self.postagger = Postagger() # 初始化实例
        self.segmentor = Segmentor()  # 初始化实例
        self.labeller = SementicRoleLabeller() # 初始化实例
        self.parser = Parser() # 初始化实例
        self.parser.load(self.par_model_path)  # 加载模型
        self.labeller.load(self.srl_model_path)  # 加载模型
        self.recognizer.load(self.ner_model_path)  # 加载模型
        self.postagger.load(self.pos_model_path)  # 加载模型
        self.segmentor.load(self.cws_model_path)  # 加载模型

    def __del__(self):
        self.parser.release()
        self.labeller.release()
        # self.recognizer.release()
        self.postagger.release()
        self.segmentor.release()
    def read_file_or_dir(self,path):
        if not os.path.exists(path):
            print("路径不存在!")
            os._exit(1)  # os._exit() requires an exit status
        if os.path.isdir(path):
            file_list = os.listdir(path)
            file_path_list = [path + "/" + file_name for file_name in file_list]
            return file_path_list
        else:
            try:
                with open(path,encoding="utf-8") as rd:
                    content = rd.read()
            except UnicodeDecodeError:
                with open(path,encoding="gbk") as rd:
                    content = rd.read()
            return content

    def nltk(self,txt):#传入单句
        words = self.segmentor.segment(txt)
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        return list(words),list(postags),roles

    def deal_with_pos_str(self,pos_str):
        reg_pattern = "n+"
        reg_pattern_2 = "ncn"
        reg_pattern_9 = "an"
        reg_pattern_3 = "un"
        reg_pattern_4 = "v+"
        reg_pattern_5 = "rn"
        reg_pattern_8 = "vcv"
        reg_pattern_6 = "pbnv"
        reg_pattern_7 = "pnv"
        reg_pattern_10 = "av"

        pos_str = re.sub(reg_pattern,"n",pos_str)
        pos_str = re.sub(reg_pattern_2,"n",pos_str)
        pos_str = re.sub(reg_pattern_4,"v",pos_str)
        pos_str = re.sub(reg_pattern_5,"n",pos_str)
        pos_str = re.sub(reg_pattern_9,"n",pos_str)
        pos_str = re.sub(reg_pattern_3,"n",pos_str)
        pos_str = re.sub(reg_pattern_8,"v",pos_str)
        pos_str = re.sub(reg_pattern_6,"v",pos_str)
        pos_str = re.sub(reg_pattern_7,"v",pos_str)
        pos_str = re.sub(reg_pattern_10,"v",pos_str)
        return pos_str
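
The Tools class bundles segmentation, POS tagging, dependency parsing and semantic role labelling. A short sketch of calling nltk() on one sentence, assuming the model files (including pisrl_win.model) exist under ltp_data_v3.4.0:

if __name__ == '__main__':
    tools = Tools()
    words, postags, roles = tools.nltk('小明昨天在北京买了一本书。')
    print(words)
    print(postags)
    for role in roles:
        # one entry per predicate: its word index plus labelled argument spans
        print(role.index, [(arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
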
Beispiel #13
0
def cal_sentiment_NER(df_text):
    """
    natural language processing on every row from the input.
    1. for loop dataframe:
    2. preprocess text in the df.
    3. get entity using pyLTP
    4. get sentiment, keywords, summary using SnowNLP.
    5. append result to df
    Keyword Arguments:
    df_text -- a gftIO.GftTable or DataFrame with 'Conclusion' (text) and 'WritingDate' columns
    """
    # 词性标注
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
    postagger = Postagger()  # 初始化实例
    postagger.load(pos_model_path)  # 加载模型

    # 命名实体识别
    ner_model_path = os.path.join(LTP_DATA_DIR,
                                  'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`

    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)

    if isinstance(df_text, gftIO.GftTable):
        df_text = df_text.as_mutable_column_tab()
    df_result = pd.DataFrame(columns=[
        'datetime', 'people', 'geography', 'organization', 'keyword',
        'summary', 'score'
    ])
    for item in df_text[:10].iterrows():
        #  print(item[1]['Conclusion'])
        logging.info(item[0])

        text = item[1]['Conclusion']
        datetime = item[1]['WritingDate']
        if not pd.isnull(text):
            text_split = preprocessing.preprocess_string(text)
            # 词性标注
            #            postagger = Postagger()  # 初始化实例

            words = text_split.split()  # 分词结果
            postags = postagger.postag(words)  # 词性标注
            netags = recognizer.recognize(words, postags)  # 命名实体识别

            dict_netags = defaultdict(list)
            ls_netags = list(zip(netags, words))
            for x, y in ls_netags:
                dict_netags[x].append(y)

            s = SnowNLP(text)
            score = s.sentiments * 2
            # # 人名(Nh)、地名(Ns)、机构名(Ni)
            # # B、I、E、S
            ls_organization = [
                dict_netags[x] for x in ['S-Ni', 'B-Ni', 'E-Ni', 'I-Ni']
            ]
            ls_people = [
                dict_netags[x] for x in ['S-Nh', 'B-Nh', 'E-Nh', 'I-Nh']
            ]
            ls_geography = [
                dict_netags[x] for x in ['S-Ns', 'B-Ns', 'E-Ns', 'I-Ns']
            ]
            try:
                df_result = df_result.append(
                    {
                        'datetime':
                        datetime,
                        'keyword':
                        ','.join(s.keywords()),
                        'organization':
                        list(itertools.chain.from_iterable(ls_organization)),
                        'people':
                        list(itertools.chain.from_iterable(ls_people)),
                        'geography':
                        list(itertools.chain.from_iterable(ls_geography)),
                        'summary':
                        ';'.join(s.summary()),
                        'score':
                        score
                        # 'text': text,
                    },
                    ignore_index=True)
            except:
                continue
    return df_result
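
The grouping of LTP's B/I/E/S-prefixed NE tags into people, places and organizations can be illustrated in isolation. This standalone sketch uses made-up words and netags instead of real model output:

from collections import defaultdict
import itertools

# made-up tokens and NE tags in LTP's BIES scheme (Nh=person, Ns=place, Ni=organization)
words = ['华春莹', '在', '北京', '代表', '外交部', '发言']
netags = ['S-Nh', 'O', 'S-Ns', 'O', 'S-Ni', 'O']

dict_netags = defaultdict(list)
for tag, word in zip(netags, words):
    dict_netags[tag].append(word)

people = list(itertools.chain.from_iterable(dict_netags[t] for t in ['S-Nh', 'B-Nh', 'E-Nh', 'I-Nh']))
places = list(itertools.chain.from_iterable(dict_netags[t] for t in ['S-Ns', 'B-Ns', 'E-Ns', 'I-Ns']))
orgs = list(itertools.chain.from_iterable(dict_netags[t] for t in ['S-Ni', 'B-Ni', 'E-Ni', 'I-Ni']))
print(people, places, orgs)   # ['华春莹'] ['北京'] ['外交部']
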
Beispiel #14
0
def simlify(text):
    LTP_DATA_DIR = r'E:\anaconda\ltpmoxin\ltp_data'  # ltp模型目录的路径
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # 分词模型路径,模型名称为`cws.model`

    lexicon_path = os.path.join(LTP_DATA_DIR, 'lexicon')  # 分词词典lexicon

    segmentor = Segmentor()  # 初始化实例

    # segmentor.load(cws_model_path)  # 加载模型,如果不想自定义词典,就用这一句load模型即可

    segmentor.load_with_lexicon(cws_model_path,
                                lexicon_path)  # 加载模型,参数lexicon是自定义词典的文件路径

    words = segmentor.segment(text)  # 分词

    #print('|'.join(words))#打印分词结果

    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`

    postagger = Postagger()  # 初始化实例

    postagger.load(pos_model_path)  # 加载模型

    postags = postagger.postag(words)  # 词性标注,这里words是分词后的list

    #print(' | '.join(postags))

    postagger.release()  # 释放模型

    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`

    parser = Parser()  # 初始化实例

    parser.load(par_model_path)  # 加载模型

    arcs = parser.parse(words, postags)  # 句法分析
    parser.release()  # 释放模型
    #信息提取,结果展示

    rely_id = [arc.head for arc in arcs]  # 提取依存父节点id

    relation = [arc.relation for arc in arcs]  # 提取依存关系

    heads = ['Root' if id == 0 else words[id - 1]
             for id in rely_id]  # 匹配依存父节点词语

    #for i in range(len(words)):

    #print(relation[i] +'(' + words[i] +', ' + heads[i] +')')

    array = []
    for i in range(len(words)):
        dict = {}
        dict["dep"] = words[i]
        dict["gov"] = heads[i]
        dict["pos"] = relation[i]
        array.append(dict)
    return array

    # NOTE: the block below is unreachable dead code: it sits after the `return array`
    # above, so this NER model is never loaded or run.
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()  # 初始化实例
    recognizer.load(ner_model_path)  # 加载模型
    netags = recognizer.recognize(words, postags)  # 命名实体识别
    #for word, ntag in zip(words, netags):
    #   print(word + '/' + ntag)
    recognizer.release()  # 释放模型
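
simlify() returns a list of dictionaries describing the dependency arcs, where 'dep' is the dependent word, 'gov' its head word (or 'Root') and 'pos' the relation label. A minimal call, assuming the hard-coded Windows model path above exists:

if __name__ == '__main__':
    for arc in simlify('著名相声家成龙的师傅是马季。'):
        # arc is {'dep': dependent word, 'gov': head word or 'Root', 'pos': relation label}
        print(arc)
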
Beispiel #15
0
class LtpHelper(Component):
    """A new component"""
    name = "ltp"

    provides = []

    requires = []

    defaults = {}

    language_list = None

    def __init__(self, component_config: Dict[Text, Any] = None):
        super(LtpHelper, self).__init__(component_config)
        self.path = component_config['path']
        self.lexicon = component_config['lexicon']
        self.dimension = component_config['dimension']

        ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
        MODELDIR = os.path.join(ROOTDIR, self.path)
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"),
                                         self.lexicon)

        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, 'parser.model'))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    def extract_tokens(self, message: Message):
        tokens = list(self.segmentor.segment(message.text))
        segments = []
        start = 0
        for idx, token in enumerate(tokens):
            end = start + len(token)
            segments.append({'start': start, 'end': end})
            start = end
        message.set("segments", segments)
        message.set("tokens", tokens)

    def extract_poses(self, message: Message):
        if not message.get("tokens", default=None):
            self.extract_tokens(message)

        message.set("poses",
                    list(self.postagger.postag(message.get("tokens"))))

    def extract_tagseq(self, message: Message):
        """
        实体抽取, 这部分需要扩张
        :param message:
        :return:
        """
        message.set(
            "tagseq",
            list(
                self.recognizer.recognize(message.get("tokens"),
                                          message.get("poses"))))

    def extract_parses(self, message: Message):
        message.set(
            "arcs",
            self.parser.parse(message.get("tokens"), message.get("poses")))

    def extract_labels(self, message: Message):
        message.set(
            "labels",
            self.labeller.label(message.get("tokens"), message.get("poses"),
                                message.get("arcs")))

    def train(self, training_data: TrainingData, config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        """Train this component.
        """
        pass

    def extract_entities(self, message: Message):

        # step1. 序列标注
        self.extract_tagseq(message)

        # step2.
        tokens, labels = message.get("tokens"), message.get("tagseq")
        i, start, end = 0, 0, 0
        spans = []
        while i < len(labels):
            if labels[i].startswith('E'):
                dim = labels[i].split('-')[1]
                # 实体->词条
                value = "".join(tokens[start:i + 1])
                # 句子开始
                _start = get_start(start, tokens=tokens)
                # 句子结束
                _end = get_start(i, tokens=tokens) + len(value)
                ent = {
                    'label': self.dimension[dim],
                    'start': _start,
                    'end': _end,
                }
                spans.append(ent)
                start = 0
            elif labels[i].startswith('B'):
                start = i
            elif labels[i].startswith('S'):
                dim = labels[i].split('-')[1]
                value = "".join(tokens[i:i + 1])

                _start = get_start(i, tokens=tokens)
                _end = _start + len(value)
                ent = {
                    'label': self.dimension[dim],
                    'start': _start,
                    'end': _end,
                }
                spans.append(ent)
            else:  # O
                pass
            i += 1
        message.set("spans", spans, add_to_output=True)

    def extract_pronouns(self, message: Message, **kwargs: Any):
        pronouns = []
        tokens, poses = message.get("tokens"), message.get("poses")
        for i, (w, p) in enumerate(zip(tokens, poses)):
            if p == 'r' and legalPronouns(w):
                # 增加性别、单复数属性
                start = get_start(i, tokens=tokens)
                end = start + len(w)
                pronouns.append({
                    'start': start,
                    'end': end,
                    'label': "Pronoun"
                })
        message.set("spans",
                    message.get("spans", []) + pronouns,
                    add_to_output=True)

    def entity_segment(self, message: 'Message', **kwargs: Any):
        # type: (List, List[Dict])->List
        """ 属性链接

        :param tokens: [word, word, word]
        :param entities: [{'entity': 'A', 'body': 'word', 'start': 0, 'end': 1}, ...]
        :return: [word, word, word]
        """
        entities = message.get("entities")
        tokens = message.get("tokens")

        if len(entities) == 0:
            return tokens
        else:
            # 求出tokens中所有词的starts和ends的坐标
            lengths = [len(w) for w in tokens]
            pos = [0]
            for p in lengths:
                pos.append(p + pos[-1])
            starts = pos[:-1]
            ends = pos[1:]

            #  标注长度和位置信息
            i = 0
            for e in entities:
                e['length'], e['index'] = e['end'] - e['start'], i
                i += 1

            # 保证entities的start和end,在starts和ends里面,否则筛除
            valid_entities = [
                e for e in entities
                if (e['start'] in starts) and (e['end'] in ends)
            ]
            token_entities = [{
                'entity': w,
                'body': w,
                'start': start,
                'end': end
            } for w, start, end in zip(tokens, starts, ends)]
            # 对entities按长度的降序排列,意味着如果位置相同,长词语优先保留
            valid_entities.sort(key=lambda x: x['length'], reverse=True)
            valid_entities.extend(token_entities)
            valid_entities.sort(key=lambda x: x['start'], reverse=False)
            # 筛选实体,如有包含,较长的实体优先;如有交叉,先出现的实体优先;如完全相同,取第1个(意味着随机)
            p = 0
            filtered_entities = []

            for e in valid_entities:
                if e['start'] == p:
                    filtered_entities.append(e)
                    p = e['end']
            # 改变token
            word_tokens = [
                message.text[e['start']:e['end']] for e in filtered_entities
            ]

            # 记录词语的位置
            entity_selected = {}
            i = 1
            for e in filtered_entities:
                if 'length' in e:
                    e.update({'index': i})
                    entity_selected.update({i: e})
                i += 1

            valid_pos = list(entity_selected.keys())

            message.set("tokens", word_tokens)
            message.set("entity_selected", entity_selected)
            message.set("valid_pos", valid_pos)

    def link_analyze(self, message: Message, **kwargs: Any):

        tokens = message.get("tokens", [])
        postags = message.get("poses", [])
        arcs = message.get("arcs")
        arcs = [(arc.head, arc.relation) for arc in arcs]
        semantic = list(
            zip(list(range(1,
                           len(tokens) + 1)), tokens, postags, arcs))
        logging.debug('semantic structrue: {}'.format(semantic))
        # 以下是特殊情况下的句法调整
        # 第一种情况:记录动词“是”和“为”的位置
        loc = []
        for struc in semantic:
            if (struc[1] in ['是', '为']) and (struc[2] == 'v'):
                loc.append(struc[0])
        for i in loc:
            pre_loc = 0
            suf_loc = 0
            for j in range(1, i):
                if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1]
                                                     == 'SBV'):
                    pre_loc = j
            for j in range(i + 1, min(len(semantic) + 1,
                                      i + 10)):  # 最多间隔10个词语,对于宾语来说已经足够
                if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1]
                                                     == 'VOB'):
                    suf_loc = j
            if pre_loc and suf_loc:
                semantic[pre_loc - 1] = (semantic[pre_loc - 1][0],
                                         semantic[pre_loc - 1][1],
                                         semantic[pre_loc - 1][2], (suf_loc,
                                                                    'SEO'))

        # 第二种情况:此处是句法分析出错的情况,将实体识别成谓语成分SBV,词性为i
        loc = []
        for struc in semantic:
            if struc[2] == 'i':
                loc.append(struc[0])
        for i in loc:
            for j in range(1, i):
                if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1]
                                                     == 'SBV'):
                    semantic[j - 1] = (semantic[j - 1][0], semantic[j - 1][1],
                                       semantic[j - 1][2], (i, 'SEO'))

        # 第三种情况:记录动词“名叫”和“叫”的位置
        loc = []
        for struc in semantic:
            if (struc[1] in ['名叫', '叫', '叫做']) and (struc[2] == 'v'):
                loc.append(struc[0])
        for i in loc:
            for j in range(i + 1, min(len(semantic) + 1, i + 10)):
                if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1]
                                                     == 'VOB'):
                    semantic[j - 1] = (semantic[j - 1][0], semantic[j - 1][1],
                                       semantic[j - 1][2],
                                       (semantic[i - 1][3][0], 'SEO'))

        message.set('semantic', semantic, add_to_output=False)

    def process(self, message: Message, **kwargs: Any):
        """Process an incoming message.

        This is the components chance to process an incoming
        message. The component can rely on
        any context attribute to be present, that gets created
        by a call to :meth:`components.Component.pipeline_init`
        of ANY component and
        on any context attributes created by a call to
        :meth:`components.Component.process`
        of components previous to this one."""
        # TODO 分词, 如果利用其它分词组件, 需要进一步调整
        if not message.get("tokens", default=None):
            self.extract_tokens(message)
            # 词性标注
            self.extract_poses(message)
            # 句法依存
            self.extract_parses(message)
            # 抽取实体<序列标注+实体提取>
            self.extract_entities(message)
            # 抽取代词
            self.extract_pronouns(message)
        else:
            # rasa tokenizers
            tokens = message.get("tokens")
            message.set("tokenizers", tokens)
            # List tokens
            tokens = [tokenizer_extract(token) for token in tokens]
            message.set("tokens", tokens)
            self.extract_poses(message)
            # 句法依存
            self.extract_parses(message)
            # 抽取实体<序列标注+实体提取>
            # 语义分割 ->
            self.entity_segment(message)
            # 属性分析 ->
            self.link_analyze(message)

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Optional[Text] = None,
             model_metadata: Optional[Metadata] = None,
             cached_component: Optional = None,
             **kwargs):
        return cls(meta)
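
extract_entities() walks the B/I/E/S label sequence and turns it into character-offset spans. The core idea can be sketched without Rasa or the LTP models; get_start() is assumed to return the character offset of a token in the joined text, and the dimension-name mapping is skipped here:

def char_start(idx, tokens):
    # assumed behaviour of get_start(): character offset of tokens[idx] in the joined text
    return sum(len(t) for t in tokens[:idx])

# hypothetical segmentation and NER label sequence
tokens = ['中国', '外交部', '发言人', '华春莹', '说']
labels = ['B-Ni', 'E-Ni', 'O', 'S-Nh', 'O']

spans, start = [], 0
for i, label in enumerate(labels):
    if label.startswith('B'):
        start = i
    elif label.startswith('E'):
        value = ''.join(tokens[start:i + 1])
        spans.append({'label': label.split('-')[1],
                      'start': char_start(start, tokens),
                      'end': char_start(start, tokens) + len(value)})
    elif label.startswith('S'):
        value = tokens[i]
        spans.append({'label': label.split('-')[1],
                      'start': char_start(i, tokens),
                      'end': char_start(i, tokens) + len(value)})
print(spans)   # [{'label': 'Ni', 'start': 0, 'end': 5}, {'label': 'Nh', 'start': 8, 'end': 11}]
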
Beispiel #16
0
if __name__ == '__main__':

    # testLine = '著名相声家成龙的师傅是马季。'

    while True:
        testLine = raw_input('请输入字符串:(-1退出)')

        namedEntityTagTupleList = []

        segmentor = Segmentor()
        # segmentor.load(inout.getLTPPath(index.CWS))
        segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                    inout.getResourcePath('userDic.txt'))
        words = segmentor.segment(testLine)
        segmentor.release()
        postagger = Postagger()
        postagger.load(inout.getLTPPath(index.POS))
        postags = postagger.postag(words)
        postagger.release()
        recognizer = NamedEntityRecognizer()
        recognizer.load(inout.getLTPPath(index.NER))
        netags = recognizer.recognize(words, postags)
        recognizer.release()

        for word, netag in zip(words, netags):
            namedEntityTagTupleList.append((word, netag))

        neTagList = '\t'.join(netags).split('\t')

        printEscapeStr(namedEntityTagTupleList)
        printEscapeStr(neTagList)
Beispiel #17
0
class LtpModel(object):
    """
    封装pyltp model 类,方便使用
    """
    @pysnooper.snoop()
    def __init__(self, LTP_DATA_DIR):
        """加载pyltp模型"""
        self.LTP_DATA_DIR = LTP_DATA_DIR  # pyltp的存放路径

        # 分词模型路径,分词模型名称是 'cws.model'
        cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)

        # 词性标注模型路径,分词模型名称是 'pos.model'
        pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
        self.postager = Postagger()
        self.postager.load(pos_model_path)

        # 命名实体识别模型路径,模型名称为'ner.model'
        ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)

        # 依存句法分析模型路径,模型名称为 'parser.model'
        par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)

        # # 语义角色标注模型目录路径,模型目录为'pisrl.model'
        # srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl.model')
        # self.labeller = SementicRoleLabeller()  # 初始化实例
        # self.labeller.load(srl_model_path)  # 加载模型

    def load_model(self):
        # """加载pyltp模型"""
        # # 分词模型路径,分词模型名称是‘cws.model’
        # self.segment = Segmentor()
        # print(cws_model_path)
        # self.segment.load(cws_model_path)

        # # 词性标注模型路径,分词模型名称是‘pos.model’
        # self.postager = Postagger()
        # self.postager.load(pos_model_path)
        #
        # # 命名实体识别模型路径,模型名称为`pos.model`
        # self.recognizer = NamedEntityRecognizer()
        # self.recognizer.load(ner_model_path)
        #
        # # 依存句法分析模型路径,模型名称为`parser.model`
        # self.parser = Parser()
        # self.parser.load(par_model_path)
        #
        # # 语义角色标注模型目录路径,模型目录为`srl`
        # self.labeller = SementicRoleLabeller()  # 初始化实例
        # self.labeller.load(srl_model_path)  # 加载模型

        # 加载word2vec 模型
        pass

    @pysnooper.snoop()
    def release_all_model(self):
        """释放模型"""
        self.segmentor.release()
        self.postager.release()
        self.recognizer.release()
        self.parser.release()
        # word2vec 模型的释放
        pass

    # 分句
    @pysnooper.snoop()
    def split_sentences(self, string):
        sents = SentenceSplitter.split(string)
        sentences = [s for s in sents if len(s) != 0]
        return sentences

    def jieba_word_cut(self, string):
        string = re.findall(
            '[\d|\w|\u3002 |\uff1f |\uff01 |\uff0c |\u3001 |\uff1b |\uff1a |\u201c |\u201d |\u2018 |\u2019 |\uff08 |\uff09 |\u300a |\u300b |\u3008 |\u3009 |\u3010 |\u3011 |\u300e |\u300f |\u300c |\u300d |\ufe43 |\ufe44 |\u3014 |\u3015 |\u2026 |\u2014 |\uff5e |\ufe4f |\uffe5]+',
            string)
        string = ' '.join(string)

        return ' '.join(jieba.cut(string))

    # 分词
    @pysnooper.snoop()
    def split_words(self, sentences):
        sents = [self.jieba_word_cut(s) for s in sentences]
        return sents

    # 词性分析
    @pysnooper.snoop()
    def get_word_pos(self, sents):
        postags = [self.postager.postag(words.split()) for words in sents]
        postags = [list(w) for w in postags]
        return postags

    # 依存句法分析
    @pysnooper.snoop()
    def dependency_parsing(self, sents, postags, said):

        contents = []
        for index in range(len(sents)):
            wo = sents[index].split()
            po = postags[index]

            netags = self.recognizer.recognize(wo, po)  # 命名实体识别
            netags = list(netags)
            # print(netags)
            if ('S-Nh' not in netags) and ('S-Ni' not in netags) and (
                    'S-Ns'
                    not in netags):  # 人名、机构名、地名  当人名、机构名、地名在该句中则进行依存句法分析
                continue

            arcs = self.parser.parse(wo, po)

            arcs = [(arc.head, arc.relation) for arc in arcs]
            # print(arcs)  #[(2, 'SBV'), (0, 'HED'), (5, 'SBV'), (5, 'ADV'), (2, 'VOB')]
            arcs = [(i, arc) for i, arc in enumerate(arcs)
                    if arc[1] == 'SBV']  # SBV 主谓关系 找出主谓关系的句子
            # print(arcs)  #[(0, (2, 'SBV')), (2, (5, 'SBV'))]
            for arc in arcs:
                verb = arc[1][0]  # 2  5
                subject = arc[0]  # 0  1
                if wo[verb - 1] not in said:  # 若谓语不在"说"近义词表 said 中,则跳过该主谓对
                    continue
                # print(wo[subject],wo[verb - 1],''.join(wo[verb:]))
                contents.append((wo[subject], wo[verb - 1],
                                 ''.join(wo[verb:])))  # 依次为人物、"说"的近义词、文本

        return contents

    @pysnooper.snoop()
    def get_sentences_json_result(self, string):
        """
        对输入的句子进行SBV提取
        :param string:
        :return: list of dict [{}]
        """

        sentences = self.split_sentences(string)  # 分句
        sents = self.split_words(sentences)  # 分词
        postags = self.get_word_pos(sents)  # 词性分析
        contents = self.dependency_parsing(sents, postags, txt_said)  # 依存句法分析

        # 拼装json结果
        contents_dict = []
        for name, trigger, content in contents:
            # json 字段
            result = {
                'name': name,
                'trigger': trigger,
                'content': content
            }
            contents_dict.append(result)
        return contents_dict
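
A sketch of running get_sentences_json_result() on a paragraph. The model directory below is a placeholder, and the txt_said list of '说'-type trigger words is assumed to be defined elsewhere in the original file:

if __name__ == '__main__':
    model = LtpModel('./ltp_data')          # placeholder model directory
    text = '外交部发言人华春莹表示,中方坚决反对。李克强说,要继续推进改革。'
    for item in model.get_sentences_json_result(text):
        print(item)                         # {'name': speaker, 'trigger': "say"-type verb, 'content': the quote}
    model.release_all_model()
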
Beispiel #18
0
class LtpParser:
    def __init__(self):
        LTP_DIR = r'E:\LTP\ltp_data_v3.4.0'  # ltp模型目录的路径(raw string,避免反斜杠被当作转义)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(
            LTP_DIR, "cws.model"))  # 分词模型路径,模型名称为`cws.model`

        self.postagger = Postagger()
        self.postagger.load(os.path.join(
            LTP_DIR, "pos.model"))  # 词性标注模型路径,模型名称为`pos.model`

        self.parser = Parser()
        self.parser.load(os.path.join(
            LTP_DIR, "parser.model"))  # 依存句法分析模型路径,模型名称为`parser.model

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(
            LTP_DIR, "ner.model"))  # 命名实体识别模型路径,模型名称为`ner.model`

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, "pisrl_win.model")
                           )  # 语义角色标注模型目录路径,模型目录为`srl`。注意该模型路径是一个目录,而不是一个文件。

    '''语义角色标注'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            print(words[role.index])
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  #arcs的索引从1开始
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # 提取依存父节点id
        relation = [arc.relation for arc in arcs]  # 提取依存关系
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # 匹配依存父节点词语
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''parser主函数'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def sentence_splitter(self, sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档'):
        sents = SentenceSplitter.split(sentence)  # 分句
        return (list(sents))
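# A minimal usage sketch for the LtpParser class above, assuming the model files
# exist under the LTP_DIR configured in __init__; the sample sentence is an assumption.
if __name__ == '__main__':
    ltp = LtpParser()
    sentence = '李克强总理今天来我家了,我感到非常荣幸'
    words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(sentence)
    print(words)
    print(postags)
    print(format_parse_list)  # 每一项形如 ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']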
Beispiel #19
0
from pyltp import Postagger
from pyltp import Segmentor
from pyltp import NamedEntityRecognizer
from pyltp import Parser

pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`

postagger = Postagger() # 初始化实例
postagger.load_with_lexicon(pos_model_path, './model/lexicon')  # 加载模型, 及自定外部詞典
#postagger.load(pos_model_path)  # 加载模型, 及自定外部詞典

segmentor = Segmentor()  # 初始化实例
segmentor.load_with_lexicon(cws_model_path, './model/lexicon') # 加载模型,第二个参数是您的外部词典文件路径
recognizer = NamedEntityRecognizer() # 初始化实例
recognizer.load(ner_model_path)  # 加载模型

par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
parser = Parser()  # 初始化实例
parser.load(par_model_path)  # 加载模型

words = segmentor.segment(text)
postags = postagger.postag(words)  # 词性标注
netags = recognizer.recognize(words, postags)  # 命名实体识别

arcs = parser.parse(words, postags)  # 句法分析

print ('=' * 30)
print ('\t'.join(words))
print ('\t'.join(postags))
print ('\t'.join(netags))
print ("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))


segmentor.release()
postagger.release()
recognizer.release()
parser.release()
Beispiel #20
0
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import time
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
import re

segmentor = Segmentor()  # 初始化实例
segmentor.load('/home/sherlock/Documents/ltp_data/cws.model')
#实例化词性工具
postagger = Postagger()  # 初始化实例
postagger.load('/home/sherlock/Documents/ltp_data/pos.model')  # 加载模型
recognizer = NamedEntityRecognizer()
recognizer.load('/home/sherlock/Documents/ltp_data/ner.model')


def wdseg(inputstr, ret_type):

    words = segmentor.segment(inputstr)  # 分词
    if ret_type == 'str':
        seg_word = ' '.join(words)
    elif ret_type == 'lst':
        seg_word = list(words)
    else:
        raise ValueError("ret_type must be 'str' or 'lst'")

    #segmentor.release()  # 释放模型
    return seg_word

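# A minimal usage sketch for wdseg() above (Python 2, matching this example);
# the sample sentence is an assumption.
if __name__ == '__main__':
    print wdseg('我爱北京天安门', 'str')  # 以空格连接的字符串
    print wdseg('我爱北京天安门', 'lst')  # 词语列表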
Beispiel #21
0
class LtpParser:
    def __init__(self):
        LTP_DIR = "/home/python/ltp/ltp_data_v3.4.0"

        # 分词模型,单文件
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        # 词性标注模型,单文件
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        # 依存句法分析模型,单文件
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        # 命名实体识别模型,单文件
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        # 语义角色标注模型,多文件
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    def release_model(self):
        # 释放模型
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()

    # 命名实体识别
    def entity_ner(self, words, postags):
        netags = self.recognizer.recognize(words, postags)  # 命名实体识别
        entity_ner = list()
        for word, ntag in zip(words, netags):
            entity_ner.append((word, ntag))
        return entity_ner

    # 语义角色标注
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    # 句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arcs的索引从1开始
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # 提取依存父节点id
        relation = [arc.relation for arc in arcs]  # 提取依存关系
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # 匹配依存父节点词语
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    # 通用实体识别
    def comm_ner(self, sentence):
        class_entity = {"Ni": "institution", "Ns": "place", "Nh": "name"}
        words = jieba.cut(sentence)
        words = [word for word in words]
        postags = list(self.postagger.postag(words))
        entity_ner = self.entity_ner(words, postags)
        result = set()
        entity = list()
        index = 0
        for item in entity_ner:
            entity_name = item[0]
            entity_bz = item[1]
            temp = entity_bz.split("-")
            if len(temp) == 2:
                bz = temp[0]
                type = temp[1]
                if bz == "S":
                    result.add((entity_name, class_entity.get(type)))
                else:
                    entity.append((index, entity_name, class_entity.get(type)))
                    if bz == "E":
                        index = index + 1
        if len(entity) > 0:
            entitydf = pd.DataFrame(entity)
            resulttemp = entitydf.groupby([
                0, 2
            ])[1].apply(lambda x: "".join(list(x))).reset_index(name='实体名称')
            for item in np.array(resulttemp[["实体名称", 2]]).tolist():
                result.add(tuple(item))
        print("ltp=", result)
        return result
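# A usage sketch for the class above; the sample sentence is an assumption, and
# jieba/pandas/numpy are expected to be imported elsewhere in the original project.
# comm_ner() returns a set of (实体名称, 类型) pairs such as ('李克强', 'name').
if __name__ == '__main__':
    ltp = LtpParser()
    print(ltp.comm_ner('李克强总理今天来我家了'))
    ltp.release_model()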
Beispiel #22
0
#!/usr/bin/env python
# coding=utf-8

from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
from pprint import pprint

segmentor = Segmentor()
segmentor.load_with_lexicon("./ltp_data/cws.model", './construct_dict.txt')
# segmentor.load("./ltp_data/cws.model")  # 分词模型
postagger = Postagger()
postagger.load("./ltp_data/pos.model")  # 词性标注
parser = Parser()
parser.load("./ltp_data/parser.model")  # 依存句法分析
recognizer = NamedEntityRecognizer()
recognizer.load("./ltp_data/ner.model")  # 命名实体识别

in_file_name = "input_test"
out_file_name = "output.txt"
in_file = open(in_file_name, 'r', encoding="utf-8")
out_file = open(out_file_name, 'w+', encoding="utf-8")

construct_list = []


def get_contruct_list():
    with open('construct_dict.txt', 'r', encoding="utf-8") as f:
        for line in f:
            construct = line.strip()
            if construct not in construct_list:
                construct_list.append(construct)
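# A sketch of how the models loaded above might be applied to each line of the
# input file; this driver loop and its output format are assumptions, not part
# of the original example.
for line in in_file:
    line = line.strip()
    if not line:
        continue
    words = segmentor.segment(line)
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)
    arcs = parser.parse(words, postags)
    out_file.write('\t'.join('%s/%s/%s' % item for item in zip(words, postags, netags)) + '\n')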
Beispiel #23
0
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    def format_labelrole(self, words, postags):
        '''语义角色标注'''
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def build_parse_child_dict(self, words, postags, arcs):
        '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典'''
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  #arcs的索引从1开始
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # 提取依存父节点id
        relation = [arc.relation for arc in arcs]  # 提取依存关系
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # 匹配依存父节点词语
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        '''parser主函数'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
Beispiel #24
0
class LTP_CLASS(object):
    def __init__(self):
        self.LTP_DATA_DIR = '/Users/yf/Downloads/ltp_data_v3.4.0'
        # 自定义分词表
        self.cut_file = '/Users/yf/Downloads/ltp_data_v3.4.0/cut.txt'
        # 分词结果
        self.cut_list = []
        # 依存关系
        self.arcs = None
        # 词性
        self.part_speech_list = []
        # 分词
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(self.LTP_DATA_DIR, 'cws.model'), self.cut_file)
        # 词性标注
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.LTP_DATA_DIR, 'pos.model'))
        # 命名实体识别
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.LTP_DATA_DIR, 'ner.model'))
        # 依存句法分析
        self.parser = Parser()
        self.parser.load(os.path.join(self.LTP_DATA_DIR, 'parser.model'))
        # 语义角色标注
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(self.LTP_DATA_DIR, 'pisrl.model'))

        # 词性标注集
        self._dict = {
            "a": "形容词",
            "ni": "机构名称",
            "b": "其他名词修饰语",
            "nl": "位置名词",
            "c": "连词",
            "ns": "地名",
            "d": "副词",
            "nt": "时态名词",
            "e": "感叹",
            "nz": "其他专有名词",
            "g": "词素",
            "o": "拟声词",
            "h": "字首",
            "p": "介词",
            "i": "成语",
            "q": "数量",
            "j": "缩写",
            "r": "代词",
            "k": "后缀",
            "u": "辅助的",
            "m": "数",
            "v": "动词",
            "n": "一般名词",
            "wp": "标点",
            "nd": "方向名词",
            "ws": "外来词",
            "nh": "人名",
            "x": "最小意义单位"
        }
        # 依存句法关系
        self._dict2 = {
            "SBV": "主谓关系",
            "VOB": "动宾关系",
            "IOB": "间宾关系",
            "FOB": "前置宾语",
            "DBL": "兼语",
            "ATT": "定中关系",
            "ADV": "状中结构",
            "CMP": "动补结构",
            "COO": "并列关系",
            "POB": "介宾关系",
            "LAD": "左附加关系",
            "RAD": "右附加关系",
            "IS": "独立结构",
            "HED": "核心关系"
        }
        # 命名实体识别标注集
        self._dict3 = {
            "O": "这个词不是NE",
            "S": "这个词单独构成一个NE",
            "B": "这个词为一个NE的开始",
            "I": "这个词为一个NE的中间",
            "E": "这个词位一个NE的结尾"
        }
        self._dict4 = {"Nh": "人名", "Ni": "机构名", "Ns": "地名"}
        # 语义角色类型
        self._dict5 = {
            "ADV": "默认标记",
            "BNE": "受益人",
            "CND": "条件",
            "DIR": "方向",
            "DGR": "程度",
            "EXT": "扩展",
            "FRQ": "频率",
            "LOC": "地点",
            "MNR": "方式",
            "PRP": "目的或原因",
            "TMP": "时间",
            "TPC": "主题",
            "CRD": "并列参数",
            "PRD": "谓语动词",
            "PSR": "持有者",
            "PSE": "被持有"
        }

    # 释放对象
    def close_ltp(self):
        # 分词释放
        self.segmentor.release()
        # 词性释放
        self.postagger.release()
        # 实体释放
        self.recognizer.release()
        # 依存关系释放
        self.parser.release()
        # 语义角色释放
        self.labeller.release()

    # 分句
    def cut_split(self, msg):
        sents = SentenceSplitter.split(msg)
        return [i for i in sents]

    # 分词
    def cut_words(self, msg):
        words = self.segmentor.segment(msg)
        self.cut_list = [i for i in words]
        return self.cut_list

    # 词性标注
    def part_speech(self):
        postags = self.postagger.postag(self.cut_list)  # 词性标注
        self.part_speech_list = [i for i in postags]
        return self.part_speech_list

    # 实体识别
    def notional_words(self):
        return self.recognizer.recognize(self.cut_list,
                                         self.part_speech_list)  # 命名实体识别

    # 依存句法分析
    def interdependent(self):
        self.arcs = self.parser.parse(self.cut_list,
                                      self.part_speech_list)  # 句法分析
        return [(arc.head, arc.relation) for arc in self.arcs]

    # 语义角色标注
    def role(self):
        roles = self.labeller.label(self.cut_list, self.part_speech_list,
                                    self.arcs)  # 语义角色标注
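# A usage sketch for LTP_CLASS above; the model paths configured in __init__ and
# the sample text are assumptions.
if __name__ == '__main__':
    ltp = LTP_CLASS()
    for sent in ltp.cut_split('李克强总理今天来我家了。我感到非常荣幸。'):
        print(ltp.cut_words(sent))          # 分词
        print(ltp.part_speech())            # 词性标注
        print(list(ltp.notional_words()))   # 命名实体标签
        print(ltp.interdependent())         # [(父节点, 依存关系), ...]
    ltp.close_ltp()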
Beispiel #25
0
from myfuncs import get_person_entity_set
from pyltp import Segmentor, NamedEntityRecognizer, Parser, Postagger
import os
import platform


cwd = os.getcwd()
model_dir = '/Users/karloar/Documents/other/ltp_data_v3.4.0'
if platform.system() == 'Windows':
    model_dir = r'E:\ltp_data'

cws_model = os.path.join(model_dir, 'cws.model')
cwd_dict = os.path.join(cwd, 'dict.txt')
pos_model = os.path.join(model_dir, 'pos.model')
ner_model = os.path.join(model_dir, 'ner.model')
parser_model = os.path.join(model_dir, 'parser.model')

if __name__ == '__main__':
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cws_model, cwd_dict)
    # segmentor.load(cws_model)
    postagger = Postagger()
    postagger.load(pos_model)
    ner = NamedEntityRecognizer()
    ner.load(ner_model)
    parser = Parser()
    parser.load(parser_model)
    sentence = '新加坡《联合早报》曝出了赵薇与上海知名人士汪雨的儿子汪道涵热恋。'
    word_list = segmentor.segment(sentence)
    content = get_content_from_ltp(' '.join(list(word_list)), 'sdp')
    print(content)
Beispiel #26
0
class NLP:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir: str,用户自定义词典目录
        default_model_dir: str,ltp模型文件目录
    """
    default_user_dict_dir = '../../resource/'  # 默认的用户词典目录,清华大学法律词典
    default_model_dir = '../../model/'  # ltp模型文件目录
    
    def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir):
        self.default_user_dict_dir = user_dict_dir
        self.default_model_dir = model_dir
        # 初始化分词器
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(self.default_model_dir, "cws.model"))
        # pynlpir.open()  # 初始化分词器
        # 添加用户词典(法律文书大辞典与清华大学法律词典),这种方式是添加进内存中,速度更快
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # 文件夹则跳过
            if os.path.isdir(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                line = f.readline()
                while line:
                    word = line.strip('\n').strip()
                    jieba.add_word(word)
                    # print(c_char_p(word.encode()))
                    # pynlpir.nlpir.AddUserWord(c_char_p(word.encode()))
                    line = f.readline()

        # 加载ltp模型
        # 词性标注模型
        self.postagger = Postagger()
        postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # 命名实体识别模型
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # 依存句法分析模型
        self.parser = Parser()
        parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

        if postag_flag or ner_flag or parse_flag:
            print('load model failed!')

    def segment(self, sentence, entity_postag=dict()):
        """采用NLPIR进行分词处理
        Args:
            sentence: string,句子
            entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemmas: list,分词结果
        """
        # 添加实体词典
        # if entity_postag:
        #     for entity in entity_postag:
                # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
                # jieba.add_word(entity)
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # 单个用户词加入示例
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # 单个用户词加入示例
        # 分词,不进行词性标注
        # lemmas = pynlpir.segment(sentence, pos_tagging=False)
        # lemmas = jieba.lcut(sentence)
        # pynlpir.close()  # 释放
        lemmas = list(self.segmentor.segment(sentence))
        return lemmas

    def postag(self, lemmas):
        """对分词后的结果进行词性标注
        Args:
            lemmas: list,分词后的结果
            entity_dict: set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns:
            words: WordUnit list,包含分词与词性标注结果
        """
        words = []  # 存储句子处理后的词单元
        # 词性标注
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i+1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release()  # 释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word: str,单词
        Returns:
            post_tag: str,该单词的词性标注
        """
        post_tag = self.postagger.postag([word, ])
        return post_tag[0]

    def netag(self, words):
        """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Args:
            words: WordUnit list,包含分词与词性标注结果
        Returns:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        # print('\t'.join(netags))  # just for test
        words_netag = EntityCombine().combine(words, netags)
        # self.recognizer.release()  # 释放
        return words_netag

    def parse(self, words):
        """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns:
            *: SentenceUnit,该句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        # self.parser.release()
        return SentenceUnit(words)

    def close(self):
        """关闭与释放nlp"""
        # pynlpir.close()
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
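# A usage sketch for the NLP wrapper above; the resource/model directories and the
# sample sentence are assumptions, and WordUnit/EntityCombine/SentenceUnit are
# classes from the surrounding project.
if __name__ == '__main__':
    nlp = NLP()
    lemmas = nlp.segment('李克强总理今天来我家了')
    words = nlp.postag(lemmas)
    words_netag = nlp.netag(words)
    sentence = nlp.parse(words_netag)
    nlp.close()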
Beispiel #27
0
class PyltpAnalyzer(object):
    def __init__(self, fileDir=LTP_DATA_DIR):
        """

        :param filename:
        """
        print('77777&777777777777777')
        self.fileDir = fileDir
        # 初始化分词实例
        self.cws_model_path = os.path.join(
            self.fileDir, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)  # 加载模型
        # 初始化标注实例
        self.pos_model_path = os.path.join(
            self.fileDir, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)  # 加载模型

        # 初始化命名实体识别实例
        self.ner_model_path = os.path.join(
            self.fileDir, 'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)  # 加载模型

        #依存句法分析
        self.par_model_path = os.path.join(
            self.fileDir, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
        self.parser = Parser()  # 初始化实例
        self.parser.load(self.par_model_path)  # 加载模型

    def segmentSentence(self, sentence):
        return list(self.segmentor.segment(sentence))

    def segment(self, sentences):
        """

        :param sentences: 句子列表
        :return:句子分词结果
        """
        wordsList = []
        if sentences:
            for sentence in sentences:
                wordsList.append(list(self.segmentor.segment(sentence)))
        return wordsList

    def postag(self, wordsList):
        """

        :param wordsList: 句子分词列表
        :return: 句子分词词性标注结果
        """
        postagsList = []
        if wordsList:
            for words in wordsList:
                postagsList.append(list(self.postagger.postag(words)))
        return postagsList

    def recognize(self, wordsList, postagsList):
        """

        :param wordsList: 句子分词列表
        :param postagsList: 句子标注列表
        :return: 句子命名实体识别结果
        """
        netagsList = []
        if wordsList and postagsList:
            if len(wordsList) == len(postagsList):
                for words, postags in zip(wordsList, postagsList):
                    netagsList.append(
                        list(self.recognizer.recognize(words, postags)))
            else:
                print(
                    "wordsList = {} ,len(wordsList) = {}  and postagsList = {} ,len(postagsList)"
                    .format(wordsList, len(wordsList), postagsList,
                            len(postagsList)))
        else:
            print("wordsList = {}  and postagsList = {}".format(
                wordsList, postagsList))

        return netagsList

    def dependencyParse(self, wordsList, postagsList):
        """

        :param wordsList: 句子分词列表
        :param postagsList: 句子标注列表
        :return: 句子句法分析结果
        """
        arcsList = []
        if wordsList and postagsList:
            if len(wordsList) == len(postagsList):
                for words, postags in zip(wordsList, postagsList):
                    arcsList.append(list(self.parser.parse(
                        words, postags)))  #arc.head 父节点, arc.relation 依存关系
            else:
                print(
                    "wordsList = {} ,len(wordsList) = {}  and postagsList = {} ,len(postagsList)"
                    .format(wordsList, len(wordsList), postagsList,
                            len(postagsList)))
        else:
            print("wordsList = {}  and postagsList = {}".format(
                wordsList, postagsList))

        return arcsList

    def finalize(self):
        """
        释放所有没用到的模型
        :return:
        """
        self.segmentor.release()  # 释放分词模型
        self.postagger.release()  # 释放词性模型
        self.recognizer.release()  # 释放命名实体模型
        self.parser.release()  # 释放依存句法模型
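# A usage sketch for PyltpAnalyzer above; LTP_DATA_DIR must point to the model
# directory, and the sample sentences are assumptions.
if __name__ == '__main__':
    analyzer = PyltpAnalyzer()
    wordsList = analyzer.segment(['李克强总理今天来我家了。', '我感到非常荣幸。'])
    postagsList = analyzer.postag(wordsList)
    netagsList = analyzer.recognize(wordsList, postagsList)
    arcsList = analyzer.dependencyParse(wordsList, postagsList)
    print(netagsList)
    analyzer.finalize()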
Beispiel #28
0
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path  #  e.x: '/data1/research/matt/ltp/exwords.txt'
        # 分词
        self.segmentor = Segmentor()
        if not self.exword_path:
            # 是否加载额外词典
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)

        # 词性标注
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # 依存句法
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # 命名实体识别
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # 语义角色
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # 分词
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # 词性标注
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags

    # 依存语法
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # 命名实体识别
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # 语义角色识别
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start,
                            arg.range.end) for arg in role.arguments])
        return output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        self.output['role'] = self.ltp_labeller(self.words, self.postags,
                                                self.arcs)

        # 载入output
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
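# A usage sketch for ltp_api above; the model directory path and the sentence are
# assumptions (pisrl.model must be present for the labeller).
if __name__ == '__main__':
    ltp = ltp_api('./ltp_data')
    ltp.get_result('李克强总理今天来我家了')
    print(ltp.output['words'])
    print(ltp.output['postags'])
    print(ltp.output['arcs'])
    ltp.release()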
Beispiel #29
0
import os

from lxml import etree
from pyltp import Segmentor, Postagger, NamedEntityRecognizer

# model path
MODELDIR="/data/ltp/ltp-models/3.3.0/ltp_data"
#MODELDIR="/home/twjiang/01.lab/ltp_model/3.3.0/ltp_data"
print "正在加载LTP模型... ..."
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
print "模型加载完毕."
print "正在加载大词林实体词库... ..."
bigcilin_file = open("/users1/twjiang/03.data/entitys_bigcilin.txt")
bigcilin = []

line = bigcilin_file.readline()
while line:
    entity = line.strip()
    bigcilin.append(entity)
    line = bigcilin_file.readline()
bigcilin_file.close()

print "大词林实体词库加载完毕: 已加载%d实体" % (len(bigcilin))

piece_size = 3
Beispiel #30
0
        try:
            htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
            sz = re_charEntity.search(htmlstr)
        except KeyError:
            # 以空串代替
            htmlstr = re_charEntity.sub('', htmlstr, 1)
            sz = re_charEntity.search(htmlstr)
    return htmlstr


segmentor = Segmentor()  # 初始化实例
segmentor.load("/Users/guoziyao/repos/pyltp/ltp_data/cws.model")  # 加载模型
postagger = Postagger()  # 初始化实例
postagger.load("/Users/guoziyao/repos/pyltp/ltp_data/pos.model")  # 加载模型
recognizer = NamedEntityRecognizer()  # 初始化实例
recognizer.load("/Users/guoziyao/repos/pyltp/ltp_data/ner.model")  # 加载模型


def recgonize(url, tag):
    """
    :param url: 网址
    :param tag: 实体标记。Ni 机构名,Nh 人名,Ns 地名
    :return: 得到的结果
    """
    text = http_request(url)
    text = filter_tags(text).replace(' ', '')
    lines = text.split()

    result = []
    for line in lines:
        line = line.encode('utf-8')
Beispiel #31
0
'''
#根据词性,挑选句子对
import os
import codecs
import re
import pandas as pd
import sentence_parser
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from pyltp import SentenceSplitter
LTP_DIR = r'D:\LTP\MODEL\ltp_data'  # ltp模型目录的路径(使用原始字符串,避免反斜杠被转义)
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))  # 分词模型路径,模型名称为`cws.model`
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))  # 词性标注模型路径,模型名称为`pos.model`
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR,
                             "ner.model"))  # 命名实体识别模型路径,模型名称为`ner.model`
parser = Parser()
parser.load(os.path.join(LTP_DIR,
                         "parser.model"))  # 依存句法分析模型路径,模型名称为`parser.model


def postag_list(file, n_postags, line_total):
    with codecs.open(file, 'r', encoding='utf-8') as f:
        lines = [line.split(' ')[0].strip() for line in f]
    for line in lines:
        with codecs.open('../data/causality_sentences_2.txt',
                         'a',
                         encoding='utf-8') as fw:
            fw.write(line + '\n')
        sus_pos_wors = []
        words = list(segmentor.segment(line))
Beispiel #32
0
class TripleIE(object):
    def __init__(self,
                 in_file_path,
                 out_file_path,
                 model_path,
                 clean_output=False):
        self.logger = logging.getLogger("TripleIE")

        self.in_file_path = in_file_path
        self.out_file_path = out_file_path
        self.model_path = model_path
        self.clean_output = clean_output  # 输出是否有提示

        self.out_handle = None

        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(self.model_path, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.model_path, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(self.model_path, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.model_path, "ner.model"))

    def run(self, in_file_path=None, out_file_path=None):
        if in_file_path is not None:
            self.in_file_path = in_file_path
        if out_file_path is not None:
            self.out_file_path = out_file_path

        self.out_handle = open(self.out_file_path, 'a', encoding="utf-8")

        with open(self.in_file_path, "r", encoding="utf-8") as rf:
            self.logger.info("loadding input file {}...".format(
                self.in_file_path))
            text = ""
            for line in rf:
                line = line.strip()
                text += line
            self.logger.info("done with loadding file...")

            text = U.rm_html(text)
            sentences = U.split_by_sign(text)

            self.logger.info("detect {} sentences".format(len(sentences)))

            self.logger.info("start to extract...")
            for sentence in tqdm(sentences):
                self.extract(sentence)

            self.logger.info("done with extracting...")
            self.logger.info("output to {}".format(self.out_file_path))

        # close handle
        self.out_handle.close()

    def extract(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        ner = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)

        sub_dicts = self._build_sub_dicts(words, postags, arcs)
        for idx in range(len(postags)):

            if postags[idx] == 'v':
                sub_dict = sub_dicts[idx]
                # 主谓宾
                if 'SBV' in sub_dict and 'VOB' in sub_dict:
                    e1 = self._fill_ent(words, postags, sub_dicts,
                                        sub_dict['SBV'][0])
                    r = words[idx]
                    e2 = self._fill_ent(words, postags, sub_dicts,
                                        sub_dict['VOB'][0])
                    if self.clean_output:
                        self.out_handle.write("%s, %s, %s\n" % (e1, r, e2))
                    else:
                        self.out_handle.write("主谓宾\t(%s, %s, %s)\n" %
                                              (e1, r, e2))
                    self.out_handle.flush()
                # 定语后置,动宾关系
                if arcs[idx].relation == 'ATT':
                    if 'VOB' in sub_dict:
                        e1 = self._fill_ent(words, postags, sub_dicts,
                                            arcs[idx].head - 1)
                        r = words[idx]
                        e2 = self._fill_ent(words, postags, sub_dicts,
                                            sub_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            if self.clean_output:
                                self.out_handle.write("%s, %s, %s\n" %
                                                      (e1, r, e2))
                            else:
                                self.out_handle.write(
                                    "动宾定语后置\t(%s, %s, %s)\n" % (e1, r, e2))

                            self.out_handle.flush()

            # 抽取命名实体有关的三元组
            try:
                if ner[idx][0] == 'S' or ner[idx][0] == 'B':
                    ni = idx
                    if ner[ni][0] == 'B':
                        while len(ner) > 0 and len(
                                ner[ni]) > 0 and ner[ni][0] != 'E':
                            ni += 1
                        e1 = ''.join(words[idx:ni + 1])
                    else:
                        e1 = words[ni]
                    if arcs[ni].relation == 'ATT' and postags[
                            arcs[ni].head - 1] == 'n' and ner[arcs[ni].head -
                                                              1] == 'O':
                        r = self._fill_ent(words, postags, sub_dicts,
                                           arcs[ni].head - 1)
                        if e1 in r:
                            r = r[(r.index(e1) + len(e1)):]
                        if arcs[arcs[ni].head - 1].relation == 'ATT' and ner[
                                arcs[arcs[ni].head - 1].head - 1] != 'O':
                            e2 = self._fill_ent(
                                words, postags, sub_dicts,
                                arcs[arcs[ni].head - 1].head - 1)
                            mi = arcs[arcs[ni].head - 1].head - 1
                            li = mi
                            if ner[mi][0] == 'B':
                                while ner[mi][0] != 'E':
                                    mi += 1
                                e = ''.join(words[li + 1:mi + 1])
                                e2 += e
                            if r in e2:
                                e2 = e2[(e2.index(r) + len(r)):]
                            if r + e2 in sentence:
                                if self.clean_output:
                                    self.out_handle.write("%s, %s, %s\n" %
                                                          (e1, r, e2))
                                else:
                                    self.out_handle.write(
                                        "人名/地名/机构\t(%s, %s, %s)\n" %
                                        (e1, r, e2))

                                self.out_handle.flush()
            except:
                pass

    """
    :decription: 为句子中的每个词语维护一个保存句法依存儿子节点的字典
    :args:
        words: 分词列表
        postags: 词性列表
        arcs: 句法依存列表
    """

    def _build_sub_dicts(self, words, postags, arcs):
        sub_dicts = []
        for idx in range(len(words)):
            sub_dict = dict()
            for arc_idx in range(len(arcs)):
                if arcs[arc_idx].head == idx + 1:
                    if arcs[arc_idx].relation in sub_dict:
                        sub_dict[arcs[arc_idx].relation].append(arc_idx)
                    else:
                        sub_dict[arcs[arc_idx].relation] = []
                        sub_dict[arcs[arc_idx].relation].append(arc_idx)
            sub_dicts.append(sub_dict)
        return sub_dicts

    """
    :decription:完善识别的部分实体
    """

    def _fill_ent(self, words, postags, sub_dicts, word_idx):
        sub_dict = sub_dicts[word_idx]
        prefix = ''
        if 'ATT' in sub_dict:
            for i in range(len(sub_dict['ATT'])):
                prefix += self._fill_ent(words, postags, sub_dicts,
                                         sub_dict['ATT'][i])

        postfix = ''
        if postags[word_idx] == 'v':
            if 'VOB' in sub_dict:
                postfix += self._fill_ent(words, postags, sub_dicts,
                                          sub_dict['VOB'][0])
            if 'SBV' in sub_dict:
                prefix = self._fill_ent(words, postags, sub_dicts,
                                        sub_dict['SBV'][0]) + prefix

        return prefix + words[word_idx] + postfix
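# A usage sketch for TripleIE above; the file paths are assumptions, and logging
# is configured so that the class's logger output is visible.
if __name__ == '__main__':
    import logging
    logging.basicConfig(level=logging.INFO)
    ie = TripleIE('input.txt', 'triples.txt', './ltp_data', clean_output=True)
    ie.run()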
Beispiel #33
0
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
Beispiel #34
0
class MyLTP():
    def __init__(self):
        ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
        # sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path
        # Set your own model path
        self.MODELDIR = os.path.join(ROOTDIR, "./ltp_data")
        # Init LTP Model
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.parser = Parser()
        self.recognizer = NamedEntityRecognizer()
        self.labeller = SementicRoleLabeller()
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        self.labeller.load(os.path.join(self.MODELDIR, "pisrl.model"))

    # 下述函数返回值均为 list, list[0] 为第一个句子的运行结果
    # ---------------------------- 分词 -------------------------------
    def MySegmentor(self, paragraph):
        # 段落分成句子
        sentences = SentenceSplitter.split(paragraph)
        result = []
        for sentence in sentences:
            words = self.segmentor.segment(sentence)
            # 输出
            # print("\t".join(words))
            result.append(words)
        return result

    # ---------------------------- 词性标注 -------------------------------
    def MyPostagger(self, words):
        result = []
        for word in words:
            postags = self.postagger.postag(word)
            # list-of-string parameter is supported in 0.1.5
            # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
            # 输出
            # print("\t".join(postags))
            result.append(postags)
        return result

    # ---------------------------- 依存句法分析 -------------------------------
    def MyParser(self, words, postags):
        result = []
        for index in range(0, len(words)):
            arcs = self.parser.parse(words[index], postags[index])
            # 输出
            # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
            result.append(arcs)
        return result

    # ---------------------------- 命名实体识别 -------------------------------
    def MyRecognizer(self, words, postags):
        result = []
        for index in range(0, len(words)):
            netags = self.recognizer.recognize(words[index], postags[index])
            # 输出
            # print("\t".join(netags))
            result.append(netags)
        return result

    # ---------------------------- 语义角色标注 -------------------------------
    def MyRoleLabller(self, words, postags, arcs):
        result = []
        for index in range(0, len(words)):
            roles = self.labeller.label(words[index], postags[index],
                                        arcs[index])
            # 输出
            # for role in roles:
            #     print(role.index, "".join(
            #             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
            result.append(roles)
        return result