Example 1
from ltp import LTP


class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # loads the Small model by default
        # user_dict is the dictionary file; max_window is the maximum forward segmentation window
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        命名实体识别
        :param text: 原始文本
        :return: 从原始文本中抽取的命名实体
        """
        seg, hidden = self.ltp.seg(text)   # 分词
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            # join the tokens in the span to recover the full entity string
            entity.append(''.join(seg[0][start:end + 1]))
        return entity
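For illustration, a minimal usage sketch assuming the legacy ltp 4.x seg/ner API used above and a hypothetical dictionary file user_dict.txt:

# Hypothetical usage; "user_dict.txt" is a placeholder dictionary path.
recognizer = NamedEntity("user_dict.txt")
entities = recognizer.entity_recognition(["汤姆访问了同济大学。"])
print(entities)  # list of entity strings extracted from the sentence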
Example 2
import os
from ltp import LTP
from ..config.config import LEX_PATH
ltp = LTP(path="base")

ltp.init_dict(path=os.path.join(LEX_PATH, 'aerospace_lexicon.txt'))


def cut(sent):
    segment, _ = ltp.seg([sent])
    return segment[0]


def pos_cut(sent):
    segment, hidden = ltp.seg([sent])
    pos = ltp.pos(hidden)
    return [tuple(segment[0]), tuple(pos[0])]
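A hedged sketch of how these helpers might be called, assuming LEX_PATH and the aerospace lexicon resolve correctly; the output shapes follow the legacy ltp seg/pos API:

# Hypothetical calls; outputs are a token list and a (tokens, POS tags) pair.
print(cut("卫星发射进入倒计时。"))
print(pos_cut("卫星发射进入倒计时。"))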
Example 3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 10:08:36 2020

@author: DELL
"""
import os
from ltp import LTP
from ..config.config import LEX_PATH
ltp = LTP(path="base")

ltp.init_dict(path=os.path.join(LEX_PATH, 'air_lexicon.txt'))

def cut(sent):
    segment, _ = ltp.seg([sent])
    return segment[0]

def pos_cut(sent):
    segment, hidden = ltp.seg([sent])
    pos = ltp.pos(hidden)
    return [tuple(segment[0]), tuple(pos[0])]
Example 4
class NLP:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir: str,用户自定义词典目录
    """
    RESOURCE_DIR = os.path.abspath(
        os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "resource"))

    def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
        self.default_user_dict_dir = user_dict_dir
        # load the LTP model
        self.ltp = LTP(model_type)
        # Add user dictionaries (the legal-document dictionary and the Tsinghua University
        # legal lexicon); they are loaded into memory, which is faster.
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # skip directories
            if os.path.isdir(file_path):
                continue
            self.ltp.init_dict(file_path)

        # # POS tagging model
        # self.postagger = Postagger()
        # postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # # named entity recognition model
        # self.recognizer = NamedEntityRecognizer()
        # ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # # dependency parsing model
        # self.parser = Parser()
        # parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

    def segment(self, sentence, entity_postag=dict()):
        """采用NLPIR进行分词处理
        Args:
            sentence: string,句子
            entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemmas: list,分词结果
        """
        # 添加实体词典
        if entity_postag:
            for entity in entity_postag:
                self.ltp.add_words([entity])
        segment, hidden = self.ltp.seg([sentence])
        return segment[0], hidden

    def postag(self, segment, hidden):
        """对分词后的结果进行词性标注
        Args:
            segment: list,分词后的结果
        Returns:
            words: WordUnit list,包含分词与词性标注结果
        """
        words = []  # 存储句子处理后的词单元
        # 词性标注
        postags = self.ltp.pos(hidden)
        for i in range(len(segment)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, segment[i], postags[0][i])
            words.append(word)
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word: str,单词
        Returns:
            post_tag: str,该单词的词性标注
        """
        # treat the word as a pre-segmented one-word sentence
        _, hidden = self.ltp.seg([[word]], is_preseged=True)
        post_tag = self.ltp.pos(hidden)
        return post_tag[0][0]

    def netag(self, words, hidden):
        """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Args:
            words: WordUnit list,包含分词与词性标注结果
        Returns:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标书结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.ltp.ner(hidden, as_entities=False)
        words_netag = EntityCombine.combine(words, netags[0])
        return words_netag

    def parse_seged(self, words):
        lemmas = []  # segmentation results
        postags = []  # POS tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing on the pre-segmented words
        _, hidden = self.ltp.seg([lemmas], is_preseged=True)
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def parse(self, words, hidden):
        """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns:
            *: SentenceUnit,该句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def close(self):
        """关闭与释放nlp"""
        pass
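A hedged end-to-end sketch of how this class is presumably driven (segment → postag → netag → parse), assuming the resource directory with its dictionaries exists and that WordUnit, EntityCombine and SentenceUnit are importable from the surrounding project:

# Hypothetical driver for the pipeline above.
nlp = NLP()
segment, hidden = nlp.segment("被告与原告于2019年签订了借款合同。")
words = nlp.postag(segment, hidden)        # WordUnit list with POS tags
words_netag = nlp.netag(words, hidden)     # merge named entities into the WordUnits
sentence = nlp.parse(words_netag, hidden)  # SentenceUnit with dependency arcs
nlp.close()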




Example 5
from ltp import LTP

# text = '我现在在天津,我想知道这里的大学都有什么学校.'

# Add a user dictionary.
ltp = LTP()
# user_dict.txt is the dictionary file; max_window is the maximum forward segmentation window.
# Note: max_window must be at least as large as the longest entry in the dictionary.
ltp.init_dict(path="user_dict.txt", max_window=6)
# Custom words can also be added directly in code.
ltp.add_words(words=["肖申克的救赎", "长江大桥"], max_window=6)
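To check the effect of the custom dictionary, a minimal sketch using the same legacy seg API (the sentence is only an illustration):

# "长江大桥" was registered above, so it should come back as a single token.
segment, _ = ltp.seg(["南京市长江大桥建成于1968年。"])
print(segment[0])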

def searchKG(kglist, text):  # uses BERT to compute the distance
    tmp3 = []
Example 6
from ltp import LTP
import pandas as pd

df = pd.read_csv('分词以后的药食同源.csv', encoding='utf-8')

ltp = LTP()

# 中药大词典合集.txt is the dictionary file; max_window is the maximum forward segmentation window
ltp.init_dict(path="中药大词典合集.txt", max_window=4)
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
print(segment)
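The DataFrame loaded above is not used in this snippet; a hedged sketch of how each row might be segmented, assuming a hypothetical text column named 'content':

# Hypothetical loop; 'content' is a placeholder column name.
for sentence in df['content'].astype(str):
    segment, _ = ltp.seg([sentence])
    print(segment[0])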
Example 7
class NLP:
    """
    A wrapper built on top of LTP analysis results.

    """
    def __init__(self,
                 default_model_dir=LTP4_MODEL_DIR,
                 user_dict_dir=USER_DICT_DIR):
        self.ltp = LTP(path=default_model_dir)
        for file in os.listdir(user_dict_dir):
            self.ltp.init_dict(path=os.path.join(user_dict_dir, file))
        self.sentences = []
        self.postags = []
        self.nertags = []
        self.dep = []

    def segment(self, sentences):
        self.sentences = sentences
        lemmas, hidden = self.ltp.seg(sentences)
        return lemmas, hidden

    def postag(self, lemmas, hidden):
        """
        Build the WordUnit list from the POS-tagging result.
        :param lemmas:
        :param hidden:
        :return:
        """
        words = []
        postags = self.ltp.pos(hidden)
        self.postags = postags
        for idx_sent, postags_sent in enumerate(postags):
            words_sent = []
            for i in range(len(postags_sent)):
                # word IDs start from 1
                word = WordUnit(i + 1, lemmas[idx_sent][i], postags_sent[i])
                words_sent.append(word)
            words.append(words_sent)
        # for i in range(len(postags)):
        #     word = WordUnit(i+1, lemmas[i], postags[i])
        #     words.append(word)
        return words

    def nertag(self, words, hidden):
        """
        根据nertag的结果抽取words,将ner得到的信息作为pos的纠正和补充,例如n->ni/ns/nl
        :param lemmas:
        :param hidden:
        :return:
        """
        # Nh 人名     Ni 机构名      Ns 地名
        nertags = self.ltp.ner(hidden)
        self.nertags = nertags
        '''
        Triple extraction needs the NER information, so some words have to be merged into
        new words after NER analysis.
        NOTE: NER may merge several tokens into a single word.
        Example:
            [['高克', '访问', '中国', ',', '并', '在', '同济', '大学', '发表', '演讲', '。']]
            [['nh', 'v', 'ns', 'wp', 'c', 'p', 'nz', 'n', 'v', 'v', 'wp']]
            [[('Nh', 0, 0), ('Ns', 2, 2), ('Ni', 6, 7)]]
            [[(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'VOB'), (4, 2, 'WP'), (5, 9, 'ADV'), (6, 9, 'ADV'), (7, 8, 'ATT'), (8, 6, 'POB'), (9, 2, 'COO'), (10, 9, 'VOB'), (11, 2, 'WP')]]
        '''
        ner2pos = {'Nh': 'nh', 'Ns': 'ns', 'Ni': 'ni'}
        n = 1
        #for i in range(len(words)):
        for idx_sent, nertags_sent in enumerate(nertags):
            for item in nertags_sent:
                for i in range(item[1], item[2] + 1):
                    words[idx_sent][i].nertag = item[0]
                    words[idx_sent][i].postag = ner2pos[item[0]]
        # for item in nertags:
        #     for i in range(item[1], item[2]+1):
        #         words[i].postag = ner2pos[item[0]]
        return words

    def dependency(self, words, hidden):
        """
        Extract word information (mainly the dependency relations between words) from the
        dependency-parsing result, for later triple extraction.
        :param hidden:
        :return:
        """
        sentences = []
        dep = self.ltp.dep(hidden)
        for idx_sent, dep_sent in enumerate(dep):
            for i in range(len(words[idx_sent])):
                # e.g. [(1, 2, 'ATT'), (2, 3, 'ATT')] where the (3, 0, 'HED') arc is omitted
                if i < len(dep_sent):
                    words[idx_sent][i].head = dep_sent[i][1]  # stores the head word's ID, not its index
                    words[idx_sent][i].dependency = dep_sent[i][2]
                    # also record each word's children in the dependency tree:
                    # dep_sent[i][1] is the head word's ID;
                    # child_words stores (child word ID, dependency label)
                    words[idx_sent][dep_sent[i][1] - 1].child_words.append(
                        (dep_sent[i][0], dep_sent[i][2]))
            sentences.append(
                SentenceUnit(self.sentences[idx_sent], self.nertags[idx_sent],
                             words[idx_sent]))
        return sentences
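A hedged end-to-end sketch of this class (segment → postag → nertag → dependency), assuming LTP4_MODEL_DIR, USER_DICT_DIR, WordUnit and SentenceUnit come from the surrounding project:

# Hypothetical usage of the wrapper above.
nlp = NLP()
lemmas, hidden = nlp.segment(['高克访问中国,并在同济大学发表演讲。'])
words = nlp.postag(lemmas, hidden)          # per-sentence WordUnit lists
words = nlp.nertag(words, hidden)           # fold NER tags back into the WordUnits
sentences = nlp.dependency(words, hidden)   # SentenceUnit list with dependency arcs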