Example #1
import json
from typing import List

from ltp import LTP, FastLTP  # FastLTP is the ONNX-backed variant used when onnx=True


class Run(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        # Seed with a virtual root entry so the first word's offset starts at 0;
        # the root is dropped before returning.
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,  # token index of the entity start (not a character offset)
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def test(self, sentences: List[str] = None):
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        import numpy as np
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())

        np.savez(out, **nps)

        config = self.ltp.config
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

    def test_seged(self):
        import torch
        sentences = [
            'My name is tom.', 'He called Tom to get coats.', '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。', "我去长江大桥玩。"
        ]
        seg, hidden = self.ltp.seg(sentences)
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                test = torch.sum(value.float() -
                                 hidden_seged[key].float()).numpy()
                print(key, test)

        print(seg == seged)
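
# A minimal, hypothetical entry point for the Run class above (not part of the
# original example): instantiate it and print the analysis of a sample sentence.
if __name__ == '__main__':
    run = Run(path='small', batch_size=50)
    run.test(["他叫汤姆去拿外衣。"])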


import os

from ltp import LTP

# WordUnit, SentenceUnit and EntityCombine are project-local helper classes;
# their import paths are not shown in this example.


class NLP:
    """Natural language processing: word segmentation, POS tagging, named
    entity recognition and dependency parsing.
    Attributes:
        default_user_dict_dir: str, directory of user-defined dictionaries
    """
    RESOURCE_DIR = os.path.abspath(
        os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "resource"))

    def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
        self.default_user_dict_dir = user_dict_dir
        # Load the LTP model
        self.ltp = LTP(model_type)
        # Add user dictionaries (a legal-document dictionary and the Tsinghua
        # legal lexicon); loading them into memory this way is faster
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # Skip directories
            if os.path.isdir(file_path):
                continue
            self.ltp.init_dict(file_path)

        # # POS tagging model
        # self.postagger = Postagger()
        # postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # # Named entity recognition model
        # self.recognizer = NamedEntityRecognizer()
        # ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # # Dependency parsing model
        # self.parser = Parser()
        # parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

    def segment(self, sentence, entity_postag=dict()):
        """Segments a sentence with LTP.
        Args:
            sentence: string, the sentence to segment
            entity_postag: dict, entity POS dictionary; defaults to empty and is
                produced when analysing the structured text of each case
        Returns:
            lemmas: list, segmentation result
        """
        # Add the entity dictionary
        if entity_postag:
            for entity in entity_postag:
                self.ltp.add_words([entity])
        segment, hidden = self.ltp.seg([sentence])
        return segment[0], hidden

    def postag(self, segment, hidden):
        """POS-tags the segmented words.
        Args:
            segment: list, segmentation result
        Returns:
            words: WordUnit list, containing segmentation and POS tagging results
        """
        words = []  # word units produced for this sentence
        # POS tagging
        postags = self.ltp.pos(hidden)
        for i in range(len(segment)):
            # WordUnit holds the token and its POS tag; ids start at 1
            word = WordUnit(i + 1, segment[i], postags[0][i])
            words.append(word)
        return words

    def get_postag(self, word):
        """Gets the POS tag of a single word.
        Args:
            word: str, the word
        Returns:
            post_tag: str, the POS tag of the word
        """
        _, hidden = self.ltp.seg([word], is_preseged=True)
        post_tag = self.ltp.pos(hidden)
        return post_tag[0]

    def netag(self, words, hidden):
        """Named entity recognition: merges NE results into the segmented and
        POS-tagged words.
        Args:
            words: WordUnit list, containing segmentation and POS tagging results
        Returns:
            words_netag: WordUnit list, containing segmentation, POS tagging and
                named entity recognition results
        """
        lemmas = []  # segmentation results
        postags = []  # POS tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # Named entity recognition
        netags = self.ltp.ner(hidden, as_entities=False)
        words_netag = EntityCombine.combine(words, netags[0])
        return words_netag

    def parse_seged(self, words):
        """Dependency-parses a list of pre-segmented WordUnit objects."""
        lemmas = []  # segmentation results
        postags = []  # POS tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # Dependency parsing
        _, hidden = self.ltp.seg([lemmas], is_preseged=True)
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def parse(self, words, hidden):
        """Dependency-parses the results of segmentation, POS tagging and
        (optional) named entity recognition.
        Args:
            words: WordUnit list, containing segmentation, POS tagging and named
                entity recognition results
        Returns:
            *: SentenceUnit, the sentence unit
        """
        lemmas = []  # segmentation results
        postags = []  # POS tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # Dependency parsing
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def close(self):
        """Closes and releases NLP resources."""
        pass
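
# A minimal sketch of chaining the NLP methods above on one sentence (assumes
# the resource directory exists and the project-local WordUnit/SentenceUnit
# helpers are importable):
nlp = NLP(model_type='base')
lemmas, hidden = nlp.segment("他叫汤姆去拿外衣。")
words = nlp.postag(lemmas, hidden)
words_netag = nlp.netag(words, hidden)
sentence_unit = nlp.parse(words_netag, hidden)
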
Example #4
# text='我现在在天津,我想知道这里的大学都有什么学校.'

import numpy as np

from ltp import LTP

# Load a user dictionary.
ltp = LTP()
# user_dict.txt is the dictionary file; max_window is the maximum forward
# matching window used during segmentation

# Note: max_window must be at least as large as the longest entry in the dictionary.
ltp.init_dict(path="user_dict.txt", max_window=6)
# Custom words can also be added directly in code

# Compute the maximum window automatically from the word list.
words = ["肖申克的救赎", "长江大桥", "负重前行"]
max_window = max([len(i) for i in words])
ltp.add_words(words=words, max_window=max_window)
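
# A quick, hypothetical check that a dictionary word survives segmentation as a
# single token (seg() returns a (segments, hidden) pair in LTP 4.x):
seg, hidden = ltp.seg(["我去长江大桥玩。"])
print(seg[0])  # "长江大桥" is expected to appear as one token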


def searchKG(kglist, text):  # distances are computed with BERT embeddings
    # vec2() and cosine_distance() are assumed to be provided elsewhere
    # (e.g. a BERT sentence encoder plus a cosine-distance helper).
    tmp3 = []
    for i in kglist:
        t = (cosine_distance(vec2(i), vec2(text)))
        tmp3.append(t)
    tmp3 = np.array(tmp3)
    print('All distances:', tmp3)
    # The closest KG triple found by the query
    dix = np.argmin(tmp3)
    print('Closest triple:', kglist[dix], 'with distance:', tmp3[dix])

    return kglist[dix]
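
# The helpers below are NOT part of the original snippet; they sketch one
# hypothetical way to provide vec2() and cosine_distance() for searchKG,
# using sentence-transformers for the BERT embeddings and numpy for the math.
from sentence_transformers import SentenceTransformer

_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


def vec2(text):
    # Encode text into a dense sentence embedding (numpy array).
    return _encoder.encode(text)


def cosine_distance(a, b):
    # 1 - cosine similarity, so smaller values mean more similar texts.
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))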