Example #1
class Ner:
    def __init__(self):
        self.ltp = LTP()
    
    def preprocess(self, sent):
        return re.sub(r'\s+', '', sent)

    def ner(self, sents):
        assert not any(re.search(r'\s', x) for x in sents), "no space is allowed"
        psents = [x for x in sents if x != '']
        if len(psents) == 0:
            return [[] for x in sents]
        segment, hidden = self.ltp.seg(psents)
        ne = self.ltp.ner(hidden)
        anes = []
        for sseg, sne in zip(segment, ne):
            nes = []
            slens = [0] + [len(x) for x in sseg]
            for i in range(1, len(slens)):
                slens[i] += slens[i - 1]
            for t, x, y in sne:
                if t == 'Ns':
                    nes.append([slens[x], slens[y + 1]])
            anes.append(nes)
        fnes = []
        cur = 0
        for s in sents:
            if s == '':
                fnes.append([])
            else:
                fnes.append(anes[cur])
                cur += 1
        return fnes
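
A minimal usage sketch for the class above, assuming `import re` and `from ltp import LTP` at module level and an LTP 4.x model available locally; the sample sentences are illustrative:

tagger = Ner()
sents = [tagger.preprocess(s) for s in ["我 去 北京 玩。", ""]]
spans = tagger.ner(sents)
# one list per input sentence; each entry is a [start, end) character span of a place name ('Ns')
print(spans)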
Example #2
class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # the Small model is loaded by default
        # user_dict.txt is the user dictionary file; max_window is the maximum forward segmentation window
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        命名实体识别
        :param text: 原始文本
        :return: 从原始文本中抽取的命名实体
        """
        seg, hidden = self.ltp.seg(text)   # 分词
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            entity.append("".join(seg[0][start:end + 1]))  # join all tokens of the entity span
        return entity
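
A minimal usage sketch for the class above; the dictionary path and the sentence are illustrative, and `from ltp import LTP` is assumed at module level:

ne = NamedEntity('user_dict.txt')   # hypothetical dictionary file
print(ne.entity_recognition(["汤姆在同济大学发表演讲。"]))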
Example #3
def new_generate_ltp_results():
    # load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)

    # read the original sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # word segmentation
        segmented0, hidden = ltp.seg([sent])
        # POS tagging
        cur_pos = ltp.pos(hidden)
        # named entity recognition
        cur_ner = ltp.ner(hidden)
        # semantic role labeling
        cur_srl = ltp.srl(hidden)
        # dependency parsing
        cur_dep = ltp.dep(hidden)
        # semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')

        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # build the mapping between sentences and their segmented tokens
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump([segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches], open('new_ltp_results.pk', 'wb'))

    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
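
A small sketch of how the pickled results above could be reloaded later; the file name matches the dump call, everything else is illustrative:

import pickle

with open('new_ltp_results.pk', 'rb') as f:
    (segmented, pos, ner, srl, dep,
     sdp_tree, sdp_graph, sent_seg_matches) = pickle.load(f)
print(len(segmented), "sentences processed")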
Example #4
class Server(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden, fast=False)
            batch_sdp = self.ltp.sdp(hidden, mode='mix')

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def serve(self, port: int = 5000, n_process: int = None):
        if n_process is None:
            n_process = 1 if sys.platform == 'win32' else 8

        fmt = LogFormatter(fmt='%(asctime)s - %(levelname)s - %(message)s',
                           datefmt='%Y-%m-%d %H:%M:%S',
                           color=True)
        root_logger = logging.getLogger()

        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler('server.log')

        console_handler.setFormatter(fmt)
        file_handler.setFormatter(fmt)

        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)

        app_log.setLevel(logging.INFO)
        gen_log.setLevel(logging.INFO)
        access_log.setLevel(logging.INFO)

        app_log.info("Model is loading...")
        app_log.info("Model Has Been Loaded!")

        app = Application([(r"/.*", LTPHandler, dict(ltp=self))])

        server = HTTPServer(app)
        server.bind(port)
        server.start(n_process)
        ioloop.IOLoop.instance().start()
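
A minimal sketch of starting the server defined above; the port and model size are illustrative, and LTPHandler, tornado and the logging objects are assumed to be imported in the original module:

if __name__ == '__main__':
    Server(path='small', batch_size=50).serve(port=5000)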
Example #5
##
# -------- Testing shows that NER itself interferes with the question, so these entries have to be removed from kglist.

# tiaozhuan=searchKG(kglist=['地点','地址','大小','颜色','老婆','丈夫'],text='我家住在和平区哪个地方')

# print(tiaozhuan,"jieguo shi !!!!!!!!!!!!!!!!")
##

# Add jumps based on sentence constituents.
seg, hidden = ltp.seg([text])
# sdp = ltp.sdp(hidden, graph=False)

print(seg, "seg")
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
print("ner", ner)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)

print(ner, "ner结果")
seg = seg[0]
dep = dep[0]
sdp = sdp[0]
print(sdp, "语义分析!!!!!!!!!!!!!!!!!!!")  # semantic dependency output; awkward to work with.
print(dep)
for i in dep:  # the dep algorithm currently cannot recognize the jump for "老婆" (wife).

    print(i, seg[i[0] - 1], seg[i[1] - 1])  # indices are 1-based; subtract 1 for the real index.
'''
def save_as_txt(data):
    from ltp import LTP
    import random
    ltp = LTP()
    for row in data:
        id = row[0]
        school_id = ("000" + str(row[1]))[-4:]
        texts = row[2]
        textlines = texts.split('\n')
        shortened_textlines = []
        for line in textlines:
            line_len = len(line)
            if line_len > 100:
                for i in range(line_len // 100):
                    shortened_textlines.append(line[i * 100:(i + 1) * 100])
            else:
                shortened_textlines.append(line)
        text = ' '.join(shortened_textlines)
        path = './data/' + str(school_id)
        if os.path.exists(path): pass
        else: os.makedirs(path)
        with open((path + '/' + str(school_id) + "-" + str(id) + ".txt"),
                  'w',
                  encoding='UTF-8') as file:
            file.write(text)
            file.close()
            print("\r已保存 " + str(school_id) + "-" + str(id) + ".txt", end="")
            # T2	报告人 68 71	曹进德
            # R2 报告人_单位 Arg1: T2 Arg2: T1
        seg, hidden = ltp.seg([text])
        ner = ltp.ner(hidden)
        ner_info = []
        entities_nh = []
        entities_ni = []
        print(type(text))
        print()
        for i in ner[0]:
            if (i[0] == 'Nh'):
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if (len(entity) > 1):
                    entities_nh.append(entity)

            elif (i[0] == 'Ni'):
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if entity in schoolnames:
                    entities_ni.append(entity)

        for entity in set(entities_nh):
            pattern = re.compile(entity)
            iter = pattern.finditer(text)
            count = 0
            for record in iter:
                ner_info.append("T" + str(300 + count) + "\t姓名 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        for entity in set(entities_ni):
            pattern = re.compile(entity)
            iter = pattern.finditer(text)
            count = 0
            for record in iter:
                ner_info.append("T" + str(400 + count) + "\t单位 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        pattern = re.compile('教授|副教授|讲师|研究员|副研究员|助理教授|助理研究员')
        iter = pattern.finditer(text)
        count = 0
        for record in iter:
            ner_info.append("T" + str(500 + count) + "\t职称 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1

        date_1 = r"([0-9]+年[0-9]+月[0-9]+日)"  # |([0-9]+月[0-9]+日)
        date_2 = r"([零〇一二三四五六七八九]年[十]?[一二三四五六七八九]月[一二三]?[十]?[一二三四五六七八九十]日)"
        date_3 = r"([0-9]+月[0-9]+日)"
        flag = False
        count = 0
        ## Method 1
        pattern = re.compile(date_1)
        iter = pattern.finditer(text)
        for record in iter:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1
            flag = True

        if (flag is False):
            pattern = re.compile(date_3)
            iter = pattern.finditer(text)
            for record in iter:
                ner_info.append("T" + str(600 + count) + "\t日期 " +
                                str(record.span()[0]) + " " +
                                str(record.span()[1]) + "\t" +
                                str(record.group()) + "\n")
                count += 1

        ## Method 2
        pattern = re.compile(date_2)
        iter = pattern.finditer(text)
        for record in iter:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " +
                            str(record.span()[1]) + "\t" +
                            str(record.group()) + "\n")
            count += 1

        with open((path + '/' + str(school_id) + "-" + str(id) + ".ann"),
                  'w',
                  encoding='UTF-8') as file:
            print([text])
            print(ner_info)
            file.writelines(ner_info)
            file.close()
            print("\r已保存 " + str(school_id) + "-" + str(id) + ".ann", end="")
Example #7
class Run(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result

    def test(self, sentences: List[str] = None):
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        import numpy as np
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())

        np.savez(out, **nps)

        config = self.ltp.config
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

    def test_seged(self):
        import torch
        sentences = [
            'My name is tom.', 'He called Tom to get coats.', '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。', "我去长江大桥玩。"
        ]
        seg, hidden = self.ltp.seg(sentences)
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                test = torch.sum(value.float() -
                                 hidden_seged[key].float()).numpy()
                print(key, test)

        print(seg == seged)
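
A minimal sketch of exercising the Run class above; the model size, flags and file names are illustrative:

if __name__ == '__main__':
    run = Run(path='small', onnx=False)
    run.test(["他叫汤姆去拿外衣。"])   # prints the analysis as JSON
    run.save('ltp.npz')               # exports the weights plus config.json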
class Server(object):
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device)
        else:
            self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })

        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)

            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):

                words = self._build_words(seg, pos, dep)

                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })

                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })

                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })

                result.append({'text': sent, 'nes': nes, 'words': words})

        return result
Example #9
class CnProcessor:
    r"""
        Text Processor class implement NER.
    """
    _instance_lock = threading.Lock()

    def __init__(self):
        self.__ner = None
        self.__pos = None

    # Single instance mode
    def __new__(cls, *args, **kwargs):
        if not hasattr(CnProcessor, "_instance"):
            with CnProcessor._instance_lock:
                if not hasattr(CnProcessor, "_instance"):
                    CnProcessor._instance = object.__new__(cls)
        return CnProcessor._instance

    @staticmethod
    def word_tokenize(sent):
        r"""
        tokenize fiction

        :param str sent: the sentence need to be tokenized
        :return: list.the tokens in it
        """
        assert isinstance(sent, str)

        return [word for word in sent]

    def get_ner(self, sentence):
        r"""
        NER function.

        :param str sent: the sentence need to be ner
        :return two forms of tags
            The first is the triple form (tags,start,end)
            The second is the list form, which marks the ner label of each word
            such as 周小明去玩
            ['Nh', 'Nh', 'Nh', 'O', 'O']
        """
        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Turn the list into a sentence
            tmp = ''
            for word in sentence:
                tmp += word
            sentence = tmp

        if not sentence:
            return [], []

        if self.__ner is None:
            self.__ner = LTP()
        seg, hidden = self.__ner.seg([sentence])
        seg = seg[0]
        ner = self.__ner.ner(hidden)
        ner = ner[0]

        ner_label = len(sentence) * ['O']
        for i in range(len(ner)):
            tag, start, end = ner[i]
            tmp = 0
            for j in range(start):
                tmp += len(seg[j])
            start = tmp
            tmp = 0
            for j in range(end + 1):
                tmp += len(seg[j])
            end = tmp
            ner[i] = (tag, start, end - 1)
            for j in range(start, end):
                ner_label[j] = tag

        return ner, ner_label

    def get_pos_tag(self, sentence):
        r"""
        pos tag function.

        :param str sentence: the sentence need to be ner
        :return: the triple form (tags,start,end)
        """

        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Turn the list into a sentence
            tmp = ''
            for word in sentence:
                tmp += word
            sentence = tmp

        if not sentence:
            return []

        if self.__pos is None:
            # get pos tag
            self.__pos = LTP()
        seg, hidden = self.__pos.seg([sentence])
        pos = self.__pos.pos(hidden)
        seg = seg[0]
        pos = pos[0]
        pos_tag = []
        cnt = 0
        for tag in range(len(pos)):
            pos_tag.append([pos[tag], cnt, cnt + len(seg[tag]) - 1])
            cnt += len(seg[tag])

        return pos_tag
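
A minimal usage sketch for CnProcessor, using the sentence from the get_ner docstring; the printed values are only indicative:

processor = CnProcessor()
triples, labels = processor.get_ner("周小明去玩")
print(triples)   # e.g. [('Nh', 0, 2)]
print(labels)    # e.g. ['Nh', 'Nh', 'Nh', 'O', 'O']
print(processor.get_pos_tag("周小明去玩"))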
class NLP:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir: str,用户自定义词典目录
    """
    RESOURCE_DIR = os.path.abspath(
        os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "resource"))

    def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
        self.default_user_dict_dir = user_dict_dir
        # load the LTP model
        self.ltp = LTP(model_type)
        # Add user dictionaries (the legal-document dictionary and the Tsinghua legal
        # dictionary); loading them into memory this way is faster.
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # skip directories
            if os.path.isdir(file_path):
                continue
            self.ltp.init_dict(file_path)

        # # POS tagging model
        # self.postagger = Postagger()
        # postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # # named entity recognition model
        # self.recognizer = NamedEntityRecognizer()
        # ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # # dependency parsing model
        # self.parser = Parser()
        # parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

    def segment(self, sentence, entity_postag=dict()):
        """采用NLPIR进行分词处理
        Args:
            sentence: string,句子
            entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemmas: list,分词结果
        """
        # 添加实体词典
        if entity_postag:
            for entity in entity_postag:
                self.ltp.add_words([entity])
        segment, hidden = self.ltp.seg([sentence])
        return segment[0], hidden

    def postag(self, segment, hidden):
        """对分词后的结果进行词性标注
        Args:
            segment: list,分词后的结果
        Returns:
            words: WordUnit list,包含分词与词性标注结果
        """
        words = []  # 存储句子处理后的词单元
        # 词性标注
        postags = self.ltp.pos(hidden)
        for i in range(len(segment)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, segment[i], postags[0][i])
            words.append(word)
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word: str,单词
        Returns:
            post_tag: str,该单词的词性标注
        """
        _, hidden = self.ltp.seg([word], is_preseged=True)
        post_tag = self.ltp.pos(hidden)
        return post_tag[0]

    def netag(self, words, hidden):
        """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Args:
            words: WordUnit list,包含分词与词性标注结果
        Returns:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标书结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.ltp.ner(hidden, as_entities=False)
        words_netag = EntityCombine.combine(words, netags[0])
        return words_netag

    def parse_seged(self, words):
        lemmas = []  # segmentation results
        postags = []  # POS-tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing
        _, hidden = self.ltp.seg([lemmas], is_preseged=True)
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def parse(self, words, hidden):
        """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns:
            *: SentenceUnit,该句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def close(self):
        """关闭与释放nlp"""
        pass
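
A minimal end-to-end sketch for the NLP class above; it assumes the resource directory with user dictionaries exists and that WordUnit, SentenceUnit and EntityCombine come from the original project, and the sentence is illustrative:

nlp = NLP()
segment, hidden = nlp.segment("高克访问中国,并在同济大学发表演讲。")
words = nlp.postag(segment, hidden)
words_netag = nlp.netag(words, hidden)
sentence_unit = nlp.parse(words_netag, hidden)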
Example #11
class NLP:
    """
    A wrapper over the results of LTP analysis.
    """
    def __init__(self, default_model_dir=LTP4_MODEL_DIR):
        self.ltp = LTP(path=default_model_dir)

    def segment(self, sentences):
        lemmas, hidden = self.ltp.seg(sentences)
        return lemmas, hidden

    def postag(self, lemmas, hidden):
        """
        Build words from the POS-tagging result
        :param lemmas:
        :param hidden:
        :return:
        """
        words = []
        postags = self.ltp.pos(hidden)
        for idx_sent, postags_sent in enumerate(postags):
            words_sent = []
            for i in range(len(postags_sent)):
                # word IDs start from 1
                word = WordUnit(i + 1, lemmas[idx_sent][i], postags_sent[i])
                words_sent.append(word)
            words.append(words_sent)
        # for i in range(len(postags)):
        #     word = WordUnit(i+1, lemmas[i], postags[i])
        #     words.append(word)
        return words

    def nertag(self, words, hidden):
        """
        Update words with the NER result, using the NER information to correct and
        supplement the POS tags, e.g. n -> ni/ns/nl
        :param words:
        :param hidden:
        :return:
        """
        # Nh person name     Ni organization name     Ns place name
        nertags = self.ltp.ner(hidden)
        '''
        For triple extraction the NER information is used; some words produced by
        NER analysis need to be merged into new words.
        NOTE: after NER, several tokens may be merged into one word.
        For example:
            [['高克', '访问', '中国', ',', '并', '在', '同济', '大学', '发表', '演讲', '。']]
            [['nh', 'v', 'ns', 'wp', 'c', 'p', 'nz', 'n', 'v', 'v', 'wp']]
            [[('Nh', 0, 0), ('Ns', 2, 2), ('Ni', 6, 7)]]
            [[(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'VOB'), (4, 2, 'WP'), (5, 9, 'ADV'), (6, 9, 'ADV'), (7, 8, 'ATT'), (8, 6, 'POB'), (9, 2, 'COO'), (10, 9, 'VOB'), (11, 2, 'WP')]]
        '''
        ner2pos = {'Nh': 'nh', 'Ns': 'ns', 'Ni': 'ni'}
        n = 1
        #for i in range(len(words)):
        for idx_sent, nertags_sent in enumerate(nertags):
            for item in nertags_sent:
                for i in range(item[1], item[2] + 1):
                    words[idx_sent][i].nertag = item[0]
                    words[idx_sent][i].postag = ner2pos[item[0]]
        # for item in nertags:
        #     for i in range(item[1], item[2]+1):
        #         words[i].postag = ner2pos[item[0]]
        return words

    def dependency(self, words, hidden):
        """
        Update words with the dependency-parsing result, used for later triple extraction
        (mainly the dependency relations between words).
        :param hidden:
        :return:
        """
        sentences = []
        dep = self.ltp.dep(hidden)
        for idx_sent, dep_sent in enumerate(dep):
            for i in range(len(words[idx_sent])):
                # e.g. [(1, 2, 'ATT'), (2, 3, 'ATT')] may omit (3, 0, 'HED')
                if i < len(dep_sent):
                    words[idx_sent][i].head = dep_sent[i][1]
                    words[idx_sent][i].dependency = dep_sent[i][2]
            sentences.append(SentenceUnit(words[idx_sent]))
        return sentences
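
A minimal usage sketch of the pipeline above, reusing the example sentence from the nertag docstring; LTP4_MODEL_DIR, WordUnit and SentenceUnit are assumed to come from the original project:

nlp = NLP()
lemmas, hidden = nlp.segment(["高克访问中国,并在同济大学发表演讲。"])
words = nlp.postag(lemmas, hidden)
words = nlp.nertag(words, hidden)
sentences = nlp.dependency(words, hidden)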
Example #12
class Conllu(object):
    """
    :param path: model path, or one of ['base', 'small', 'tiny'] to download automatically
    :param batch_size: maximum batch size; the input is split into batches automatically
    :param device: ['cpu', 'cuda']
    :param onnx: whether to enable onnx
    """
    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):

        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self._split = lambda a: map(lambda b: a[b:b + batch_size],
                                    range(0, len(a), batch_size))

    def _predict(self,
                 sentences: List[str],
                 pos=True,
                 ner=True,
                 srl=True,
                 dep=True,
                 sdp=True):
        result = []
        for sentences_batch in self._split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)

            batch_size = len(sentences_batch)
            batch_pos = self.ltp.pos(hidden) if pos else ([[]] * batch_size)
            batch_ner = self.ltp.ner(hidden) if ner else ([None] * batch_size)
            batch_srl = self.ltp.srl(
                hidden, keep_empty=False) if srl else ([None] * batch_size)
            batch_dep = self.ltp.dep(hidden) if dep else ([None] * batch_size)
            batch_sdp = self.ltp.sdp(hidden) if sdp else ([None] * batch_size)

            result += list(
                zip(batch_seg, batch_pos, batch_ner, batch_dep, batch_sdp,
                    batch_srl))

        return result

    def predict(self,
                input: str,
                output: Optional[str] = None,
                pos: bool = True,
                ner: bool = False,
                srl: bool = False,
                dep: bool = True,
                sdp: bool = False):
        """
        Predict the text and write the result in conllu format
        :param input: the file to predict, one sentence per line
        :param output: the result file; defaults to the input file name with a .conllu suffix
        :param pos: whether to output POS tagging results ['True','False']
        :param ner: whether to output named entity recognition results ['True','False'], uses the conllu feats column
        :param srl: whether to output semantic role labeling results ['True','False'], uses the conllu misc column
        :param dep: whether to output dependency parsing results ['True','False']
        :param sdp: whether to output semantic dependency parsing results ['True','False']
        """
        if output is None:
            output = f"{input}.conllu"

        with open(output, mode='w', encoding='utf-8') as f:
            sentences = sum([sent for idx, sent in iter_lines(input)], [])
            results = self._predict(sentences, pos, ner, srl, dep, sdp)

            for text, (seg_s, pos_s, ner_s, dep_s, sdp_s,
                       srl_s) in zip(sentences, results):
                tokens = conllu.TokenList([
                    conllu.models.Token(id=idx + 1,
                                        form=token,
                                        lemma=token,
                                        upos=pos if pos else '_',
                                        xpos=pos if pos else '_',
                                        feats='O' if ner else '_',
                                        head=idx,
                                        deprel='_',
                                        deps='' if sdp else '_',
                                        misc='SpaceAfter=No')
                    for idx, (token,
                              pos) in enumerate(zip_longest(seg_s, pos_s))
                ], conllu.models.Metadata(text=text))

                if ner:
                    for tag, start, end in ner_s:
                        tokens[start]['feats'] = f'B-{tag}'
                        for i in range(start + 1, end + 1):  # end index is inclusive
                            tokens[i]['feats'] = f'I-{tag}'
                if dep:
                    for id, head, tag in dep_s:
                        tokens[id - 1]['head'] = head
                        tokens[id - 1]['deprel'] = tag
                if sdp:
                    for id, head, tag in sdp_s:
                        if tokens[id - 1]['deps']:
                            tokens[id - 1]['deps'] = tokens[
                                id - 1]['deps'] + f"|{head}:{tag}"
                        else:
                            tokens[id - 1]['deps'] = f"{head}:{tag}"

                if srl:
                    srl_predicate, srl_roles = list(zip(*srl_s))
                    srl_predicate_num = len(srl_predicate)
                    if srl_predicate_num > 0:
                        srl_misc = [[
                            f'Predicate={"Y" if i in srl_predicate else "_"}',
                            ['O'] * srl_predicate_num
                        ] for i in range(len(tokens))]
                        for idx, srl_role in enumerate(srl_roles):
                            for tag, start, end in srl_role:
                                srl_misc[start][-1][idx] = f'B-{tag}'
                                for i in range(start + 1, end + 1):  # end index is inclusive
                                    srl_misc[i][-1][idx] = f'I-{tag}'
                        srl_misc = [
                            "|".join([s[0], "Role=" + ",".join(s[-1])])
                            for s in srl_misc
                        ]

                        for token, misc in zip(tokens, srl_misc):
                            token['misc'] = f"{token['misc']}|{misc}"

                f.write(tokens.serialize())
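
A minimal sketch of driving the Conllu class above; the file name is illustrative, and iter_lines, conllu, LTP and FastLTP are assumed to be imported in the original module:

if __name__ == '__main__':
    runner = Conllu(path='small', batch_size=50)
    runner.predict('sentences.txt', ner=True, dep=True)   # writes sentences.txt.conllu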