Code example #1
File: frontend.py  Project: ztzdxqj/ltp
def get_entities_with_list(labels_, itos):
    res = []
    for labels in labels_:
        labels = [itos[label] for label in labels]
        labels = get_entities(labels)
        res.append(labels)
    return res
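A minimal usage sketch (not part of the project; the tag set and label ids are illustrative), assuming get_entities follows the seqeval-style BIO convention and returns (type, start, end) tuples:

itos = ['O', 'B-Nh', 'I-Nh', 'B-Ns', 'I-Ns']   # hypothetical index -> tag mapping
batch_label_ids = [[1, 2, 0, 3, 4]]            # one sentence of predicted label ids
print(get_entities_with_list(batch_label_ids, itos))
# expected: [[('Nh', 0, 1), ('Ns', 3, 4)]]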
Code example #2
    def ner(self, hidden: dict):
        # named entity recognition
        word_length = torch.as_tensor(hidden['word_length'],
                                      device=self.device)
        ner_output = self.model.ner_decoder(hidden['word_input'], word_length)
        ner_output = torch.argmax(ner_output, dim=-1).cpu().numpy()
        ner_output = convert_idx_to_name(ner_output, hidden['word_length'],
                                         self.ner_vocab)
        return [get_entities(ner) for ner in ner_output]
Code example #3
File: frontend.py  Project: RobotLiu2015/ltp
    def ner(self, hidden: dict):
        """
        命名实体识别
        Args:
            hidden: 分词时所得到的中间表示

        Returns:
            pos: 命名实体识别结果
        """
        ner_output = self.model.ner_classifier.forward(
            hidden['word_input'], word_attention_mask=hidden['word_cls_mask'][:, 1:]
        ).logits
        ner_output = torch.argmax(ner_output, dim=-1).cpu().numpy()
        ner_output = convert_idx_to_name(ner_output, hidden['word_length'], self.ner_vocab)
        return [get_entities(ner) for ner in ner_output]
Code example #4
File: frontend.py  Project: ztzdxqj/ltp
    def ner(self, hidden: dict, as_entities=True):
        """
        命名实体识别
        Args:
            hidden: 分词时所得到的中间表示
            as_entities: 是否以 Entity(Type, Start, End) 的形式返回

        Returns:
            pos: 命名实体识别结果
        """
        if len(self.ner_vocab) == 0:
            return []
        ner_output = self.model.ner_classifier.forward(
            hidden['word_input'],
            word_attention_mask=hidden['word_cls_mask'][:, 1:])
        ner_output = ner_output.decoded or torch.argmax(ner_output.logits,
                                                        dim=-1).cpu().numpy()
        ner_output = convert_idx_to_name(ner_output, hidden['word_length'],
                                         self.ner_vocab)
        return [get_entities(ner)
                for ner in ner_output] if as_entities else ner_output
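A hedged end-to-end sketch of how seg() and ner() fit together, assuming an LTP-like frontend object named ltp that exposes the methods shown above (the object name and the example sentence are illustrative):

sentences = ["他叫汤姆去拿外衣。"]
words, hidden = ltp.seg(sentences)      # word segmentation plus shared hidden states
entities = ltp.ner(hidden)              # one [(type, start, end), ...] list per sentence
for sent_words, sent_entities in zip(words, entities):
    for tag, start, end in sent_entities:
        print(tag, ''.join(sent_words[start:end + 1]))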
Code example #5
File: frontend.py  Project: ztzdxqj/ltp
    def seg(self,
            inputs: Union[List[str], List[List[str]]],
            truncation: bool = True,
            is_preseged=False):
        """
        分词

        Args:
            inputs: 句子列表
            truncation: 是否对过长的句子进行截断,如果为 False 可能会抛出异常
            is_preseged:  是否已经进行过分词

        Returns:
            words: 分词后的序列
            hidden: 用于其他任务的中间表示
        """

        # newer transformers renamed is_pretokenized to is_split_into_words (from 3.1)
        if (transformers_version.major, transformers_version.minor) >= (3, 1):
            kwargs = {'is_split_into_words': is_preseged}
        else:
            kwargs = {'is_pretokenized': is_preseged}

        tokenized = self.tokenizer.batch_encode_plus(
            inputs,
            padding=True,
            truncation=truncation,
            return_tensors=self.tensor,
            max_length=self.max_length,
            **kwargs)
        cls, hidden, seg, lengths = self._seg(tokenized,
                                              is_preseged=is_preseged)

        batch_prefix = [[
            word_idx != encoding.words[idx - 1]
            for idx, word_idx in enumerate(encoding.words)
            if word_idx is not None
        ] for encoding in tokenized.encodings]

        # merge segments with maximum forward matching
        if self.trie.is_init and not is_preseged:
            matches = self.seg_with_dict(inputs, tokenized, batch_prefix)
            for sent_match, sent_seg in zip(matches, seg):
                for start, end in sent_match:
                    sent_seg[start] = self.seg_vocab_dict[WORD_START]
                    sent_seg[start + 1:end] = self.seg_vocab_dict[WORD_MIDDLE]
                    if end < len(sent_seg):
                        sent_seg[end] = self.seg_vocab_dict[WORD_START]

        if is_preseged:
            sentences = inputs
            word_length = [len(sentence) for sentence in sentences]

            word_idx = []
            for encodings in tokenized.encodings:
                sentence_word_idx = []
                for idx, (start, end) in enumerate(encodings.offsets[1:]):
                    if start == 0 and end != 0:
                        sentence_word_idx.append(idx)
                word_idx.append(
                    torch.as_tensor(sentence_word_idx, device=self.device))
        else:
            segment_output = convert_idx_to_name(seg, lengths, self.seg_vocab)
            sentences = []
            word_idx = []
            word_length = []

            for source_text, length, encoding, seg_tag, preffix in \
                    zip(inputs, lengths, tokenized.encodings, segment_output, batch_prefix):
                offsets = encoding.offsets[1:length + 1]
                text = []
                last_offset = None
                for start, end in offsets:
                    text.append('' if last_offset == (
                        start, end) else source_text[start:end])
                    last_offset = (start, end)

                for idx in range(1, length):
                    current_beg = offsets[idx][0]
                    forward_end = offsets[idx - 1][-1]
                    if forward_end < current_beg:
                        text[idx] = source_text[
                            forward_end:current_beg] + text[idx]
                    if not preffix[idx]:
                        seg_tag[idx] = WORD_MIDDLE

                entities = get_entities(seg_tag)
                word_length.append(len(entities))
                sentences.append([
                    ''.join(text[entity[1]:entity[2] + 1]).strip()
                    for entity in entities
                ])
                word_idx.append(
                    torch.as_tensor([entity[1] for entity in entities],
                                    device=self.device))

        word_idx = torch.nn.utils.rnn.pad_sequence(word_idx, batch_first=True)
        # expand over the hidden dimension
        word_idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])

        # take the hidden vector of each word's first char
        word_input = torch.gather(hidden, dim=1, index=word_idx)

        if len(self.dep_vocab) + len(self.sdp_vocab) > 0:
            word_cls_input = torch.cat([cls, word_input], dim=1)
            word_cls_mask = length_to_mask(
                torch.as_tensor(word_length, device=self.device) + 1)
            word_cls_mask[:, 0] = False
        else:
            word_cls_input, word_cls_mask = None, None

        return sentences, {
            'word_cls': cls,
            'word_input': word_input,
            'word_length': word_length,
            'word_cls_input': word_cls_input,
            'word_cls_mask': word_cls_mask
        }
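A standalone sketch (illustrative shapes and values, not project code) of the pad_sequence/gather step used above to pick, for every word, the hidden vector of its first character:

import torch

batch, seq_len, dim = 2, 6, 4
hidden = torch.randn(batch, seq_len, dim)               # char-level encoder output
word_idx = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor([0, 2, 3]), torch.tensor([0, 1])],    # first-char index of each word
    batch_first=True)                                   # -> (batch, max_words)
index = word_idx.unsqueeze(-1).expand(-1, -1, dim)      # broadcast over the hidden dim
word_input = torch.gather(hidden, dim=1, index=index)   # (batch, max_words, dim)
assert torch.equal(word_input[0, 1], hidden[0, 2])      # word 1 of sentence 0 starts at char 2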
Code example #6
File: ltp.py  Project: wgmzone/ltp
    def seg(self, inputs: Union[List[str], List[List[str]]], truncation: bool = True, is_preseged=False):
        """
        分词

        Args:
            inputs: 句子列表
            truncation: 是否对过长的句子进行截断,如果为 False 可能会抛出异常
            is_preseged:  是否已经进行过分词

        Returns:
            words: 分词后的序列
            hidden: 用于其他任务的中间表示
        """
        tokenized = self.tokenizer.batch_encode_plus(
            inputs, padding=True, truncation=truncation,
            return_tensors=self.tensor, max_length=self.max_length,
            is_pretokenized=is_preseged
        )
        cls, hidden, seg, lengths = self._seg(tokenized, is_preseged=is_preseged)

        # merge segments with maximum forward matching
        if self.trie.is_init and not is_preseged:
            matches = self.seg_with_dict(inputs, tokenized)
            for sent_match, sent_seg in zip(matches, seg):
                for start, end in sent_match:
                    sent_seg[start] = 0
                    sent_seg[start + 1:end] = 1
                    if end < len(sent_seg):
                        sent_seg[end] = 0

        if is_preseged:
            sentences = inputs
            word_length = [len(sentence) for sentence in sentences]

            word_idx = []
            for encodings in tokenized.encodings:
                sentence_word_idx = []
                for idx, (start, end) in enumerate(encodings.offsets[1:]):
                    if start == 0 and end != 0:
                        sentence_word_idx.append(idx)
                word_idx.append(torch.as_tensor(sentence_word_idx, device=self.device))
        else:
            segment_output = convert_idx_to_name(seg, lengths, self.seg_vocab)
            sentences = []
            word_idx = []
            word_length = []

            for source_text, length, encoding, seg_tag in zip(inputs, lengths, tokenized.encodings, segment_output):
                words = encoding.words[1:length + 1]
                offsets = encoding.offsets[1:length + 1]
                text = [source_text[start:end] for start, end in offsets]

                for idx in range(1, length):
                    current_beg = offsets[idx][0]
                    forward_end = offsets[idx - 1][-1]
                    if forward_end < current_beg:
                        text[idx] = source_text[forward_end:current_beg] + text[idx]
                    if words[idx - 1] == words[idx]:
                        seg_tag[idx] = WORD_MIDDLE

                entities = get_entities(seg_tag)
                word_length.append(len(entities))
                sentences.append([''.join(text[entity[1]:entity[2] + 1]).strip() for entity in entities])
                word_idx.append(torch.as_tensor([entity[1] for entity in entities], device=self.device))

        word_idx = torch.nn.utils.rnn.pad_sequence(word_idx, batch_first=True)
        word_idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])  # expand over the hidden dimension

        word_input = torch.gather(hidden, dim=1, index=word_idx)  # hidden vector of each word's first char

        word_cls_input = torch.cat([cls, word_input], dim=1)
        word_cls_mask = length_to_mask(torch.as_tensor(word_length, device=self.device) + 1)
        word_cls_mask[:, 0] = False  # ignore the first token of each sentence
        return sentences, {
            'word_cls': cls, 'word_input': word_input, 'word_length': word_length,
            'word_cls_input': word_cls_input, 'word_cls_mask': word_cls_mask
        }
Code example #7
    def seg(self, inputs: List[str]):
        tokenizerd = self.tokenizer.batch_encode_plus(
            inputs, return_tensors=self.tensor, padding=True)
        cls, hidden, seg, length = self._seg(tokenizerd)

        # merge segments with maximum forward matching
        if self.trie.is_init:
            matches = self.seg_with_dict(inputs, tokenizerd)
            for sent_match, sent_seg in zip(matches, seg):
                for start, end in sent_match:
                    sent_seg[start] = 0
                    sent_seg[start + 1:end] = 1
                    if end < len(sent_seg):
                        sent_seg[end] = 0

        segment_output = convert_idx_to_name(seg, length, self.seg_vocab)
        if USE_PLUGIN:
            offsets = [
                list(filter(lambda x: x != (0, 0), encodings.offsets))
                for encodings in tokenizerd.encodings
            ]
            words = [
                list(filter(lambda x: x is not None, encodings.words))
                for encodings in tokenizerd.encodings
            ]
            sentences, word_idx, word_length = segment_decode(
                inputs, segment_output, offsets, words)
            word_idx = [
                torch.as_tensor(idx, device=self.device) for idx in word_idx
            ]
        else:
            sentences = []
            word_idx = []
            word_length = []

            for source_text, encoding, sentence_seg_tag in zip(
                    inputs, tokenizerd.encodings, segment_output):
                text = [
                    source_text[start:end]
                    for start, end in encoding.offsets[1:-1] if end != 0
                ]

                last_word = 0
                for idx, word in enumerate(encoding.words[1:-1]):
                    if word is None or is_chinese_char(text[idx][-1]):
                        continue
                    if word != last_word:
                        text[idx] = ' ' + text[idx]
                        last_word = word
                    else:
                        sentence_seg_tag[idx] = WORD_MIDDLE

                entities = get_entities(sentence_seg_tag)
                word_length.append(len(entities))
                sentences.append([
                    ''.join(text[entity[1]:entity[2] + 1]).strip()
                    for entity in entities
                ])
                word_idx.append(
                    torch.as_tensor([entity[1] for entity in entities],
                                    device=self.device))

        word_idx = torch.nn.utils.rnn.pad_sequence(word_idx, batch_first=True)
        # expand over the hidden dimension
        word_idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])

        # take the hidden vector of each word's first char
        word_input = torch.gather(hidden, dim=1, index=word_idx)

        word_cls_input = torch.cat([cls, word_input], dim=1)
        word_cls_mask = length_to_mask(
            torch.as_tensor(word_length, device=self.device) + 1)
        word_cls_mask[:, 0] = False  # ignore the first token of each sentence
        return sentences, {
            'word_cls': cls,
            'word_input': word_input,
            'word_length': word_length,
            'word_cls_input': word_cls_input,
            'word_cls_mask': word_cls_mask
        }
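A small sketch of the dictionary-merge step that appears in the seg() variants above: a user-dictionary match spanning characters [start, end) is forced into a single word by overwriting the predicted segmentation tags (0 marks a word start and 1 a word middle in this illustration; the values are hypothetical):

import numpy as np

sent_seg = np.array([0, 0, 0, 1, 0])    # predicted tags for a 5-char sentence
start, end = 1, 4                       # hypothetical trie match covering chars 1..3
sent_seg[start] = 0                     # the match begins a word
sent_seg[start + 1:end] = 1             # its interior chars continue that word
if end < len(sent_seg):
    sent_seg[end] = 0                   # the char after the match starts a new word
print(sent_seg)                         # [0 0 1 1 0]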