def get_entities_with_list(labels_, itos):
    res = []
    for labels in labels_:
        labels = [itos[label] for label in labels]
        labels = get_entities(labels)
        res.append(labels)
    return res
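# Illustrative usage sketch (not part of the original source). It assumes get_entities is a
# seqeval-style span extractor that turns a BIO tag sequence into (type, start, end) tuples
# with an inclusive end index; the vocabulary below is hypothetical.
def _example_get_entities_with_list():
    itos = ['O', 'B-Ns', 'I-Ns']  # hypothetical id-to-tag vocabulary
    spans = get_entities_with_list([[1, 2, 0], [0, 1, 1]], itos)
    # expected under that convention: [[('Ns', 0, 1)], [('Ns', 1, 2)]]
    return spans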
def ner(self, hidden: dict):
    # named entity recognition
    word_length = torch.as_tensor(hidden['word_length'], device=self.device)
    ner_output = self.model.ner_decoder(hidden['word_input'], word_length)
    ner_output = torch.argmax(ner_output, dim=-1).cpu().numpy()
    ner_output = convert_idx_to_name(ner_output, hidden['word_length'], self.ner_vocab)
    return [get_entities(ner) for ner in ner_output]
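# Sketch of the behaviour convert_idx_to_name is relied on for above (an assumption about the
# helper, not the library's implementation): map each row of label ids back to tag names and
# truncate it to the unpadded length of its sentence.
def _convert_idx_to_name_sketch(ids, lengths, id2name):
    return [[id2name[i] for i in row[:length]] for row, length in zip(ids, lengths)]

# _convert_idx_to_name_sketch([[1, 2, 0, 0]], [3], ['O', 'B-Nh', 'I-Nh'])
# -> [['B-Nh', 'I-Nh', 'O']]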
def ner(self, hidden: dict): """ 命名实体识别 Args: hidden: 分词时所得到的中间表示 Returns: pos: 命名实体识别结果 """ ner_output = self.model.ner_classifier.forward( hidden['word_input'], word_attention_mask=hidden['word_cls_mask'][:, 1:] ).logits ner_output = torch.argmax(ner_output, dim=-1).cpu().numpy() ner_output = convert_idx_to_name(ner_output, hidden['word_length'], self.ner_vocab) return [get_entities(ner) for ner in ner_output]
def ner(self, hidden: dict, as_entities=True):
    """
    Named entity recognition.

    Args:
        hidden: intermediate representation produced during segmentation
        as_entities: whether to return results as Entity(Type, Start, End) tuples

    Returns:
        named entity recognition results
    """
    if len(self.ner_vocab) == 0:
        return []
    ner_output = self.model.ner_classifier.forward(
        hidden['word_input'],
        word_attention_mask=hidden['word_cls_mask'][:, 1:])
    ner_output = ner_output.decoded or torch.argmax(ner_output.logits, dim=-1).cpu().numpy()
    ner_output = convert_idx_to_name(ner_output, hidden['word_length'], self.ner_vocab)
    return [get_entities(ner) for ner in ner_output] if as_entities else ner_output
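# Sketch of how seg() and ner() fit together (illustrative only; `pipeline` below stands for a
# hypothetical instance of the class these methods belong to, and the exact tags depend on the
# loaded model).
def _example_ner_pipeline(pipeline):
    words, hidden = pipeline.seg(['他叫汤姆去拿外衣。'])
    entities = pipeline.ner(hidden)
    # entities holds, per sentence, a list of (type, start, end) word-level spans with inclusive
    # ends, e.g. something like [[('Nh', 2, 2)]]; the surface form is recovered from the words.
    return [
        [(tag, ''.join(sent_words[start:end + 1])) for tag, start, end in sent_entities]
        for sent_words, sent_entities in zip(words, entities)
    ]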
def seg(self,
        inputs: Union[List[str], List[List[str]]],
        truncation: bool = True,
        is_preseged=False):
    """
    Word segmentation.

    Args:
        inputs: list of sentences
        truncation: whether to truncate overly long sentences; if False an exception may be raised
        is_preseged: whether the inputs have already been segmented

    Returns:
        words: segmented word sequences
        hidden: intermediate representation for the other tasks
    """
    # newer transformers releases renamed is_pretokenized to is_split_into_words
    if transformers_version.major >= 3:
        kwargs = {'is_split_into_words': is_preseged}
    else:
        kwargs = {'is_pretokenized': is_preseged}
    tokenized = self.tokenizer.batch_encode_plus(
        inputs,
        padding=True,
        truncation=truncation,
        return_tensors=self.tensor,
        max_length=self.max_length,
        **kwargs)
    cls, hidden, seg, lengths = self._seg(tokenized, is_preseged=is_preseged)

    # True where a token starts a new source word
    batch_prefix = [[
        word_idx != encoding.words[idx - 1]
        for idx, word_idx in enumerate(encoding.words)
        if word_idx is not None
    ] for encoding in tokenized.encodings]

    # merge segments with maximum forward matching
    if self.trie.is_init and not is_preseged:
        matches = self.seg_with_dict(inputs, tokenized, batch_prefix)
        for sent_match, sent_seg in zip(matches, seg):
            for start, end in sent_match:
                sent_seg[start] = self.seg_vocab_dict[WORD_START]
                sent_seg[start + 1:end] = self.seg_vocab_dict[WORD_MIDDLE]
                if end < len(sent_seg):
                    sent_seg[end] = self.seg_vocab_dict[WORD_START]

    if is_preseged:
        sentences = inputs
        word_length = [len(sentence) for sentence in sentences]
        word_idx = []
        for encodings in tokenized.encodings:
            sentence_word_idx = []
            for idx, (start, end) in enumerate(encodings.offsets[1:]):
                if start == 0 and end != 0:
                    sentence_word_idx.append(idx)
            word_idx.append(
                torch.as_tensor(sentence_word_idx, device=self.device))
    else:
        segment_output = convert_idx_to_name(seg, lengths, self.seg_vocab)
        sentences = []
        word_idx = []
        word_length = []

        for source_text, length, encoding, seg_tag, prefix in \
                zip(inputs, lengths, tokenized.encodings, segment_output, batch_prefix):
            offsets = encoding.offsets[1:length + 1]
            text = []
            last_offset = None
            for start, end in offsets:
                text.append('' if last_offset == (start, end) else source_text[start:end])
                last_offset = (start, end)

            for idx in range(1, length):
                current_beg = offsets[idx][0]
                forward_end = offsets[idx - 1][-1]
                if forward_end < current_beg:
                    text[idx] = source_text[forward_end:current_beg] + text[idx]
                if not prefix[idx]:
                    seg_tag[idx] = WORD_MIDDLE

            entities = get_entities(seg_tag)
            word_length.append(len(entities))
            sentences.append([
                ''.join(text[entity[1]:entity[2] + 1]).strip()
                for entity in entities
            ])
            word_idx.append(
                torch.as_tensor([entity[1] for entity in entities],
                                device=self.device))

    word_idx = torch.nn.utils.rnn.pad_sequence(word_idx, batch_first=True)
    word_idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])  # expand to hidden size
    word_input = torch.gather(hidden, dim=1, index=word_idx)  # vector of each word's first char

    if len(self.dep_vocab) + len(self.sdp_vocab) > 0:
        word_cls_input = torch.cat([cls, word_input], dim=1)
        word_cls_mask = length_to_mask(
            torch.as_tensor(word_length, device=self.device) + 1)
        word_cls_mask[:, 0] = False
    else:
        word_cls_input, word_cls_mask = None, None

    return sentences, {
        'word_cls': cls,
        'word_input': word_input,
        'word_length': word_length,
        'word_cls_input': word_cls_input,
        'word_cls_mask': word_cls_mask
    }
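# Minimal, runnable sketch (illustrative, with made-up shapes) of the gather step used above to
# pool one vector per word: word_idx holds the index of each word's first sub-token, and
# expanding it to the hidden size before torch.gather selects exactly those rows from `hidden`.
def _example_first_token_pooling():
    import torch

    hidden = torch.arange(2 * 5 * 4, dtype=torch.float).view(2, 5, 4)  # (batch, seq_len, hidden)
    word_idx = torch.tensor([[0, 2, 3], [0, 1, 0]])  # first-token index per word, padded with 0
    idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])  # (batch, n_words, hidden)
    word_input = torch.gather(hidden, dim=1, index=idx)  # word_input[b, w] == hidden[b, word_idx[b, w]]
    return word_input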
def seg(self, inputs: Union[List[str], List[List[str]]], truncation: bool = True, is_preseged=False):
    """
    Word segmentation.

    Args:
        inputs: list of sentences
        truncation: whether to truncate overly long sentences; if False an exception may be raised
        is_preseged: whether the inputs have already been segmented

    Returns:
        words: segmented word sequences
        hidden: intermediate representation for the other tasks
    """
    tokenized = self.tokenizer.batch_encode_plus(
        inputs,
        padding=True,
        truncation=truncation,
        return_tensors=self.tensor,
        max_length=self.max_length,
        is_pretokenized=is_preseged
    )
    cls, hidden, seg, lengths = self._seg(tokenized, is_preseged=is_preseged)

    # merge segments with maximum forward matching
    if self.trie.is_init and not is_preseged:
        matches = self.seg_with_dict(inputs, tokenized)
        for sent_match, sent_seg in zip(matches, seg):
            for start, end in sent_match:
                sent_seg[start] = 0
                sent_seg[start + 1:end] = 1
                if end < len(sent_seg):
                    sent_seg[end] = 0

    if is_preseged:
        sentences = inputs
        word_length = [len(sentence) for sentence in sentences]
        word_idx = []
        for encodings in tokenized.encodings:
            sentence_word_idx = []
            for idx, (start, end) in enumerate(encodings.offsets[1:]):
                if start == 0 and end != 0:
                    sentence_word_idx.append(idx)
            word_idx.append(torch.as_tensor(sentence_word_idx, device=self.device))
    else:
        segment_output = convert_idx_to_name(seg, lengths, self.seg_vocab)
        sentences = []
        word_idx = []
        word_length = []

        for source_text, length, encoding, seg_tag in zip(inputs, lengths, tokenized.encodings, segment_output):
            words = encoding.words[1:length + 1]
            offsets = encoding.offsets[1:length + 1]
            text = [source_text[start:end] for start, end in offsets]

            for idx in range(1, length):
                current_beg = offsets[idx][0]
                forward_end = offsets[idx - 1][-1]
                if forward_end < current_beg:
                    text[idx] = source_text[forward_end:current_beg] + text[idx]
                if words[idx - 1] == words[idx]:
                    seg_tag[idx] = WORD_MIDDLE

            entities = get_entities(seg_tag)
            word_length.append(len(entities))
            sentences.append([''.join(text[entity[1]:entity[2] + 1]).strip() for entity in entities])
            word_idx.append(torch.as_tensor([entity[1] for entity in entities], device=self.device))

    word_idx = torch.nn.utils.rnn.pad_sequence(word_idx, batch_first=True)
    word_idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])  # expand to hidden size
    word_input = torch.gather(hidden, dim=1, index=word_idx)  # vector of each word's first char

    word_cls_input = torch.cat([cls, word_input], dim=1)
    word_cls_mask = length_to_mask(torch.as_tensor(word_length, device=self.device) + 1)
    word_cls_mask[:, 0] = False  # ignore the first token of each sentence

    return sentences, {
        'word_cls': cls,
        'word_input': word_input,
        'word_length': word_length,
        'word_cls_input': word_cls_input,
        'word_cls_mask': word_cls_mask
    }
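# Minimal sketch of the dictionary-override step above (a list-based re-implementation of the
# idea, not the library's code): every user-dictionary match (start, end) is forced to be exactly
# one word by rewriting the predicted tags, and a new word is forced to start right after it.
def _apply_dict_matches_sketch(sent_seg, matches, word_start=0, word_middle=1):
    for start, end in matches:
        sent_seg[start] = word_start
        sent_seg[start + 1:end] = [word_middle] * (end - start - 1)
        if end < len(sent_seg):
            sent_seg[end] = word_start
    return sent_seg

# _apply_dict_matches_sketch([0, 0, 0, 0], [(1, 3)]) -> [0, 0, 1, 0]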
def seg(self, inputs: List[str]):
    tokenized = self.tokenizer.batch_encode_plus(
        inputs, return_tensors=self.tensor, padding=True)
    cls, hidden, seg, length = self._seg(tokenized)

    # merge segments with maximum forward matching
    if self.trie.is_init:
        matches = self.seg_with_dict(inputs, tokenized)
        for sent_match, sent_seg in zip(matches, seg):
            for start, end in sent_match:
                sent_seg[start] = 0
                sent_seg[start + 1:end] = 1
                if end < len(sent_seg):
                    sent_seg[end] = 0

    segment_output = convert_idx_to_name(seg, length, self.seg_vocab)

    if USE_PLUGIN:
        offsets = [
            list(filter(lambda x: x != (0, 0), encodings.offsets))
            for encodings in tokenized.encodings
        ]
        words = [
            list(filter(lambda x: x is not None, encodings.words))
            for encodings in tokenized.encodings
        ]
        sentences, word_idx, word_length = segment_decode(
            inputs, segment_output, offsets, words)
        word_idx = [
            torch.as_tensor(idx, device=self.device) for idx in word_idx
        ]
    else:
        sentences = []
        word_idx = []
        word_length = []

        for source_text, encoding, sentence_seg_tag in zip(
                inputs, tokenized.encodings, segment_output):
            text = [
                source_text[start:end]
                for start, end in encoding.offsets[1:-1] if end != 0
            ]

            last_word = 0
            for idx, word in enumerate(encoding.words[1:-1]):
                if word is None or is_chinese_char(text[idx][-1]):
                    continue
                if word != last_word:
                    text[idx] = ' ' + text[idx]
                    last_word = word
                else:
                    sentence_seg_tag[idx] = WORD_MIDDLE

            entities = get_entities(sentence_seg_tag)
            word_length.append(len(entities))
            sentences.append([
                ''.join(text[entity[1]:entity[2] + 1]).strip()
                for entity in entities
            ])
            word_idx.append(
                torch.as_tensor([entity[1] for entity in entities],
                                device=self.device))

    word_idx = torch.nn.utils.rnn.pad_sequence(word_idx, batch_first=True)
    word_idx = word_idx.unsqueeze(-1).expand(-1, -1, hidden.shape[-1])  # expand to hidden size
    word_input = torch.gather(hidden, dim=1, index=word_idx)  # vector of each word's first char

    word_cls_input = torch.cat([cls, word_input], dim=1)
    word_cls_mask = length_to_mask(
        torch.as_tensor(word_length, device=self.device) + 1)
    word_cls_mask[:, 0] = False  # ignore the first token of each sentence

    return sentences, {
        'word_cls': cls,
        'word_input': word_input,
        'word_length': word_length,
        'word_cls_input': word_cls_input,
        'word_cls_mask': word_cls_mask
    }