# Assumed imports: the BERT classes follow the old pytorch_pretrained_bert API
# (output_all_encoded_layers, init_bert_weights) and the CRF/mask helpers follow
# a fastNLP-style API. Project-local components (BertModel, BertPreTrainedModel,
# ConditionalRandomField, CRF, allowed_transitions, seq_len_to_mask, CNNEncoder,
# DynamicRNN, Transformer_Encoder, get_pos_embedding, encoder) are assumed to be
# importable from elsewhere in this repo.
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class NERBert(BertPreTrainedModel):
    def __init__(self, config, args):
        super(NERBert, self).__init__(config)
        self.args = args
        self.bert = BertModel(config)
        if self.args.weighted:
            # Learnable scalar weight per encoder layer for weighted layer pooling.
            self.weight = nn.Parameter(torch.Tensor(self.args.num_layers))
            self.weight.data.uniform_(0.1, 0.9)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.args.num_labels)
        if self.args.use_crf:
            self.crf = ConditionalRandomField(self.args.num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, labels=None):
        attention_mask = input_ids.gt(0)
        encoded_layers, _ = self.bert(input_ids, None, attention_mask,
                                      output_all_encoded_layers=True)
        if not self.args.weighted:
            sequence_output = encoded_layers[-1]
        else:
            # Stack the last num_layers hidden states to [batch, seq, hidden, num_layers]
            # (stack keeps each layer's hidden dimension intact, which the original
            # cat+view did not) and mix them with a softmax-normalised weight vector.
            last_layers = torch.stack(encoded_layers[-self.args.num_layers:], dim=-1)
            soft_weight = F.softmax(self.weight, dim=0)
            sequence_output = torch.matmul(last_layers, soft_weight)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if not self.args.use_crf:
            if labels is not None:
                # Positions labelled -1 (padding) are excluded from the loss.
                loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
                loss = loss_fct(logits.view(-1, self.args.num_labels), labels.view(-1))
                return logits, loss
            return logits
        else:
            if labels is not None:
                # The mask restricts the CRF loss to real (non-padding) tokens.
                total_loss = self.crf(logits, labels, attention_mask)
                return torch.mean(total_loss)
            # Inference: Viterbi decoding, padded back to the batch max length.
            max_len = logits.shape[1]
            tag_seq = self.crf.viterbi_decode(logits, attention_mask)
            for pred in tag_seq:
                if len(pred) < max_len:
                    pred += [0] * (max_len - len(pred))
            return tag_seq
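
# A hedged usage sketch (not part of the original repo): shows how NERBert is
# expected to be called. 'bert-base-chinese', the Namespace fields and the token
# ids are illustrative assumptions; from_pretrained forwarding the extra `args`
# kwarg follows the old pytorch_pretrained_bert convention.
def _nerbert_usage_example():
    from argparse import Namespace
    args = Namespace(weighted=False, num_layers=4, num_labels=9, use_crf=False)
    model = NERBert.from_pretrained('bert-base-chinese', args=args)
    input_ids = torch.tensor([[101, 2769, 4263, 872, 102, 0, 0]])  # right-padded with 0
    labels = torch.tensor([[-1, 1, 2, 3, -1, -1, -1]])             # -1 positions are ignored
    logits, loss = model(input_ids, labels)
    return logits, loss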
class NERModel(nn.Module):
    def __init__(self, args, word_emb_matrix=None):
        super(NERModel, self).__init__()
        self.args = args
        if word_emb_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(word_emb_matrix, dtype=torch.float))
            self.embedding.weight.requires_grad = args.trainable_embedding
        else:
            self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)
            self.embedding.weight.requires_grad = True
        if args.model == 'cnn':
            self.encoder = CNNEncoder(args)
        elif args.model == 'rnn':
            self.encoder = DynamicRNN(args.embedding_dim, args.hidden_dim, bidirectional=True)
        self.linear = nn.Linear(args.hidden_dim * 2, args.num_labels)
        self.dropout = nn.Dropout(0.2)
        if self.args.use_crf:
            self.crf = ConditionalRandomField(args.num_labels)

    def forward(self, input_ids, labels=None):
        attention_mask = input_ids.gt(0)
        inputs = self.embedding(input_ids)
        if self.args.model == 'cnn':
            rep = self.encoder(inputs)
        elif self.args.model == 'rnn':
            x_len = torch.sum(input_ids != 0, dim=1)
            rep, _ = self.encoder(inputs, x_len)
        logits = self.linear(self.dropout(rep))

        if not self.args.use_crf:
            if labels is not None:
                # Positions labelled -1 (padding) are excluded from the loss.
                loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
                loss = loss_fct(logits.view(-1, self.args.num_labels), labels.view(-1))
                return logits, loss
            return logits
        else:
            if labels is not None:
                total_loss = self.crf(logits, labels, attention_mask)
                # 0 is a placeholder for logits so the return shape matches (logits, loss).
                return 0, torch.mean(total_loss)
            # Inference: Viterbi decoding, padded back to the batch max length.
            max_len = logits.shape[1]
            tag_seq = self.crf.viterbi_decode(logits, attention_mask)
            for pred in tag_seq:
                if len(pred) < max_len:
                    pred += [0] * (max_len - len(pred))
            return tag_seq
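
# A hedged usage sketch for NERModel (not part of the original repo). The
# Namespace fields and sizes are illustrative assumptions; DynamicRNN is the
# project-local encoder already referenced above and is assumed to return
# (output, hidden_state) for padded batches.
def _nermodel_usage_example():
    from argparse import Namespace
    args = Namespace(model='rnn', vocab_size=5000, embedding_dim=100, hidden_dim=128,
                     num_labels=9, use_crf=False, trainable_embedding=True)
    model = NERModel(args)                        # randomly initialised embeddings
    input_ids = torch.tensor([[4, 12, 7, 0, 0]])  # 0 is the padding id
    labels = torch.tensor([[1, 2, 3, -1, -1]])
    logits, loss = model(input_ids, labels)
    return logits, loss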
class BiLSTM_CRF(nn.Module):
    """
    Alias: :class:`fastNLP.models.AdvSeqLabel`  :class:`fastNLP.models.sequence_labeling.AdvSeqLabel`

    A more elaborate sequence-labelling model. The architecture is Embedding, LayerNorm,
    bidirectional LSTM (two layers), FC, LayerNorm, Dropout, FC, CRF.

    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: size of the
        Embedding (a tuple(int, int) whose first int is vocab_size and second int is embed_dim);
        if a Tensor, Embedding or ndarray is given, it is used directly to initialise the Embedding.
    :param int hidden_size: hidden size of the LSTM
    :param int num_classes: number of output classes
    :param float dropout: dropout probability used inside the LSTM and in the Dropout layer
    :param dict id2words: mapping from tag id to tag word, used to forbid illegal transitions
        during CRF decoding; e.g. in the 'BMES' scheme, 'S' may not follow 'B'. Labels of the
        form 'B-NN' ('-' separates the type prefix from the concrete tag) are also supported:
        'B-NN' is then not allowed to be followed by 'S-NN', nor by any 'M-xx' other than
        'M-NN' or 'E-NN'.
    :param str encoding_type: one of "BIO", "BMES", "BEMSO"; only used when id2words is not None.
    """

    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
                 sentence_length, hidden_size, num_classes, dropout=0.3,
                 id2words=None, encoding_type='bieso', weight=None):
        super().__init__()
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        # Initialise the word embedding from a pretrained word2vec matrix.
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                          + self.pos_embed.embedding_dim + spo_embed_dim)
        # sentence_length is accepted for API compatibility but not used here.

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = nn.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
                           dropout=dropout, bidirectional=True, batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(id2words,
                                                                   encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor: [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.

        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor
        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor word: [batch_size, max_len]
        :param torch.LongTensor pos: [batch_size, max_len]
        :param torch.FloatTensor spo: [batch_size, spo_embed_dim]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len]
        :return y: if tag is None, a list of decoded paths (used in testing and predicting);
            otherwise a scalar loss (used in training).
        """
        char = char.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        # One SPO vector per sentence, repeated along the time dimension.
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()

        # [batch_size, max_len, char_dim + word_dim + pos_dim + spo_embed_dim]
        x = torch.cat((char, word, pos, spo), dim=2)
        x = self.norm1(x)
        x, _ = self.Rnn(x)
        x = self.Linear1(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.Linear2(x)

        if tag is not None:
            return self._internal_loss(x, tag)
        return self._decode(x)

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        :param torch.LongTensor char/word/pos: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len], the target tags
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        :param torch.LongTensor char/word/pos: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
class Lattice_Transformer_SeqLabel(nn.Module):
    def __init__(self, lattice_weight, lattice_num, lattice_dim,
                 bigram_weight, bigram_num, bigram_dim,
                 hidden_size, label_size, num_heads, num_layers,
                 learnable_position, layer_preprocess_sequence, layer_postprocess_sequence,
                 ff_size=-1, dropout=None, max_seq_len=-1):
        super().__init__()
        self.lattice_embed = nn.Embedding(lattice_num, lattice_dim)
        self.lattice_embed.weight.data.copy_(torch.from_numpy(lattice_weight))
        self.bigram_embed = nn.Embedding(bigram_num, bigram_dim)
        self.bigram_embed.weight.data.copy_(torch.from_numpy(bigram_weight))

        # Four relative position embeddings: start-start, start-end, end-start, end-end.
        pe_ss = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_se = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_es = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_ee = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)

        self.bigram_size = bigram_dim
        char_input_size = bigram_dim + lattice_dim
        lex_input_size = lattice_dim

        self.embed_dropout = nn.Dropout(p=dropout['embed'])
        self.gaz_dropout = nn.Dropout(p=dropout['gaz'])
        self.output_dropout = nn.Dropout(p=dropout['output'])

        self.char_proj = nn.Linear(char_input_size, hidden_size)
        self.lex_proj = nn.Linear(lex_input_size, hidden_size)

        self.encoder = Transformer_Encoder(
            hidden_size, num_heads, num_layers,
            learnable_position=learnable_position,
            layer_preprocess_sequence=layer_preprocess_sequence,
            layer_postprocess_sequence=layer_postprocess_sequence,
            dropout=dropout, ff_size=ff_size, max_seq_len=max_seq_len,
            pe_ss=pe_ss, pe_se=pe_se, pe_es=pe_es, pe_ee=pe_ee)

        self.output = nn.Linear(hidden_size, label_size)
        self.crf = ConditionalRandomField(label_size, include_start_end_trans=True)
        self.loss_func = nn.CrossEntropyLoss(ignore_index=-100)  # used during training

    # TODO: parameter types
    def forward(self, lattice: torch.Tensor, bigrams: torch.Tensor, seq_len: torch.Tensor,
                lex_num: torch.Tensor, pos_s: torch.Tensor, pos_e: torch.Tensor,
                target: Optional[torch.Tensor]):
        batch_size = lattice.size(0)
        max_seq_len_and_lex_num = lattice.size(1)
        max_seq_len = bigrams.size(1)

        raw_embed = self.lattice_embed(lattice)
        bigrams_embed = self.bigram_embed(bigrams)
        # Zero-pad the bigram embeddings over the lexicon-word positions so they
        # align with the full lattice sequence (chars followed by matched words).
        bigrams_embed = torch.cat([
            bigrams_embed,
            torch.zeros(size=[batch_size, max_seq_len_and_lex_num - max_seq_len,
                              self.bigram_size]).to(bigrams_embed)
        ], dim=1)
        raw_embed_char = torch.cat([raw_embed, bigrams_embed], dim=-1)

        raw_embed_char = self.embed_dropout(raw_embed_char)
        raw_embed = self.gaz_dropout(raw_embed)

        embed_char = self.char_proj(raw_embed_char)
        char_mask = seq_len_to_mask(seq_len, max_len=max_seq_len_and_lex_num)
        embed_char.masked_fill_(~(char_mask.unsqueeze(-1)), 0)

        embed_lex = self.lex_proj(raw_embed)
        lex_mask = (seq_len_to_mask(seq_len + lex_num) ^ char_mask)
        embed_lex.masked_fill_(~(lex_mask).unsqueeze(-1), 0)

        embedding = embed_char + embed_lex
        encoded = self.encoder(embedding, seq_len, lex_num=lex_num, pos_s=pos_s, pos_e=pos_e)
        encoded = self.output_dropout(encoded)

        # Keep only the char portion of the transformer output.
        encoded = encoded[:, :max_seq_len, :]
        pred = self.output(encoded)

        mask = seq_len_to_mask(seq_len)

        # Used when scripting the model:
        # pred, path = self.crf.viterbi_decode(pred, mask)
        # return pred

        if self.training:
            loss = self.crf(pred, target, mask).mean(dim=0)
            return {'loss': loss}
        else:
            pred, path = self.crf.viterbi_decode(pred, mask)
            result = {'pred': pred}
            return result
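
# A minimal sketch (not part of the original repo) of the char/lexicon masking
# used in the forward pass above, assuming a fastNLP-style seq_len_to_mask;
# the lengths are illustrative only.
def _lattice_mask_example():
    seq_len = torch.tensor([3, 2])               # chars per sentence
    lex_num = torch.tensor([2, 1])               # matched lexicon words per sentence
    max_total = int((seq_len + lex_num).max())   # lattice length = chars + words
    char_mask = seq_len_to_mask(seq_len, max_len=max_total)    # True on char positions
    lex_mask = seq_len_to_mask(seq_len + lex_num) ^ char_mask  # True on lexicon positions
    return char_mask, lex_mask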
class Transformer_CRF(nn.Module):
    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
                 num_classes, num_layers, inner_size, key_size, value_size, num_head,
                 dropout=0.1, id2words=None, encoding_type='bieso', weight=None):
        super().__init__()
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        # Initialise the word embedding from a pretrained word2vec matrix.
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                          + self.pos_embed.embedding_dim + spo_embed_dim)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.transformer = encoder.TransformerEncoder(
            num_layers=num_layers, model_size=self.embed_dim, inner_size=inner_size,
            key_size=key_size, value_size=value_size, num_head=num_head, dropout=dropout)

        # Unused two-layer head kept for reference; the single Linear below is used instead.
        self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
        self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
        self.Linear = nn.Linear(self.embed_dim, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(id2words,
                                                                   encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor: [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.

        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor
        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor char: [batch_size, max_len]
        :param torch.LongTensor word: [batch_size, max_len]
        :param torch.LongTensor pos: [batch_size, max_len]
        :param torch.FloatTensor spo: [batch_size, spo_embed_dim]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len]
        :return y: if tag is None, a list of decoded paths (used in testing and predicting);
            otherwise a scalar loss (used in training).
        """
        char = char.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        # One SPO vector per sentence, repeated along the time dimension.
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()

        # [batch_size, max_len, char_dim + word_dim + pos_dim + spo_embed_dim]
        x = torch.cat((char, word, pos, spo), dim=2)
        x = self.norm1(x)
        x = self.transformer(x, seq_mask=self.mask)
        x = self.Linear(x)

        if tag is not None:
            return self._internal_loss(x, tag)
        return self._decode(x)

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        :param torch.LongTensor char/word/pos: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len], the target tags
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        :param torch.LongTensor char/word/pos: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)