def __init__(self, tagset_size, vocab_size, hidden_dim, embedding_dim,
             pretrained_embeddings, dropout, num_layers, pad_index, device,
             fine_tune=True, bidirectional=True):
    super(LSTM_CRF_Softmax, self).__init__()
    self.tagset_size = tagset_size
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.dropout = nn.Dropout(p=dropout)
    self.bidirectional = bidirectional
    self.num_layers = num_layers
    self.pad_index = pad_index
    self.device = device
    self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
    if type(pretrained_embeddings) == torch.Tensor:
        self.embedding_layer.weight.data.copy_(pretrained_embeddings)
    if not fine_tune:
        self.embedding_layer.weight.requires_grad = False
    self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                        num_layers=self.num_layers,
                        bidirectional=self.bidirectional)
    self.hidden2tag = nn.Linear(2 * self.hidden_dim, self.tagset_size)
    self.crf = ConditionalRandomField(self.tagset_size, 1, 2)
    self.loss_fn = nn.CrossEntropyLoss(ignore_index=self.pad_index)
class LacNet(nn.Module):
    def __init__(self, args):
        super(LacNet, self).__init__()
        vocab_size = args.vocab_size
        word_dim = args.word_dim
        num_gru_layers = args.num_gru_layers
        num_labels = args.num_labels
        hidden_dim = args.hidden_dim
        self.word_emb = nn.Embedding(vocab_size, word_dim)
        self.gru_layers = nn.ModuleList(
            [BiGruLayer(args) for _ in range(num_gru_layers)])
        self.emission = nn.Linear(hidden_dim * 2, num_labels)
        self.crf = ConditionalRandomField(num_labels)
        # self.crf_decode = crf_decoding()
        # self.crf_cost = linear_chain_crf()

    def forward(self, x, lens=None):
        # x: [batch_size, seq_len] token ids
        x = self.word_emb(x)
        for gru in self.gru_layers:
            x = gru(x)
        feats = self.emission(x)
        if lens is None:
            # fall back to assuming the full (unpadded) sequence length
            lens = torch.tensor([x.size(1)], device=x.device)
        mask = sequence_mask(lens)
        # Run emission features through the Viterbi decode algorithm.
        preds = self.crf.viterbi_tags(feats, mask)
        # loglik = self.crf(feats, labs, mask=mask)
        # loss = -1. * loglik
        return preds

    def get_trainable_params(self):
        module_params = [
            self.word_emb.parameters(),
            self.gru_layers.parameters(),
            self.emission.parameters(),
            self.crf.parameters(),
        ]
        return module_params
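# LacNet.forward calls sequence_mask() without defining it. Below is a minimal
# sketch of what such a helper might look like (an assumption, not the original
# implementation): it turns a 1-D tensor of lengths into a boolean padding mask.
import torch

def sequence_mask(lens, max_len=None):
    # lens: [batch_size] sequence lengths -> [batch_size, max_len] bool mask
    if max_len is None:
        max_len = int(lens.max())
    positions = torch.arange(max_len, device=lens.device).unsqueeze(0)  # [1, max_len]
    return positions < lens.unsqueeze(1)  # True where the position lies inside the sequence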
class NERBert(BertPreTrainedModel):
    def __init__(self, config, args):
        super(NERBert, self).__init__(config)
        self.args = args
        self.bert = BertModel(config)
        if self.args.weighted:
            self.weight = nn.Parameter(torch.Tensor(self.args.num_layers))
            self.weight.data.uniform_(0.1, 0.9)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.args.num_labels)
        if self.args.use_crf:
            self.crf = ConditionalRandomField(self.args.num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, labels=None):
        attention_mask = input_ids.gt(0)
        encoded_layers, _ = self.bert(input_ids, None, attention_mask,
                                      output_all_encoded_layers=True)
        if not self.args.weighted:
            sequence_output = encoded_layers[-1]
        else:
            # Combine the last num_layers encoder outputs with a
            # softmax-normalised learned weight per layer.
            last_layers = torch.cat(encoded_layers[-self.args.num_layers:], dim=-1).view(
                encoded_layers[0].size(0), encoded_layers[0].size(1),
                encoded_layers[0].size(2), self.args.num_layers)
            soft_weight = F.softmax(self.weight, dim=0)
            sequence_output = torch.matmul(last_layers, soft_weight)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        if not self.args.use_crf:
            if labels is not None:
                loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
                # Only keep active parts of the loss
                loss = loss_fct(logits.view(-1, self.args.num_labels), labels.view(-1))
                return logits, loss
            else:
                return logits
        else:
            if labels is not None:
                # Only keep active parts of the loss
                if attention_mask is not None:
                    total_loss = self.crf(logits, labels, attention_mask)
                    return torch.mean(total_loss)
            else:
                max_len = logits.shape[1]
                tag_seq = self.crf.viterbi_decode(logits, attention_mask)
                for pred in tag_seq:
                    if len(pred) < max_len:
                        pred += [0] * (max_len - len(pred))
                return tag_seq
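# Illustration of the weighted layer-combination idea in NERBert.forward, on
# dummy tensors (all names and sizes here are made up for the example). This
# sketch uses torch.stack for clarity, so each layer occupies its own slot on
# the last axis; the snippet above builds the same [B, S, H, L] shape with
# torch.cat(...).view(...).
import torch
import torch.nn.functional as F

batch, seq_len, hidden, num_layers = 2, 5, 8, 4
layers = [torch.randn(batch, seq_len, hidden) for _ in range(num_layers)]
weight = torch.randn(num_layers)

stacked = torch.stack(layers, dim=-1)           # [batch, seq_len, hidden, num_layers]
soft_weight = F.softmax(weight, dim=0)          # one weight per layer, sums to 1
combined = torch.matmul(stacked, soft_weight)   # [batch, seq_len, hidden]
print(combined.shape)                           # torch.Size([2, 5, 8])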
class NERModel(nn.Module):
    def __init__(self, args, word_emb_matrix=None):
        super(NERModel, self).__init__()
        self.args = args
        if word_emb_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(word_emb_matrix, dtype=torch.float))
            self.embedding.weight.requires_grad = args.trainable_embedding
        else:
            self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)
            self.embedding.weight.requires_grad = True
        if args.model == 'cnn':
            self.encoder = CNNEncoder(args)
        elif args.model == 'rnn':
            self.encoder = DynamicRNN(args.embedding_dim, args.hidden_dim, bidirectional=True)
        self.linear = nn.Linear(args.hidden_dim * 2, args.num_labels)
        self.dropout = nn.Dropout(0.2)
        if self.args.use_crf:
            self.crf = ConditionalRandomField(args.num_labels)

    def forward(self, input_ids, labels=None):
        attention_mask = input_ids.gt(0)
        inputs = self.embedding(input_ids)
        if self.args.model == 'cnn':
            rep = self.encoder(inputs)
        elif self.args.model == 'rnn':
            x_len = torch.sum(input_ids != 0, dim=1)
            rep, _ = self.encoder(inputs, x_len)
        logits = self.linear(self.dropout(rep))
        if not self.args.use_crf:
            if labels is not None:
                loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
                # Only keep active parts of the loss
                loss = loss_fct(logits.view(-1, self.args.num_labels), labels.view(-1))
                return logits, loss
            else:
                return logits
        else:
            if labels is not None:
                # Only keep active parts of the loss
                if attention_mask is not None:
                    total_loss = self.crf(logits, labels, attention_mask)
                    return 0, torch.mean(total_loss)
            else:
                max_len = logits.shape[1]
                tag_seq = self.crf.viterbi_decode(logits, attention_mask)
                for pred in tag_seq:
                    if len(pred) < max_len:
                        pred += [0] * (max_len - len(pred))
                return tag_seq
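# Hedged usage sketch for NERModel. The args fields mirror the attributes the
# snippet reads (vocab_size, embedding_dim, hidden_dim, num_labels, model,
# use_crf, trainable_embedding); the concrete values are made up, and
# CNNEncoder / DynamicRNN / ConditionalRandomField are assumed to be importable
# from the surrounding project.
from argparse import Namespace
import torch

args = Namespace(vocab_size=10000, embedding_dim=100, hidden_dim=128,
                 num_labels=9, model='rnn', use_crf=True,
                 trainable_embedding=True)
model = NERModel(args)                                   # random embeddings, RNN encoder + CRF
input_ids = torch.randint(1, args.vocab_size, (2, 20))   # dummy batch, 0 is treated as padding
labels = torch.randint(0, args.num_labels, (2, 20))
_, loss = model(input_ids, labels)                       # CRF branch returns (0, mean loss)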
def __init__(self, args, config, word_embedding):
    super(Transformer, self).__init__()
    n_classes = args.n_classes
    d_model = int(config['d_model'])
    h = int(config['n_head'])
    d_ff = int(config['d_ff'])
    N = int(config['n_layer'])
    dropout = float(config['dropout'])
    vocab_size = args.vocab_size
    self.use_crf = int(config['use_crf'])
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
    pe = PositionalEncoding(d_model, dropout)
    layer = EncoderLayer(d_model, c(attn), c(ffn), dropout)
    self.embed = nn.Sequential(Embeddings(word_embedding, vocab_size, d_model), c(pe))
    self.encoder = Encoder(c(layer), N)
    self.out = nn.ModuleList([
        nn.Linear(d_model, n_classes[0]),
        nn.Linear(d_model, n_classes[1]),
        nn.Linear(d_model, n_classes[2])
    ])
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    if self.use_crf:
        self.crf = nn.ModuleList([
            ConditionalRandomField(n_classes[0], True),
            ConditionalRandomField(n_classes[1], True),
            ConditionalRandomField(n_classes[2], True)
        ])
    self.log_sigma_square_1 = nn.Parameter(torch.Tensor([0]))
    self.log_sigma_square_2 = nn.Parameter(torch.Tensor([0]))
    self.log_sigma_square_3 = nn.Parameter(torch.Tensor([0]))
def __init__(self, args, config, word_embedding):
    super(BiLSTM, self).__init__()
    self.vocab_size = args.vocab_size
    self.embed_dim = int(config['embed_dim'])
    self.hidden_size = int(config['hidden_size'])
    self.n_classes = args.n_classes
    self.dropout_p = float(config['dropout'])
    self.n_layer = int(config['n_layer'])
    self.use_crf = int(config['use_crf'])
    we = torch.from_numpy(word_embedding).float()
    self.embed = nn.Embedding(self.vocab_size, self.embed_dim, _weight=we)
    self.dropout = nn.Dropout(self.dropout_p)
    self.bilstm = nn.LSTM(input_size=self.embed_dim,
                          hidden_size=self.hidden_size,
                          num_layers=self.n_layer,
                          batch_first=True,
                          bidirectional=True)
    self.out = nn.ModuleList([
        nn.Linear(self.hidden_size * 2, self.n_classes[0]),
        nn.Linear(self.hidden_size * 2, self.n_classes[1]),
        nn.Linear(self.hidden_size * 2, self.n_classes[2])
    ])
    init_linear(self.out[0])
    init_linear(self.out[1])
    init_linear(self.out[2])
    if self.use_crf:
        self.crf = nn.ModuleList([
            ConditionalRandomField(self.n_classes[0]),
            ConditionalRandomField(self.n_classes[1]),
            ConditionalRandomField(self.n_classes[2])
        ])
    self.log_sigma_square_pos = nn.Parameter(torch.Tensor([0]))
    self.log_sigma_square_ner = nn.Parameter(torch.Tensor([0]))
    self.log_sigma_square_chunk = nn.Parameter(torch.Tensor([0]))
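# The log_sigma_square parameters in the last two snippets suggest
# uncertainty-based multi-task loss weighting in the style of Kendall et al.
# The snippets do not show how the three task losses are combined; the helper
# below is only a sketch of one common simplified formulation,
#   total = sum_i( exp(-s_i) * L_i + s_i )   with s_i = log(sigma_i^2),
# and is an assumption, not the original code.
import torch

def combine_losses(losses, log_sigma_squares):
    # losses and log_sigma_squares: matching sequences of scalar tensors
    total = torch.zeros(())
    for loss, log_sigma_sq in zip(losses, log_sigma_squares):
        total = total + torch.exp(-log_sigma_sq) * loss + log_sigma_sq
    return total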
class BertLstmCrf(nn.Module):
    """bert_lstm_crf model"""

    def __init__(
        self,
        bert_model,
        num_labels=9,
        embedding_dim=512,
        hidden_dim=512,
        rnn_layers=1,
        rnn_dropout=0.1,
        output_dropout=0.1,
        use_cuda=False,
    ):
        super(BertLstmCrf, self).__init__()
        self.bert_encoder = bert_model
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.rnn_layers = rnn_layers
        self.lstm = None
        if rnn_layers > 0:
            self.lstm = nn.LSTM(
                embedding_dim,
                hidden_dim,
                num_layers=rnn_layers,
                bidirectional=True,
                dropout=rnn_dropout,
                batch_first=True,
            )
        # TODO: add constraints
        constraints = None
        include_start_end_transitions = False
        self.crf = ConditionalRandomField(
            num_labels,
            constraints,
            include_start_end_transitions=include_start_end_transitions,
        )
        self.liner = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
        self.output_dropout = nn.Dropout(p=output_dropout)

    def rand_init_hidden(self, batch_size):
        """Randomly initialize the LSTM hidden state."""
        return (
            torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim),
            torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim),
        )

    def forward(self, **kwargs):
        """
        args:
            kwargs: keyword arguments for the BERT encoder, e.g. input_ids and
                attention_mask of shape (batch_size, seq_len); an optional
                "labels" tensor of the same shape triggers loss computation.
        return:
            (loss or None, logits, predicted_tags)
        """
        kwargs_copy = copy.deepcopy(kwargs)
        if "labels" in kwargs_copy:
            kwargs_copy.pop("labels")
        batch_size = kwargs["input_ids"].size(0)
        seq_length = kwargs["input_ids"].size(1)
        bert_outputs = self.bert_encoder(**kwargs_copy)
        sequence_output = bert_outputs[0]
        if self.lstm is not None:
            hidden = self.rand_init_hidden(batch_size)
            if kwargs["input_ids"].is_cuda:
                hidden = [i.cuda() for i in hidden]
            sequence_output, hidden = self.lstm(sequence_output, hidden)
        sequence_output = sequence_output.contiguous().view(-1, self.hidden_dim * 2)
        sequence_output = self.output_dropout(sequence_output)
        out = self.liner(sequence_output)
        logits = out.contiguous().view(batch_size, seq_length, -1)

        best_paths = self.crf.viterbi_tags(logits, kwargs["attention_mask"].long(), top_k=1)
        # Just get the top tags and ignore the scores.
        predicted_tags = cast(List[List[int]], [x[0][0] for x in best_paths])

        if kwargs.get("labels") is not None:
            labels = kwargs.get("labels")
            log_likelihood = self.crf(logits, labels, kwargs["attention_mask"])
            loss = -log_likelihood
            return loss, logits, predicted_tags
        return None, logits, predicted_tags
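# Hedged usage sketch for BertLstmCrf, assuming a Hugging Face BertModel as the
# encoder and an AllenNLP-style ConditionalRandomField (which the
# viterbi_tags(..., top_k=1) call pattern above suggests). The checkpoint name
# and all tensor sizes are illustrative only; hidden_dim is chosen so that the
# bidirectional LSTM output (2 * hidden_dim) matches BERT's 768-dim hidden size.
import torch
from transformers import BertModel

bert = BertModel.from_pretrained("bert-base-chinese")   # hidden size 768
model = BertLstmCrf(bert, num_labels=9, embedding_dim=768, hidden_dim=384)

input_ids = torch.randint(1, 21128, (2, 16))
attention_mask = torch.ones_like(input_ids)
labels = torch.randint(0, 9, (2, 16))
loss, logits, predicted_tags = model(input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     labels=labels)
# loss is the negative CRF log-likelihood; predicted_tags holds the Viterbi paths.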
class BiLSTM_CRF(nn.Module):
    """
    Alias: :class:`fastNLP.models.AdvSeqLabel` :class:`fastNLP.models.sequence_labeling.AdvSeqLabel`

    A more elaborate sequence-labelling model. Structure: Embedding, LayerNorm,
    bidirectional LSTM (two layers), FC, LayerNorm, Dropout, FC, CRF.

    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: size of the
        embedding (a tuple(int, int) where the first int is vocab_size and the second is
        embed_dim); if a Tensor, Embedding or ndarray is given, it is used directly to
        initialise the embedding
    :param int hidden_size: hidden size of the LSTM
    :param int num_classes: number of classes
    :param float dropout: dropout probability inside the LSTM and in the Dropout layer
    :param dict id2words: mapping from tag id to tag word, used during CRF decoding to rule out
        illegal transitions, e.g. in the 'BMES' scheme 'S' must not follow 'B'. Tags of the form
        'B-NN' are also supported, where the part before '-' indicates the position and the part
        after it the concrete tag; in that case it is guaranteed not only that 'B-NN' is not
        followed by 'S-NN' but also that 'B-NN' is not followed by any 'M-xx' other than 'M-NN'
        or 'E-NN'.
    :param str encoding_type: one of "BIO", "BMES", "BEMSO"; only used when id2words is not None.
    """

    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
                 sentence_length, hidden_size, num_classes, dropout=0.3,
                 id2words=None, encoding_type='bieso', weight=None):
        super().__init__()
        # self.Embedding = nn.Embedding(init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        # word2vec
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                          + self.pos_embed.embedding_dim + spo_embed_dim)
        # sentence length
        # self.sen_len = sentence_length
        # self.zeros = torch.zeros(self.sen_len, dtype=torch.long)
        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = nn.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
                           dropout=dropout, bidirectional=True, batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(id2words,
                                                                   encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor
        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len, max_len)
        # mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor char/word/pos: [batch_size, max_len]
        :param torch.FloatTensor spo: [batch_size, spo_embed_dim]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len]
        :return y: if tag is None, return the list of decoded paths (used in testing and
            predicting); if tag is not None, return a scalar loss (used in training).
        """
        char = char.long()
        # word = word.long()
        # pos = pos.long()
        # spo = spo.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        # if next(self.parameters()).is_cuda:
        #     char = char.cuda()
        #     self.mask = self.mask.cuda()

        # x = self.Embedding(words)
        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        x = torch.cat((char, word, pos, spo), dim=2)
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim]
        x = self.norm1(x)

        # x, _ = self.Rnn(x, seq_len=seq_len)
        x, _ = self.Rnn(x)
        x = self.Linear1(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.Linear2(x)
        if tag is not None:
            return self._internal_loss(x, tag)
        else:
            return self._decode(x)

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        :param torch.LongTensor char/word/pos/spo: see :meth:`_forward`
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len], the target
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        :param torch.LongTensor char/word/pos/spo: see :meth:`_forward`
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
class LSTM_CRF(nn.Module):
    def __init__(self, tagset_size, vocab_size, hidden_dim, embedding_dim,
                 pretrained_embeddings, dropout, num_layers, pad_index, device,
                 fine_tune=True, bidirectional=True):
        super(LSTM_CRF, self).__init__()
        self.tagset_size = tagset_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.pad_index = pad_index
        self.device = device
        self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        if type(pretrained_embeddings) == torch.Tensor:
            self.embedding_layer.weight.data.copy_(pretrained_embeddings)
        if not fine_tune:
            self.embedding_layer.weight.requires_grad = False
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                            num_layers=self.num_layers,
                            bidirectional=self.bidirectional)
        self.hidden2tag = nn.Linear(2 * self.hidden_dim, self.tagset_size)
        self.crf = ConditionalRandomField(self.tagset_size, 1, 2)

    def get_lstm_feats(self, batch):
        lens = batch['lens']
        word_sequences = batch['word_sequences']
        max_len = max(lens)
        batch_size = len(word_sequences)
        embeddings = self.embedding_layer(word_sequences)
        embeddings = self.dropout(embeddings)
        packed_input = pack_padded_sequence(embeddings, lens, batch_first=True)
        packed_hidden_states, _ = self.lstm(packed_input)
        hidden_states, _ = pad_packed_sequence(packed_hidden_states, batch_first=True)
        hidden_states = self.dropout(hidden_states)
        logits = self.hidden2tag(hidden_states)
        return logits
        # logits = logits.view(batch_size * max_len, self.tagset_size)

    def loss(self, batch):
        logits = self.get_lstm_feats(batch)
        mask = batch['mask'].squeeze(1)
        return self.crf.forward(logits, batch['tag_sequences'], mask)

    def forward(self, batch):
        logits = self.get_lstm_feats(batch)
        mask = batch['mask'].squeeze(1)
        all_tags = self.crf.viterbi_tags(logits.to('cpu'), mask.to('cpu'))
        max_len = max(batch['lens'])
        for i in range(len(all_tags)):
            all_tags[i] += [0 for _ in range(max_len - len(all_tags[i]))]
        return None, torch.tensor(all_tags)
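# Hedged sketch of the batch dict LSTM_CRF expects, inferred from the keys read
# above ('word_sequences', 'tag_sequences', 'lens', 'mask'). All sizes are made
# up, the project's ConditionalRandomField is assumed to be in scope, and
# pack_padded_sequence expects lengths sorted in descending order unless
# enforce_sorted=False is used.
import torch

lens = torch.tensor([12, 9])                                   # true lengths, descending
batch = {
    'word_sequences': torch.randint(1, 1000, (2, 12)),         # padded token ids
    'tag_sequences': torch.randint(0, 5, (2, 12)),             # padded gold tags
    'lens': lens,
    'mask': (torch.arange(12).unsqueeze(0) < lens.unsqueeze(1)).unsqueeze(1),
}
model = LSTM_CRF(tagset_size=5, vocab_size=1000, hidden_dim=64, embedding_dim=50,
                 pretrained_embeddings=None, dropout=0.1, num_layers=1,
                 pad_index=0, device='cpu')
nll = model.loss(batch)      # CRF forward (loss / log-likelihood, depending on the CRF impl)
_, pred = model(batch)       # Viterbi-decoded tag sequences, padded with 0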
class Transformer_CRF(nn.Module):
    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim,
                 num_classes, num_layers, inner_size, key_size, value_size, num_head,
                 dropout=0.1, id2words=None, encoding_type='bieso', weight=None):
        super().__init__()
        # self.Embedding = nn.Embedding(init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = (self.char_embed.embedding_dim + self.word_embed.embedding_dim
                          + self.pos_embed.embedding_dim + spo_embed_dim)
        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.transformer = encoder.TransformerEncoder(
            num_layers=num_layers,
            model_size=self.embed_dim,
            inner_size=inner_size,
            key_size=key_size,
            value_size=value_size,
            num_head=num_head,
            dropout=dropout)
        self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
        self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
        self.Linear = nn.Linear(self.embed_dim, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(id2words,
                                                                   encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor
        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        # mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor char/word/pos: [batch_size, max_len]
        :param torch.FloatTensor spo: [batch_size, spo_embed_dim]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len]
        :return y: if tag is None, return the list of decoded paths (used in testing and
            predicting); if tag is not None, return a scalar loss (used in training).
        """
        char = char.long()
        # word = word.long()
        # pos = pos.long()
        # spo = spo.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        # if next(self.parameters()).is_cuda:
        #     char = char.cuda()
        #     self.mask = self.mask.cuda()

        # x = self.Embedding(words)
        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        x = torch.cat((char, word, pos, spo), dim=2)
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim]
        x = self.norm1(x)

        x = self.transformer(x, seq_mask=self.mask)
        # x = self.Linear1(x)
        # x = self.norm2(x)
        # x = self.relu(x)
        # x = self.drop(x)
        # x = self.Linear2(x)
        x = self.Linear(x)
        if tag is not None:
            return self._internal_loss(x, tag)
        else:
            return self._decode(x)
            # return {"pred": self._decode(x)}

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        :param torch.LongTensor char/word/pos/spo: see :meth:`_forward`
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor tag: [batch_size, max_len], the target
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        :param torch.LongTensor char/word/pos/spo: see :meth:`_forward`
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
class Lattice_Transformer_SeqLabel(nn.Module):
    def __init__(self, lattice_weight, lattice_num, lattice_dim, bigram_weight, bigram_num,
                 bigram_dim, hidden_size, label_size, num_heads, num_layers,
                 learnable_position, layer_preprocess_sequence, layer_postprocess_sequence,
                 ff_size=-1, dropout=None, max_seq_len=-1):
        super().__init__()
        self.lattice_embed = nn.Embedding(lattice_num, lattice_dim)
        self.lattice_embed.weight.data.copy_(torch.from_numpy(lattice_weight))
        self.bigram_embed = nn.Embedding(bigram_num, bigram_dim)
        self.bigram_embed.weight.data.copy_(torch.from_numpy(bigram_weight))

        pe_ss = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_se = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_es = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_ee = nn.Parameter(get_pos_embedding(max_seq_len, hidden_size, rel_pos_init=0),
                             requires_grad=learnable_position)

        # self.bigram_size = self.bigram_embed.embedding.weight.size(1)
        # char_input_size = self.lattice_embed.embedding.weight.size(1) + self.bigram_embed.embedding.weight.size(1)
        # lex_input_size = self.lattice_embed.embedding.weight.size(1)
        self.bigram_size = bigram_dim
        char_input_size = bigram_dim + lattice_dim
        lex_input_size = lattice_dim

        self.embed_dropout = nn.Dropout(p=dropout['embed'])
        self.gaz_dropout = nn.Dropout(p=dropout['gaz'])
        self.output_dropout = nn.Dropout(p=dropout['output'])

        self.char_proj = nn.Linear(char_input_size, hidden_size)
        self.lex_proj = nn.Linear(lex_input_size, hidden_size)

        self.encoder = Transformer_Encoder(
            hidden_size, num_heads, num_layers,
            learnable_position=learnable_position,
            layer_preprocess_sequence=layer_preprocess_sequence,
            layer_postprocess_sequence=layer_postprocess_sequence,
            dropout=dropout,
            ff_size=ff_size,
            max_seq_len=max_seq_len,
            pe_ss=pe_ss, pe_se=pe_se, pe_es=pe_es, pe_ee=pe_ee)

        self.output = nn.Linear(hidden_size, label_size)
        self.crf = ConditionalRandomField(label_size, include_start_end_trans=True)
        # self.crf.trans_m = nn.Parameter(torch.zeros(size=[label_size, label_size], requires_grad=True))
        self.loss_func = nn.CrossEntropyLoss(ignore_index=-100)  # used during training

    # TODO: parameter types
    def forward(self, lattice: torch.Tensor, bigrams: torch.Tensor, seq_len: torch.Tensor,
                lex_num: torch.Tensor, pos_s: torch.Tensor, pos_e: torch.Tensor,
                target: Optional[torch.Tensor]):
        batch_size = lattice.size(0)
        max_seq_len_and_lex_num = lattice.size(1)
        max_seq_len = bigrams.size(1)

        raw_embed = self.lattice_embed(lattice)
        bigrams_embed = self.bigram_embed(bigrams)
        bigrams_embed = torch.cat([
            bigrams_embed,
            torch.zeros(size=[batch_size,
                              max_seq_len_and_lex_num - max_seq_len,
                              self.bigram_size]).to(bigrams_embed)
        ], dim=1)
        raw_embed_char = torch.cat([raw_embed, bigrams_embed], dim=-1)

        raw_embed_char = self.embed_dropout(raw_embed_char)
        raw_embed = self.gaz_dropout(raw_embed)

        embed_char = self.char_proj(raw_embed_char)
        char_mask = seq_len_to_mask(seq_len, max_len=max_seq_len_and_lex_num)
        embed_char.masked_fill_(~(char_mask.unsqueeze(-1)), 0)

        embed_lex = self.lex_proj(raw_embed)
        lex_mask = (seq_len_to_mask(seq_len + lex_num) ^ char_mask)
        embed_lex.masked_fill_(~(lex_mask).unsqueeze(-1), 0)

        embedding = embed_char + embed_lex
        encoded = self.encoder(embedding, seq_len, lex_num=lex_num, pos_s=pos_s, pos_e=pos_e)
        encoded = self.output_dropout(encoded)

        # keep only the char part of the transformer output
        encoded = encoded[:, :max_seq_len, :]
        pred = self.output(encoded)

        mask = seq_len_to_mask(seq_len)

        # used for scripting
        # pred, path = self.crf.viterbi_decode(pred, mask)
        # return pred

        if self.training:
            loss = self.crf(pred, target, mask).mean(dim=0)
            return {'loss': loss}
        else:
            pred, path = self.crf.viterbi_decode(pred, mask)
            result = {'pred': pred}
            return result
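# Pure-tensor illustration of the char/lex masking used in the forward pass
# above: positions [0, seq_len) hold characters, positions
# [seq_len, seq_len + lex_num) hold matched lexicon words, and XOR-ing the two
# length masks isolates the lexicon slots. The helper name and all sizes below
# are made up for the example.
import torch

def seq_len_to_mask_demo(lens, max_len):
    return torch.arange(max_len).unsqueeze(0) < lens.unsqueeze(1)

seq_len = torch.tensor([4, 3])      # number of characters per sentence
lex_num = torch.tensor([2, 1])      # number of matched lexicon words per sentence
max_total = int((seq_len + lex_num).max())

char_mask = seq_len_to_mask_demo(seq_len, max_total)
lex_mask = seq_len_to_mask_demo(seq_len + lex_num, max_total) ^ char_mask
print(char_mask.int())   # marks character positions
print(lex_mask.int())    # marks lexicon-word positions appended after the characters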