class BiLSTM_CRF(nn.Module): def __init__(self, data): super(BiLSTM_CRF, self).__init__() print "build batched lstmcrf..." self.gpu = data.HP_gpu label_size = data.label_alphabet_size data.label_alphabet_size += 2 self.lstm = BiLSTM(data) self.crf = CRF(label_size, self.gpu) def neg_log_likelihood_loss(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask): outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outs, mask) return total_loss, tag_seq def forward(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): outs = self.lstm.get_output_score(gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) scores, tag_seq = self.crf._viterbi_decode(outs, mask) return tag_seq def get_lstm_features(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): return self.lstm.get_lstm_features(gaz_list, word_inputs, biword_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
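# The classes in this file share one NCRF++-style convention: the CRF is built
# with the original label_size, while the layer feeding it emits label_size + 2
# scores so the CRF can learn START/STOP transition features. A minimal,
# self-contained sketch of that sizing (the index layout of the two reserved
# labels is an assumption; the real CRF implementation defines its own):
import torch
import torch.nn as nn

label_size = 5                              # e.g. B-PER, I-PER, B-LOC, I-LOC, O
hidden2tag = nn.Linear(16, label_size + 2)  # two extra columns for START/STOP
hidden = torch.randn(2, 7, 16)              # (batch, seq_len, hidden_dim)
emissions = hidden2tag(hidden)              # (2, 7, label_size + 2)
print(emissions.shape)                      # torch.Size([2, 7, 7])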
class SeqModel(nn.Module):
    def __init__(self, data):
        super(SeqModel, self).__init__()
        self.gpu = data.HP_gpu
        ## add two more labels for the downlayer lstm; use the original label size for CRF
        label_size = data.label_alphabet_size
        # data.label_alphabet_size += 2
        # self.word_hidden = WordSequence(data, False, True, data.use_char)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(data.HP_hidden_dim, label_size + 2)
        self.crf = CRF(label_size, self.gpu)
        if torch.cuda.is_available():
            self.hidden2tag = self.hidden2tag.cuda(self.gpu)

    # def neg_log_likelihood_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask):
    #     outs = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, None, None)
    def neg_log_likelihood_loss(self, hidden, hidden_adv, batch_label, mask):
        if hidden_adv is not None:
            hidden = hidden + hidden_adv
        outs = self.hidden2tag(hidden)
        batch_size = hidden.size(0)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        total_loss = total_loss / batch_size
        return total_loss, tag_seq

    def forward(self, hidden, mask):
        outs = self.hidden2tag(hidden)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    # def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
    #     return self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
    def decode_nbest(self, hidden, mask, nbest):
        outs = self.hidden2tag(hidden)
        scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest)
        return scores, tag_seq
class SeqModel(nn.Module):
    def __init__(self, data, opt):
        super(SeqModel, self).__init__()
        self.gpu = opt.gpu
        ## add two more labels for the downlayer lstm; use the original label size for CRF
        self.word_hidden = WordSequence(data, opt)
        self.crf = CRF(data.label_alphabet.size(), self.gpu)

    def neg_log_likelihood_loss(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask, feature_inputs, text_inputs):
        outs = self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, feature_inputs, text_inputs)
        batch_size = word_inputs.size(0)
        total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        total_loss = total_loss / batch_size
        return total_loss, tag_seq

    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, feature_inputs, text_inputs):
        outs = self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, feature_inputs, text_inputs)
        scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        return tag_seq

    def decode_nbest(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, nbest, feature_inputs, text_inputs):
        outs = self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, feature_inputs, text_inputs)
        scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest)
        return scores, tag_seq
class Net(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.wemb = Wemb(args)
        self.drop = nn.Dropout(args.dropout)
        odim = len(args.tag_stoi)
        if args.ner:
            self.crf = CRF(args.tag_stoi)
            odim = len(args.tag_stoi) + 2
        if not args.lstm:
            self.ffn = nn.Sequential(nn.Linear(300, 400), nn.ReLU(), nn.Dropout(args.dropout))
        else:
            self.lstm = nn.LSTM(input_size=300, hidden_size=200, num_layers=2, bias=True, batch_first=True, dropout=args.dropout, bidirectional=True)
        self.hid2tag = nn.Linear(400, odim)

    def forward(self, batch):
        mask = pad_sequence([torch.ones(len(x)) for x in batch], True, 0).byte().cuda()
        if self.args.fix:
            with torch.no_grad():
                x = self.wemb.eval()(batch)
        else:
            x = self.wemb(batch)
        x = self.drop(x)
        if not self.args.lstm:
            x = self.ffn(x)
        else:
            x = Lstm(self.lstm, x, mask.sum(-1))
        x = self.hid2tag(x)
        return x, mask

    def train_batch(self, batch, tags):
        x, mask = self.forward(batch)
        tag_ids = pad_sequence([torch.LongTensor([self.args.tag_stoi[t] for t in s]) for s in tags], True, self.args.tag_stoi["<pad>"]).cuda()
        if not self.args.ner:
            loss = nn.functional.cross_entropy(x[mask], tag_ids[mask])
        else:
            loss = self.crf.neg_log_likelihood_loss(x, mask, tag_ids)
        return loss

    def test_batch(self, batch):
        x, mask = self.forward(batch)
        if not self.args.ner:
            path = x.max(-1)[1]
        else:
            _, path = self.crf._viterbi_decode(x, mask)
        path = [p[m].tolist() for p, m in zip(path, mask)]
        tags = [[self.args.tag_itos[i] for i in s] for s in path]
        return tags
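# Net.forward above builds its padding mask with pad_sequence over per-sentence
# ones. A CPU-only sketch of that idiom (the original moves the mask to CUDA;
# the byte mask matches what the CRF implementations here expect):
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [["the", "cat"], ["a", "dog", "barks"]]
mask = pad_sequence([torch.ones(len(x)) for x in batch], batch_first=True, padding_value=0).byte()
print(mask)          # tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.uint8)
print(mask.sum(-1))  # per-sequence lengths: tensor([2, 3])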
class deepBiLSTM_CRF(nn.Module):
    def __init__(self, word_HPs, char_HPs, num_labels=None, drop_final=0.5):
        super(deepBiLSTM_CRF, self).__init__()
        [word_size, word_dim, word_pre_embs, word_hidden_dim, word_dropout, word_layers, word_bidirect] = word_HPs
        if char_HPs:
            [char_size, char_dim, char_pred_embs, char_hidden_dim, char_dropout, char_layers, char_bidirect] = char_HPs
        self.lstm = Deep_bisltm(word_HPs, char_HPs, num_labels, att=True)
        # add two more labels for CRF; `use_cuda` is assumed to be a module-level flag
        self.crf = CRF(num_labels + 2, use_cuda)
        ## add two more labels to learn hidden features for start and end transitions
        self.hidden2tag = nn.Linear(2 * word_hidden_dim, num_labels + 2)
        self.dropfinal = nn.Dropout(drop_final)
        if use_cuda:
            self.hidden2tag = self.hidden2tag.cuda()
            self.dropfinal = self.dropfinal.cuda()

    def NLL_loss(self, label_score, mask_tensor, label_tensor):
        batch_loss = self.crf.neg_log_likelihood_loss(label_score, mask_tensor, label_tensor)
        return batch_loss

    def inference(self, label_score, mask_tensor):
        label_prob, label_pred = self.crf._viterbi_decode(label_score, mask_tensor)
        return label_prob, label_pred

    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        # (batch_size, sequence_len, hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
        # (batch_size, sequence_len, num_labels + 2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # `tag_stoi` / `tag_itos`, the pretrained vectors `wvec`, and the helper
        # `lens2mask` are assumed to be module-level globals in the original code
        self.net = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag_stoi) + 2)
        self.g2b = nn.Linear(300, 768)
        self.gate = nn.Linear(768, 1)
        self.crf = CRF(tag_stoi)

    def forward(self, inputs, wids, attention_mask, labels):
        b = self.net.bert.embeddings(input_ids=inputs)
        # gated mix of BERT embeddings and projected pretrained word vectors
        a = self.gate(b).sigmoid()
        g = self.g2b(wvec[wids].cuda())
        x = (1 - a) * b + a * g
        logits = self.net(inputs_embeds=x, attention_mask=attention_mask)[0]
        # keep only the first subword of each word (positions where labels != -100)
        first_mask = labels != -100
        mask = lens2mask(first_mask.sum(-1)).cuda()
        logits = torch.zeros(*mask.shape, logits.shape[-1]).cuda().masked_scatter(mask[:, :, None], logits[first_mask])
        labels = torch.zeros(*mask.shape).long().cuda().masked_scatter(mask, labels[first_mask])
        return logits, mask, labels

    def train_batch(self, inputs, wids, attention_mask, labels):
        logits, mask, labels = self.forward(inputs, wids, attention_mask, labels)
        loss = self.crf.neg_log_likelihood_loss(logits, mask, labels)
        return loss

    def test_batch(self, inputs, wids, attention_mask, labels):
        logits, mask, labels = self.forward(inputs, wids, attention_mask, labels)
        _, path = self.crf._viterbi_decode(logits, mask)
        pred = [[tag_itos[i] for i in p[m]] for p, m in zip(path, mask)]
        return pred
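# Net.forward above collapses subword logits back to word level: positions with
# label != -100 mark the first subword of each word, and masked_scatter re-packs
# those rows into a dense word-level tensor. `lens2mask` is not shown in the
# snippet; the version below is a plausible reconstruction, and the tensors are
# toy CPU stand-ins for the CUDA tensors used above.
import torch

def lens2mask(lens):
    # (batch,) lengths -> (batch, max_len) bool mask
    return torch.arange(lens.max())[None, :] < lens[:, None]

labels = torch.tensor([[0, -100, 1, -100], [2, 3, -100, -100]])  # subword-level
logits = torch.randn(2, 4, 5)                                    # (batch, subwords, tags)
first_mask = labels != -100
mask = lens2mask(first_mask.sum(-1))                             # word-level mask
word_logits = torch.zeros(*mask.shape, logits.shape[-1]).masked_scatter(mask[:, :, None], logits[first_mask])
print(word_logits.shape)  # torch.Size([2, 2, 5]): two words per sentence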
class BiLSTM_CRF(nn.Module): def __init__(self, data): super(BiLSTM_CRF, self).__init__() print "build batched lstmcrf..." self.gpu = data.HP_gpu self.average_batch = data.HP_average_batch_loss ## add two more label for downlayer lstm, use original label size for CRF label_size = data.label_alphabet_size data.label_alphabet_size += 2 self.lstm = BiLSTM(data) self.crf = CRF(label_size, self.gpu) def neg_log_likelihood_loss(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask): outs = self.lstm.get_output_score(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outs, mask) if self.average_batch: total_loss = total_loss / batch_size return total_loss, tag_seq def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): outs = self.lstm.get_output_score(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) scores, tag_seq = self.crf._viterbi_decode(outs, mask) return tag_seq def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): return self.lstm.get_lstm_features(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
class SeqModel(nn.Module): def __init__(self, data): super(SeqModel, self).__init__() self.use_crf = data.use_crf print "build network..." print "use_char: ", data.use_char if data.use_char: print "char feature extractor: ", data.char_feature_extractor print "word feature extractor: ", data.word_feature_extractor print "use crf: ", self.use_crf self.gpu = data.HP_gpu self.average_batch = data.average_batch_loss ## add two more label for downlayer lstm, use original label size for CRF label_size = data.label_alphabet_size data.label_alphabet_size += 2 self.word_hidden = WordSequence(data) if self.use_crf: self.crf = CRF(label_size, self.gpu) def neg_log_likelihood_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask): outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) if self.use_crf: total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outs, mask) else: loss_function = nn.NLLLoss(ignore_index=0, size_average=False) outs = outs.view(batch_size * seq_len, -1) score = F.log_softmax(outs, 1) total_loss = loss_function(score, batch_label.view(batch_size * seq_len)) _, tag_seq = torch.max(score, 1) tag_seq = tag_seq.view(batch_size, seq_len) if self.average_batch: total_loss = total_loss / batch_size return total_loss, tag_seq def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) if self.use_crf: scores, tag_seq = self.crf._viterbi_decode(outs, mask) else: outs = outs.view(batch_size * seq_len, -1) _, tag_seq = torch.max(outs, 1) tag_seq = tag_seq.view(batch_size, seq_len) ## filter padded position with zero tag_seq = mask.long() * tag_seq return tag_seq # def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): # return self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, nbest): if not self.use_crf: print "Nbest output is currently supported only for CRF! Exit..." exit(0) outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) batch_size = word_inputs.size(0) seq_len = word_inputs.size(1) scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest) return scores, tag_seq
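# When use_crf is off, SeqModel falls back to token-level NLL over flattened
# log-softmax scores, with ignore_index=0 so padding tokens (label id 0)
# contribute nothing. A minimal sketch of that fallback with toy shapes:
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len, label_size = 2, 4, 6
outs = torch.randn(batch_size, seq_len, label_size)
batch_label = torch.tensor([[1, 2, 0, 0], [3, 1, 2, 0]])  # 0 = padding
loss_function = nn.NLLLoss(ignore_index=0, reduction='sum')
score = F.log_softmax(outs.view(batch_size * seq_len, -1), 1)
total_loss = loss_function(score, batch_label.view(-1))
print(total_loss / batch_size)  # averaged over the batch, as with average_batch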
class NamedEntityRecog(nn.Module):
    def __init__(self, vocab_size, word_embed_dim, word_hidden_dim, alphabet_size, char_embedding_dim, char_hidden_dim,
                 feature_extractor, tag_num, dropout, pretrain_embed=None, use_char=False, use_crf=False, use_gpu=False):
        super(NamedEntityRecog, self).__init__()
        self.use_crf = use_crf
        self.use_char = use_char
        self.drop = nn.Dropout(dropout)
        self.input_dim = word_embed_dim
        self.feature_extractor = feature_extractor
        self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
        if pretrain_embed is not None:
            self.embeds.weight.data.copy_(torch.from_numpy(pretrain_embed))
        else:
            self.embeds.weight.data.copy_(torch.from_numpy(self.random_embedding(vocab_size, word_embed_dim)))
        if self.use_char:
            self.input_dim += char_hidden_dim
            self.char_feature = CharCNN(alphabet_size, char_embedding_dim, char_hidden_dim, dropout)
        if feature_extractor == 'lstm':
            self.lstm = nn.LSTM(self.input_dim, word_hidden_dim, batch_first=True, bidirectional=True)
        else:
            self.word2cnn = nn.Linear(self.input_dim, word_hidden_dim * 2)
            self.cnn_list = list()
            for _ in range(4):
                self.cnn_list.append(nn.Conv1d(word_hidden_dim * 2, word_hidden_dim * 2, kernel_size=3, padding=1))
                self.cnn_list.append(nn.ReLU())
                self.cnn_list.append(nn.Dropout(dropout))
                self.cnn_list.append(nn.BatchNorm1d(word_hidden_dim * 2))
            self.cnn = nn.Sequential(*self.cnn_list)
        if self.use_crf:
            self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num + 2)
            self.crf = CRF(tag_num, use_gpu)
        else:
            self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(1, vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb

    def neg_log_likelihood_loss(self, word_inputs, word_seq_lengths, char_inputs, batch_label, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        word_embeding = self.embeds(word_inputs)
        word_list = [word_embeding]
        if self.use_char:
            char_features = self.char_feature(char_inputs).contiguous().view(batch_size, seq_len, -1)
            word_list.append(char_features)
        word_embeding = torch.cat(word_list, 2)
        word_represents = self.drop(word_embeding)
        if self.feature_extractor == 'lstm':
            packed_words = pack_padded_sequence(word_represents, word_seq_lengths, True)
            hidden = None
            lstm_out, hidden = self.lstm(packed_words, hidden)
            lstm_out, _ = pad_packed_sequence(lstm_out)
            lstm_out = lstm_out.transpose(0, 1)
            feature_out = self.drop(lstm_out)
        else:
            word_in = torch.tanh(self.word2cnn(word_represents)).transpose(2, 1).contiguous()
            feature_out = self.cnn(word_in).transpose(1, 2).contiguous()
        feature_out = self.hidden2tag(feature_out)
        if self.use_crf:
            total_loss = self.crf.neg_log_likelihood_loss(feature_out, mask, batch_label)
        else:
            loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
            feature_out = feature_out.contiguous().view(batch_size * seq_len, -1)
            total_loss = loss_function(feature_out, batch_label.contiguous().view(batch_size * seq_len))
        return total_loss

    def forward(self, word_inputs, word_seq_lengths, char_inputs, batch_label, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        word_embeding = self.embeds(word_inputs)
        word_list = [word_embeding]
        if self.use_char:
            char_features = self.char_feature(char_inputs).contiguous().view(batch_size, seq_len, -1)
            word_list.append(char_features)
        word_embeding = torch.cat(word_list, 2)
        word_represents = self.drop(word_embeding)
        if self.feature_extractor == 'lstm':
            packed_words = pack_padded_sequence(word_represents, word_seq_lengths, True)
            hidden = None
            lstm_out, hidden = self.lstm(packed_words, hidden)
            lstm_out, _ = pad_packed_sequence(lstm_out)
            lstm_out = lstm_out.transpose(0, 1)
            feature_out = self.drop(lstm_out)
        else:
            word_in = torch.tanh(self.word2cnn(word_represents)).transpose(2, 1).contiguous()
            feature_out = self.cnn(word_in).transpose(1, 2).contiguous()
        feature_out = self.hidden2tag(feature_out)
        if self.use_crf:
            scores, tag_seq = self.crf._viterbi_decode(feature_out, mask)
        else:
            feature_out = feature_out.contiguous().view(batch_size * seq_len, -1)
            _, tag_seq = torch.max(feature_out, 1)
            tag_seq = tag_seq.view(batch_size, seq_len)
            tag_seq = mask.long() * tag_seq
        return tag_seq
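# NamedEntityRecog's LSTM path relies on the pack_padded_sequence /
# pad_packed_sequence round trip. A self-contained sketch (lengths must be
# sorted in descending order here because, unlike the Bilstmcrf class further
# down, no enforce_sorted=False is passed):
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = nn.LSTM(8, 5, batch_first=True, bidirectional=True)
word_represents = torch.randn(3, 6, 8)       # (batch, max_seq_len, dim)
lengths = torch.tensor([6, 4, 2])            # descending
packed = pack_padded_sequence(word_represents, lengths, batch_first=True)
lstm_out, hidden = lstm(packed)
lstm_out, _ = pad_packed_sequence(lstm_out)  # (max_seq_len, batch, 2 * 5)
lstm_out = lstm_out.transpose(0, 1)          # back to batch-first
print(lstm_out.shape)                        # torch.Size([3, 6, 10])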
class entityRelation(nn.Module):
    def __init__(self, args, model_params):
        super(entityRelation, self).__init__()
        print("build network...")
        self.gpu = args.ifgpu
        self.label_size = model_params.label_alphabet.size()
        self.bert_encoder_dim = args.encoder_dim
        self.targetHiddenDim = args.targetHiddenDim
        self.relationHiddenDim = args.relationHiddenDim
        self.relation_num = args.relationNum
        self.drop = args.dropout
        # building the model
        # encoding layer
        self.Embedding = WordEmbedding(args, model_params)
        self.encoder = WordHiddenRep(args, model_params)
        # module linear
        self.u_input_Linear = nn.Linear(self.bert_encoder_dim, self.targetHiddenDim)
        self.r_input_Linear = nn.Linear(self.bert_encoder_dim, self.relationHiddenDim)
        # tag linear
        self.targetHidden2Tag = nn.Linear(self.targetHiddenDim, self.label_size + 2)
        # CRF
        self.crf = CRF(self.label_size, self.gpu)
        # relation
        self.relationAttention = RelationAttention(args)
        # dropout
        self.dropout = nn.Dropout(self.drop)
        if self.gpu:
            self.Embedding = self.Embedding.cuda()
            self.encoder = self.encoder.cuda()
            self.u_input_Linear = self.u_input_Linear.cuda()
            self.r_input_Linear = self.r_input_Linear.cuda()
            self.targetHidden2Tag = self.targetHidden2Tag.cuda()
            self.crf = self.crf.cuda()
            self.relationAttention = self.relationAttention.cuda()
            self.dropout = self.dropout.cuda()

    def neg_log_likelihood_loss(self, all_input_ids, input_length, all_input_mask, all_char_ids, char_length, char_recover, all_relations, all_labels):
        batch_size = all_input_ids.size(0)
        seq_len = all_input_ids.size(1)
        targetPredictScore, R_tensor = self.mainStructure(all_input_ids, input_length, all_input_mask, all_char_ids, char_length, char_recover)
        target_loss = self.crf.neg_log_likelihood_loss(targetPredictScore, all_input_mask.byte(), all_labels) / batch_size
        scores, tag_seq = self.crf._viterbi_decode(targetPredictScore, all_input_mask.byte())
        relationScale = all_relations.transpose(1, 3).contiguous().view(-1, self.relation_num)
        # `size_average=False` is deprecated; reduction='sum' is the equivalent
        relation_loss_function = nn.BCELoss(reduction='sum')
        relationScoreLoss = R_tensor.transpose(1, 3).contiguous().view(-1, self.relation_num)
        relation_loss = relation_loss_function(relationScoreLoss, relationScale.float()) / (batch_size * seq_len)
        return target_loss, relation_loss, tag_seq, R_tensor

    def forward(self, all_input_ids, input_length, all_input_mask, all_char_ids, char_length, char_recover):
        targetPredictScore, R_tensor = self.mainStructure(all_input_ids, input_length, all_input_mask, all_char_ids, char_length, char_recover)
        scores, tag_seq = self.crf._viterbi_decode(targetPredictScore, all_input_mask.byte())
        return tag_seq, R_tensor

    def mainStructure(self, all_input_ids, input_length, all_input_mask, all_char_ids, char_length, char_recover):
        batch_size = all_input_ids.size(0)
        seq_len = all_input_ids.size(1)
        # encoding layer
        wordEmbedding = self.Embedding(all_input_ids, all_char_ids, char_length, char_recover)
        maskEmb = all_input_mask.view(batch_size, seq_len, 1).repeat(1, 1, wordEmbedding.size(2))
        wordEmbedding = wordEmbedding * maskEmb.float()
        sequence_output = self.encoder(wordEmbedding, input_length)
        # module linear
        h_t = self.u_input_Linear(sequence_output)
        h_r = self.r_input_Linear(sequence_output)
        # entity extraction module
        targetPredictInput = self.targetHidden2Tag(self.dropout(h_t))
        # relation detection module
        relationScore = self.relationAttention(self.dropout(h_r))
        return targetPredictInput, relationScore
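# entityRelation flattens its relation tensors with transpose(1, 3) + view
# before the BCE loss so every token-pair/relation cell becomes one row. A toy
# shape walk-through; the (batch, relation_num, seq_len, seq_len) layout is an
# assumption inferred from the transpose pattern above:
import torch
import torch.nn as nn

batch_size, seq_len, relation_num = 2, 3, 4
R_tensor = torch.rand(batch_size, relation_num, seq_len, seq_len)  # predicted probabilities
all_relations = torch.randint(0, 2, (batch_size, relation_num, seq_len, seq_len))
# transpose(1, 3) moves relation_num last; view flattens to one row per token pair
pred = R_tensor.transpose(1, 3).contiguous().view(-1, relation_num)
gold = all_relations.transpose(1, 3).contiguous().view(-1, relation_num).float()
relation_loss = nn.BCELoss(reduction='sum')(pred, gold) / (batch_size * seq_len)
print(pred.shape, relation_loss)  # torch.Size([18, 4]) ...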
class Bilstmcrf(nn.Module):
    """BiLSTM-CRF model."""

    def __init__(self, args, pretrain_word_embedding, label_size):
        super(Bilstmcrf, self).__init__()
        self.use_crf = args.use_crf
        self.use_char = args.use_char
        self.gpu = args.gpu
        self.rnn_hidden_dim = args.rnn_hidden_dim
        self.rnn_type = args.rnn_type
        self.max_seq_length = args.max_seq_length
        self.use_highway = args.use_highway
        self.dropoutlstm = nn.Dropout(args.dropoutlstm)
        self.wordrep = WordRep(args, pretrain_word_embedding)
        # 350 = 300-dim word embedding + 50-dim char features when use_char is on
        input_dim = 350 if self.use_char else 300
        self.lstm = nn.LSTM(input_dim, self.rnn_hidden_dim, num_layers=args.num_layers, batch_first=True, bidirectional=True)
        self.gru = nn.GRU(input_dim, self.rnn_hidden_dim, num_layers=args.num_layers, batch_first=True, bidirectional=True)
        self.label_size = label_size
        if self.use_crf:
            self.crf = CRF(self.label_size, self.gpu)
            self.label_size += 2
        if self.use_highway:
            self.highway = Highway(args.rnn_hidden_dim * 2, 1)
        self.hidden2tag = nn.Linear(args.rnn_hidden_dim * 2, self.label_size)

    def forward(self, word_input, input_mask, labels, char_input=None):
        if self.use_char:
            word_input = self.wordrep(word_input, char_input)
        else:
            word_input = self.wordrep(word_input)
        input_mask.requires_grad = False
        word_input = word_input * input_mask.unsqueeze(-1).float()
        batch_size = word_input.size(0)
        total_length = word_input.size(1)
        word_seq_lengths = input_mask.ge(1).sum(1).cpu().tolist()  # real token count per sequence
        if self.rnn_type == 'LSTM':
            packed_words = pack_padded_sequence(word_input, word_seq_lengths, True, enforce_sorted=False)
            lstm_out, hidden = self.lstm(packed_words)
            output, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=total_length)
        elif self.rnn_type == 'GRU':
            packed_words = pack_padded_sequence(word_input, word_seq_lengths, True, enforce_sorted=False)
            lstm_out, hidden = self.gru(packed_words)
            output, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=total_length)
        if self.use_highway:
            output = self.highway(output)
        output = self.dropoutlstm(output)
        output = self.hidden2tag(output)
        maskk = input_mask.ge(1)
        if self.use_crf:
            total_loss = self.crf.neg_log_likelihood_loss(output, maskk, labels)
            # note: decoding receives input_mask while the loss uses the boolean maskk
            scores, tag_seq = self.crf._viterbi_decode(output, input_mask)
            return total_loss / batch_size, tag_seq
        else:
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            active_loss = input_mask.view(-1) == 1
            active_logits = output.view(-1, self.label_size)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fct(active_logits, active_labels)
            return loss, output

    def calculate_loss(self, word_input, input_mask, labels, char_input=None):
        if self.use_char:
            word_input = self.wordrep(word_input, char_input)
        else:
            word_input = self.wordrep(word_input)
        input_mask.requires_grad = False
        word_input = word_input * input_mask.unsqueeze(-1).float()
        batch_size = word_input.size(0)
        if self.rnn_type == 'LSTM':
            output, _ = self.lstm(word_input)
        elif self.rnn_type == 'GRU':
            output, _ = self.gru(word_input)
        if self.use_highway:
            output = self.highway(output)
        output = self.dropoutlstm(output)
        output = self.hidden2tag(output)
        maskk = input_mask.ge(1)
        if self.use_crf:
            total_loss = self.crf.neg_log_likelihood_loss(output, maskk, labels)
            scores, tag_seq = self.crf._viterbi_decode(output, input_mask)
            return total_loss / batch_size, tag_seq
        else:
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            active_loss = input_mask.view(-1) == 1
            active_logits = output.view(-1, self.label_size)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fct(active_logits, active_labels)
            return loss, output
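# Bilstmcrf's non-CRF branch selects only unpadded ("active") positions by
# boolean indexing before the cross-entropy, instead of relying solely on
# ignore_index. A self-contained sketch of that selection:
import torch
import torch.nn as nn

output = torch.randn(2, 4, 5)                      # (batch, seq_len, label_size)
labels = torch.tensor([[1, 2, 0, 0], [3, 1, 2, 0]])
input_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])
active_loss = input_mask.view(-1) == 1
active_logits = output.view(-1, 5)[active_loss]    # only real tokens remain
active_labels = labels.view(-1)[active_loss]
loss = nn.CrossEntropyLoss(ignore_index=0)(active_logits, active_labels)
print(active_logits.shape, loss)                   # torch.Size([5, 5]) ...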
class SeqModel(nn.Module):
    def __init__(self, data):
        super(SeqModel, self).__init__()
        self.use_crf = data.use_crf
        self.use_trans = data.use_trans
        self.use_mapping = data.use_mapping
        print("build network...")
        print("use_char: ", data.use_char)
        if data.use_char:
            print("char feature extractor: ", data.char_seq_feature)
        print("use_trans: ", data.use_trans)
        print("word feature extractor: ", data.word_feature_extractor)
        print("use crf: ", self.use_crf)
        self.gpu = data.gpu
        self.average_batch = data.average_batch_loss
        # add two more labels for the downlayer lstm; use the original label size for CRF
        label_size = data.label_alphabet_size
        data.label_alphabet_size += 2
        self.word_hidden = WordSequence(data)
        if self.use_crf:
            self.crf = CRF(label_size, self.gpu)

    def neg_log_likelihood_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask, trans_inputs, trans_seq_length, trans_seq_recover):
        outs, w_word_embs, trans_features_wc = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, trans_inputs, trans_seq_length, trans_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        wc_loss = 0
        if self.use_crf:
            total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label)
            scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        else:
            loss_function = nn.NLLLoss(ignore_index=0, reduction='sum')
            outs = outs.view(batch_size * seq_len, -1)
            score = F.log_softmax(outs, 1)
            total_loss = loss_function(score, batch_label.view(batch_size * seq_len))
            _, tag_seq = torch.max(score, 1)
            tag_seq = tag_seq.view(batch_size, seq_len)
        if self.use_trans and self.use_mapping:
            # L2 distance between word embeddings and their mapped translation features
            wc_loss = torch.norm(w_word_embs - trans_features_wc)
        if self.average_batch:
            total_loss = total_loss / batch_size
            if self.use_mapping:
                wc_loss = wc_loss / batch_size
        return total_loss, tag_seq, wc_loss

    def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, trans_inputs, trans_seq_length, trans_seq_recover):
        # outs (after the hidden layer): [batch_size, seq_len, label_size]
        outs, w_word_embs, trans_features_wc = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, trans_inputs, trans_seq_length, trans_seq_recover)
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        if self.use_crf:
            scores, tag_seq = self.crf._viterbi_decode(outs, mask)
        else:
            outs = outs.view(batch_size * seq_len, -1)  # [batch_size * seq_len, label_size]
            _, tag_seq = torch.max(outs, 1)             # values range from 0 to label_size - 1
            tag_seq = tag_seq.view(batch_size, seq_len)
            # filter padded positions with zero
            tag_seq = mask.long() * tag_seq
        return tag_seq  # [batch_size, seq_len]; padded positions are zero

    # def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
    #     return self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
    def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, nbest, trans_inputs, trans_seq_length, trans_seq_recover):
        if not self.use_crf:
            print("Nbest output is currently supported only for CRF! Exit...")
            exit(0)
        outs, w_word_embs, trans_features_wc = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, trans_inputs, trans_seq_length, trans_seq_recover)
        scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest)
        return scores, tag_seq

    def decode_output_intermediate_result(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, trans_inputs, trans_seq_length, trans_seq_recover):
        outs, w_word_embs, trans_features_wc = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, trans_inputs, trans_seq_length, trans_seq_recover)
        return outs, self.crf.transitions
class SeqModel(nn.Module): def __init__(self, data): super(SeqModel, self).__init__() self.use_crf = data.use_crf print "build network..." print "use_char: ", data.use_char if data.use_char: print "char feature extractor: ", data.char_feature_extractor print "word feature extractor: ", data.word_feature_extractor print "use crf: ", self.use_crf self.gpu = data.HP_gpu self.average_batch = data.average_batch_loss ## add two more label for downlayer lstm, use original label size for CRF label_size = data.label_alphabet_size # data.label_alphabet_size += 2 # self.word_hidden = WordSequence(data, False, True, data.use_char) # The linear layer that maps from hidden state space to tag space self.hidden2tag = nn.Linear(data.HP_hidden_dim, label_size + 2) if self.use_crf: self.crf = CRF(label_size, self.gpu) if torch.cuda.is_available(): self.hidden2tag = self.hidden2tag.cuda(self.gpu) self.frozen = False # def neg_log_likelihood_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask): # outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, None, None) def neg_log_likelihood_loss(self, hidden, hidden_adv, batch_label, mask): if hidden_adv is not None: hidden = (hidden + hidden_adv) outs = self.hidden2tag(hidden) batch_size = hidden.size(0) seq_len = hidden.size(1) if self.use_crf: total_loss = self.crf.neg_log_likelihood_loss( outs, mask, batch_label) scores, tag_seq = self.crf._viterbi_decode(outs, mask) else: loss_function = nn.NLLLoss(ignore_index=0, size_average=False) outs = outs.view(batch_size * seq_len, -1) score = F.log_softmax(outs, 1) total_loss = loss_function(score, batch_label.view(batch_size * seq_len)) _, tag_seq = torch.max(score, 1) tag_seq = tag_seq.view(batch_size, seq_len) if self.average_batch: total_loss = total_loss / batch_size return total_loss, tag_seq def forward(self, hidden, mask): outs = self.hidden2tag(hidden) batch_size = hidden.size(0) seq_len = hidden.size(1) if self.use_crf: scores, tag_seq = self.crf._viterbi_decode(outs, mask) else: outs = outs.view(batch_size * seq_len, -1) _, tag_seq = torch.max(outs, 1) tag_seq = tag_seq.view(batch_size, seq_len) ## filter padded position with zero tag_seq = mask.long() * tag_seq return tag_seq # def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): # return self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) def decode_nbest(self, hidden, mask, nbest): if not self.use_crf: print "Nbest output is currently supported only for CRF! Exit..." exit(0) outs = self.hidden2tag(hidden) batch_size = hidden.size(0) seq_len = hidden.size(1) scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest) return scores, tag_seq def freeze_net(self): if self.frozen: return self.frozen = True for p in self.parameters(): p.requires_grad = False def unfreeze_net(self): if not self.frozen: return self.frozen = False for p in self.parameters(): p.requires_grad = True
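# freeze_net / unfreeze_net above toggle requires_grad on every parameter so
# the tagger can be held fixed while other components train. A minimal
# stand-alone illustration of the same mechanism:
import torch.nn as nn

net = nn.Linear(4, 3)
for p in net.parameters():
    p.requires_grad = False  # frozen: no gradients accumulate
print(any(p.requires_grad for p in net.parameters()))  # False
for p in net.parameters():
    p.requires_grad = True   # unfrozen again
print(all(p.requires_grad for p in net.parameters()))  # True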