Example #1
class NERBert(BertPreTrainedModel):
    def __init__(self, config, args):
        super(NERBert, self).__init__(config)
        self.args = args
        self.bert = BertModel(config)
        if self.args.weighted:
            self.weight = nn.Parameter(torch.Tensor(self.args.num_layers))
            self.weight.data.uniform_(0.1, 0.9)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.args.num_labels)
        if self.args.use_crf:
            self.crf = ConditionalRandomField(self.args.num_labels)

        self.apply(self.init_bert_weights)

    def forward(self, input_ids, labels=None):
        attention_mask = input_ids.gt(0)
        encoded_layers, _ = self.bert(input_ids,
                                      None,
                                      attention_mask,
                                      output_all_encoded_layers=True)
        if not self.args.weighted:
            sequence_output = encoded_layers[-1]
        else:
            # reshape the last num_layers encoder outputs into
            # [batch, seq_len, hidden, num_layers] and mix them with a
            # softmax-normalized learned weight per layer
            last_layers = torch.cat(
                encoded_layers[-self.args.num_layers:],
                dim=-1).view(encoded_layers[0].size(0),
                             encoded_layers[0].size(1),
                             encoded_layers[0].size(2),
                             self.args.num_layers)
            soft_weight = F.softmax(self.weight, dim=0)
            sequence_output = torch.matmul(last_layers, soft_weight)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        if not self.args.use_crf:
            if labels is not None:
                loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
                # Only keep active parts of the loss
                loss = loss_fct(logits.view(-1, self.args.num_labels),
                                labels.view(-1))
                return logits, loss
            else:
                return logits
        else:
            if labels is not None:
                # Only keep active parts of the loss
                if attention_mask is not None:
                    total_loss = self.crf(logits, labels, attention_mask)
                    return torch.mean(total_loss)
            else:
                max_len = logits.shape[1]

                tag_seq = self.crf.viterbi_decode(logits, attention_mask)
                for pred in tag_seq:
                    if len(pred) < max_len:
                        pred += [0] * (max_len - len(pred))
                return tag_seq
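
A note on the weighted branch above: the last num_layers encoder outputs are reshaped to [batch, seq_len, hidden, num_layers] and mixed with a softmax-normalized weight vector via matmul. A minimal standalone sketch of that mixing step, with made-up tensor sizes instead of real BERT outputs (torch.stack gives the per-layer layout directly):

import torch
import torch.nn.functional as F

# hypothetical sizes, not taken from the example above
batch_size, seq_len, hidden_size, num_layers = 2, 5, 8, 4

# stand-ins for the last four encoder layers, each [batch, seq_len, hidden]
encoded_layers = [torch.randn(batch_size, seq_len, hidden_size) for _ in range(num_layers)]
weight = torch.randn(num_layers)

# [batch, seq_len, hidden, num_layers]
last_layers = torch.stack(encoded_layers, dim=-1)
soft_weight = F.softmax(weight, dim=0)

# matmul contracts the trailing num_layers axis -> [batch, seq_len, hidden]
sequence_output = torch.matmul(last_layers, soft_weight)
print(sequence_output.shape)  # torch.Size([2, 5, 8])
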
Example #2
class NERModel(nn.Module):

    def __init__(self, args, word_emb_matrix=None):
        super(NERModel, self).__init__()
        self.args = args

        if word_emb_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.tensor(word_emb_matrix, dtype=torch.float))
            self.embedding.weight.requires_grad = args.trainable_embedding
        else:
            self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)
            self.embedding.weight.requires_grad = True
        if args.model == 'cnn':
            self.encoder = CNNEncoder(args)
        elif args.model == 'rnn':
            self.encoder = DynamicRNN(args.embedding_dim, args.hidden_dim, bidirectional=True)

        self.linear = nn.Linear(args.hidden_dim*2, args.num_labels)
        self.dropout = nn.Dropout(0.2)
        if self.args.use_crf:
            self.crf = ConditionalRandomField(args.num_labels)

    def forward(self, input_ids, labels=None):
        attention_mask = input_ids.gt(0)
        inputs = self.embedding(input_ids)
        if self.args.model == 'cnn':
            rep = self.encoder(inputs)
        elif self.args.model == 'rnn':
            x_len = torch.sum(input_ids != 0, dim=1)
            rep, _ = self.encoder(inputs, x_len)
        logits = self.linear(self.dropout(rep))

        if not self.args.use_crf:
            if labels is not None:
                loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
                # Only keep active parts of the loss
                loss = loss_fct(logits.view(-1, self.args.num_labels), labels.view(-1))
                return logits, loss
            else:
                return logits
        else:
            if labels is not None:
                # Only keep active parts of the loss
                if attention_mask is not None:
                    total_loss = self.crf(logits, labels, attention_mask)
                    # 0 stands in for the logits so both branches return a 2-tuple
                    return 0, torch.mean(total_loss)
            else:
                max_len = logits.shape[1]

                tag_seq = self.crf.viterbi_decode(logits, attention_mask)
                for pred in tag_seq:
                    if len(pred) < max_len:
                        pred += [0] * (max_len - len(pred))
                return tag_seq
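
Both examples build the mask straight from the padded input ids (input_ids.gt(0)) and rely on ignore_index=-1 so that padded positions never contribute to the token-level cross-entropy. A small self-contained sketch of that convention, with invented ids and labels:

import torch
import torch.nn as nn

num_labels = 3
# two sentences padded with 0; labels carry -1 on the padded positions
input_ids = torch.tensor([[5, 7, 9, 0, 0],
                          [4, 2, 0, 0, 0]])
labels = torch.tensor([[1, 0, 2, -1, -1],
                       [2, 1, -1, -1, -1]])

attention_mask = input_ids.gt(0)          # True on real tokens
x_len = torch.sum(input_ids != 0, dim=1)  # lengths fed to the RNN encoder
print(x_len)                              # tensor([3, 2])

# dummy logits standing in for the classifier output
logits = torch.randn(2, 5, num_labels)
loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
print(loss)  # the -1 positions are ignored
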
Example #3
class BiLSTM_CRF(nn.Module):
    """
    别名::class:`fastNLP.models.AdvSeqLabel`  :class:`fastNLP.models.sequence_labeling.AdvSeqLabel`

    更复杂的Sequence Labelling模型。结构为Embedding, LayerNorm, 双向LSTM(两层),FC,LayerNorm,DropOut,FC,CRF。
    
    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int),
        第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding
    :param int hidden_size: LSTM的隐层大小
    :param int num_classes: 有多少个类
    :param float dropout: LSTM中以及DropOut层的drop概率
    :param dict id2words: tag id转为其tag word的表。用于在CRF解码时防止解出非法的顺序,比如'BMES'这个标签规范中,'S'
        不能出现在'B'之后。这里也支持类似与'B-NN',即'-'前为标签类型的指示,后面为具体的tag的情况。这里不但会保证
        'B-NN'后面不为'S-NN'还会保证'B-NN'后面不会出现'M-xx'(任何非'M-NN'和'E-NN'的情况。)
    :param str encoding_type: 支持"BIO", "BMES", "BEMSO", 只有在id2words不为None的情况有用。
    """
    
    def __init__(self, char_init_embed, word_init_embed, pos_init_embed, spo_embed_dim, sentence_length, 
        hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bieso', weight=None):
        
        super().__init__()
        
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        # initialize the word embedding from the pretrained word2vec matrix
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim
        # sentence length
        #self.sen_len = sentence_length
        #self.zeros = torch.zeros(self.sen_len, dtype=torch.long)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = nn.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
                            dropout=dropout, bidirectional=True, batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)
        
        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes, include_start_end_trans=False,
                            allowed_transitions=allowed_transitions(id2words, encoding_type=encoding_type))
    
    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq
    
    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor

        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)
    
    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor words: [batch_size, mex_len]
        :param torch.LongTensor seq_len:[batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len]
        :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                   If truth is not None, return loss, a scalar. Used in training.
        """
        
        char = char.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        # broadcast the sentence-level spo feature to every token position
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        x = torch.cat((char, word, pos, spo), dim=2)

        x = self.norm1(x)
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim ]
        
#         x, _ = self.Rnn(x, seq_len=seq_len)
        x, _ = self.Rnn(x)
        
        x = self.Linear1(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.Linear2(x)
        if tag is not None:
            return self._internal_loss(x, tag)
        else:
            return self._decode(x)

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        
        :param torch.LongTensor words: [batch_size, mex_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len], 目标
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        
        :param torch.LongTensor words: [batch_size, mex_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
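
The CRF mask built in _make_mask comes from fastNLP's seq_len_to_mask helper. For readers without fastNLP at hand, a minimal equivalent (my own sketch, not the library implementation) looks like this:

import torch

def seq_len_to_mask_sketch(seq_len, max_len=None):
    """Turn lengths [batch_size] into a boolean mask [batch_size, max_len]."""
    if max_len is None:
        max_len = int(seq_len.max())
    positions = torch.arange(max_len, device=seq_len.device)  # [max_len]
    return positions.unsqueeze(0) < seq_len.unsqueeze(1)       # broadcasted compare

seq_len = torch.tensor([3, 5, 1])
print(seq_len_to_mask_sketch(seq_len, max_len=5))
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True],
#         [ True, False, False, False, False]])
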
Example #4
class Lattice_Transformer_SeqLabel(nn.Module):
    def __init__(self,
                 lattice_weight,
                 lattice_num,
                 lattice_dim,
                 bigram_weight,
                 bigram_num,
                 bigram_dim,
                 hidden_size,
                 label_size,
                 num_heads,
                 num_layers,
                 learnable_position,
                 layer_preprocess_sequence,
                 layer_postprocess_sequence,
                 ff_size=-1,
                 dropout=None,
                 max_seq_len=-1):
        super().__init__()
        self.lattice_embed = nn.Embedding(lattice_num, lattice_dim)
        self.lattice_embed.weight.data.copy_(torch.from_numpy(lattice_weight))
        self.bigram_embed = nn.Embedding(bigram_num, bigram_dim)
        self.bigram_embed.weight.data.copy_(torch.from_numpy(bigram_weight))

        pe_ss = nn.Parameter(get_pos_embedding(max_seq_len,
                                               hidden_size,
                                               rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_se = nn.Parameter(get_pos_embedding(max_seq_len,
                                               hidden_size,
                                               rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_es = nn.Parameter(get_pos_embedding(max_seq_len,
                                               hidden_size,
                                               rel_pos_init=0),
                             requires_grad=learnable_position)
        pe_ee = nn.Parameter(get_pos_embedding(max_seq_len,
                                               hidden_size,
                                               rel_pos_init=0),
                             requires_grad=learnable_position)

        # self.bigram_size = self.bigram_embed.embedding.weight.size(1)
        # char_input_size = self.lattice_embed.embedding.weight.size(1) + self.bigram_embed.embedding.weight.size(1)
        # lex_input_size = self.lattice_embed.embedding.weight.size(1)

        self.bigram_size = bigram_dim
        char_input_size = bigram_dim + lattice_dim
        lex_input_size = lattice_dim

        self.embed_dropout = nn.Dropout(p=dropout['embed'])
        self.gaz_dropout = nn.Dropout(p=dropout['gaz'])
        self.output_dropout = nn.Dropout(p=dropout['output'])

        self.char_proj = nn.Linear(char_input_size, hidden_size)
        self.lex_proj = nn.Linear(lex_input_size, hidden_size)

        self.encoder = Transformer_Encoder(
            hidden_size,
            num_heads,
            num_layers,
            learnable_position=learnable_position,
            layer_preprocess_sequence=layer_preprocess_sequence,
            layer_postprocess_sequence=layer_postprocess_sequence,
            dropout=dropout,
            ff_size=ff_size,
            max_seq_len=max_seq_len,
            pe_ss=pe_ss,
            pe_se=pe_se,
            pe_es=pe_es,
            pe_ee=pe_ee)
        self.output = nn.Linear(hidden_size, label_size)
        self.crf = ConditionalRandomField(label_size,
                                          include_start_end_trans=True)
        # self.crf.trans_m = nn.Parameter(torch.zeros(size=[label_size, label_size], requires_grad=True))
        self.loss_func = nn.CrossEntropyLoss(ignore_index=-100)

    # used at training time
    # TODO: parameter types
    def forward(self, lattice: torch.Tensor, bigrams: torch.Tensor,
                seq_len: torch.Tensor, lex_num: torch.Tensor,
                pos_s: torch.Tensor, pos_e: torch.Tensor,
                target: Optional[torch.Tensor]):
        batch_size = lattice.size(0)
        max_seq_len_and_lex_num = lattice.size(1)
        max_seq_len = bigrams.size(1)

        raw_embed = self.lattice_embed(lattice)
        bigrams_embed = self.bigram_embed(bigrams)
        # zero-pad the bigram embeddings so they line up with the lattice
        # sequence (characters followed by matched lexicon words)
        bigram_pad = torch.zeros(batch_size,
                                 max_seq_len_and_lex_num - max_seq_len,
                                 self.bigram_size).to(bigrams_embed)
        bigrams_embed = torch.cat([bigrams_embed, bigram_pad], dim=1)
        raw_embed_char = torch.cat([raw_embed, bigrams_embed], dim=-1)

        raw_embed_char = self.embed_dropout(raw_embed_char)
        raw_embed = self.gaz_dropout(raw_embed)

        embed_char = self.char_proj(raw_embed_char)
        char_mask = seq_len_to_mask(seq_len, max_len=max_seq_len_and_lex_num)
        embed_char.masked_fill_(~(char_mask.unsqueeze(-1)), 0)

        embed_lex = self.lex_proj(raw_embed)
        lex_mask = (seq_len_to_mask(seq_len + lex_num) ^ char_mask)
        embed_lex.masked_fill_(~(lex_mask).unsqueeze(-1), 0)

        embedding = embed_char + embed_lex
        encoded = self.encoder(embedding,
                               seq_len,
                               lex_num=lex_num,
                               pos_s=pos_s,
                               pos_e=pos_e)
        encoded = self.output_dropout(encoded)

        # keep only the character part of the transformer output
        encoded = encoded[:, :max_seq_len, :]
        pred = self.output(encoded)
        mask = seq_len_to_mask(seq_len)

        # for script use
        # pred, path = self.crf.viterbi_decode(pred, mask)
        # return pred

        if self.training:
            loss = self.crf(pred, target, mask).mean(dim=0)
            return {'loss': loss}
        else:
            pred, path = self.crf.viterbi_decode(pred, mask)
            result = {'pred': pred}
            return result
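
In the forward pass above the lattice sequence holds the characters first and the matched lexicon words after them, so char_mask covers the first seq_len positions and the lexicon mask falls out of an XOR with a mask of length seq_len + lex_num. A small sketch of that split with made-up lengths, reusing the seq_len_to_mask sketch from Example #3:

import torch

def seq_len_to_mask_sketch(seq_len, max_len=None):
    if max_len is None:
        max_len = int(seq_len.max())
    return torch.arange(max_len).unsqueeze(0) < seq_len.unsqueeze(1)

seq_len = torch.tensor([4, 3])   # characters per sentence
lex_num = torch.tensor([2, 1])   # lexicon words appended after the characters
max_total = 6                    # padded lattice length (chars + lexicon words)

char_mask = seq_len_to_mask_sketch(seq_len, max_len=max_total)
lex_mask = seq_len_to_mask_sketch(seq_len + lex_num, max_len=max_total) ^ char_mask

print(char_mask.int())  # 1s on character positions only
print(lex_mask.int())   # 1s on the appended lexicon-word positions only
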
Example #5
class Transformer_CRF(nn.Module):
    def __init__(self,
                 char_init_embed,
                 word_init_embed,
                 pos_init_embed,
                 spo_embed_dim,
                 num_classes,
                 num_layers,
                 inner_size,
                 key_size,
                 value_size,
                 num_head,
                 dropout=0.1,
                 id2words=None,
                 encoding_type='bieso',
                 weight=None):
        super().__init__()

        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        # initialize the word embedding from the pretrained word2vec matrix
        self.word_embed.weight.data.copy_(torch.from_numpy(weight))
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.transformer = encoder.TransformerEncoder(
            num_layers=num_layers,
            model_size=self.embed_dim,
            inner_size=inner_size,
            key_size=key_size,
            value_size=value_size,
            num_head=num_head,
            dropout=dropout)
        self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
        self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
        self.Linear = nn.Linear(self.embed_dim, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes,
                           include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(
                               id2words, encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor

        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor words: [batch_size, mex_len]
        :param torch.LongTensor seq_len:[batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len]
        :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                   If truth is not None, return loss, a scalar. Used in training.
        """

        char = char.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)
        tag = tag.long() if tag is not None else None

        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        # broadcast the sentence-level spo feature to every token position
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        x = torch.cat((char, word, pos, spo), dim=2)

        x = self.norm1(x)
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim ]

        x = self.transformer(x, seq_mask=self.mask)

        #x = self.Linear1(x)
        #x = self.norm2(x)
        #x = self.relu(x)
        #x = self.drop(x)
        #x = self.Linear2(x)
        x = self.Linear(x)
        if tag is not None:
            return self._internal_loss(x, tag)
        else:
            return self._decode(x)
        #return {"pred": self._decode(x)}

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        
        :param torch.LongTensor words: [batch_size, mex_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len], 目标
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        
        :param torch.LongTensor words: [batch_size, mex_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
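
Example #3 and Example #5 both attach a single sentence-level spo vector to every token via unsqueeze(1).repeat(...) before concatenation. A tiny sketch of that broadcasting step with invented dimensions:

import torch

batch_size, max_len, spo_embed_dim, char_dim = 2, 4, 3, 5

char = torch.randn(batch_size, max_len, char_dim)  # token-level features
spo = torch.randn(batch_size, spo_embed_dim)       # one vector per sentence

# repeat the sentence vector along the token axis: [batch, max_len, spo_embed_dim]
spo_tok = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()

x = torch.cat((char, spo_tok), dim=2)
print(x.shape)  # torch.Size([2, 4, 8])

# expand() is a memory-free alternative when the repeated tensor is only read
spo_view = spo.unsqueeze(1).expand(-1, char.shape[1], -1)
print(torch.equal(spo_tok, spo_view.float()))  # True
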