Example 1
    def __init__(self,
                 config,
                 data_bundle,
                 embed,
                 num_layers,
                 d_model,
                 n_head,
                 feedforward_dim,
                 dropout,
                 after_norm=True,
                 attn_type='adatrans',
                 bi_embed=None,
                 fc_dropout=0.3,
                 pos_embed=None,
                 scale=False,
                 dropout_attn=None):
        """

        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of head
        :param feedforward_dim: the dimension of ffn
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: adatrans, naive
        :param rel_pos_embed: type of position embedding; supports sin, fix, None. May be None when relative attention is used
        :param bi_embed: Used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        """
        super().__init__()
        self.config = config
        self.data_bundle = data_bundle
        tag_vocab = data_bundle.get_vocab('target')
        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)

        self.transformer = TransformerEncoder(num_layers,
                                              d_model,
                                              n_head,
                                              feedforward_dim,
                                              dropout,
                                              after_norm=after_norm,
                                              attn_type=attn_type,
                                              scale=scale,
                                              dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.out_fc = nn.Linear(d_model, len(tag_vocab))
        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
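Example 1 only shows the constructor. As a hedged reconstruction (not the project's actual code), the modules it builds are typically composed the way the complete examples further down this page do (e.g. Example 9): embedding, optional bigram concatenation, in_fc, transformer, dropout, out_fc, log-softmax, then the CRF for the loss or for Viterbi decoding. The method name `_forward` and the padding index 0 are assumptions, and the sketch presumes torch and torch.nn.functional as F are imported as elsewhere on this page.

    def _forward(self, chars, target=None, bigrams=None):
        # Hedged sketch of a typical forward pass for the constructor above.
        mask = chars.ne(0)                      # padding index assumed to be 0
        x = self.embed(chars)
        if self.bi_embed is not None:
            x = torch.cat([x, self.bi_embed(bigrams)], dim=-1)
        x = self.in_fc(x)
        x = self.transformer(x, mask)
        x = self.fc_dropout(x)
        x = self.out_fc(x)
        logits = F.log_softmax(x, dim=-1)
        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}
        return {'loss': self.crf(logits, target, mask)}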
Example 2
    def __init__(self,
                 char_embed,
                 hidden_size,
                 num_layers,
                 target_vocab=None,
                 bigram_embed=None,
                 trigram_embed=None,
                 dropout=0.5):
        super().__init__()

        embed_size = char_embed.embed_size
        self.char_embed = char_embed
        if bigram_embed:
            embed_size += bigram_embed.embed_size
        self.bigram_embed = bigram_embed
        if trigram_embed:
            embed_size += trigram_embed.embed_size
        self.trigram_embed = trigram_embed

        self.lstm = LSTM(embed_size,
                         hidden_size=hidden_size // 2,
                         bidirectional=True,
                         batch_first=True,
                         num_layers=num_layers)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, len(target_vocab))

        transitions = None
        if target_vocab:
            transitions = allowed_transitions(target_vocab,
                                              include_start_end=True,
                                              encoding_type='bmes')

        self.crf = ConditionalRandomField(num_tags=len(target_vocab),
                                          allowed_transitions=transitions)
Example 3
    def __init__(self,
                 embed,
                 hidden_size,
                 num_layers,
                 tag_vocab,
                 dropout=0.5,
                 encoding_type='bioes'):
        super().__init__()
        self.embedding = embed
        self.lstm = LSTM(input_size=self.embedding.embedding_dim,
                         hidden_size=hidden_size // 2,
                         num_layers=num_layers,
                         bidirectional=True,
                         batch_first=True)
        self.fc = nn.Linear(hidden_size, len(tag_vocab))

        transitions = allowed_transitions(tag_vocab.idx2word,
                                          encoding_type=encoding_type,
                                          include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=transitions)

        self.dropout = nn.Dropout(dropout, inplace=True)

        for name, param in self.named_parameters():
            if 'fc' in name:
                if param.data.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.constant_(param, 0)
            if 'crf' in name:
                nn.init.zeros_(param)
Example 4
def get_crf_zero_init(label_size, include_start_end_trans=False,
                      allowed_transitions=None, initial_method=None):

    crf = ConditionalRandomField(label_size, include_start_end_trans)

    crf.trans_m = nn.Parameter(torch.zeros(size=[label_size, label_size], requires_grad=True))
    if crf.include_start_end_trans:
        crf.start_scores = nn.Parameter(torch.zeros(size=[label_size], requires_grad=True))
        crf.end_scores = nn.Parameter(torch.zeros(size=[label_size], requires_grad=True))
    return crf
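A hedged usage sketch for the helper above (label_size=5 is an arbitrary example value):

crf = get_crf_zero_init(label_size=5, include_start_end_trans=True)
# crf.trans_m, crf.start_scores and crf.end_scores now all start at zero,
# so every transition score is learned from scratch during training.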
Example 5
 def __init__(self, embed, tag_vocab, encoding_type='bio'):
     super().__init__()
     self.embed = embed
     self.fc = nn.Linear(self.embed.embed_size, len(tag_vocab))
     trans = allowed_transitions(tag_vocab,
                                 encoding_type=encoding_type,
                                 include_start_end=True)
     self.crf = ConditionalRandomField(len(tag_vocab),
                                       include_start_end_trans=True,
                                       allowed_transitions=trans)
Example 6
File: CMG.py Project: Tannidy/CMGN
    def __init__(self,
                 tag_vocab,
                 embed,
                 d_model,
                 n_heads,
                 d_k,
                 d_v,
                 n_layers,
                 d_label=10,
                 fc_dropout=0.3,
                 dropout=0.15,
                 gpu=0,
                 pos_embed=None,
                 scale=False):
        """

        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of head
        :param feedforward_dim: the dimension of ffn
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: adatrans, naive
        :param rel_pos_embed: type of position embedding; supports sin, fix, None. May be None when relative attention is used
        :param bi_embed: Used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        """
        super().__init__()

        self.embed = embed
        embed_size = self.embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)
        self.encoder = Encoder(d_model,
                               n_heads,
                               d_k,
                               d_v,
                               n_layers,
                               d_label,
                               dropout,
                               feedforward_dim=int(2 * d_model))

        self.fc_dropout = nn.Dropout(fc_dropout)
        self.out_fc = nn.Linear(d_model, len(tag_vocab))

        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
Example 7
    def __init__(self,
                 char_embed,
                 bigram_embed,
                 word_embed,
                 hidden_size,
                 label_size,
                 bias=True,
                 bidirectional=False,
                 device=None,
                 embed_dropout=0,
                 output_dropout=0,
                 use_bigram=True):

        if device is None:
            self.device = torch.device('cpu')
        else:
            self.device = torch.device(device)
        super().__init__()
        self.char_embed_size = char_embed.embedding.weight.size(1)
        self.bigram_embed_size = bigram_embed.embedding.weight.size(1)
        self.word_embed_size = word_embed.embedding.weight.size(1)
        self.hidden_size = hidden_size
        self.label_size = label_size
        self.bidirectional = bidirectional
        self.use_bigram = use_bigram

        self.char_embed = char_embed
        self.bigram_embed = bigram_embed
        self.word_embed = word_embed

        if self.use_bigram:
            self.input_size = self.char_embed_size + self.bigram_embed_size
        else:
            self.input_size = self.char_embed_size

        self.encoder = LSTM(self.input_size,
                            self.hidden_size,
                            bidirectional=self.bidirectional)

        better_init_rnn(self.encoder.lstm)

        self.output = nn.Linear(
            self.hidden_size * (2 if self.bidirectional else 1),
            self.label_size)

        self.debug = False
        self.loss_func = nn.CrossEntropyLoss()
        self.embed_dropout = nn.Dropout(embed_dropout)
        self.output_dropout = nn.Dropout(output_dropout)
        self.crf = ConditionalRandomField(label_size, True)
Example 8
    def __init__(self,
                 char_init_embed,
                 word_init_embed,
                 pos_init_embed,
                 spo_embed_dim,
                 num_classes,
                 num_layers,
                 inner_size,
                 key_size,
                 value_size,
                 num_head,
                 dropout=0.1,
                 id2words=None,
                 encoding_type='bieso'):
        super().__init__()

        # self.Embedding = nn.Embedding(init_embed)
        #print(char_init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        # self.Rnn = encoder.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
        #                     dropout=dropout, bidirectional=True, batch_first=True)
        self.transformer = encoder.TransformerEncoder(
            num_layers=num_layers,
            model_size=self.embed_dim,
            inner_size=inner_size,
            key_size=key_size,
            value_size=value_size,
            num_head=num_head,
            dropout=dropout)
        self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
        self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
        self.Linear = nn.Linear(self.embed_dim, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes,
                           include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(
                               id2words, encoding_type=encoding_type))
Example 9
class BertCRF(nn.Module):
    def __init__(self, embed, tag_vocab, encoding_type='bio'):
        super().__init__()
        self.embed = embed
        self.fc = nn.Linear(self.embed.embed_size, len(tag_vocab))
        trans = allowed_transitions(tag_vocab,
                                    encoding_type=encoding_type,
                                    include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self, words, target):
        mask = words.ne(0)
        words = self.embed(words)
        words = self.fc(words)
        logits = F.log_softmax(words, dim=-1)
        if target is not None:
            loss = self.crf(logits, target, mask)
            return {'loss': loss}
        else:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}

    def forward(self, words, target):
        return self._forward(words, target)

    def predict(self, words):
        return self._forward(words, None)
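A minimal, hypothetical usage sketch for the BertCRF model above; the embedding object, tag vocabulary and tensors are stand-ins rather than values from any project on this page. fastNLP's CRF returns one loss per sequence, hence the mean before backward.

import torch

model = BertCRF(embed=bert_embed, tag_vocab=tag_vocab, encoding_type='bio')  # stand-in objects

words = torch.tensor([[2, 15, 27, 3, 0, 0]])   # padded token ids; 0 is the padding index
tags = torch.tensor([[0, 1, 2, 0, 0, 0]])      # gold tag ids, same shape as words

loss = model(words, tags)['loss']              # per-sequence CRF negative log-likelihood
loss.mean().backward()

preds = model.predict(words)['pred']           # Viterbi-decoded tag ids, shape [1, 6]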
Example 10
    def __init__(self, tag_vocabs, embed, num_layers, d_model, n_head, feedforward_dim, dropout,
                 after_norm=True, attn_type='adatrans', bi_embed=None,
                 fc_dropout=0.3, pos_embed=None, scale=False, dropout_attn=None):

        super().__init__()

        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.tag_vocabs = []
        self.out_fcs = nn.ModuleList()
        self.crfs = nn.ModuleList()

        for i in range(len(tag_vocabs)):
            self.tag_vocabs.append(tag_vocabs[i])
            out_fc = nn.Linear(1536, len(tag_vocabs[i]))
            self.out_fcs.append(out_fc)
            trans = allowed_transitions(
                tag_vocabs[i], encoding_type='bioes', include_start_end=True)
            crf = ConditionalRandomField(
                len(tag_vocabs[i]), include_start_end_trans=True, allowed_transitions=trans)
            self.crfs.append(crf)

        self.in_fc = nn.Linear(embed_size, d_model)

        self.transformer = TransformerEncoder(num_layers, d_model, n_head, feedforward_dim, dropout,
                                              after_norm=after_norm, attn_type=attn_type,
                                              scale=scale, dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)

        self.fc_dropout = nn.Dropout(fc_dropout)
Example 11
    def __init__(self,
                 char_embed,
                 num_classes,
                 bigram_embed=None,
                 trigram_embed=None,
                 num_layers=1,
                 hidden_size=100,
                 dropout=0.5,
                 target_vocab=None,
                 encoding_type=None):
        super().__init__()

        self.char_embed = get_embeddings(char_embed)
        embed_size = self.char_embed.embedding_dim
        if bigram_embed:
            self.bigram_embed = get_embeddings(bigram_embed)
            embed_size += self.bigram_embed.embedding_dim
        if trigram_embed:
            self.trigram_embed = get_embeddings(trigram_embed)
            embed_size += self.trigram_embed.embedding_dim

        if num_layers > 1:
            self.lstm = LSTM(embed_size,
                             num_layers=num_layers,
                             hidden_size=hidden_size // 2,
                             bidirectional=True,
                             batch_first=True,
                             dropout=dropout)
        else:
            self.lstm = LSTM(embed_size,
                             num_layers=num_layers,
                             hidden_size=hidden_size // 2,
                             bidirectional=True,
                             batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

        trans = None
        if target_vocab is not None and encoding_type is not None:
            trans = allowed_transitions(target_vocab.idx2word,
                                        encoding_type=encoding_type,
                                        include_start_end=True)

        self.crf = ConditionalRandomField(num_classes,
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
Example 12
class TransformerCWS(nn.Module):
    def __init__(self, vocab_num, max_len, embed_dim=100, bigram_vocab_num=None, 
                 bigram_embed_dim=100, num_bigram_per_char=None,
                 hidden_size=200, embed_drop_p=0.3, num_layers=2, num_heads=6, tag_size=4):
        super().__init__()

        input_size = embed_dim
        if bigram_vocab_num:
            self.bigram_embedding = nn.Embedding(bigram_vocab_num, bigram_embed_dim)
            input_size += num_bigram_per_char*bigram_embed_dim

        self.drop = nn.Dropout(embed_drop_p, inplace=True)

        self.fc1 = nn.Linear(input_size, hidden_size)

        self.transformer = StarTransEnc(nn.Embedding(vocab_num, embed_dim), num_layers=num_layers, 
                                        hidden_size=hidden_size, num_head=num_heads, head_dim=32,
                                        emb_dropout=0.3, dropout=0.1, max_len=max_len)
        self.fc2 = nn.Linear(hidden_size, tag_size)

        # allowed_trans = allowed_transitions({0:'b', 1:'m', 2:'e', 3:'s'}, encoding_type='bmes')
        allowed_trans = None
        self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
                                          allowed_transitions=allowed_trans)

    def forward(self, chars, target, seq_lens, bigrams=None):
        masks = seq_len_to_mask(seq_lens)
        # StarTransEnc owns the character embedding, so it is fed the raw ids; the bigram
        # concatenation from the non-star variant of this model referenced an undefined `x`
        # here and is omitted (the bigram embedding built in __init__ is unused on this path).
        feats, _ = self.transformer(chars, masks)
        feats = self.fc2(feats)
        losses = self.crf(feats, target, masks.float())

        pred_dict = {}
        pred_dict['seq_lens'] = seq_lens
        pred_dict['loss'] = torch.mean(losses)

        return pred_dict

    def predict(self, chars, seq_lens, bigrams=None):
        masks = seq_len_to_mask(seq_lens)
        # Same path as forward(): the star-transformer encoder embeds the raw character
        # ids itself and returns (nodes, relay); only the node features are scored.
        feats, _ = self.transformer(chars, masks)
        feats = self.fc2(feats)

        probs = self.crf.viterbi_decode(feats, masks, get_score=False)

        return {'pred': probs, 'seq_lens':seq_lens}
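For reference, a hedged illustration of the seq_len_to_mask helper this example (and several others on this page) relies on; the exact import path and the mask dtype (bool vs. uint8) vary across fastNLP versions.

import torch
from fastNLP import seq_len_to_mask  # in older versions: fastNLP.core.utils

mask = seq_len_to_mask(torch.tensor([3, 1]), max_len=4)
# Each row is True for the first seq_len positions and False afterwards:
# [[ True,  True,  True, False],
#  [ True, False, False, False]]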
Example 13
    def __init__(self,
                 char_init_embed,
                 word_init_embed,
                 pos_init_embed,
                 spo_embed_dim,
                 sentence_length,
                 hidden_size,
                 num_classes,
                 dropout=0.3,
                 id2words=None,
                 encoding_type='bmes'):

        super().__init__()

        # self.Embedding = nn.Embedding(init_embed)
        #print(char_init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim
        # sentence length
        #self.sen_len = sentence_length
        #self.zeros = torch.zeros(self.sen_len, dtype=torch.long)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = encoder.LSTM(input_size=self.embed_dim,
                                hidden_size=hidden_size,
                                num_layers=2,
                                dropout=dropout,
                                bidirectional=True,
                                batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes,
                           include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(
                               id2words, encoding_type=encoding_type))
Example 14
class CNNBiLSTMCRF(nn.Module):
    def __init__(self,
                 embed,
                 hidden_size,
                 num_layers,
                 tag_vocab,
                 dropout=0.5,
                 encoding_type='bioes'):
        super().__init__()
        self.embedding = embed
        self.lstm = LSTM(input_size=self.embedding.embedding_dim,
                         hidden_size=hidden_size // 2,
                         num_layers=num_layers,
                         bidirectional=True,
                         batch_first=True)
        self.fc = nn.Linear(hidden_size, len(tag_vocab))

        transitions = allowed_transitions(tag_vocab.idx2word,
                                          encoding_type=encoding_type,
                                          include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=transitions)

        self.dropout = nn.Dropout(dropout, inplace=True)

        for name, param in self.named_parameters():
            if 'fc' in name:
                if param.data.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.constant_(param, 0)
            if 'crf' in name:
                nn.init.zeros_(param)

    def _forward(self, words, seq_len, target=None):
        words = self.embedding(words)
        outputs, _ = self.lstm(words, seq_len)
        self.dropout(outputs)

        logits = F.log_softmax(self.fc(outputs), dim=-1)

        if target is not None:
            loss = self.crf(logits, target,
                            seq_len_to_mask(seq_len,
                                            max_len=logits.size(1))).mean()
            return {Const.LOSS: loss}
        else:
            pred, _ = self.crf.viterbi_decode(
                logits, seq_len_to_mask(seq_len, max_len=logits.size(1)))
            return {Const.OUTPUT: pred}

    def forward(self, words, seq_len, target):
        return self._forward(words, seq_len, target)

    def predict(self, words, seq_len):
        return self._forward(words, seq_len, None)
Example 15
class KnowledgePointExtractionModel(BertPreTrainedModel):
    """知识抽取---参照序列标注模型
        1. Embedding - 8 layer以下bert model,
        2. multi layer MLP 线性变换
        3. CRF layer 修正"""
    def __init__(self, config: BertConfig):
        super(KnowledgePointExtractionModel, self).__init__(config=config)

        self.bert = BertModel(
            config=config,
            add_pooling_layer=False)  # word to vector(embeddings)
        # MLP input/output sizes; mlp_layer_sizes: [hidden_size, middle_size1, middle_size2, len(config.crf_labels)]
        self.kpe_mlp = MLP(size_layer=config.mlp_layer_sizes,
                           activation='relu',
                           output_activation=None)
        # crf_labels = {0:"<pad>", 1: "S", 2: "B", 3: "M", 4: "E"} (id2label)
        tag_labels = {}
        for key, value in config.crf_labels.items():
            if not isinstance(key, int):
                tag_labels[int(key)] = value
        if tag_labels:
            config.crf_labels = tag_labels
        trans = allowed_transitions(tag_vocab=config.crf_labels,
                                    include_start_end=True)
        self.kpe_crf = ConditionalRandomField(num_tags=len(config.crf_labels),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

    def forward(self, input_ids, labels=None, attention_mask=None):
        """前向传播"""
        bert_outputs = self.bert(input_ids,
                                 attention_mask=attention_mask,
                                 return_dict=True)
        embedding_output = bert_outputs.last_hidden_state

        mlp_outputs = self.kpe_mlp(embedding_output)
        logits = F.log_softmax(mlp_outputs, dim=-1)

        if attention_mask is None:
            attention_mask = input_ids.ne(0)
        if labels is not None:
            # train
            crf_outputs = self.kpe_crf(logits, labels, mask=attention_mask)
            # logger.info("loss shape: {}".format(crf_outputs.shape))
            loss = crf_outputs.sum() / attention_mask.type_as(
                input_ids).sum()  # token loss
            # logger.info("loss value: {}".format(loss))
            return (loss, )  # {"loss": loss}  # for version 4.0 and above
        else:
            # inference
            paths, _ = self.kpe_crf.viterbi_decode(logits, mask=attention_mask)
            return {"pred": paths}

Example 16
    def __init__(self, config: BertConfig):
        super(KnowledgePointExtractionModel, self).__init__(config=config)

        self.bert = BertModel(
            config=config,
            add_pooling_layer=False)  # word to vector(embeddings)
        # MLP input/output sizes; mlp_layer_sizes: [hidden_size, middle_size1, middle_size2, len(config.crf_labels)]
        self.kpe_mlp = MLP(size_layer=config.mlp_layer_sizes,
                           activation='relu',
                           output_activation=None)
        # crf_labels = {0:"<pad>", 1: "S", 2: "B", 3: "M", 4: "E"} (id2label)
        tag_labels = {}
        for key, value in config.crf_labels.items():
            if not isinstance(key, int):
                tag_labels[int(key)] = value
        if tag_labels:
            config.crf_labels = tag_labels
        trans = allowed_transitions(tag_vocab=config.crf_labels,
                                    include_start_end=True)
        self.kpe_crf = ConditionalRandomField(num_tags=len(config.crf_labels),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)
Example 17
    def __init__(self, vocab_num, max_len, embed_dim=100, bigram_vocab_num=None, 
                 bigram_embed_dim=100, num_bigram_per_char=None,
                 hidden_size=200, embed_drop_p=0.3, num_layers=2, num_heads=6, tag_size=4):
        super().__init__()

        input_size = embed_dim
        if bigram_vocab_num:
            self.bigram_embedding = nn.Embedding(bigram_vocab_num, bigram_embed_dim)
            input_size += num_bigram_per_char*bigram_embed_dim

        self.drop = nn.Dropout(embed_drop_p, inplace=True)

        self.fc1 = nn.Linear(input_size, hidden_size)

        self.transformer = StarTransEnc(nn.Embedding(vocab_num, embed_dim), num_layers=num_layers, 
                                        hidden_size=hidden_size, num_head=num_heads, head_dim=32,
                                        emb_dropout=0.3, dropout=0.1, max_len=max_len)
        self.fc2 = nn.Linear(hidden_size, tag_size)

        # allowed_trans = allowed_transitions({0:'b', 1:'m', 2:'e', 3:'s'}, encoding_type='bmes')
        allowed_trans = None
        self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
                                          allowed_transitions=allowed_trans)
Example 18
    def __init__(self, tag_vocab, bert_config, bi_embed=None):
        """

        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of head
        :param feedforward_dim: the dimension of ffn
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: adatrans, naive
        :param rel_pos_embed: type of position embedding; supports sin, fix, None. May be None when relative attention is used
        :param bi_embed: Used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        """
        super().__init__()

        self.embed = BertModel.from_pretrained(bert_config)
        embed_size = self.embed.embeddings.word_embeddings.weight.shape[1]
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size
        self.configuration = TransfoXLConfig(d_model=768,
                                             d_head=16,
                                             n_head=16,
                                             n_layer=4,
                                             mem_len=1000)
        self.xl_model = TransfoXLModel(self.configuration)
        self.liner = nn.Linear(768, len(tag_vocab))
        # trans = allowed_transitions(tag_vocab, include_start_end=True, encoding_type = "bioes")
        #TODO: trans is the array of allowed transitions; it is very useful, add it later
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=None)
Example 19
    def __init__(self, embed, tag_vocabs, encoding_type='bio'):
        super().__init__()
        self.embed = embed
        self.tag_vocabs = []
        self.fcs = nn.ModuleList()
        self.crfs = nn.ModuleList()

        for i in range(len(tag_vocabs)):
            self.tag_vocabs.append(tag_vocabs[i])
            linear = nn.Linear(self.embed.embed_size, len(tag_vocabs[i]))
            self.fcs.append(linear)
            trans = allowed_transitions(
                tag_vocabs[i], encoding_type=encoding_type, include_start_end=True)
            crf = ConditionalRandomField(
                len(tag_vocabs[i]), include_start_end_trans=True, allowed_transitions=trans)
            self.crfs.append(crf)
Example 20
    def __init__(self, batch_size, word_vocab_size, char_vocab_size, pos_vocab_size, spo_vocab_size,
                 embed_dim, hidden_dim, id2words, dropout=0.5):
        super().__init__()
        self.batch_size = batch_size
        self.word_embeds = nn.Embedding(word_vocab_size, embed_dim)
        self.char_embeds = nn.Embedding(char_vocab_size, embed_dim)
        self.pos_embeds = nn.Embedding(pos_vocab_size, embed_dim)
        self.spo_embeds = nn.Embedding(spo_vocab_size, embed_dim)
        self.norm1 = torch.nn.LayerNorm(embed_dim)
        self.Rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=2,
                           dropout=dropout, bidirectional=True, batch_first=True)
        self.Linear1 = nn.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_dim * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_dim * 2 // 3, len(id2words))

        self.Crf = CRF(len(id2words), allowed_transitions=allowed_transitions(id2words))
Example 21
    def __init__(self, embed, label_vocab, pos_idx=31,
                 Parsing_rnn_layers=3, Parsing_arc_mlp_size=500,
                 Parsing_label_mlp_size=100, Parsing_use_greedy_infer=False,
                 encoding_type='bmeso', embedding_dim=768, dropout=0.1,
                 use_pos_embedding=True, use_average=True):
        super().__init__()
        self.embed = embed
        self.use_pos_embedding = use_pos_embedding
        self.use_average = use_average
        self.label_vocab = label_vocab
        self.pos_idx = pos_idx
        self.user_dict_weight = 0.05
        embedding_dim_1 = 512
        embedding_dim_2 = 256

        self.layers_map = {'CWS': '-1', 'POS': '-1', 'Parsing': '-1', 'NER': '-1'}

        # NER
        self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
        trans = allowed_transitions(label_vocab['NER'], encoding_type='bmeso', include_start_end=True)
        self.ner_crf = ConditionalRandomField(len(label_vocab['NER']),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

        # parsing
        self.biaffine_parser = BertCharParser(
            app_index=self.label_vocab['Parsing'].to_index('APP'),
            vector_size=768,
            num_label=len(label_vocab['Parsing']),
            rnn_layers=Parsing_rnn_layers,
            arc_mlp_size=Parsing_arc_mlp_size,
            label_mlp_size=Parsing_label_mlp_size,
            dropout=dropout,
            use_greedy_infer=Parsing_use_greedy_infer)

        if self.use_pos_embedding:
            self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']), embedding_dim, padding_idx=0)

        self.loss = CrossEntropyLoss(padding_idx=0)

        # CWS
        self.cws_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2, len(label_vocab['CWS'])],
                           'relu', output_activation=None)
        trans = allowed_transitions(label_vocab['CWS'], include_start_end=True)
        self.cws_crf = ConditionalRandomField(len(label_vocab['CWS']),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

        # POS
        self.pos_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2, len(label_vocab['POS'])],
                           'relu', output_activation=None)
        trans = allowed_transitions(label_vocab['POS'], include_start_end=True)
        self.pos_crf = ConditionalRandomField(len(label_vocab['POS']),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)
Example 22
class TransformerSeqLabel(nn.Module):
    def __init__(self,
                 char_init_embed,
                 word_init_embed,
                 pos_init_embed,
                 spo_embed_dim,
                 num_classes,
                 num_layers,
                 inner_size,
                 key_size,
                 value_size,
                 num_head,
                 dropout=0.1,
                 id2words=None,
                 encoding_type='bieso'):
        super().__init__()

        # self.Embedding = nn.Embedding(init_embed)
        #print(char_init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        # self.Rnn = encoder.LSTM(input_size=self.embed_dim, hidden_size=hidden_size, num_layers=2,
        #                     dropout=dropout, bidirectional=True, batch_first=True)
        self.transformer = encoder.TransformerEncoder(
            num_layers=num_layers,
            model_size=self.embed_dim,
            inner_size=inner_size,
            key_size=key_size,
            value_size=value_size,
            num_head=num_head,
            dropout=dropout)
        self.Linear1 = nn.Linear(self.embed_dim, self.embed_dim // 3)
        self.norm2 = torch.nn.LayerNorm(self.embed_dim // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(self.embed_dim // 3, num_classes)
        self.Linear = nn.Linear(self.embed_dim, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes,
                           include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(
                               id2words, encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor

        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor words: [batch_size, max_len]
        :param torch.LongTensor seq_len:[batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len]
        :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                   If truth is not None, return loss, a scalar. Used in training.
        """

        char = char.long()
        #word = word.long()
        #pos = pos.long()
        #spo = spo.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)

        # seq_len = seq_len.long()
        tag = tag.long() if tag is not None else None

        #if next(self.parameters()).is_cuda:
        #    char = char.cuda()
        #    self.mask = self.mask.cuda()

        # x = self.Embedding(words)
        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        #print(spo)
        #print(self.zeros)
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        #print(char.shape)
        #print(word.shape)
        #print(pos.shape)
        #print(spo.shape)
        x = torch.cat((char, word, pos, spo), dim=2)
        #print(x.shape)

        x = self.norm1(x)
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim ]

        x = self.transformer(x, seq_mask=self.mask)

        #x = self.Linear1(x)
        #x = self.norm2(x)
        #x = self.relu(x)
        #x = self.drop(x)
        #x = self.Linear2(x)
        x = self.Linear(x)
        if tag is not None:
            return {"loss": self._internal_loss(x, tag)}
        else:
            return {"pred": self._decode(x)}
        #return {"pred": self._decode(x)}

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        
        :param torch.LongTensor words: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len], the gold tag sequence
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        
        :param torch.LongTensor words: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
Example 23
class AdvSeqLabel(nn.Module):
    """
    别名::class:`fastNLP.models.AdvSeqLabel`  :class:`fastNLP.models.sequence_labeling.AdvSeqLabel`

    更复杂的Sequence Labelling模型。结构为Embedding, LayerNorm, 双向LSTM(两层),FC,LayerNorm,DropOut,FC,CRF。
    
    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int),
        第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding
    :param int hidden_size: LSTM的隐层大小
    :param int num_classes: 有多少个类
    :param float dropout: LSTM中以及DropOut层的drop概率
    :param dict id2words: tag id转为其tag word的表。用于在CRF解码时防止解出非法的顺序,比如'BMES'这个标签规范中,'S'
        不能出现在'B'之后。这里也支持类似与'B-NN',即'-'前为标签类型的指示,后面为具体的tag的情况。这里不但会保证
        'B-NN'后面不为'S-NN'还会保证'B-NN'后面不会出现'M-xx'(任何非'M-NN'和'E-NN'的情况。)
    :param str encoding_type: 支持"BIO", "BMES", "BEMSO", 只有在id2words不为None的情况有用。
    """
    def __init__(self,
                 char_init_embed,
                 word_init_embed,
                 pos_init_embed,
                 spo_embed_dim,
                 sentence_length,
                 hidden_size,
                 num_classes,
                 dropout=0.3,
                 id2words=None,
                 encoding_type='bmes'):

        super().__init__()

        # self.Embedding = nn.Embedding(init_embed)
        #print(char_init_embed)
        self.char_embed = nn.Embedding(char_init_embed[0], char_init_embed[1])
        self.word_embed = nn.Embedding(word_init_embed[0], word_init_embed[1])
        self.pos_embed = nn.Embedding(pos_init_embed[0], pos_init_embed[1])
        # spo embed size: 50
        self.embed_dim = self.char_embed.embedding_dim + self.word_embed.embedding_dim + self.pos_embed.embedding_dim + spo_embed_dim
        # sentence length
        #self.sen_len = sentence_length
        #self.zeros = torch.zeros(self.sen_len, dtype=torch.long)

        self.norm1 = torch.nn.LayerNorm(self.embed_dim)
        self.Rnn = encoder.LSTM(input_size=self.embed_dim,
                                hidden_size=hidden_size,
                                num_layers=2,
                                dropout=dropout,
                                bidirectional=True,
                                batch_first=True)
        self.Linear1 = nn.Linear(hidden_size * 2, hidden_size * 2 // 3)
        self.norm2 = torch.nn.LayerNorm(hidden_size * 2 // 3)
        self.relu = torch.nn.LeakyReLU()
        self.drop = torch.nn.Dropout(dropout)
        self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes)

        if id2words is None:
            self.Crf = CRF(num_classes, include_start_end_trans=False)
        else:
            self.Crf = CRF(num_classes,
                           include_start_end_trans=False,
                           allowed_transitions=allowed_transitions(
                               id2words, encoding_type=encoding_type))

    def _decode(self, x):
        """
        :param torch.FloatTensor x: [batch_size, max_len, tag_size]
        :return torch.LongTensor, [batch_size, max_len]
        """
        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
        return tag_seq

    def _internal_loss(self, x, y):
        """
        Negative log likelihood loss.
        :param x: Tensor, [batch_size, max_len, tag_size]
        :param y: Tensor, [batch_size, max_len]
        :return loss: a scalar Tensor

        """
        x = x.float()
        y = y.long()
        assert x.shape[:2] == y.shape
        assert y.shape == self.mask.shape
        total_loss = self.Crf(x, y, self.mask)
        return torch.mean(total_loss)

    def _make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_len_to_mask(seq_len)
        mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask

    def _forward(self, char, word, pos, spo, seq_len, tag=None):
        """
        :param torch.LongTensor words: [batch_size, max_len]
        :param torch.LongTensor seq_len:[batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len]
        :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                   If truth is not None, return loss, a scalar. Used in training.
        """

        char = char.long()
        #word = word.long()
        #pos = pos.long()
        #spo = spo.long()
        seq_len = seq_len.long()
        self.mask = self._make_mask(char, seq_len)

        # seq_len = seq_len.long()
        tag = tag.long() if tag is not None else None

        #if next(self.parameters()).is_cuda:
        #    char = char.cuda()
        #    self.mask = self.mask.cuda()

        # x = self.Embedding(words)
        char = self.char_embed(char)
        word = self.word_embed(word)
        pos = self.pos_embed(pos)
        #print(spo)
        #print(self.zeros)
        spo = spo.unsqueeze(1).repeat(1, char.shape[1], 1).float()
        #print(char.shape)
        #print(word.shape)
        #print(pos.shape)
        #print(spo.shape)
        x = torch.cat((char, word, pos, spo), dim=2)
        #print(x.shape)

        x = self.norm1(x)
        # [batch_size, max_len, char_embed_dim + word_embed_dim + pos_embed_dim + spo_embed_dim ]

        x, _ = self.Rnn(x, seq_len=seq_len)

        x = self.Linear1(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.Linear2(x)
        if tag is not None:
            return {"loss": self._internal_loss(x, tag)}
        else:
            return {"pred": self._decode(x)}
        #return {"pred": self._decode(x)}

    def forward(self, char, word, pos, spo, seq_len, tag):
        """
        
        :param torch.LongTensor words: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :param torch.LongTensor target: [batch_size, max_len], the gold tag sequence
        :return torch.Tensor: a scalar loss
        """
        return self._forward(char, word, pos, spo, seq_len, tag)

    def predict(self, char, word, pos, spo, seq_len):
        """
        
        :param torch.LongTensor words: [batch_size, max_len]
        :param torch.LongTensor seq_len: [batch_size, ]
        :return torch.LongTensor: [batch_size, max_len]
        """
        return self._forward(char, word, pos, spo, seq_len)
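To make the id2words / encoding_type machinery described in the docstring above concrete, here is a hedged, self-contained sketch; the tag set is invented and the import path follows fastNLP's decoder module but may differ between versions.

from fastNLP.modules import ConditionalRandomField, allowed_transitions

id2words = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}              # BMES tagging scheme
trans = allowed_transitions(id2words, encoding_type='bmes', include_start_end=True)
# `trans` is a list of (from_tag_id, to_tag_id) pairs the decoder may follow:
# B->M and B->E are allowed, while B->S and B->B are ruled out.
crf = ConditionalRandomField(len(id2words), include_start_end_trans=True,
                             allowed_transitions=trans)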
Example 24
class CharModel(nn.Module):
    def __init__(self,
                 embed,
                 label_vocab,
                 pos_idx,
                 Parsing_rnn_layers,
                 Parsing_arc_mlp_size,
                 Parsing_label_mlp_size,
                 Parsing_use_greedy_infer=False,
                 encoding_type='bmeso',
                 embedding_dim=768,
                 dropout=0.1,
                 use_pos_embedding=False,
                 use_average=False):
        super().__init__()
        self.embed = embed
        self.use_pos_embedding = use_pos_embedding
        self.use_average = use_average
        self.label_vocab = label_vocab
        self.pos_idx = pos_idx
        embedding_dim_1 = 512
        embedding_dim_2 = 256

        self.layers_map = {'CWS': '1', 'POS': '2', 'Parsing': '3', 'NER': '2'}
        #NER
        self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
        trans = allowed_transitions(label_vocab['NER'],
                                    encoding_type='bmeso',
                                    include_start_end=True)
        self.ner_crf = ConditionalRandomField(len(label_vocab['NER']),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

        #parsing
        self.biaffine_parser = BertCharParser(
            vector_size=768,
            num_label=len(label_vocab['Parsing']),
            rnn_layers=Parsing_rnn_layers,
            arc_mlp_size=Parsing_arc_mlp_size,
            label_mlp_size=Parsing_label_mlp_size,
            dropout=dropout,
            use_greedy_infer=Parsing_use_greedy_infer)

        if self.use_pos_embedding:
            self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']),
                                              embedding_dim,
                                              padding_idx=0)

        self.loss = CrossEntropyLoss(padding_idx=0)

        #CWS
        self.cws_mlp = MLP([
            embedding_dim, embedding_dim_1, embedding_dim_2,
            len(label_vocab['CWS'])
        ],
                           'relu',
                           output_activation=None)

        #POS
        self.pos_mlp = MLP([
            embedding_dim, embedding_dim_1, embedding_dim_2,
            len(label_vocab['POS'])
        ],
                           'relu',
                           output_activation=None)

    def _generate_embedding(self, feats, word_lens, seq_len, pos):
        new_feats = []
        batch_size = feats.size()[0]
        sentence_length = feats.size()[1]
        device = feats.device
        if self.use_average == False:
            for i in range(batch_size):
                new_feats.append(torch.index_select(feats[i], 0, word_lens[i]))
            new_feats = torch.stack(new_feats, 0)
        else:
            for i in range(batch_size):
                feats_for_one_sample = []
                for j in range(word_lens.size()[1]):
                    if word_lens[i][j] == 0 and j != 0:
                        feats_for_one_word = torch.zeros(feats.size()[-1])
                    else:
                        if j == word_lens.size()[1] - 1 or word_lens[i][
                                j + 1] == 0:
                            index = range(word_lens[i][j], seq_len[i])
                        else:
                            index = range(word_lens[i][j], word_lens[i][j + 1])
                        index = torch.tensor(index).to(device)
                        feats_for_one_word = torch.index_select(
                            feats[i], 0, index)
                        word_len = feats_for_one_word.size()[0]
                        feats_for_one_word = torch.mean(feats_for_one_word,
                                                        dim=0)
                    feats_for_one_sample.append(feats_for_one_word)
                feats_for_one_sample = torch.stack(feats_for_one_sample, dim=0)
                new_feats.append(feats_for_one_sample)
            new_feats = torch.stack(new_feats, 0)
        if self.use_pos_embedding:
            pos_feats = self.pos_embedding(pos)
            new_feats = new_feats + pos_feats
        return new_feats

    def _generate_from_pos(self, paths, seq_len):
        device = paths.device
        word_lens = []
        batch_size = paths.size()[0]
        new_seq_len = []
        batch_pos = []
        for i in range(batch_size):
            word_len = []
            pos = []
            for j in range(seq_len[i]):
                tag = paths[i][j]
                tag = self.label_vocab['POS'].to_word(int(tag))
                if tag.startswith('<'):
                    continue
                tag1, tag2 = tag.split('-')
                tag2 = self.label_vocab['pos'].to_index(tag2)
                if tag1 == 'S' or tag1 == 'B':
                    word_len.append(j)
                    pos.append(tag2)
            if len(pos) == 1:
                word_len.append(seq_len[i] - 1)
                pos.append(tag2)
            new_seq_len.append(len(pos))
            word_lens.append(word_len)
            batch_pos.append(pos)
        max_len = max(new_seq_len)
        for i in range(batch_size):
            word_lens[i] = word_lens[i] + [0] * (max_len - new_seq_len[i])
            batch_pos[i] = batch_pos[i] + [0] * (max_len - new_seq_len[i])
        word_lens = torch.tensor(word_lens, device=device)
        batch_pos = torch.tensor(batch_pos, device=device)
        new_seq_len = torch.tensor(new_seq_len, device=device)
        return word_lens, batch_pos, new_seq_len

    def _decode_parsing(self, dep_head, dep_label, seq_len,
                        seq_len_for_wordlist, word_lens):
        device = dep_head.device
        heads = []
        labels = []
        batch_size = dep_head.size()[0]
        app_index = self.label_vocab['Parsing'].to_index('APP')

        max_len = seq_len.max()
        for i in range(batch_size):
            head = list(range(1, seq_len[i] + 1))
            label = [app_index] * int(seq_len[i])
            head[0] = 0

            for j in range(1, seq_len_for_wordlist[i]):
                if j + 1 == seq_len_for_wordlist[i]:
                    idx = seq_len[i] - 1
                else:
                    idx = word_lens[i][j + 1] - 1

                label[idx] = int(dep_label[i][j])
                root = dep_head[i][j]
                if root >= seq_len_for_wordlist[i] - 1:
                    head[idx] = int(seq_len[i] - 1)
                else:
                    try:
                        head[idx] = int(word_lens[i][root + 1] - 1)
                    except:
                        print(len(head), idx, word_lens.size(), i, root)

            head = head + [0] * int(max_len - seq_len[i])
            label = label + [0] * int(max_len - seq_len[i])

            heads.append(head)
            labels.append(label)
        heads = torch.tensor(heads, device=device)
        labels = torch.tensor(labels, device=device)

        return heads, labels

    def forward(self,
                chars,
                seq_len,
                task_class,
                target,
                seq_len_for_wordlist=None,
                dep_head=None,
                dep_label=None,
                pos=None,
                word_lens=None):
        task = task_class[0]
        mask = chars.ne(0)

        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len,
                                                     pos)
            loss_parsing = self.biaffine_parser(parsing_feats,
                                                seq_len_for_wordlist, dep_head,
                                                dep_label)

            return loss_parsing

        if task == 'NER':
            # ? is a ReLU needed here?
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            loss = self.ner_crf(logits, target, mask)
            return {'loss': loss}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            #logits=F.log_softmax(feats, dim=-1)
            #loss=self.cws_crf(logits, target, mask)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            #logits=F.log_softmax(feats, dim=-1)
            #loss=self.pos_crf(logits, target, mask)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

    def predict(self, chars, seq_len, task_class):
        task = task_class[0]
        mask = chars.ne(0)
        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            for sample in chars:
                sample[0] = self.pos_idx
            pos_feats = self.embed(chars, '2')
            pos_feats = self.pos_mlp(pos_feats)
            #logits = F.log_softmax(pos_feats, dim=-1)
            #paths, _ = self.pos_crf.viterbi_decode(logits, mask)
            paths = pos_feats.max(dim=-1)[1]

            word_lens, batch_pos, seq_len_for_wordlist = self._generate_from_pos(
                paths, seq_len)
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len,
                                                     batch_pos)
            answer = self.biaffine_parser.predict(parsing_feats,
                                                  seq_len_for_wordlist)
            head_preds = answer['head_preds']
            label_preds = answer['label_preds']
            heads, labels = self._decode_parsing(head_preds, label_preds,
                                                 seq_len, seq_len_for_wordlist,
                                                 word_lens)

            return {'head_preds': heads, 'label_preds': labels, 'pred': paths}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            #logits = F.log_softmax(feats, dim=-1)
            #paths, _ = self.cws_crf.viterbi_decode(logits, mask)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            #logits = F.log_softmax(feats, dim=-1)
            #paths, _ = self.pos_crf.viterbi_decode(logits, mask)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}
            #output=feats.max(dim=-1)[1]

        if task == 'NER':
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            paths, _ = self.ner_crf.viterbi_decode(logits, mask)
            return {'pred': paths}
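
The CWS and POS branches above decode greedily: because the CRF calls are commented out, feats.max(dim=-1)[1] simply takes the highest-scoring tag at every position. A minimal, self-contained sketch of that decoding step, with made-up tensor sizes and a padding mask added for illustration:

import torch

batch_size, max_len, num_tags = 2, 5, 4
feats = torch.randn(batch_size, max_len, num_tags)   # emission scores from an MLP head
seq_len = torch.tensor([5, 3])                        # real lengths of the two samples

paths = feats.max(dim=-1)[1]                          # same as feats.argmax(dim=-1)
mask = torch.arange(max_len).unsqueeze(0) < seq_len.unsqueeze(1)
paths = paths.masked_fill(~mask, 0)                   # ignore positions past seq_len
print(paths.shape)                                    # torch.Size([2, 5])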
Esempio n. 25
0
    def __init__(self,
                 tag_vocab,
                 embed,
                 num_layers,
                 d_model,
                 n_head,
                 feedforward_dim,
                 dropout,
                 after_norm=True,
                 attn_type='adatrans',
                 bi_embed=None,
                 fc_dropout=0.3,
                 pos_embed=None,
                 scale=False,
                 dropout_attn=None,
                 use_knowledge=False,
                 feature2count=None,
                 vocab_size=None,
                 feature_vocab_size=None,
                 kv_attn_type="dot",
                 memory_dropout=0.2,
                 fusion_dropout=0.2,
                 fusion_type='concat',
                 highway_layer=0,
                 key_embed_dropout=0.2,
                 knowledge_type="all",
                 use_zen=False,
                 zen_model=None):
        """
        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of head
        :param feedforward_dim: the dimension of ffn
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: adatrans, naive
        :param rel_pos_embed: type of position embedding; supports sin, fix, None. May be None when using relative attention
        :param bi_embed: used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        :param use_knowledge: whether to use knowledge from Stanford CoreNLP
        :param feature2count: dict, {"gram2count": dict, "pos_tag2count": dict, "chunk_tag2count": dict, "dep_tag2count": dict},
        :param
        """
        super().__init__()
        self.use_knowledge = use_knowledge
        self.feature2count = feature2count
        self.vocab_size = vocab_size
        self.feature_vocab_size = feature_vocab_size

        # add ZEN
        self.use_zen = use_zen

        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)

        self.transformer = TransformerEncoder(num_layers,
                                              d_model,
                                              n_head,
                                              feedforward_dim,
                                              dropout,
                                              after_norm=after_norm,
                                              attn_type=attn_type,
                                              scale=scale,
                                              dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)

        self.kv_memory = KeyValueMemoryNetwork(
            vocab_size=vocab_size,
            feature_vocab_size=feature_vocab_size,
            attn_type=kv_attn_type,
            emb_size=d_model,
            scaled=True,
            key_embed_dropout=key_embed_dropout,
            knowledge_type=knowledge_type)

        self.output_dim = d_model * _dim_map[fusion_type]
        self.fusion = FusionModule(fusion_type=fusion_type,
                                   layer=highway_layer,
                                   input_size=d_model,
                                   output_size=self.output_dim,
                                   dropout=fusion_dropout)

        self.memory_dropout = nn.Dropout(p=memory_dropout)

        self.out_fc = nn.Linear(self.output_dim, len(tag_vocab))

        self.fc_dropout = nn.Dropout(fc_dropout)

        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
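
The output_dim above is computed from _dim_map, which is not shown in this listing; presumably it maps each fusion_type to the factor by which the FusionModule widens the d_model-sized encoder output (concatenating memory and encoder states doubles it, additive fusion keeps it). The values below are an assumption used only for illustration:

# _dim_map is not part of this listing; the entries below are assumed for illustration.
_dim_map_example = {'concat': 2, 'gate-concat': 2, 'add': 1}

d_model, fusion_type = 128, 'concat'
output_dim = d_model * _dim_map_example[fusion_type]   # 256: input width of out_fc
print(output_dim)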
Esempio n. 26
0
class BiLSTMCRF(nn.Module):
    def __init__(self,
                 char_embed,
                 hidden_size,
                 num_layers,
                 target_vocab=None,
                 bigram_embed=None,
                 trigram_embed=None,
                 dropout=0.5):
        super().__init__()

        embed_size = char_embed.embed_size
        self.char_embed = char_embed
        if bigram_embed:
            embed_size += bigram_embed.embed_size
        self.bigram_embed = bigram_embed
        if trigram_embed:
            embed_size += trigram_embed.embed_size
        self.trigram_embed = trigram_embed

        self.lstm = LSTM(embed_size,
                         hidden_size=hidden_size // 2,
                         bidirectional=True,
                         batch_first=True,
                         num_layers=num_layers)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, len(target_vocab))

        transitions = None
        if target_vocab:
            transitions = allowed_transitions(target_vocab,
                                              include_start_end=True,
                                              encoding_type='bmes')

        self.crf = ConditionalRandomField(num_tags=len(target_vocab),
                                          allowed_transitions=transitions)

    def _forward(self, chars, bigrams, trigrams, seq_len, target=None):
        chars = self.char_embed(chars)
        if bigrams is not None:
            bigrams = self.bigram_embed(bigrams)
            chars = torch.cat([chars, bigrams], dim=-1)
        if trigrams is not None:
            trigrams = self.trigram_embed(trigrams)
            chars = torch.cat([chars, trigrams], dim=-1)

        output, _ = self.lstm(chars, seq_len)
        output = self.dropout(output)
        output = self.fc(output)
        output = F.log_softmax(output, dim=-1)
        mask = seq_len_to_mask(seq_len)
        if target is None:
            pred, _ = self.crf.viterbi_decode(output, mask)
            return {Const.OUTPUT: pred}
        else:
            loss = self.crf.forward(output, tags=target, mask=mask)
            return {Const.LOSS: loss}

    def forward(self, chars, seq_len, target, bigrams=None, trigrams=None):
        return self._forward(chars, bigrams, trigrams, seq_len, target)

    def predict(self, chars, seq_len, bigrams=None, trigrams=None):
        return self._forward(chars, bigrams, trigrams, seq_len)
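
The call to allowed_transitions(..., encoding_type='bmes') above builds the tag-transition constraints the CRF enforces for word segmentation: after B or M only M or E may follow, and a new word can only start after E or S. A plain-Python sketch of that rule, given purely as an illustration of the constraint rather than fastNLP's API:

def bmes_transition_ok(prev_tag: str, next_tag: str) -> bool:
    # B/M must stay inside the current word; E/S must start a new word.
    if prev_tag in ('B', 'M'):
        return next_tag in ('M', 'E')
    if prev_tag in ('E', 'S'):
        return next_tag in ('B', 'S')
    return False

assert bmes_transition_ok('B', 'E')
assert not bmes_transition_ok('B', 'S')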
Esempio n. 27
0
class CNBiLSTMCRFNER(nn.Module):
    def __init__(self,
                 char_embed,
                 num_classes,
                 bigram_embed=None,
                 trigram_embed=None,
                 num_layers=1,
                 hidden_size=100,
                 dropout=0.5,
                 target_vocab=None,
                 encoding_type=None):
        super().__init__()

        self.char_embed = get_embeddings(char_embed)
        embed_size = self.char_embed.embedding_dim
        if bigram_embed:
            self.bigram_embed = get_embeddings(bigram_embed)
            embed_size += self.bigram_embed.embedding_dim
        if trigram_embed:
            self.trigram_embed = get_embeddings(trigram_embed)
            embed_size += self.trigram_embed.embedding_dim

        if num_layers > 1:
            self.lstm = LSTM(embed_size,
                             num_layers=num_layers,
                             hidden_size=hidden_size // 2,
                             bidirectional=True,
                             batch_first=True,
                             dropout=dropout)
        else:
            self.lstm = LSTM(embed_size,
                             num_layers=num_layers,
                             hidden_size=hidden_size // 2,
                             bidirectional=True,
                             batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

        trans = None
        if target_vocab is not None and encoding_type is not None:
            trans = allowed_transitions(target_vocab.idx2word,
                                        encoding_type=encoding_type,
                                        include_start_end=True)

        self.crf = ConditionalRandomField(num_classes,
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self,
                 chars,
                 bigrams=None,
                 trigrams=None,
                 seq_len=None,
                 target=None):
        chars = self.char_embed(chars)
        if hasattr(self, 'bigram_embed'):
            bigrams = self.bigram_embed(bigrams)
            chars = torch.cat((chars, bigrams), dim=-1)
        if hasattr(self, 'trigram_embed'):
            trigrams = self.trigram_embed(trigrams)
            chars = torch.cat((chars, trigrams), dim=-1)
        feats, _ = self.lstm(chars, seq_len=seq_len)
        feats = self.fc(feats)
        feats = self.dropout(feats)
        logits = F.log_softmax(feats, dim=-1)
        mask = seq_len_to_mask(seq_len)
        if target is None:
            pred, _ = self.crf.viterbi_decode(logits, mask)
            return {C.OUTPUT: pred}
        else:
            loss = self.crf(logits, target, mask).mean()
            return {C.LOSS: loss}

    def forward(self,
                chars,
                target,
                bigrams=None,
                trigrams=None,
                seq_len=None):
        return self._forward(chars, bigrams, trigrams, seq_len, target)

    def predict(self, chars, seq_len=None, bigrams=None, trigrams=None):
        return self._forward(chars, bigrams, trigrams, seq_len)
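
Both BiLSTM-CRF examples rely on seq_len_to_mask to turn the seq_len tensor into the boolean padding mask that is handed to the CRF. A pure-PyTorch equivalent of that behaviour, shown here only to make the mask shape explicit (lengths are made up):

import torch

def lengths_to_mask(seq_len: torch.Tensor, max_len: int = None) -> torch.Tensor:
    # True on real tokens, False on padding; shape (batch, max_len).
    max_len = max_len or int(seq_len.max())
    return torch.arange(max_len, device=seq_len.device).unsqueeze(0) < seq_len.unsqueeze(1)

print(lengths_to_mask(torch.tensor([4, 2])))
# tensor([[ True,  True,  True,  True],
#         [ True,  True, False, False]])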
Esempio n. 28
0
class TransXL(nn.Module):
    def __init__(self, tag_vocab, bert_config, bi_embed=None):
        """

        :param tag_vocab: fastNLP Vocabulary
        :param bert_config: name or path of the pretrained BERT model used as the character encoder
        :param bi_embed: used in the Chinese scenario
        """
        super().__init__()

        self.embed = BertModel.from_pretrained(bert_config)
        embed_size = self.embed.embeddings.word_embeddings.weight.shape[1]
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size
        self.configuration = TransfoXLConfig(d_model=768,
                                             d_head=16,
                                             n_head=16,
                                             n_layer=4,
                                             mem_len=1000)
        self.xl_model = TransfoXLModel(self.configuration)
        self.linear = nn.Linear(768, len(tag_vocab))
        # trans = allowed_transitions(tag_vocab, include_start_end=True, encoding_type = "bioes")
        # TODO: trans is the array of allowed transitions; very useful, add it back later
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=None)

    def _forward(self, sentence, target=None, mems=None):
        batch_size = sentence.size(0)
        seq_length = sentence.size(1)
        mask = sentence.ne(0)

        embeds, _ = self.embed(sentence,
                               attention_mask=None,
                               output_all_encoded_layers=False)
        trans_out = self.xl_model(None, mems, inputs_embeds=embeds)[:2]
        feats, mems = trans_out[0], trans_out[1]
        feats = self.linear(feats.contiguous().view(-1, 768))
        feats = feats.contiguous().view(batch_size, seq_length, -1)
        logits = F.log_softmax(feats, dim=-1)
        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': [paths, mems]}
        else:
            loss = self.crf(logits, target, mask)
            return {'loss': [loss, mems]}

    def forward(self, chars, target=None, mems=None):
        return self._forward(chars, target, mems)

    def predict(self, chars, mems=None):
        return self._forward(chars, target=None, mems=mems)
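
The view(-1, 768) / view(batch_size, seq_length, -1) pair in TransXL._forward flattens the token dimension so the Linear layer scores every token, then restores the sequence shape for the CRF. A minimal sketch of that pattern with illustrative sizes (nn.Linear would also accept the 3-D tensor directly):

import torch
import torch.nn as nn

batch_size, seq_length, hidden, num_tags = 2, 6, 768, 9
feats = torch.randn(batch_size, seq_length, hidden)
projection = nn.Linear(hidden, num_tags)

flat = projection(feats.contiguous().view(-1, hidden))       # (batch*seq, num_tags)
feats = flat.contiguous().view(batch_size, seq_length, -1)   # (batch, seq, num_tags)
print(feats.shape)                                            # torch.Size([2, 6, 9])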
Esempio n. 29
0
class TENER(nn.Module):
    def __init__(self,
                 config,
                 data_bundle,
                 embed,
                 num_layers,
                 d_model,
                 n_head,
                 feedforward_dim,
                 dropout,
                 after_norm=True,
                 attn_type='adatrans',
                 bi_embed=None,
                 fc_dropout=0.3,
                 pos_embed=None,
                 scale=False,
                 dropout_attn=None):
        """

        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of head
        :param feedforward_dim: the dimension of ffn
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: adatrans, naive
        :param rel_pos_embed: type of position embedding; supports sin, fix, None. May be None when using relative attention
        :param bi_embed: used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        """
        super().__init__()
        self.config = config
        self.data_bundle = data_bundle
        tag_vocab = data_bundle.get_vocab('target')
        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)

        self.transformer = TransformerEncoder(num_layers,
                                              d_model,
                                              n_head,
                                              feedforward_dim,
                                              dropout,
                                              after_norm=after_norm,
                                              attn_type=attn_type,
                                              scale=scale,
                                              dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.out_fc = nn.Linear(d_model, len(tag_vocab))
        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self, chars, target, bigrams=None):
        mask = chars.ne(0)
        chars = self.embed(chars)
        if self.bi_embed is not None:
            bigrams = self.bi_embed(bigrams)
            chars = torch.cat([chars, bigrams], dim=-1)

        chars = self.in_fc(chars)
        chars = self.transformer(chars, mask)
        chars = self.fc_dropout(chars)
        chars = self.out_fc(chars)
        logits = F.log_softmax(chars, dim=-1)
        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}
        else:
            loss = self.crf(logits, target, mask)
            return {'loss': loss}

    def forward(self, chars, target, bigrams=None):
        return self._forward(chars, target, bigrams)

    def predict(self, chars, bigrams=None):
        return self._forward(chars, target=None, bigrams=bigrams)

    def _get_trainer(self, models_folder):
        optimizer = optim.SGD(self.parameters(),
                              lr=self.config['lr'],
                              momentum=0.9)

        callbacks = []
        clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
        evaluate_callback = EvaluateCallback(
            self.data_bundle.get_dataset('test'))

        if self.config['warmup_steps'] > 0:
            warmup_callback = WarmupCallback(self.config['warmup_steps'],
                                             schedule='linear')
            callbacks.append(warmup_callback)
        callbacks.extend([clip_callback, evaluate_callback])

        return Trainer(self.data_bundle.get_dataset('train'),
                       self,
                       optimizer,
                       batch_size=self.config['batch_size'],
                       sampler=BucketSampler(),
                       num_workers=2,
                       n_epochs=100,
                       dev_data=self.data_bundle.get_dataset('dev'),
                       metrics=SpanFPreRecMetric(
                           tag_vocab=self.data_bundle.get_vocab('target'),
                           encoding_type=self.config['encoding_type']),
                       dev_batch_size=self.config['batch_size'] * 5,
                       callbacks=callbacks,
                       device=self.config['device'],
                       test_use_tqdm=False,
                       use_tqdm=True,
                       print_every=300,
                       save_path=models_folder)

    def train_model(self, models_folder):
        trainer = self._get_trainer(models_folder)
        trainer.train(load_best_model=False)

    def load(self, path):
        self.load_state_dict(torch.load(path).state_dict())
        print("Reloaded trained model.")
        return self

    def test(self, dataset, subset):
        metrics_to_test = [fastNLP.core.metrics.AccuracyMetric()]

        # Load dataset for testing
        databundle_for_test = read_dataset(dataset, self.config)

        # Perform testing
        tester = Tester(databundle_for_test.get_dataset(subset),
                        self,
                        metrics_to_test,
                        batch_size=self.config['batch_size'],
                        num_workers=0,
                        device=None,
                        verbose=1,
                        use_tqdm=True)
        tester.test()

        flattened_true_entities, flattened_predicted_entities = flatten_prediction_results(
            self.data_bundle, databundle_for_test, subset,
            self._predict(
                subset_for_prediction=databundle_for_test.get_dataset(subset),
                targets=self.data_bundle.vocabs["target"],
                filename=None))

        print("Precision per label:")
        labels = get_unique_targets(self.data_bundle.vocabs["target"])
        scores = get_average_precision(y_true=flattened_true_entities,
                                       y_pred=flattened_predicted_entities,
                                       labels=labels,
                                       average=None)
        for label, score in zip(labels, scores):
            print(f'{label:10s} {score:.2f}')

        #print(get_average_precision(flattened_true_entities, flattened_predicted_entities, 'weighted'))
        #for averaging_method in ['micro', 'macro', 'weighted', 'samples']:
        #print(averaging_method)
        #print(get_average_precision(flattened_true_entities, flattened_predicted_entities, averaging_method))

        # print(len(flattened_predicted_entities))
        # print(len(flattened_true_entities))

    def _predict(self, subset_for_prediction, targets, filename):
        predictor = Predictor(self)
        predictions = predictor.predict(subset_for_prediction)['pred']
        words = list(subset_for_prediction.get_field('raw_words'))
        lines = []

        words_sequence_index = 1
        labels_sequence_index = 0
        for sentence in list(zip(predictions, words)):
            if type(sentence[labels_sequence_index][0]) == int:
                continue
            words = sentence[words_sequence_index]
            #print(sentence[labels_sequence_index])
            #labels = map(lambda label: f'{targets.to_word(label).split("-")[-1]}', sentence[labels_sequence_index][0])
            labels = map(lambda label: f'{targets.to_word(label)}',
                         sentence[labels_sequence_index][0])
            for pair in zip(words, labels):
                lines.append(' '.join(pair))
            lines.append('')
        if filename is not None:
            write_lines(filename, lines)
        return lines

    def export_predictions(self, dataset, subset, output_file):
        # Load dataset for prediction
        databundle_for_prediction = read_dataset(dataset, self.config)

        # Perform prediction
        return self._predict(databundle_for_prediction.get_dataset(subset),
                             self.data_bundle.vocabs["target"], output_file)
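
The test method above prints one precision value per label by calling get_average_precision with labels=... and average=None. That helper is not shown in this listing; with those arguments it presumably wraps sklearn's precision_score. A hedged, self-contained sketch of the per-label computation on made-up tag sequences:

from sklearn.metrics import precision_score

y_true = ['B-PER', 'O', 'B-LOC', 'O', 'B-PER']
y_pred = ['B-PER', 'O', 'O',     'O', 'B-LOC']
labels = ['B-PER', 'B-LOC', 'O']

scores = precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
for label, score in zip(labels, scores):
    print(f'{label:10s} {score:.2f}')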
Esempio n. 30
0
class TENER(nn.Module):
    def __init__(self,
                 tag_vocab,
                 embed,
                 num_layers,
                 d_model,
                 n_head,
                 feedforward_dim,
                 dropout,
                 after_norm=True,
                 attn_type='adatrans',
                 bi_embed=None,
                 fc_dropout=0.3,
                 pos_embed=None,
                 scale=False,
                 dropout_attn=None,
                 use_knowledge=False,
                 feature2count=None,
                 vocab_size=None,
                 feature_vocab_size=None,
                 kv_attn_type="dot",
                 memory_dropout=0.2,
                 fusion_dropout=0.2,
                 fusion_type='concat',
                 highway_layer=0,
                 key_embed_dropout=0.2,
                 knowledge_type="all",
                 use_zen=False,
                 zen_model=None):
        """
        :param tag_vocab: fastNLP Vocabulary
        :param embed: fastNLP TokenEmbedding
        :param num_layers: number of self-attention layers
        :param d_model: input size
        :param n_head: number of head
        :param feedforward_dim: the dimension of ffn
        :param dropout: dropout in self-attention
        :param after_norm: normalization place
        :param attn_type: adatrans, naive
        :param rel_pos_embed: type of position embedding; supports sin, fix, None. May be None when using relative attention
        :param bi_embed: used in the Chinese scenario
        :param fc_dropout: dropout rate before the fc layer
        :param use_knowledge: whether to use knowledge from Stanford CoreNLP
        :param feature2count: dict, {"gram2count": dict, "pos_tag2count": dict, "chunk_tag2count": dict, "dep_tag2count": dict},
        :param
        """
        super().__init__()
        self.use_knowledge = use_knowledge
        self.feature2count = feature2count
        self.vocab_size = vocab_size
        self.feature_vocab_size = feature_vocab_size

        # add ZEN
        self.use_zen = use_zen

        self.embed = embed
        embed_size = self.embed.embed_size
        self.bi_embed = None
        if bi_embed is not None:
            self.bi_embed = bi_embed
            embed_size += self.bi_embed.embed_size

        self.in_fc = nn.Linear(embed_size, d_model)

        self.transformer = TransformerEncoder(num_layers,
                                              d_model,
                                              n_head,
                                              feedforward_dim,
                                              dropout,
                                              after_norm=after_norm,
                                              attn_type=attn_type,
                                              scale=scale,
                                              dropout_attn=dropout_attn,
                                              pos_embed=pos_embed)

        self.kv_memory = KeyValueMemoryNetwork(
            vocab_size=vocab_size,
            feature_vocab_size=feature_vocab_size,
            attn_type=kv_attn_type,
            emb_size=d_model,
            scaled=True,
            key_embed_dropout=key_embed_dropout,
            knowledge_type=knowledge_type)

        self.output_dim = d_model * _dim_map[fusion_type]
        self.fusion = FusionModule(fusion_type=fusion_type,
                                   layer=highway_layer,
                                   input_size=d_model,
                                   output_size=self.output_dim,
                                   dropout=fusion_dropout)

        self.memory_dropout = nn.Dropout(p=memory_dropout)

        self.out_fc = nn.Linear(self.output_dim, len(tag_vocab))

        self.fc_dropout = nn.Dropout(fc_dropout)

        trans = allowed_transitions(tag_vocab, include_start_end=True)
        self.crf = ConditionalRandomField(len(tag_vocab),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)

    def _forward(self,
                 chars,
                 target,
                 bigrams=None,
                 pos_features=None,
                 dep_features=None,
                 chunk_features=None,
                 pos_matrix=None,
                 dep_matrix=None,
                 chunk_matrix=None,
                 nan_matrix=None,
                 zen_input=None):
        # get the hidden state from transformer encoder
        mask = chars.ne(0)
        hidden = self.embed(chars)

        if self.use_zen:
            hidden_dim = hidden.shape[-1]
            zen_dim = zen_input.shape[-1]
            hidden[:, :, (hidden_dim - zen_dim):] = zen_input

        if self.bi_embed is not None:
            bigrams = self.bi_embed(bigrams)
            hidden = torch.cat([hidden, bigrams], dim=-1)
        hidden = self.in_fc(hidden)

        encoder_output = self.transformer(hidden, mask)
        # new add
        # kv_output: hidden state of key value memory network
        kv_output = self.kv_memory(chars, pos_features, dep_features,
                                   chunk_features, encoder_output, pos_matrix,
                                   dep_matrix, chunk_matrix, nan_matrix)
        kv_output = self.memory_dropout(kv_output)
        # o: output of gating mechanism
        concat = self.fusion(encoder_output, kv_output)

        concat = self.fc_dropout(concat)
        concat = self.out_fc(concat)
        logits = F.log_softmax(concat, dim=-1)
        if target is None:
            paths, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': paths}
        else:
            loss = self.crf(logits, target, mask)
            return {'loss': loss}

    def forward(self,
                chars,
                target,
                bigrams=None,
                pos_features=None,
                dep_features=None,
                chunk_features=None,
                pos_matrix=None,
                dep_matrix=None,
                chunk_matrix=None,
                nan_matrix=None,
                zen_input=None):
        return self._forward(chars, target, bigrams, pos_features,
                             dep_features, chunk_features, pos_matrix,
                             dep_matrix, chunk_matrix, nan_matrix, zen_input)

    def predict(self,
                chars,
                bigrams=None,
                pos_features=None,
                dep_features=None,
                chunk_features=None,
                pos_matrix=None,
                dep_matrix=None,
                chunk_matrix=None,
                nan_matrix=None,
                zen_input=None):
        return self._forward(chars,
                             target=None,
                             bigrams=bigrams,
                             pos_features=pos_features,
                             dep_features=dep_features,
                             chunk_features=chunk_features,
                             pos_matrix=pos_matrix,
                             dep_matrix=dep_matrix,
                             chunk_matrix=chunk_matrix,
                             nan_matrix=nan_matrix,
                             zen_input=zen_input)
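
When use_zen is set, _forward above splices the ZEN hidden state into the character embedding by overwriting its last zen_dim channels. A minimal sketch of that slice assignment with illustrative sizes (it assumes zen_dim does not exceed hidden_dim, otherwise the shapes would not match):

import torch

batch_size, seq_len, hidden_dim, zen_dim = 2, 4, 10, 3
hidden = torch.zeros(batch_size, seq_len, hidden_dim)
zen_input = torch.ones(batch_size, seq_len, zen_dim)

hidden[:, :, (hidden_dim - zen_dim):] = zen_input   # keep the first 7 dims, overwrite the last 3
print(hidden[0, 0])   # tensor([0., 0., 0., 0., 0., 0., 0., 1., 1., 1.])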