# assumes: torch.nn as nn plus the project's own BiLSTM, BiLSTM_CNN and CRF modules (not shown here)
def __init__(self, config):
    super(Sequence_Label, self).__init__()
    self.config = config
    # embed
    self.embed_num = config.embed_num
    self.embed_dim = config.embed_dim
    self.label_num = config.class_num
    self.paddingId = config.paddingId
    # dropout
    self.dropout_emb = config.dropout_emb
    self.dropout = config.dropout
    # lstm
    self.lstm_hiddens = config.lstm_hiddens
    self.lstm_layers = config.lstm_layers
    # pretrain
    self.pretrained_embed = config.pretrained_embed
    self.pretrained_weight = config.pretrained_weight
    # char
    self.use_char = config.use_char
    self.char_embed_num = config.char_embed_num
    self.char_paddingId = config.char_paddingId
    self.char_dim = config.char_dim
    self.conv_filter_sizes = self._conv_filter(config.conv_filter_sizes)
    self.conv_filter_nums = self._conv_filter(config.conv_filter_nums)
    assert len(self.conv_filter_sizes) == len(self.conv_filter_nums)
    # use crf
    self.use_crf = config.use_crf
    # cuda or cpu
    self.device = config.device
    # the CRF variant reserves two extra positions in the tag space (typically start/stop tags)
    self.target_size = self.label_num if self.use_crf is False else self.label_num + 2
    if self.use_char is True:
        self.encoder_model = BiLSTM_CNN(embed_num=self.embed_num, embed_dim=self.embed_dim,
                                        label_num=self.target_size, paddingId=self.paddingId,
                                        dropout_emb=self.dropout_emb, dropout=self.dropout,
                                        lstm_hiddens=self.lstm_hiddens, lstm_layers=self.lstm_layers,
                                        pretrained_embed=self.pretrained_embed,
                                        pretrained_weight=self.pretrained_weight,
                                        char_embed_num=self.char_embed_num, char_dim=self.char_dim,
                                        char_paddingId=self.char_paddingId,
                                        conv_filter_sizes=self.conv_filter_sizes,
                                        conv_filter_nums=self.conv_filter_nums,
                                        device=self.device)
    else:
        self.encoder_model = BiLSTM(embed_num=self.embed_num, embed_dim=self.embed_dim,
                                    label_num=self.target_size, paddingId=self.paddingId,
                                    dropout_emb=self.dropout_emb, dropout=self.dropout,
                                    lstm_hiddens=self.lstm_hiddens, lstm_layers=self.lstm_layers,
                                    pretrained_embed=self.pretrained_embed,
                                    pretrained_weight=self.pretrained_weight,
                                    device=self.device)
    if self.use_crf is True:
        args_crf = {'target_size': self.label_num, 'device': self.device}
        self.crf_layer = CRF(**args_crf)
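# --- usage sketch (not part of the original code) ---
# Shows how Sequence_Label might be constructed with a bare config object. The field
# names all come from the __init__ above; the concrete values, the SimpleNamespace
# container, and the word-only setup (use_char=False) are illustrative assumptions.
from types import SimpleNamespace
import torch

demo_config = SimpleNamespace(
    embed_num=10000, embed_dim=100, class_num=9, paddingId=0,
    dropout_emb=0.5, dropout=0.5,
    lstm_hiddens=200, lstm_layers=1,
    pretrained_embed=False, pretrained_weight=None,
    use_char=False, char_embed_num=0, char_paddingId=0, char_dim=0,
    conv_filter_sizes='3', conv_filter_nums='30',
    use_crf=True, device=torch.device('cpu'))
# model = Sequence_Label(demo_config)  # needs the project's BiLSTM / BiLSTM_CNN / CRF modules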
# assumes: torch.nn as nn, pack_padded_sequence / pad_packed_sequence from torch.nn.utils.rnn,
# and a project-local CRF class (its forward returns the log-likelihood, decode() the best paths)
class BiLSTM_CRF(nn.Module):
    def __init__(self, args):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = args.embedding_dim
        self.hidden_dim = args.hidden_dim
        self.vocab_size = args.vocab_size
        # don't count the padding tag for the classifier output
        self.tagset_size = args.tagset_size
        # whenever the embedding sees the padding index it'll make the whole vector zeros
        padding_idx = 0
        self.word_embeds = nn.Embedding(self.vocab_size, self.embedding_dim,
                                        padding_idx=padding_idx)
        # note: with num_layers=1 the dropout argument has no effect (PyTorch warns about this)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                            dropout=0.5, num_layers=1, bidirectional=True)
        # alternative encoder; defined but not used in the forward pass
        self.gru = nn.GRU(self.embedding_dim, self.hidden_dim,
                          dropout=0.5, num_layers=1, bidirectional=True)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim * 2, self.tagset_size)
        # initialize the CRF layer
        self.crf = CRF(self.tagset_size)

    def _get_lstm_features(self, sentence, lengths):
        # sentence: (batch_size, seq_length)
        embeds = self.word_embeds(sentence).transpose(1, 0)  # (seq_length, batch_size, embedding_dim)
        embeds_packed = pack_padded_sequence(embeds, lengths)
        lstm_out, hidden = self.lstm(embeds_packed)
        lstm_out_padded, _ = pad_packed_sequence(lstm_out)    # (seq_length, batch_size, 2 * hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out_padded)         # (seq_length, batch_size, tagset_size)
        return lstm_feats

    def neg_log_likelihood(self, sentence, targets, lengths):
        # feats: (seq_length, batch_size, tagset_size); targets: (batch_size, seq_length)
        feats = self._get_lstm_features(sentence, lengths)
        mask = (sentence > 0).transpose(1, 0)
        return -self.crf(feats, targets.transpose(0, 1), mask)

    def forward(self, sentence, lengths, concated=False):
        # used for prediction
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, lengths)
        # Find the best path, given the features.
        mask = (sentence > 0).transpose(1, 0)
        tag_seq = self.crf.decode(lstm_feats, mask, concated)
        return tag_seq
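# --- usage sketch (not part of the original code) ---
# A hedged example of feeding BiLSTM_CRF a toy padded batch. The args values and the
# tensors below are assumptions; note that pack_padded_sequence with its defaults
# expects the batch sorted by decreasing length and the lengths tensor on the CPU.
from types import SimpleNamespace
import torch

args = SimpleNamespace(embedding_dim=100, hidden_dim=128, vocab_size=5000, tagset_size=9)
sentence = torch.tensor([[4, 17, 256, 3, 0],
                         [8, 42, 9, 0, 0]])        # (batch, seq); 0 is the padding index
lengths = torch.tensor([4, 3])                     # true lengths, longest first
tags = torch.randint(0, args.tagset_size, (2, 5))  # (batch, seq) gold labels
# model = BiLSTM_CRF(args)                         # requires the project-local CRF class
# loss = model.neg_log_likelihood(sentence, tags, lengths)
# best_paths = model(sentence, lengths)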
# assumes: torch.nn as nn, BertModel / AlbertModel from the transformers library,
# and the same project-local CRF class used by BiLSTM_CRF
class BERT_CRF(nn.Module):
    def __init__(self, args):
        super(BERT_CRF, self).__init__()
        self.embedding_dim = args.embedding_dim
        self.hidden_dim = args.hidden_dim
        # don't count the padding tag for the classifier output
        self.tagset_size = args.tagset_size
        self.bert_model_name = args.bert_model_name
        if self.bert_model_name.startswith('bert-'):
            self.word_embeds = BertModel.from_pretrained(self.bert_model_name)
            print('load pre-trained model of {}'.format(self.bert_model_name))
        elif self.bert_model_name.startswith('albert-'):
            self.word_embeds = AlbertModel.from_pretrained(self.bert_model_name)
            print('load pre-trained model of {}'.format(self.bert_model_name))
        else:
            # fail fast instead of continuing without an encoder
            raise ValueError('bert model {} not found!!!'.format(self.bert_model_name))
        # note: with num_layers=1 the dropout argument has no effect (PyTorch warns about this)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                            num_layers=1, bidirectional=True, dropout=0.5)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim * 2, self.tagset_size)
        # initialize the CRF layer
        self.crf = CRF(self.tagset_size)

    def _get_lstm_features(self, sentence, lengths):
        # sentence: (batch_size, seq_length); padding tokens are id 0
        attention_mask = (sentence > 0)
        embeds = self.word_embeds(sentence, attention_mask=attention_mask)
        embeds = embeds[0].transpose(0, 1)       # (seq_length, batch_size, embedding_dim)
        lstm_out, hidden = self.lstm(embeds)     # (seq_length, batch_size, 2 * hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)   # (seq_length, batch_size, tagset_size)
        return lstm_feats

    def neg_log_likelihood(self, sentence, targets, lengths):
        # feats: (seq_length, batch_size, tagset_size); targets: (batch_size, seq_length)
        feats = self._get_lstm_features(sentence, lengths)
        mask = (sentence > 0).transpose(1, 0)
        return -self.crf(feats, targets.transpose(0, 1), mask)

    def forward(self, sentence, lengths, concated=False):
        # used for prediction
        # Get the emission scores from the BiLSTM over BERT embeddings
        lstm_feats = self._get_lstm_features(sentence, lengths)
        # Find the best path, given the features.
        mask = (sentence > 0).transpose(1, 0)
        tag_seq = self.crf.decode(lstm_feats, mask, concated)
        return tag_seq
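# --- usage sketch (not part of the original code) ---
# A hedged example of wiring BERT_CRF to a HuggingFace tokenizer. The values below are
# assumptions; embedding_dim must equal the hidden size of the chosen checkpoint
# (768 for bert-base-*) because the LSTM consumes BERT's last hidden state directly.
from types import SimpleNamespace

args = SimpleNamespace(embedding_dim=768, hidden_dim=256, tagset_size=9,
                       bert_model_name='bert-base-cased')
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained(args.bert_model_name)
# model = BERT_CRF(args)
# batch = tokenizer(['John lives in Berlin'], return_tensors='pt', padding=True)
# lengths = batch['attention_mask'].sum(dim=1)
# predictions = model(batch['input_ids'], lengths)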
# assumes: a Keras-compatible CRF layer (e.g. keras_contrib) exposing .loss and .viterbi_accuracy,
# and load_trained_model_from_checkpoint (presumably from keras_bert) for the BERT branch
def build_model(self, trainable=True):
    if self.embedding_name is None:
        from tensorflow.keras.layers import (Input, Dense, LSTM, GRU, Bidirectional,
                                             Embedding, Dropout, TimeDistributed, Activation)
        from tensorflow.keras import Model

        Input_layer = Input(shape=(None,), name='Input_layer')
        embedd_layer = Embedding(self.vocab_size, 100)
        x = embedd_layer(Input_layer)
        if self.model_name.lower() == 'lstm':
            x = Dropout(0.4, name='lstm_dropout')(x)
            for i in range(self.layer_number):
                x = Bidirectional(LSTM(128, return_sequences=True),
                                  merge_mode='concat', name='bilstm_{}'.format(i))(x)
            x = TimeDistributed(Dense(1), name='Time_Dense')(x)
            x = Activation('sigmoid')(x)
            model = Model(Input_layer, x)
            model.summary()
            return model, 'binary_crossentropy', ['acc'], embedd_layer
        elif self.model_name.lower() == 'gru':
            x = Dropout(0.4, name='gru_dropout')(x)
            for i in range(self.layer_number):
                x = Bidirectional(GRU(128, return_sequences=True),
                                  merge_mode='concat', name='bigru_{}'.format(i))(x)
            x = TimeDistributed(Dense(1), name='Time_Dense')(x)
            x = Activation('sigmoid')(x)
            model = Model(Input_layer, x)
            model.summary()
            return model, 'binary_crossentropy', ['acc'], embedd_layer
        elif self.model_name.lower() == 'lstm-crf':
            x = Dropout(0.4, name='lstm_dropout')(x)
            for i in range(self.layer_number):
                x = Bidirectional(LSTM(128, return_sequences=True),
                                  merge_mode='concat', name='bilstm_{}'.format(i))(x)
            x = Dense(64, activation='tanh', name='dense_layer')(x)
            x = Dense(1, name='dense_for_crf', activation='sigmoid')(x)
            crf_layer = CRF(1, name='crf')
            x = crf_layer(x)
            model = Model(Input_layer, x)
            model.summary()
            return model, crf_layer.loss, [crf_layer.viterbi_accuracy], embedd_layer
        else:
            x = Dropout(0.4, name='gru_dropout')(x)
            for i in range(self.layer_number):
                x = Bidirectional(GRU(128, return_sequences=True),
                                  merge_mode='concat', name='bigru_{}'.format(i))(x)
            x = Dense(64, activation='tanh', name='dense_layer')(x)
            x = Dense(1, name='dense_for_crf', activation='sigmoid')(x)
            crf_layer = CRF(1, name='crf')
            x = crf_layer(x)
            model = Model(Input_layer, x)
            model.summary()
            return model, crf_layer.loss, [crf_layer.viterbi_accuracy], embedd_layer
    else:
        from keras.layers import (Input, Dense, LSTM, GRU, Bidirectional,
                                  Embedding, Dropout, TimeDistributed, Activation)
        from keras import Model

        assert self.paths is not None
        config_path, checkpoints_path, vocab_path = self.paths
        bert_model = load_trained_model_from_checkpoint(config_file=config_path,
                                                        checkpoint_file=checkpoints_path,
                                                        training=trainable)
        inputs = bert_model.inputs[:2]
        x = (bert_model.layers[-1].output if trainable is False
             else bert_model.get_layer(name='Encoder-24-FeedForward-Norm').output)
        x = Dropout(0.4)(x)
        if self.model_name.lower() == 'lstm':
            for i in range(self.layer_number):
                x = Bidirectional(LSTM(128, return_sequences=True),
                                  merge_mode='concat', name='bilstm_{}'.format(i))(x)
            x = TimeDistributed(Dense(1), name='Time_Dense')(x)
            x = Activation('sigmoid')(x)
            model = Model(inputs, x)
            model.summary()
            return model, 'binary_crossentropy', ['acc'], bert_model
        elif self.model_name.lower() == 'gru':
            for i in range(self.layer_number):
                x = Bidirectional(GRU(128, return_sequences=True),
                                  merge_mode='concat', name='bigru_{}'.format(i))(x)
            x = TimeDistributed(Dense(1), name='Time_Dense')(x)
            x = Activation('sigmoid')(x)
            model = Model(inputs, x)
            model.summary()
            return model, 'binary_crossentropy', ['acc'], bert_model
        elif self.model_name.lower() == 'lstm-crf':
            for i in range(self.layer_number):
                x = Bidirectional(LSTM(128, return_sequences=True),
                                  merge_mode='concat', name='bilstm_{}'.format(i))(x)
            x = Dense(64, activation='tanh', name='dense_layer')(x)
            x = Dense(1, name='dense_for_crf', activation='sigmoid')(x)
            crf_layer = CRF(1, name='crf')
            x = crf_layer(x)
            model = Model(inputs, x)
            model.summary()
            return model, crf_layer.loss, [crf_layer.viterbi_accuracy], bert_model
        else:
            for i in range(self.layer_number):
                x = Bidirectional(GRU(128, return_sequences=True),
                                  merge_mode='concat', name='bigru_{}'.format(i))(x)
            x = Dense(64, activation='tanh', name='dense_layer')(x)
            x = Dense(1, name='dense_for_crf', activation='sigmoid')(x)
            crf_layer = CRF(1, name='crf')
            x = crf_layer(x)
            model = Model(inputs, x)
            model.summary()
            return model, crf_layer.loss, [crf_layer.viterbi_accuracy], bert_model
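# --- usage sketch (not part of the original code) ---
# Shows how the four return values of build_model might be consumed. The builder
# object, its attribute values, and the training-data names are assumptions drawn
# only from the attributes referenced inside build_model above.
# builder.model_name, builder.layer_number, builder.vocab_size = 'lstm-crf', 2, 20000
# builder.embedding_name = None                     # use the plain Embedding branch
# model, loss, metrics, embedding_layer = builder.build_model(trainable=False)
# model.compile(optimizer='adam', loss=loss, metrics=metrics)
# model.fit(x_train, y_train, batch_size=32, epochs=5, validation_data=(x_val, y_val))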