# Shared imports assumed by the model snippets below. Project-local helpers
# (MyLSTM, CRF, NonLinear, GELU, ScalarMix, CPUEmbedding, CharCNNFF, LSTM, Linear,
# DTTreeGRU, TDTreeGRU, creatTree, load_embedding_from_file, build_signal_embed,
# drop_bi_input_independent, drop_tri_input_independent, and the constants module C)
# come from the surrounding repository and are not reproduced here.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.utils.rnn as R
# BiLSTMModel.__init__ variant: pretrained word embeddings plus dependency-tree GRU encoders.
def __init__(self, vocab, config, pretrained_embedding):
    super(BiLSTMModel, self).__init__()
    self.config = config
    self.PAD = vocab.PAD
    extvocab_size, extword_dims = pretrained_embedding.shape
    self.word_dims = extword_dims
    if config.word_dims != extword_dims:
        print("word dim size does not match, check config file")
    self.word_embed = nn.Embedding(vocab.vocab_size, self.word_dims, padding_idx=vocab.PAD)
    if vocab.extvocab_size != extvocab_size:
        print("word vocab size does not match, check word embedding file")
    self.extword_embed = CPUEmbedding(vocab.extvocab_size, self.word_dims, padding_idx=vocab.PAD)
    word_init = np.zeros((vocab.vocab_size, self.word_dims), dtype=np.float32)
    self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
    self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding))
    self.extword_embed.weight.requires_grad = False
    self.rel_embed = nn.Embedding(vocab.rel_size, self.word_dims, padding_idx=vocab.PAD)
    self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
    nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
    self.input_dims = 2 * config.word_dims + config.predict_dims
    self.dt_tree = DTTreeGRU(self.input_dims, config.lstm_hiddens)
    self.td_tree = TDTreeGRU(self.input_dims, config.lstm_hiddens)
    self.bilstm = MyLSTM(
        input_size=2 * config.lstm_hiddens, hidden_size=config.lstm_hiddens,
        num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
        dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
    )
    self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
    nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
    self.crf = CRF(vocab.label_size)
# LstmCnnFeatGate.__init__: word and character features combined through a single feature gate.
def __init__(self, vocabs, word_embed_file, word_embed_dim, char_embed_dim,
             char_filters, char_feat_dim, lstm_hidden_size,
             lstm_dropout=.5, feat_dropout=.5, parameters=None):
    super(LstmCnnFeatGate, self).__init__()
    assert word_embed_dim == char_feat_dim
    self.vocabs = vocabs
    self.label_size = len(self.vocabs['label'])
    # input features
    if parameters is not None:
        self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                       parameters['word_embed_dim'])
    else:
        self.word_embed = load_embedding_from_file(word_embed_file, word_embed_dim,
                                                   vocabs['token'], vocabs['embed'],
                                                   vocabs['form'], padding_idx=C.PAD_INDEX,
                                                   trainable=True)
    self.char_embed = CharCNNFF(len(vocabs['char']), char_embed_dim, char_filters,
                                output_size=char_feat_dim)
    # word dim = char_dim = feat_dim in this model
    self.word_dim = self.word_embed.embedding_dim
    self.char_dim = self.char_embed.output_size
    self.feat_dim = self.word_dim
    # layers
    self.char_gate = Linear(self.char_dim, self.char_dim, bias=False)
    self.word_gate = Linear(self.word_dim, self.word_dim, bias=False)
    self.gate = Linear(self.feat_dim, self.feat_dim, bias=False)
    self.lstm = LSTM(input_size=self.feat_dim, hidden_size=lstm_hidden_size,
                     batch_first=True, bidirectional=True)
    self.output_linear = Linear(self.lstm.output_size, self.label_size)
    self.crf = CRF(vocabs['label'])
    self.feat_dropout = nn.Dropout(p=feat_dropout)
    self.lstm_dropout = nn.Dropout(p=lstm_dropout)
# BiLSTMModel.__init__ variant: ELMo input plus scalar-mixed features from a pretrained parser.
def __init__(self, vocab, config, parser_config, elmo_shape):
    super(BiLSTMModel, self).__init__()
    self.config = config
    self.PAD = vocab.PAD
    self.word_dims = config.word_dims
    self.elmo_layers = elmo_shape[0]
    self.elmo_dims = elmo_shape[1]
    weights = torch.randn(self.elmo_layers)
    self.weights = torch.nn.Parameter(weights, requires_grad=True)
    self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)
    self.transformer_emb = nn.Linear(parser_config.word_dims, self.word_dims, bias=False)
    parser_dim = 2 * parser_config.lstm_hiddens
    transformer_lstm = []
    for layer in range(parser_config.lstm_layers):
        transformer_lstm.append(nn.Linear(parser_dim, self.word_dims, bias=False))
    self.transformer_lstm = nn.ModuleList(transformer_lstm)
    parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
    self.transformer_dep = nn.Linear(parser_mlp_dim, self.word_dims, bias=False)
    self.transformer_head = nn.Linear(parser_mlp_dim, self.word_dims, bias=False)
    self.parser_lstm_layers = parser_config.lstm_layers
    self.synscale = ScalarMix(mixture_size=3 + parser_config.lstm_layers)
    self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
    nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
    self.lstm_input_dims = 2 * config.word_dims + config.predict_dims
    self.bilstm = MyLSTM(
        input_size=self.lstm_input_dims, hidden_size=config.lstm_hiddens,
        num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
        dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
    )
    self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
    nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
    self.crf = CRF(vocab.label_size)
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, parser_config, input_dims, bert_layers):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.input_dims = input_dims
        self.input_depth = bert_layers if config.bert_tune == 0 else 1
        self.hidden_dims = 2 * config.lstm_hiddens
        self.projections = nn.ModuleList([
            NonLinear(self.input_dims, self.hidden_dims, activation=GELU())
            for i in range(self.input_depth)
        ])
        self.rescale = ScalarMix(mixture_size=self.input_depth)
        # forward() also projects the parser's word-embedding output, so the
        # corresponding layer is defined here as well (it was missing from the
        # flattened original; dimensions follow the other transformer_* layers).
        self.transformer_emb = NonLinear(parser_config.word_dims, self.hidden_dims,
                                         activation=GELU())
        parser_dim = 2 * parser_config.lstm_hiddens
        self.transformer_lstm = nn.ModuleList([
            NonLinear(parser_dim, self.hidden_dims, activation=GELU())
            for i in range(parser_config.lstm_layers)
        ])
        parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
        self.transformer_dep = NonLinear(parser_mlp_dim, self.hidden_dims, activation=GELU())
        self.transformer_head = NonLinear(parser_mlp_dim, self.hidden_dims, activation=GELU())
        self.parser_lstm_layers = parser_config.lstm_layers
        self.synscale = ScalarMix(mixture_size=3 + parser_config.lstm_layers)
        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
        self.lstm_input_dims = 2 * self.hidden_dims + config.predict_dims
        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims, hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
            dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
        )
        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
        self.crf = CRF(vocab.label_size)

    def forward(self, inputs, predicts, synxs, masks):
        # x = (batch size, sequence length, dimension of embedding)
        proj_hiddens = []
        for idx, input in enumerate(inputs):
            cur_hidden = self.projections[idx](input)
            proj_hiddens.append(cur_hidden)
        x_embed = self.rescale(proj_hiddens)
        syn_idx = 0
        x_syns = []
        x_syn_emb = self.transformer_emb(synxs[syn_idx])
        x_syns.append(x_syn_emb)
        syn_idx += 1
        for layer in range(self.parser_lstm_layers):
            x_syn_lstm = self.transformer_lstm[layer].forward(synxs[syn_idx])
            syn_idx += 1
            x_syns.append(x_syn_lstm)
        x_syn_dep = self.transformer_dep(synxs[syn_idx])
        x_syns.append(x_syn_dep)
        syn_idx += 1
        x_syn_head = self.transformer_head(synxs[syn_idx])
        x_syns.append(x_syn_head)
        syn_idx += 1
        x_syn = self.synscale(x_syns)
        x_predict_embed = self.predicate_embed(predicts)
        if self.training:
            x_embed, x_syn, x_predict_embed = drop_tri_input_independent(
                x_embed, x_syn, x_predict_embed, self.config.dropout_emb)
        embeddings = torch.cat((x_embed, x_syn, x_predict_embed), dim=2)
        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)
        label_scores = self.outlayer(lstm_out)
        return label_scores

    def compute_loss(self, output, answer, masks):
        # output: [B, T, L], answer: [B, T], mask: [B, T, L]
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, masks)
        num_words = masks.float().sum()
        total_loss = total_loss / num_words
        return total_loss

    def decode(self, label_scores, masks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, masks)
        return tag_seq

    def save(self, filepath):
        """Save model parameters to file."""
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """Load model parameters from file."""
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
class LstmCnn(nn.Module):
    def __init__(self, vocabs, word_embed_file, word_embed_dim, char_embed_dim,
                 char_filters, char_feat_dim, lstm_hidden_size,
                 lstm_dropout=0, feat_dropout=0, parameters=None):
        super(LstmCnn, self).__init__()
        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])
        # input features
        if parameters is not None:
            self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                           parameters['word_embed_dim'],
                                           padding_idx=C.PAD_INDEX)
        else:
            self.word_embed = load_embedding_from_file(word_embed_file, word_embed_dim,
                                                       vocabs['token'], vocabs['embed'],
                                                       vocabs['form'], padding_idx=C.PAD_INDEX,
                                                       trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']), char_embed_dim, char_filters,
                                    output_size=char_feat_dim)
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim + self.word_dim
        # layers
        self.lstm = LSTM(input_size=self.feat_dim, hidden_size=lstm_hidden_size,
                         batch_first=True, bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)

    @property
    def params(self):
        return {
            'word_embed_num': self.word_embed.num_embeddings,
            'word_embed_dim': self.word_embed.embedding_dim
        }

    def forward_nn(self, token_ids, char_ids, lens):
        batch_size, seq_len = token_ids.size()
        # word representation
        word_in = self.word_embed(token_ids)
        char_in = self.char_embed(char_ids)
        char_in = char_in.view(batch_size, seq_len, self.char_dim)
        feats = torch.cat([word_in, char_in], dim=2)
        feats = self.feat_dropout(feats)
        # LSTM layer
        lstm_in = R.pack_padded_sequence(feats, lens.tolist(), batch_first=True)
        lstm_out, _ = self.lstm(lstm_in)
        lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.lstm_dropout(lstm_out)
        # output linear layer
        linear_out = self.output_linear(lstm_out)
        return linear_out

    def forward(self, token_ids, char_ids, lens, labels):
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        norm_score = self.crf.calc_norm_score(logits, lens)
        gold_score = self.crf.calc_gold_score(logits, labels, lens)
        loglik = gold_score - norm_score
        return loglik, logits

    def predict(self, token_ids, char_ids, lens):
        self.eval()
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        _scores, preds = self.crf.viterbi_decode(logits, lens)
        preds = preds.data.tolist()
        self.train()
        return preds
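# forward_nn above packs the padded batch before the LSTM and unpacks it
# afterwards. The standalone sketch below shows that pattern with a plain
# nn.LSTM (the repository's LSTM/CharCNNFF wrappers are not needed here);
# the function name and all sizes are illustrative.
def packed_lstm_demo():
    lstm = nn.LSTM(input_size=8, hidden_size=6, batch_first=True, bidirectional=True)
    feats = torch.randn(3, 5, 8)                 # [batch, max_len, feat_dim]
    lens = torch.tensor([5, 4, 2])               # true lengths, sorted descending
    packed = R.pack_padded_sequence(feats, lens.tolist(), batch_first=True)
    out, _ = lstm(packed)
    out, _ = R.pad_packed_sequence(out, batch_first=True)
    # out: [batch, max_len, 2 * hidden]; positions beyond each length stay zero
    assert out.shape == (3, 5, 12)
    return out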
class LstmCnnGate(nn.Module):
    def __init__(self, vocabs, counters, word_embed_file, word_embed_dim,
                 char_embed_dim, char_filters, char_feat_dim, lstm_hidden_size,
                 lstm_dropout=0.5, feat_dropout=0.5):
        # TODO: init function for saved model
        super(LstmCnnGate, self).__init__()
        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])
        # input features
        self.word_embed = load_embedding_from_file(word_embed_file, word_embed_dim,
                                                   vocabs['token'], vocabs['embed'],
                                                   vocabs['form'], padding_idx=C.PAD_INDEX,
                                                   trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']), char_embed_dim, char_filters,
                                    output_size=char_feat_dim)
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim
        # layers
        self.lstm = LSTM(input_size=self.feat_dim, hidden_size=lstm_hidden_size,
                         batch_first=True, bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
        self.lstm_size = self.lstm.output_size
        self.uni_lstm_size = self.lstm_size // 2
        # word representation level
        self.word_gate = Linear(self.word_dim, self.word_dim)
        self.char_gate = Linear(self.word_dim, self.word_dim)
        # feature extraction level
        # context-only feature linear layers
        self.cof_linear_fwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        self.cof_linear_bwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        # hidden states gates
        self.hs_gates = nn.ModuleList([
            Linear(self.uni_lstm_size, self.uni_lstm_size),
            Linear(self.uni_lstm_size, self.uni_lstm_size)
        ])
        # context-only feature gates
        self.cof_gates = nn.ModuleList([
            Linear(self.uni_lstm_size, self.uni_lstm_size),
            Linear(self.uni_lstm_size, self.uni_lstm_size)
        ])

    def _repr_gate(self, word, char):
        gate_w = self.word_gate(word)
        gate_c = self.char_gate(char)
        gate = gate_w + gate_c
        gate = gate.sigmoid()
        return gate

    def _feat_gate(self, hs, cof, idx):
        """Calculate feature extraction level gates.

        :param hs: Hidden states.
        :param cof: Context-only features.
        """
        gate_h = self.hs_gates[idx](hs)
        gate_c = self.cof_gates[idx](cof)
        gate = gate_h + gate_c
        gate = gate.sigmoid()
        return gate

    def forward_nn(self, token_ids, char_ids, lens):
        batch_size, seq_len = token_ids.size()
        word_dim = self.word_dim
        char_dim = self.char_dim
        # word representations
        word_in = self.word_embed(token_ids)
        char_in = self.char_embed(char_ids)
        char_in = char_in.view(batch_size, seq_len, char_dim)
        # combine features
        repr_mix_gate = self._repr_gate(word_in, char_in)
        feats = repr_mix_gate * word_in + (1 - repr_mix_gate) * char_in
        feats = self.feat_dropout(feats)
        # LSTM layer
        lstm_in = R.pack_padded_sequence(feats, lens.tolist(), batch_first=True)
        lstm_out, _ = self.lstm(lstm_in)
        lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.lstm_dropout(lstm_out)
        # context-only features (cof)
        hs_pad = lstm_out.new_zeros([batch_size, 1, self.uni_lstm_size],
                                    requires_grad=False)
        hs_fwd = lstm_out[:, :, :self.uni_lstm_size]
        hs_bwd = lstm_out[:, :, self.uni_lstm_size:]
        hs_fwd_padded = torch.cat([hs_pad, hs_fwd], dim=1)[:, :-1, :]
        hs_bwd_padded = torch.cat([hs_bwd, hs_pad], dim=1)[:, 1:, :]
        cof_fwd = self.cof_linear_fwd(hs_fwd_padded).tanh()
        cof_bwd = self.cof_linear_bwd(hs_bwd_padded).tanh()
        # feature extract level gates
        feat_mix_gate_fwd = self._feat_gate(hs_fwd, cof_fwd, 0)
        feat_mix_gate_bwd = self._feat_gate(hs_bwd, cof_bwd, 1)
        # enhanced hidden states
        hs_fwd_enh = feat_mix_gate_fwd * hs_fwd + (1 - feat_mix_gate_fwd) * cof_fwd
        hs_bwd_enh = feat_mix_gate_bwd * hs_bwd + (1 - feat_mix_gate_bwd) * cof_bwd
        hs_enh = torch.cat([hs_fwd_enh, hs_bwd_enh], dim=2)
        # output linear layer
        linear_out = self.output_linear(hs_enh)
        return linear_out

    def forward(self, token_ids, char_ids, lens, labels):
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        norm_score = self.crf.calc_norm_score(logits, lens)
        gold_score = self.crf.calc_gold_score(logits, labels, lens)
        loglik = gold_score - norm_score
        return loglik, logits

    def predict(self, token_ids, char_ids, lens):
        self.eval()
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        _scores, preds = self.crf.viterbi_decode(logits, lens)
        preds = preds.data.tolist()
        self.train()
        return preds
class LstmCnnDfc(nn.Module):
    def __init__(self, vocabs, counters, word_embed_file, word_embed_dim,
                 char_embed_dim, char_filters, char_feat_dim, lstm_hidden_size,
                 lstm_dropout=0.5, feat_dropout=0.5, signal_dropout=0,
                 ctx_size=5, use_signal=True, parameters=None):
        assert char_feat_dim >= word_embed_dim
        super(LstmCnnDfc, self).__init__()
        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])
        self.use_signal = use_signal
        # input features
        if parameters is not None:
            self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                           parameters['word_embed_dim'])
        else:
            self.word_embed = load_embedding_from_file(word_embed_file, word_embed_dim,
                                                       vocabs['token'], vocabs['embed'],
                                                       vocabs['form'], padding_idx=C.PAD_INDEX,
                                                       trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']), char_embed_dim, char_filters,
                                    output_size=char_feat_dim)
        if use_signal:
            if parameters is not None:
                self.signal_embed = nn.Embedding(parameters['signal_embed_num'],
                                                 parameters['signal_embed_dim'])
            else:
                self.signal_embed = build_signal_embed(counters['embed'], counters['token'],
                                                       vocabs['token'], vocabs['form'])
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim
        # signal_embed only exists when use_signal is True; guard added so the
        # constructor does not fail for use_signal=False
        self.signal_dim = self.signal_embed.embedding_dim if use_signal else 0
        self.ctx_size = ctx_size
        # layers
        self.lstm = LSTM(input_size=self.feat_dim, hidden_size=lstm_hidden_size,
                         batch_first=True, bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
        self.signal_dropout = nn.Dropout(p=signal_dropout)
        self.lstm_size = self.lstm.output_size
        self.uni_lstm_size = self.lstm_size // 2
        # word representation level
        self.word_gates = nn.ModuleList([
            Linear(self.word_dim, self.word_dim),
            Linear(self.word_dim, self.word_dim)
        ])
        self.char_gates = nn.ModuleList([
            Linear(self.word_dim, self.word_dim),
            Linear(self.word_dim, self.word_dim)
        ])
        if use_signal:
            self.signal_gates = nn.ModuleList([
                Linear(self.signal_dim, self.word_dim),
                Linear(self.signal_dim, self.word_dim)
            ])
        # feature extraction level
        # context-only feature linear layers
        self.cof_linear_fwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        self.cof_linear_bwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        # hidden states gates
        self.hs_gates = nn.ModuleList(
            [Linear(self.uni_lstm_size, self.uni_lstm_size) for _ in range(4)])
        # context-only feature gates
        self.cof_gates = nn.ModuleList(
            [Linear(self.uni_lstm_size, self.uni_lstm_size) for _ in range(4)])
        if use_signal:
            self.crs_gates = nn.ModuleList([
                Linear(self.signal_dim * (ctx_size + 1), self.uni_lstm_size)
                for _ in range(4)
            ])

    @property
    def params(self):
        return {
            'word_embed_num': self.word_embed.num_embeddings,
            'word_embed_dim': self.word_embed.embedding_dim,
            'signal_embed_num': self.signal_embed.num_embeddings,
            'signal_embed_dim': self.signal_embed.embedding_dim
        }

    def _repr_gate(self, word, char, signal=None, idx=0):
        gate_w = self.word_gates[idx](word)
        gate_c = self.char_gates[idx](char)
        if self.use_signal:
            gate_s = self.signal_gates[idx](self.signal_dropout(signal))
            gate = gate_w + gate_c + gate_s
        else:
            gate = gate_w + gate_c
        gate = gate.sigmoid()
        return gate

    def _feat_gate(self, hs, cof, crs=None, idx=0):
        """Calculate feature extraction level gates.

        :param hs: Hidden states.
        :param cof: Context-only features.
        :param crs: Context reliability signals.
        """
        gate_h = self.hs_gates[idx](hs)
        gate_c = self.cof_gates[idx](cof)
        if self.use_signal:
            gate_s = self.crs_gates[idx](self.signal_dropout(crs))
            gate = gate_h + gate_c + gate_s
        else:
            gate = gate_h + gate_c
        gate = gate.sigmoid()
        return gate

    def forward_nn(self, token_ids, char_ids, lens):
        batch_size, seq_len = token_ids.size()
        word_dim = self.word_dim
        char_dim = self.char_dim
        signal_dim = self.signal_dim
        use_signal = self.use_signal
        ctx_size = self.ctx_size
        # word representations
        word_in = self.word_embed(token_ids)
        char_in = self.char_embed(char_ids)
        char_in = char_in.view(batch_size, seq_len, char_dim)
        signal_in = self.signal_embed(token_ids) if use_signal else None
        # combine features
        if char_dim == word_dim:
            # without additional char features
            repr_mix_gate_1 = self._repr_gate(word_in, char_in, signal_in, 0)
            repr_mix_gate_2 = self._repr_gate(word_in, char_in, signal_in, 1)
            feats = repr_mix_gate_1 * word_in + repr_mix_gate_2 * char_in
        else:
            # with additional char features
            char_in_alt = char_in[:, :, :word_dim]
            char_in_cat = char_in[:, :, word_dim:]
            repr_mix_gate_1 = self._repr_gate(word_in, char_in_alt, signal_in, 0)
            repr_mix_gate_2 = self._repr_gate(word_in, char_in_alt, signal_in, 1)
            feats = repr_mix_gate_1 * word_in + repr_mix_gate_2 * char_in_alt
            feats = torch.cat([feats, char_in_cat], dim=2)
        feats = self.feat_dropout(feats)
        # LSTM layer
        lstm_in = R.pack_padded_sequence(feats, lens.tolist(), batch_first=True)
        lstm_out, _ = self.lstm(lstm_in)
        lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.lstm_dropout(lstm_out)
        # context reliability signals (crs)
        if use_signal:
            rs_pad = lstm_out.new_zeros([batch_size, ctx_size, signal_dim],
                                        requires_grad=False)
            signal_in_padded = torch.cat([rs_pad, signal_in, rs_pad], dim=1)
            signal_in_padded = signal_in_padded.view(batch_size, -1)
            crs = signal_in_padded.unfold(1, signal_dim * (ctx_size + 1), signal_dim)
            crs_fwd = crs[:, :-ctx_size, :]
            crs_bwd = crs[:, ctx_size:, :]
        else:
            crs_fwd = crs_bwd = None
        # context-only features (cof)
        hs_pad = lstm_out.new_zeros([batch_size, 1, self.uni_lstm_size],
                                    requires_grad=False)
        hs_fwd = lstm_out[:, :, :self.uni_lstm_size]
        hs_bwd = lstm_out[:, :, self.uni_lstm_size:]
        hs_fwd_padded = torch.cat([hs_pad, hs_fwd], dim=1)[:, :-1, :]
        hs_bwd_padded = torch.cat([hs_bwd, hs_pad], dim=1)[:, 1:, :]
        cof_fwd = self.cof_linear_fwd(hs_fwd_padded).tanh()
        cof_bwd = self.cof_linear_bwd(hs_bwd_padded).tanh()
        # feature extraction level gates
        feat_mix_gate_fwd_1 = self._feat_gate(hs_fwd, cof_fwd, crs_fwd, 0)
        feat_mix_gate_fwd_2 = self._feat_gate(hs_fwd, cof_fwd, crs_fwd, 1)
        feat_mix_gate_bwd_1 = self._feat_gate(hs_bwd, cof_bwd, crs_bwd, 2)
        feat_mix_gate_bwd_2 = self._feat_gate(hs_bwd, cof_bwd, crs_bwd, 3)
        # enhanced hidden states
        hs_fwd_enh = feat_mix_gate_fwd_1 * hs_fwd + feat_mix_gate_fwd_2 * cof_fwd
        hs_bwd_enh = feat_mix_gate_bwd_1 * hs_bwd + feat_mix_gate_bwd_2 * cof_bwd
        hs_enh = torch.cat([hs_fwd_enh, hs_bwd_enh], dim=2)
        # output linear layer
        linear_out = self.output_linear(hs_enh)
        return linear_out

    def forward(self, token_ids, char_ids, lens, labels):
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        norm_score = self.crf.calc_norm_score(logits, lens)
        gold_score = self.crf.calc_gold_score(logits, labels, lens)
        loglik = gold_score - norm_score
        return loglik, logits

    def predict(self, token_ids, char_ids, lens):
        self.eval()
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        _scores, preds = self.crf.viterbi_decode(logits, lens)
        preds = preds.data.tolist()
        self.train()
        return preds
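# The context reliability signals above are built by zero-padding the signal
# sequence on both sides and sliding a window of (ctx_size + 1) positions over
# it with Tensor.unfold. A standalone shape check of that exact recipe
# (function name and sizes are illustrative):
def crs_window_demo():
    batch_size, seq_len, signal_dim, ctx_size = 2, 4, 3, 2
    signal_in = torch.randn(batch_size, seq_len, signal_dim)
    rs_pad = signal_in.new_zeros(batch_size, ctx_size, signal_dim)
    padded = torch.cat([rs_pad, signal_in, rs_pad], dim=1).view(batch_size, -1)
    crs = padded.unfold(1, signal_dim * (ctx_size + 1), signal_dim)
    assert crs.shape == (batch_size, seq_len + ctx_size, signal_dim * (ctx_size + 1))
    crs_fwd = crs[:, :-ctx_size, :]   # window ending at each token (left context + token)
    crs_bwd = crs[:, ctx_size:, :]    # window starting at each token (token + right context)
    assert crs_fwd.shape == crs_bwd.shape == (batch_size, seq_len, signal_dim * (ctx_size + 1))
    return crs_fwd, crs_bwd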
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, pretrained_embedding):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        extvocab_size, extword_dims = pretrained_embedding.shape
        self.word_dims = extword_dims
        if config.word_dims != extword_dims:
            print("word dim size does not match, check config file")
        self.word_embed = nn.Embedding(vocab.vocab_size, self.word_dims, padding_idx=vocab.PAD)
        if vocab.extvocab_size != extvocab_size:
            print("word vocab size does not match, check word embedding file")
        self.extword_embed = CPUEmbedding(vocab.extvocab_size, self.word_dims,
                                          padding_idx=vocab.PAD)
        word_init = np.zeros((vocab.vocab_size, self.word_dims), dtype=np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
        self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding))
        self.extword_embed.weight.requires_grad = False
        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
        self.input_dims = config.word_dims + config.predict_dims
        self.bilstm = MyLSTM(
            input_size=self.input_dims, hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
            dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
        )
        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
        self.crf = CRF(vocab.label_size)

    def forward(self, words, extwords, predicts, masks):
        # x = (batch size, sequence length, dimension of embedding)
        x_word_embed = self.word_embed(words)
        x_extword_embed = self.extword_embed(extwords)
        x_embed = x_word_embed + x_extword_embed
        x_predict_embed = self.predicate_embed(predicts)
        if self.training:
            x_embed, x_predict_embed = drop_bi_input_independent(
                x_embed, x_predict_embed, self.config.dropout_emb)
        embeddings = torch.cat((x_embed, x_predict_embed), dim=2)
        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)
        label_scores = self.outlayer(lstm_out)
        return label_scores

    def compute_loss(self, output, answer, masks):
        # output: [B, T, L], answer: [B, T], mask: [B, T, L]
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, masks)
        num_words = masks.float().sum()
        total_loss = total_loss / num_words
        return total_loss

    def decode(self, label_scores, masks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, masks)
        return tag_seq

    def save(self, filepath):
        """Save model parameters to file."""
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """Load model parameters from file."""
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, elmo_shape):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.word_dims = config.word_dims
        self.elmo_layers = elmo_shape[0]
        self.elmo_dims = elmo_shape[1]
        weights = torch.randn(self.elmo_layers)
        self.weights = torch.nn.Parameter(weights, requires_grad=True)
        self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)
        self.rel_embed = nn.Embedding(vocab.rel_size, self.word_dims, padding_idx=vocab.PAD)
        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
        self.input_dims = 2 * config.word_dims + config.predict_dims
        self.dt_tree = DTTreeGRU(self.input_dims, config.lstm_hiddens)
        self.td_tree = TDTreeGRU(self.input_dims, config.lstm_hiddens)
        self.lstm_input_dims = 2 * config.lstm_hiddens
        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims, hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
            dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
        )
        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
        self.crf = CRF(vocab.label_size)

    def forward(self, elmos, predicts, masks, rels, heads, lengths):
        # x = (batch size, sequence length, dimension of embedding)
        elmos = elmos.permute(0, 2, 3, 1).matmul(self.weights)
        x_embed = self.mlp_elmo(elmos)
        x_rel_embed = self.rel_embed(rels)
        x_predict_embed = self.predicate_embed(predicts)
        if self.training:
            x_embed, x_rel_embed, x_predict_embed = drop_tri_input_independent(
                x_embed, x_rel_embed, x_predict_embed, self.config.dropout_emb)
        x_lexical = torch.cat((x_embed, x_rel_embed, x_predict_embed), dim=2)
        x_lexical = x_lexical.transpose(1, 0)
        max_length, batch_size, input_dim = x_lexical.size()
        trees = []
        indexes = np.zeros((max_length, batch_size), dtype=np.int32)
        for b, head in enumerate(heads):
            root, tree = creatTree(head)
            root.traverse()
            for step, index in enumerate(root.order):
                indexes[step, b] = index
            trees.append(tree)
        dt_outputs, dt_hidden_ts = self.dt_tree(x_lexical, indexes, trees, lengths)
        td_outputs, td_hidden_ts = self.td_tree(x_lexical, indexes, trees, lengths)
        tree_outputs = torch.cat([dt_outputs, td_outputs], dim=2)
        lstm_out, _ = self.bilstm(tree_outputs, masks)
        lstm_out = lstm_out.transpose(1, 0)
        label_scores = self.outlayer(lstm_out)
        return label_scores

    def compute_loss(self, output, answer, masks):
        # output: [B, T, L], answer: [B, T], mask: [B, T, L]
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, masks)
        num_words = masks.float().sum()
        total_loss = total_loss / num_words
        return total_loss

    def decode(self, label_scores, masks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, masks)
        return tag_seq

    def save(self, filepath):
        """Save model parameters to file."""
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """Load model parameters from file."""
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
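# forward() above mixes the ELMo layers with a learned weight vector; the
# permute implies an input layout of [batch, layers, seq, dim]. A standalone
# shape check of that contraction (function name and sizes are illustrative):
def elmo_mix_demo():
    batch, layers, seq, dim = 2, 3, 5, 8
    elmos = torch.randn(batch, layers, seq, dim)
    weights = torch.randn(layers)
    mixed = elmos.permute(0, 2, 3, 1).matmul(weights)   # weighted sum over layers
    assert mixed.shape == (batch, seq, dim)
    # same result as an explicit weighted sum over the layer axis
    explicit = (elmos * weights.view(1, layers, 1, 1)).sum(dim=1)
    assert torch.allclose(mixed, explicit, atol=1e-6)
    return mixed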
# BiLSTMModel.__init__ variant: pretrained word embeddings plus scalar-mixed parser features.
def __init__(self, vocab, config, parser_config, pretrained_embedding):
    super(BiLSTMModel, self).__init__()
    self.config = config
    self.PAD = vocab.PAD
    extvocab_size, extword_dims = pretrained_embedding.shape
    self.word_dims = extword_dims
    if config.word_dims != extword_dims:
        print("word dim size does not match, check config file")
    self.word_embed = nn.Embedding(vocab.vocab_size, self.word_dims, padding_idx=vocab.PAD)
    if vocab.extvocab_size != extvocab_size:
        print("word vocab size does not match, check word embedding file")
    self.extword_embed = CPUEmbedding(vocab.extvocab_size, self.word_dims,
                                      padding_idx=vocab.PAD)
    word_init = np.zeros((vocab.vocab_size, self.word_dims), dtype=np.float32)
    self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
    self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding))
    self.extword_embed.weight.requires_grad = False
    self.transformer_emb = nn.Linear(parser_config.word_dims, self.word_dims, bias=False)
    parser_dim = 2 * parser_config.lstm_hiddens
    transformer_lstm = []
    for layer in range(parser_config.lstm_layers):
        transformer_lstm.append(nn.Linear(parser_dim, self.word_dims, bias=False))
    self.transformer_lstm = nn.ModuleList(transformer_lstm)
    parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
    self.transformer_dep = nn.Linear(parser_mlp_dim, self.word_dims, bias=False)
    self.transformer_head = nn.Linear(parser_mlp_dim, self.word_dims, bias=False)
    self.parser_lstm_layers = parser_config.lstm_layers
    self.synscale = ScalarMix(mixture_size=3 + parser_config.lstm_layers)
    self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
    nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
    self.input_dims = 2 * config.word_dims + config.predict_dims
    self.bilstm = MyLSTM(
        input_size=self.input_dims, hidden_size=config.lstm_hiddens,
        num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
        dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
    )
    self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
    nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
    self.crf = CRF(vocab.label_size)
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, elmo_shape):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.word_dims = config.word_dims
        self.elmo_layers = elmo_shape[0]
        self.elmo_dims = elmo_shape[1]
        weights = torch.randn(self.elmo_layers)
        self.weights = torch.nn.Parameter(weights, requires_grad=True)
        self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)
        self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
        word_init = np.random.randn(vocab.vocab_size, config.word_dims).astype(np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
        self.lstm_input_dims = config.word_dims + config.predict_dims
        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims, hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
            dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
        )
        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
        self.crf = CRF(vocab.label_size)

    def forward(self, elmos, actions, predicts, masks, indices):
        # x = (batch size, sequence length, dimension of embedding)
        elmos = elmos.matmul(self.weights)
        x_elmo_embed = self.mlp_elmo(elmos)
        x_action_embed = self.word_embed(actions)
        x_embed = x_elmo_embed + x_action_embed
        x_predict_embed = self.predicate_embed(predicts)
        if self.training:
            x_embed, x_predict_embed = drop_bi_input_independent(
                x_embed, x_predict_embed, self.config.dropout_emb)
        embeddings = torch.cat((x_embed, x_predict_embed), dim=2)
        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)
        filtered = torch.gather(lstm_out, 1, indices)
        label_scores = self.outlayer(filtered)
        return label_scores

    def compute_loss(self, output, answer, wmasks):
        # output: [B, T, L], answer: [B, T], mask: [B, T, L]
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, wmasks)
        num_words = wmasks.float().sum()
        total_loss = total_loss / num_words
        return total_loss

    def decode(self, label_scores, wmasks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, wmasks)
        return tag_seq

    def save(self, filepath):
        """Save model parameters to file."""
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """Load model parameters from file."""
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, input_dims, bert_layers):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.input_dims = input_dims
        self.input_depth = bert_layers if config.bert_tune == 0 else 1
        self.hidden_dims = config.word_dims
        self.projections = nn.ModuleList([
            NonLinear(self.input_dims, self.hidden_dims, activation=GELU())
            for i in range(self.input_depth)
        ])
        self.rescale = ScalarMix(mixture_size=self.input_depth)
        self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
        word_init = np.random.randn(vocab.vocab_size, config.word_dims).astype(np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))
        self.lstm_input_dims = config.word_dims + config.predict_dims
        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims, hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers, batch_first=True, bidirectional=True,
            dropout_in=config.dropout_lstm_input, dropout_out=config.dropout_lstm_hidden,
        )
        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))
        self.crf = CRF(vocab.label_size)

    def forward(self, inputs, actions, predicts, masks, indices):
        # x = (batch size, sequence length, dimension of embedding)
        proj_hiddens = []
        for idx, input in enumerate(inputs):
            cur_hidden = self.projections[idx](input)
            proj_hiddens.append(cur_hidden)
        x_bert_embed = self.rescale(proj_hiddens)
        x_action_embed = self.word_embed(actions)
        x_embed = x_bert_embed + x_action_embed
        x_predict_embed = self.predicate_embed(predicts)
        if self.training:
            x_embed, x_predict_embed = drop_bi_input_independent(
                x_embed, x_predict_embed, self.config.dropout_emb)
        embeddings = torch.cat((x_embed, x_predict_embed), dim=2)
        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)
        filtered = torch.gather(lstm_out, 1, indices)
        label_scores = self.outlayer(filtered)
        return label_scores

    def compute_loss(self, output, answer, wmasks):
        # output: [B, T, L], answer: [B, T], mask: [B, T, L]
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, wmasks)
        num_words = wmasks.float().sum()
        total_loss = total_loss / num_words
        return total_loss

    def decode(self, label_scores, wmasks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, wmasks)
        return tag_seq

    def save(self, filepath):
        """Save model parameters to file."""
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """Load model parameters from file."""
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
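# forward() above keeps only selected positions of the LSTM output with
# torch.gather(lstm_out, 1, indices); the indices are assumed to be word
# positions expanded over the hidden dimension (how they are built lies
# outside this snippet). A standalone sketch with illustrative sizes:
def gather_positions_demo():
    batch, seq, hidden = 2, 6, 4
    lstm_out = torch.randn(batch, seq, hidden)
    word_pos = torch.tensor([[0, 2, 5], [1, 3, 4]])            # positions to keep
    indices = word_pos.unsqueeze(-1).expand(-1, -1, hidden)    # [batch, n_words, hidden]
    filtered = torch.gather(lstm_out, 1, indices)
    assert filtered.shape == (batch, 3, hidden)
    assert torch.equal(filtered[0, 1], lstm_out[0, 2])         # row 2 of sentence 0 kept
    return filtered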