Example #1
    def __init__(self, vocab, config, elmo_shape):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.word_dims = config.word_dims
        self.elmo_layers = elmo_shape[0]
        self.elmo_dims = elmo_shape[1]

        weights = torch.randn(self.elmo_layers)
        self.weights = torch.nn.Parameter(weights, requires_grad=True)
        self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)

        self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
        word_init = np.random.randn(vocab.vocab_size, config.word_dims).astype(np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))

        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))

        self.lstm_input_dims = config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))

        self.crf = CRF(vocab.label_size)
Example #2
    def __init__(self, vocab, config, input_dims, bert_layers):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.input_dims = input_dims
        self.input_depth = bert_layers if config.bert_tune == 0 else 1
        self.hidden_dims = config.word_dims
        self.projections = nn.ModuleList([
            NonLinear(self.input_dims, self.hidden_dims, activation=GELU())
            for _ in range(self.input_depth)
        ])

        self.rescale = ScalarMix(mixture_size=self.input_depth)

        self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
        word_init = np.random.randn(vocab.vocab_size, config.word_dims).astype(np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))

        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))

        self.lstm_input_dims = config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))

        self.crf = CRF(vocab.label_size)
Example #3
    def __init__(self, vocab, config, pretrained_embedding):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        extvocab_size, extword_dims = pretrained_embedding.shape
        self.word_dims = extword_dims
        if config.word_dims != extword_dims:
            print("word dim size does not match, check config file")
        self.word_embed = nn.Embedding(vocab.vocab_size,
                                       self.word_dims,
                                       padding_idx=vocab.PAD)
        if vocab.extvocab_size != extvocab_size:
            print("word vocab size does not match, check word embedding file")
        self.extword_embed = CPUEmbedding(vocab.extvocab_size,
                                          self.word_dims,
                                          padding_idx=vocab.PAD)

        word_init = np.zeros((vocab.vocab_size, self.word_dims),
                             dtype=np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
        self.extword_embed.weight.data.copy_(
            torch.from_numpy(pretrained_embedding))
        self.extword_embed.weight.requires_grad = False

        self.rel_embed = nn.Embedding(vocab.rel_size,
                                      self.word_dims,
                                      padding_idx=vocab.PAD)

        self.predicate_embed = nn.Embedding(3,
                                            config.predict_dims,
                                            padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0,
                        1.0 / (config.predict_dims**0.5))

        self.input_dims = 2 * config.word_dims + config.predict_dims

        self.dt_tree = DTTreeGRU(self.input_dims, config.lstm_hiddens)
        self.td_tree = TDTreeGRU(self.input_dims, config.lstm_hiddens)

        self.bilstm = MyLSTM(
            input_size=2 * config.lstm_hiddens,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens,
                                  vocab.label_size,
                                  bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0,
                        1.0 / ((2 * config.lstm_hiddens)**0.5))

        self.crf = CRF(vocab.label_size)
Example #4
    def __init__(self, vocab, config, parser_config, input_dims, bert_layers):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.input_dims = input_dims
        self.input_depth = bert_layers if config.bert_tune == 0 else 1
        self.hidden_dims = 2 * config.lstm_hiddens
        self.projections = nn.ModuleList([
            NonLinear(self.input_dims, self.hidden_dims, activation=GELU())
            for _ in range(self.input_depth)
        ])

        self.rescale = ScalarMix(mixture_size=self.input_depth)

        parser_dim = 2 * parser_config.lstm_hiddens
        self.transformer_lstm = nn.ModuleList([
            NonLinear(parser_dim, self.hidden_dims, activation=GELU())
            for i in range(parser_config.lstm_layers)
        ])

        parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
        self.transformer_dep = NonLinear(parser_mlp_dim,
                                         self.hidden_dims,
                                         activation=GELU())
        self.transformer_head = NonLinear(parser_mlp_dim,
                                          self.hidden_dims,
                                          activation=GELU())

        self.parser_lstm_layers = parser_config.lstm_layers
        self.synscale = ScalarMix(mixture_size=3 + parser_config.lstm_layers)

        self.predicate_embed = nn.Embedding(3,
                                            config.predict_dims,
                                            padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0,
                        1.0 / (config.predict_dims**0.5))

        self.lstm_input_dims = 2 * self.hidden_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens,
                                  vocab.label_size,
                                  bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0,
                        1.0 / ((2 * config.lstm_hiddens)**0.5))

        self.crf = CRF(vocab.label_size)
Example #5
    def __init__(self,
                 vocabs,
                 word_embed_file,
                 word_embed_dim,
                 char_embed_dim,
                 char_filters,
                 char_feat_dim,
                 lstm_hidden_size,
                 lstm_dropout=.5,
                 feat_dropout=.5,
                 parameters=None):
        super(LstmCnnFeatGate, self).__init__()
        assert word_embed_dim == char_feat_dim

        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])

        # input features
        if parameters is not None:
            self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                           parameters['word_embed_dim'])
        else:
            self.word_embed = load_embedding_from_file(word_embed_file,
                                                       word_embed_dim,
                                                       vocabs['token'],
                                                       vocabs['embed'],
                                                       vocabs['form'],
                                                       padding_idx=C.PAD_INDEX,
                                                       trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']),
                                    char_embed_dim,
                                    char_filters,
                                    output_size=char_feat_dim)
        # word_dim == char_dim == feat_dim in this model
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.word_dim
        # layers
        self.char_gate = Linear(self.char_dim, self.char_dim, bias=False)
        self.word_gate = Linear(self.word_dim, self.word_dim, bias=False)
        self.gate = Linear(self.feat_dim, self.feat_dim, bias=False)

        self.lstm = LSTM(input_size=self.feat_dim,
                         hidden_size=lstm_hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
Example #6
    def __init__(self, vocab, config, parser_config, elmo_shape):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.word_dims = config.word_dims
        self.elmo_layers = elmo_shape[0]
        self.elmo_dims = elmo_shape[1]

        weights = torch.randn(self.elmo_layers)
        self.weights = torch.nn.Parameter(weights, requires_grad=True)
        self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)

        self.transformer_emb = nn.Linear(parser_config.word_dims, self.word_dims, bias=False)

        parser_dim = 2 * parser_config.lstm_hiddens
        transformer_lstm = []
        for layer in range(parser_config.lstm_layers):
            transformer_lstm.append(nn.Linear(parser_dim, self.word_dims, bias=False))
        self.transformer_lstm = nn.ModuleList(transformer_lstm)

        parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
        self.transformer_dep = nn.Linear(parser_mlp_dim, self.word_dims, bias=False)
        self.transformer_head = nn.Linear(parser_mlp_dim, self.word_dims, bias=False)

        self.parser_lstm_layers = parser_config.lstm_layers
        self.synscale = ScalarMix(mixture_size=3+parser_config.lstm_layers)

        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))

        self.lstm_input_dims = 2 * config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))

        self.crf = CRF(vocab.label_size)
Example #7
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, parser_config, input_dims, bert_layers):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.input_dims = input_dims
        self.input_depth = bert_layers if config.bert_tune == 0 else 1
        self.hidden_dims = 2 * config.lstm_hiddens
        self.projections = nn.ModuleList([
            NonLinear(self.input_dims, self.hidden_dims, activation=GELU())
            for _ in range(self.input_depth)
        ])

        self.rescale = ScalarMix(mixture_size=self.input_depth)

        # needed by forward() below, which applies it to the parser's
        # embedding-level output; the input size parser_config.word_dims is
        # assumed here, mirroring the other transformer_* projections
        self.transformer_emb = NonLinear(parser_config.word_dims,
                                         self.hidden_dims,
                                         activation=GELU())

        parser_dim = 2 * parser_config.lstm_hiddens
        self.transformer_lstm = nn.ModuleList([
            NonLinear(parser_dim, self.hidden_dims, activation=GELU())
            for i in range(parser_config.lstm_layers)
        ])

        parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
        self.transformer_dep = NonLinear(parser_mlp_dim,
                                         self.hidden_dims,
                                         activation=GELU())
        self.transformer_head = NonLinear(parser_mlp_dim,
                                          self.hidden_dims,
                                          activation=GELU())

        self.parser_lstm_layers = parser_config.lstm_layers
        self.synscale = ScalarMix(mixture_size=3 + parser_config.lstm_layers)

        self.predicate_embed = nn.Embedding(3,
                                            config.predict_dims,
                                            padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0,
                        1.0 / (config.predict_dims**0.5))

        self.lstm_input_dims = 2 * self.hidden_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens,
                                  vocab.label_size,
                                  bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0,
                        1.0 / ((2 * config.lstm_hiddens)**0.5))

        self.crf = CRF(vocab.label_size)

    def forward(self, inputs, predicts, synxs, masks):
        # x = (batch size, sequence length, dimension of embedding)
        proj_hiddens = []
        for idx, input in enumerate(inputs):
            cur_hidden = self.projections[idx](input)
            proj_hiddens.append(cur_hidden)

        x_embed = self.rescale(proj_hiddens)

        syn_idx = 0
        x_syns = []
        x_syn_emb = self.transformer_emb(synxs[syn_idx])
        x_syns.append(x_syn_emb)
        syn_idx += 1

        for layer in range(self.parser_lstm_layers):
            x_syn_lstm = self.transformer_lstm[layer](synxs[syn_idx])
            syn_idx += 1
            x_syns.append(x_syn_lstm)

        x_syn_dep = self.transformer_dep(synxs[syn_idx])
        x_syns.append(x_syn_dep)
        syn_idx += 1

        x_syn_head = self.transformer_head(synxs[syn_idx])
        x_syns.append(x_syn_head)
        syn_idx += 1

        x_syn = self.synscale(x_syns)

        x_predict_embed = self.predicate_embed(predicts)

        if self.training:
            x_embed, x_syn, x_predict_embed = drop_tri_input_independent(
                x_embed, x_syn, x_predict_embed, self.config.dropout_emb)

        embeddings = torch.cat((x_embed, x_syn, x_predict_embed), dim=2)

        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)

        label_scores = self.outlayer(lstm_out)

        return label_scores

    def compute_loss(self, output, answer, masks):
        # output: [B, T, L], answer: [B, T], masks: [B, T]
        # print answer
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, masks)

        num_words = masks.float().sum()
        total_loss = total_loss / num_words

        return total_loss

    def decode(self, label_scores, masks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, masks)

        return tag_seq

    def save(self, filepath):
        """ Save model parameters to file.
        """
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """ Load model parameters from file.
        """
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
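
A note on the `ScalarMix` blending used above: its exact implementation is project-specific, but a minimal sketch in the spirit of an AllenNLP-style scalar mixture (softmax-normalized per-layer weights plus a global gamma; the class name and details below are assumptions, not the repository's code) looks like this:

import torch
import torch.nn as nn

class SimpleScalarMix(nn.Module):
    """Hypothetical stand-in for ScalarMix: softmax-weighted sum of
    same-shaped layer tensors, scaled by a learned gamma."""

    def __init__(self, mixture_size):
        super().__init__()
        self.scalars = nn.Parameter(torch.zeros(mixture_size))
        self.gamma = nn.Parameter(torch.ones(1))

    def forward(self, tensors):
        # tensors: list of [batch, seq, dim] tensors, one per layer/projection
        weights = torch.softmax(self.scalars, dim=0)
        mixed = sum(w * t for w, t in zip(weights, tensors))
        return self.gamma * mixed

# blend three hypothetical projected layers of shape [2, 5, 16]
layers = [torch.randn(2, 5, 16) for _ in range(3)]
mix = SimpleScalarMix(mixture_size=3)
out = mix(layers)  # [2, 5, 16]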
Example #8
class LstmCnn(nn.Module):
    def __init__(self,
                 vocabs,
                 word_embed_file,
                 word_embed_dim,
                 char_embed_dim,
                 char_filters,
                 char_feat_dim,
                 lstm_hidden_size,
                 lstm_dropout=0,
                 feat_dropout=0,
                 parameters=None):
        super(LstmCnn, self).__init__()

        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])
        # input features
        if parameters is not None:
            self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                           parameters['word_embed_dim'],
                                           padding_idx=C.PAD_INDEX)
        else:
            self.word_embed = load_embedding_from_file(word_embed_file,
                                                       word_embed_dim,
                                                       vocabs['token'],
                                                       vocabs['embed'],
                                                       vocabs['form'],
                                                       padding_idx=C.PAD_INDEX,
                                                       trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']),
                                    char_embed_dim,
                                    char_filters,
                                    output_size=char_feat_dim)
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim + self.word_dim
        # layers
        self.lstm = LSTM(input_size=self.feat_dim,
                         hidden_size=lstm_hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)

    @property
    def params(self):
        return {
            'word_embed_num': self.word_embed.num_embeddings,
            'word_embed_dim': self.word_embed.embedding_dim
        }

    def forward_nn(self, token_ids, char_ids, lens):
        batch_size, seq_len = token_ids.size()
        # word representation
        word_in = self.word_embed(token_ids)
        char_in = self.char_embed(char_ids)
        char_in = char_in.view(batch_size, seq_len, self.char_dim)
        feats = torch.cat([word_in, char_in], dim=2)
        feats = self.feat_dropout(feats)

        # LSTM layer
        lstm_in = R.pack_padded_sequence(feats,
                                         lens.tolist(),
                                         batch_first=True)
        lstm_out, _ = self.lstm(lstm_in)
        lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.lstm_dropout(lstm_out)

        # output linear layer
        linear_out = self.output_linear(lstm_out)
        return linear_out

    def forward(self, token_ids, char_ids, lens, labels):
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        norm_score = self.crf.calc_norm_score(logits, lens)
        gold_score = self.crf.calc_gold_score(logits, labels, lens)
        loglik = gold_score - norm_score

        return loglik, logits

    def predict(self, token_ids, char_ids, lens):
        self.eval()
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        _scores, preds = self.crf.viterbi_decode(logits, lens)
        preds = preds.data.tolist()
        self.train()
        return preds
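
The `forward_nn` above wraps the LSTM in a `pack_padded_sequence` / `pad_packed_sequence` round trip; a self-contained sketch of just that step, with a plain `nn.LSTM` standing in for the project's `LSTM` wrapper (an assumption), is:

import torch
import torch.nn as nn
import torch.nn.utils.rnn as R

lstm = nn.LSTM(input_size=8, hidden_size=16,
               batch_first=True, bidirectional=True)

feats = torch.randn(3, 6, 8)    # [batch, max_len, feat_dim]
lens = torch.tensor([6, 4, 2])  # true lengths, sorted descending

# pack so the LSTM skips padded positions entirely
lstm_in = R.pack_padded_sequence(feats, lens.tolist(), batch_first=True)
lstm_out, _ = lstm(lstm_in)
# unpack back to a padded [batch, max_len, 2 * hidden_size] tensor
lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
print(lstm_out.shape)  # torch.Size([3, 6, 32])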
Example #9
    def __init__(self,
                 vocabs,
                 counters,
                 word_embed_file,
                 word_embed_dim,
                 char_embed_dim,
                 char_filters,
                 char_feat_dim,
                 lstm_hidden_size,
                 lstm_dropout=0.5,
                 feat_dropout=0.5):
        # TODO: init function for saved model
        super(LstmCnnGate, self).__init__()

        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])

        # input features
        self.word_embed = load_embedding_from_file(word_embed_file,
                                                   word_embed_dim,
                                                   vocabs['token'],
                                                   vocabs['embed'],
                                                   vocabs['form'],
                                                   padding_idx=C.PAD_INDEX,
                                                   trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']),
                                    char_embed_dim,
                                    char_filters,
                                    output_size=char_feat_dim)
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim
        # layers
        self.lstm = LSTM(input_size=self.feat_dim,
                         hidden_size=lstm_hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
        self.lstm_size = self.lstm.output_size
        self.uni_lstm_size = self.lstm_size // 2

        # word representation level
        self.word_gate = Linear(self.word_dim, self.word_dim)
        self.char_gate = Linear(self.word_dim, self.word_dim)

        # feature extraction level
        # context-only feature linear layers
        self.cof_linear_fwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        self.cof_linear_bwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        # hidden states gates
        self.hs_gates = nn.ModuleList([
            Linear(self.uni_lstm_size, self.uni_lstm_size),
            Linear(self.uni_lstm_size, self.uni_lstm_size)
        ])
        # context-only feature gates
        self.cof_gates = nn.ModuleList([
            Linear(self.uni_lstm_size, self.uni_lstm_size),
            Linear(self.uni_lstm_size, self.uni_lstm_size)
        ])
Example #10
class LstmCnnGate(nn.Module):
    def __init__(self,
                 vocabs,
                 counters,
                 word_embed_file,
                 word_embed_dim,
                 char_embed_dim,
                 char_filters,
                 char_feat_dim,
                 lstm_hidden_size,
                 lstm_dropout=0.5,
                 feat_dropout=0.5):
        # TODO: init function for saved model
        super(LstmCnnGate, self).__init__()

        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])

        # input features
        self.word_embed = load_embedding_from_file(word_embed_file,
                                                   word_embed_dim,
                                                   vocabs['token'],
                                                   vocabs['embed'],
                                                   vocabs['form'],
                                                   padding_idx=C.PAD_INDEX,
                                                   trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']),
                                    char_embed_dim,
                                    char_filters,
                                    output_size=char_feat_dim)
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim
        # layers
        self.lstm = LSTM(input_size=self.feat_dim,
                         hidden_size=lstm_hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
        self.lstm_size = self.lstm.output_size
        self.uni_lstm_size = self.lstm_size // 2

        # word representation level
        self.word_gate = Linear(self.word_dim, self.word_dim)
        self.char_gate = Linear(self.word_dim, self.word_dim)

        # feature extraction level
        # context-only feature linear layers
        self.cof_linear_fwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        self.cof_linear_bwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        # hidden states gates
        self.hs_gates = nn.ModuleList([
            Linear(self.uni_lstm_size, self.uni_lstm_size),
            Linear(self.uni_lstm_size, self.uni_lstm_size)
        ])
        # context-only feature gates
        self.cof_gates = nn.ModuleList([
            Linear(self.uni_lstm_size, self.uni_lstm_size),
            Linear(self.uni_lstm_size, self.uni_lstm_size)
        ])

    def _repr_gate(self, word, char):
        gate_w = self.word_gate(word)
        gate_c = self.char_gate(char)
        gate = gate_w + gate_c
        gate = gate.sigmoid()
        return gate

    def _feat_gate(self, hs, cof, idx):
        """Calculate feature extraction level gates.
        :param hs: Hidden states.
        :param cof: Context-only features.
        """
        gate_h = self.hs_gates[idx](hs)
        gate_c = self.cof_gates[idx](cof)
        gate = gate_h + gate_c
        gate = gate.sigmoid()
        return gate

    def forward_nn(self, token_ids, char_ids, lens):
        batch_size, seq_len = token_ids.size()
        word_dim = self.word_dim
        char_dim = self.char_dim

        # word representations
        word_in = self.word_embed(token_ids)
        char_in = self.char_embed(char_ids)
        char_in = char_in.view(batch_size, seq_len, char_dim)
        # combine features
        repr_mix_gate = self._repr_gate(word_in, char_in)
        feats = repr_mix_gate * word_in + (1 - repr_mix_gate) * char_in
        feats = self.feat_dropout(feats)

        # LSTM layer
        lstm_in = R.pack_padded_sequence(feats,
                                         lens.tolist(),
                                         batch_first=True)
        lstm_out, _ = self.lstm(lstm_in)
        lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.lstm_dropout(lstm_out)

        # context-only features (cof)
        hs_pad = lstm_out.new_zeros([batch_size, 1, self.uni_lstm_size],
                                    requires_grad=False)
        hs_fwd = lstm_out[:, :, :self.uni_lstm_size]
        hs_bwd = lstm_out[:, :, self.uni_lstm_size:]
        hs_fwd_padded = torch.cat([hs_pad, hs_fwd], dim=1)[:, :-1, :]
        hs_bwd_padded = torch.cat([hs_bwd, hs_pad], dim=1)[:, 1:, :]
        cof_fwd = self.cof_linear_fwd(hs_fwd_padded).tanh()
        cof_bwd = self.cof_linear_bwd(hs_bwd_padded).tanh()

        # feature extraction level gates
        feat_mix_gate_fwd = self._feat_gate(hs_fwd, cof_fwd, 0)
        feat_mix_gate_bwd = self._feat_gate(hs_bwd, cof_bwd, 1)

        # enhanced hidden states
        hs_fwd_enh = feat_mix_gate_fwd * hs_fwd + (1 -
                                                   feat_mix_gate_fwd) * cof_fwd
        hs_bwd_enh = feat_mix_gate_bwd * hs_bwd + (1 -
                                                   feat_mix_gate_bwd) * cof_bwd
        hs_enh = torch.cat([hs_fwd_enh, hs_bwd_enh], dim=2)

        # output linear layer
        linear_out = self.output_linear(hs_enh)

        return linear_out

    def forward(self, token_ids, char_ids, lens, labels):
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        norm_score = self.crf.calc_norm_score(logits, lens)
        gold_score = self.crf.calc_gold_score(logits, labels, lens)
        loglik = gold_score - norm_score

        return loglik, logits

    def predict(self, token_ids, char_ids, lens):
        self.eval()

        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        _scores, preds = self.crf.viterbi_decode(logits, lens)
        preds = preds.data.tolist()

        self.train()
        return preds
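
The context-only features above are built by shifting the forward and backward hidden states one step, so position t only sees its neighbours' states; a tiny sketch of that padding-and-slicing trick with 1-dimensional stand-in hidden states:

import torch

# one sequence of length 4; hidden size 1 to make the shift easy to read
hs_fwd = torch.tensor([[[1.], [2.], [3.], [4.]]])  # [batch, seq, hidden]
hs_bwd = torch.tensor([[[5.], [6.], [7.], [8.]]])
hs_pad = torch.zeros(1, 1, 1)

# forward states shifted right: position t gets the forward state of t - 1
hs_fwd_padded = torch.cat([hs_pad, hs_fwd], dim=1)[:, :-1, :]
# backward states shifted left: position t gets the backward state of t + 1
hs_bwd_padded = torch.cat([hs_bwd, hs_pad], dim=1)[:, 1:, :]

print(hs_fwd_padded.squeeze(-1))  # tensor([[0., 1., 2., 3.]])
print(hs_bwd_padded.squeeze(-1))  # tensor([[6., 7., 8., 0.]])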
Example #11
class LstmCnnDfc(nn.Module):
    def __init__(
        self,
        vocabs,
        counters,
        word_embed_file,
        word_embed_dim,
        char_embed_dim,
        char_filters,
        char_feat_dim,
        lstm_hidden_size,
        lstm_dropout=0.5,
        feat_dropout=0.5,
        signal_dropout=0,
        ctx_size=5,
        use_signal=True,
        parameters=None,
    ):
        assert char_feat_dim >= word_embed_dim
        super(LstmCnnDfc, self).__init__()

        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])
        self.use_signal = use_signal

        # input features
        if parameters is not None:
            self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                           parameters['word_embed_dim'])
        else:
            self.word_embed = load_embedding_from_file(word_embed_file,
                                                       word_embed_dim,
                                                       vocabs['token'],
                                                       vocabs['embed'],
                                                       vocabs['form'],
                                                       padding_idx=C.PAD_INDEX,
                                                       trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']),
                                    char_embed_dim,
                                    char_filters,
                                    output_size=char_feat_dim)
        if use_signal:
            if parameters is not None:
                self.signal_embed = nn.Embedding(
                    parameters['signal_embed_num'],
                    parameters['signal_embed_dim'])
            else:
                self.signal_embed = build_signal_embed(counters['embed'],
                                                       counters['token'],
                                                       vocabs['token'],
                                                       vocabs['form'])
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim
        self.signal_dim = self.signal_embed.embedding_dim
        self.ctx_size = ctx_size
        # layers
        self.lstm = LSTM(input_size=self.feat_dim,
                         hidden_size=lstm_hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
        self.signal_dropout = nn.Dropout(p=signal_dropout)
        self.lstm_size = self.lstm.output_size
        self.uni_lstm_size = self.lstm_size // 2

        # word representation level
        self.word_gates = nn.ModuleList([
            Linear(self.word_dim, self.word_dim),
            Linear(self.word_dim, self.word_dim)
        ])
        self.char_gates = nn.ModuleList([
            Linear(self.word_dim, self.word_dim),
            Linear(self.word_dim, self.word_dim)
        ])
        if use_signal:
            self.signal_gates = nn.ModuleList([
                Linear(self.signal_dim, self.word_dim),
                Linear(self.signal_dim, self.word_dim)
            ])

        # feature extraction level
        # context-only feature linear layers
        self.cof_linear_fwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        self.cof_linear_bwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        # hidden states gates
        self.hs_gates = nn.ModuleList(
            [Linear(self.uni_lstm_size, self.uni_lstm_size) for _ in range(4)])
        # context-only feature gates
        self.cof_gates = nn.ModuleList(
            [Linear(self.uni_lstm_size, self.uni_lstm_size) for _ in range(4)])
        if use_signal:
            self.crs_gates = nn.ModuleList([
                Linear(self.signal_dim * (ctx_size + 1), self.uni_lstm_size)
                for _ in range(4)
            ])

    @property
    def params(self):
        return {
            'word_embed_num': self.word_embed.num_embeddings,
            'word_embed_dim': self.word_embed.embedding_dim,
            'signal_embed_num': self.signal_embed.num_embeddings,
            'signal_embed_dim': self.signal_embed.embedding_dim
        }

    def _repr_gate(self, word, char, signal=None, idx=0):
        gate_w = self.word_gates[idx](word)
        gate_c = self.char_gates[idx](char)
        if self.use_signal:
            gate_s = self.signal_gates[idx](self.signal_dropout(signal))
            gate = gate_w + gate_c + gate_s
        else:
            gate = gate_w + gate_c
        gate = gate.sigmoid()
        return gate

    def _feat_gate(self, hs, cof, crs=None, idx=0):
        """Calculate feature extraction level gates.
        :param hs: Hidden states.
        :param cof: Context-only features.
        :param crs: Context reliability signals.
        """
        gate_h = self.hs_gates[idx](hs)
        gate_c = self.cof_gates[idx](cof)
        if self.use_signal:
            gate_s = self.crs_gates[idx](self.signal_dropout(crs))
            gate = gate_h + gate_c + gate_s
        else:
            gate = gate_h + gate_c
        gate = gate.sigmoid()
        return gate

    def forward_nn(self, token_ids, char_ids, lens):
        batch_size, seq_len = token_ids.size()
        word_dim = self.word_dim
        char_dim = self.char_dim
        signal_dim = self.signal_dim
        use_signal = self.use_signal
        ctx_size = self.ctx_size

        # word representations
        word_in = self.word_embed(token_ids)
        char_in = self.char_embed(char_ids)
        char_in = char_in.view(batch_size, seq_len, char_dim)
        signal_in = self.signal_embed(token_ids) if use_signal else None
        # combine features
        if char_dim == word_dim:
            # without additional char features
            repr_mix_gate_1 = self._repr_gate(word_in, char_in, signal_in, 0)
            repr_mix_gate_2 = self._repr_gate(word_in, char_in, signal_in, 1)
            feats = repr_mix_gate_1 * word_in + repr_mix_gate_2 * char_in
        else:
            # with additional char features
            char_in_alt = char_in[:, :, :word_dim]
            char_in_cat = char_in[:, :, word_dim:]
            repr_mix_gate_1 = self._repr_gate(word_in, char_in_alt, signal_in,
                                              0)
            repr_mix_gate_2 = self._repr_gate(word_in, char_in_alt, signal_in,
                                              1)
            feats = repr_mix_gate_1 * word_in + repr_mix_gate_2 * char_in_alt
            feats = torch.cat([feats, char_in_cat], dim=2)
        feats = self.feat_dropout(feats)

        # LSTM layer
        lstm_in = R.pack_padded_sequence(feats,
                                         lens.tolist(),
                                         batch_first=True)
        lstm_out, _ = self.lstm(lstm_in)
        lstm_out, _ = R.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.lstm_dropout(lstm_out)

        # context reliability signals (crs)
        if use_signal:
            rs_pad = lstm_out.new_zeros([batch_size, ctx_size, signal_dim],
                                        requires_grad=False)
            signal_in_padded = torch.cat([rs_pad, signal_in, rs_pad], dim=1)
            signal_in_padded = signal_in_padded.view(batch_size, -1)
            crs = signal_in_padded.unfold(1, signal_dim * (ctx_size + 1),
                                          signal_dim)
            crs_fwd = crs[:, :-ctx_size, :]
            crs_bwd = crs[:, ctx_size:, :]
        else:
            crs_fwd = crs_bwd = None

        # context-only features (cof)
        hs_pad = lstm_out.new_zeros([batch_size, 1, self.uni_lstm_size],
                                    requires_grad=False)
        hs_fwd = lstm_out[:, :, :self.uni_lstm_size]
        hs_bwd = lstm_out[:, :, self.uni_lstm_size:]
        hs_fwd_padded = torch.cat([hs_pad, hs_fwd], dim=1)[:, :-1, :]
        hs_bwd_padded = torch.cat([hs_bwd, hs_pad], dim=1)[:, 1:, :]
        cof_fwd = self.cof_linear_fwd(hs_fwd_padded).tanh()
        cof_bwd = self.cof_linear_bwd(hs_bwd_padded).tanh()

        # feature extraction level gates
        feat_mix_gate_fwd_1 = self._feat_gate(hs_fwd, cof_fwd, crs_fwd, 0)
        feat_mix_gate_fwd_2 = self._feat_gate(hs_fwd, cof_fwd, crs_fwd, 1)
        feat_mix_gate_bwd_1 = self._feat_gate(hs_bwd, cof_bwd, crs_bwd, 2)
        feat_mix_gate_bwd_2 = self._feat_gate(hs_bwd, cof_bwd, crs_bwd, 3)

        # enhanced hidden states
        hs_fwd_enh = feat_mix_gate_fwd_1 * hs_fwd + feat_mix_gate_fwd_2 * cof_fwd
        hs_bwd_enh = feat_mix_gate_bwd_1 * hs_bwd + feat_mix_gate_bwd_2 * cof_bwd
        hs_enh = torch.cat([hs_fwd_enh, hs_bwd_enh], dim=2)

        # output linear layer
        linear_out = self.output_linear(hs_enh)

        return linear_out

    def forward(self, token_ids, char_ids, lens, labels):
        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        norm_score = self.crf.calc_norm_score(logits, lens)
        gold_score = self.crf.calc_gold_score(logits, labels, lens)
        loglik = gold_score - norm_score

        return loglik, logits

    def predict(self, token_ids, char_ids, lens):
        self.eval()

        logits = self.forward_nn(token_ids, char_ids, lens)
        logits = self.crf.pad_logits(logits)
        _scores, preds = self.crf.viterbi_decode(logits, lens)
        preds = preds.data.tolist()

        self.train()
        return preds
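
The context reliability signals above are collected with `Tensor.unfold`, which slides a window of `ctx_size + 1` positions over the padded signal sequence; a small sketch of that step in isolation:

import torch

batch, seq_len, signal_dim, ctx_size = 1, 4, 2, 2
signal_in = torch.arange(seq_len * signal_dim,
                         dtype=torch.float).view(batch, seq_len, signal_dim)
rs_pad = torch.zeros(batch, ctx_size, signal_dim)

# pad ctx_size steps on both sides, flatten, then slide a window of
# (ctx_size + 1) positions with a stride of one position
padded = torch.cat([rs_pad, signal_in, rs_pad], dim=1).view(batch, -1)
crs = padded.unfold(1, signal_dim * (ctx_size + 1), signal_dim)

print(crs.shape)  # torch.Size([1, 6, 6]): seq_len + ctx_size windows
crs_fwd = crs[:, :-ctx_size, :]  # the window ending at each position
crs_bwd = crs[:, ctx_size:, :]   # the window starting at each position
print(crs_fwd.shape, crs_bwd.shape)  # torch.Size([1, 4, 6]) twice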
Example #12
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, pretrained_embedding):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        extvocab_size, extword_dims = pretrained_embedding.shape
        self.word_dims = extword_dims
        if config.word_dims != extword_dims:
            print("word dim size does not match, check config file")
        self.word_embed = nn.Embedding(vocab.vocab_size, self.word_dims, padding_idx=vocab.PAD)
        if vocab.extvocab_size != extvocab_size:
            print("word vocab size does not match, check word embedding file")
        self.extword_embed = CPUEmbedding(vocab.extvocab_size, self.word_dims, padding_idx=vocab.PAD)

        word_init = np.zeros((vocab.vocab_size, self.word_dims), dtype=np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
        self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding))
        self.extword_embed.weight.requires_grad = False

        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))

        self.input_dims = config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))

        self.crf = CRF(vocab.label_size)

    def forward(self, words, extwords, predicts, masks):
        # x = (batch size, sequence length, dimension of embedding)
        x_word_embed = self.word_embed(words)
        x_extword_embed = self.extword_embed(extwords)
        x_embed = x_word_embed + x_extword_embed
        x_predict_embed = self.predicate_embed(predicts)

        if self.training:
            x_embed, x_predict_embed = drop_bi_input_independent(x_embed, x_predict_embed, self.config.dropout_emb)

        embeddings = torch.cat((x_embed, x_predict_embed), dim=2)

        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)

        label_scores = self.outlayer(lstm_out)

        return label_scores

    def compute_loss(self, output, answer, masks):
        # output: [B, T, L], answer: [B, T], masks: [B, T]
        # print answer
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, masks)

        num_words = masks.float().sum()
        total_loss = total_loss / num_words

        return total_loss

    def decode(self, label_scores, masks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, masks)

        return tag_seq

    def save(self, filepath):
        """ Save model parameters to file.
        """
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """ Load model parameters from file.
        """
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
Example #13
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, elmo_shape):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.word_dims = config.word_dims
        self.elmo_layers = elmo_shape[0]
        self.elmo_dims = elmo_shape[1]

        weights = torch.randn(self.elmo_layers)
        self.weights = torch.nn.Parameter(weights, requires_grad=True)
        self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)

        self.rel_embed = nn.Embedding(vocab.rel_size,
                                      self.word_dims,
                                      padding_idx=vocab.PAD)

        self.predicate_embed = nn.Embedding(3,
                                            config.predict_dims,
                                            padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0,
                        1.0 / (config.predict_dims**0.5))

        self.input_dims = 2 * config.word_dims + config.predict_dims

        self.dt_tree = DTTreeGRU(self.input_dims, config.lstm_hiddens)
        self.td_tree = TDTreeGRU(self.input_dims, config.lstm_hiddens)

        self.lstm_input_dims = 2 * config.lstm_hiddens

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens,
                                  vocab.label_size,
                                  bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0,
                        1.0 / ((2 * config.lstm_hiddens)**0.5))

        self.crf = CRF(vocab.label_size)

    def forward(self, elmos, predicts, masks, rels, heads, lengths):
        # x = (batch size, sequence length, dimension of embedding)
        elmos = elmos.permute(0, 2, 3, 1).matmul(self.weights)
        x_embed = self.mlp_elmo(elmos)
        x_rel_embed = self.rel_embed(rels)
        x_predict_embed = self.predicate_embed(predicts)

        if self.training:
            x_embed, x_rel_embed, x_predict_embed = drop_tri_input_independent(
                x_embed, x_rel_embed, x_predict_embed, self.config.dropout_emb)

        x_lexical = torch.cat((x_embed, x_rel_embed, x_predict_embed), dim=2)

        x_lexical = x_lexical.transpose(1, 0)

        max_length, batch_size, input_dim = x_lexical.size()

        trees = []
        indexes = np.zeros((max_length, batch_size), dtype=np.int32)
        for b, head in enumerate(heads):
            root, tree = creatTree(head)
            root.traverse()
            for step, index in enumerate(root.order):
                indexes[step, b] = index
            trees.append(tree)

        dt_outputs, dt_hidden_ts = self.dt_tree(x_lexical, indexes, trees,
                                                lengths)
        td_outputs, td_hidden_ts = self.td_tree(x_lexical, indexes, trees,
                                                lengths)

        tree_outputs = torch.cat([dt_outputs, td_outputs], dim=2)

        lstm_out, _ = self.bilstm(tree_outputs, masks)
        lstm_out = lstm_out.transpose(1, 0)

        label_scores = self.outlayer(lstm_out)

        return label_scores

    def compute_loss(self, output, answer, masks):
        # output: [B, T, L], answer: [B, T], masks: [B, T]
        # print answer
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, masks)

        num_words = masks.float().sum()
        total_loss = total_loss / num_words

        return total_loss

    def decode(self, label_scores, masks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        masks = masks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, masks)

        return tag_seq

    def save(self, filepath):
        """ Save model parameters to file.
        """
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """ Load model parameters from file.
        """
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
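
The first line of `forward` above collapses the ELMo layer axis against the learned `self.weights`; assuming `elmos` arrives as `[batch, elmo_layers, seq_len, elmo_dims]` (an assumption about the caller), the contraction works like this:

import torch

batch, elmo_layers, seq_len, elmo_dims = 2, 3, 5, 8
elmos = torch.randn(batch, elmo_layers, seq_len, elmo_dims)
weights = torch.randn(elmo_layers)  # stand-in for the learned self.weights

# move the layer axis last, then contract it with the weight vector
mixed = elmos.permute(0, 2, 3, 1).matmul(weights)
print(mixed.shape)  # torch.Size([2, 5, 8]) -> [batch, seq_len, elmo_dims]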
Example #14
    def __init__(self, vocab, config, parser_config, pretrained_embedding):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        extvocab_size, extword_dims = pretrained_embedding.shape
        self.word_dims = extword_dims
        if config.word_dims != extword_dims:
            print("word dim size does not match, check config file")
        self.word_embed = nn.Embedding(vocab.vocab_size,
                                       self.word_dims,
                                       padding_idx=vocab.PAD)
        if vocab.extvocab_size != extvocab_size:
            print("word vocab size does not match, check word embedding file")
        self.extword_embed = CPUEmbedding(vocab.extvocab_size,
                                          self.word_dims,
                                          padding_idx=vocab.PAD)

        word_init = np.zeros((vocab.vocab_size, self.word_dims),
                             dtype=np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
        self.extword_embed.weight.data.copy_(
            torch.from_numpy(pretrained_embedding))
        self.extword_embed.weight.requires_grad = False

        self.transformer_emb = nn.Linear(parser_config.word_dims,
                                         self.word_dims,
                                         bias=False)

        parser_dim = 2 * parser_config.lstm_hiddens
        transformer_lstm = []
        for layer in range(parser_config.lstm_layers):
            transformer_lstm.append(
                nn.Linear(parser_dim, self.word_dims, bias=False))
        self.transformer_lstm = nn.ModuleList(transformer_lstm)

        parser_mlp_dim = parser_config.mlp_arc_size + parser_config.mlp_rel_size
        self.transformer_dep = nn.Linear(parser_mlp_dim,
                                         self.word_dims,
                                         bias=False)
        self.transformer_head = nn.Linear(parser_mlp_dim,
                                          self.word_dims,
                                          bias=False)

        self.parser_lstm_layers = parser_config.lstm_layers
        self.synscale = ScalarMix(mixture_size=3 + parser_config.lstm_layers)

        self.predicate_embed = nn.Embedding(3,
                                            config.predict_dims,
                                            padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0,
                        1.0 / (config.predict_dims**0.5))

        self.input_dims = 2 * config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens,
                                  vocab.label_size,
                                  bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0,
                        1.0 / ((2 * config.lstm_hiddens)**0.5))

        self.crf = CRF(vocab.label_size)
Example #15
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, elmo_shape):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.word_dims = config.word_dims
        self.elmo_layers = elmo_shape[0]
        self.elmo_dims = elmo_shape[1]

        weights = torch.randn(self.elmo_layers)
        self.weights = torch.nn.Parameter(weights, requires_grad=True)
        self.mlp_elmo = nn.Linear(self.elmo_dims, self.word_dims, bias=False)

        self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
        word_init = np.random.randn(vocab.vocab_size, config.word_dims).astype(np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))

        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))

        self.lstm_input_dims = config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))

        self.crf = CRF(vocab.label_size)

    def forward(self, elmos, actions, predicts, masks, indices):
        # x = (batch size, sequence length, dimension of embedding)
        elmos = elmos.matmul(self.weights)
        x_elmo_embed = self.mlp_elmo(elmos)
        x_action_embed = self.word_embed(actions)
        x_embed = x_elmo_embed + x_action_embed

        x_predict_embed = self.predicate_embed(predicts)

        if self.training:
            x_embed, x_predict_embed = drop_bi_input_independent(x_embed, x_predict_embed, self.config.dropout_emb)

        embeddings = torch.cat((x_embed, x_predict_embed), dim=2)

        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)

        filtered = torch.gather(lstm_out, 1, indices)

        label_scores = self.outlayer(filtered)

        return label_scores

    def compute_loss(self, output, answer, wmasks):
        # output: [B, T, L], answer: [B, T], wmasks: [B, T]
        # print answer
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, wmasks)

        num_words = wmasks.float().sum()
        total_loss = total_loss / num_words

        return total_loss

    def decode(self, label_scores, wmasks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, wmasks)

        return tag_seq

    def save(self, filepath):
        """ Save model parameters to file.
        """
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """ Load model parameters from file.
        """
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))
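
The `indices` tensor handed to `torch.gather` in `forward` above has to match `lstm_out` in rank, i.e. be expanded over the hidden dimension; a minimal sketch of selecting a few time steps per sentence (the shape of `indices` is an assumption about how the caller builds it):

import torch

batch, seq_len, hidden = 2, 5, 4
lstm_out = torch.randn(batch, seq_len, hidden)

# keep time steps 1 and 3 of the first sentence, 0 and 2 of the second
positions = torch.tensor([[1, 3], [0, 2]])                # [batch, k]
indices = positions.unsqueeze(-1).expand(-1, -1, hidden)  # [batch, k, hidden]

filtered = torch.gather(lstm_out, 1, indices)
print(filtered.shape)                               # torch.Size([2, 2, 4])
print(torch.equal(filtered[0, 0], lstm_out[0, 1]))  # True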
Example #16
    def __init__(
        self,
        vocabs,
        counters,
        word_embed_file,
        word_embed_dim,
        char_embed_dim,
        char_filters,
        char_feat_dim,
        lstm_hidden_size,
        lstm_dropout=0.5,
        feat_dropout=0.5,
        signal_dropout=0,
        ctx_size=5,
        use_signal=True,
        parameters=None,
    ):
        assert char_feat_dim >= word_embed_dim
        super(LstmCnnDfc, self).__init__()

        self.vocabs = vocabs
        self.label_size = len(self.vocabs['label'])
        self.use_signal = use_signal

        # input features
        if parameters is not None:
            self.word_embed = nn.Embedding(parameters['word_embed_num'],
                                           parameters['word_embed_dim'])
        else:
            self.word_embed = load_embedding_from_file(word_embed_file,
                                                       word_embed_dim,
                                                       vocabs['token'],
                                                       vocabs['embed'],
                                                       vocabs['form'],
                                                       padding_idx=C.PAD_INDEX,
                                                       trainable=True)
        self.char_embed = CharCNNFF(len(vocabs['char']),
                                    char_embed_dim,
                                    char_filters,
                                    output_size=char_feat_dim)
        if use_signal:
            if parameters is not None:
                self.signal_embed = nn.Embedding(
                    parameters['signal_embed_num'],
                    parameters['signal_embed_dim'])
            else:
                self.signal_embed = build_signal_embed(counters['embed'],
                                                       counters['token'],
                                                       vocabs['token'],
                                                       vocabs['form'])
        self.word_dim = self.word_embed.embedding_dim
        self.char_dim = self.char_embed.output_size
        self.feat_dim = self.char_dim
        self.signal_dim = self.signal_embed.embedding_dim
        self.ctx_size = ctx_size
        # layers
        self.lstm = LSTM(input_size=self.feat_dim,
                         hidden_size=lstm_hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.output_linear = Linear(self.lstm.output_size, self.label_size)
        self.crf = CRF(vocabs['label'])
        self.feat_dropout = nn.Dropout(p=feat_dropout)
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)
        self.signal_dropout = nn.Dropout(p=signal_dropout)
        self.lstm_size = self.lstm.output_size
        self.uni_lstm_size = self.lstm_size // 2

        # word representation level
        self.word_gates = nn.ModuleList([
            Linear(self.word_dim, self.word_dim),
            Linear(self.word_dim, self.word_dim)
        ])
        self.char_gates = nn.ModuleList([
            Linear(self.word_dim, self.word_dim),
            Linear(self.word_dim, self.word_dim)
        ])
        if use_signal:
            self.signal_gates = nn.ModuleList([
                Linear(self.signal_dim, self.word_dim),
                Linear(self.signal_dim, self.word_dim)
            ])

        # feature extraction level
        # context-only feature linear layers
        self.cof_linear_fwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        self.cof_linear_bwd = Linear(self.uni_lstm_size, self.uni_lstm_size)
        # hidden states gates
        self.hs_gates = nn.ModuleList(
            [Linear(self.uni_lstm_size, self.uni_lstm_size) for _ in range(4)])
        # context-only feature gates
        self.cof_gates = nn.ModuleList(
            [Linear(self.uni_lstm_size, self.uni_lstm_size) for _ in range(4)])
        if use_signal:
            self.crs_gates = nn.ModuleList([
                Linear(self.signal_dim * (ctx_size + 1), self.uni_lstm_size)
                for _ in range(4)
            ])
Example #17
class BiLSTMModel(nn.Module):
    def __init__(self, vocab, config, input_dims, bert_layers):
        super(BiLSTMModel, self).__init__()
        self.config = config
        self.PAD = vocab.PAD
        self.input_dims = input_dims
        self.input_depth = bert_layers if config.bert_tune == 0 else 1
        self.hidden_dims = config.word_dims
        self.projections = nn.ModuleList([
            NonLinear(self.input_dims, self.hidden_dims, activation=GELU())
            for _ in range(self.input_depth)
        ])

        self.rescale = ScalarMix(mixture_size=self.input_depth)

        self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
        word_init = np.random.randn(vocab.vocab_size, config.word_dims).astype(np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))

        self.predicate_embed = nn.Embedding(3, config.predict_dims, padding_idx=0)
        nn.init.normal_(self.predicate_embed.weight, 0.0, 1.0 / (config.predict_dims ** 0.5))

        self.lstm_input_dims = config.word_dims + config.predict_dims

        self.bilstm = MyLSTM(
            input_size=self.lstm_input_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.outlayer = nn.Linear(2 * config.lstm_hiddens, vocab.label_size, bias=False)
        nn.init.normal_(self.outlayer.weight, 0.0, 1.0 / ((2 * config.lstm_hiddens) ** 0.5))

        self.crf = CRF(vocab.label_size)

    def forward(self, inputs, actions, predicts, masks, indices):
        # x = (batch size, sequence length, dimension of embedding)
        proj_hiddens = []
        for idx, input in enumerate(inputs):
            cur_hidden = self.projections[idx](input)
            proj_hiddens.append(cur_hidden)

        x_bert_embed = self.rescale(proj_hiddens)
        x_action_embed = self.word_embed(actions)
        x_embed = x_bert_embed + x_action_embed

        x_predict_embed = self.predicate_embed(predicts)

        if self.training:
            x_embed, x_predict_embed = drop_bi_input_independent(x_embed, x_predict_embed, self.config.dropout_emb)

        embeddings = torch.cat((x_embed, x_predict_embed), dim=2)

        lstm_out, _ = self.bilstm(embeddings, masks)
        lstm_out = lstm_out.transpose(1, 0)

        filtered = torch.gather(lstm_out, 1, indices)

        label_scores = self.outlayer(filtered)

        return label_scores

    def compute_loss(self, output, answer, wmasks):
        # output: [B, T, L], answer: [B, T], wmasks: [B, T]
        # print answer
        output = output.transpose(1, 0).contiguous()
        answer = answer.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        total_loss = self.crf(output, answer, wmasks)

        num_words = wmasks.float().sum()
        total_loss = total_loss / num_words

        return total_loss

    def decode(self, label_scores, wmasks):
        label_scores = label_scores.transpose(1, 0).contiguous()
        wmasks = wmasks.transpose(1, 0).contiguous()
        tag_seq = self.crf.decode(label_scores, wmasks)

        return tag_seq

    def save(self, filepath):
        """ Save model parameters to file.
        """
        torch.save(self.state_dict(), filepath)
        print('Saved model to: {}'.format(filepath))

    def load(self, filepath):
        """ Load model parameters from file.
        """
        self.load_state_dict(torch.load(filepath))
        print('Loaded model from: {}'.format(filepath))