Example No. 1
    def __init__(self, data):
        super(BiLSTM, self).__init__()
        print "build batched bilstm..."
        self.gpu = data.HP_gpu
        self.use_char = data.HP_use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.average_batch = data.HP_average_batch_loss
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_features == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
            elif data.char_features == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout, self.gpu)
            else:
                print "Error char feature selection, please check parameter data.char_features (either CNN or LSTM)."
                exit(0)
        self.embedding_dim = data.word_emb_dim
        self.hidden_dim = data.HP_hidden_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.droplstm = nn.Dropout(data.HP_dropout)
        self.word_embeddings = nn.Embedding(data.word_alphabet.size(),
                                            self.embedding_dim)
        self.bilstm_flag = data.HP_bilstm
        self.lstm_layer = data.HP_lstm_layer
        if data.pretrain_word_embedding is not None:
            self.word_embeddings.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embeddings.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        if self.bilstm_flag:
            lstm_hidden = data.HP_hidden_dim // 2
        else:
            lstm_hidden = data.HP_hidden_dim
        self.lstm = nn.LSTM(self.embedding_dim + self.char_hidden_dim,
                            lstm_hidden,
                            num_layers=self.lstm_layer,
                            batch_first=True,
                            bidirectional=self.bilstm_flag)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(self.hidden_dim, data.label_alphabet_size)

        if self.gpu:
            self.drop = self.drop.cuda()
            self.droplstm = self.droplstm.cuda()
            self.word_embeddings = self.word_embeddings.cuda()
            self.lstm = self.lstm.cuda()
            self.hidden2tag = self.hidden2tag.cuda()
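
Example No. 1 calls self.random_embedding without showing it; Examples No. 6 and No. 11 later in this listing define the helper, and presumably the same one is used here: each row is drawn uniformly from [-sqrt(3/dim), sqrt(3/dim)].

    def random_embedding(self, vocab_size, embedding_dim):
        # Fill a (vocab_size, embedding_dim) matrix with small uniform values.
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb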
Example No. 2
    def __init__(self,
                 vocab_size,
                 word_embed_dim,
                 word_hidden_dim,
                 alphabet_size,
                 char_embedding_dim,
                 char_hidden_dim,
                 feature_extractor,
                 tag_num,
                 dropout,
                 pretrain_embed=None,
                 use_char=False,
                 use_crf=False,
                 use_gpu=False):
        super(NamedEntityRecog, self).__init__()
        self.use_crf = use_crf
        self.use_char = use_char
        self.drop = nn.Dropout(dropout)
        self.input_dim = word_embed_dim
        self.feature_extractor = feature_extractor

        self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
        if pretrain_embed is not None:
            self.embeds.weight.data.copy_(torch.from_numpy(pretrain_embed))
        else:
            self.embeds.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(vocab_size, word_embed_dim)))

        if self.use_char:
            self.input_dim += char_hidden_dim
            self.char_feature = CharCNN(alphabet_size, char_embedding_dim,
                                        char_hidden_dim, dropout)

        if feature_extractor == 'lstm':
            self.lstm = nn.LSTM(self.input_dim,
                                word_hidden_dim,
                                batch_first=True,
                                bidirectional=True)
        else:
            self.word2cnn = nn.Linear(self.input_dim, word_hidden_dim * 2)
            self.cnn_list = list()
            for _ in range(4):
                self.cnn_list.append(
                    nn.Conv1d(word_hidden_dim * 2,
                              word_hidden_dim * 2,
                              kernel_size=3,
                              padding=1))
                self.cnn_list.append(nn.ReLU())
                self.cnn_list.append(nn.Dropout(dropout))
                self.cnn_list.append(nn.BatchNorm1d(word_hidden_dim * 2))
            self.cnn = nn.Sequential(*self.cnn_list)

        if self.use_crf:
            self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num + 2)
            self.crf = CRF(tag_num, use_gpu)
        else:
            self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num)
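
A minimal instantiation sketch for Example No. 2's constructor. The hyperparameter values below are placeholders (not taken from the original project), and char features and the CRF are switched off so the sketch depends only on torch and numpy rather than the project's CharCNN/CRF classes.

    import numpy as np
    import torch

    vocab_size, word_embed_dim = 20000, 100
    model = NamedEntityRecog(vocab_size=vocab_size,
                             word_embed_dim=word_embed_dim,
                             word_hidden_dim=200,
                             alphabet_size=80,
                             char_embedding_dim=30,
                             char_hidden_dim=50,
                             feature_extractor='lstm',
                             tag_num=9,
                             dropout=0.5,
                             # pass a pretrained matrix so the (unshown) random_embedding
                             # fallback is never reached
                             pretrain_embed=np.random.uniform(
                                 -0.1, 0.1, (vocab_size, word_embed_dim)).astype(np.float32),
                             use_char=False,
                             use_crf=False,
                             use_gpu=torch.cuda.is_available())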
Example No. 3
    def __init__(self, args):
        super().__init__()
        L, R, lb = 7, 10, 0.0
        self.wtoi, self.wvec = args.wtoi, args.wvec

        qws = list(set(args.qws))
        self.w2l, self.w2r = get_lr(self.wtoi.keys(), qws, L, R, lb)

        self.cdim = 100
        self.charcnn = CharCNN(self.cdim)
        self.lW = nn.Parameter(
            torch.zeros(self.cdim, self.cdim).uniform_(-0.1, 0.1))
        self.rW = nn.Parameter(
            torch.zeros(self.cdim, self.cdim).uniform_(-0.1, 0.1))
        self.gL = nn.Linear(L + R, 2)
Example No. 4
    def __init__(self, args, model_params):
        super(WordEmbedding, self).__init__()
        self.gpu = args.ifgpu
        self.use_char = args.useChar
        self.char_hidden_dim = args.char_hidden_dim
        self.char_embedding_dim = args.char_embedding_dim
        self.embedding_dim = model_params.embedding_dim
        self.drop = nn.Dropout(args.dropout)
        #char Embedding
        if self.use_char:
            if args.charExtractor == "CNN":
                self.char_feature = CharCNN(
                    model_params.char_alphabet.size(),
                    model_params.pretrain_char_embedding,
                    self.char_embedding_dim, self.char_hidden_dim,
                    args.dropout, self.gpu)
            elif args.charExtractor == "LSTM":
                self.char_feature = CharBiLSTM(
                    model_params.char_alphabet.size(),
                    model_params.pretrain_char_embedding,
                    self.char_embedding_dim, self.char_hidden_dim,
                    args.dropout, self.gpu)
            else:
                print(
                    "Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM)."
                )
                exit(0)

        # word Embedding
        self.word_embedding = nn.Embedding(model_params.word_alphabet.size(),
                                           self.embedding_dim)
        if model_params.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(model_params.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(model_params.word_alphabet.size(),
                                          self.embedding_dim)))
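
A hypothetical driver for Example No. 4's constructor, using plain namespaces to make explicit which fields of args and model_params it reads. All names and values below are placeholders, and char features are disabled so the project's CharCNN/CharBiLSTM classes are not needed.

    import numpy as np
    from types import SimpleNamespace

    class DummyAlphabet:
        """Stand-in for the project's alphabet object; only .size() is used here."""
        def __init__(self, n):
            self.n = n

        def size(self):
            return self.n

    args = SimpleNamespace(ifgpu=False, useChar=False, charExtractor="CNN",
                           char_hidden_dim=50, char_embedding_dim=30, dropout=0.5)
    model_params = SimpleNamespace(
        embedding_dim=100,
        word_alphabet=DummyAlphabet(20000),
        char_alphabet=DummyAlphabet(80),
        pretrain_char_embedding=None,
        # a (vocab, dim) numpy array; passing None would require the
        # random_embedding helper, which Example No. 4 does not show
        pretrain_word_embedding=np.random.uniform(
            -0.1, 0.1, (20000, 100)).astype(np.float32))
    embedder = WordEmbedding(args, model_params)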
Example No. 5
    def __init__(self, data, use_position, use_cap, use_postag, use_char):
        super(WordRep, self).__init__()
        print "build word representation..."
        self.gpu = data.HP_gpu
        self.use_char = use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_feature_extractor == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               data.pretrain_char_embedding,
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                              data.pretrain_char_embedding,
                                              self.char_embedding_dim,
                                              self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
                self.char_feature_extra = CharBiLSTM(
                    data.char_alphabet.size(), data.pretrain_char_embedding,
                    self.char_embedding_dim, self.char_hidden_dim,
                    data.HP_dropout, self.gpu)
            else:
                print "Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL)."
                exit(0)
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        self.feature_num = 0
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()

        if use_cap:
            self.feature_num += 1
            alphabet_id = data.feature_name2id['[Cap]']
            emb = nn.Embedding(data.feature_alphabets[alphabet_id].size(),
                               self.feature_embedding_dims[alphabet_id])
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[alphabet_id].size(),
                        self.feature_embedding_dims[alphabet_id])))
            self.feature_embeddings.append(emb)

        if use_postag:
            self.feature_num += 1
            alphabet_id = data.feature_name2id['[POS]']
            emb = nn.Embedding(data.feature_alphabets[alphabet_id].size(),
                               self.feature_embedding_dims[alphabet_id])
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[alphabet_id].size(),
                        self.feature_embedding_dims[alphabet_id])))
            self.feature_embeddings.append(emb)

        self.use_position = use_position
        if self.use_position:

            position_alphabet_id = data.re_feature_name2id['[POSITION]']
            self.position_embedding_dim = data.re_feature_emb_dims[
                position_alphabet_id]
            self.position1_emb = nn.Embedding(
                data.re_feature_alphabet_sizes[position_alphabet_id],
                self.position_embedding_dim, data.pad_idx)
            self.position1_emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.re_feature_alphabet_sizes[position_alphabet_id],
                        self.position_embedding_dim)))

            self.position2_emb = nn.Embedding(
                data.re_feature_alphabet_sizes[position_alphabet_id],
                self.position_embedding_dim, data.pad_idx)
            self.position2_emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.re_feature_alphabet_sizes[position_alphabet_id],
                        self.position_embedding_dim)))

        if torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda(self.gpu)
            if self.use_position:
                self.position1_emb = self.position1_emb.cuda(self.gpu)
                self.position2_emb = self.position2_emb.cuda(self.gpu)
Example No. 6
class WordRep(nn.Module):
    def __init__(self, data, use_position, use_cap, use_postag, use_char):
        super(WordRep, self).__init__()
        print "build word representation..."
        self.gpu = data.HP_gpu
        self.use_char = use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_feature_extractor == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               data.pretrain_char_embedding,
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                              data.pretrain_char_embedding,
                                              self.char_embedding_dim,
                                              self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
                self.char_feature_extra = CharBiLSTM(
                    data.char_alphabet.size(), data.pretrain_char_embedding,
                    self.char_embedding_dim, self.char_hidden_dim,
                    data.HP_dropout, self.gpu)
            else:
                print "Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL)."
                exit(0)
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        self.feature_num = 0
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()

        if use_cap:
            self.feature_num += 1
            alphabet_id = data.feature_name2id['[Cap]']
            emb = nn.Embedding(data.feature_alphabets[alphabet_id].size(),
                               self.feature_embedding_dims[alphabet_id])
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[alphabet_id].size(),
                        self.feature_embedding_dims[alphabet_id])))
            self.feature_embeddings.append(emb)

        if use_postag:
            self.feature_num += 1
            alphabet_id = data.feature_name2id['[POS]']
            emb = nn.Embedding(data.feature_alphabets[alphabet_id].size(),
                               self.feature_embedding_dims[alphabet_id])
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[alphabet_id].size(),
                        self.feature_embedding_dims[alphabet_id])))
            self.feature_embeddings.append(emb)

        self.use_position = use_position
        if self.use_position:

            position_alphabet_id = data.re_feature_name2id['[POSITION]']
            self.position_embedding_dim = data.re_feature_emb_dims[
                position_alphabet_id]
            self.position1_emb = nn.Embedding(
                data.re_feature_alphabet_sizes[position_alphabet_id],
                self.position_embedding_dim, data.pad_idx)
            self.position1_emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.re_feature_alphabet_sizes[position_alphabet_id],
                        self.position_embedding_dim)))

            self.position2_emb = nn.Embedding(
                data.re_feature_alphabet_sizes[position_alphabet_id],
                self.position_embedding_dim, data.pad_idx)
            self.position2_emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.re_feature_alphabet_sizes[position_alphabet_id],
                        self.position_embedding_dim)))

        if torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda(self.gpu)
            if self.use_position:
                self.position1_emb = self.position1_emb.cuda(self.gpu)
                self.position2_emb = self.position2_emb.cuda(self.gpu)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, feature_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover,
                position1_inputs, position2_inputs):
        """
            input:
                word_inputs: (batch_size, sent_len)
                features: list [(batch_size, sent_len), (batch_size, sent_len), ...]
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output: 
                Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]
        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))

        if self.use_char:
            ## calculate char lstm last hidden
            char_features = self.char_feature.get_last_hiddens(
                char_inputs,
                char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
            ## concat word and char together
            word_list.append(char_features)
            word_embs = torch.cat([word_embs, char_features], 2)
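            # note: this intermediate concatenation is overwritten by
            # torch.cat(word_list, 2) below and has no effect on the output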
            if self.char_all_feature:
                char_features_extra = self.char_feature_extra.get_last_hiddens(
                    char_inputs,
                    char_seq_lengths.cpu().numpy())
                char_features_extra = char_features_extra[char_seq_recover]
                char_features_extra = char_features_extra.view(
                    batch_size, sent_len, -1)
                ## concat word and char together
                word_list.append(char_features_extra)

        if self.use_position:
            position1_feature = self.position1_emb(position1_inputs)
            position2_feature = self.position2_emb(position2_inputs)
            word_list.append(position1_feature)
            word_list.append(position2_feature)

        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent
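
The width of the word_represent tensor returned by Example No. 6's forward is the sum of everything concatenated into word_list. A hypothetical helper (not part of the original class, and assuming the char extractors emit vectors of width char_hidden_dim, as their constructor arguments suggest) could compute it as follows:

    def output_dim(self):
        # width of word_represent = sum of the concatenated parts
        dim = self.embedding_dim                                           # word embedding
        dim += sum(emb.embedding_dim for emb in self.feature_embeddings)   # [Cap] / [POS] features
        if self.use_char:
            dim += self.char_hidden_dim                                    # CharCNN features
            if self.char_all_feature:
                dim += self.char_hidden_dim                                # extra CharBiLSTM features
        if self.use_position:
            dim += 2 * self.position_embedding_dim                         # position1 + position2
        return dim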
Example No. 7
    def __init__(self, data):
        super(WordRep, self).__init__()
        print "build word representation..."
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_seq_feature == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
            elif data.char_seq_feature == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout, self.gpu)
            elif data.char_seq_feature == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                              self.char_embedding_dim,
                                              self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_seq_feature == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
                self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(),
                                                     self.char_embedding_dim,
                                                     self.char_hidden_dim,
                                                     data.HP_dropout, self.gpu)
            else:
                print "Error char feature selection, please check parameter data.char_seq_feature (CNN/LSTM/GRU/ALL)."
                exit(0)
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        self.feature_num = data.feature_num
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        for idx in range(self.feature_num):
            self.feature_embeddings.append(
                nn.Embedding(data.feature_alphabets[idx].size(),
                             self.feature_embedding_dims[idx]))
        for idx in range(self.feature_num):
            if data.pretrain_feature_embeddings[idx] is not None:
                self.feature_embeddings[idx].weight.data.copy_(
                    torch.from_numpy(data.pretrain_feature_embeddings[idx]))
            else:
                self.feature_embeddings[idx].weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))

        if self.gpu:
            self.drop = self.drop.cuda()
            self.word_embedding = self.word_embedding.cuda()
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda()
Example No. 8
    def __init__(self, data, opt):
        super(WordRep, self).__init__()

        self.gpu = opt.gpu
        self.batch_size = opt.batch_size

        self.use_elmo = False
        if opt.elmo:
            logging.info("use elmo, loading ...")
            self.use_elmo = True
            self.elmo = Embedder(data.config['elmo_path'])
            # project the ELMo representation to the same dimension as the char hidden features
            self.elmo_projection = nn.Linear(
                self.elmo.config['encoder']['projection_dim'] * 2,
                opt.char_hidden_dim, False)
            self.elmo_drop = nn.Dropout(opt.dropout)
        else:
            self.char_hidden_dim = opt.char_hidden_dim
            self.char_embedding_dim = opt.char_emb_dim
            self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, opt.dropout,
                                        self.gpu)

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(opt.dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        if data.feat_config is not None:
            self.feature_num = len(data.feature_alphabets)
            self.feature_embedding_dims = data.feature_emb_dims
            self.feature_embeddings = nn.ModuleList()
            for idx in range(self.feature_num):
                emb = nn.Embedding(data.feature_alphabets[idx].size(),
                                   self.feature_embedding_dims[idx])
                emb.weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))
                self.feature_embeddings.append(emb)
        else:
            self.feature_num = 0

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            if data.feat_config is not None:
                for idx in range(self.feature_num):
                    self.feature_embeddings[idx] = self.feature_embeddings[
                        idx].cuda(self.gpu)

            if opt.elmo:
                self.elmo_projection = self.elmo_projection.cuda(self.gpu)
                self.elmo_drop = self.elmo_drop.cuda(self.gpu)
Example No. 9
class WordRep(nn.Module):
    def __init__(self, data, opt):
        super(WordRep, self).__init__()

        self.gpu = opt.gpu
        self.batch_size = opt.batch_size

        self.use_elmo = False
        if opt.elmo:
            logging.info("use elmo, loading ...")
            self.use_elmo = True
            self.elmo = Embedder(data.config['elmo_path'])
            # project the ELMo representation to the same dimension as the char hidden features
            self.elmo_projection = nn.Linear(
                self.elmo.config['encoder']['projection_dim'] * 2,
                opt.char_hidden_dim, False)
            self.elmo_drop = nn.Dropout(opt.dropout)
        else:
            self.char_hidden_dim = opt.char_hidden_dim
            self.char_embedding_dim = opt.char_emb_dim
            self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, opt.dropout,
                                        self.gpu)

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(opt.dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        if data.feat_config is not None:
            self.feature_num = len(data.feature_alphabets)
            self.feature_embedding_dims = data.feature_emb_dims
            self.feature_embeddings = nn.ModuleList()
            for idx in range(self.feature_num):
                emb = nn.Embedding(data.feature_alphabets[idx].size(),
                                   self.feature_embedding_dims[idx])
                emb.weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))
                self.feature_embeddings.append(emb)
        else:
            self.feature_num = 0

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            if data.feat_config is not None:
                for idx in range(self.feature_num):
                    self.feature_embeddings[idx] = self.feature_embeddings[
                        idx].cuda(self.gpu)

            if opt.elmo:
                self.elmo_projection = self.elmo_projection.cuda(self.gpu)
                self.elmo_drop = self.elmo_drop.cuda(self.gpu)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, word_seq_lengths, char_inputs,
                char_seq_lengths, char_seq_recover, feature_inputs,
                text_inputs):
        """
            input:
                word_inputs: (batch_size, sent_len)
                features: list [(batch_size, sent_len), (batch_size, sent_len), ...]
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output: 
                Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]
        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))

        if self.use_elmo:
            with torch.no_grad():
                elmo_rep = torch.from_numpy(
                    np.array(self.elmo.sents2elmo(
                        text_inputs)))  # batch, seq_len, 1024
                if self.gpu >= 0 and torch.cuda.is_available():
                    elmo_rep = elmo_rep.cuda(self.gpu)

            char_features = self.elmo_drop(self.elmo_projection(elmo_rep))
            # char_features = elmo_rep

        else:

            char_features = self.char_feature.get_last_hiddens(
                char_inputs,
                char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)

        word_list.append(char_features)

        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent
Example No. 10
    def __init__(self, data):
        super(WordRep, self).__init__()
        print "Build word representation..."
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        self.use_trans = data.use_trans
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        self.use_mapping = data.use_mapping
        self.mapping_func = data.mapping_func

        # character-level
        if self.use_trans:
            if self.use_mapping:
                # linear mapping
                self.w = nn.Linear(data.word_emb_dim, data.HP_trans_hidden_dim)
                # non-linear mapping:w + tanh or w + sigmoid
                if self.mapping_func:
                    if self.mapping_func == "tanh":
                        self.non_linear = nn.Tanh()
                    elif self.mapping_func == "sigmoid":
                        self.non_linear = nn.Sigmoid()
            self.trans_hidden_dim = data.HP_trans_hidden_dim
            self.trans_embedding_dim = data.trans_emb_dim
            self.trans_feature = TransBiLSTM(data.translation_alphabet.size(), self.trans_embedding_dim,
                                             self.trans_hidden_dim,
                                             data.HP_dropout, data.pretrain_trans_embedding, self.gpu)

        # word-level
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_seq_feature == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                            data.HP_dropout, data.pretrain_char_embedding, self.gpu)
            elif data.char_seq_feature == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                               data.HP_dropout, data.pretrain_char_embedding, self.gpu)
            elif data.char_seq_feature == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_seq_feature == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                            data.HP_dropout, data.pretrain_char_embedding, self.gpu)
                self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(), self.char_embedding_dim,
                                                     self.char_hidden_dim, data.HP_dropout, self.gpu)
            else:
                print "Error char feature selection, please check parameter data.char_seq_feature (CNN/LSTM/GRU/ALL)."
                exit(0)

        # Word embedding
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(self.random_embedding(data.word_alphabet.size(), self.embedding_dim)))

        # not used
        self.feature_num = data.feature_num
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        for idx in range(self.feature_num):
            self.feature_embeddings.append(
                nn.Embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx]))
        for idx in range(self.feature_num):
            if data.pretrain_feature_embeddings[idx] is not None:
                self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(data.pretrain_feature_embeddings[idx]))
            else:
                self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(
                    self.random_embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx])))

        if self.gpu:
            self.drop = self.drop.cuda()
            self.word_embedding = self.word_embedding.cuda()
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda()
Example No. 11
class WordRep(nn.Module):
    def __init__(self, data):
        super(WordRep, self).__init__()
        print "Build word representation..."
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        self.use_trans = data.use_trans
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        self.use_mapping = data.use_mapping
        self.mapping_func = data.mapping_func

        # character-level
        if self.use_trans:
            if self.use_mapping:
                # linear mapping
                self.w = nn.Linear(data.word_emb_dim, data.HP_trans_hidden_dim)
                # non-linear mapping:w + tanh or w + sigmoid
                if self.mapping_func:
                    if self.mapping_func == "tanh":
                        self.non_linear = nn.Tanh()
                    elif self.mapping_func == "sigmoid":
                        self.non_linear = nn.Sigmoid()
            self.trans_hidden_dim = data.HP_trans_hidden_dim
            self.trans_embedding_dim = data.trans_emb_dim
            self.trans_feature = TransBiLSTM(data.translation_alphabet.size(), self.trans_embedding_dim,
                                             self.trans_hidden_dim,
                                             data.HP_dropout, data.pretrain_trans_embedding, self.gpu)

        # word-level
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_seq_feature == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                            data.HP_dropout, data.pretrain_char_embedding, self.gpu)
            elif data.char_seq_feature == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                               data.HP_dropout, data.pretrain_char_embedding, self.gpu)
            elif data.char_seq_feature == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_seq_feature == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(), self.char_embedding_dim, self.char_hidden_dim,
                                            data.HP_dropout, data.pretrain_char_embedding, self.gpu)
                self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(), self.char_embedding_dim,
                                                     self.char_hidden_dim, data.HP_dropout, self.gpu)
            else:
                print "Error char feature selection, please check parameter data.char_seq_feature (CNN/LSTM/GRU/ALL)."
                exit(0)

        # Word embedding
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(self.random_embedding(data.word_alphabet.size(), self.embedding_dim)))

        # not used
        self.feature_num = data.feature_num
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        for idx in range(self.feature_num):
            self.feature_embeddings.append(
                nn.Embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx]))
        for idx in range(self.feature_num):
            if data.pretrain_feature_embeddings[idx] is not None:
                self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(data.pretrain_feature_embeddings[idx]))
            else:
                self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(
                    self.random_embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx])))

        if self.gpu:
            self.drop = self.drop.cuda()
            self.word_embedding = self.word_embedding.cuda()
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda()

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover,
                trans_inputs, trans_seq_length, trans_seq_recover):
        """
            input:
                word_inputs: (batch_size, sent_len)
                features: list [(batch_size, sent_len), (batch_size, sent_len), ...]
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output: 
                Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]

        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))

        if self.use_char:
            # calculate char lstm last hidden
            char_features, _ = self.char_feature.get_last_hiddens(char_inputs, char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
            word_list.append(char_features)
            if self.char_all_feature:
                char_features_extra, _ = self.char_feature_extra.get_last_hiddens(char_inputs,
                                                                                  char_seq_lengths.cpu().numpy())
                char_features_extra = char_features_extra[char_seq_recover]
                char_features_extra = char_features_extra.view(batch_size, sent_len, -1)
                # concat word and char together
                word_list.append(char_features_extra)

        trans_features_wc_temp = None
        word_embed_mapping = None
        word_embs_temp = word_embs.view(batch_size * sent_len, -1)
        if self.use_trans:
            trans_features, trans_rnn_length = self.trans_feature.get_last_hiddens(trans_inputs,
                                                                                   trans_seq_length.cpu().numpy())

            if self.use_mapping:
                trans_features_wc = trans_features
                if self.gpu:
                    trans_features_wc.cuda()

                trans_features_wc = trans_features_wc[trans_seq_recover]
                trans_inputs = trans_inputs[trans_seq_recover]

                for index, line in enumerate(trans_inputs):
                    if line[0].data.cpu().numpy()[0] == 0:
                        if self.mapping_func:
                            trans_features_wc[index] = self.non_linear(self.w(word_embs_temp[index]))
                        else:
                            trans_features_wc[index] = self.w(word_embs_temp[index])

                trans_features_wc_temp = trans_features_wc
                trans_features_wc = trans_features_wc.view(batch_size, sent_len, -1)
                word_list.append(trans_features_wc)
                if self.mapping_func:
                    word_embed_mapping = self.non_linear(self.w(word_embs_temp))
                else:
                    word_embed_mapping = self.w(word_embs_temp)

            else:
                trans_features = trans_features[trans_seq_recover]
                trans_features = trans_features.view(batch_size, sent_len, -1)
                word_list.append(trans_features)

        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent, word_embed_mapping, trans_features_wc_temp
Example No. 12
    def __init__(self, data, use_position, use_cap, use_postag, use_char):
        super(WordRep, self).__init__()

        self.gpu = data.HP_gpu
        self.use_char = use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0

        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim

            self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, data.HP_dropout,
                                        self.gpu)

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        self.feature_num = 0
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()

        if use_cap:
            self.feature_num += 1
            alphabet_id = data.feature_name2id['[Cap]']
            emb = nn.Embedding(data.feature_alphabets[alphabet_id].size(),
                               self.feature_embedding_dims[alphabet_id])
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[alphabet_id].size(),
                        self.feature_embedding_dims[alphabet_id])))
            self.feature_embeddings.append(emb)

        if use_postag:
            self.feature_num += 1
            alphabet_id = data.feature_name2id['[POS]']
            emb = nn.Embedding(data.feature_alphabets[alphabet_id].size(),
                               self.feature_embedding_dims[alphabet_id])
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[alphabet_id].size(),
                        self.feature_embedding_dims[alphabet_id])))
            self.feature_embeddings.append(emb)

        self.use_position = use_position
        if self.use_position:

            position_alphabet_id = data.re_feature_name2id['[POSITION]']
            self.position_embedding_dim = data.re_feature_emb_dims[
                position_alphabet_id]
            self.position1_emb = nn.Embedding(
                data.re_feature_alphabet_sizes[position_alphabet_id],
                self.position_embedding_dim, data.pad_idx)
            self.position1_emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.re_feature_alphabet_sizes[position_alphabet_id],
                        self.position_embedding_dim)))

            self.position2_emb = nn.Embedding(
                data.re_feature_alphabet_sizes[position_alphabet_id],
                self.position_embedding_dim, data.pad_idx)
            self.position2_emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.re_feature_alphabet_sizes[position_alphabet_id],
                        self.position_embedding_dim)))

        if torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda(self.gpu)
            if self.use_position:
                self.position1_emb = self.position1_emb.cuda(self.gpu)
                self.position2_emb = self.position2_emb.cuda(self.gpu)
Example No. 13
    def __init__(self, name, embedding_size):
        self.name = name
        self.embedding_size = embedding_size
        self.is_cuda_available = torch.cuda.is_available()
        self.cnn = CharCNN(16, embedding_size, 3)
        self.log("Done")
Example No. 14
    def __init__(self,
                 word_sents,
                 name,
                 embedding_size,
                 filler,
                 force_train=False,
                 ce_enabled=True,
                 tf_enabled=True):
        # The following parameters help to distinguish models
        self.name = name
        self.embedding_size = embedding_size

        # Filler is a functor which provides embedding vector
        # when a word or its substring cannot be found in a dictionary
        self.filler = filler

        # Character embedding dramatically increases the precision of taggers
        # but it's not always necessary
        self.ce_enabled = ce_enabled

        self.tf_enabled = tf_enabled

        # Grammer is used to transform a word into a list of its
        # character n-grams
        self.grammer = CharGram()

        # Used models
        self.word_model = None
        self.char_model = None

        # CUDA flag
        self.is_cuda_available = torch.cuda.is_available()

        # CharCNN
        self.cnn = CharCNN(16, embedding_size, 5)

        # Model misses.
        # Can be used for debugging and as an aggregate estimation
        # of vocabulary completeness
        self.num_word_misses = 0
        self.num_char_misses = 0
        self.num_words = 0
        self.num_grams = 0

        # Trained models should be stored
        word_model_path = "internal/{}_{}_word_model.w2v".format(
            name, embedding_size)

        # Very
        # Important
        # Decision
        train_word_model = False

        if force_train:
            train_word_model = True
        else:
            if not os.path.exists(word_model_path):
                train_word_model = True

        # Word model
        if train_word_model:
            # Train model
            self.log("Training word model...")
            iterator = LowercaseSentences(word_sents)
            # iterator = word_sents
            self.word_model = gensim.models.Word2Vec(iterator,
                                                     size=embedding_size,
                                                     sg=1,
                                                     workers=4,
                                                     min_count=3)
            self.word_model.save(word_model_path)

        else:
            # Load
            self.log("Loading existing word model...")
            self.word_model = gensim.models.Word2Vec.load(word_model_path)
        self.log("Done")
Example No. 15
    def __init__(self, char_size, output_size, kernel_width, vocab_size):
        super(CharCNNTrainer, self).__init__()
        self.cnn = CharCNN(char_size, output_size, kernel_width)
        self.linear = nn.Linear(output_size, vocab_size)
        if torch.cuda.is_available():
            self.linear = self.linear.cuda()
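
A hypothetical instantiation of Example No. 15's trainer. The argument values are placeholders, the project's CharCNN class is assumed to be importable, and the expected input format of self.cnn is not shown in this snippet.

    trainer = CharCNNTrainer(char_size=128, output_size=64,
                             kernel_width=3, vocab_size=20000)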