def __init__(self, data):
    """Build a batched BiLSTM tagger.

    Components: optional character encoder (CNN or LSTM), word embeddings
    (pretrained when available, otherwise random), a (bi)LSTM encoder and a
    linear projection from hidden states to tag space.

    data: configuration object carrying HP_* hyper-parameters, alphabets
    and optional pretrained embeddings.
    """
    super(BiLSTM, self).__init__()
    # FIX: was a Python-2 print statement; parenthesised call works on 2 and 3.
    print("build batched bilstm...")
    self.gpu = data.HP_gpu
    self.use_char = data.HP_use_char
    self.batch_size = data.HP_batch_size
    self.char_hidden_dim = 0
    self.average_batch = data.HP_average_batch_loss
    if self.use_char:
        self.char_hidden_dim = data.HP_char_hidden_dim
        self.char_embedding_dim = data.char_emb_dim
        if data.char_features == "CNN":
            self.char_feature = CharCNN(data.char_alphabet.size(),
                                        self.char_embedding_dim,
                                        self.char_hidden_dim,
                                        data.HP_dropout, self.gpu)
        elif data.char_features == "LSTM":
            self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                           self.char_embedding_dim,
                                           self.char_hidden_dim,
                                           data.HP_dropout, self.gpu)
        else:
            print("Error char feature selection, please check parameter data.char_features (either CNN or LSTM).")
            # BUGFIX: was exit(0) -- a configuration error must not signal success.
            exit(1)
    self.embedding_dim = data.word_emb_dim
    self.hidden_dim = data.HP_hidden_dim
    self.drop = nn.Dropout(data.HP_dropout)
    self.droplstm = nn.Dropout(data.HP_dropout)
    self.word_embeddings = nn.Embedding(data.word_alphabet.size(),
                                        self.embedding_dim)
    self.bilstm_flag = data.HP_bilstm
    self.lstm_layer = data.HP_lstm_layer
    if data.pretrain_word_embedding is not None:
        self.word_embeddings.weight.data.copy_(
            torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_embeddings.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.word_alphabet.size(),
                                      self.embedding_dim)))
    # The LSTM takes word embeddings as inputs, and outputs hidden states
    # with dimensionality hidden_dim; when bidirectional, each direction
    # gets half so the concatenated output is still hidden_dim wide.
    if self.bilstm_flag:
        lstm_hidden = data.HP_hidden_dim // 2
    else:
        lstm_hidden = data.HP_hidden_dim
    self.lstm = nn.LSTM(self.embedding_dim + self.char_hidden_dim,
                        lstm_hidden,
                        num_layers=self.lstm_layer,
                        batch_first=True,
                        bidirectional=self.bilstm_flag)
    # The linear layer that maps from hidden state space to tag space.
    self.hidden2tag = nn.Linear(self.hidden_dim, data.label_alphabet_size)
    if self.gpu:
        self.drop = self.drop.cuda()
        self.droplstm = self.droplstm.cuda()
        self.word_embeddings = self.word_embeddings.cuda()
        self.lstm = self.lstm.cuda()
        self.hidden2tag = self.hidden2tag.cuda()
def __init__(self, vocab_size, word_embed_dim, word_hidden_dim, alphabet_size,
             char_embedding_dim, char_hidden_dim, feature_extractor, tag_num,
             dropout, pretrain_embed=None, use_char=False, use_crf=False,
             use_gpu=False):
    """Word-level NER model with an optional char CNN and an optional CRF head.

    feature_extractor selects the sentence encoder: 'lstm' for a BiLSTM,
    anything else for a stack of 1-D convolutions.
    """
    super(NamedEntityRecog, self).__init__()
    self.use_crf = use_crf
    self.use_char = use_char
    self.drop = nn.Dropout(dropout)
    self.input_dim = word_embed_dim
    self.feature_extractor = feature_extractor

    self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
    # Initialise from pretrained vectors when given, otherwise randomly.
    init_weights = (pretrain_embed if pretrain_embed is not None
                    else self.random_embedding(vocab_size, word_embed_dim))
    self.embeds.weight.data.copy_(torch.from_numpy(init_weights))

    if self.use_char:
        # Char features are concatenated to the word embedding.
        self.input_dim += char_hidden_dim
        self.char_feature = CharCNN(alphabet_size, char_embedding_dim,
                                    char_hidden_dim, dropout)

    if feature_extractor == 'lstm':
        self.lstm = nn.LSTM(self.input_dim, word_hidden_dim,
                            batch_first=True, bidirectional=True)
    else:
        width = word_hidden_dim * 2
        self.word2cnn = nn.Linear(self.input_dim, width)
        layers = []
        for _ in range(4):
            layers += [
                nn.Conv1d(width, width, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.BatchNorm1d(width),
            ]
        self.cnn_list = layers
        self.cnn = nn.Sequential(*layers)

    out_dim = word_hidden_dim * 2
    if self.use_crf:
        # CRF decoding needs two extra states (start/stop).
        self.hidden2tag = nn.Linear(out_dim, tag_num + 2)
        self.crf = CRF(tag_num, use_gpu)
    else:
        self.hidden2tag = nn.Linear(out_dim, tag_num)
def __init__(self, args):
    """Set up left/right context windows, a char CNN and the gating layers.

    args provides the vocabulary (wtoi), word vectors (wvec) and query
    words (qws).
    """
    super().__init__()
    left_win, right_win, lower_bound = 7, 10, 0.0
    self.wtoi = args.wtoi
    self.wvec = args.wvec
    unique_qws = list(set(args.qws))
    self.w2l, self.w2r = get_lr(self.wtoi.keys(), unique_qws,
                                left_win, right_win, lower_bound)
    self.cdim = 100
    self.charcnn = CharCNN(self.cdim)

    def _uniform_square():
        # fresh (cdim x cdim) parameter, uniform in [-0.1, 0.1)
        return nn.Parameter(
            torch.zeros(self.cdim, self.cdim).uniform_(-0.1, 0.1))

    self.lW = _uniform_square()
    self.rW = _uniform_square()
    self.gL = nn.Linear(left_win + right_win, 2)
def __init__(self, args, model_params):
    """Word-plus-optional-char embedding layer.

    args holds runtime switches (gpu, dropout, char extractor name);
    model_params holds alphabets, dimensions and pretrained matrices.
    """
    super(WordEmbedding, self).__init__()
    self.gpu = args.ifgpu
    self.use_char = args.useChar
    self.char_hidden_dim = args.char_hidden_dim
    self.char_embedding_dim = args.char_embedding_dim
    self.embedding_dim = model_params.embedding_dim
    self.drop = nn.Dropout(args.dropout)

    # char embedding: select the extractor class by its configured name
    if self.use_char:
        if args.charExtractor not in ("CNN", "LSTM"):
            print(
                "Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM)."
            )
            exit(0)
        extractor_cls = CharCNN if args.charExtractor == "CNN" else CharBiLSTM
        self.char_feature = extractor_cls(
            model_params.char_alphabet.size(),
            model_params.pretrain_char_embedding, self.char_embedding_dim,
            self.char_hidden_dim, args.dropout, self.gpu)

    # word embedding, initialised from pretrained vectors when available
    self.word_embedding = nn.Embedding(model_params.word_alphabet.size(),
                                       self.embedding_dim)
    weights = (model_params.pretrain_word_embedding
               if model_params.pretrain_word_embedding is not None
               else self.random_embedding(model_params.word_alphabet.size(),
                                          self.embedding_dim))
    self.word_embedding.weight.data.copy_(torch.from_numpy(weights))
def __init__(self, data, use_position, use_cap, use_postag, use_char):
    """Word representation layer: word embeddings plus optional char,
    capitalisation, POS and entity-relative position features.

    data: configuration with alphabets, HP_* hyper-parameters and
    pretrained embeddings; the use_* flags switch each feature on/off.
    """
    super(WordRep, self).__init__()
    # FIX: was a Python-2 print statement; parenthesised call works on 2 and 3.
    print("build word representation...")
    self.gpu = data.HP_gpu
    self.use_char = use_char
    self.batch_size = data.HP_batch_size
    self.char_hidden_dim = 0
    self.char_all_feature = False
    if self.use_char:
        self.char_hidden_dim = data.HP_char_hidden_dim
        self.char_embedding_dim = data.char_emb_dim
        # every char encoder here takes the same constructor arguments
        char_args = (data.char_alphabet.size(), data.pretrain_char_embedding,
                     self.char_embedding_dim, self.char_hidden_dim,
                     data.HP_dropout, self.gpu)
        if data.char_feature_extractor == "CNN":
            self.char_feature = CharCNN(*char_args)
        elif data.char_feature_extractor == "LSTM":
            self.char_feature = CharBiLSTM(*char_args)
        elif data.char_feature_extractor == "GRU":
            self.char_feature = CharBiGRU(*char_args)
        elif data.char_feature_extractor == "ALL":
            # CNN plus an extra BiLSTM char encoder, concatenated downstream
            self.char_all_feature = True
            self.char_feature = CharCNN(*char_args)
            self.char_feature_extra = CharBiLSTM(*char_args)
        else:
            print("Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL).")
            # BUGFIX: was exit(0) -- an error exit must be non-zero.
            exit(1)
    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(data.HP_dropout)
    self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                       self.embedding_dim)
    if data.pretrain_word_embedding is not None:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.word_alphabet.size(),
                                      self.embedding_dim)))

    def _feature_embedding(alphabet_id):
        # one randomly-initialised embedding table per hand-crafted feature
        size = data.feature_alphabets[alphabet_id].size()
        dim = self.feature_embedding_dims[alphabet_id]
        emb = nn.Embedding(size, dim)
        emb.weight.data.copy_(torch.from_numpy(self.random_embedding(size, dim)))
        return emb

    self.feature_num = 0
    self.feature_embedding_dims = data.feature_emb_dims
    self.feature_embeddings = nn.ModuleList()
    if use_cap:
        self.feature_num += 1
        self.feature_embeddings.append(
            _feature_embedding(data.feature_name2id['[Cap]']))
    if use_postag:
        self.feature_num += 1
        self.feature_embeddings.append(
            _feature_embedding(data.feature_name2id['[POS]']))

    self.use_position = use_position
    if self.use_position:
        position_alphabet_id = data.re_feature_name2id['[POSITION]']
        self.position_embedding_dim = data.re_feature_emb_dims[
            position_alphabet_id]
        pos_size = data.re_feature_alphabet_sizes[position_alphabet_id]

        def _position_embedding():
            # padded embedding for entity-relative positions
            emb = nn.Embedding(pos_size, self.position_embedding_dim,
                               data.pad_idx)
            emb.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(pos_size,
                                          self.position_embedding_dim)))
            return emb

        self.position1_emb = _position_embedding()
        self.position2_emb = _position_embedding()

    if torch.cuda.is_available():
        self.drop = self.drop.cuda(self.gpu)
        self.word_embedding = self.word_embedding.cuda(self.gpu)
        for idx in range(self.feature_num):
            self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda(
                self.gpu)
        if self.use_position:
            self.position1_emb = self.position1_emb.cuda(self.gpu)
            self.position2_emb = self.position2_emb.cuda(self.gpu)
class WordRep(nn.Module):
    """Token representation layer.

    Concatenates word embeddings with optional char-encoder output,
    capitalisation / POS feature embeddings and entity-relative position
    embeddings, then applies dropout.
    """

    def __init__(self, data, use_position, use_cap, use_postag, use_char):
        super(WordRep, self).__init__()
        # FIX: was a Python-2 print statement; parenthesised call works on 2 and 3.
        print("build word representation...")
        self.gpu = data.HP_gpu
        self.use_char = use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            # every char encoder here takes the same constructor arguments
            char_args = (data.char_alphabet.size(),
                         data.pretrain_char_embedding,
                         self.char_embedding_dim, self.char_hidden_dim,
                         data.HP_dropout, self.gpu)
            if data.char_feature_extractor == "CNN":
                self.char_feature = CharCNN(*char_args)
            elif data.char_feature_extractor == "LSTM":
                self.char_feature = CharBiLSTM(*char_args)
            elif data.char_feature_extractor == "GRU":
                self.char_feature = CharBiGRU(*char_args)
            elif data.char_feature_extractor == "ALL":
                # CNN plus an extra BiLSTM char encoder, concatenated downstream
                self.char_all_feature = True
                self.char_feature = CharCNN(*char_args)
                self.char_feature_extra = CharBiLSTM(*char_args)
            else:
                print("Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL).")
                # BUGFIX: was exit(0) -- an error exit must be non-zero.
                exit(1)
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        def _feature_embedding(alphabet_id):
            # one randomly-initialised embedding table per hand-crafted feature
            size = data.feature_alphabets[alphabet_id].size()
            dim = self.feature_embedding_dims[alphabet_id]
            emb = nn.Embedding(size, dim)
            emb.weight.data.copy_(
                torch.from_numpy(self.random_embedding(size, dim)))
            return emb

        self.feature_num = 0
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        if use_cap:
            self.feature_num += 1
            self.feature_embeddings.append(
                _feature_embedding(data.feature_name2id['[Cap]']))
        if use_postag:
            self.feature_num += 1
            self.feature_embeddings.append(
                _feature_embedding(data.feature_name2id['[POS]']))

        self.use_position = use_position
        if self.use_position:
            position_alphabet_id = data.re_feature_name2id['[POSITION]']
            self.position_embedding_dim = data.re_feature_emb_dims[
                position_alphabet_id]
            pos_size = data.re_feature_alphabet_sizes[position_alphabet_id]

            def _position_embedding():
                # padded embedding for entity-relative positions
                emb = nn.Embedding(pos_size, self.position_embedding_dim,
                                   data.pad_idx)
                emb.weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(pos_size,
                                              self.position_embedding_dim)))
                return emb

            self.position1_emb = _position_embedding()
            self.position2_emb = _position_embedding()

        if torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda(self.gpu)
            if self.use_position:
                self.position1_emb = self.position1_emb.cuda(self.gpu)
                self.position2_emb = self.position2_emb.cuda(self.gpu)

    def random_embedding(self, vocab_size, embedding_dim):
        """Return a (vocab_size, embedding_dim) matrix with rows drawn
        uniformly from (-s, s), s = sqrt(3 / embedding_dim)."""
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, feature_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover,
                position1_inputs, position2_inputs):
        """
        input:
            word_inputs: (batch_size, sent_len)
            features: list [(batch_size, sent_len), (batch_len, sent_len), ...]
            word_seq_lengths: list of batch_size, (batch_size, 1)
            char_inputs: (batch_size*sent_len, word_length)
            char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
            char_seq_recover: variable which records the char order information, used to recover char order
        output:
            Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]
        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))
        if self.use_char:
            # calculate char encoder last hidden state per word, restore the
            # original word order, then reshape to (batch, sent_len, -1)
            char_features = self.char_feature.get_last_hiddens(
                char_inputs, char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
            word_list.append(char_features)
            # BUGFIX: removed dead `word_embs = torch.cat([word_embs,
            # char_features], 2)` -- its result was unconditionally
            # overwritten by the torch.cat(word_list, 2) below.
            if self.char_all_feature:
                char_features_extra = self.char_feature_extra.get_last_hiddens(
                    char_inputs, char_seq_lengths.cpu().numpy())
                char_features_extra = char_features_extra[char_seq_recover]
                char_features_extra = char_features_extra.view(
                    batch_size, sent_len, -1)
                # concat word and char together
                word_list.append(char_features_extra)
        if self.use_position:
            position1_feature = self.position1_emb(position1_inputs)
            position2_feature = self.position2_emb(position2_inputs)
            word_list.append(position1_feature)
            word_list.append(position2_feature)
        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent
def __init__(self, data):
    """Word representation: word embeddings, an optional char encoder
    (selected by data.char_seq_feature) and extra per-token feature
    embeddings, all configured from `data`."""
    super(WordRep, self).__init__()
    # FIX: was a Python-2 print statement; parenthesised call works on 2 and 3.
    print("build word representation...")
    self.gpu = data.HP_gpu
    self.use_char = data.use_char
    self.batch_size = data.HP_batch_size
    self.char_hidden_dim = 0
    self.char_all_feature = False
    if self.use_char:
        self.char_hidden_dim = data.HP_char_hidden_dim
        self.char_embedding_dim = data.char_emb_dim
        # every char encoder here takes the same constructor arguments
        char_args = (data.char_alphabet.size(), self.char_embedding_dim,
                     self.char_hidden_dim, data.HP_dropout, self.gpu)
        if data.char_seq_feature == "CNN":
            self.char_feature = CharCNN(*char_args)
        elif data.char_seq_feature == "LSTM":
            self.char_feature = CharBiLSTM(*char_args)
        elif data.char_seq_feature == "GRU":
            self.char_feature = CharBiGRU(*char_args)
        elif data.char_seq_feature == "ALL":
            # CNN plus an extra BiLSTM char encoder
            self.char_all_feature = True
            self.char_feature = CharCNN(*char_args)
            self.char_feature_extra = CharBiLSTM(*char_args)
        else:
            print("Error char feature selection, please check parameter data.char_seq_feature (CNN/LSTM/GRU/ALL).")
            # BUGFIX: was exit(0) -- an error exit must be non-zero.
            exit(1)
    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(data.HP_dropout)
    self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                       self.embedding_dim)
    if data.pretrain_word_embedding is not None:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.word_alphabet.size(),
                                      self.embedding_dim)))
    # Extra feature embeddings: tables are created first and initialised in
    # a second pass, preserving the original torch/numpy RNG call order.
    self.feature_num = data.feature_num
    self.feature_embedding_dims = data.feature_emb_dims
    self.feature_embeddings = nn.ModuleList()
    for idx in range(self.feature_num):
        self.feature_embeddings.append(
            nn.Embedding(data.feature_alphabets[idx].size(),
                         self.feature_embedding_dims[idx]))
    for idx in range(self.feature_num):
        if data.pretrain_feature_embeddings[idx] is not None:
            self.feature_embeddings[idx].weight.data.copy_(
                torch.from_numpy(data.pretrain_feature_embeddings[idx]))
        else:
            self.feature_embeddings[idx].weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[idx].size(),
                        self.feature_embedding_dims[idx])))
    if self.gpu:
        self.drop = self.drop.cuda()
        self.word_embedding = self.word_embedding.cuda()
        for idx in range(self.feature_num):
            self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda()
def __init__(self, data, opt):
    """Word representation: word embeddings plus either projected ELMo
    vectors (opt.elmo) or a char-CNN, with optional extra feature tables."""
    super(WordRep, self).__init__()
    self.gpu = opt.gpu
    self.batch_size = opt.batch_size
    self.use_elmo = bool(opt.elmo)
    if self.use_elmo:
        logging.info("use elmo, loading ...")
        self.elmo = Embedder(data.config['elmo_path'])
        # project the ELMo representation down to the char-hidden size
        elmo_dim = self.elmo.config['encoder']['projection_dim'] * 2
        self.elmo_projection = nn.Linear(elmo_dim, opt.char_hidden_dim, False)
        self.elmo_drop = nn.Dropout(opt.dropout)
    else:
        self.char_hidden_dim = opt.char_hidden_dim
        self.char_embedding_dim = opt.char_emb_dim
        self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                    self.char_embedding_dim,
                                    self.char_hidden_dim, opt.dropout,
                                    self.gpu)

    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(opt.dropout)
    self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                       self.embedding_dim)
    # pretrained word vectors when provided, random init otherwise
    pretrained = data.pretrain_word_embedding
    if pretrained is None:
        pretrained = self.random_embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
    self.word_embedding.weight.data.copy_(torch.from_numpy(pretrained))

    if data.feat_config is not None:
        # one randomly-initialised table per extra hand-crafted feature
        self.feature_num = len(data.feature_alphabets)
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        for idx in range(self.feature_num):
            size = data.feature_alphabets[idx].size()
            dim = self.feature_embedding_dims[idx]
            emb = nn.Embedding(size, dim)
            emb.weight.data.copy_(
                torch.from_numpy(self.random_embedding(size, dim)))
            self.feature_embeddings.append(emb)
    else:
        self.feature_num = 0

    # move submodules to the requested device (opt.gpu < 0 means CPU)
    if opt.gpu >= 0 and torch.cuda.is_available():
        self.drop = self.drop.cuda(self.gpu)
        self.word_embedding = self.word_embedding.cuda(self.gpu)
        if data.feat_config is not None:
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda(self.gpu)
        if opt.elmo:
            self.elmo_projection = self.elmo_projection.cuda(self.gpu)
            self.elmo_drop = self.elmo_drop.cuda(self.gpu)
class WordRep(nn.Module):
    """Token representation layer: word embeddings concatenated with extra
    feature embeddings and either projected ELMo vectors or char-CNN
    features, followed by dropout."""

    def __init__(self, data, opt):
        # data: corpus/config object (alphabets, pretrained embeddings,
        # feat_config); opt: hyper-parameter namespace (gpu, elmo, dims, dropout).
        super(WordRep, self).__init__()
        self.gpu = opt.gpu
        self.batch_size = opt.batch_size
        self.use_elmo = False
        if opt.elmo:
            logging.info("use elmo, loading ...")
            self.use_elmo = True
            self.elmo = Embedder(data.config['elmo_path'])
            # we project the elmo representation to the same dim of char embedding
            self.elmo_projection = nn.Linear(
                self.elmo.config['encoder']['projection_dim'] * 2,
                opt.char_hidden_dim, False)
            self.elmo_drop = nn.Dropout(opt.dropout)
        else:
            self.char_hidden_dim = opt.char_hidden_dim
            self.char_embedding_dim = opt.char_emb_dim
            self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, opt.dropout,
                                        self.gpu)
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(opt.dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        # pretrained word vectors when provided, random init otherwise
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))
        if data.feat_config is not None:
            # one randomly-initialised table per extra hand-crafted feature
            self.feature_num = len(data.feature_alphabets)
            self.feature_embedding_dims = data.feature_emb_dims
            self.feature_embeddings = nn.ModuleList()
            for idx in range(self.feature_num):
                emb = nn.Embedding(data.feature_alphabets[idx].size(),
                                   self.feature_embedding_dims[idx])
                emb.weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))
                self.feature_embeddings.append(emb)
        else:
            self.feature_num = 0
        # move submodules to the requested device (opt.gpu < 0 means CPU)
        if opt.gpu >= 0 and torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            if data.feat_config is not None:
                for idx in range(self.feature_num):
                    self.feature_embeddings[idx] = self.feature_embeddings[
                        idx].cuda(self.gpu)
            if opt.elmo:
                self.elmo_projection = self.elmo_projection.cuda(self.gpu)
                self.elmo_drop = self.elmo_drop.cuda(self.gpu)

    def random_embedding(self, vocab_size, embedding_dim):
        """Return a (vocab_size, embedding_dim) matrix with rows drawn
        uniformly from (-s, s), s = sqrt(3 / embedding_dim)."""
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, word_seq_lengths, char_inputs,
                char_seq_lengths, char_seq_recover, feature_inputs,
                text_inputs):
        """
        input:
            word_inputs: (batch_size, sent_len)
            features: list [(batch_size, sent_len), (batch_len, sent_len),...]
            word_seq_lengths: list of batch_size, (batch_size,1)
            char_inputs: (batch_size*sent_len, word_length)
            char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
            char_seq_recover: variable which records the char order information, used to recover char order
        output:
            Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]
        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))
        if self.use_elmo:
            # ELMo is used frozen: no gradient through the embedder
            with torch.no_grad():
                elmo_rep = torch.from_numpy(
                    np.array(self.elmo.sents2elmo(
                        text_inputs)))  # batch, seq_len, 1024
            if self.gpu >= 0 and torch.cuda.is_available():
                elmo_rep = elmo_rep.cuda(self.gpu)
            char_features = self.elmo_drop(self.elmo_projection(elmo_rep))
            # char_features = elmo_rep
        else:
            # char CNN output is per word; restore the original word order
            # and reshape back to (batch, sent_len, char_hidden)
            char_features = self.char_feature.get_last_hiddens(
                char_inputs, char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
        word_list.append(char_features)
        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent
def __init__(self, data):
    """Word representation with optional translation (trans) features,
    an optional char encoder and extra feature embedding tables.

    When use_mapping is on, a linear layer `w` (optionally followed by
    tanh/sigmoid) maps word embeddings into the trans hidden space.
    """
    super(WordRep, self).__init__()
    # FIX: was a Python-2 print statement; parenthesised call works on 2 and 3.
    print("Build word representation...")
    self.gpu = data.HP_gpu
    self.use_char = data.use_char
    self.use_trans = data.use_trans
    self.batch_size = data.HP_batch_size
    self.char_hidden_dim = 0
    self.char_all_feature = False
    self.use_mapping = data.use_mapping
    self.mapping_func = data.mapping_func
    # translation-level features
    if self.use_trans:
        if self.use_mapping:
            # linear mapping from word embedding space to trans hidden space
            self.w = nn.Linear(data.word_emb_dim, data.HP_trans_hidden_dim)
            # non-linear mapping: w + tanh or w + sigmoid
            if self.mapping_func:
                if self.mapping_func == "tanh":
                    self.non_linear = nn.Tanh()
                elif self.mapping_func == "sigmoid":
                    self.non_linear = nn.Sigmoid()
        self.trans_hidden_dim = data.HP_trans_hidden_dim
        self.trans_embedding_dim = data.trans_emb_dim
        self.trans_feature = TransBiLSTM(data.translation_alphabet.size(),
                                         self.trans_embedding_dim,
                                         self.trans_hidden_dim,
                                         data.HP_dropout,
                                         data.pretrain_trans_embedding,
                                         self.gpu)
    # word-level char features
    if self.use_char:
        self.char_hidden_dim = data.HP_char_hidden_dim
        self.char_embedding_dim = data.char_emb_dim
        if data.char_seq_feature == "CNN":
            self.char_feature = CharCNN(data.char_alphabet.size(),
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, data.HP_dropout,
                                        data.pretrain_char_embedding,
                                        self.gpu)
        elif data.char_seq_feature == "LSTM":
            self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                           self.char_embedding_dim,
                                           self.char_hidden_dim,
                                           data.HP_dropout,
                                           data.pretrain_char_embedding,
                                           self.gpu)
        elif data.char_seq_feature == "GRU":
            # NOTE: CharBiGRU takes no pretrained char embedding here
            self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                          self.char_embedding_dim,
                                          self.char_hidden_dim,
                                          data.HP_dropout, self.gpu)
        elif data.char_seq_feature == "ALL":
            self.char_all_feature = True
            self.char_feature = CharCNN(data.char_alphabet.size(),
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, data.HP_dropout,
                                        data.pretrain_char_embedding,
                                        self.gpu)
            self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(),
                                                 self.char_embedding_dim,
                                                 self.char_hidden_dim,
                                                 data.HP_dropout, self.gpu)
        else:
            print("Error char feature selection, please check parameter data.char_seq_feature (CNN/LSTM/GRU/ALL).")
            # BUGFIX: was exit(0) -- an error exit must be non-zero.
            exit(1)
    # Word embedding
    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(data.HP_dropout)
    self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                       self.embedding_dim)
    if data.pretrain_word_embedding is not None:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(
                self.random_embedding(data.word_alphabet.size(),
                                      self.embedding_dim)))
    # not use: extra feature tables (created first, initialised second to
    # preserve the original torch/numpy RNG call order)
    self.feature_num = data.feature_num
    self.feature_embedding_dims = data.feature_emb_dims
    self.feature_embeddings = nn.ModuleList()
    for idx in range(self.feature_num):
        self.feature_embeddings.append(
            nn.Embedding(data.feature_alphabets[idx].size(),
                         self.feature_embedding_dims[idx]))
    for idx in range(self.feature_num):
        if data.pretrain_feature_embeddings[idx] is not None:
            self.feature_embeddings[idx].weight.data.copy_(
                torch.from_numpy(data.pretrain_feature_embeddings[idx]))
        else:
            self.feature_embeddings[idx].weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(
                        data.feature_alphabets[idx].size(),
                        self.feature_embedding_dims[idx])))
    if self.gpu:
        self.drop = self.drop.cuda()
        self.word_embedding = self.word_embedding.cuda()
        for idx in range(self.feature_num):
            self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda()
class WordRep(nn.Module):
    """Token representation with optional translation (trans) features.

    forward() returns the dropped-out concatenated representation plus,
    when mapping is enabled, the mapped word embeddings and the raw
    per-token trans features (both None otherwise).
    """

    def __init__(self, data):
        super(WordRep, self).__init__()
        # FIX: was a Python-2 print statement; parenthesised call works on 2 and 3.
        print("Build word representation...")
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        self.use_trans = data.use_trans
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        self.use_mapping = data.use_mapping
        self.mapping_func = data.mapping_func
        # translation-level features
        if self.use_trans:
            if self.use_mapping:
                # linear mapping from word embedding space to trans hidden space
                self.w = nn.Linear(data.word_emb_dim, data.HP_trans_hidden_dim)
                # non-linear mapping: w + tanh or w + sigmoid
                if self.mapping_func:
                    if self.mapping_func == "tanh":
                        self.non_linear = nn.Tanh()
                    elif self.mapping_func == "sigmoid":
                        self.non_linear = nn.Sigmoid()
            self.trans_hidden_dim = data.HP_trans_hidden_dim
            self.trans_embedding_dim = data.trans_emb_dim
            self.trans_feature = TransBiLSTM(
                data.translation_alphabet.size(), self.trans_embedding_dim,
                self.trans_hidden_dim, data.HP_dropout,
                data.pretrain_trans_embedding, self.gpu)
        # word-level char features
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_seq_feature == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout,
                                            data.pretrain_char_embedding,
                                            self.gpu)
            elif data.char_seq_feature == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout,
                                               data.pretrain_char_embedding,
                                               self.gpu)
            elif data.char_seq_feature == "GRU":
                # NOTE: CharBiGRU takes no pretrained char embedding here
                self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                              self.char_embedding_dim,
                                              self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_seq_feature == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout,
                                            data.pretrain_char_embedding,
                                            self.gpu)
                self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(),
                                                     self.char_embedding_dim,
                                                     self.char_hidden_dim,
                                                     data.HP_dropout, self.gpu)
            else:
                print("Error char feature selection, please check parameter data.char_seq_feature (CNN/LSTM/GRU/ALL).")
                # BUGFIX: was exit(0) -- an error exit must be non-zero.
                exit(1)
        # Word embedding
        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))
        # not use: extra feature tables (created first, initialised second to
        # preserve the original torch/numpy RNG call order)
        self.feature_num = data.feature_num
        self.feature_embedding_dims = data.feature_emb_dims
        self.feature_embeddings = nn.ModuleList()
        for idx in range(self.feature_num):
            self.feature_embeddings.append(
                nn.Embedding(data.feature_alphabets[idx].size(),
                             self.feature_embedding_dims[idx]))
        for idx in range(self.feature_num):
            if data.pretrain_feature_embeddings[idx] is not None:
                self.feature_embeddings[idx].weight.data.copy_(
                    torch.from_numpy(data.pretrain_feature_embeddings[idx]))
            else:
                self.feature_embeddings[idx].weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))
        if self.gpu:
            self.drop = self.drop.cuda()
            self.word_embedding = self.word_embedding.cuda()
            for idx in range(self.feature_num):
                self.feature_embeddings[idx] = self.feature_embeddings[
                    idx].cuda()

    def random_embedding(self, vocab_size, embedding_dim):
        """Return a (vocab_size, embedding_dim) matrix with rows drawn
        uniformly from (-s, s), s = sqrt(3 / embedding_dim)."""
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, feature_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover, trans_inputs,
                trans_seq_length, trans_seq_recover):
        """
        input:
            word_inputs: (batch_size, sent_len)
            features: list [(batch_size, sent_len), (batch_len, sent_len), ...]
            word_seq_lengths: list of batch_size, (batch_size, 1)
            char_inputs: (batch_size*sent_len, word_length)
            char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
            char_seq_recover: variable which records the char order information, used to recover char order
        output:
            (word_represent, word_embed_mapping, trans_features_wc_temp)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]
        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))
        if self.use_char:
            # calculate char lstm last hidden, restore word order, reshape
            char_features, _ = self.char_feature.get_last_hiddens(
                char_inputs, char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
            word_list.append(char_features)
            if self.char_all_feature:
                char_features_extra, _ = self.char_feature_extra.get_last_hiddens(
                    char_inputs, char_seq_lengths.cpu().numpy())
                char_features_extra = char_features_extra[char_seq_recover]
                char_features_extra = char_features_extra.view(
                    batch_size, sent_len, -1)
                # concat word and char together
                word_list.append(char_features_extra)
        trans_features_wc_temp = None
        word_embed_mapping = None
        word_embs_temp = word_embs.view(batch_size * sent_len, -1)
        if self.use_trans:
            trans_features, trans_rnn_length = self.trans_feature.get_last_hiddens(
                trans_inputs, trans_seq_length.cpu().numpy())
            if self.use_mapping:
                trans_features_wc = trans_features
                if self.gpu:
                    # BUGFIX: was a bare `trans_features_wc.cuda()` whose
                    # result was discarded (.cuda() is not in-place).
                    trans_features_wc = trans_features_wc.cuda()
                trans_features_wc = trans_features_wc[trans_seq_recover]
                trans_inputs = trans_inputs[trans_seq_recover]
                # tokens with no translation (first id == 0) fall back to the
                # (optionally non-linear) mapping of their word embedding
                for index, line in enumerate(trans_inputs):
                    if line[0].data.cpu().numpy()[0] == 0:
                        if self.mapping_func:
                            trans_features_wc[index] = self.non_linear(
                                self.w(word_embs_temp[index]))
                        else:
                            trans_features_wc[index] = self.w(
                                word_embs_temp[index])
                trans_features_wc_temp = trans_features_wc
                trans_features_wc = trans_features_wc.view(
                    batch_size, sent_len, -1)
                word_list.append(trans_features_wc)
                if self.mapping_func:
                    word_embed_mapping = self.non_linear(
                        self.w(word_embs_temp))
                else:
                    word_embed_mapping = self.w(word_embs_temp)
            else:
                trans_features = trans_features[trans_seq_recover]
                trans_features = trans_features.view(batch_size, sent_len, -1)
                word_list.append(trans_features)
        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent, word_embed_mapping, trans_features_wc_temp
def __init__(self, data, use_position, use_cap, use_postag, use_char):
    """Build the word-representation layer.

    Assembles word embeddings plus optional character CNN features,
    hand-crafted feature embeddings (capitalization, POS tag) and a pair
    of position embeddings, all driven by the `data` configuration object.

    :param data: configuration/alphabet holder (HP_* hyper-parameters,
        alphabets, pretrained embeddings).  Project type — attributes used
        are exactly those read below.
    :param use_position: enable position1/position2 embeddings.
    :param use_cap: enable the '[Cap]' feature embedding.
    :param use_postag: enable the '[POS]' feature embedding.
    :param use_char: enable the character-level CNN feature.
    """
    super(WordRep, self).__init__()

    def _rand_emb(vocab_size, emb_dim, padding_idx=None):
        # Shared builder: an nn.Embedding whose weights are initialized
        # from self.random_embedding (defined elsewhere on this class).
        # Replaces four copy-pasted build-then-copy_ blocks.
        emb = nn.Embedding(vocab_size, emb_dim, padding_idx)
        emb.weight.data.copy_(
            torch.from_numpy(self.random_embedding(vocab_size, emb_dim)))
        return emb

    self.gpu = data.HP_gpu
    self.use_char = use_char
    self.batch_size = data.HP_batch_size
    self.char_hidden_dim = 0
    if self.use_char:
        self.char_hidden_dim = data.HP_char_hidden_dim
        self.char_embedding_dim = data.char_emb_dim
        # NOTE(review): second positional arg is None in the original call;
        # its meaning depends on this project's CharCNN signature — confirm.
        self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                    self.char_embedding_dim,
                                    self.char_hidden_dim,
                                    data.HP_dropout, self.gpu)

    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(data.HP_dropout)
    # Word embeddings: pretrained matrix when available, random otherwise.
    if data.pretrain_word_embedding is not None:
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_embedding = _rand_emb(data.word_alphabet.size(),
                                        self.embedding_dim)

    # Optional hand-crafted feature embeddings (order: Cap, then POS),
    # matching the original pair of near-identical if-blocks.
    self.feature_num = 0
    self.feature_embedding_dims = data.feature_emb_dims
    self.feature_embeddings = nn.ModuleList()
    for enabled, feature_name in ((use_cap, '[Cap]'), (use_postag, '[POS]')):
        if enabled:
            self.feature_num += 1
            alphabet_id = data.feature_name2id[feature_name]
            self.feature_embeddings.append(
                _rand_emb(data.feature_alphabets[alphabet_id].size(),
                          self.feature_embedding_dims[alphabet_id]))

    self.use_position = use_position
    if self.use_position:
        # Two independent position embeddings (e.g. distance to each
        # entity in relation extraction — presumably; confirm with caller),
        # both padded at data.pad_idx.
        position_alphabet_id = data.re_feature_name2id['[POSITION]']
        self.position_embedding_dim = data.re_feature_emb_dims[
            position_alphabet_id]
        position_size = data.re_feature_alphabet_sizes[position_alphabet_id]
        self.position1_emb = _rand_emb(position_size,
                                       self.position_embedding_dim,
                                       data.pad_idx)
        self.position2_emb = _rand_emb(position_size,
                                       self.position_embedding_dim,
                                       data.pad_idx)

    if torch.cuda.is_available():
        # Move every owned module onto the device indexed by self.gpu.
        self.drop = self.drop.cuda(self.gpu)
        self.word_embedding = self.word_embedding.cuda(self.gpu)
        for idx in range(self.feature_num):
            self.feature_embeddings[idx] = self.feature_embeddings[
                idx].cuda(self.gpu)
        if self.use_position:
            self.position1_emb = self.position1_emb.cuda(self.gpu)
            self.position2_emb = self.position2_emb.cuda(self.gpu)
def __init__(self, name, embedding_size):
    """Record identifying parameters and build the character CNN.

    :param name: model name, used to distinguish saved models.
    :param embedding_size: dimensionality of the embedding vectors.
    """
    self.name = name
    self.embedding_size = embedding_size
    # Cache CUDA availability once at construction time.
    self.is_cuda_available = torch.cuda.is_available()
    # Positional args presumably (char_size=16, output_size, kernel_width=3)
    # per the sibling CharCNNTrainer usage — confirm against CharCNN.
    self.cnn = CharCNN(16, embedding_size, 3)
    self.log("Done")
def __init__(self, word_sents, name, embedding_size, filler,
             force_train=False, ce_enabled=True, tf_enabled=True):
    """Build (or load from disk) the word/char embedding models.

    :param word_sents: iterable of sentences used to train Word2Vec.
    :param name: model name; part of the on-disk cache filename.
    :param embedding_size: dimensionality of the embedding vectors.
    :param filler: functor providing an embedding vector when a word or
        its substring cannot be found in a dictionary.
    :param force_train: retrain the word model even if a cached one exists.
    :param ce_enabled: enable character embeddings (improves tagger
        precision but is not always necessary).
    :param tf_enabled: enable the tf component (semantics defined by users
        of this flag elsewhere).
    """
    # Identifying parameters — distinguish saved models from each other.
    self.name = name
    self.embedding_size = embedding_size
    self.filler = filler
    self.ce_enabled = ce_enabled
    self.tf_enabled = tf_enabled
    # Grammer transforms a word into a list of its character n-grams.
    self.grammer = CharGram()
    # Models are filled in lazily / below.
    self.word_model = None
    self.char_model = None
    # CUDA flag, cached once.
    self.is_cuda_available = torch.cuda.is_available()
    # CharCNN: positional args presumably (char_size=16, output_size,
    # kernel_width=5) per the sibling CharCNNTrainer usage — confirm.
    self.cnn = CharCNN(16, embedding_size, 5)
    # Miss counters: debugging aid and an aggregate estimation of
    # vocabulary completeness.
    self.num_word_misses = 0
    self.num_char_misses = 0
    self.num_words = 0
    self.num_grams = 0
    # Trained models are cached on disk under internal/.
    word_model_path = "internal/{}_{}_word_model.w2v".format(
        name, embedding_size)
    # Retrain when explicitly forced or when no cached model exists.
    # (Replaces the original four-branch flag ladder with one expression.)
    train_word_model = force_train or not os.path.exists(word_model_path)
    if train_word_model:
        self.log("Training word model...")
        # Sentences are lowercased before training.
        iterator = LowercaseSentences(word_sents)
        # sg=1: skip-gram; min_count=3 drops rare words.
        self.word_model = gensim.models.Word2Vec(iterator,
                                                 size=embedding_size,
                                                 sg=1,
                                                 workers=4,
                                                 min_count=3)
        self.word_model.save(word_model_path)
    else:
        self.log("Loading existing word model...")
        self.word_model = gensim.models.Word2Vec.load(word_model_path)
    self.log("Done")
def __init__(self, char_size, output_size, kernel_width, vocab_size):
    """Wrap a CharCNN with a linear projection onto the vocabulary.

    :param char_size: size of the character alphabet fed to the CNN.
    :param output_size: CNN feature dimensionality.
    :param kernel_width: convolution kernel width.
    :param vocab_size: number of output classes of the linear layer.
    """
    super(CharCNNTrainer, self).__init__()
    self.cnn = CharCNN(char_size, output_size, kernel_width)
    # Project CNN features to vocab-sized logits; move to GPU when one
    # is available.  NOTE(review): only the linear layer is moved here,
    # not self.cnn — presumably handled elsewhere; confirm.
    projection = nn.Linear(output_size, vocab_size)
    self.linear = projection.cuda() if torch.cuda.is_available() else projection