def forward(self, x):
    x = self.embed(x)
    if self.cove:
        outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=None,
                                                    vectors=None,
                                                    layer0=True,
                                                    residual_embeddings=True)
        outputs_both_layer_cove_with_glove.cuda()
        x = outputs_both_layer_cove_with_glove(x, [x.shape[1]] * x.shape[0])
    x = x.unsqueeze(1)
    x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
    x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
    x = torch.cat(x, 1)
    x = self.dropout(x)
    output = self.fully_connected(x)
    return output
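The forward pass above relies on modules built elsewhere. A minimal constructor sketch that is consistent with it (the class name, vocabulary size, filter sizes, and class count are placeholder assumptions, not from the original code):

class CNNTextClassifier(nn.Module):  # hypothetical wrapper for the forward() above
    def __init__(self, vocab_size=20000, embed_dim=300, num_classes=2,
                 kernel_sizes=(3, 4, 5), num_filters=100, use_cove=True):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.cove = use_cove
        # with layer0=True and residual_embeddings=True the CoVe output is assumed
        # to be 1500-d (300 GloVe + 2 x 600 LSTM layers); otherwise plain embeddings
        conv_in = 1500 if use_cove else embed_dim
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (k, conv_in)) for k in kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fully_connected = nn.Linear(num_filters * len(kernel_sizes), num_classes)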
def compute_torch_values(inputs, embeddings):
    model = MTLSTM(n_vocab=embeddings.shape[0],
                   vectors=torch.from_numpy(embeddings.astype(np.float32)))
    model.cuda(0)
    model_inputs = Variable(torch.from_numpy(inputs.astype(np.int64)))
    lengths = torch.from_numpy(
        np.ones((inputs.shape[0],), dtype=np.int64) * inputs.shape[1])
    cove_outputs = model.forward(model_inputs.cuda(), lengths=lengths.cuda())
    torch_output = cove_outputs.data.cpu().numpy()
    print("Torch output shape", torch_output.shape)
    return torch_output
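A minimal usage sketch for the function above (added for illustration): the random matrices below stand in for real GloVe embeddings and token ids, and a CUDA device is required because the function moves the model to GPU.

import numpy as np

vocab_size, emb_dim = 1000, 300
fake_embeddings = np.random.randn(vocab_size, emb_dim).astype(np.float32)
fake_inputs = np.random.randint(0, vocab_size, size=(2, 7))  # 2 sentences, 7 tokens each
features = compute_torch_values(fake_inputs, fake_embeddings)
# Expected to print something like (2, 7, 600) for last-layer CoVe vectors.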
class tmcove(Model):
    def load(self, vectors):
        self.model = MTLSTM(n_vocab=len(vectors.keys()), vectors=vectors)
        self.model.cuda()

    def train(self, X, Y):
        pass

    def predict(self, X):
        X, Y = self.input_function(X, [])
        return [[get_word2vec(token, self.vectors) for token in tokens]
                for tokens in X]
def save_cove_weights(options):
    """Saves the weights of the CoVe LSTM for manual TensorFlow initialization."""
    folder_name = os.path.join(options.data_dir, constants.COVE_WEIGHTS_FOLDER)
    if all([os.path.exists(os.path.join(folder_name, name + ".npy"))
            for name in constants.COVE_WEIGHT_NAMES]):
        print("Cove weights already saved")
        return
    os.makedirs(folder_name, exist_ok=True)
    vocab = get_vocab(options.data_dir)
    embeddings = embedding_util.load_word_embeddings_including_unk_and_padding(options)
    vec_size = 2 * embeddings.shape[1]
    print("Loading CoVe model")
    model = MTLSTM(n_vocab=embeddings.shape[0],
                   vectors=torch.from_numpy(embeddings.astype(np.float32)))
    print("Saving CoVe weights")
    for weight_name in constants.COVE_WEIGHT_NAMES:
        tensor = getattr(model.rnn, weight_name)
        np_value = tensor.cpu().data.numpy()
        full_file_name = os.path.join(folder_name, weight_name + ".npy")
        np.save(full_file_name, np_value)
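A small companion sketch (not part of the original code; the helper name is hypothetical) for reading the saved arrays back, for example to initialize TensorFlow variables by hand. It simply globs the folder written by the function above, so it does not need to know the weight names.

import glob
import os
import numpy as np

def load_cove_weights(folder_name):
    # maps each saved weight name (file stem) to its numpy array
    weights = {}
    for path in glob.glob(os.path.join(folder_name, "*.npy")):
        name = os.path.splitext(os.path.basename(path))[0]
        weights[name] = np.load(path)
    return weights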
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
from cove import MTLSTM

TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(lower=True, include_lengths=True, batch_first=True)

train_path = "C:\\Users\\bhara\\Downloads\\NNNlpHW3\\suggestionMining\\data\\Subtask-A\\V1.4_Training.csv"
train = data.TabularDataset(path=train_path,
                            format='csv',
                            fields=[('id', None), ('sentence', TEXT), ('label', LABEL)])

TEXT.build_vocab(train, vectors=GloVe(name='840B', dim=300, cache='.embeddings'))
LABEL.build_vocab(train)

outputs_cove_with_glove = MTLSTM(n_vocab=len(TEXT.vocab),
                                 vectors=TEXT.vocab.vectors,
                                 residual_embeddings=True,
                                 model_cache='.embeddings')
#glove_then_first_then_last_layer_cove = outputs_both_layer_cove_with_glove(<pass a sentence Glove embedding>)

train_iter = data.Iterator(train, batch_size=100)
z = None
for batch_idx, batch in enumerate(train_iter):
    z = batch
    glove_then_last_layer_cove = outputs_cove_with_glove(*batch.sentence)
    print(glove_then_last_layer_cove.size())
                  NUM),  # we won't be needing the id, so we pass in None as the field
                  ('moment', TEXT)]  # process it as text

tst = data.TabularDataset(
    path=path + "test_data.csv",  # the file path
    format='csv',
    skip_header=True,  # if your csv has a header, make sure to pass this to ensure it doesn't get processed as data!
    fields=tst_datafields)

# build the vocabulary using train and validation datasets and assign the vectors
TEXT.build_vocab(trainds, valds, max_size=100000, vectors=vec)
# build vocab for labels
LABEL.build_vocab(trainds)

outputs_last_layer_cove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors)
outputs_both_layer_cove = MTLSTM(n_vocab=len(TEXT.vocab),
                                 vectors=TEXT.vocab.vectors,
                                 layer0=True)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(TEXT.vocab),
                                            vectors=TEXT.vocab.vectors,
                                            layer0=True,
                                            residual_embeddings=True)

traindl, valdl = data.BucketIterator.splits(
    datasets=(trainds, valds),        # specify train and validation Tabulardataset
    batch_sizes=(64, len(valid)),     # batch size of train and validation
    sort_key=lambda x: len(x.moment), # on what attribute the text should be sorted
    device=None,                      # -1 means cpu, 0 or None means gpu
    sort_within_batch=True,
def __init__(self, field, args):
    super().__init__()
    self.field = field
    self.args = args
    self.pad_idx = self.field.vocab.stoi[self.field.pad_token]

    self.encoder_embeddings = Embedding(field, args.dimension,
                                        dropout=args.dropout_ratio,
                                        project=not args.cove)
    self.decoder_embeddings = Embedding(field, args.dimension,
                                        dropout=args.dropout_ratio,
                                        project=True)

    if self.args.cove or self.args.intermediate_cove:
        self.cove = MTLSTM(model_cache=args.embeddings,
                           layer0=args.intermediate_cove,
                           layer1=args.cove)
        cove_dim = int(args.intermediate_cove) * 600 + int(args.cove) * 600 + 400  # the last 400 is for GloVe and char n-gram embeddings
        self.project_cove = Feedforward(cove_dim, args.dimension)

    self.bilstm_before_coattention = PackedLSTM(args.dimension, args.dimension,
                                                batch_first=True,
                                                dropout=args.dropout_ratio,
                                                bidirectional=True,
                                                num_layers=1)
    self.coattention = CoattentiveLayer(args.dimension, dropout=0.3)
    dim = 2 * args.dimension + args.dimension + args.dimension

    self.context_bilstm_after_coattention = PackedLSTM(dim, args.dimension,
                                                       batch_first=True,
                                                       dropout=args.dropout_ratio,
                                                       bidirectional=True,
                                                       num_layers=args.rnn_layers)
    self.self_attentive_encoder_context = TransformerEncoder(args.dimension,
                                                             args.transformer_heads,
                                                             args.transformer_hidden,
                                                             args.transformer_layers,
                                                             args.dropout_ratio)
    self.bilstm_context = PackedLSTM(args.dimension, args.dimension,
                                     batch_first=True,
                                     dropout=args.dropout_ratio,
                                     bidirectional=True,
                                     num_layers=args.rnn_layers)

    self.question_bilstm_after_coattention = PackedLSTM(dim, args.dimension,
                                                        batch_first=True,
                                                        dropout=args.dropout_ratio,
                                                        bidirectional=True,
                                                        num_layers=args.rnn_layers)
    self.self_attentive_encoder_question = TransformerEncoder(args.dimension,
                                                              args.transformer_heads,
                                                              args.transformer_hidden,
                                                              args.transformer_layers,
                                                              args.dropout_ratio)
    self.bilstm_question = PackedLSTM(args.dimension, args.dimension,
                                      batch_first=True,
                                      dropout=args.dropout_ratio,
                                      bidirectional=True,
                                      num_layers=args.rnn_layers)

    self.self_attentive_decoder = TransformerDecoder(args.dimension,
                                                     args.transformer_heads,
                                                     args.transformer_hidden,
                                                     args.transformer_layers,
                                                     args.dropout_ratio)
    self.dual_ptr_rnn_decoder = DualPtrRNNDecoder(args.dimension, args.dimension,
                                                  dropout=args.dropout_ratio,
                                                  num_layers=args.rnn_layers)

    self.generative_vocab_size = min(len(field.vocab), args.max_generative_vocab)
    self.out = nn.Linear(args.dimension, self.generative_vocab_size)
    self.dropout = nn.Dropout(0.4)
import torch
from torchtext import data
from torchtext import datasets
from cove import MTLSTM

inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
answers = data.Field(sequential=False)

print('Generating train, dev, test splits')
train, dev, test = datasets.SNLI.splits(inputs, answers)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(wv_type='glove.840B', wv_dim=300)
answers.build_vocab(train)

model = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors)
model.cuda(0)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=100, device=0)
train_iter.init_epoch()

print('Generating CoVe')
for batch_idx, batch in enumerate(train_iter):
    model.train()
    cove_premise = model(*batch.premise)
    cove_hypothesis = model(*batch.hypothesis)
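Since the loop above computes CoVe features without inspecting them, here is a quick sanity-check sketch (an addition for illustration, not part of the original script) that looks at a single batch:

first_batch = next(iter(train_iter))
with torch.no_grad():
    premise_vectors = model(*first_batch.premise)
print(premise_vectors.size())  # expected: (batch, premise_length, 600)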
def __init__(self, field, args):
    super().__init__()
    self.field = field
    self.args = args
    self.pad_idx = self.field.vocab.stoi[self.field.pad_token]

    def dp(args):
        return args.dropout_ratio if args.rnn_layers > 1 else 0.

    if self.args.glove_and_char:
        self.encoder_embeddings = Embedding(field, args.dimension,
                                            dropout=args.dropout_ratio,
                                            project=not args.cove)

        if self.args.cove or self.args.intermediate_cove:
            self.cove = MTLSTM(model_cache=args.embeddings,
                               layer0=args.intermediate_cove,
                               layer1=args.cove)
            cove_params = get_trainable_params(self.cove)
            for p in cove_params:
                p.requires_grad = False
            cove_dim = int(args.intermediate_cove) * 600 + int(args.cove) * 600 + 400  # the last 400 is for GloVe and char n-gram embeddings
            self.project_cove = Feedforward(cove_dim, args.dimension)

    if -1 not in self.args.elmo:
        options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, 3, dropout=0.0, do_layer_norm=False)
        elmo_params = get_trainable_params(self.elmo)
        for p in elmo_params:
            p.requires_grad = False
        elmo_dim = 1024 * len(self.args.elmo)
        self.project_elmo = Feedforward(elmo_dim, args.dimension)
        if self.args.glove_and_char:
            self.project_embeddings = Feedforward(2 * args.dimension, args.dimension, dropout=0.0)

    self.decoder_embeddings = Embedding(field, args.dimension,
                                        dropout=args.dropout_ratio, project=True)

    self.bilstm_before_coattention = PackedLSTM(args.dimension, args.dimension,
                                                batch_first=True, bidirectional=True,
                                                num_layers=1, dropout=0)
    self.coattention = CoattentiveLayer(args.dimension, dropout=0.3)
    dim = 2 * args.dimension + args.dimension + args.dimension

    self.context_bilstm_after_coattention = PackedLSTM(dim, args.dimension,
                                                       batch_first=True, dropout=dp(args),
                                                       bidirectional=True,
                                                       num_layers=args.rnn_layers)
    self.self_attentive_encoder_context = TransformerEncoder(args.dimension,
                                                             args.transformer_heads,
                                                             args.transformer_hidden,
                                                             args.transformer_layers,
                                                             args.dropout_ratio)
    self.bilstm_context = PackedLSTM(args.dimension, args.dimension,
                                     batch_first=True, dropout=dp(args),
                                     bidirectional=True, num_layers=args.rnn_layers)

    self.question_bilstm_after_coattention = PackedLSTM(dim, args.dimension,
                                                        batch_first=True, dropout=dp(args),
                                                        bidirectional=True,
                                                        num_layers=args.rnn_layers)
    self.self_attentive_encoder_question = TransformerEncoder(args.dimension,
                                                              args.transformer_heads,
                                                              args.transformer_hidden,
                                                              args.transformer_layers,
                                                              args.dropout_ratio)
    self.bilstm_question = PackedLSTM(args.dimension, args.dimension,
                                      batch_first=True, dropout=dp(args),
                                      bidirectional=True, num_layers=args.rnn_layers)

    self.self_attentive_decoder = TransformerDecoder(args.dimension,
                                                     args.transformer_heads,
                                                     args.transformer_hidden,
                                                     args.transformer_layers,
                                                     args.dropout_ratio)
    self.dual_ptr_rnn_decoder = DualPtrRNNDecoder(args.dimension, args.dimension,
                                                  dropout=args.dropout_ratio,
                                                  num_layers=args.rnn_layers)

    self.generative_vocab_size = min(len(field.vocab), args.max_generative_vocab)
    self.out = nn.Linear(args.dimension, self.generative_vocab_size)
    self.dropout = nn.Dropout(0.4)
class RnnDocReader(nn.Module):
    """Network for the Document Reader module of DrQA."""
    RNN_TYPES = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}

    def __init__(self, opt, padding_idx=0, embedding=None,
                 normalize_emb=False, embedding_order=True):
        super(RnnDocReader, self).__init__()
        # Store config
        self.opt = opt
        '''
        # Word embeddings
        if opt['pretrained_words']:
            assert embedding is not None
            self.embedding = nn.Embedding(embedding.size(0),
                                          embedding.size(1),
                                          padding_idx=padding_idx)
            if normalize_emb:
                normalize_emb_(embedding)
            self.embedding.weight.data = embedding
            if opt['fix_embeddings']:
                assert opt['tune_partial'] == 0
                for p in self.embedding.parameters():
                    p.requires_grad = False
            elif opt['tune_partial'] > 0:
                assert opt['tune_partial'] + 2 < embedding.size(0)
                fixed_embedding = embedding[opt['tune_partial'] + 2:]
                self.register_buffer('fixed_embedding', fixed_embedding)
                self.fixed_embedding = fixed_embedding
        else:  # random initialized
            self.embedding = nn.Embedding(opt['vocab_size'],
                                          opt['embedding_dim'],
                                          padding_idx=padding_idx)
        '''
        if opt['pos']:
            self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
            if normalize_emb:
                normalize_emb_(self.pos_embedding.weight.data)
        if opt['ner']:
            self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
            if normalize_emb:
                normalize_emb_(self.ner_embedding.weight.data)

        # Projection for attention weighted question
        if opt['use_qemb']:
            self.qemb_match = layers.SeqAttnMatch(3 * opt['embedding_dim'])
        if opt['use_cove']:
            self.cove_embedding = MTLSTM(n_vocab=embedding.size(0),
                                         vectors=embedding.clone(),
                                         residual_embeddings=True)
            if not opt['fine_tune']:
                for p in self.cove_embedding.parameters():
                    p.requires_grad = False

        # Input size to RNN: word emb + question emb + manual features
        doc_input_size = opt['embedding_dim'] + opt['num_features']
        question_input_size = opt['embedding_dim']
        if opt['use_qemb']:
            doc_input_size += opt['embedding_dim']
        if opt['pos']:
            doc_input_size += opt['pos_dim']
        if opt['ner']:
            doc_input_size += opt['ner_dim']
        if opt['use_cove']:  # for CoVe
            doc_input_size += 2 * opt['embedding_dim']
            question_input_size += 2 * opt['embedding_dim']
        print('doc_input_size:', doc_input_size)

        self.attention_rnns = custom.AttentionRNN(
            opt,
            doc_input_size=doc_input_size,
            question_input_size=question_input_size,
            ratio=opt['reduction_ratio'])

        # Output sizes of rnn encoders
        doc_hidden_size = 2 * opt['hidden_size'] + opt['hidden_size'] // opt['reduction_ratio']
        question_hidden_size = 2 * opt['hidden_size'] + opt['hidden_size'] // opt['reduction_ratio']

        # Question merging
        if opt['question_merge'] not in ['avg', 'self_attn']:
            raise NotImplementedError('question_merge = %s' % opt['question_merge'])
        if opt['question_merge'] == 'self_attn':
            self.self_attn = layers.LinearSeqAttn(question_hidden_size)

        # Bilinear attention for span start/end
        self.start_attn = layers.BilinearSeqAttn(
            doc_hidden_size,
            question_hidden_size,
        )
        self.end_attn = layers.BilinearSeqAttn(
            doc_hidden_size,
            question_hidden_size,
        )

    def forward(self, x1, x1_f, x1_pos, x1_ner, x1_mask, x2, x2_mask, x1_order, x2_order):
        """Inputs:
        x1 = document word indices             [batch * len_d]
        x1_f = document word features indices  [batch * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2 = question word indices             [batch * len_q]
        x2_mask = question padding mask        [batch * len_q]
        """
        # Embed both document and question
        #x1_emb = self.embedding(x1)
        if self.opt['use_cove']:
            x1_emb_cove = self.cove_embedding(
                x1, torch.LongTensor(x1.size(0)).fill_(x1.size(1)).cuda())
        #x1_emb_order = self.embedding_order(x1_order)
        #x2_emb = self.embedding(x2)
        if self.opt['use_cove']:
            x2_emb_cove = self.cove_embedding(
                x2, torch.LongTensor(x2.size(0)).fill_(x2.size(1)).cuda())
        #x2_emb += self.embedding_order(x2_order)
        '''
        if self.opt['dropout_emb'] > 0:
            x1_emb = nn.functional.dropout(x1_emb, p=self.opt['dropout_emb'],
                                           training=self.training)
            x2_emb = nn.functional.dropout(x2_emb, p=self.opt['dropout_emb'],
                                           training=self.training)
        x2_emb = torch.cat([x2_emb, x2_emb_cove], dim=2)
        x1_emb = torch.cat([x1_emb, x1_emb_cove], dim=2)
        '''
        x2_emb = x2_emb_cove
        x1_emb = x1_emb_cove

        drnn_input_list = [x1_emb, x1_f]
        # Add attention-weighted question representation
        if self.opt['use_qemb']:
            x2_weighted_emb = self.qemb_match(x1_emb, x2_emb, x2_mask)
            drnn_input_list.append(x2_weighted_emb)
        if self.opt['pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            if self.opt['dropout_emb'] > 0:
                x1_pos_emb = nn.functional.dropout(x1_pos_emb,
                                                   p=self.opt['dropout_emb'],
                                                   training=self.training)
            drnn_input_list.append(x1_pos_emb)
        if self.opt['ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            if self.opt['dropout_emb'] > 0:
                x1_ner_emb = nn.functional.dropout(x1_ner_emb,
                                                   p=self.opt['dropout_emb'],
                                                   training=self.training)
            drnn_input_list.append(x1_ner_emb)

        drnn_input = torch.cat(drnn_input_list, 2)
        #print('drnn_input:', drnn_input.size())

        # Encode document with RNN
        doc_hiddens, question_hiddens = self.attention_rnns(drnn_input, x1_mask, x2_emb, x2_mask)

        if self.opt['question_merge'] == 'avg':
            q_merge_weights = layers.uniform_weights(question_hiddens, x2_mask)
        elif self.opt['question_merge'] == 'self_attn':
            q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_hidden = layers.weighted_avg(question_hiddens, q_merge_weights)

        start_scores = self.start_attn(doc_hiddens, question_hidden, x1_mask)
        end_scores = self.end_attn(doc_hiddens, question_hidden, x1_mask)
        return start_scores, end_scores
class FusionNetReader(nn.Module):
    RNN_TYPES = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}

    def __init__(self, args):
        super(FusionNetReader, self).__init__()
        # Store config
        self.args = args

        # Word embeddings (+1 for padding)
        self.embedding = nn.Embedding(args.vocab_size,
                                      args.embedding_dim,
                                      padding_idx=0)

        if args.use_cove and args.embedding_dim == 300:
            # init cove_encoder without additional embeddings
            self.cove_encoder = MTLSTM()  # 300
            for p in self.cove_encoder.parameters():
                p.requires_grad = False

        if args.use_qemb:
            self.qemb_match = layers.SeqAttnMatch(args.embedding_dim)

        # Input size to RNN: word emb + cove emb + manual features + question emb
        doc_input_size = args.embedding_dim + args.num_features
        question_input_size = args.embedding_dim
        if args.use_cove:
            doc_input_size += 2 * args.cove_embedding_dim
            question_input_size += 2 * args.cove_embedding_dim
        if args.use_qemb:
            doc_input_size += args.embedding_dim

        # Reading component (low-level layer)
        self.reading_low_level_doc_rnn = layers.StackedBRNN(
            input_size=doc_input_size,
            hidden_size=args.hidden_size,
            num_layers=1,
            dropout_rate=args.dropout_rnn,
            dropout_output=args.dropout_rnn_output,
            padding=args.rnn_padding
        )
        self.reading_low_level_question_rnn = layers.StackedBRNN(
            input_size=question_input_size,
            hidden_size=args.hidden_size,
            num_layers=1,
            dropout_rate=args.dropout_rnn,
            dropout_output=args.dropout_rnn_output,
            padding=args.rnn_padding
        )

        # Reading component (high-level layer)
        self.reading_high_level_doc_rnn = layers.StackedBRNN(
            input_size=args.hidden_size * 2,
            hidden_size=args.hidden_size,
            num_layers=1,
            dropout_rate=args.dropout_rnn,
            dropout_output=args.dropout_rnn_output,
            padding=args.rnn_padding
        )
        self.reading_high_level_question_rnn = layers.StackedBRNN(
            input_size=args.hidden_size * 2,
            hidden_size=args.hidden_size,
            num_layers=1,
            dropout_rate=args.dropout_rnn,
            dropout_output=args.dropout_rnn_output,
            padding=args.rnn_padding
        )

        # Question understanding component
        # input: [low_level_question, high_level_question]
        self.understanding_question_rnn = layers.StackedBRNN(
            input_size=args.hidden_size * 4,
            hidden_size=args.hidden_size,
            num_layers=1,
            dropout_rate=args.dropout_rnn,
            dropout_output=args.dropout_rnn_output,
            padding=args.rnn_padding
        )

        # [word_embedding, cove_embedding, low_level_doc_hidden, high_level_doc_hidden]
        history_of_word_size = args.embedding_dim + 2 * args.cove_embedding_dim + 4 * args.hidden_size

        # self.low_level_matrix_attention = MatrixAttention(SymmetricBilinearSimilarity(history_of_word_size,
        #                                                                               args.attention_size, F.relu))
        # self.high_level_matrix_attention = MatrixAttention(SymmetricBilinearSimilarity(history_of_word_size,
        #                                                                                args.attention_size, F.relu))
        # self.understanding_matrix_attention = MatrixAttention(SymmetricBilinearSimilarity(history_of_word_size,
        #                                                                                   args.attention_size, F.relu))
        # self.low_level_matrix_attention = MatrixAttention(BilinearSimilarity(history_of_word_size,
        #                                                                      history_of_word_size))
        # self.high_level_matrix_attention = MatrixAttention(BilinearSimilarity(history_of_word_size,
        #                                                                       history_of_word_size))
        # self.understanding_matrix_attention = MatrixAttention(BilinearSimilarity(history_of_word_size,
        #                                                                          history_of_word_size))
        self.low_level_matrix_attention_layer = layers.SymBilinearAttnMatch(history_of_word_size,
                                                                            args.attention_size)
        self.high_level_matrix_attention_layer = layers.SymBilinearAttnMatch(history_of_word_size,
                                                                             args.attention_size)
        self.understanding_matrix_attention_layer = layers.SymBilinearAttnMatch(history_of_word_size,
                                                                                args.attention_size)

        # Multi-level rnn
        # input: [low_level_doc, high_level_doc, low_level_fusion_doc, high_level_fusion_doc,
        #         understanding_level_question_fusion_doc]
        self.multi_level_rnn = layers.StackedBRNN(
            input_size=args.hidden_size * 2 * 5,
            hidden_size=args.hidden_size,
            num_layers=1,
            padding=args.rnn_padding
        )

        # [word_embedding, cove_embedding, low_level_doc_hidden, high_level_doc_hidden, low_level_doc_question_vector,
        #  high_level_doc_question_vector, understanding_doc_question_vector, fa_multi_level_doc_hidden]
        history_of_doc_word_size = history_of_word_size + 4 * 2 * args.hidden_size

        # self.self_boosted_matrix_attention = MatrixAttention(SymmetricBilinearSimilarity(history_of_doc_word_size,
        #                                                                                  args.attention_size, F.relu))
        self.self_boosted_matrix_attention_layer = layers.SymBilinearAttnMatch(history_of_doc_word_size,
                                                                               args.attention_size)
        # self.self_boosted_matrix_attention = MatrixAttention(BilinearSimilarity(history_of_doc_word_size,
        #                                                                         history_of_doc_word_size))

        # Fully-Aware Self-Boosted fusion rnn
        # input: [fully_aware_encoded_doc(hidden state from last layer), self_boosted_fusion_doc]
        self.understanding_doc_rnn = layers.StackedBRNN(
            input_size=args.hidden_size * 2 * 2,
            hidden_size=args.hidden_size,
            num_layers=1,
            padding=args.rnn_padding
        )

        # Output sizes of rnn
        doc_hidden_size = 2 * args.hidden_size
        question_hidden_size = 2 * args.hidden_size
        if args.concat_rnn_layers:
            doc_hidden_size *= args.doc_layers
            question_hidden_size *= args.question_layers

        # Question merging
        self.question_self_attn = layers.LinearSeqAttn(question_hidden_size)
        self.start_attn = layers.BilinearSeqAttn(doc_hidden_size,
                                                 question_hidden_size,
                                                 log_normalize=False)
        self.start_gru = nn.GRU(doc_hidden_size, args.hidden_size * 2, batch_first=True)
        self.end_attn = layers.BilinearSeqAttn(doc_hidden_size,
                                               question_hidden_size,
                                               log_normalize=False)

    def forward(self, x1, x1_f, x1_mask, x2, x2_mask):
        """Inputs:
        x1 = document word indices              [batch * len_d]
        x1_mask = document padding mask         [batch * len_d]
        x1_f = document word features indices   [batch * len_d * nfeat]
        x2 = question word indices              [batch * len_q]
        x2_mask = question padding mask         [batch * len_q]
        """
        # Embed both document and question
        x1_word_emb = self.embedding(x1)  # [batch, len_d, embedding_dim]
        x2_word_emb = self.embedding(x2)  # [batch, len_q, embedding_dim]

        x1_lengths = x1_mask.data.eq(0).long().sum(1).squeeze()  # batch
        x2_lengths = x2_mask.data.eq(0).long().sum(1).squeeze()  # batch
        x1_cove_emb = self.cove_encoder(x1_word_emb, x1_lengths)
        x2_cove_emb = self.cove_encoder(x2_word_emb, x2_lengths)

        x1_emb = torch.cat([x1_word_emb, x1_cove_emb], dim=-1)
        x2_emb = torch.cat([x2_word_emb, x2_cove_emb], dim=-1)

        # Dropout on embeddings
        if self.args.dropout_emb > 0:
            x1_emb = nn.functional.dropout(x1_emb, p=self.args.dropout_emb,
                                           training=self.training)
            x2_emb = nn.functional.dropout(x2_emb, p=self.args.dropout_emb,
                                           training=self.training)

        # Form document encoding inputs
        drnn_input = [x1_emb]
        # Add attention-weighted question representation
        if self.args.use_qemb:
            x2_weighted_emb = self.qemb_match(x1_word_emb, x2_word_emb, x2_mask)  # batch * len_d
            drnn_input.append(x2_weighted_emb)
        # Add manual features
        if self.args.num_features > 0:
            drnn_input.append(x1_f)

        # Encode document with RNN, shape: [batch, len_d, 2*hidden_size]
        low_level_doc_hiddens = self.reading_low_level_doc_rnn(torch.cat(drnn_input, 2), x1_mask)
        low_level_question_hiddens = self.reading_low_level_question_rnn(x2_emb, x2_mask)

        # Encode question with RNN, shape: [batch, len_q, 2*hidden_size]
        high_level_doc_hiddens = self.reading_high_level_doc_rnn(low_level_doc_hiddens, x1_mask)
        high_level_question_hiddens = self.reading_high_level_question_rnn(low_level_question_hiddens, x2_mask)

        # Encode low_level_question_hiddens and high_level_question_hiddens, shape: [batch, len_q, 2*hidden_size]
        understanding_question_hiddens = self.understanding_question_rnn(
            torch.cat([low_level_question_hiddens, high_level_question_hiddens], 2), x2_mask)

        # history of word, shape: [batch, len_d, history_of_word_size]
        history_of_doc_word = torch.cat([x1_word_emb, x1_cove_emb,
                                         low_level_doc_hiddens, high_level_doc_hiddens], dim=2)
        # history of word, shape: [batch, len_q, history_of_word_size]
        history_of_question_word = torch.cat([x2_word_emb, x2_cove_emb,
                                              low_level_question_hiddens,
                                              low_level_question_hiddens], dim=2)
        # # high_level_doc_hiddens

        # # fully-aware multi-level attention
        # low_level_similarity = self.low_level_matrix_attention(history_of_doc_word, history_of_question_word)
        # high_level_similarity = self.high_level_matrix_attention(history_of_doc_word, history_of_question_word)
        # understanding_similarity = self.understanding_matrix_attention(history_of_doc_word, history_of_question_word)
        #
        # # shape: [batch, len_d, len_q]
        # low_level_norm_sim = util.last_dim_softmax(low_level_similarity, x2_mask)
        # high_level_norm_sim = util.last_dim_softmax(high_level_similarity, x2_mask)
        # understanding_norm_sim = util.last_dim_softmax(understanding_similarity, x2_mask)
        #
        # # shape: [batch, len_d, 2*hidden_size]
        # low_level_doc_question_vectors = util.weighted_sum(low_level_question_hiddens, low_level_norm_sim)
        # high_level_doc_question_vectors = util.weighted_sum(high_level_question_hiddens, high_level_norm_sim)
        # understanding_doc_question_vectors = util.weighted_sum(understanding_question_hiddens, understanding_norm_sim)
        low_level_doc_question_vectors = self.low_level_matrix_attention_layer(
            history_of_doc_word, history_of_question_word, x2_mask, low_level_question_hiddens)
        high_level_doc_question_vectors = self.high_level_matrix_attention_layer(
            history_of_doc_word, history_of_question_word, x2_mask, high_level_question_hiddens)
        understanding_doc_question_vectors = self.understanding_matrix_attention_layer(
            history_of_doc_word, history_of_question_word, x2_mask, understanding_question_hiddens)

        # Encode multi-level hiddens and vectors
        fa_multi_level_doc_hiddens = self.multi_level_rnn(
            torch.cat([low_level_doc_hiddens, high_level_doc_hiddens,
                       low_level_doc_question_vectors, high_level_doc_question_vectors,
                       understanding_doc_question_vectors], dim=2), x1_mask)
        # fa_multi_level_doc_hiddens = low_level_doc_question_vectors

        history_of_doc_word2 = torch.cat([x1_word_emb, x1_cove_emb, low_level_doc_hiddens,
                                          high_level_doc_hiddens, low_level_doc_question_vectors,
                                          high_level_doc_question_vectors,
                                          understanding_doc_question_vectors,
                                          fa_multi_level_doc_hiddens], dim=2)

        # # shape: [batch, len_d, len_d]
        # self_boosted_similarity = self.self_boosted_matrix_attention(history_of_doc_word2, history_of_doc_word2)
        #
        # # shape: [batch, len_d, len_d]
        # self_boosted_norm_sim = util.last_dim_softmax(self_boosted_similarity, x1_mask)
        #
        # # shape: [batch, len_d, 2*hidden_size]
        # self_boosted_vectors = util.weighted_sum(fa_multi_level_doc_hiddens, self_boosted_norm_sim)
        self_boosted_vectors = self.self_boosted_matrix_attention_layer(
            history_of_doc_word2, history_of_doc_word2, x1_mask, fa_multi_level_doc_hiddens)

        # Encode vectors and hiddens, shape: [batch, len_d, 2*hidden_size]
        understanding_doc_hiddens = self.understanding_doc_rnn(
            torch.cat([fa_multi_level_doc_hiddens, self_boosted_vectors], dim=2), x1_mask)
        # understanding_doc_hiddens = fa_multi_level_doc_hiddens

        # shape: [batch, len_q]
        q_merge_weights = self.question_self_attn(understanding_question_hiddens, x2_mask)
        # shape: [batch, 2*hidden_size]
        question_hidden = layers.weighted_avg(understanding_question_hiddens, q_merge_weights)

        # Predict start and end positions
        # shape: [batch, len_d]  SOFTMAX NOT LOG_SOFTMAX
        start_scores = self.start_attn(understanding_doc_hiddens, question_hidden, x1_mask)

        # shape: [batch, 2*hidden_size]
        gru_input = layers.weighted_avg(understanding_doc_hiddens, start_scores)
        # shape: [batch, 1, 2*hidden_size]
        gru_input = gru_input.unsqueeze(1)
        # shape: [1, batch, 2*hidden_size]
        question_hidden = question_hidden.unsqueeze(0)
        _, memory_hidden = self.start_gru(gru_input, question_hidden)
        # shape: [batch, 2*hidden_size]
        memory_hidden = memory_hidden.squeeze(0)

        # shape: [batch, len_d]
        end_scores = self.end_attn(understanding_doc_hiddens, memory_hidden, x1_mask)

        # log start_scores
        if self.training:
            start_scores = torch.log(start_scores.add(1e-8))
            end_scores = torch.log(end_scores.add(1e-8))
        return start_scores, end_scores
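Because the forward pass above returns log-probabilities in training mode, a natural pairing is a negative log-likelihood span loss. A minimal training-step sketch (illustrative only; the optimizer and batch unpacking are assumptions, not part of the original reader):

import torch.nn.functional as F

def train_step(model, optimizer, batch):
    # batch is assumed to unpack into the forward() arguments plus gold span indices
    x1, x1_f, x1_mask, x2, x2_mask, start_targets, end_targets = batch
    model.train()
    start_scores, end_scores = model(x1, x1_f, x1_mask, x2, x2_mask)
    # scores are log-probabilities in training mode, so NLL loss applies directly
    loss = F.nll_loss(start_scores, start_targets) + F.nll_loss(end_scores, end_targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()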
parser.add_argument('--data', default='.data', help='where to store data')
parser.add_argument('--embeddings', default='.embeddings', help='where to store embeddings')
args = parser.parse_args()

inputs = data.Field(lower=True, include_lengths=True, batch_first=True)

print('Generating train, dev, test splits')
train, dev, test = datasets.IWSLT.splits(root=args.data, exts=['.en', '.de'], fields=[inputs, inputs])
train_iter, dev_iter, test_iter = data.Iterator.splits(
    (train, dev, test), batch_size=100,
    device=torch.device(args.device) if args.device >= 0 else None)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=args.embeddings))

outputs_last_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors,
                                 model_cache=args.embeddings)
outputs_both_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors,
                                 layer0=True,
                                 model_cache=args.embeddings)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(inputs.vocab),
                                            vectors=inputs.vocab.vectors,
                                            layer0=True,
                                            residual_embeddings=True,
                                            model_cache=args.embeddings)
if args.device >= 0:
    outputs_last_layer_cove.cuda()
    outputs_both_layer_cove.cuda()
    outputs_both_layer_cove_with_glove.cuda()

train_iter.init_epoch()
print('Generating CoVe')
for batch_idx, batch in enumerate(train_iter):
    if batch_idx > 0:
        break
    last_layer_cove = outputs_last_layer_cove(*batch.src)
    print(last_layer_cove.size())
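To see how the three configurations differ, a small comparison sketch (added for illustration, not part of the original script) can run the same batch through all of them; with 300-d GloVe the last dimensions are expected to come out around 600, 1200, and 1500 respectively.

batch = next(iter(train_iter))
with torch.no_grad():
    last = outputs_last_layer_cove(*batch.src)                   # top LSTM layer only
    both = outputs_both_layer_cove(*batch.src)                   # both LSTM layers
    both_glove = outputs_both_layer_cove_with_glove(*batch.src)  # GloVe + both layers
print(last.size(-1), both.size(-1), both_glove.size(-1))         # e.g. 600, 1200, 1500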
class RnnDocReader(nn.Module):
    """Network for the Document Reader module of DrQA."""
    RNN_TYPES = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}

    def __init__(self, opt, padding_idx=0, embedding=None,
                 normalize_emb=False, embedding_order=True):
        super(RnnDocReader, self).__init__()
        # Store config
        self.opt = opt

        # Word embeddings
        if opt['pretrained_words']:
            assert embedding is not None
            self.embedding = nn.Embedding(embedding.size(0),
                                          embedding.size(1),
                                          padding_idx=padding_idx)
            if normalize_emb:
                normalize_emb_(embedding)
            self.embedding.weight.data = embedding
            if opt['fix_embeddings']:
                assert opt['tune_partial'] == 0
                for p in self.embedding.parameters():
                    p.requires_grad = False
            elif opt['tune_partial'] > 0:
                assert opt['tune_partial'] + 2 < embedding.size(0)
                fixed_embedding = embedding[opt['tune_partial'] + 2:]
                self.register_buffer('fixed_embedding', fixed_embedding)
                self.fixed_embedding = fixed_embedding
        else:  # random initialized
            self.embedding = nn.Embedding(opt['vocab_size'],
                                          opt['embedding_dim'],
                                          padding_idx=padding_idx)

        if opt['pos']:
            self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
            if normalize_emb:
                normalize_emb_(self.pos_embedding.weight.data)
        if opt['ner']:
            self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
            if normalize_emb:
                normalize_emb_(self.ner_embedding.weight.data)

        # Projection for attention weighted question
        if opt['use_qemb']:
            self.qemb_match = layers.SeqAttnMatch(3 * opt['embedding_dim'])
        if opt['use_cove']:
            self.cove_embedding = MTLSTM(n_vocab=embedding.size(0),
                                         vectors=embedding.clone())
            if not opt['fine_tune']:
                for p in self.cove_embedding.parameters():
                    p.requires_grad = False

        # Input size to RNN: word emb + question emb + manual features
        doc_input_size = opt['embedding_dim'] + opt['num_features']
        question_input_size = opt['embedding_dim']
        if opt['use_qemb']:
            doc_input_size += opt['embedding_dim']
        if opt['pos']:
            doc_input_size += opt['pos_dim']
        if opt['ner']:
            doc_input_size += opt['ner_dim']
        if opt['use_cove']:  # for CoVe
            doc_input_size += 2 * opt['embedding_dim']
            question_input_size += 2 * opt['embedding_dim']
        print('doc_input_size:', doc_input_size)

        self.attention_rnns = custom.AttentionRNN(
            opt,
            doc_input_size=doc_input_size,
            question_input_size=question_input_size,
            ratio=opt['reduction_ratio'])

        # Output sizes of rnn encoders
        doc_hidden_size = 2 * opt['hidden_size'] + opt['hidden_size'] // opt['reduction_ratio']
        question_hidden_size = 2 * opt['hidden_size'] + opt['hidden_size'] // opt['reduction_ratio']

        # Question merging
        if opt['question_merge'] not in ['avg', 'self_attn']:
            raise NotImplementedError('question_merge = %s' % opt['question_merge'])
        if opt['question_merge'] == 'self_attn':
            self.self_attn = layers.LinearSeqAttn(question_hidden_size)

        # Bilinear attention for span start/end
        self.start_attn = layers.BilinearSeqAttn(
            doc_hidden_size,
            question_hidden_size,
        )
        self.end_attn = layers.BilinearSeqAttn(
            doc_hidden_size,
            question_hidden_size,
        )

    def forward(self, x1, x1_f, x1_pos, x1_ner, x1_mask, x2, x2_mask, x1_order, x2_order):
        """Inputs:
        x1 = document word indices             [batch * len_d]
        x1_f = document word features indices  [batch * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2 = question word indices             [batch * len_q]
        x2_mask = question padding mask        [batch * len_q]
        """
        # Embed both document and question
        x1_emb = self.embedding(x1)
        if self.opt['use_cove']:
            x1_emb_cove = self.cove_embedding(
                x1, torch.LongTensor(x1.size(0)).fill_(x1.size(1)).cuda())
        #x1_emb_order = self.embedding_order(x1_order)
        x2_emb = self.embedding(x2)
        if self.opt['use_cove']:
            x2_emb_cove = self.cove_embedding(
                x2, torch.LongTensor(x2.size(0)).fill_(x2.size(1)).cuda())
        #x2_emb += self.embedding_order(x2_order)

        if self.opt['dropout_emb'] > 0:
            x1_emb = nn.functional.dropout(x1_emb, p=self.opt['dropout_emb'],
                                           training=self.training)
            x2_emb = nn.functional.dropout(x2_emb, p=self.opt['dropout_emb'],
                                           training=self.training)

        x2_emb = torch.cat([x2_emb, x2_emb_cove], dim=2)
        x1_emb = torch.cat([x1_emb, x1_emb_cove], dim=2)

        drnn_input_list = [x1_emb, x1_f]
        # Add attention-weighted question representation
        if self.opt['use_qemb']:
            x2_weighted_emb = self.qemb_match(x1_emb, x2_emb, x2_mask)
            drnn_input_list.append(x2_weighted_emb)
        if self.opt['pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            if self.opt['dropout_emb'] > 0:
                x1_pos_emb = nn.functional.dropout(x1_pos_emb,
                                                   p=self.opt['dropout_emb'],
                                                   training=self.training)
            drnn_input_list.append(x1_pos_emb)
        if self.opt['ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            if self.opt['dropout_emb'] > 0:
                x1_ner_emb = nn.functional.dropout(x1_ner_emb,
                                                   p=self.opt['dropout_emb'],
                                                   training=self.training)
            drnn_input_list.append(x1_ner_emb)

        drnn_input = torch.cat(drnn_input_list, 2)
        #print('drnn_input:', drnn_input.size())

        # Encode document with RNN
        doc_hiddens, question_hiddens = self.attention_rnns(drnn_input, x1_mask, x2_emb, x2_mask)

        if self.opt['question_merge'] == 'avg':
            q_merge_weights = layers.uniform_weights(question_hiddens, x2_mask)
        elif self.opt['question_merge'] == 'self_attn':
            q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_hidden = layers.weighted_avg(question_hiddens, q_merge_weights)

        start_scores = self.start_attn(doc_hiddens, question_hidden, x1_mask)
        end_scores = self.end_attn(doc_hiddens, question_hidden, x1_mask)
        return start_scores, end_scores
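For completeness, a minimal span-decoding sketch (an illustration, not part of the original reader) that turns the start/end scores into a predicted answer span. It assumes the scores can be combined additively (as with log-probabilities) and enforces a maximum span length.

import torch

def decode_span(start_scores, end_scores, max_len=15):
    # start_scores, end_scores: [batch, len_d]
    batch, length = start_scores.size()
    scores = start_scores.unsqueeze(2) + end_scores.unsqueeze(1)  # [batch, len_d, len_d]
    # only allow spans with start <= end and (end - start) < max_len
    idx = torch.arange(length)
    valid = (idx.unsqueeze(1) <= idx.unsqueeze(0)) & (idx.unsqueeze(0) - idx.unsqueeze(1) < max_len)
    scores = scores.masked_fill(~valid.to(scores.device), float('-inf'))
    best = scores.view(batch, -1).argmax(dim=1)
    return best // length, best % length  # predicted start and end indices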
# Install CoVe
!python setup.py develop

import torch
from torchtext.vocab import GloVe
from cove import MTLSTM

# GloVe vocabulary: roughly 2.1 * 10^6 words, each with a 300-dimensional vector
glove = GloVe(name='840B', dim=300, cache='.embeddings')

# Two input sentences, given as the vocabulary index of each word
inputs = torch.LongTensor([[10, 2, 3, 0], [7, 8, 10, 3]]); inputs
# The two sentences have lengths 3 and 4
lengths = torch.LongTensor([3, 4]); lengths

# The CoVe encoder
cove = MTLSTM(n_vocab=glove.vectors.shape[0],
              vectors=glove.vectors,
              model_cache='.embeddings')

# CoVe encoding of each word in each sentence, with shape 2 x 4 x 600
outputs = cove(inputs, lengths); outputs
outputs.shape
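A small sanity check (added for illustration) that the output matches the shape noted in the comment above: a batch of 2 sentences, padded to length 4, with 600-dimensional CoVe features per token.

assert outputs.shape == (2, 4, 600)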
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``.
    label : torch.LongTensor, optional (default = None)
        A variable representing the label for each instance in the batch.

    Returns
    -------
    An output dictionary consisting of:
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_classes)`` representing a
        distribution over the label classes for each instance.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    """
    text_mask = util.get_text_field_mask(tokens).float()
    # Pop elmo tokens, since elmo embedder should not be present.
    elmo_tokens = tokens.pop("elmo", None)
    if tokens:
        embedded_text = self._text_field_embedder(tokens)
    else:
        # only using "elmo" for input
        embedded_text = None

    # Add the "elmo" key back to "tokens" if not None, since the tests and the
    # subsequent training epochs rely on it not being modified during forward()
    if elmo_tokens is not None:
        tokens["elmo"] = elmo_tokens

    # Create ELMo embeddings if applicable
    if self._elmo:
        if elmo_tokens is not None:
            elmo_representations = self._elmo(elmo_tokens)["elmo_representations"]
            # Pop from the end is more performant with list
            if self._use_integrator_output_elmo:
                integrator_output_elmo = elmo_representations.pop()
            if self._use_input_elmo:
                input_elmo = elmo_representations.pop()
            assert not elmo_representations
        else:
            raise ConfigurationError(
                "Model was built to use Elmo, but input text is not tokenized for Elmo.")

    if self._use_input_elmo:
        if embedded_text is not None:
            embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
        else:
            embedded_text = input_elmo

    # While using embeddings from the mt-cnn encoder, the hardcoded values for
    # vocab_size can be initialised appropriately
    if cnn:
        embedded_text_cnn = embedded_text
        enc = Encoder(7855, 300, 600, 5, 3, 0.25, 'cuda')
        dec = Decoder(5893, 300, 600, 5, 3, 0.25, 1, 'cuda')
        cnn_model = Seq2Seq(enc, dec).cuda()
        cnn_model.load_state_dict(torch.load('../cnn_lstm_model.pt'))
        cnn_model.eval()
        v1, v2 = cnn_model.encoder(embedded_text[:, :, :256])
        v3 = torch.cat((v1, v2), 2)
        embedded_text = torch.cat((embedded_text_cnn, v3), 2)
    # While using embeddings from the mt-lstm encoder (either load from the saved
    # model from the paper or the reproduced model)
    elif lstm:
        outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=None,
                                                    vectors=None,
                                                    layer0=True,
                                                    residual_embeddings=True)
        outputs_both_layer_cove_with_glove.cuda()
        embedded_text = outputs_both_layer_cove_with_glove(
            embedded_text, [embedded_text.shape[1]] * embedded_text.shape[0])

    dropped_embedded_text = self._embedding_dropout(embedded_text)
    pre_encoded_text = self._pre_encode_feedforward(dropped_embedded_text)
    encoded_tokens = self._encoder(pre_encoded_text, text_mask)

    # Compute biattention. This is a special case since the inputs are the same.
    attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous())
    attention_weights = util.masked_softmax(attention_logits, text_mask)
    encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

    # Build the input to the integrator
    integrator_input = torch.cat([encoded_tokens,
                                  encoded_tokens - encoded_text,
                                  encoded_tokens * encoded_text], 2)
    integrated_encodings = self._integrator(integrator_input, text_mask)

    # Concatenate ELMo representations to integrated_encodings if specified
    if self._use_integrator_output_elmo:
        integrated_encodings = torch.cat([integrated_encodings,
                                          integrator_output_elmo], dim=-1)

    # Simple Pooling layers
    max_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings, text_mask.unsqueeze(2), -1e7)
    max_pool = torch.max(max_masked_integrated_encodings, 1)[0]
    min_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings, text_mask.unsqueeze(2), +1e7)
    min_pool = torch.min(min_masked_integrated_encodings, 1)[0]
    mean_pool = torch.sum(integrated_encodings, 1) / torch.sum(text_mask, 1, keepdim=True)

    # Self-attentive pooling layer
    # Run through linear projection. Shape: (batch_size, sequence length, 1)
    # Then remove the last dimension to get the proper attention shape (batch_size, sequence length).
    self_attentive_logits = self._self_attentive_pooling_projection(
        integrated_encodings).squeeze(2)
    self_weights = util.masked_softmax(self_attentive_logits, text_mask)
    self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

    pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], 1)
    pooled_representations_dropped = self._integrator_dropout(pooled_representations)

    logits = self._output_layer(pooled_representations_dropped)
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {'logits': logits, 'class_probabilities': class_probabilities}
    if label is not None:
        loss = self.loss(logits, label)
        for metric in self.metrics.values():
            metric(logits, label)
        output_dict["loss"] = loss

    return output_dict
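A short usage sketch (illustrative only; the model and batch variables are assumptions) for reading predictions out of the dictionary returned by the forward pass above:

output_dict = model(tokens_batch)  # tokens_batch as produced by the data iterator
predicted_labels = output_dict['class_probabilities'].argmax(dim=-1)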