def predict(self, context_tokens, answer_features, max_length, pad=False):
    """Greedily decode a question for the given context and answer features,
    returning the detokenized sentence and the stacked prediction tensor."""
    input_token = variable.Variable(
        torch.LongTensor([[self.vocab.start_index]])).cuda()
    end_token = torch.LongTensor([[self.vocab.end_index]]).cuda()
    context_tokens = variable.Variable(
        torch.LongTensor(context_tokens)).cuda()
    answer_features = variable.Variable(
        torch.from_numpy(answer_features)).cuda()
    predictions = self.model.predict(input_token=input_token,
                                     context_tokens=context_tokens,
                                     end_token=end_token,
                                     answer_features=answer_features,
                                     max_length=max_length)
    if pad:
        # Pad the prediction list out to max_length so batches stack evenly.
        pad_token = variable.Variable(
            torch.LongTensor([self.vocab.pad_index])).cuda()
        while len(predictions) < max_length:
            predictions.append(pad_token)
    stacked_predictions = torch.stack(predictions, 0)
    tokens = self.get_tokens_single(stacked_predictions.cpu())
    sentence = " ".join(tokens)
    return sentence, stacked_predictions
def forward(self, inputs):
    batch_size = inputs.size(1)
    # Bidirectional LSTM: 2 * num_layers directions, each with half the
    # configured hidden size, so the concatenated state is hidden_size wide.
    state_shape = (self.config['num_layers'] * 2, batch_size,
                   self.config['hidden_size'] // 2)
    h0 = variable.Variable(inputs.data.new(*state_shape).zero_(),
                           requires_grad=False).cuda()
    c0 = variable.Variable(inputs.data.new(*state_shape).zero_(),
                           requires_grad=False).cuda()
    outputs, (ht, ct) = self.rnn(inputs, (h0, c0))
    # ht[-2:] holds the last layer's forward/backward final states; reshape
    # them into a single (batch_size, hidden_size) matrix.
    return outputs, ht[-2:].transpose(0, 1).contiguous().view(batch_size, -1)
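# A standalone shape check for the bidirectional encoder above (a sketch, not
# part of the model): the LSTM is assumed to be built with bidirectional=True
# and hidden_size // 2 units per direction, so ht has 2 * num_layers entries
# and ht[-2:] is the last layer's forward/backward final states.
import torch
from torch import nn
from torch.autograd import variable

seq_len, batch_size, input_size, hidden_size = 7, 4, 8, 50
rnn = nn.LSTM(input_size, hidden_size // 2, num_layers=1, bidirectional=True)
inputs = variable.Variable(torch.randn(seq_len, batch_size, input_size))
h0 = variable.Variable(torch.zeros(2, batch_size, hidden_size // 2))
c0 = variable.Variable(torch.zeros(2, batch_size, hidden_size // 2))
outputs, (ht, ct) = rnn(inputs, (h0, c0))
final = ht[-2:].transpose(0, 1).contiguous().view(batch_size, -1)
print(outputs.size())  # (7, 4, 50): forward/backward outputs per step
print(final.size())    # (4, 50): concatenated final states, one row per example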
def combine_predictions(self, context_tokens, predictor_probs, attentions,
                        language_probs):
    """Mix the copy (attention) distribution and the language-model
    distribution in log space, weighted by the gate log-probabilities."""
    max_attention_length = attentions.size(2)
    pad_size = self.config['vocab_size'] - max_attention_length
    batch_size = attentions.size(1)
    seq_size = attentions.size(0)
    # Pad context ids with 0 and scores with -1e10 ("log zero") so every row
    # scatters exactly vocab_size entries.
    context_tokens_padding = variable.Variable(
        torch.LongTensor(batch_size, pad_size).zero_(),
        requires_grad=False).cuda()
    attentions_padding = variable.Variable(
        torch.zeros(batch_size, pad_size) - 1e10,
        requires_grad=False).cuda()
    stacked_context_tokens = torch.cat(
        (context_tokens, context_tokens_padding), 1)

    # Gate log-probabilities for the language model and the copy mechanism.
    softmax_probs = predictor_probs[:, :, 0]
    text_field_probs = predictor_probs[:, :, 1]
    expanded_softmax_probs = softmax_probs.unsqueeze(2).expand(
        seq_size, batch_size, self.config['vocab_size'])
    expanded_text_field_probs = text_field_probs.unsqueeze(2).expand(
        seq_size, batch_size, max_attention_length)

    total_attention_results = []
    for i in range(seq_size):
        selected_attention = (attentions[i, :, :] +
                              expanded_text_field_probs[i, :, :])
        stacked_attentions = torch.cat(
            (selected_attention, attentions_padding), 1)
        # Scatter the attention scores over their context token ids into a
        # vocab-sized vector initialized to "log zero".
        attention_results = variable.Variable(
            torch.zeros(batch_size, self.config['vocab_size']) - 1e10).cuda()
        attention_results.scatter_(1, stacked_context_tokens,
                                   stacked_attentions)
        total_attention_results.append(attention_results)
    concated_attention_results = torch.stack(total_attention_results, 0)
    final_probs = torch.log(
        torch.exp(concated_attention_results) +
        torch.exp(language_probs + expanded_softmax_probs))
    return final_probs
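# A minimal sketch of the scatter trick used in combine_predictions: copy
# attention scores over their context token ids into a vocab-sized vector,
# with -1e10 standing in for "log zero" everywhere else. Values are made up;
# the real code pads the ids with 0 and the scores with -1e10 so every row
# has exactly vocab_size entries to scatter.
import torch

vocab_size = 10
context_ids = torch.LongTensor([[3, 5, 7]])             # one batch row
attn_scores = torch.FloatTensor([[-0.1, -0.7, -2.3]])   # log-space scores

vocab_scores = torch.zeros(1, vocab_size) - 1e10
vocab_scores.scatter_(1, context_ids, attn_scores)
print(vocab_scores)  # -1e10 everywhere except indices 3, 5, 7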
def predict(self, input_token, context_tokens, end_token, answer_features,
            max_length=20, min_length=3):
    """
    input_token: Input token to start with
    context_tokens: Context tokens to use

    Do greedy decoding using the input token and context tokens.
    """
    predicted_tokens = []
    batch_first_context_tokens = context_tokens.transpose(0, 1)
    context_embeddings = self.text_field_predictor.forward_prepro(
        context_tokens, input_masks=None, answer_features=answer_features)
    state_shape = (1, self.config['hidden_size'])
    h0 = c0 = variable.Variable(
        context_embeddings.data.new(*state_shape).zero_())
    cur_states = (h0, c0)

    def step(input_token, states):
        # One decode step: embed the token, advance the LSTM, and combine
        # the copy and language-model distributions for this position.
        cur_input_embedding = self.embedder(input_token)
        hidden_states, new_states = self.base_lstm.forward(
            cur_input_embedding, states, context_embeddings)
        reshaped_hidden_states = hidden_states.view(
            -1, hidden_states.size(-1))
        predictor_probs = self.combiner(reshaped_hidden_states)
        language_probs = self.softmax_predictor(reshaped_hidden_states)
        reshaped_language_probs = language_probs.view(
            -1, language_probs.size(-1))
        _, attentions, inputs = self.text_field_predictor.forward_similarity(
            hidden_states)
        combined_predictions = self.combine_predictions_single(
            context_tokens=batch_first_context_tokens,
            predictor_probs=predictor_probs,
            attentions=attentions,
            language_probs=reshaped_language_probs)
        loss, token = torch.max(combined_predictions, 1)
        return loss, token, new_states

    # Decode greedily until the end token appears (after at least min_length
    # tokens) or max_length is reached.
    loss, new_token, new_states = step(input_token, cur_states)
    while (not torch_utils.to_bool(new_token.data == end_token)
           or len(predicted_tokens) < min_length) \
            and len(predicted_tokens) < max_length:
        predicted_tokens.append(new_token)
        loss, new_token, new_states = step(new_token, new_states)
    return predicted_tokens
def combine_predictions_single(self, context_tokens, predictor_probs,
                               attentions, language_probs):
    """Single-step variant used during greedy decoding: rather than mixing
    the two distributions, hard-select whichever predictor the gate favors."""
    max_attention_length = attentions.size(1)
    pad_size = self.config['vocab_size'] - max_attention_length
    batch_size = attentions.size(0)
    context_tokens_padding = variable.Variable(
        torch.LongTensor(batch_size, pad_size).zero_()).cuda()
    attentions_padding = variable.Variable(
        torch.zeros(batch_size, pad_size)).cuda() - 1e10
    stacked_context_tokens = torch.cat(
        (context_tokens, context_tokens_padding), 1)
    softmax_probs = predictor_probs[:, 0]
    text_field_probs = predictor_probs[:, 1]
    # Scatter the attention scores over their context token ids into a
    # vocab-sized vector initialized to "log zero".
    stacked_attentions = torch.cat((attentions, attentions_padding), 1)
    attention_results = variable.Variable(
        torch.zeros(batch_size, self.config['vocab_size'])).cuda() - 1e10
    attention_results.scatter_(1, stacked_context_tokens,
                               stacked_attentions)
    # Hard switch: return the language-model distribution if its gate
    # probability beats the copy gate, otherwise the copy distribution.
    use_softmax_predictor = softmax_probs > text_field_probs
    if torch_utils.to_bool(use_softmax_predictor.data):
        return language_probs
    return attention_results
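# A toy illustration (made-up values) of the two combination rules: the
# batched combine_predictions mixes the distributions in log space, while the
# single-step variant above hard-selects whichever gate is larger.
import torch

gate = torch.FloatTensor([-0.2, -1.7])          # log P(softmax), log P(copy)
lm = torch.FloatTensor([-1.0, -2.0, -3.0])      # language-model log-probs
copy = torch.FloatTensor([-1e10, -0.5, -1e10])  # scattered attention log-probs

soft = torch.log(torch.exp(lm + gate[0]) + torch.exp(copy + gate[1]))
hard = lm if gate[0] > gate[1] else copy
print(soft)  # smooth mixture of both distributions
print(hard)  # the language-model branch, since gate[0] > gate[1]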
def get_index_select(masks):
    """Get an index-select tensor from a batch of masks: the flat (row-major)
    positions whose mask value is 0, i.e. the unmasked entries."""
    num_rows = masks.size(0)
    num_cols = masks.size(1)
    new_tensor = []
    for i in range(num_rows):
        for j in range(num_cols):
            if to_bool(masks[i][j].data.cpu() == torch.LongTensor([0])):
                new_tensor.append(i * num_cols + j)
    indices = torch.from_numpy(np.array(new_tensor)).long()
    flattened_indices = variable.Variable(indices)
    return flattened_indices
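# The double loop above is equivalent to a vectorized lookup of the flat
# positions whose mask is 0; a sketch of the same selection with plain
# tensor ops:
import torch

masks = torch.LongTensor([[0, 0, 1],
                          [0, 1, 1]])
flat = (masks.view(-1) == 0).nonzero().view(-1)
print(flat)  # 0, 1, 3 -- row-major indices of the unmasked positions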
def forward(self, inputs, contexts, answer_features):
    context_embeddings = self.text_field_predictor.forward_prepro(
        contexts, input_masks=None, answer_features=answer_features)
    input_embeddings = self.embeddings(inputs)
    batch_size = inputs.size(1)
    state_shape = (batch_size, self.config['hidden_size'])
    h0 = c0 = variable.Variable(
        input_embeddings.data.new(*state_shape).zero_()).cuda()
    cur_states = (h0, c0)
    out, hidden = self.base_lstm.forward(input_embeddings, cur_states,
                                         context_embeddings)
    h = hidden[0]
    pred = torch.squeeze(self.predictor(h))
    return pred
def forward(self, input_tokens, context_tokens, context_masks,
            answer_features):
    self.batch_first_context_tokens = context_tokens.transpose(0, 1)
    self.context_embeddings = self.text_field_predictor.forward_prepro(
        context_tokens, context_masks, answer_features)
    self.input_embeddings = self.embedder(input_tokens)
    batch_size = input_tokens.size(1)
    token_length = input_tokens.size(0)
    state_shape = (batch_size, self.config['hidden_size'])
    h0 = c0 = variable.Variable(
        self.input_embeddings.data.new(*state_shape).zero_(),
        requires_grad=False)
    hidden_states, res = self.base_lstm.forward(
        self.input_embeddings, (h0, c0), self.context_embeddings)
    reshaped_hidden_states = hidden_states.view(
        batch_size * token_length, -1)
    predictor_probs = self.combiner(reshaped_hidden_states)
    reshaped_predictor_probs = predictor_probs.view(
        token_length, batch_size, predictor_probs.size(-1))
    language_probs = self.softmax_predictor(reshaped_hidden_states)
    reshaped_language_probs = language_probs.view(
        token_length, batch_size, language_probs.size(-1))
    # Compute the copy attention over the context for every decode position.
    attentions_list = []
    for i in range(token_length):
        _, attentions, inputs = self.text_field_predictor.forward_similarity(
            hidden_states[i, :, :])
        attentions_list.append(attentions)
    attentions_sequence = torch.stack(attentions_list, 0)
    combined_predictions = self.combine_predictions(
        context_tokens=self.batch_first_context_tokens,
        predictor_probs=reshaped_predictor_probs,
        attentions=attentions_sequence,
        language_probs=reshaped_language_probs)
    return combined_predictions
def step(self, batch, train=True):
    inputs = variable.Variable(torch.from_numpy(
        batch['input_tokens'])).cuda()
    desired_inputs = variable.Variable(
        torch.from_numpy(batch['desired_input_tokens'])).cuda()
    desired_input_masks = variable.Variable(
        torch.from_numpy(batch['desired_input_masks'])).cuda()
    contexts = variable.Variable(torch.from_numpy(
        batch['context_tokens'])).cuda()
    context_masks = variable.Variable(
        torch.from_numpy(batch['context_masks'])).cuda()
    answer_features = variable.Variable(
        torch.from_numpy(batch['answer_features'])).cuda()

    language_probs = self.language_model.forward(inputs, contexts,
                                                 context_masks,
                                                 answer_features)
    # Flatten the (seq_len, batch) targets and log-probs, then keep only the
    # unmasked positions for the loss.
    reshaped_inputs = desired_inputs.contiguous().view(-1)
    reshaped_language_probs = language_probs.view(
        -1, self.config['vocab_size'])
    loss = 0
    select_indices = torch_utils.get_index_select(
        desired_input_masks).cuda()
    gathered_indices = torch.index_select(reshaped_inputs, 0,
                                          select_indices)
    gathered_probs = torch.index_select(reshaped_language_probs, 0,
                                        select_indices)
    if train:
        self.optimizer.zero_grad()
        if not self.config['finetune_embeddings']:
            inputs.detach()
            contexts.detach()
            answer_features.detach()
        loss = self.criterion(gathered_probs, gathered_indices)
        loss.backward()
        torch.nn.utils.clip_grad_norm(self.language_model.parameters(), 5)
        self.optimizer.step()
    return loss
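# The masked-loss pattern in step() in isolation: flatten the (seq_len,
# batch) targets and log-probs, keep only positions whose mask is 0, and
# average the NLL over the gathered rows. Shapes and values are made up.
import torch
from torch import nn

seq_len, batch, vocab = 4, 2, 6
log_probs = nn.functional.log_softmax(
    torch.randn(seq_len * batch, vocab), dim=1)
targets = torch.randint(0, vocab, (seq_len * batch,))
masks = torch.LongTensor([0, 0, 1, 0, 1, 0, 0, 1])  # 0 = keep

keep = (masks == 0).nonzero().view(-1)
loss = nn.NLLLoss()(log_probs.index_select(0, keep),
                    targets.index_select(0, keep))
print(loss)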
import torch
from torch.autograd import variable

from models.language_model import TextFieldPredictor, SoftmaxPredictor

config = {}
config['vocab_size'] = 12
config['embedding_size'] = 20
config['hidden_size'] = 50
config['num_layers'] = 1
config['dropout'] = 0.0
config['batch_first'] = True

# First, test the text field predictor.
inp = variable.Variable(torch.LongTensor([[1, 2, 3], [4, 5, 6]]))
hidden = variable.Variable(torch.randn(2, config['hidden_size']))

predictor = TextFieldPredictor(config)
lstm_embeddings = predictor.forward_prepro(inp)
h_tilde, attentions, inp = predictor.forward_similarity(hidden)

# Pad the context ids and attention scores out to vocab_size columns.
inp1 = variable.Variable(
    torch.LongTensor(2, config['vocab_size'] - 3).zero_())
inp2 = variable.Variable(torch.zeros(2, config['vocab_size'] - 3))
stacked_inps = torch.cat((inp, inp1), 1)
stacked_attentions = torch.cat((attentions, inp2), 1)

# Second, test the softmax predictor.
softmax_predictor = SoftmaxPredictor(config)
softmax_logits = softmax_predictor.forward(hidden)

res = variable.Variable(torch.zeros(2, config['vocab_size']))
res.scatter_(1, stacked_inps, stacked_attentions)
config['load_model'] = True
config['load_path'] = 'logs/squad_saved_data/model_7_old.pyt7'

pointer_network = PointerNetwork(config).cuda()
criterion1 = nn.CrossEntropyLoss().cuda()
criterion2 = nn.CrossEntropyLoss().cuda()
optimizer = optim.Adam(pointer_network.parameters(), 1e-2)

batch = language_model_loader.get_batch(
    dataset_type=constants.DATASET_TRAIN, batch_size=config['batch_size'])
large_negative_number = -1e10

while batch is not None:
    optimizer.zero_grad()
    input_lengths = variable.Variable(
        torch.from_numpy(batch['context_lengths'])).cuda()
    input_vals = variable.Variable(torch.from_numpy(
        batch['context_tokens'])).cuda()
    answer_starts = variable.Variable(torch.from_numpy(
        batch['answer_starts'])).cuda()
    answer_ends = variable.Variable(torch.from_numpy(
        batch['answer_ends'])).cuda()
    # Transpose the masks to batch-first before feeding the network.
    masks = variable.Variable(
        torch.from_numpy(batch['context_masks'].T).float()).cuda()

    p_start, p_end = pointer_network.forward(input_vals, input_lengths,
                                             masks)
    loss = criterion1(p_start, answer_starts) + \
        criterion2(p_end, answer_ends)
    loss.backward()
    optimizer.step()
    batch = language_model_loader.get_batch(
        dataset_type=constants.DATASET_TRAIN,
        batch_size=config['batch_size'])
import torch
import torch.nn as nn
from torch.autograd import variable

from models.card_model import CardModel

config = {}
config['vocab_size'] = 52
config['embedding_size'] = 23

model = CardModel(config)
emb1 = nn.Embedding(config['vocab_size'], config['embedding_size'])

desired = variable.Variable(torch.randn(3, config['embedding_size']))
tmp = variable.Variable(torch.LongTensor([1, 2, 3]))
tmp1 = emb1(tmp)
tmp2 = emb1(tmp)

criterion = nn.MSELoss()
loss = criterion(tmp1 + tmp2, desired)
loss.backward()
import torch
from torch import nn
from torch import optim
from torch.autograd import variable

# Import path assumed; adjust to wherever LSTMAttentionDot lives in the repo.
from models.modules.lstm_attention_dot import LSTMAttentionDot

batch_size = 25
input_size = 125
input_length = 25
hidden_size = 250
ctx_length = 230

net = LSTMAttentionDot(input_size=input_size, hidden_size=hidden_size,
                       batch_first=False).cuda()

inputs = variable.Variable(
    torch.randn(input_length, batch_size, input_size)).cuda()
hidden = variable.Variable(torch.randn(batch_size, hidden_size)).cuda()
cell = variable.Variable(torch.randn(batch_size, hidden_size)).cuda()
context = variable.Variable(
    torch.randn(ctx_length, batch_size, hidden_size)).cuda()
desired = variable.Variable(torch.randn(batch_size, hidden_size)).cuda()

criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=3e-2)

for i in range(1000):
    print(i)
    optimizer.zero_grad()
    out, h = net.forward(inputs, [hidden, cell], context)
    loss = criterion(h[0], desired)
    loss.backward()
    optimizer.step()
language_model = torch_utils.load_model(load_path)
language_model = language_model.cuda()

batch_size = 3

# Reuse the language model's embedder, text field predictor, and base LSTM
# inside the discriminator.
embeddings = language_model.embedder
text_field_predictor = language_model.text_field_predictor
base_lstm = language_model.base_lstm

discriminator = LanguageDiscriminator(language_model.config, embeddings,
                                      text_field_predictor,
                                      base_lstm).cuda()
discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=3e-2)
discriminator_criterion = nn.BCELoss()

contexts = variable.Variable(
    torch.LongTensor([[1, 2, 3], [2, 3, 4], [4, 5, 6]])).cuda()
answer_features = variable.Variable(
    torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])).cuda()
inputs = variable.Variable(
    torch.LongTensor([[1, 2, 3], [4, 5, 6], [7, 8, 9],
                      [10, 11, 12]])).cuda()
desired_indices = variable.Variable(torch.FloatTensor([1, 1, 1])).cuda()

for i in range(100):
    discriminator_optimizer.zero_grad()
    pred = discriminator.forward(inputs, contexts, answer_features)
    bce_loss = discriminator_criterion(pred, desired_indices)
    bce_loss.backward()
    print(bce_loss)
    discriminator_optimizer.step()
import torch
from torch import nn
from torch import optim
from torch.autograd import variable

from models.language_model import LanguageModel
from helpers import torch_utils

config = {}
config['vocab_size'] = 25
config['hidden_size'] = 50
config['embedding_size'] = 10
config['num_layers'] = 1
config['dropout'] = 0.0
config['batch_first'] = False

language_model = LanguageModel(config)
language_model.cuda()

# contexts: context_length x batch_size
# inputs: input_length x batch_size
# desired_inputs: input_length x batch_size
input_token = variable.Variable(torch.LongTensor([[1]])).cuda()
context_tokens = variable.Variable(
    torch.LongTensor([[2], [3], [4], [5], [6], [7], [8]])).cuda()

language_model.predict(input_token, context_tokens,
                       torch.LongTensor([[1]]).cuda())
config['batch_first'] = False
config['use_pretrained_embeddings'] = False
config['finetune_embeddings'] = True

language_model = LanguageModel(config).cuda()

# contexts: context_length x batch_size
# inputs: input_length x batch_size
# desired_inputs: input_length x batch_size
optimizer = optim.Adam(language_model.parameters(), lr=3e-2)
criterion = nn.NLLLoss()

for i in range(1000):
    optimizer.zero_grad()
    inputs = variable.Variable(
        torch.LongTensor([[1, 2, 3, 4, 5, 6, 7]] * 100)).cuda()
    contexts = variable.Variable(
        torch.LongTensor([[4, 5, 6, 7, 8, 9, 10]] * 4)).cuda()
    context_masks = variable.Variable(
        torch.FloatTensor([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] * 4)).cuda()
    desired_inputs = variable.Variable(
        torch.LongTensor([[1, 2, 3, 4, 5, 6, 7]] * 100)).cuda()
    input_masks = variable.Variable(
        torch.FloatTensor([[1, 1, 1, 1, 1, 1, 1]] * 100)).cuda()
    answer_features = variable.Variable(
        torch.LongTensor([[4, 5, 6, 7, 8, 9, 10]] * 4)).cuda()