class Sequence2SequenceNetwork(object):
    """Unimodal encoder/decoder network that runs its pipeline on construction.

    Instantiating the class executes, in order: writer setup, configuration
    loading, vocabulary loading, data preparation, model construction,
    optional checkpoint restore, training, a final checkpoint save, a final
    evaluation and writer shutdown.
    """

    def __init__(self, config):
        """Run the complete train/evaluate pipeline described by ``config``.

        Args:
            config: Mapping providing every key read in
                ``load_configuration``.
        """
        self.init_writer()
        self.load_configuration(config)
        self.load_vocabulary()
        self.prepare_data()
        self.build_model()
        self.load_pretrained_model()
        self.train_model()
        self.save_model(self.n_epochs)
        self.evaluate_all()
        self.close_writer()

    def init_writer(self):
        """Create the summary writer used for all logging."""
        self.writer = SummaryWriter()

    def load_configuration(self, config):
        """Copy hyper-parameters from ``config`` onto the instance.

        Raises:
            KeyError: If a required key is missing from ``config``.
        """
        self.iter_num = 0
        self.lr = config['lr']
        self.gpu = config['gpu']
        self.unit = config['unit']
        self.clip = config['clip']
        self.beta1 = config['beta1']
        self.beta2 = config['beta2']
        self.langs = config['langs']
        self.fusion = config['fusion']
        self.log_tb = config['log_tb']
        self.epsilon = config['epsilon']
        self.attn_model = config['attn']
        self.dropout = config['dropout']
        self.emb_mode = config['emb_mode']
        self.save_dir = config['save_dir']
        self.data_dir = config['data_dir']
        self.n_epochs = config['n_epochs']
        self.SOS_TOKEN = config['SOS_TOKEN']
        self.EOS_TOKEN = config['EOS_TOKEN']
        self.MAX_LENGTH = config['MAX_LENGTH']
        self.latent_dim = config['latent_dim']
        self.batch_size = config['batch_size']
        self.model_code = config['model_code']
        self.vocab_path = config['vocab_path']
        self.hidden_size = config['hidden_size']
        self.use_cuda = torch.cuda.is_available()
        self.log_tb_every = config['log_tb_every']
        self.enc_n_layers = config['enc_n_layers']
        self.dec_n_layers = config['dec_n_layers']
        self.dec_learning_ratio = config['dec_lr']
        self.bidirectional = config['bidirectional']
        self.enc_input_dim = config['enc_input_dim']
        self.embedding_dim = config['embedding_dim']
        self.use_scheduler = config['use_scheduler']
        self.use_embeddings = config['use_embeddings']
        self.lr_lower_bound = config['lr_lower_bound']
        self.teacher_forcing_ratio = config['tf_ratio']
        self.load_model_name = config['load_model_name']
        # No splitting here: this class handles the unimodal case only.
        self.modality = config['modalities']
        if self.modality in ['ss-vv', 'v-s']:
            self.pretrained_modality = config['pretrained_modality']
        self.generate_word_embeddings = config['generate_word_embeddings']
        self.device = torch.device(
            'cuda:{}'.format(self.gpu) if self.use_cuda else 'cpu')

    def load_vocabulary(self):
        """Load the pickled vocabulary, building a new one if the file is absent."""
        try:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only point vocab_path at trusted files.
            with open(self.vocab_path, 'rb') as f:
                self.vocab = pickle.load(f)
        except FileNotFoundError:
            # Build the vocabulary when no cached copy exists.
            self.vocab = buildVocab()

    def prepare_data(self):
        """Load train/test pairs and, if requested, embedding weights.

        Training pairs are truncated to a whole number of batches and then
        shuffled; test pairs are shuffled as loaded.
        """
        # prepareData is indexed by modality; this script only runs for
        # unimodal cases, so a single-element modality list is passed.
        self.pairs = prepareData(self.langs, [self.modality])[self.modality]
        usable = self.batch_size * (len(self.pairs) // self.batch_size)
        self.pairs = self.pairs[:usable]
        random.shuffle(self.pairs)
        self.n_iters = len(self.pairs)

        print('\nLoading test data pairs')
        self.test_pairs = prepareData(self.langs, [self.modality],
                                      train=False)[self.modality]
        random.shuffle(self.test_pairs)
        print(random.choice(self.pairs))

        if self.use_embeddings:
            if self.generate_word_embeddings:
                self.embedding_wts = generateWordEmbeddings(
                    self.vocab, self.emb_mode)
            else:
                self.embedding_wts = loadWordEmbeddings(self.emb_mode)

    def build_model(self):
        """Instantiate embedding, encoder, decoder and their optimizers."""
        if self.use_embeddings:
            self.embedding = nn.Embedding.from_pretrained(self.embedding_wts)
        else:
            self.embedding = nn.Embedding(self.vocab.n_words,
                                          self.embedding_dim)

        if self.modality == 't':
            # Only the text-to-text mode feeds the encoder through the
            # embedding layer.
            self.encoder = EncoderRNN(self.embedding_dim,
                                      self.hidden_size,
                                      self.enc_n_layers,
                                      self.dropout,
                                      self.unit,
                                      self.modality,
                                      self.embedding,
                                      fusion_or_unimodal=True).to(self.device)
        else:
            # No embedding is handed to the encoder in this branch.
            self.encoder = EncoderRNN(self.enc_input_dim,
                                      self.hidden_size,
                                      self.enc_n_layers,
                                      self.dropout,
                                      self.unit,
                                      self.modality,
                                      fusion_or_unimodal=True).to(self.device)

        self.decoder = DecoderRNN(self.attn_model,
                                  self.embedding_dim,
                                  self.hidden_size,
                                  self.vocab.n_words,
                                  self.unit,
                                  self.dec_n_layers,
                                  self.dropout,
                                  self.embedding).to(self.device)

        self.encoder_optimizer = optim.Adam(self.encoder.parameters(),
                                            lr=self.lr)
        self.decoder_optimizer = optim.Adam(
            self.decoder.parameters(),
            lr=self.lr * self.dec_learning_ratio)
        self.epoch = 0  # defined here so training can be resumed
        self.project_factor = self.encoder.project_factor
        self.latent2hidden = nn.Linear(
            self.latent_dim,
            self.hidden_size * self.project_factor).to(self.device)

    def load_pretrained_model(self):
        """Restore model/optimizer state when ``load_model_name`` is set."""
        if not self.load_model_name:
            return
        # NOTE(review): torch.load unpickles the file; load trusted
        # checkpoints only.
        checkpoint = torch.load(self.load_model_name,
                                map_location=self.device)
        print('Loaded {}'.format(self.load_model_name))
        self.epoch = checkpoint['epoch']
        self.encoder.load_state_dict(checkpoint['en'])
        self.decoder.load_state_dict(checkpoint['de'])
        self.encoder_optimizer.load_state_dict(checkpoint['en_op'])
        self.decoder_optimizer.load_state_dict(checkpoint['de_op'])
        self.embedding.load_state_dict(checkpoint['embedding'])

    def _decay_learning_rate(self):
        """Halve the learning rate unless that would reach the lower bound.

        The new rate is written into both optimizers' parameter groups;
        assigning ``self.lr`` alone would leave the optimizers untouched.
        """
        new_lr = self.lr * 0.5
        print('Entered the dungeon...')
        if new_lr > self.lr_lower_bound:  # lower bound on lr
            self.lr = new_lr
            for group in self.encoder_optimizer.param_groups:
                group['lr'] = self.lr
            for group in self.decoder_optimizer.param_groups:
                group['lr'] = self.lr * self.dec_learning_ratio
            print('lr decreased to => {}'.format(self.lr))

    def train_model(self):
        """Train for the configured epochs, evaluating after each one.

        A checkpoint is saved whenever the score returned by
        ``evaluate_all`` improves; when the scheduler is enabled the
        learning rate is decayed once the epoch counter exceeds three.
        """
        best_score = 1e-200
        saving_skipped = 0
        for epoch in range(self.epoch, self.n_epochs):
            print_loss_total = 0  # reset every epoch
            for iter in range(0, self.n_iters, self.batch_size):
                pairs = self.pairs[iter:iter + self.batch_size]
                if len(pairs) < self.batch_size:
                    # Only a trailing batch can be short; stop the epoch.
                    break

                training_batch = batch2TrainData(self.vocab, pairs,
                                                 self.modality)
                input_variable, lengths, target_variable, \
                    mask, max_target_len, _ = training_batch

                loss = self.train(input_variable, lengths, target_variable,
                                  mask, max_target_len, iter)
                # Offset the step by the epoch so later epochs do not
                # overwrite earlier points on the same curve.
                self.writer.add_scalar('{}loss'.format(self.data_dir), loss,
                                       epoch * self.n_iters + iter)
                print_loss_total += loss

            print_loss_avg = print_loss_total * self.batch_size / self.n_iters
            print('Epoch: [{}/{}] Loss: {:.4f}'.format(epoch, self.n_epochs,
                                                       print_loss_avg))

            # Evaluate and keep the best-scoring model.
            curr_score = self.evaluate_all()
            self.writer.add_scalar('{}bleu_score'.format(self.data_dir),
                                   curr_score, epoch)
            if curr_score > best_score:
                saving_skipped = 0
                best_score = curr_score
                self.save_model(epoch)
            saving_skipped += 1

            if self.use_scheduler and saving_skipped > 3:
                saving_skipped = 0
                self._decay_learning_rate()

    def train(self, input_variable, lengths, target_variable, mask,
              max_target_len, iter):
        """Run one optimisation step on a single batch.

        Args:
            input_variable: Encoder input for the batch.
            lengths: Lengths passed to the encoder together with the input.
            target_variable: Target token ids, indexed by time step first.
            mask: Mask selecting valid target positions, indexed like
                ``target_variable``.
            max_target_len: Number of decoding steps to run.
            iter: Offset of the batch in the training pairs; used to decide
                when to log the decoder hidden state.

        Returns:
            Loss summed over the batch divided by the number of unmasked
            target tokens.
        """
        self.encoder.train()
        self.decoder.train()
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        input_variable = input_variable.to(self.device)
        lengths = lengths.to(self.device)
        target_variable = target_variable.to(self.device)
        mask = mask.to(self.device)

        loss = 0
        print_losses = []
        n_totals = 0

        encoder_outputs, encoder_hidden = self.encoder(input_variable,
                                                       lengths)

        # Every sequence starts decoding from the SOS token.
        decoder_input = torch.LongTensor(
            [[self.SOS_TOKEN] * self.batch_size]).to(self.device)

        # The decoder starts from the encoder's final hidden state.
        n_dec_layers = self.decoder.n_layers
        if self.unit == 'gru':
            decoder_hidden = encoder_hidden[:n_dec_layers]
        else:
            decoder_hidden = (encoder_hidden[0][:n_dec_layers],
                              encoder_hidden[1][:n_dec_layers])

        # Use the instance's configuration rather than a module global.
        if iter % self.log_tb_every == 0:
            # Visualize latent space
            if self.unit == 'gru':
                vis_hidden = decoder_hidden[-1, :, :]
            else:
                vis_hidden = decoder_hidden[0][-1, :, :]
            self.writer.add_embedding(
                vis_hidden, tag='decoder_hidden_{}'.format(iter))

        use_teacher_forcing = random.random() < self.teacher_forcing_ratio

        for t in range(max_target_len):
            decoder_output, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            if use_teacher_forcing:
                # Teacher forcing: next input is the current target.
                decoder_input = target_variable[t].view(1, -1)
            else:
                # Otherwise feed back the decoder's own best guess.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.reshape(1, -1).detach()

            mask_loss, n_total = self.mask_nll_loss(decoder_output,
                                                    target_variable[t],
                                                    mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * n_total)
            n_totals += n_total

        loss.backward()

        # Clip gradients: gradients are modified in place.
        torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), self.clip)
        torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)

        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        return sum(print_losses) / n_totals

    def mask_nll_loss(self, inp, target, mask):
        """Sum the negative log of the target entries of ``inp`` under ``mask``.

        ``inp`` is passed to ``log`` directly, so it is presumably expected
        to hold probabilities already — TODO confirm against the decoder.

        Returns:
            Tuple ``(loss, n_total)``: the summed loss tensor and the mask
            sum as a Python number.
        """
        n_total = mask.sum()
        picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
        cross_entropy = -torch.log(picked)
        loss = cross_entropy.masked_select(mask).sum()
        loss = loss.to(self.device)
        return loss, n_total.item()

    def save_model(self, epoch):
        """Write a checkpoint for ``epoch`` under ``save_dir``."""
        directory = self.save_dir
        os.makedirs(directory, exist_ok=True)
        torch.save(
            {
                'epoch': epoch,
                'en': self.encoder.state_dict(),
                'de': self.decoder.state_dict(),
                'en_op': self.encoder_optimizer.state_dict(),
                'de_op': self.decoder_optimizer.state_dict(),
                'embedding': self.embedding.state_dict()
            },
            '{}{}-{}-{}-{}.pth'.format(directory, self.model_code,
                                       self.modality, self.langs, epoch))

    def evaluate_all(self):
        """Decode every test pair and return the corpus ``bleu_4`` score."""
        self.encoder.eval()
        self.decoder.eval()
        searcher = GreedySearchDecoder(self.encoder, self.decoder, None,
                                       self.device, self.SOS_TOKEN)
        refs = []
        hyp = []
        for pair in self.test_pairs:
            output_words = self.evaluate(self.encoder, self.decoder,
                                         searcher, self.vocab, pair[0])
            if not output_words:
                continue
            # Keep only the words generated before the end marker.
            final_output = []
            for word in output_words:
                if word == '<EOS>':
                    break
                final_output.append(word)
            refs.append([pair[1].split()])
            hyp.append(final_output)

        bleu_scores = calculateBleuScores(refs, hyp)
        print('Bleu score: {bleu_1} | {bleu_2} | {bleu_3} | {bleu_4}'.format(
            **bleu_scores))
        if hyp:
            # Show one random example; skipped when nothing was decoded.
            eg_idx = random.choice(range(len(hyp)))
            print(hyp[eg_idx], refs[eg_idx])
        return bleu_scores['bleu_4']

    def evaluate(self,
                 encoder,
                 decoder,
                 searcher,
                 vocab,
                 sentence_or_vector,
                 max_length=None):
        """Decode a single input with ``searcher``.

        Args:
            encoder: Unused; kept for interface compatibility.
            decoder: Unused; kept for interface compatibility.
            searcher: Callable taking ``(input_batch, lengths, max_length)``
                and returning ``(tokens, scores)``.
            vocab: Vocabulary providing ``index2word``.
            sentence_or_vector: A sentence when the modality is ``'t'``,
                otherwise a feature vector.
            max_length: Maximum decoding length; defaults to the configured
                ``MAX_LENGTH``.

        Returns:
            List of decoded words, or ``None`` when the sentence could not
            be converted to indexes.
        """
        if max_length is None:
            max_length = self.MAX_LENGTH

        with torch.no_grad():
            if self.modality == 't':
                # Format the sentence as a batch of one: words => indexes.
                indexes_batch = [
                    indexesFromSentence(vocab, sentence_or_vector)
                ]
                if None in indexes_batch:
                    return None
                indexes_batch = [
                    indexes + [self.EOS_TOKEN] for indexes in indexes_batch
                ]
                lengths = torch.tensor(
                    [len(indexes) for indexes in indexes_batch])
                # Transpose to match the models' expectations.
                input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
            else:
                input_batch, lengths = inputVarVec([sentence_or_vector],
                                                   self.modality)

            input_batch = input_batch.to(self.device)
            lengths = lengths.to(self.device)

            tokens, scores = searcher(input_batch, lengths, max_length)
            # indexes -> words
            decoded_words = [
                vocab.index2word[token.item()] for token in tokens
            ]
        return decoded_words

    def close_writer(self):
        """Close the summary writer."""
        self.writer.close()
def train(input_sentences, output_sentences, input_vocab, output_vocab,
          input_reverse, output_reverse, hy, writer):
    """Train an encoder/decoder pair and checkpoint both after every epoch.

    Args:
        input_sentences: Source-side sentences handed to ``NMTDataset``.
        output_sentences: Target-side sentences handed to ``NMTDataset``.
        input_vocab: Source vocabulary; its key count is the encoder's
            vocabulary size.
        output_vocab: Target vocabulary; its key count is the decoder's
            vocabulary size.
        input_reverse: Passed through to ``NMTDataset``.
        output_reverse: Passed through to ``NMTDataset``.
        hy: Hyper-parameter object providing ``batch_size``,
            ``embedding_size``, ``hidden_size``, ``rnn_layers``,
            ``bidirectional``, ``lr`` and ``num_epochs``.
        writer: Summary writer receiving the per-iteration training loss.

    Returns:
        Tuple ``(loss_history, training_accuracy)``: the per-iteration loss
        values and the accuracy computed after the last epoch.
    """
    import os  # local import: the file-level import list is not visible here

    dataset = NMTDataset(input_sentences, output_sentences, input_vocab,
                         output_vocab, input_reverse, output_reverse)
    loader = DataLoader(dataset,
                        batch_size=hy.batch_size,
                        shuffle=True,
                        drop_last=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_vocab_size = len(input_vocab.keys())
    output_vocab_size = len(output_vocab.keys())

    encoder = EncoderRNN(input_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    decoder = DecoderRNN(output_vocab_size, hy.embedding_size,
                         hy.hidden_size, hy.rnn_layers, hy.bidirectional,
                         device)

    loss_function = nn.CrossEntropyLoss().to(device)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=hy.lr)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=hy.lr)

    # torch.save does not create missing directories.
    os.makedirs("saved_runs", exist_ok=True)

    n_iterations = 0
    loss_history = []
    training_accuracy = 0.

    for epoch in range(1, hy.num_epochs + 1):
        # Re-arm training mode every epoch in case the accuracy pass at the
        # end of the previous epoch switched the models to eval mode.
        # NOTE(review): compute_model_accuracy is not visible here — confirm.
        encoder.train()
        decoder.train()

        progress = tqdm(loader, desc="{}/{}".format(epoch, hy.num_epochs))
        for encoder_input, decoder_input, decoder_output in progress:
            encoder_input = encoder_input.to(device)
            decoder_input = decoder_input.to(device)
            decoder_output = decoder_output.to(device)

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            _, encoder_hidden = encoder(encoder_input)
            logits = decoder(decoder_input, encoder_hidden)

            # One row of logits per target token; sizing from the target
            # itself avoids depending on the configured batch size.
            flat_targets = decoder_output.view(-1)
            loss = loss_function(logits.view(flat_targets.numel(), -1),
                                 flat_targets)

            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            loss_value = loss.item()
            writer.add_scalar("TrainingLoss", loss_value, n_iterations)
            n_iterations += 1
            loss_history.append(loss_value)

        training_accuracy = compute_model_accuracy(encoder, decoder, loader,
                                                   device, epoch, writer)
        torch.save(encoder.state_dict(),
                   "saved_runs/encoder_{}_weights.pt".format(epoch))
        torch.save(decoder.state_dict(),
                   "saved_runs/decoder_{}_weights.pt".format(epoch))

    return loss_history, training_accuracy
class FusionNetwork(Sequence2SequenceNetwork):
    """Multimodal variant of ``Sequence2SequenceNetwork``.

    ``self.modality`` becomes a list of modality names and the training
    data a dict mapping each modality to its pairs. The pipeline driver
    (``__init__``), ``mask_nll_loss`` and ``close_writer`` are inherited
    from ``Sequence2SequenceNetwork``.
    """

    def prepare_data(self):
        """Load per-modality train/test pairs and optional embedding weights.

        Training pairs are truncated to a whole number of batches and every
        modality is reordered with the same random permutation, so index
        ``i`` refers to the same example in all modalities.
        """
        self.modality = self.modality.split('-')
        self.pairs = prepareData(self.langs, self.modality)  # dict: m => pairs
        num_pairs = min(len(pairs) for pairs in self.pairs.values())
        self.n_iters = num_pairs

        usable = self.batch_size * (num_pairs // self.batch_size)
        # A single permutation shared by all modalities. Sorting the pairs
        # zipped with random indices would order them by pair content.
        permutation = random.sample(range(usable), usable)
        for m in self.modality:
            truncated = self.pairs[m][:usable]
            self.pairs[m] = [truncated[i] for i in permutation]
            print(random.choice(self.pairs[m]))

        print('\nLoading test data pairs')
        self.test_pairs = prepareData(self.langs, self.modality, train=False)
        self.num_test_pairs = min(
            len(pairs) for pairs in self.test_pairs.values())

        if self.use_embeddings:
            if self.generate_word_embeddings:
                self.embedding_wts = generateWordEmbeddings(self.vocab,
                                                            self.emb_mode)
            else:
                self.embedding_wts = loadWordEmbeddings(self.emb_mode)

    def build_model(self):
        """Instantiate embedding, multimodal encoder, decoder and optimizers."""
        if self.use_embeddings:
            self.embedding = nn.Embedding.from_pretrained(self.embedding_wts)
        else:
            self.embedding = nn.Embedding(self.vocab.n_words,
                                          self.embedding_dim)

        self.encoder = MultimodalEncoderRNN(self.fusion,
                                            self.hidden_size,
                                            self.enc_n_layers,
                                            self.dropout,
                                            self.unit,
                                            self.modality,
                                            self.embedding,
                                            self.device).to(self.device)

        if self.fusion == 'early' or self.fusion is None:
            parameter_list = self.encoder.parameters()
        else:
            # Optimise only the per-modality sub-encoders.
            parameter_list = []
            for m in self.encoder.modalities:
                parameter_list += list(self.encoder.encoder[m].parameters())

        # Need to expand hidden layer according to
        # modalities for early fusion
        self.decoder = DecoderRNN(self.attn_model,
                                  self.embedding_dim,
                                  self.hidden_size,
                                  self.vocab.n_words,
                                  self.unit,
                                  self.dec_n_layers,
                                  self.dropout,
                                  self.embedding).to(self.device)

        self.encoder_optimizer = optim.Adam(parameter_list, lr=self.lr)
        self.decoder_optimizer = optim.Adam(
            self.decoder.parameters(),
            lr=self.lr * self.dec_learning_ratio)
        self.epoch = 0  # defined here so training can be resumed

    def load_pretrained_model(self):
        """Restore a checkpoint (including the vocabulary) and evaluate it."""
        if not self.load_model_name:
            return
        # NOTE(review): torch.load unpickles the file; load trusted
        # checkpoints only.
        checkpoint = torch.load(self.load_model_name,
                                map_location=self.device)
        print('Loaded {}'.format(self.load_model_name))
        self.epoch = checkpoint['epoch']
        self.encoder.load_state_dict(checkpoint['en'])
        self.decoder.load_state_dict(checkpoint['de'])
        self.encoder_optimizer.load_state_dict(checkpoint['en_op'])
        self.decoder_optimizer.load_state_dict(checkpoint['de_op'])
        self.embedding.load_state_dict(checkpoint['embedding'])
        self.vocab.__dict__ = checkpoint['vocab_dict']
        self.evaluate_all()

    def _decay_learning_rate(self):
        """Halve the learning rate unless that would reach the lower bound.

        The new rate is written into both optimizers' parameter groups;
        assigning ``self.lr`` alone would leave the optimizers untouched.
        """
        new_lr = self.lr * 0.5
        print('Entered the dungeon...')
        if new_lr > self.lr_lower_bound:  # lower bound on lr
            self.lr = new_lr
            for group in self.encoder_optimizer.param_groups:
                group['lr'] = self.lr
            for group in self.decoder_optimizer.param_groups:
                group['lr'] = self.lr * self.dec_learning_ratio
            print('lr decreased to => {}'.format(self.lr))

    def train_model(self):
        """Train for the configured epochs, evaluating after each one.

        A checkpoint is saved whenever the score returned by
        ``evaluate_all`` improves; when the scheduler is enabled the
        learning rate is decayed once the epoch counter exceeds three.
        """
        best_score = 1e-200
        saving_skipped = 0
        for epoch in range(self.epoch, self.n_epochs):
            print_loss_total = 0  # reset every epoch
            for iter in range(0, self.n_iters, self.batch_size):
                input_variable = {}
                lengths = {}
                incomplete = False
                for m in self.modality:
                    pairs = self.pairs[m][iter:iter + self.batch_size]
                    if len(pairs) < self.batch_size:
                        incomplete = True
                        break
                    # The target fields are overwritten per modality; they
                    # are presumably identical across modalities because
                    # all modalities share one ordering — TODO confirm.
                    input_variable[m], lengths[m], target_variable, \
                        mask, max_target_len, _ = batch2TrainData(
                            self.vocab, pairs, m)
                if incomplete:
                    # Only a trailing batch can be short; stop the epoch.
                    break

                loss = self.train(input_variable, lengths, target_variable,
                                  mask, max_target_len, epoch, iter)
                # Offset the step by the epoch so later epochs do not
                # overwrite earlier points on the same curve.
                self.writer.add_scalar('{}loss'.format(self.data_dir), loss,
                                       epoch * self.n_iters + iter)
                print_loss_total += loss

            print_loss_avg = print_loss_total * self.batch_size / self.n_iters
            print('Epoch: [{}/{}] Loss: {:.4f}'.format(
                epoch, self.n_epochs, print_loss_avg))

            # Evaluate and keep the best-scoring model.
            curr_score = self.evaluate_all()
            self.writer.add_scalar('{}bleu_score'.format(self.data_dir),
                                   curr_score, epoch)
            if curr_score > best_score:
                saving_skipped = 0
                best_score = curr_score
                self.save_model(epoch)
            saving_skipped += 1

            if self.use_scheduler and saving_skipped > 3:
                saving_skipped = 0
                self._decay_learning_rate()

    def train(self, input_variable, lengths, target_variable, mask,
              max_target_len, epoch, iter):
        """Run one optimisation step on a single multimodal batch.

        Args:
            input_variable: Dict mapping modality to encoder input; its
                values are moved to the device in place.
            lengths: Dict mapping modality to input lengths; its values are
                moved to the device in place.
            target_variable: Target token ids, indexed by time step first.
            mask: Mask selecting valid target positions, indexed like
                ``target_variable``.
            max_target_len: Number of decoding steps to run.
            epoch: Current epoch, used in the embedding log tag.
            iter: Offset of the batch in the training pairs; used to decide
                when to log the decoder hidden state.

        Returns:
            Loss summed over the batch divided by the number of unmasked
            target tokens.
        """
        self.encoder.train()
        self.decoder.train()
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        for m in self.modality:
            input_variable[m] = input_variable[m].to(self.device)
            lengths[m] = lengths[m].to(self.device)
        target_variable = target_variable.to(self.device)
        mask = mask.to(self.device)

        loss = 0
        print_losses = []
        n_totals = 0

        encoder_outputs, encoder_hidden = self.encoder(input_variable,
                                                       lengths)

        # Every sequence starts decoding from the SOS token.
        decoder_input = torch.LongTensor(
            [[self.SOS_TOKEN] * self.batch_size]).to(self.device)

        # The decoder starts from the encoder's final hidden state.
        n_dec_layers = self.decoder.n_layers
        if self.unit == 'gru':
            decoder_hidden = encoder_hidden[:n_dec_layers]
        else:
            decoder_hidden = (encoder_hidden[0][:n_dec_layers],
                              encoder_hidden[1][:n_dec_layers])

        # Use the instance's configuration rather than a module global.
        if iter % self.log_tb_every == 1:
            # Visualize latent space
            if self.unit == 'gru':
                vis_hidden = decoder_hidden[-1, :, :]
            else:
                vis_hidden = decoder_hidden[0][-1, :, :]
            self.writer.add_embedding(
                vis_hidden,
                tag='decoder_hidden_{}_{}'.format(epoch, iter))

        use_teacher_forcing = random.random() < self.teacher_forcing_ratio

        for t in range(max_target_len):
            decoder_output, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            if use_teacher_forcing:
                # Teacher forcing: next input is the current target.
                decoder_input = target_variable[t].view(1, -1)
            else:
                # Otherwise feed back the decoder's own best guess.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.reshape(1, -1).detach()

            mask_loss, n_total = self.mask_nll_loss(decoder_output,
                                                    target_variable[t],
                                                    mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * n_total)
            n_totals += n_total

        loss.backward()

        # Clip gradients: gradients are modified in place.
        torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), self.clip)
        torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)

        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        return sum(print_losses) / n_totals

    def save_model(self, epoch, iteration=None):
        """Write a checkpoint (including the vocabulary) under ``save_dir``.

        Args:
            epoch: Epoch number stored in the checkpoint and file name.
            iteration: Optional iteration number appended to the file name.
                Previously the builtin ``iter`` function was formatted into
                the name by mistake.
        """
        directory = self.save_dir
        os.makedirs(directory, exist_ok=True)

        if iteration is None:
            filename = '{}{}-{}.pth'.format(directory, self.model_code, epoch)
        else:
            filename = '{}{}-{}-{}.pth'.format(directory, self.model_code,
                                               epoch, iteration)
        torch.save({
            'epoch': epoch,
            'en': self.encoder.state_dict(),
            'de': self.decoder.state_dict(),
            'en_op': self.encoder_optimizer.state_dict(),
            'de_op': self.decoder_optimizer.state_dict(),
            'vocab_dict': self.vocab.__dict__,
            'embedding': self.embedding.state_dict()},
            filename)

    def evaluate_all(self):
        """Decode every test example and return the corpus ``bleu_4`` score."""
        self.encoder.eval()
        self.decoder.eval()
        searcher = GreedySearchDecoder(
            self.encoder, self.decoder, None, self.device, self.SOS_TOKEN)
        refs = []
        hyp = []
        for id in range(self.num_test_pairs):
            result = self.evaluate(searcher, self.vocab, self.test_pairs, id)
            if result is None:
                # evaluate() returns a bare None when the sentence cannot
                # be indexed; unpacking it would raise TypeError.
                continue
            output_words, reference = result
            if not output_words:
                continue
            # Keep only the words generated before the end marker.
            final_output = []
            for word in output_words:
                if word == '<EOS>':
                    break
                final_output.append(word)
            # NOTE(review): the parent class wraps each reference in an
            # extra list; confirm which shape calculateBleuScores expects.
            refs.append(reference.split())
            hyp.append(final_output)

        bleu_scores = calculateBleuScores(refs, hyp)
        print('Bleu score: {bleu_1} | {bleu_2} | {bleu_3} | {bleu_4}'.format(
            **bleu_scores))
        if hyp:
            # Show one random example; skipped when nothing was decoded.
            eg_idx = random.choice(range(len(hyp)))
            print(hyp[eg_idx], refs[eg_idx])
        return bleu_scores['bleu_4']

    def evaluate(self, searcher, vocab, test_pairs, id, max_length=None):
        """Decode test example ``id`` using all modalities.

        Args:
            searcher: Callable taking ``(input_batch, lengths, max_length)``
                and returning ``(tokens, scores)``.
            vocab: Vocabulary providing ``index2word``.
            test_pairs: Dict mapping modality to its list of test pairs.
            id: Index of the example within each modality's list.
            max_length: Maximum decoding length; defaults to the configured
                ``MAX_LENGTH``.

        Returns:
            Tuple ``(decoded_words, reference)``, or ``None`` when the text
            input could not be converted to indexes.
        """
        if max_length is None:
            max_length = self.MAX_LENGTH

        lengths = {}
        input_batch = {}
        with torch.no_grad():
            # The reference is read from a randomly chosen modality.
            reference = random.choice(list(test_pairs.values()))[id][1]
            for m in self.modality:
                sentence_or_vector = test_pairs[m][id][0]
                if m == 't':
                    # Format the sentence as a batch of one:
                    # words => indexes.
                    indexes_batch = [
                        indexesFromSentence(vocab, sentence_or_vector)
                    ]
                    if None in indexes_batch:
                        return None
                    indexes_batch = [
                        indexes + [self.EOS_TOKEN]
                        for indexes in indexes_batch
                    ]
                    lengths[m] = torch.tensor(
                        [len(indexes) for indexes in indexes_batch])
                    # Transpose to match the models' expectations.
                    input_batch[m] = torch.LongTensor(
                        indexes_batch).transpose(0, 1)
                else:
                    input_batch[m], lengths[m] = \
                        inputVarVec([sentence_or_vector], m)

                input_batch[m] = input_batch[m].to(self.device)
                lengths[m] = lengths[m].to(self.device)

            tokens, scores = searcher(input_batch, lengths, max_length)
            # indexes -> words
            decoded_words = [vocab.index2word[token.item()]
                             for token in tokens]
        return decoded_words, reference