def evaluate(data, mode):
    # Pick the batch size that matches the split being evaluated.
    if mode == 'val_mode':
        bsz = val_batch_size
    elif mode == 'test_mode':
        bsz = test_batch_size
    global loss_least
    global num_tokens
    model.eval()  # Turn on evaluation mode, which disables dropout.
    total_loss = 0
    hidden = model.init_hidden(bsz)
    for i in range(0, data.size(0) - 1, BPTT):
        X, Y = batching.get_batch(data, i, evaluation=True)
        output, hidden = model(X, hidden)
        predictions = output.view(-1, num_tokens)
        # Weight each batch's loss by its sequence length.
        total_loss += len(X) * criterion(predictions, Y).data
        hidden = repackage_hidden(hidden)
    final_loss = total_loss[0] / len(data)
    print("Epoch: " + str(epoch) + " Val Loss: " + str(final_loss) +
          " Val Perplexity: " + str(math.exp(final_loss)))
    '''
    if final_loss < loss_least:
        with open(MODEL_SAVE_PATH, 'wb') as f:
            torch.save(model, f)
        loss_least = final_loss
    '''
    return final_loss
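# The loops in this file call `batching.get_batch`, which is not shown in this
# section. Below is a minimal sketch of such a helper, written against the same
# pre-0.4 PyTorch API (Variable/volatile) as the surrounding code; the `seq_len`
# and `evaluation` keyword arguments and the use of the module-level BPTT
# constant are assumptions taken from the call sites in this file.
import torch
from torch.autograd import Variable


def get_batch(source, i, seq_len=None, evaluation=False):
    # `source` is a (num_steps x batch_size) tensor of word indices.
    # Slice at most `seq_len` time steps starting at row `i`; the targets are
    # the same slice shifted forward by one step and flattened.
    seq_len = min(seq_len if seq_len is not None else BPTT, len(source) - 1 - i)
    data = Variable(source[i:i + seq_len], volatile=evaluation)
    target = Variable(source[i + 1:i + 1 + seq_len].view(-1))
    return data, target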
def train():
    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    #ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(BATCH_SIZE)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = BPTT if np.random.random() < 0.95 else BPTT / 2.
        # Prevent excessively small or negative sequence lengths.
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length, resulting in OOM.
        # seq_len = min(seq_len, args.bptt + 10)
        # Rescale the learning rate in proportion to the sampled sequence length.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / BPTT
        model.train()
        data, targets = batching.get_batch(train_data, i, seq_len=seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, num_tokens), targets)
        loss = raw_loss
        # Activation Regularization (AR)
        loss = loss + sum(ALPHA * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (TAR, "slowness")
        loss = loss + sum(BETA * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP_GRADIENTS)
        optimizer.step()
        total_loss += raw_loss.data
        # Restore the base learning rate.
        optimizer.param_groups[0]['lr'] = lr2
        if batch % DEUBG_LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss[0] / DEUBG_LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // BPTT, optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / DEUBG_LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
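# Every loop in this file calls `repackage_hidden` to cut the autograd graph
# between batches, as the comment above describes. Below is a minimal sketch of
# that helper under the same pre-0.4 PyTorch API; the actual implementation used
# in this project is not shown in this section.
from torch.autograd import Variable


def repackage_hidden(h):
    # Wrap hidden states in fresh Variables so gradients stop here instead of
    # flowing all the way back to the start of the dataset.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)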
def train():
    model.train()  # Turn on training mode which enables dropout.
    total_loss = 0  # Start the running loss at zero
    start_time = time.time()  # Start a timer to keep track of how long training is taking
    ntokens = myFactorsInfo.getFactorVocabSize(WORD_FACTOR)  # Vocabulary size
    hidden = model.init_hidden(BATCH_SIZE)  # Initialize the hidden states
    trainOrder = list(range(0, train_data.size()[1] - 1, MAX_SEQ_LEN))
    np.random.shuffle(trainOrder)  # Visit the batch starting indices in random order
    for batch, i in enumerate(trainOrder):  # For every batch (batch #, batch starting index)
        data, targets = get_batch(train_data, i, myFactorsInfo)  # Get the batch starting at index i
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()  # Before the backward pass, make sure the gradients are all zero
        output, hidden = model(data, hidden)  # Forward pass with the carried-over hidden state
        loss = criterion(output.view(-1, ntokens), targets)  # Loss over all time steps, with outputs flattened to (steps * batch, ntokens)
        loss.backward()  # Backward pass; this populates the gradients
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP_GRADIENTS)
        optimizer.step()  # Update the weights; the optimizer was initialized with the model's parameters, which is how it keeps track of them
        total_loss += loss.data  # Update the running loss
        if batch % DEUBG_LOG_INTERVAL == 0 and batch > 0:  # Periodically log progress
            cur_loss = total_loss[0] / DEUBG_LOG_INTERVAL  # Average loss over the logging interval
            elapsed = time.time() - start_time  # Time spent on this logging interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr ADAM | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, train_data.size()[1] // MAX_SEQ_LEN,
                      elapsed * 1000 / DEUBG_LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            total_loss = 0  # Reset the running loss
            start_time = time.time()  # Reset the timer
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, BPTT):
        data, targets = batching.get_batch(data_source, i, evaluation=True)
        #output, hidden = model(data, hidden)
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
def train(stochastic):
    global optimizer
    if MODEL_TYPE == "QRNN":
        model.reset()
    if not stochastic:
        # Switch to averaged SGD (ASGD) once the stochastic phase is over.
        optimizer = torch.optim.ASGD(model.parameters(), lr=INITIAL_LEARNING_RATE, t0=0,
                                     lambd=0., weight_decay=WEIGHT_DECAY)
    total_loss = 0
    begin_t = time.time()
    hidden = model.init_hidden(BATCH_SIZE)
    i = 0
    while i < train_data.size(0) - 2:
        # With probability 0.95 use the full BPTT length, otherwise half of it.
        prob = 0.95
        rand_prob = np.random.random()
        if rand_prob < prob:
            bptt = BPTT
        else:
            bptt = BPTT / 2
        # Sample the actual window length around bptt and clamp it to a sane range.
        s = 5
        window = max(s, int(np.random.normal(bptt, s)))
        window = min(window, BPTT + 10)
        # Rescale the learning rate in proportion to the sampled window length.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * window / BPTT
        model.train()
        X, Y = batching.get_batch(train_data, i, seq_len=window)
        # Detach the hidden state so gradients do not flow back to the start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden = model(X, hidden)
        loss_base = criterion(output.view(-1, num_tokens), Y)
        # Activation Regularization (AR) and Temporal Activation Regularization (TAR).
        ar_loss = ALPHA * model.ar_fragment
        tar_loss = BETA * model.tar_fragment
        loss = loss_base + ar_loss + tar_loss
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP_GRADIENTS)
        optimizer.step()
        total_loss += loss_base.data
        # Restore the base learning rate.
        optimizer.param_groups[0]['lr'] = lr2
        i += window
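# This variant reads the regularization terms from `model.ar_fragment` and
# `model.tar_fragment` instead of computing them inline. Below is a minimal
# sketch of how a model's forward pass could populate those attributes, assuming
# the same AR / TAR formulas used in the inline version above (squared mean of
# the dropped last-layer outputs, and squared mean of their step-to-step
# differences); the attribute names come from the call sites here, everything
# else is illustrative.
def store_regularization_fragments(self, rnn_hs, dropped_rnn_hs):
    # rnn_hs / dropped_rnn_hs: per-layer outputs of shape (seq_len, batch, hidden),
    # before and after the output dropout, as returned when return_h=True.
    # AR: penalize large activations on the (dropped) last layer. ALPHA is
    # applied by the caller, so the stored fragment is unscaled.
    self.ar_fragment = dropped_rnn_hs[-1].pow(2).mean()
    # TAR: penalize large changes between consecutive time steps ("slowness").
    last = rnn_hs[-1]
    self.tar_fragment = (last[1:] - last[:-1]).pow(2).mean()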
def evaluate(data_source):
    model.eval()  # Turn on evaluation mode which disables dropout.
    total_loss = 0  # Start the running loss at zero
    ntokens = myFactorsInfo.getFactorVocabSize(WORD_FACTOR)  # Vocabulary size
    hidden = model.init_hidden(EVAL_BATCH_SIZE)  # Initialize the hidden states
    for i in range(0, data_source.size(0) - 1, MAX_SEQ_LEN):  # For every batch starting index
        data, targets = get_batch(data_source, i, myFactorsInfo,
                                  evaluation=True)  # Get the batch in evaluation mode
        output, hidden = model(data, hidden)  # Get the output of the model
        output_flat = output.view(-1, ntokens)  # Flatten the outputs over all time steps to (steps * batch, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data  # Accumulate the loss of the predictions, weighted by sequence length
        hidden = repackage_hidden(hidden)  # Detach the hidden states between batches
    return total_loss[0] / len(data_source)  # Return the average loss
if args.temperature < 1e-3:
    parser.error("--temperature has to be greater than or equal to 1e-3")

with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

corpus = data.Corpus(args.data)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
# Seed generation with a single random word index.
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args.cuda:
    input.data = input.data.cuda()

with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output, hidden = model(input, hidden)
        # Temperature-scaled sampling over the output distribution.
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.data.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]
        outf.write(word + ('\n' if i % 20 == 19 else ' '))
        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))
def evaluate(data):
    wordCache = None
    hiddenCache = None
    windowStartIndex = None
    model.eval()
    totalLoss = 0
    uncachedHiddenState = model.init_hidden(TEST_BATCH_SIZE)
    for i in range(0, data.size(0) - 1, BPTT):
        X, Y = batching.get_batch(data, i, evaluation=True)
        output, uncachedHiddenState = model(X, uncachedHiddenState)
        predictions = output.view(-1, vocabSize)
        outerMostHidden = model.rnns_before_drop[-1].squeeze()
        # Set our starting position for the window that will keep track of the cache.
        # Update the words in the cache based on the one-hot vectors for the targets.
        # Update the hidden states in the cache based on what has been generated before.
        oneHots = torch.cat([oneHotify(label.data[0], vocabSize) for label in Y])
        hiddenValuesToCache = Variable(outerMostHidden.data)
        if wordCache is None:
            # The cache hasn't been initialized yet.
            wordCache = oneHots
            hiddenCache = hiddenValuesToCache
            windowStartIndex = 0
        else:
            # The cache has been initialized; append to it.
            wordCache = torch.cat([wordCache, oneHots])
            hiddenCache = torch.cat([hiddenCache, hiddenValuesToCache], dim=0)
            windowStartIndex = len(wordCache)
        softmaxOutputs = torch.nn.functional.softmax(predictions)
        currentLoss = 0
        for wordIndex, modelProbs in enumerate(softmaxOutputs):
            # If we don't have the cache yet (as determined by the check below),
            # we still need a distribution to draw from.
            finalProbs = modelProbs
            # Once we are past the start of the cache window, use the cache.
            if windowStartIndex + wordIndex > CACHE_WINDOW_SIZE:
                try:
                    # Construct the window of the cache that we are going to operate over.
                    slicedWordCache = wordCache[windowStartIndex + wordIndex - CACHE_WINDOW_SIZE:windowStartIndex + wordIndex]
                    slicedHiddenCache = hiddenCache[windowStartIndex + wordIndex - CACHE_WINDOW_SIZE:windowStartIndex + wordIndex]
                    # Measure how well outerMostHidden correlates with the hidden values in the cache.
                    hiddenCorrelation = torch.mv(slicedHiddenCache, outerMostHidden[wordIndex])
                    # Pass the correlation values through a softmax so we can treat them as probabilities.
                    hiddenProbs = torch.nn.functional.softmax(THETA * hiddenCorrelation).view(-1, 1)
                    # Cache probabilities: the softmax weights times the one-hot vectors computed earlier.
                    # As the rows of slicedWordCache are one-hot vectors, this keeps the distribution valid.
                    cacheProbs = (hiddenProbs.expand_as(slicedWordCache) * slicedWordCache).sum(0).squeeze()
                    # Combine the cache and model distributions by linear interpolation.
                    finalProbs = LAMBDA * cacheProbs + (1 - LAMBDA) * modelProbs
                except ValueError:
                    pass
            probOfTargetWord = finalProbs[Y[wordIndex].data]
            currentLoss += (-torch.log(probOfTargetWord)).data[0]
        totalLoss += currentLoss / TEST_BATCH_SIZE
        uncachedHiddenState = repackage_hidden(uncachedHiddenState)
        # Keep only the most recent CACHE_WINDOW_SIZE entries in the cache.
        wordCache = wordCache[-CACHE_WINDOW_SIZE:]
        hiddenCache = hiddenCache[-CACHE_WINDOW_SIZE:]
    print(totalLoss, len(data))
    final_loss = totalLoss / len(data)
    print("Evaluation - Loss: " + str(final_loss) + " Perplexity: " + str(math.exp(final_loss)))
    return final_loss
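# The cache evaluation above relies on an `oneHotify` helper that is not defined
# in this section. Below is a minimal sketch, assuming it returns a 1 x vocabSize
# one-hot row (as a Variable) so the results can be stacked with torch.cat() and
# combined with the cache probabilities.
import torch
from torch.autograd import Variable


def oneHotify(wordIndex, vocabSize):
    # Build a 1 x vocabSize row vector with a single 1 at the target word's index.
    vec = torch.zeros(1, vocabSize)
    vec[0, wordIndex] = 1
    return Variable(vec)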