def evaluate(data, mode):
    if mode == 'val_mode':
        bsz = val_batch_size
    elif mode == 'test_mode':
        bsz = test_batch_size
    global loss_least
    global num_tokens
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(bsz)
    for i in range(0, data.size(0) - 1, BPTT):
        X, Y = batching.get_batch(data, i, evaluation=True)
        output, hidden = model(X, hidden)
        predictions = output.view(-1, num_tokens)
        total_loss += len(X) * criterion(predictions, Y).data
        hidden = repackage_hidden(hidden)
    final_loss = total_loss[0] / len(data)
    print("Epoch: " + str(epoch) + " Val Loss: " + str(final_loss) +
          " Val Perplexity: " + str(math.exp(final_loss)))
    '''
    if final_loss < loss_least:
            with open(MODEL_SAVE_PATH, 'wb') as f:
                torch.save(model, f)
            loss_least = final_loss
    '''
    return final_loss
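# The repackage_hidden helper used throughout these snippets is defined elsewhere.
# A minimal sketch, assuming PyTorch 0.3-style Variables (the same Variable class
# the snippets below already rely on), would simply detach the hidden state:
def repackage_hidden(h):
    """Wrap hidden states in new Variables to detach them from their history."""
    if isinstance(h, (tuple, list)):
        return tuple(repackage_hidden(v) for v in h)
    return Variable(h.data)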
Example 2
def train():
    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    #ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(BATCH_SIZE)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = BPTT if np.random.random() < 0.95 else BPTT / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / BPTT
        model.train()
        data, targets = batching.get_batch(train_data, i, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data,
                                                       hidden,
                                                       return_h=True)
        raw_loss = criterion(output.view(-1, num_tokens), targets)

        loss = raw_loss
        # Activation Regularization (AR)
        loss = loss + sum(ALPHA * dropped_rnn_h.pow(2).mean()
                          for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(BETA * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                          for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP_GRADIENTS)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % DEUBG_LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss[0] / DEUBG_LOG_INTERVAL
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // BPTT, optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / DEUBG_LOG_INTERVAL, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
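# The epoch-level driver is not shown in these snippets. A minimal sketch of how
# train() and evaluate() are usually wired together; N_EPOCHS and val_data are
# assumptions, while MODEL_SAVE_PATH and loss_least mirror the names used above:
N_EPOCHS = 40  # assumed number of epochs, for illustration only
loss_least = float('inf')
for epoch in range(1, N_EPOCHS + 1):
    train()
    val_loss = evaluate(val_data, 'val_mode')
    if val_loss < loss_least:
        # Checkpoint the model whenever validation loss improves.
        with open(MODEL_SAVE_PATH, 'wb') as f:
            torch.save(model, f)
        loss_least = val_loss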
def train():
    model.train()  # Turn on training mode which enables dropout.
    total_loss = 0  # Running loss for the current logging interval.
    start_time = time.time()  # Timer for per-interval throughput logging.
    ntokens = myFactorsInfo.getFactorVocabSize(WORD_FACTOR)  # Vocabulary size.
    hidden = model.init_hidden(BATCH_SIZE)  # Initial hidden states.
    # Visit the batch starting indices in a random order each epoch.
    trainOrder = list(range(0, train_data.size()[1] - 1, MAX_SEQ_LEN))
    np.random.shuffle(trainOrder)
    for batch, i in enumerate(trainOrder):  # (batch number, batch starting index)
        # Get the batch based on the training data and the batch starting index.
        data, targets = get_batch(train_data, i, myFactorsInfo)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()  # Make sure the gradients are zeroed before the backward pass.
        # Forward pass on the current batch, carrying the hidden state forward.
        output, hidden = model(data, hidden)
        # Loss between the flattened predictions (all timesteps) and the targets.
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()  # Backward pass; populates the gradients.

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP_GRADIENTS)
        # Step the optimizer to update the weights; it was constructed with the
        # model's parameters, so it knows which tensors to update.
        optimizer.step()

        total_loss += loss.data  # Accumulate the loss for logging.

        if batch % DEUBG_LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss[0] / DEUBG_LOG_INTERVAL  # Mean loss over the interval.
            elapsed = time.time() - start_time  # Time spent on this logging interval.
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr ADAM | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    train_data.size()[1] // MAX_SEQ_LEN,
                    elapsed * 1000 / DEUBG_LOG_INTERVAL, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0  # Reset the running loss.
            start_time = time.time()  # Reset the interval timer.
Example 4
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, BPTT):
        data, targets = batching.get_batch(data_source, i, evaluation=True)
        #output, hidden = model(data, hidden)
        output, hidden, rnn_hs, dropped_rnn_hs = model(data,
                                                       hidden,
                                                       return_h=True)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
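# batching.get_batch is not defined in these snippets. A hedged sketch, modeled on
# the standard PyTorch word-language-model helper and matching the seq_len /
# evaluation keywords used above (BPTT and Variable come from the surrounding module;
# the factored get_batch(..., myFactorsInfo) variant used elsewhere differs):
def get_batch(source, i, seq_len=None, evaluation=False):
    """Slice a (seq_len x batch) chunk of inputs plus the shifted targets."""
    seq_len = min(seq_len if seq_len else BPTT, len(source) - 1 - i)
    data = Variable(source[i:i + seq_len], volatile=evaluation)
    target = Variable(source[i + 1:i + 1 + seq_len].view(-1))
    return data, target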
Example 5
def train(stochastic):
    global optimizer
    if MODEL_TYPE == "QRNN":
        model.reset()
    if not stochastic:
        # Switch from SGD to Averaged SGD (ASGD) for the non-stochastic phase.
        optimizer = torch.optim.ASGD(model.parameters(), lr=INITIAL_LEARNING_RATE,
                                     t0=0, lambd=0., weight_decay=WEIGHT_DECAY)
    
    total_loss = 0
    begin_t = time.time()
    hidden = model.init_hidden(BATCH_SIZE)
    i = 0
    while i < train_data.size(0)-2:
        prob = 0.95
        rand_prob = np.random.random()
        if rand_prob < prob:
            bptt = BPTT
        else:
            bptt = BPTT/2
        s = 5
        window = max(s, int(np.random.normal(bptt, s)))
        window = min(window, BPTT+10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * window / BPTT

        model.train()
        X, Y = batching.get_batch(train_data, i, seq_len=window)

        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()  # Clear any previously accumulated gradients.

        output, hidden = model(X, hidden)
        loss_base = criterion(output.view(-1, num_tokens), Y)
        ar_loss = ALPHA * model.ar_fragment    # Activation regularization (AR)
        tar_loss = BETA * model.tar_fragment   # Temporal activation regularization (TAR)
        
        loss = loss_base + ar_loss + tar_loss
        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP_GRADIENTS)

        optimizer.step()

        total_loss += loss_base.data
        optimizer.param_groups[0]['lr'] = lr2
        i += window
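# How the stochastic flag passed to train() above is chosen is not shown. In
# AWD-LSTM-style training, plain SGD is typically swapped for ASGD once validation
# loss stops improving for a few epochs (a non-monotonic trigger). A hedged sketch;
# best_val_losses and the nonmono window are assumptions, not names from the code above:
def use_stochastic_sgd(val_loss, best_val_losses, nonmono=5):
    """Return False (i.e. switch to ASGD) once validation loss is non-monotonic."""
    triggered = (len(best_val_losses) > nonmono and
                 val_loss > min(best_val_losses[:-nonmono]))
    return not triggered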
def evaluate(data_source):
    model.eval()  # Turn on evaluation mode which disables dropout.
    total_loss = 0  # Running total of the loss over the data source.
    ntokens = myFactorsInfo.getFactorVocabSize(WORD_FACTOR)  # Vocabulary size.
    hidden = model.init_hidden(EVAL_BATCH_SIZE)  # Initial hidden states.
    # Walk over the data source one MAX_SEQ_LEN window at a time.
    for i in range(0, data_source.size(0) - 1, MAX_SEQ_LEN):
        # Get the batch in evaluation mode (no gradients needed).
        data, targets = get_batch(data_source, i, myFactorsInfo, evaluation=True)
        output, hidden = model(data, hidden)  # Forward pass.
        # Flatten the predictions to (timesteps * batch) x ntokens.
        output_flat = output.view(-1, ntokens)
        # Accumulate the loss of the predictions, weighted by chunk length.
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)  # Detach the hidden states.
    return total_loss[0] / len(data_source)  # Average loss per position.
Example 7
if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

corpus = data.Corpus(args.data)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args.cuda:
    input.data = input.data.cuda()

with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.data.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]

        outf.write(word + ('\n' if i % 20 == 19 else ' '))

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))
def evaluate(data):

    wordCache = None
    hiddenCache = None
    windowStartIndex = None

    model.eval()
    totalLoss = 0
    uncachedHiddenState = model.init_hidden(TEST_BATCH_SIZE)
    for i in range(0, data.size(0) - 1, BPTT):
        X, Y = batching.get_batch(data, i, evaluation=True)
        output, uncachedHiddenState = model(X, uncachedHiddenState)
        predictions = output.view(-1, vocabSize)
        outerMostHidden = model.rnns_before_drop[-1].squeeze()

        #Set our starting position for the window that will keep track of the cache
        #Update the words in the cache based on the one hot vectors for the targets
        #Update the hidden states in the cache based on what has been generated before
        oneHots = torch.cat(
            [oneHotify(label.data[0], vocabSize) for label in Y])
        hiddenValuesToCache = Variable(outerMostHidden.data)
        # If the cache hasn't been initialized yet...
        if wordCache is None:
            wordCache = oneHots
            hiddenCache = hiddenValuesToCache
            windowStartIndex = 0
        # If the cache has already been initialized...
        else:
            wordCache = torch.cat([wordCache, oneHots])
            hiddenCache = torch.cat([hiddenCache, hiddenValuesToCache], dim=0)
            windowStartIndex = len(wordCache)

        softmaxOutputs = torch.nn.functional.softmax(predictions)
        currentLoss = 0
        for wordIndex, modelProbs in enumerate(softmaxOutputs):

            # Default to the model's own distribution; until a full cache window is
            # available we still need something to draw probabilities from.
            finalProbs = modelProbs
            # Once enough history has accumulated to fill a cache window, mix in the cache.
            if windowStartIndex + wordIndex > CACHE_WINDOW_SIZE:
                #Construct the window of the cache that we are going to be operating over
                try:
                    slicedWordCache = wordCache[
                        windowStartIndex + wordIndex -
                        CACHE_WINDOW_SIZE:windowStartIndex + wordIndex]
                    slicedHiddenCache = hiddenCache[
                        windowStartIndex + wordIndex -
                        CACHE_WINDOW_SIZE:windowStartIndex + wordIndex]

                    #Construct a vector of values that describe how well outerMostHidden correlates with the hidden values in the cache
                    hiddenCorrelation = torch.mv(slicedHiddenCache,
                                                 outerMostHidden[wordIndex])
                    #Pass the correlation values through a softmax so we can think of them as probabilities
                    hiddenProbs = torch.nn.functional.softmax(
                        THETA * hiddenCorrelation).view(-1, 1)
                    #Calculate cache probabilities based on the probs from the softmax above times the one hot vectors we calculated earlier.
                    #As the values in slicedWordCache are one hot vectors this will not change the nature of this distribution
                    cacheProbs = (hiddenProbs.expand_as(slicedWordCache) *
                                  slicedWordCache).sum(0).squeeze()

                    #Calculate the combined probabilities for the cache and the model based on a linear interpolation
                    finalProbs = LAMBDA * cacheProbs + (1 -
                                                        LAMBDA) * modelProbs
                except ValueError:
                    # Fall back to the model-only distribution if the slice is malformed.
                    pass
            probOfTargetWord = finalProbs[Y[wordIndex].data]
            currentLoss += (-torch.log(probOfTargetWord)).data[0]
        totalLoss += currentLoss / TEST_BATCH_SIZE

        uncachedHiddenState = repackage_hidden(uncachedHiddenState)
        wordCache = wordCache[-CACHE_WINDOW_SIZE:]
        hiddenCache = hiddenCache[-CACHE_WINDOW_SIZE:]
    print(totalLoss, len(data))
    final_loss = totalLoss / len(data)
    print("Evaluation - Loss: " + str(final_loss) + " Perplexity: " +
          str(math.exp(final_loss)))

    return final_loss
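# oneHotify, used in the cache evaluation above, is defined elsewhere. A minimal
# sketch (an assumption about its behavior): return a 1 x vocabSize row holding a
# single 1 at the target index, wrapped as a Variable like the rest of the snippet expects:
def oneHotify(index, vocab_size):
    vec = torch.zeros(1, vocab_size)
    vec[0][index] = 1
    return Variable(vec)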