コード例 #1
0
def train(model, train_data, train_data2, target_data, optimizer, criterion, params, epoch, args):
    """Run one training pass over ``train_data`` with randomly sized windows.

    Each iteration draws a window length, temporarily rescales the learning
    rate of the first optimizer parameter group in proportion to that length,
    performs one forward/backward/step cycle, and then restores the learning
    rate. Progress is printed every ``args["log_interval"]`` batches.

    Args:
        model: Network exposing ``reset``, ``init_hidden``,
            ``is_attention_model``, ``reset_last_layer`` and ``train``; it is
            called as ``model(data, data2, hidden, return_h=True)``.
        train_data: Primary input source; its ``size(0)`` bounds the pass.
        train_data2: Second input source, sliced one position after
            ``train_data``.
        target_data: Target source, sliced one position after ``train_data``
            and flattened with ``view(-1)``.
        optimizer: Optimizer whose ``param_groups[0]['lr']`` is rescaled per
            batch.
        criterion: Callable invoked as ``criterion(output, targets)``.
        params: Parameters handed to ``clip_grad_norm_`` when clipping is on.
        epoch: Epoch number, used only in the progress line.
        args: Mapping read for ``model_type``, ``batch_size``, ``bptt``,
            ``alpha``, ``beta``, ``clip`` and ``log_interval``.

    Returns:
        None.
    """
    if args["model_type"] == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(args["batch_size"])
    step = 0
    offset = 0
    while offset < train_data.size(0) - 2:
        if model.is_attention_model():
            model.reset_last_layer()

        # Use the nominal window 95% of the time and half of it otherwise.
        if np.random.random() < 0.95:
            bptt = args["bptt"]
        else:
            bptt = args["bptt"] / 2.
        # Jitter the length around bptt, never going below 5 positions.
        # NOTE: there is no upper cap, so a rare very long draw is possible.
        seq_len = max(5, int(np.random.normal(bptt, 5)))

        # Scale the step size with the window length; undone after the step.
        base_lr = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = base_lr * seq_len / args["bptt"]

        # Training mode is (re-)enabled on every batch.
        model.train()
        inputs = get_batch(train_data, offset, args, seq_len=seq_len)
        inputs2 = get_batch(train_data2, offset + 1, args, seq_len=seq_len)
        labels = get_batch(target_data, offset + 1, args, seq_len=seq_len).view(-1)

        # Re-wrap the carried state before reuse (presumably detaching it so
        # gradients do not flow back through earlier batches -- see
        # repackage_hidden).
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(inputs, inputs2, hidden, return_h=True)
        base_loss = criterion(output, labels)
        loss = base_loss

        # Activation regularization on the last entry of dropped_rnn_hs.
        alpha = args["alpha"]
        if alpha:
            loss = loss + sum(alpha * act.pow(2).mean() for act in dropped_rnn_hs[-1:])

        # Temporal activation regularization: penalize differences between
        # consecutive slices along dim 0 of the last entry of rnn_hs.
        beta = args["beta"]
        if beta:
            loss = loss + sum(beta * (act[1:] - act[:-1]).pow(2).mean() for act in rnn_hs[-1:])

        loss.backward()

        # Gradient-norm clipping guards against exploding gradients.
        max_norm = args["clip"]
        if max_norm:
            torch.nn.utils.clip_grad_norm_(params, max_norm)
        optimizer.step()

        # Only the un-regularized loss is accumulated for reporting.
        total_loss += base_loss.data
        optimizer.param_groups[0]['lr'] = base_lr

        if step > 0 and step % args["log_interval"] == 0:
            cur_loss = total_loss.item() / args["log_interval"]
            elapsed = time.time() - start_time
            template = ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}')
            print(template.format(
                epoch,
                step,
                len(train_data) // args["bptt"],
                optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args["log_interval"],
                cur_loss,
                math.exp(cur_loss),
                cur_loss / math.log(2),
            ))
            total_loss = 0
            start_time = time.time()

        step += 1
        offset += seq_len
        # Drop per-batch references before the next iteration allocates.
        del inputs, inputs2, labels, base_loss
コード例 #2
0
def evaluate(model, criterion, args, data_source, batch_size=10):
    """Return the length-weighted mean loss of ``model`` over ``data_source``.

    The source is walked in strides of ``args["bptt"]``. Each batch loss is
    weighted by ``len(data)``, and the accumulated total is divided by
    ``len(data_source)``.

    Args:
        model: Network exposing ``eval``, ``reset``, ``init_hidden``,
            ``is_attention_model`` and ``reset_last_layer``; it is called as
            ``model(data, hidden)`` and must return ``(output, hidden)``.
        criterion: Callable invoked as ``criterion(output, targets)``.
        args: Mapping read for ``model_type`` and ``bptt``; also forwarded to
            ``get_batch``.
        data_source: Evaluation data supporting ``size(0)`` and ``len``.
        batch_size: Batch size passed to ``model.init_hidden``.

    Returns:
        The mean loss as a Python float. It is ``0.0`` when ``data_source``
        is too short (``size(0) <= 1``) for any batch to be evaluated.
    """
    # Function-scope import so this block does not depend on a module-level
    # import that is not visible from here.
    import torch

    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args["model_type"] == 'QRNN':
        model.reset()
    total_loss = 0
    hidden = model.init_hidden(batch_size)
    # Nothing is back-propagated during evaluation, so disable autograd to
    # avoid building (and holding memory for) a graph on every forward pass.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args["bptt"]):
            if model.is_attention_model():
                model.reset_last_layer()
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output, hidden = model(data, hidden)
            # Weight each batch loss by its length so that a shorter final
            # batch contributes proportionally.
            total_loss += len(data) * criterion(output, targets).data
            hidden = repackage_hidden(hidden)
    # float() accepts both the initial int 0 (no batch evaluated) and the
    # accumulated single-element tensor, whereas .item() fails on the int.
    return float(total_loss) / len(data_source)