Example #1
def val_loss_func(data_source):
    # Compute the cross-entropy loss on a single BPTT-length batch of data_source.
    val_hidden = model.init_hidden(args.batch_size)

    # model.eval()  # Turn on evaluation mode which disables dropout. TODO: Do we want this here?
    ntokens = len(corpus.dictionary)
    seq_len = args.bptt

    data, targets = get_batch(data_source, 0, args, seq_len=seq_len)
    # Detach the hidden state so the graph does not extend into earlier batches.
    val_hidden = repackage_hidden(val_hidden)

    output, val_hidden, rnn_hs, dropped_rnn_hs = model(data, val_hidden, return_h=True)
    val_loss = criterion(output.view(-1, ntokens), targets)

    return val_loss
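
These snippets assume a repackage_hidden helper that detaches the recurrent state from the computation graph of previous batches. A minimal sketch in the style of the standard PyTorch word-language-model recipe (the exact helper in this codebase may differ):

import torch

def repackage_hidden(h):
    # Wrap hidden states in new tensors detached from their history, so that
    # backpropagation through time stops at the current batch.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)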
Example #2
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()

    with torch.no_grad():
        total_loss = 0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output, hidden = model(data, hidden)
            total_loss += len(data) * criterion(output.view(-1, ntokens),
                                                targets).data
            hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
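
evaluate() and the loss functions above also rely on a get_batch helper that slices the batchified token tensor into an input window and its next-token targets. A plausible sketch, assuming data laid out as (time steps x batch) as in the AWD-LSTM examples; the evaluation flag is accepted only for call-site compatibility:

def get_batch(source, i, args, seq_len=None, evaluation=False):
    # Take up to seq_len (default args.bptt) time steps starting at row i;
    # the targets are the inputs shifted by one step, flattened for the loss.
    seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target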
Example #3
def train_loss_func():
    train_hidden = model.init_hidden(args.batch_size)
    ntokens = len(corpus.dictionary)
    seq_len = args.bptt

    # Turn on training mode which enables dropout.
    model.train()
    data, targets = get_batch(train_data, 0, args, seq_len=seq_len)
    output, train_hidden, rnn_hs, dropped_rnn_hs = model(data, train_hidden, return_h=True)
    xentropy_loss = criterion(output.view(-1, ntokens), targets)
    loss = xentropy_loss

    # Add the model's L2 weight-decay penalty when a decay scheme is configured.
    if args.wdecay_type in ['global', 'per_layer', 'per_param']:
        loss = loss + model.L2_loss()

    return xentropy_loss, loss
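
A hypothetical call site for train_loss_func, showing how the two returned values would typically be used; optimizer, args.clip, and the logging line are assumptions rather than part of the original snippet:

xentropy_loss, loss = train_loss_func()

optimizer.zero_grad()
loss.backward()  # cross-entropy plus the optional L2 penalty
if args.clip:
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
optimizer.step()

print('train xent {:5.2f} | ppl {:8.2f}'.format(
    xentropy_loss.item(), math.exp(xentropy_loss.item())))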
Example #4
def train():

    global global_iteration

    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0

    train_losses = []

    while i < train_data.size(0) - 1 - 1:

        seq_len = args.bptt

        # Rescale the learning rate by the sequence length (a no-op here, since
        # seq_len == args.bptt; kept from the variable-length BPTT recipe).
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        # Optionally perturb hyperparameter values (via Gaussian or sinusoid perturbations)
        # ---------------------------------------------------------------------------------
        for hparam in ['dropoute', 'dropouti', 'dropouth', 'dropouto']:
            if getattr(args, 'perturb_' + hparam):
                if args.perturb_type == 'gaussian':
                    gaussian_perturbation = args.perturb_std * np.random.randn()
                    use_hparam_value = getattr(args, hparam) + gaussian_perturbation
                elif args.perturb_type == 'sinusoid':
                    use_hparam_value = (getattr(args, hparam) + args.amplitude *
                                        np.sin(multipliers[hparam] * 2 * np.pi))
                    multipliers[hparam] += args.multiplier_increment

                use_hparam_value = np.clip(use_hparam_value, 0.0, 0.99)
                setattr(model, hparam, use_hparam_value)
        # ---------------------------------------------------------------------------------

        output, hidden, rnn_hs, dropped_rnn_hs = model(data,
                                                       hidden,
                                                       return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)

        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta *
                              (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        train_losses.append(raw_loss.item())

        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time

            val_loss = evaluate(val_data, eval_batch_size)

            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | trn ppl {:8.2f} | val ppl {:8.2f} | bpc {:8.3f}'
                .format(epoch, batch,
                        len(train_data) // args.bptt,
                        optimizer.param_groups[0]['lr'],
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss), math.exp(val_loss),
                        cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()

        ###
        batch += 1
        i += seq_len
        global_iteration += 1

    return np.mean(train_losses)
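
Finally, a minimal outer loop illustrating how train() and evaluate() might be driven together; args.epochs, eval_batch_size, and the checkpoint path are assumptions, not part of the original code:

best_val_loss = float('inf')
for epoch in range(1, args.epochs + 1):
    epoch_start = time.time()
    train_loss = train()
    val_loss = evaluate(val_data, eval_batch_size)
    print('| end of epoch {:3d} | time {:5.2f}s | train loss {:5.2f} | '
          'valid loss {:5.2f} | valid ppl {:8.2f}'.format(
              epoch, time.time() - epoch_start, train_loss, val_loss,
              math.exp(val_loss)))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'model_best.pt')  # hypothetical checkpoint path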