Beispiel #1
0
def validate_step(model, valid_dl, criterion):
    """Run a single validation pass over ``valid_dl``.

    Returns a tuple ``(mean loss per example, accuracy)``.
    """
    model.eval()

    # Fresh hidden state for the validation pass.
    model.hidden = model.init_hidden()

    running_loss = 0.0
    running_correct = 0.0
    n_examples = 0.0

    progress = tqdm_notebook(enumerate(valid_dl),
                             desc='Validation',
                             total=len(valid_dl))
    for _, (inputs, labels) in progress:
        inputs = to_var(inputs, True)
        labels = to_var(labels, True)

        # Skip ragged final batches so hidden-state shapes stay fixed.
        if len(labels) < valid_dl.batch_size:
            continue

        output = model(inputs.t())
        loss = criterion(output, labels)

        # Accumulate accuracy and loss statistics.
        _, predicted = torch.max(output.data, 1)
        running_correct += (predicted == labels.data).sum()
        running_loss += loss.data[0]
        n_examples += len(labels)

        # Cut the autograd graph between batches.
        model.hidden = detach(model.hidden)

    return running_loss / n_examples, running_correct / n_examples
Beispiel #2
0
def validate_epoch(epoch, model, val_ids, criterion, num_epochs, batch_size,
                   seq_length):
    """Evaluate ``model`` on ``val_ids`` for one epoch.

    Args:
        epoch: Zero-based index of the current epoch (reporting only).
        model: Recurrent LM called as ``model(inputs, states)``.
        val_ids: Token-id tensor; dim 1 is the time axis.
        criterion: Loss applied to ``(outputs, targets.view(-1))``.
        num_epochs: Total number of epochs (reporting only).
        batch_size: Batch size used to initialise the hidden states.
        seq_length: Truncated-BPTT window length.

    Returns:
        The running-average validation loss over the epoch.
    """
    model.eval()
    states = model.init_hidden(batch_size)
    num_batches = val_ids.size(1) // seq_length
    val_loss = 0.0
    val_acc = 0.0

    for i in range(0, val_ids.size(1) - seq_length, seq_length):
        # BUG FIX: the running averages previously divided by ``i + 1``
        # although ``i`` advances in strides of ``seq_length``, which
        # over-weighted the history. Use the number of completed batches.
        batch_idx = i // seq_length

        inputs = to_var(val_ids[:, i:i + seq_length], volatile=True)
        targets = to_var(val_ids[:, (i + 1):(i + 1) + seq_length].contiguous())

        # Forward; detach so no graph spans batch boundaries.
        states = detach(states)
        outputs, states = model(inputs, states)

        # accuracy (running average over batches)
        _, predictions = torch.max(outputs, dim=1)
        acc = torch.mean((predictions == targets.view(-1)).float())
        val_acc = (val_acc * batch_idx + acc.data[0]) / (batch_idx + 1)

        # loss (running average over batches)
        loss = criterion(outputs, targets.view(-1))
        val_loss = (val_loss * batch_idx + loss.data[0]) / (batch_idx + 1)

        # report
        step = (i + 1) // seq_length
        sys.stdout.flush()
        sys.stdout.write(
            '\rValidation: Epoch [%d/%d], Step [%d/%d], Loss: %.3f, Perp: %.2f, Acc: %-15.2f'
            % (epoch + 1, num_epochs, step + 1, num_batches, val_loss,
               np.exp(val_loss), val_acc))

    return val_loss
Beispiel #3
0
    def evaluate(self, source, dag, name, batch_size=1, max_num=None):
        """Evaluate on the validation set.

        NOTE(brendan): We should not be using the test set to develop the
        algorithm (basic machine learning good practices).

        Args:
            source: Token tensor to evaluate on (time on dim 0).
            dag: Sampled architecture(s) passed to the shared model.
            name: Tag used for TensorBoard summaries and the log line.
            batch_size: Hidden-state batch size.
            max_num: Optional cap on the number of ``max_length`` chunks;
                ``None`` evaluates all of ``source``.
        """
        self.shared.eval()
        self.controller.eval()

        # BUG FIX: `source[:max_num * self.max_length]` raised a TypeError
        # when `max_num` was left at its default of None; fall back to the
        # full source in that case.
        if max_num is None:
            data = source
        else:
            data = source[:max_num * self.max_length]

        total_loss = 0
        hidden = self.shared.init_hidden(batch_size)

        # The per-step perplexity previously computed inside this loop was a
        # dead store (overwritten after the loop) and has been removed.
        for idx in range(0, data.size(0) - 1, self.max_length):
            inputs, targets = self.get_batch(data, idx, volatile=True)
            output, hidden, _ = self.shared(inputs,
                                            dag,
                                            hidden=hidden,
                                            is_train=False)
            output_flat = output.view(-1, self.dataset.num_tokens)
            # Weight each chunk's mean loss by its length so the final
            # average below is per-token.
            total_loss += len(inputs) * self.ce(output_flat, targets).data
            hidden = utils.detach(hidden)

        val_loss = utils.to_item(total_loss) / len(data)
        ppl = math.exp(val_loss)

        self.tb.scalar_summary(f'eval/{name}_loss', val_loss, self.epoch)
        self.tb.scalar_summary(f'eval/{name}_ppl', ppl, self.epoch)
        logger.info(f'eval | loss: {val_loss:8.2f} | ppl: {ppl:8.2f}')
    def evaluate(self, source, dag, name, batch_size=1, max_num=None):
        """Evaluate on the validation set.

        Args:
            source: Dataset (image case) or token tensor to evaluate on.
            dag: Sampled architecture(s) passed to the shared model.
            name: Tag used for TensorBoard summaries and the log line.
            batch_size: Hidden-state batch size.
            max_num: Optional cap on the number of ``max_length`` chunks for
                the sequence case; ``None`` evaluates all of ``source``.
        """
        self.shared.eval()
        self.controller.eval()

        if self.image_dataset:
            data = source
        elif max_num is None:
            # BUG FIX: slicing with `max_num * self.max_length` raised a
            # TypeError when `max_num` was left at its default of None.
            data = source
        else:
            data = source[:max_num * self.max_length]

        total_loss = 0
        hidden = self.shared.init_training(batch_size)

        # The per-step perplexity previously computed inside this loop was a
        # dead store (overwritten after the loop) and has been removed.
        for idx in range(0, self.valid_data_size - 1, self.max_length):
            inputs, targets = self.get_batch(data, idx, volatile=True)
            output, hidden, _ = self.shared(inputs,
                                            dag,
                                            hidden=hidden,
                                            is_train=False)
            output_flat = output.view(-1, self.dataset.num_classes)
            # Weight each chunk's mean loss by its length so the final
            # average below is per-token.
            total_loss += len(inputs) * self.ce(output_flat, targets).data
            hidden = utils.detach(hidden)

        val_loss = utils.to_item(total_loss) / len(data)
        ppl = math.exp(val_loss)

        self.tb.scalar_summary(f'eval/{name}_loss', val_loss, self.epoch)
        self.tb.scalar_summary(f'eval/{name}_ppl', ppl, self.epoch)
        logger.info(f'eval | loss: {val_loss:8.2f} | ppl: {ppl:8.2f}')
def evaluate(dataloader: DataLoader, model: RNN, loss_function: Union[SplitCrossEntropyLoss, CrossEntropyLoss],
             only_l: Union[torch.Tensor, int] = None, device: Union[torch.device, str] = 'cpu', **kwargs):
    """Evaluate ``model`` on every language in ``dataloader``.

    Args:
        dataloader: Yields ``(data, targets, seq_len, lang)`` batches.
        model: Multilingual RNN; called as ``model(data, hidden, lang)``.
        loss_function: Split or plain cross-entropy loss.
        only_l: If given, evaluate only this language (must exist in the
            dataset).
        device: Device to move batches onto.

    Returns:
        ``(mean of per-language average losses, {lang: average loss})``.
    """
    model.eval()

    languages = dataloader.dataset.data.keys()
    if only_l:
        if only_l not in languages:
            raise ValueError(f'Language {only_l} does not exist in the dataset')
        local_losses = {only_l: 0}
    else:
        local_losses = {lang: 0 for lang in languages}

    batch = 0
    prev_lang = ""

    with tqdm(dataloader, total=len(dataloader)) as pbar:
        for data, targets, seq_len, lang in pbar:
            data = data.squeeze(0).to(device)
            targets = targets.squeeze(0).to(device)
            lang = lang.to(device)

            if only_l and only_l != lang:
                continue

            # Re-initialise the hidden state whenever the language changes;
            # otherwise carry it over, cut from the previous graph.
            if prev_lang != lang:
                prev_lang = lang
                hidden = model.init_hidden(batchsize=data.size(-1))
            else:
                # BUG FIX: `detach(hidden)` returned the detached state but
                # the result was discarded, so the stale (attached) hidden
                # state kept being reused. Assign it back.
                hidden = detach(hidden)

            with torch.no_grad():
                output, hidden = model(data, hidden, lang)
                if isinstance(loss_function, SplitCrossEntropyLoss):
                    loss = loss_function(model.decoder.weight, model.decoder.bias, output, targets)
                else:
                    loss = loss_function(output, targets)
                # Weight the batch-mean loss by the batch length so the
                # final division yields a per-token average.
                local_losses[lang.item()] += len(data) * loss.data

            batch += 1

            pbar.set_description('Evaluation, finished batch {} | loss {}'.format(batch, loss.data))

    avg_loss = {lang: local_losses[lang].item() / len(dataloader.dataset.data[lang]) for lang in languages} if only_l is None else {only_l: local_losses[only_l].item() / len(dataloader.dataset.data[only_l])}
    total_loss = sum(avg_loss.values())

    return total_loss / len(languages), avg_loss
        def _run_shared_one_batch(inputs, targets, hidden, dags,
                                  raw_total_loss):
            """Run the shared model on one batch for the sampled ``dags``.

            Accumulates the (pre-penalty) sample loss into
            ``raw_total_loss``, detaches the hidden state, and adds the
            architecture penalties to both ``loss`` and ``rest_loss``
            before returning everything to the caller.
            """
            # global abs_max_grad
            # global abs_max_hidden_norm
            # global raw_total_loss
            loss, sample_loss, rest_loss, hidden, extra_out = self.get_loss(
                inputs, targets, dags, hidden=hidden)

            # Detach the hidden
            # Because they are input from previous state.
            hidden = utils.detach(hidden)
            # Average the raw loss over the batches of this iteration so it
            # can be reported separately from the penalized total.
            raw_total_loss += sample_loss.data / self.args.num_batch_per_iter
            penalty_loss = _apply_penalties(extra_out, self.args)
            loss += penalty_loss
            rest_loss += penalty_loss
            return loss, sample_loss, rest_loss, hidden, extra_out, raw_total_loss
Beispiel #7
0
def evaluate(data_source, batch_size):
    """Compute the average per-token loss over ``data_source``.

    MXNet runs in predict mode by default outside ``autograd.record``; see
    https://mxnet.incubator.apache.org/api/python/autograd/autograd.html#train-mode-and-predict-mode

    Returns ``(mean loss, elapsed seconds)``.
    """
    start = time.time()
    loss_total = 0
    token_count = 0
    states = model.begin_state(batch_size, ctx=ctxs[0])

    cursor = 0
    while cursor < data_source.shape[0] - 1:
        Xs, Ys = get_batch(data_source, cursor, args)
        # state(num_layers, bsz, hidden_size); By default, MXNet is in
        # predict_mode here.
        output, states, _, _ = model(Xs, states)
        # Cut the graph between BPTT windows.
        states = detach(states)
        # loss (seq_len,)
        loss_total += nd.sum(batch_size * loss(output, Ys)).asscalar()
        token_count += batch_size * len(output)
        cursor += args.bptt

    return (loss_total / token_count), time.time() - start
Beispiel #8
0
def train_epoch(epoch, model, trn_ids, criterion, optimizer, scheduler,
                num_epochs, batch_size, seq_length):
    """Train ``model`` on ``trn_ids`` for one epoch.

    Args:
        epoch: Zero-based index of the current epoch (reporting only).
        model: Recurrent LM called as ``model(inputs, states)``.
        trn_ids: Token-id tensor; dim 1 is the time axis.
        criterion: Loss applied to ``(outputs, targets.view(-1))``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per epoch.
        num_epochs: Total number of epochs (reporting only).
        batch_size: Batch size used to initialise the hidden states.
        seq_length: Truncated-BPTT window length.

    Returns:
        The running-average training loss over the epoch.
    """
    model.train()
    scheduler.step()
    states = model.init_hidden(batch_size)
    num_batches = trn_ids.size(1) // seq_length
    trn_loss = 0.0
    trn_acc = 0.0

    for i in range(0, trn_ids.size(1) - seq_length, seq_length):
        # BUG FIX: the running averages previously divided by ``i + 1``
        # although ``i`` advances in strides of ``seq_length``, which
        # over-weighted the history. Use the number of completed batches.
        batch_idx = i // seq_length

        inputs = to_var(trn_ids[:, i:i + seq_length])
        targets = to_var(trn_ids[:, (i + 1):(i + 1) + seq_length].contiguous())

        # Forward; detach so backprop stops at the batch boundary.
        states = detach(states)
        outputs, states = model(inputs, states)

        # accuracy (running average over batches)
        _, predictions = torch.max(outputs, dim=1)
        acc = torch.mean((predictions == targets.view(-1)).float())
        trn_acc = (trn_acc * batch_idx + acc.data[0]) / (batch_idx + 1)

        # loss (running average over batches)
        loss = criterion(outputs, targets.view(-1))
        trn_loss = (trn_loss * batch_idx + loss.data[0]) / (batch_idx + 1)

        # backward with gradient clipping
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.3)
        optimizer.step()

        # report
        step = (i + 1) // seq_length
        sys.stdout.flush()
        sys.stdout.write(
            '\rTraining: Epoch [%d/%d], Step [%d/%d], Loss: %.3f, Perp: %.2f, Acc: %-15.2f'
            % (epoch + 1, num_epochs, step + 1, num_batches, trn_loss,
               np.exp(trn_loss), trn_acc))

    return trn_loss
Beispiel #9
0
def train_step(model, train_dl, criterion, optimizer, scheduler):
    """Run one training epoch over ``train_dl``.

    Returns a tuple ``(mean loss per example, accuracy)``.
    """
    model.train()
    scheduler.step()

    # Fresh hidden state at the start of the epoch.
    model.hidden = model.init_hidden()

    running_correct = 0.0
    running_loss = 0.0
    n_examples = 0.0

    progress = tqdm_notebook(enumerate(train_dl),
                             desc='Training',
                             total=len(train_dl))
    for _, (inputs, labels) in progress:
        inputs, labels = to_var(inputs), to_var(labels)

        # Skip ragged final batches so hidden-state shapes stay fixed.
        if len(labels) < train_dl.batch_size:
            continue

        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to the start of the dataset.
        model.hidden = detach(model.hidden)
        model.zero_grad()

        output = model(inputs.t())
        loss = criterion(output, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.3)
        optimizer.step()

        # Accumulate accuracy and loss statistics.
        _, predicted = torch.max(output.data, 1)
        running_correct += (predicted == labels.data).sum()
        running_loss += loss.data[0]
        n_examples += len(labels)

    return running_loss / n_examples, running_correct / n_examples
Beispiel #10
0
    def train_shared(self, max_step=None):
        """Train the language model for 400 steps of minibatches of 64
        examples.

        Args:
            max_step: Used to run extra training steps as a warm-up.

        BPTT is truncated at 35 timesteps.

        For each weight update, gradients are estimated by sampling M models
        from the fixed controller policy, and averaging their gradients
        computed on a batch of training data.
        """
        model = self.shared
        model.train()
        # The controller only supplies architecture samples here; its
        # parameters are not updated, so keep it in eval mode.
        self.controller.eval()

        hidden = self.shared.init_hidden(self.args.batch_size)

        # Cap the number of update steps for this call.
        if max_step is None:
            max_step = self.args.shared_max_step
        else:
            max_step = min(self.args.shared_max_step, max_step)

        abs_max_grad = 0
        abs_max_hidden_norm = 0
        step = 0
        raw_total_loss = 0  # loss without architecture penalties (for logging)
        total_loss = 0      # loss including penalties
        train_idx = 0
        # TODO(brendan): Why - 1 - 1?
        #while train_idx < self.train_data.size(0) - 1 - 1:
        for _, (inputs, targets) in enumerate(self.train_data):
            if step > max_step:
                break

            # Sample candidate architectures from the (frozen) controller.
            dags = self.controller.sample(self.args.shared_num_sample)

            # inputs, targets = self.get_batch(self.train_data,
            #                                  train_idx,
            #                                  self.max_length)
            inputs, targets = prep_batch(inputs, targets)
            # NOTE(review): debug print left in the hot loop — consider
            # removing or demoting to logger.debug.
            print('batch_size', inputs.size())

            loss, hidden, extra_out = self.get_loss(inputs, targets, hidden,
                                                    dags)
            # Truncated BPTT: cut the graph at the batch boundary.
            hidden = utils.detach(hidden)
            raw_total_loss += loss.data

            # should only be for RNNs
            loss += _apply_penalties(extra_out, self.args)

            # update
            self.shared_optim.zero_grad()
            loss.backward()

            # Track the largest hidden-state norm seen so far (helps spot
            # exploding activations); log whenever a new maximum appears.
            h1tohT = extra_out['hiddens']
            new_abs_max_hidden_norm = utils.to_item(
                h1tohT.norm(dim=-1).data.max())
            if new_abs_max_hidden_norm > abs_max_hidden_norm:
                abs_max_hidden_norm = new_abs_max_hidden_norm
                logger.info(f'max hidden {abs_max_hidden_norm}')
            abs_max_grad = _check_abs_max_grad(abs_max_grad, model)
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          self.args.shared_grad_clip)
            self.shared_optim.step()

            total_loss += loss.data

            # Periodically summarize and reset the running loss totals.
            if ((step % self.args.log_step) == 0) and (step > 0):
                self._summarize_shared_train(total_loss, raw_total_loss)
                raw_total_loss = 0
                total_loss = 0

            step += 1
            self.shared_step += 1
            train_idx += self.max_length
Beispiel #11
0
    def train_shared(self, max_step=None):
        """Train the language model for 400 steps of minibatches of 64
        examples.

        Args:
            max_step: Used to run extra training steps as a warm-up.

        BPTT is truncated at 35 timesteps.

        For each weight update, gradients are estimated by sampling M models
        from the fixed controller policy, and averaging their gradients
        computed on a batch of training data.
        """
        model = self.shared
        model.train()
        # The controller only supplies architecture samples here; its
        # parameters are not updated, so keep it in eval mode.
        self.controller.eval()

        hidden = self.shared.init_hidden(self.args.batch_size)

        # Cap the number of update steps for this call.
        if max_step is None:
            max_step = self.args.shared_max_step
        else:
            max_step = min(self.args.shared_max_step, max_step)

        abs_max_grad = 0
        abs_max_hidden_norm = 0
        step = 0
        raw_total_loss = 0  # loss without architecture penalties (for logging)
        total_loss = 0      # loss including penalties
        train_idx = 0
        # NOTE(brendan): The - 1 - 1 here is because each example should
        # include at least one (x_t, y_{t + 1}) sequence, since y_{t + 1} is
        # predicted from x_t.
        while train_idx < self.train_data.size(0) - 1 - 1:
            # Variable-length BPTT: with 5% probability halve the window
            # (presumably following AWD-LSTM-style training — confirm).
            bptt = self.max_length
            if np.random.random() >= 0.95:
                bptt /= 2.

            # Draw the actual sequence length from N(bptt, 5), floored at 5.
            seq_len = int(np.random.normal(bptt, 5))
            seq_len = max(5, seq_len)

            # Rescale the learning rate proportionally to the sampled
            # sequence length; the saved value is restored after the update.
            saved_lr = self.shared_optim.param_groups[0]['lr']
            self.shared_optim.param_groups[0]['lr'] = saved_lr*seq_len/bptt

            # Sample candidate architectures from the (frozen) controller.
            dags = self.controller.sample(self.args.shared_num_sample)
            inputs, targets = self.get_batch(self.train_data,
                                             train_idx,
                                             seq_len)

            loss, hidden, extra_out = self.get_loss(inputs,
                                                    targets,
                                                    hidden,
                                                    dags)
            # Truncated BPTT: cut the graph at the batch boundary.
            hidden = utils.detach(hidden)
            raw_total_loss += loss.data.squeeze()

            loss += _apply_penalties(extra_out, self.args)

            # update
            self.shared_optim.zero_grad()
            loss.backward()

            # Track extreme hidden norms / gradients for diagnostics.
            abs_max_hidden_norm = _check_max_hidden(abs_max_hidden_norm,
                                                    extra_out['hiddens'])

            abs_max_grad = _check_abs_max_grad(abs_max_grad, model)
            torch.nn.utils.clip_grad_norm(model.parameters(),
                                          self.args.shared_grad_clip)

            self.shared_optim.step()

            total_loss += loss.data.squeeze()
            # Restore the learning rate saved before the seq_len rescale.
            self.shared_optim.param_groups[0]['lr'] = saved_lr

            # Periodically summarize and reset the running loss totals.
            if ((step % self.args.log_step) == 0) and (step > 0):
                self._summarize_shared_train(total_loss, raw_total_loss)
                raw_total_loss = 0
                total_loss = 0

            step += 1
            self.shared_step += 1
            train_idx += seq_len
def train(dataloader: DataLoader, model: RNN, optimizer: torch.optim.Optimizer,
          loss_function: Union[SplitCrossEntropyLoss, CrossEntropyLoss], use_apex=False, amp=None,
          lr_weights: dict = None, prior: str = 'ninf', scaling: str = None, total_steps: int = 0, steps: int = 0,
          bptt: int = 125, alpha: float = 0., beta: float = 0., log_interval: int = 200, n_samples: int = 4,
          device: Union[torch.device, str] = 'cpu', tb_writer=None, **kwargs):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields ``(data, targets, seq_len, lang)`` batches.
        model: Multilingual RNN; called with ``return_h=True``.
        optimizer: Optimizer; its lr is temporarily rescaled per batch.
        loss_function: Split or plain cross-entropy loss.
        use_apex / amp: Enable NVIDIA apex mixed-precision backward.
        lr_weights: Optional per-language lr multipliers keyed by lang id.
        prior: Either a string tag or a ``VIPrior`` instance; only a
            ``VIPrior`` adds a (scaled) KL term to the loss.
        scaling: KL scaling schedule: 'uniform', 'linear_annealing',
            'logistic_annealing', or anything else for no scaling.
        total_steps / steps: Total and current global step counters.
        bptt: Nominal BPTT length used for lr rescaling.
        alpha / beta: Activation and temporal-activation regularization
            coefficients (AR / TAR).
        log_interval: Batches between debug log lines.
        n_samples: Monte-Carlo samples per batch when prior is a VIPrior.
        device: Device to move batches onto.
        tb_writer: Optional TensorBoard writer.

    Returns:
        The updated global ``steps`` counter.
    """
    total_loss = 0
    batch = 0

    tr_kl = 0.
    logging_kl = 0.
    tr_loss = 0.
    logging_loss = 0.

    model.train()

    log.info('Starting training loop')
    start_time = time.time()

    with tqdm(dataloader, total=len(dataloader)) as pbar:
        for data, targets, seq_len, lang in pbar:

            data = data.squeeze(0).to(device)
            targets = targets.squeeze(0).to(device)
            lang = lang.to(device)

            # Fresh hidden state for every batch.
            hidden = model.init_hidden(batchsize=data.size(-1))

            # Temporarily rescale lr by the actual sequence length (and an
            # optional per-language weight); restored after the update.
            lr2 = optimizer.param_groups[0]['lr']
            if lr_weights is not None:
                optimizer.param_groups[0]['lr'] = lr2 * seq_len.item() / bptt * lr_weights[lang.item()]
            else:
                optimizer.param_groups[0]['lr'] = lr2 * seq_len.item() / bptt

            hidden = detach(hidden)
            optimizer.zero_grad()

            loss = 0

            # Only a variational prior needs multiple MC samples.
            if not isinstance(prior, VIPrior):
                n_samples = 1

            for s in range(n_samples):
                output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, lang, return_h=True)

                if isinstance(loss_function, SplitCrossEntropyLoss):
                    raw_loss = loss_function(model.decoder.weight, model.decoder.bias, output, targets)
                else:
                    raw_loss = loss_function(output, targets)

                # Activation Regularization (AR) on the last layer
                if alpha:
                    raw_loss = raw_loss + sum(alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
                # Temporal Activation Regularization (slowness)
                if beta:
                    raw_loss = raw_loss + sum(beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])

                loss += raw_loss

            # Average over the MC samples.
            loss /= n_samples

            # Loss before the KL term, kept for TensorBoard logging.
            log_loss = loss

            if isinstance(prior, VIPrior):
                kl_term = prior.kl_div()

                # Scale the KL term according to the chosen annealing
                # schedule over the global step count.
                if scaling == "uniform":
                    scale = 1. / total_steps
                elif scaling == "linear_annealing":
                    scale = ((total_steps - steps - 1) * 2. + 1.) / total_steps ** 2
                elif scaling == "logistic_annealing":
                    steepness = 0.0025
                    scale = 1. / (1 + np.exp(-steepness * (steps - total_steps / 2.)))
                else:
                    scale = 1.
                loss = loss + scale * kl_term
                tr_kl += kl_term.item()

            if use_apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if tb_writer is not None:
                tb_writer.add_scalar('train/loss', log_loss.item(), steps)

                if isinstance(prior, VIPrior):
                    tb_writer.add_scalar('train/kl', kl_term.item(), steps)
                    tb_writer.add_scalar('train/loss+kl', loss.item(), steps)

                    # NOTE(review): tr_kl is a running total, so adding it
                    # here accumulates cumulative sums — confirm intent.
                    logging_kl += tr_kl

                logging_loss += tr_loss

            optimizer.step()

            total_loss += raw_loss.data
            batch += 1
            steps += 1

            # reset lr to optimiser default
            optimizer.param_groups[0]['lr'] = lr2

            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / log_interval
                elapsed = time.time() - start_time
                log.debug(
                    '| {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                        batch, len(dataloader), optimizer.param_groups[0]['lr'], elapsed * 1000 / log_interval,
                        cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
                total_loss = 0
                start_time = time.time()

            pbar.set_description('Training, end of batch {} | Loss {}'.format(batch, loss.data))

    return steps
Beispiel #13
0
def train_one_epoch(epoch, cur_lr):
    """Train all the batches within one epoch.

    Uses module-level globals: ``model``, ``trainer``, ``train_data``,
    ``ctxs``, ``args``, ``parameters``, ``batch_info`` and
    ``parameters_count``. ``cur_lr`` is the base learning rate that each
    batch's rate is rescaled from.
    """

    total_loss = 0
    # One hidden state per device context.
    states = [model.begin_state(batch_size=m, ctx=ctx) for ctx in ctxs]

    # Loop all batches
    batch, cursor = 0, 0
    tic_log_interval = time.time()
    while cursor < train_data.shape[0] - 1 - 1:
        #######################################################################
        # Control seq_len cited from origin paper
        random_bptt = args.bptt if np.random.random(
        ) < 0.95 else args.bptt / 2.
        # Normal distribution (mean, variance): Prevent extreme sequence lengths
        seq_len = max(5, int(np.random.normal(random_bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        # Rescale learning rate depending on the variable length w.r.t bptt
        trainer.set_learning_rate(cur_lr * seq_len / args.bptt)
        ########################################################################
        '''Each batch shape(seq_len, batch_size), split data to each device.
        m is the # of samples for each device, devided along batch_size axis.'''
        Xs, Ys = get_batch(train_data, cursor, args, seq_len=seq_len)
        assert args.batch_size == Xs.shape[
            1], 'data shape[1] should be batch_size'
        Xs = gluon.utils.split_and_load(Xs, ctxs, 1)
        Ys = gluon.utils.split_and_load(Ys, ctxs, 1)
        tic_b = time.time()

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        states = detach(states)
        loss_list = []
        with autograd.record():  # train_mode
            for i, X in enumerate(Xs):
                output, states[i], encoded_raw, encoded_dropped = model(
                    X, states[i])  # state(num_layers, bsz, hidden_size)
                device_loss = joint_loss(output, Ys[i], encoded_raw,
                                         encoded_dropped)
                # Gather per-device losses on the first context, normalised
                # by the number of elements.
                loss_list.append(device_loss.as_in_context(ctxs[0]) / X.size)
        for l in loss_list:
            l.backward()
        ''' trainer.allreduce_grads()
            For each parameter, reduce the gradients from different contexts.
            Should be called after autograd.backward(), outside of record() scope, and before trainer.update().
            For normal parameter updates, step() should be used, which internally calls allreduce_grads() and then update().
            However, in gradient clipping, manually call allreduce_grads() and update() separately.
        '''
        # trainer.allreduce_grads()
        # grads = [p.grad(ctxs[0]) for p in parameters]
        grads = [p.grad(ctx) for ctx in ctxs for p in parameters]
        gluon.utils.clip_global_norm(grads, args.clipping_theta)
        trainer.step(1)
        # trainer.update(1)

        # Mean loss across devices for this batch.
        batch_loss = sum([nd.sum(l).asscalar() for l in loss_list]) / len(ctxs)
        toc_b = time.time()
        # Per-batch stats: timing, throughput, loss and perplexity.
        batch_info.append([
            epoch, batch, trainer.learning_rate, seq_len,
            (toc_b - tic_b) * 1000,
            args.batch_size * seq_len // (toc_b - tic_b), batch_loss,
            math.exp(batch_loss)
        ])

        total_loss += batch_loss

        # Periodic logging: dump batch stats and report interval averages.
        if batch % args.log_interval == 0 and batch > 0:
            utils.save_info(batch_info, batch_file)

            toc_log_interval = time.time()
            total_loss = total_loss / args.log_interval

            logging.info(
                '| epoch {:4d} ({:5.2f}%)| batch {:4d} | lr {:7.4f} | seq_len {:2d} | {:4.0f} ms/batch | '
                '{:5d} tokens/s | loss {:6.3f} | ppl {:5.2f}'.format(
                    epoch, cursor / train_data.shape[0] * 100, batch,
                    trainer.learning_rate, seq_len,
                    (toc_log_interval - tic_log_interval) * 1000 /
                    args.log_interval,
                    int(args.batch_size * args.log_interval * seq_len /
                        (toc_log_interval - tic_log_interval)), total_loss,
                    math.exp(total_loss)))

            total_loss = 0
            tic_log_interval = time.time()

        batch += 1
        cursor += seq_len

        # Log the (non-embedding) parameter count exactly once.
        global parameters_count
        if not parameters_count:
            logging.info('Parameters (except embeding): {}'.format(
                sum(p.data(ctxs[0]).size for p in parameters)))
            parameters_count = 1

    nd.waitall()  # synchronize batch data