Example #1
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    stats = adl.Accumulator()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        stats["loss_sum"] += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        stats["total"] += targets.size(0)
        stats["correct"] += predicted.eq(targets).sum().item()

    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")
    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
        print("Train:", stats)
Example #2
def valid(epoch):
    net.eval()
    stats = adl.Accumulator()
    with torch.no_grad():
        for inputs, targets in validloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            stats["loss_sum"] += loss.item() * targets.size(0)
            _, predicted = outputs.max(1)
            stats["total"] += targets.size(0)
            stats["correct"] += predicted.eq(targets).sum().item()

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)

        if adaptdl.env.replica_rank() == 0:
            nni.report_intermediate_result(stats["accuracy"])

        print("Valid:", stats)
        return stats["accuracy"]
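Example #2 reports only intermediate results to NNI. A typical driver loop would track the best accuracy and report it once at the end; this is a hedged sketch, with `args.epochs` assumed, though `nni.report_final_result` is the standard NNI counterpart of the intermediate call above.

best_acc = 0.0
for epoch in adl.remaining_epochs_until(args.epochs):
    train(epoch)
    best_acc = max(best_acc, valid(epoch))

# Report once, from a single replica, so the NNI tuner sees
# exactly one final result per trial.
if adaptdl.env.replica_rank() == 0:
    nni.report_final_result(best_acc)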
Example #3
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    stats = adl.Accumulator()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        stats["loss_sum"] += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        stats["total"] += targets.size(0)
        stats["correct"] += predicted.eq(targets).sum().item()

        writer.add_scalar("Throughput/Gain", net.gain, epoch)
        writer.add_scalar("Throughput/Global_Batchsize",
                          trainloader.current_batch_size, epoch)

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
        print("Train:", stats)
Example #4
def evaluate(eval_model, val_iter, epoch=0):
    eval_model.eval()  # Turn on the evaluation mode
    stats = adl.Accumulator()
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for batch in val_iter:
            output = eval_model(batch.text.to(device))
            output_flat = output.view(-1, ntokens)
            stats["loss_sum"] += batch.text.size(1) * \
                criterion(output_flat, batch.target.view(-1).to(device)).item()
            stats["total"] += batch.target.size(1)

    with stats.synchronized():
        loss_avg = stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
        print("Valid:", stats)

    return loss_avg
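Because `evaluate` returns the average per-token loss, the caller can derive perplexity directly. A hedged sketch of such a caller (the `model`, `val_iter`, `scheduler`, and `args.epochs` names are assumptions):

import math

for epoch in adl.remaining_epochs_until(args.epochs):
    train(epoch)
    val_loss = evaluate(model, val_iter, epoch)
    # Perplexity is exp of the mean cross-entropy per token.
    print(f"valid ppl {math.exp(val_loss):8.2f}")
    scheduler.step()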
Example #5
    def train(self, train_data, epoch, writer):
        stats = adl.Accumulator()
        self.model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(TEXT.vocab.stoi)
        for i, batch in enumerate(
                AdaptiveBPTTIterator(
                    train_data,
                    batch_size=args.bs,
                    bptt_len=args.bptt,
                    max_batch_size=self.max_batch_size,  # noqa: E501
                    local_bsz_bounds=self.local_bsz_bounds)):  # noqa: E501
            self.optimizer.zero_grad()
            output = self.model(batch.text.to(device))
            loss = self.criterion(output.view(-1, ntokens),
                                  batch.target.view(-1).to(device))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            self.optimizer.step()
            total_loss += loss.item()

            stats["loss_sum"] += loss.item() * batch.target.size(1)
            stats["total"] += batch.target.size(1)

            writer.add_scalar("Throughput/Gain", self.model.gain, epoch)

            log_interval = 10
            if i % log_interval == 0 and i > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print(f'| epoch {epoch:3d} | batch {i:5d} | '
                      f'lr {self.scheduler.get_lr()[0]:02.2f} | '
                      f'ms/batch {elapsed * 1000 / log_interval:7.2f} | '
                      f'loss {cur_loss:5.2f} | ppl {np.exp(cur_loss):8.2f}')
                total_loss = 0
                start_time = time.time()

        with stats.synchronized():
            stats["loss_avg"] = stats["loss_sum"] / stats["total"]
            writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
            print("Train:", stats)
Example #6
    def evaluate(self, eval_model, data_source, epoch=0, writer=None):
        eval_model.eval()  # Turn on the evaluation mode
        stats = adl.Accumulator()
        ntokens = len(TEXT.vocab.stoi)
        with torch.no_grad():
            for batch in AdaptiveBPTTIterator(data_source,
                                              batch_size=args.bs,
                                              bptt_len=args.bptt):
                output = eval_model(batch.text.to(device))
                output_flat = output.view(-1, ntokens)
                stats["loss_sum"] += batch.text.size(1) * \
                    self.criterion(output_flat,
                                   batch.target.view(-1).to(device)).item()
                stats["total"] += batch.target.size(1)

        with stats.synchronized():
            loss_avg = stats["loss_avg"] = stats["loss_sum"] / stats["total"]
            if writer:
                writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
            print("Valid:", stats)

        return loss_avg
Example #7
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    stats = adl.Accumulator()
    for inputs, targets in trainloader:
        optimizer.zero_grad()
        inputs, targets = inputs.to(device), targets.to(device)
        if args.mixed_precision:
            with torch.cuda.amp.autocast():
                outputs = net(inputs)
                loss = criterion(outputs, targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        stats["loss_sum"] += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        stats["total"] += targets.size(0)
        stats["correct"] += predicted.eq(targets).sum().item()

    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")
    if args.mixed_precision:
        writer.add_scalar("MixedPrecision/scale", scaler.get_scale(), epoch)
    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
        print("Train:", stats)
Example #8
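            # Excerpt: resumes inside the per-batch training loop
            # (presumably `for user, item, label in train_loader:`);
            # the loop header and setup are not shown.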
            network.zero_grad()
            prediction = network(user, item)
            loss = loss_function(prediction, label)
            loss.backward()
            optimizer.step()
            count += 1
            gain = network.gain
            batchsize = train_loader.current_batch_size
            accumulation_steps = train_loader.accumulation_steps

        train_loader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
        network.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")

        network.eval()
        stats = adl.Accumulator()
        HR, NDCG = evaluate.metrics(network, test_loader, args.top_k)
        stats['HR'] += HR
        stats['replicas'] += 1.0
        with stats.synchronized():
            writer.add_scalar('Loss/HR', stats['HR'] / stats['replicas'],
                              epoch)

        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
              time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
        print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

        if HR > best_hr:
            best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
            if args.out and adaptdl.env.replica_rank() == 0:
                pass  # checkpoint saving elided in the original excerpt
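The example breaks off where the best model would be saved. A hedged sketch of what that branch typically contains (`config.model_path` and the filename are placeholders):

import os

if args.out and adaptdl.env.replica_rank() == 0:
    # Only rank 0 writes, so replicas don't clobber each other.
    os.makedirs(config.model_path, exist_ok=True)
    torch.save(network.state_dict(),
               os.path.join(config.model_path, "ncf_best.pth"))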
Example #9
def train(epoch):
    # For each batch in the dataloader
    stats = adl.Accumulator()
    for i, data in enumerate(dataloader, 0):
        data = data[0]
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        # Format batch
        real_cpu = data.to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label,
                           dtype=torch.float, device=device)
        # Forward pass real batch through D
        output = netD(real_cpu).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output = netD(fake.detach()).view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        # Calculate the gradients for this batch
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Add the gradients from the all-real and all-fake batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output = netD(fake).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        stats["g_loss_sum"] += errG.item()
        stats["d_loss_sum"] += errD.item()
    stats["norm"] += metrics._metrics_state().grad_params[0]
    stats["var"] += metrics._metrics_state().grad_params[1]
    stats["replicas"] += 1.0
    scheduleD.step()
    scheduleG.step()

    with stats.synchronized():
        with SummaryWriter(adaptdl.get_tensorboard_dir()) as writer:
            writer.add_scalar("Loss/G",
                              stats["g_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Loss/D",
                              stats["d_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Performance/GlobalBatchsize",
                              b_size * stats["replicas"], epoch)
            writer.add_scalar("Performance/Replicas", stats["replicas"], epoch)
            writer.add_scalar("Stats/Variance",
                              stats["norm"] / stats["replicas"], epoch)
            writer.add_scalar("Stats/Norm", stats["var"] / stats["replicas"],
                              epoch)
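Example #9 depends on module-level names defined outside the excerpt (`netD`, `netG`, `nz`, `real_label`, the schedulers, the loss lists), and `metrics._metrics_state()` is a private AdaptDL interface used as-is by the example. A hedged sketch of plausible definitions, following DCGAN-tutorial conventions with stand-in networks rather than the example's actual source:

import torch
import torch.nn as nn
import torch.optim as optim

device = "cuda" if torch.cuda.is_available() else "cpu"
nz = 100                         # size of the latent vector
real_label, fake_label = 1., 0.

# Stand-ins for the DCGAN Generator/Discriminator defined elsewhere.
netG = nn.Sequential(nn.ConvTranspose2d(nz, 3, 64), nn.Tanh()).to(device)
netD = nn.Sequential(nn.Conv2d(3, 1, 64), nn.Flatten(),
                     nn.Sigmoid()).to(device)
criterion = nn.BCELoss()

optimizerD = optim.Adam(netD.parameters(), lr=2e-4, betas=(0.5, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=2e-4, betas=(0.5, 0.999))
scheduleD = optim.lr_scheduler.StepLR(optimizerD, step_size=30)
scheduleG = optim.lr_scheduler.StepLR(optimizerG, step_size=30)

G_losses, D_losses = [], []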