コード例 #1
0
def validate(val_loader, model, criterion, args, writer, epoch):
    """Run one evaluation pass over ``val_loader``.

    Args:
        val_loader: iterable of (images, target) batches.
        model: network to evaluate (switched to eval mode here).
        criterion: loss applied to (output, target).
        args: namespace providing ``gpu`` and ``print_freq``.
        writer: TensorBoard SummaryWriter, or None to skip logging.
        epoch: global step used when writing meters to TensorBoard.

    Returns:
        (top1.avg, top5.avg): average top-1 / top-5 accuracy.
    """
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix="Test: ")

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in tqdm.tqdm(enumerate(val_loader),
                                             ascii=True,
                                             total=len(val_loader)):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)

            target = target.cuda(args.gpu, non_blocking=True)

            # NOTE: a previous per-batch reseeding hack lived here; it was
            # already disabled ("This will severely influence the
            # generalization! drop this.") and has been removed.

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        progress.display(len(val_loader))

        if writer is not None:
            progress.write_to_tensorboard(writer,
                                          prefix="test",
                                          global_step=epoch)

    return top1.avg, top5.avg
コード例 #2
0
def validate(val_loader, model, criterion, args, writer, epoch):
    """Evaluate ``model`` on ``val_loader``; return (top-1 avg, top-5 avg)."""
    # Running-average meters for the whole validation pass.
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(val_loader.num_batches,
                             [batch_time, losses, top1, top5],
                             args,
                             prefix="Test: ")

    # Disable dropout / use running batch-norm statistics.
    model.eval()

    with torch.no_grad():
        tick = time.time()

        for step, batch in enumerate(val_loader):
            images = batch[0].cuda()
            target = batch[1].long().squeeze().cuda()

            output = model(images)
            loss = criterion(output, target)

            # Record loss and top-1 / top-5 accuracy for this batch.
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            n = images.size(0)
            losses.update(loss.item(), n)
            top1.update(acc1.item(), n)
            top5.update(acc5.item(), n)

            # Track per-batch wall time.
            batch_time.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq == 0:
                progress.display(step)

        progress.display(val_loader.num_batches)

        if writer is not None:
            progress.write_to_tensorboard(writer,
                                          prefix="test",
                                          global_step=epoch)

    return top1.avg, top5.avg
コード例 #3
0
def base(model, device, val_loader, criterion, args, writer, epoch=0):
    """Evaluate on unmodified validation set inputs.

    Args:
        model: network to evaluate (switched to eval mode here).
        device: torch device the batches are moved to.
        val_loader: iterable of (images, target) batches.
        criterion: loss applied to (output, target).
        args: namespace providing ``print_freq``.
        writer: TensorBoard SummaryWriter, or falsy to skip logging.
        epoch: epoch index used to form the TensorBoard global step.

    Returns:
        (top1.avg, top5.avg): average top-1 / top-5 accuracy.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(val_loader), [batch_time, losses, top1, top5], prefix="Test: "
    )

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        i = -1  # stays -1 on an empty loader; guards the final display below
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)

            if writer:
                progress.write_to_tensorboard(
                    writer, "test", epoch * len(val_loader) + i
                )

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "test-images",
                    torchvision.utils.make_grid(images[0 : len(images) // 4]),
                )
        # Previously `progress.display(i)` ran unconditionally and raised
        # NameError when val_loader was empty; only print final results
        # after at least one batch.
        if i >= 0:
            progress.display(i)  # print final results

    return top1.avg, top5.avg
コード例 #4
0
ファイル: default.py プロジェクト: hwaranlee/hidden-networks
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    """Train ``model`` for one epoch.

    Args:
        train_loader: iterable of (images, target) batches.
        model: network to optimize (switched to train mode here).
        criterion: loss applied to (output, target).
        optimizer: optimizer stepped once per batch.
        epoch: current epoch index, used for the TensorBoard global step.
        args: namespace providing ``gpu`` and ``print_freq``.
        writer: TensorBoard SummaryWriter, written every ``print_freq`` steps.

    Returns:
        (top1.avg, top5.avg): average training accuracies for the epoch.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    for i, (images, target) in tqdm.tqdm(
        enumerate(train_loader), ascii=True, total=len(train_loader)
    ):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)

        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            # global step = number of samples seen so far across all epochs
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer, prefix="train", global_step=t)

    return top1.avg, top5.avg
コード例 #5
0
def validate(val_loader, model, criterion, args, writer, epoch):
    """Evaluate ``model``; return (top-1 avg, top-5 avg, mean loss)."""
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(len(val_loader), [losses, top1, top5],
                             prefix="Test: ")
    # Put the network in eval mode, then report its current scores.
    model.eval()
    printModelScore(model, args)
    with torch.no_grad():
        end = time.time()
        bar = tqdm.tqdm(enumerate(val_loader),
                        ascii=True,
                        total=len(val_loader))
        for step, (inputs, labels) in bar:
            if args.gpu is not None:
                inputs = inputs.cuda(args.gpu, non_blocking=True)

            labels = labels.cuda(args.gpu, non_blocking=True)

            # forward pass and loss
            logits = model(inputs)
            loss = criterion(logits, labels)

            # accumulate loss and top-1 / top-5 accuracy
            acc1, acc5 = accuracy(logits, labels, topk=(1, 5))
            count = inputs.size(0)
            losses.update(loss.item(), count)
            top1.update(acc1.item(), count)
            top5.update(acc5.item(), count)

            end = time.time()

            if step % args.print_freq == 0:
                progress.display(step)

        progress.display(len(val_loader))

        if writer is not None:
            progress.write_to_tensorboard(writer,
                                          prefix="test",
                                          global_step=epoch)

    return top1.avg, top5.avg, losses.avg
コード例 #6
0
ファイル: pretrain.py プロジェクト: turian/simsiam
def train(train_loader, model, criterion, optimizer, epoch, cfg, writer):
    """One self-supervised pretraining epoch over pairs of augmented views."""
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    progress = ProgressMeter(
        train_loader.num_batches,
        [batch_time, data_time, losses],
        cfg,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    num_batches = train_loader.num_batches
    tick = time.time()
    batch_size = train_loader.batch_size
    for step, batch in enumerate(train_loader):
        # Two augmented views of each image, plus the (unused) label.
        view1 = batch[0][0].cuda(non_blocking=True)
        view2 = batch[0][1].cuda(non_blocking=True)
        target = batch[1].long().squeeze().cuda(non_blocking=True)

        # time spent waiting on the data loader
        data_time.update(time.time() - tick)

        # embed both views and compare them
        emb1, emb2 = model(view1, view2)
        loss = criterion(emb1, emb2)

        losses.update(loss.item(), batch_size)

        # gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # per-batch wall time
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % cfg.print_freq == 0 or step == num_batches - 1:
            seen = (num_batches * epoch + step) * batch_size
            progress.display(step)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=seen)
コード例 #7
0
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    """One epoch of natural training with a stability (noise) loss.

    For ``args.dataset == "imagenet"`` the loss is cross-entropy on
    Gaussian-noised inputs; otherwise it is natural cross-entropy plus
    ``args.beta`` times a KL stability term between clean and noised
    predictions.

    Args:
        model: network to optimize (switched to train mode here).
        device: torch device the batches are moved to.
        train_loader: main loader of (images, target) batches.
        sm_loader: optional second loader; when given, its batches are
            concatenated with the main loader's.
        criterion: unused here (losses are built inline); kept so the
            signature matches the other trainers.
        optimizer: optimizer stepped once per batch.
        epoch: current epoch index, used for logging steps.
        args: namespace providing dataset, noise_std, beta, batch_size,
            print_freq.
        writer: TensorBoard SummaryWriter (required; written unconditionally).
    """
    print(
        " ->->->->->->->->->-> One epoch with Natural training <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(
        train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print("Pixel range for training images : [{}, {}]".format(
                torch.min(images).data.cpu().numpy(),
                torch.max(images).data.cpu().numpy(),
            ))

        # stability-loss
        if args.dataset == "imagenet":
            # Per-channel scaling by the ImageNet normalization std so the
            # noise magnitude is args.noise_std in original pixel space.
            std = (torch.tensor(
                [0.229, 0.224,
                 0.225]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)).to(device)
            noise = (torch.randn_like(images) /
                     std).to(device) * args.noise_std
            output = model(images + noise)
            loss = nn.CrossEntropyLoss()(output, target)
        else:
            output = model(images)
            loss_natural = nn.CrossEntropyLoss()(output, target)
            # reduction="sum" replaces the long-deprecated
            # size_average=False and produces the identical summed KL;
            # dividing by the batch size gives the per-sample term.
            loss_robust = (1.0 / len(images)) * nn.KLDivLoss(
                reduction="sum")(
                    F.log_softmax(
                        model(images + torch.randn_like(images).to(device) *
                              args.noise_std),
                        dim=1,
                    ),
                    F.softmax(output, dim=1),
                )
            loss = loss_natural + args.beta * loss_robust

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
コード例 #8
0
def main_worker(args):
    """Entry point for one MPI-launched distributed training process.

    Initializes ``torch.distributed`` from OpenMPI environment variables,
    pins this rank to a local GPU, then runs the train/validate loop.
    Checkpointing, TensorBoard logging, and the final CSV summary happen
    only on global rank 0.

    Args:
        args: mutable namespace of run configuration; several fields
            (gpu, world_size, rank, batch_size, workers, ckpt_base_dir,
            start_epoch, prune_rate) are written back here.
    """
    # NEW: equivalent to MPI init.
    print("world size ", os.environ['OMPI_COMM_WORLD_SIZE'])
    print("rank ", os.environ['OMPI_COMM_WORLD_RANK'])
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=int(os.environ['OMPI_COMM_WORLD_SIZE']),
        rank=int(os.environ['OMPI_COMM_WORLD_RANK']))

    # NEW: lookup number of ranks in the job, and our rank
    args.world_size = torch.distributed.get_world_size()
    print("world size ", args.world_size)
    args.rank = torch.distributed.get_rank()
    print("rank ", args.rank)
    ngpus_per_node = torch.cuda.device_count()
    print("ngpus_per_node ", ngpus_per_node)
    # Map the global rank to a GPU on this node.
    local_rank = args.rank % ngpus_per_node
    print("local_rank ", local_rank)

    # NEW: Globalize variables
    global best_acc1
    global best_acc5
    global best_train_acc1
    global best_train_acc5

    #args.gpu = None
    # NEW: Specify gpu
    args.gpu = local_rank
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)

    # NEW: Distributed data
    #if args.distributed:
    # Per-process share of the global batch size and loader workers.
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    #model = set_gpu(args, model)
    # NEW: Modified function for loading gpus on multinode setups
    model = lassen_set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        #criterion = nn.CrossEntropyLoss().cuda()
        # NEW: Specify gpu
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    # These assignments write the module-level globals declared above.
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    # Evaluate-only mode: run validation once (no TensorBoard) and return.
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)

        return

    # Set up directories
    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
        args.ckpt_base_dir = ckpt_base_dir

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        writer = SummaryWriter(log_dir=log_base_dir)
    else:
        writer = None

    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        progress_overall = ProgressMeter(
            1, [epoch_time, validation_time, train_time],
            prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        save_checkpoint(
            {
                "epoch": 0,
                "arch": args.arch,
                "state_dict": model.state_dict(),
                "best_acc1": best_acc1,
                "best_acc5": best_acc5,
                "best_train_acc1": best_train_acc1,
                "best_train_acc5": best_train_acc5,
                "optimizer": optimizer.state_dict(),
                "curr_acc1": acc1 if acc1 else "Not evaluated",
            },
            False,
            filename=ckpt_base_dir / f"initial.state",
            save=False,
        )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        # NEW: Distributed data
        #if args.distributed:
        # Reshuffle the distributed samplers deterministically per epoch.
        data.train_sampler.set_epoch(epoch)
        data.val_sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        #modifier(args, epoch, model)

        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        #train_acc1, train_acc5 = train(
        #    data.train_loader, model, criterion, optimizer, epoch, args, writer=None
        #)
        train_time.update((time.time() - start_train) / 60)  # minutes

        # evaluate on validation set
        start_validation = time.time()

        # NEW: Only write values to tensorboard for main processor (one with global rank 0)
        if args.rank == 0:
            acc1, acc5 = validate(data.val_loader, model, criterion, args,
                                  writer, epoch)
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args,
                                  None, epoch)

        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            if is_best or save or epoch == args.epochs - 1:
                if is_best:
                    print(
                        f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                    )

                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "arch": args.arch,
                        "state_dict": model.state_dict(),
                        "best_acc1": best_acc1,
                        "best_acc5": best_acc5,
                        "best_train_acc1": best_train_acc1,
                        "best_train_acc5": best_train_acc5,
                        "optimizer": optimizer.state_dict(),
                        "curr_acc1": acc1,
                        "curr_acc5": acc5,
                    },
                    is_best,
                    filename=ckpt_base_dir / f"epoch_most_recent.state",
                    save=save,
                )
                #filename=ckpt_base_dir / f"epoch_{epoch}.state",

        epoch_time.update((time.time() - end_epoch) / 60)

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            progress_overall.display(epoch)
            progress_overall.write_to_tensorboard(writer,
                                                  prefix="diagnostics",
                                                  global_step=epoch)

            if args.conv_type == "SampleSubnetConv":
                count = 0
                sum_pr = 0.0
                for n, m in model.named_modules():
                    if isinstance(m, SampleSubnetConv):
                        # avg pr across 10 samples
                        pr = 0.0
                        for _ in range(10):
                            pr += ((torch.rand_like(m.clamped_scores) >=
                                    m.clamped_scores).float().mean().item())
                        pr /= 10.0
                        writer.add_scalar("pr/{}".format(n), pr, epoch)
                        sum_pr += pr
                        count += 1

                args.prune_rate = sum_pr / count
                writer.add_scalar("pr/average", args.prune_rate, epoch)

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            writer.add_scalar("test/lr", cur_lr, epoch)

        end_epoch = time.time()

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        write_result_to_csv(
            best_acc1=best_acc1,
            best_acc5=best_acc5,
            best_train_acc1=best_train_acc1,
            best_train_acc5=best_train_acc5,
            prune_rate=args.prune_rate,
            curr_acc1=acc1,
            curr_acc5=acc5,
            base_config=args.config,
            name=args.name,
        )
コード例 #9
0
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    """Plain (non-adversarial) training of ``model`` for a single epoch."""
    print(
        " ->->->->->->->->->-> One epoch with Natural training <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    tick = time.time()

    # When a second loader is supplied, iterate both in lockstep and
    # concatenate their batches.
    if sm_loader is None:
        dataloader = train_loader
    else:
        dataloader = zip(train_loader, sm_loader)

    for step, batch in enumerate(dataloader):
        if sm_loader:
            images = torch.cat([part[0] for part in batch], 0).to(device)
            target = torch.cat([part[1] for part in batch], 0).to(device)
        else:
            images = batch[0].to(device)
            target = batch[1].to(device)

        # basic properties of training
        if step == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print("Pixel range for training images : [{}, {}]".format(
                torch.min(images).data.cpu().numpy(),
                torch.max(images).data.cpu().numpy(),
            ))

        # forward pass and loss
        output = model(images)
        loss = criterion(output, target)

        # record loss and top-1 / top-5 accuracy
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        n = images.size(0)
        losses.update(loss.item(), n)
        top1.update(acc1[0], n)
        top5.update(acc5[0], n)

        # gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # per-batch wall time
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + step)

        # write a sample of training images to tensorboard (helpful for debugging)
        if step == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
コード例 #10
0
ファイル: main.py プロジェクト: isamu-isozaki/hidden-networks
def main_worker(args):
    """Single-process training driver.

    Builds the model, optimizer, data, and LR policy from ``args``, then
    either evaluates once (``args.evaluate``) or runs the full
    train/validate loop with checkpointing, TensorBoard, and wandb
    logging, finally appending a result row to a CSV.

    Args:
        args: mutable namespace of run configuration; ``ckpt_base_dir``
            and ``start_epoch`` (and possibly ``prune_rate``) are written
            back here.
    """
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)
    wandb.watch(model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    # Evaluate-only mode: run validation once (no TensorBoard) and return.
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)

        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / f"initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)

        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)  # minutes

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer,
                              epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(
                    f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                )

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )
            wandb.log({
                "curr_acc1": acc1,
                "curr_acc5": acc5,
            })

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer,
                                              prefix="diagnostics",
                                              global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # avg pr across 10 samples
                    pr = 0.0
                    for _ in range(10):
                        pr += ((torch.rand_like(m.clamped_scores) >=
                                m.clamped_scores).float().mean().item())
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1

            args.prune_rate = sum_pr / count
            writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    # NOTE(review): assumes args.prune_rate is set elsewhere when
    # conv_type != "SampleSubnetConv" — verify against the arg parser.
    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
コード例 #11
0
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    """Train for one epoch, averaging loss and accuracy over ``args.K``
    forward passes on each batch before a single optimizer step.

    The repeated forward passes matter when the model output is
    stochastic per pass (e.g. sampled-subnet convolutions).

    Returns:
        (top1.avg, top5.avg) averaged over the epoch.
    """
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    for i, (images, target) in tqdm.tqdm(enumerate(train_loader),
                                         ascii=True,
                                         total=len(train_loader)):
        image0 = images
        target0 = target

        if args.gpu is not None:
            image0 = image0.cuda(args.gpu, non_blocking=True)

        target0 = target0.cuda(args.gpu, non_blocking=True)

        # Average loss/accuracy over K forward passes on the same batch.
        # (Renamed from `l`/`a1`/`a5`: `l` shadowed the meter list above.)
        avg_loss = 0
        avg_acc1 = 0
        avg_acc5 = 0
        for _ in range(args.K):
            output = model(image0)
            loss = criterion(output, target0)
            acc1, acc5 = accuracy(output, target0, topk=(1, 5))
            avg_loss = avg_loss + loss
            avg_acc1 = avg_acc1 + acc1
            avg_acc5 = avg_acc5 + acc5
        avg_loss = avg_loss / args.K
        avg_acc1 = avg_acc1 / args.K
        avg_acc5 = avg_acc5 / args.K

        # Record meters. Use image0.size(0) consistently (the original
        # mixed `images` and `image0`; they refer to the same batch).
        n = image0.size(0)
        losses.update(avg_loss.item(), n)
        top1.update(avg_acc1.item(), n)
        top5.update(avg_acc5.item(), n)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if args.conv_type != "SFESubnetConv":
            avg_loss.backward()
        else:
            # SFE subnet conv layers update their scores from the loss
            # directly instead of via autograd.
            updateScoreDiff(model, avg_loss)
        optimizer.step()

        if i % args.print_freq == 0:
            # global sample counter for tensorboard's x-axis
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

    return top1.avg, top5.avg
コード例 #12
0
def train(train_loader, model, criterion, optimizer, epoch, cfg, writer):
    """Train for one epoch; supports class-wise self-knowledge
    distillation (CS-KD) when ``cfg.cs_kd`` is enabled.

    NOTE(review): assumes a DALI-style loader exposing ``num_batches``
    and ``batch_size`` attributes and yielding indexable
    (images, labels) pairs — confirm against the caller.

    Returns:
        (top1.avg, top5.avg) averaged over the epoch.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        train_loader.num_batches,
        [batch_time, data_time, losses, top1, top5],
        cfg,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = train_loader.num_batches
    end = time.time()

    for i, data in enumerate(train_loader):
        # images, target = data[0]['data'],data[0]['label'].long().squeeze()
        images, target = data[0].cuda(), data[1].long().squeeze().cuda()
        # measure data loading time
        data_time.update(time.time() - end)

        if cfg.cs_kd:
            # CS-KD: the first half of the batch is trained with the
            # usual loss; the second half (processed without grad)
            # supplies soft targets for a distillation consistency term.
            batch_size = images.size(0)
            loss_batch_size = batch_size // 2
            targets_ = target[:batch_size // 2]
            outputs = model(images[:batch_size // 2])
            loss = torch.mean(criterion(outputs, targets_))
            # loss += loss.item()

            with torch.no_grad():
                outputs_cls = model(images[batch_size // 2:])
            # NOTE(review): `outputs` already has batch_size//2 rows, so
            # the slice below looks like a no-op — confirm intent.
            cls_loss = kdloss(outputs[:batch_size // 2], outputs_cls.detach())
            lamda = 3  # fixed weight of the distillation term
            loss += lamda * cls_loss
            acc1, acc5 = accuracy(outputs, targets_, topk=(1, 5))
        else:
            batch_size = images.size(0)
            loss_batch_size = batch_size
            #compute output
            output = model(images)
            loss = criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))

        # print(i, batch_size, loss)

        # measure accuracy and record loss, weighted by the number of
        # samples that actually contributed to the loss

        losses.update(loss.item(), loss_batch_size)
        top1.update(acc1.item(), loss_batch_size)
        top5.update(acc5.item(), loss_batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % cfg.print_freq == 0 or i == num_batches - 1:
            # global sample counter for tensorboard's x-axis
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

    # train_loader.reset()
    # print(top1.count)
    return top1.avg, top5.avg
コード例 #13
0
ファイル: eval.py プロジェクト: timko98/hydra
def smooth(model, device, val_loader, criterion, args, writer, epoch=0):
    """Evaluate randomized-smoothing behavior on clean validation inputs.

    Each batch is scored on 10 independently Gaussian-noised copies; the
    per-copy softmax outputs are summed, and a certification radius is
    derived from the top-class score via the Gaussian quantile function.

    Returns:
        (top1.avg, rad.avg)
    """
    batch_time = AverageMeter("Time", ":6.3f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    rad = AverageMeter("rad", ":6.2f")
    progress = ProgressMeter(len(val_loader), [batch_time, top1, top5, rad],
                             prefix="Smooth (eval): ")

    model.eval()  # inference mode: no dropout / BN statistics updates

    with torch.no_grad():
        tic = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # Collect softmax outputs for 10 noised copies of the batch.
            sample_probs = []
            for _ in range(10):
                if args.dataset == "imagenet":
                    # Divide by the per-channel std so the noise magnitude
                    # is expressed in the original [0, 1] pixel space.
                    std = (torch.tensor([
                        0.229, 0.224, 0.225
                    ]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)).to(device)
                    noise = (torch.randn_like(images) /
                             std).to(device) * args.noise_std
                else:
                    noise = torch.randn_like(images).to(
                        device) * args.noise_std

                sample_probs.append(F.softmax(model(images + noise), -1))

            summed = torch.sum(torch.stack(sample_probs), axis=0)

            # Certification radius from the max (unnormalized) class score;
            # the epsilon keeps ppf finite when p_max underflows to 0.
            p_max, _ = summed.max(dim=-1)
            radii = (args.noise_std + 1e-16) * norm.ppf(
                p_max.data.cpu().numpy())

            # accuracy of the aggregated prediction
            acc1, acc5 = accuracy(summed, target, topk=(1, 5))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            rad.update(np.mean(radii))

            batch_time.update(time.time() - tic)
            tic = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)

            if writer:
                progress.write_to_tensorboard(writer, "test",
                                              epoch * len(val_loader) + i)

            # log a few validation images once per call (debugging aid)
            if i == 0 and writer:
                writer.add_image(
                    "Adv-test-images",
                    torchvision.utils.make_grid(images[0:len(images) // 4]),
                )

        progress.display(i)  # final summary line

    return top1.avg, rad.avg
コード例 #14
0
ファイル: eval.py プロジェクト: timko98/hydra
def ibp(model, device, val_loader, criterion, args, writer, epoch=0):
    """Evaluate clean accuracy and IBP-certified robust accuracy.

    For each batch: run the model on unperturbed inputs for standard
    top-1/top-5 metrics, then compute a robust cross-entropy and robust
    error via naive interval analysis at radius ``args.epsilon``.

    Returns:
        (ibp_top1.avg, ibp_top1.avg) — the value is duplicated so the
        signature matches the other eval functions that return a pair.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ibp_losses = AverageMeter("IBP_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    ibp_top1 = AverageMeter("IBP-Acc_1", ":6.2f")
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, ibp_losses, top1, top5, ibp_top1],
        prefix="Test: ",
    )

    # switch to evaluation mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # clean-input metrics
            output = model(images)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # certified metrics: robust CE (rce) and robust error (rerr)
            rce, rerr = naive_interval_analyze(
                model,
                args.epsilon,
                images,
                target,
                use_cuda=torch.cuda.is_available(),
                parallel=False,
            )

            # (Fix: a second, unused accuracy(output, target) call that
            # the original recomputed here has been removed.)
            ibp_losses.update(rce.item(), images.size(0))
            ibp_top1.update((1 - rerr) * 100.0, images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)

            if writer:
                progress.write_to_tensorboard(writer, "test",
                                              epoch * len(val_loader) + i)

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "Adv-test-images",
                    torchvision.utils.make_grid(images[0:len(images) // 4]),
                )
        progress.display(i)  # print final results

    return ibp_top1.avg, ibp_top1.avg
コード例 #15
0
def train(
    model,
    device,
    train_loader,
    sm_loader,
    criterion,
    optimizer,
    epoch,
    args,
    writer=None,
):
    """One epoch of "free" adversarial training: a persistent noise
    tensor is ascended (FGSM) alongside each descent step, replaying
    every batch ``args.n_repeats`` times.

    The model trains on normalized ``(input + noise)``; after each
    backward pass the noise is updated and clamped to the epsilon ball.
    ``sm_loader`` is accepted for signature compatibility but unused.

    Fix vs. original: all tensorboard calls are now guarded, so the
    default ``writer=None`` no longer raises AttributeError.
    """

    assert (
        not args.normalize
    ), "Explicit normalization is done in the training loop, Dataset should have [0, 1] dynamic range."

    # Persistent adversarial noise shared across batches of the epoch.
    global_noise_data = torch.zeros(
        [args.batch_size, 3, args.image_dim, args.image_dim]).to(device)

    # Per-channel normalization constants broadcast to image shape.
    mean = torch.Tensor(np.array(args.mean)[:, np.newaxis, np.newaxis])
    mean = mean.expand(3, args.image_dim, args.image_dim).to(device)
    std = torch.Tensor(np.array(args.std)[:, np.newaxis, np.newaxis])
    std = std.expand(3, args.image_dim, args.image_dim).to(device)

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    # switch to train mode
    model.train()
    for i, (input, target) in enumerate(train_loader):
        end = time.time()
        input = input.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)
        data_time.update(time.time() - end)

        for _ in range(args.n_repeats):
            # Ascend on the global noise
            noise_batch = Variable(global_noise_data[0:input.size(0)],
                                   requires_grad=True).to(device)
            in1 = input + noise_batch
            in1.clamp_(0, 1.0)
            in1.sub_(mean).div_(std)
            output = model(in1)
            loss = criterion(output, target)

            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()

            # Update the noise for the next iteration (FGSM ascent,
            # projected back onto the L-inf epsilon ball)
            pert = fgsm(noise_batch.grad, args.epsilon)
            global_noise_data[0:input.size(0)] += pert.data
            global_noise_data.clamp_(-args.epsilon, args.epsilon)

            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)
                if writer is not None:  # fix: writer may be None
                    progress.write_to_tensorboard(
                        writer, "train", epoch * len(train_loader) + i)

        if i == 0:
            print(
                in1.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(in1), torch.max(in1)]}")

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0 and writer is not None:  # fix: writer may be None
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(input[0:len(input) // 4]),
            )
コード例 #16
0
def evaluate(epoch, data, model, criterion):
    """Main evaluation procedure.

    Runs the model over the validation set twice per batch — once with
    full teacher forcing (ratio 1.0, for the loss) and once free-running
    (ratio 0.0, for mel cepstral distortion and alignments).

    Arguments:
        epoch -- current epoch
        data -- DataLoader which can provide validation batches
        model -- model to be evaluated
        criterion -- instance of loss function to measure performance

    Returns:
        Sum of the per-batch-averaged partial losses.
    """
    text_logger = logging.getLogger(__name__)

    model.eval()

    # initialize counters, etc.
    # mcd/cla are maintained as running means via (count*avg + x)/(count+1)
    mcd, mcd_count = 0, 0
    cla, cla_count = 0, 0
    eval_losses = {}

    total_loss = AverageMeter('Total Loss', ':.4e')
    mel_pre_loss = AverageMeter('Mel Pre Loss', ':.4e')
    mel_post_loss = AverageMeter('Mel Post Loss', ':.4e')
    lang_class_acc = AverageMeter('Lang Class Acc', ':.4e')
    progress = ProgressMeter(len(data),
                             total_loss,
                             mel_pre_loss,
                             mel_post_loss,
                             lang_class_acc,
                             prefix="Epoch: [{}]".format(epoch),
                             logger=text_logger)

    # loop through epoch batches
    with torch.no_grad():
        for i, batch in enumerate(data):

            # parse batch
            batch = list(map(to_gpu, batch))
            src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

            # run the model (twice, with and without teacher forcing)
            post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 1.0)
            post_pred_0, _, stop_pred_0, alignment_0, _, _ = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 0.0)
            stop_pred_probs = torch.sigmoid(stop_pred_0)

            # evaluate loss function
            post_trg = trg_lin if hp.predict_linear else trg_mel
            classifier = model._reversal_classifier if hp.reversal_classifier else None
            loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                           post_pred, post_trg, stop_pred,
                                           stop_trg, alignment, spkrs,
                                           spkrs_pred, enc_output, classifier)
            total_loss.update(loss, src.size(0))
            mel_pre_loss.update(batch_losses['mel_pre'], src.size(0))
            # NOTE(review): key is 'mel_pos' (not 'mel_post'); it must
            # match what `criterion` emits — confirm before renaming.
            mel_post_loss.update(batch_losses['mel_pos'], src.size(0))
            # compute mel cepstral distorsion on the free-running output
            for j, (gen, ref, stop) in enumerate(
                    zip(post_pred_0, trg_mel, stop_pred_probs)):
                # truncate at the first predicted stop (+ hp.stop_frames
                # slack); fall back to the full generated length
                stop_idxes = np.where(stop.cpu().numpy() > 0.5)[0]
                stop_idx = min(
                    np.min(stop_idxes) + hp.stop_frames,
                    gen.size()[1]) if len(stop_idxes) > 0 else gen.size()[1]
                gen = gen[:, :stop_idx].data.cpu().numpy()
                ref = ref[:, :trg_len[j]].data.cpu().numpy()
                if hp.normalize_spectrogram:
                    gen = audio.denormalize_spectrogram(
                        gen, not hp.predict_linear)
                    ref = audio.denormalize_spectrogram(ref, True)
                if hp.predict_linear: gen = audio.linear_to_mel(gen)
                # fold the DTW-aligned MCD into the running mean
                mcd = (mcd_count * mcd + audio.mel_cepstral_distorision(
                    gen, ref, 'dtw')) / (mcd_count + 1)
                mcd_count += 1

            # compute adversarial classifier accuracy
            if hp.reversal_classifier:
                input_mask = lengths_to_mask(src_len)
                trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
                for s in range(hp.speaker_number):
                    speaker_mask = (spkrs == s)
                    trg_spkrs[speaker_mask] = s
                matches = (trg_spkrs == torch.argmax(
                    torch.nn.functional.softmax(spkrs_pred, dim=-1), dim=-1))
                matches[~input_mask] = False  # ignore padded positions
                cla = (cla_count * cla + torch.sum(matches).item() /
                       torch.sum(input_mask).item()) / (cla_count + 1)
                cla_count += 1
                lang_class_acc.update(cla, src.size(0))

            # add batch losses to epoch losses
            for k, v in batch_losses.items():
                eval_losses[k] = v + eval_losses[k] if k in eval_losses else v

    # normalize loss per batch
    for k in eval_losses.keys():
        eval_losses[k] /= len(data)

    # log evaluation (uses the last loop index and last batch's tensors)
    progress.print(i)
    Logger.evaluation(epoch + 1, eval_losses, mcd, src_len, trg_len, src,
                      post_trg, post_pred, post_pred_0, stop_pred_probs,
                      stop_trg, alignment_0, cla)

    return sum(eval_losses.values())
コード例 #17
0
ファイル: default.py プロジェクト: zhanzheng8585/biprop
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    """Standard one-epoch training loop with optional gradient clipping
    and (on even epochs) tensorboard histograms of parameters/gradients.

    Returns:
        (top1.avg, top5.avg) averaged over the epoch.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    for i, (images, target) in tqdm.tqdm(enumerate(train_loader),
                                         ascii=True,
                                         total=len(train_loader)):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)

        target = target.cuda(args.gpu, non_blocking=True)

        # Write scores and weights to tensorboard at beginning of every other epoch
        # NOTE(review): i % (num_batches * batch_size) == 0 only holds at
        # i == 0 (since i < num_batches), so this fires once per even
        # epoch — confirm whether batch_size in the modulus was intended.
        if args.histograms:
            if (i % (num_batches * batch_size) == 0) and (epoch % 2 == 0):
                for param_name in model.state_dict():
                    #print(param_name)
                    # Only write scores for now (not weights and batch norm parameters since the pytorch parms don't actually change)
                    #if 'score' not in param_name:
                    #if 'score' in param_name or 'weight' in param_name:
                    #print(param_name, model.state_dict()[param_name])
                    writer.add_histogram(param_name,
                                         model.state_dict()[param_name], epoch)

        # compute output
        output = model(images)

        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        #torch.nn.utils.clip_grad_norm_(model.parameters(),1)
        loss.backward()
        # EDITED
        #print(torch.norm(torch.cat([p.grad.view(-1) for p in model.parameters()])))
        # clip by value (not norm) when enabled
        if args.grad_clip:
            torch.nn.utils.clip_grad_value_(model.parameters(), 1)
        #print(torch.norm(torch.cat([p.grad.view(-1) for p in model.parameters()])))
        #for param_name in model.state_dict(): print(param_name, str(model.state_dict()[param_name])[:50])
        #torch.nn.utils.clip_grad_norm_(model.parameters(),1)
        # end
        optimizer.step()

        # Clamp updated scores to [-1,1] only when using binarized/quantized activations
        #for param_name in model.state_dict():
        #  if 'score' in param_name:
        #    #print(param_name)
        #    scores = model.state_dict()[param_name]
        #    #scores = torch.clamp(scores,min=-1.0,max=1.0)
        #    scores.clamp_(min=-1.0,max=1.0)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        #print(model.state_dict()['module.linear.3.scores'].grad)
        #params = list(model.parameters())
        #print(params[1].grad)

        if i % args.print_freq == 0:
            # global sample counter for tensorboard's x-axis
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)

            #_, predicted = torch.max(output, 1)
            progress.write_to_tensorboard(writer,
                                          prefix="train",
                                          global_step=t)

        # Write score gradients to tensorboard at end of every other epoch
        # NOTE(review): same i == 0-only condition as above.
        if args.histograms:
            if (i % (num_batches * batch_size) == 0) and (epoch % 2 == 0):
                #if ((i+1) % (num_batches-1) == 0) and (epoch % 2 == 0):
                params = list(model.parameters())
                param_names = list(model.state_dict())
                for j in range(len(params)):
                    if params[j].grad is not None:
                        # if 'score' in param_names[j] or 'weight' in param_names[j]:
                        # if 'score' not in param_name and params[j].grad is not None:
                        #print(param_names[j])
                        #print(params[j].grad)
                        writer.add_histogram(param_names[j] + '.grad',
                                             params[j].grad, epoch)
                    else:
                        writer.add_histogram(param_names[j] + '.grad', 0,
                                             epoch)
                #for param_name in model.state_dict():
                #  if 'score' in param_name:
                #    writer.add_histogram(param_name + '.grad', model.state_dict()[param_name].grad, epoch)
                #params = list(model.parameters())
                #for j in range(len(params)):
                #  writer.add_histogram('Layer' + str(j) + 'grad', params[j].grad, epoch)

    # Write final scores and weights to tensorboard
    if args.histograms:
        for param_name in model.state_dict():
            #writer.add_histogram(param_name, model.state_dict()[param_name], epoch)
            # Only write scores for now (not weights and batch norm parameters since the pytorch parms don't actually change)
            #if 'score' not in param_name:
            #print(param_name, model.state_dict()[param_name])
            writer.add_histogram(param_name,
                                 model.state_dict()[param_name], epoch)

    return top1.avg, top5.avg
コード例 #18
0
def train(logging_start_epoch, epoch, data, model, criterion, optimizer):
    """Main training procedure.

    Runs one epoch: computes the per-step teacher-forcing ratio, runs
    the model, optimizes the composite loss, and logs progress.

    Arguments:
        logging_start_epoch -- number of the first epoch to be logged
        epoch -- current epoch
        data -- DataLoader which can provide batches for an epoch
        model -- model to be trained
        criterion -- instance of loss function to be optimized
        optimizer -- instance of optimizer which will be used for parameter updates
    """
    text_logger = logging.getLogger(__name__)

    model.train()

    # initialize counters, etc.
    learning_rate = optimizer.param_groups[0]['lr']  # assumes one shared LR
    cla = 0  # adversarial classifier accuracy of the latest batch
    done, start_time = 0, time.time()

    total_loss = AverageMeter('Total Loss', ':.4e')
    mel_pre_loss = AverageMeter('Mel Pre Loss', ':.4e')
    mel_post_loss = AverageMeter('Mel Post Loss', ':.4e')
    lang_class_acc = AverageMeter('Lang Class Acc', ':.4e')
    progress = ProgressMeter(len(data),
                             total_loss,
                             mel_pre_loss,
                             mel_post_loss,
                             lang_class_acc,
                             prefix="Epoch: [{}]".format(epoch),
                             logger=text_logger)

    # loop through epoch batches
    for i, batch in enumerate(data):

        global_step = done + epoch * len(data)
        optimizer.zero_grad()

        # parse batch
        batch = list(map(to_gpu, batch))
        src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

        # get teacher forcing ratio: constant, or cosine-decayed after
        # hp.teacher_forcing_start_steps global steps
        if hp.constant_teacher_forcing: tf = hp.teacher_forcing
        else:
            tf = cos_decay(
                max(global_step - hp.teacher_forcing_start_steps, 0),
                hp.teacher_forcing_steps)

        # run the model
        post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
            src, src_len, trg_mel, trg_len, spkrs, langs, tf)

        # evaluate loss function
        post_trg = trg_lin if hp.predict_linear else trg_mel
        classifier = model._reversal_classifier if hp.reversal_classifier else None
        loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                       post_pred, post_trg, stop_pred,
                                       stop_trg, alignment, spkrs, spkrs_pred,
                                       enc_output, classifier)

        total_loss.update(loss, src.size(0))
        mel_pre_loss.update(batch_losses['mel_pre'], src.size(0))
        # NOTE(review): key is 'mel_pos' (not 'mel_post'); must match
        # the keys emitted by `criterion` — confirm before renaming.
        mel_post_loss.update(batch_losses['mel_pos'], src.size(0))

        # evaluate adversarial classifier accuracy, if present
        if hp.reversal_classifier:
            input_mask = lengths_to_mask(src_len)
            trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
            for s in range(hp.speaker_number):
                speaker_mask = (spkrs == s)
                trg_spkrs[speaker_mask] = s
            matches = (trg_spkrs == torch.argmax(torch.nn.functional.softmax(
                spkrs_pred, dim=-1),
                                                 dim=-1))
            matches[~input_mask] = False  # ignore padded positions
            cla = torch.sum(matches).item() / torch.sum(input_mask).item()
            lang_class_acc.update(cla, src.size(0))

        # compute gradients and make a step; clip_grad_norm_ returns the
        # pre-clip total gradient norm, which is logged below
        loss.backward()
        gradient = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                  hp.gradient_clipping)
        optimizer.step()

        # log training progress
        if epoch >= logging_start_epoch:
            Logger.training(global_step, batch_losses, gradient, learning_rate,
                            time.time() - start_time, cla)
            progress.print(i)

        # update criterion states (params and decay of the loss and so on ...)
        criterion.update_states()

        start_time = time.time()
        done += 1
コード例 #19
0
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    """Run one epoch of adversarial training with the TRADES/PGD robust
    loss.

    Clean-forward accuracy is tracked for monitoring while the optimizer
    descends on the robust objective returned by ``pgd_loss``.
    """
    print(
        " ->->->->->->->->->-> One epoch with Adversarial training (TRADES) <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    tic = time.time()

    # With a second (semi-supervised) loader, draw from both in lockstep;
    # otherwise iterate the main loader alone.
    if sm_loader is None:
        batches = train_loader
    else:
        batches = zip(train_loader, sm_loader)

    for step, payload in enumerate(batches):
        if sm_loader:
            images = torch.cat([part[0] for part in payload], 0).to(device)
            target = torch.cat([part[1] for part in payload], 0).to(device)
        else:
            images = payload[0].to(device)
            target = payload[1].to(device)

        # basic properties of training data (printed once per epoch)
        if step == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(
                f"Training images range: {[torch.min(images), torch.max(images)]}"
            )

        output = model(images)

        # robust (TRADES) objective on adversarially perturbed inputs
        loss = pgd_loss(
            model=model,
            x_natural=images,
            y=target,
            device=device,
            optimizer=optimizer,
            step_size=args.step_size,
            epsilon=args.epsilon,
            perturb_steps=args.num_steps,
            beta=args.beta,
            clip_min=args.clip_min,
            clip_max=args.clip_max,
            distance=args.distance,
        )

        # bookkeeping: accuracy is measured on the clean forward pass
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        n = images.size(0)
        losses.update(loss.item(), n)
        top1.update(acc1[0], n)
        top5.update(acc5[0], n)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - tic)
        tic = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + step)

        # log a sample of training images once (debugging aid)
        if step == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
コード例 #20
0
ファイル: eval.py プロジェクト: timko98/hydra
def freeadv(model, device, val_loader, criterion, args, writer, epoch=0):
    """Evaluate `model` under a PGD adversary (free-adv style evaluation).

    For each batch: start from a uniform random point inside the eps-ball,
    run `args.num_steps` FGSM ascent steps (normalizing inputs manually
    before every forward pass), then measure loss/top-1/top-5 on the final
    adversarial examples.  Returns ``(top1.avg, top5.avg)``.

    Note: the dataset must be un-normalized ([0, 1] range) because
    normalization with ``args.mean``/``args.std`` is applied explicitly
    inside the attack loop.
    """

    assert (
        not args.normalize
    ), "Explicit normalization is done in the training loop, Dataset should have [0, 1] dynamic range."

    # Mean/Std for normalization, broadcast to full image shape
    # (3, image_dim, image_dim) and moved to the eval device.
    mean = torch.Tensor(np.array(args.mean)[:, np.newaxis, np.newaxis])
    mean = mean.expand(3, args.image_dim, args.image_dim).to(device)
    std = torch.Tensor(np.array(args.std)[:, np.newaxis, np.newaxis])
    std = std.expand(3, args.image_dim, args.image_dim).to(device)

    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix="Test: ",
    )

    # PGD attack hyper-parameters.
    eps = args.epsilon
    K = args.num_steps
    step = args.step_size
    model.eval()
    end = time.time()
    print(" PGD eps: {}, num-steps: {}, step-size: {} ".format(eps, K, step))
    # NOTE(review): the loop variable `input` shadows the Python builtin.
    for i, (input, target) in enumerate(val_loader):

        input = input.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        # Random start: uniform noise inside the eps-ball, clipped to [0, 1].
        orig_input = input.clone()
        randn = torch.FloatTensor(input.size()).uniform_(-eps, eps).to(device)
        input += randn
        input.clamp_(0, 1.0)
        for _ in range(K):
            # Normalize out-of-place so the graph flows back to `invar`.
            invar = Variable(input, requires_grad=True)
            in1 = invar - mean
            in1.div_(std)
            output = model(in1)
            ascend_loss = criterion(output, target)
            ascend_grad = torch.autograd.grad(ascend_loss, invar)[0]
            pert = fgsm(ascend_grad, step)
            # Apply perturbation, then project back into the eps-ball
            # around the original input and into the valid [0, 1] range.
            input += pert.data
            input = torch.max(orig_input - eps, input)
            input = torch.min(orig_input + eps, input)
            input.clamp_(0, 1.0)

        # Final normalization (in place) before the clean forward pass.
        input.sub_(mean).div_(std)
        with torch.no_grad():
            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        if (i + 1) % args.print_freq == 0:
            progress.display(i)

        # NOTE(review): this writes to tensorboard every iteration, not
        # only every print_freq iterations — confirm that is intended.
        if writer:
            progress.write_to_tensorboard(writer, "test",
                                          epoch * len(val_loader) + i)

        # write a sample of test images to tensorboard (helpful for debugging)
        if i == 0 and writer:
            writer.add_image(
                "Adv-test-images",
                torchvision.utils.make_grid(input[0:len(input) // 4]),
            )

    progress.display(i)  # print final results

    return top1.avg, top5.avg
コード例 #21
0
ファイル: main.py プロジェクト: zj15001/STR
def main_worker(args):
    """Run a full STR training experiment for a single worker.

    Builds the model, optimizer, dataset and LR policy from ``args``;
    optionally loads a pretrained model, resumes from a checkpoint, or
    runs evaluation only.  Otherwise trains for ``args.epochs`` epochs,
    validating, checkpointing the best model, and logging timing and
    (for STRConv models) per-layer sparsity/threshold statistics to
    TensorBoard.  Final metrics are written to CSV and, for STRConv,
    layer-wise sparsity/threshold JSON files.
    """
    args.gpu = None

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)

    # Loading pretrained model
    if args.pretrained:
        pretrained(args, model)

        # Saving a DenseConv (nn.Conv2d) compatible model and exiting.
        if args.dense_conv_model:
            print(
                f"==> DenseConv compatible model, saving at {ckpt_base_dir / 'model_best.pth'}"
            )
            save_checkpoint(
                {
                    "epoch": 0,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                },
                True,
                filename=ckpt_base_dir / "epoch_pretrained.state",
                save=True,
            )
            return

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Evaluation-only mode: validate once and exit.
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)
        return

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / "initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        cur_lr = get_lr(optimizer)

        # Gradual pruning in GMP experiments: cubic decay of the prune
        # rate over the configured pruning window.
        if args.conv_type == "GMPConv" and epoch >= args.init_prune_epoch and epoch <= args.final_prune_epoch:
            total_prune_epochs = args.final_prune_epoch - args.init_prune_epoch + 1
            for n, m in model.named_modules():
                if hasattr(m, 'set_curr_prune_rate'):
                    # NOTE(review): decay is driven by args.curr_prune_epoch;
                    # upstream STR uses the loop's `epoch` here — confirm
                    # args.curr_prune_epoch is kept in sync elsewhere.
                    prune_decay = (
                        1 - ((args.curr_prune_epoch - args.init_prune_epoch) /
                             total_prune_epochs))**3
                    curr_prune_rate = m.prune_rate - (m.prune_rate *
                                                      prune_decay)
                    m.set_curr_prune_rate(curr_prune_rate)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer,
                              epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(
                    f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                )

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer,
                                              prefix="diagnostics",
                                              global_step=epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

        # Storing sparsity and threshold statistics for STRConv models
        if args.conv_type == "STRConv":
            count = 0
            sum_sparse = 0.0
            for n, m in model.named_modules():
                if isinstance(m, STRConv):
                    sparsity, total_params, thresh = m.getSparsity()
                    writer.add_scalar("sparsity/{}".format(n), sparsity, epoch)
                    writer.add_scalar("thresh/{}".format(n), thresh, epoch)
                    sum_sparse += int(((100 - sparsity) / 100) * total_params)
                    count += total_params
            total_sparsity = 100 - (100 * sum_sparse / count)
            writer.add_scalar("sparsity/total", total_sparsity, epoch)
        # Fix: a duplicated "test/lr" write and end_epoch reset that
        # followed here were removed — both already happen once per epoch
        # above.

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
    if args.conv_type == "STRConv":
        json_data = {}
        json_thres = {}
        # Fix: initialize the accumulators here.  Previously this block
        # reused whatever `sum_sparse`/`count` were left over from the last
        # epoch (double-counting), and raised NameError when the per-epoch
        # STRConv branch never ran (e.g. zero training epochs).
        sum_sparse = 0.0
        count = 0
        for n, m in model.named_modules():
            if isinstance(m, STRConv):
                sparsity = m.getSparsity()
                json_data[n] = sparsity[0]
                sum_sparse += int(((100 - sparsity[0]) / 100) * sparsity[1])
                count += sparsity[1]
                json_thres[n] = sparsity[2]
        json_data["total"] = 100 - (100 * sum_sparse / count)
        if not os.path.exists("runs/layerwise_sparsity"):
            os.mkdir("runs/layerwise_sparsity")
        if not os.path.exists("runs/layerwise_threshold"):
            os.mkdir("runs/layerwise_threshold")
        with open("runs/layerwise_sparsity/{}.json".format(args.name),
                  "w") as f:
            json.dump(json_data, f)
        with open("runs/layerwise_threshold/{}.json".format(args.name),
                  "w") as f:
            json.dump(json_thres, f)
    # Release the TensorBoard writer's file handles.
    writer.close()
コード例 #22
0
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch,
          args, writer):
    """Run one MixTrain (symbolic-interval) training epoch.

    Behaves like plain cross-entropy training, except that with
    probability ``alpha`` each batch additionally gets a symbolic-interval
    robustness term, computed on ``args.mixtraink`` randomly chosen
    samples and added to the loss with an epoch-dependent interval weight.

    Note: ``criterion`` is part of the trainer interface but unused here;
    the loss is built from a fresh ``nn.CrossEntropyLoss``, as in the
    original implementation.
    """
    eps = set_epsilon(args, epoch)
    n_sym = args.mixtraink
    alpha = 0.8
    interval_w = set_interval_weight(args, epoch)

    print(" ->->->->->->->->->-> One epoch with MixTrain{} (SYM {:.3f})"
          " <-<-<-<-<-<-<-<-<-<-".format(n_sym, eps))

    # Progress bookkeeping (same meter labels as the rest of the project).
    t_batch = AverageMeter("Time", ":6.3f")
    t_data = AverageMeter("Data", ":6.3f")
    ce_meter = AverageMeter("Loss", ":.4f")
    sym_loss_meter = AverageMeter("Sym_Loss", ":.4f")
    acc_meter = AverageMeter("Acc_1", ":6.2f")
    sym_acc_meter = AverageMeter("Sym1", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [t_batch, t_data, ce_meter, sym_loss_meter, acc_meter, sym_acc_meter],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    tick = time.time()

    if sm_loader is None:
        dataloader = train_loader
    else:
        dataloader = zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        # With a second ("sm") loader, each element is a pair of
        # (images, targets) batches that get concatenated.
        if sm_loader:
            images = torch.cat([d[0] for d in data], 0).to(device)
            target = torch.cat([d[1] for d in data], 0).to(device)
        else:
            images = data[0].to(device)
            target = data[1].to(device)

        # Log basic properties of the training data once per epoch.
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(
                f"Training images range: {[torch.min(images), torch.max(images)]}"
            )

        output = model(images)
        ce = nn.CrossEntropyLoss()(output, target)

        if np.random.uniform() <= alpha:
            # Add the symbolic-interval robust loss of n_sym random
            # samples, weighted by the interval-weight schedule.
            r = np.random.randint(low=0, high=images.shape[0], size=n_sym)
            rce, rerr = sym_interval_analyze(
                model,
                eps,
                images[r],
                target[r],
                use_cuda=torch.cuda.is_available(),
                parallel=False)
            loss = interval_w * rce + ce
            sym_loss_meter.update(rce.item(), n_sym)
            sym_acc_meter.update((1 - rerr) * 100., images.size(0))
        else:
            loss = ce

        # measure accuracy and record the clean cross-entropy loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        acc_meter.update(acc1[0], images.size(0))
        ce_meter.update(ce.item(), images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        t_batch.update(time.time() - tick)
        tick = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train",
                                          epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0:len(images) // 4]),
            )
コード例 #23
0
def dann_train(feature_extractor:FeatureExtractor, 
    domain_adv:DomainAdversarialLoss, 
    src_iter:ForeverDataIterator, 
    tar_iter:ForeverDataIterator, 
    src_val_loader, tar_val_loader,
    args):
    """Train a feature extractor with DANN-style domain adaptation.

    Each iteration draws one source and one target batch, computes an
    N-pair metric loss on the labeled source features (both heads) plus a
    domain-adversarial alignment loss, and optimizes everything jointly.
    Every 5 epochs the extractor is NMI-evaluated on both domains and the
    best-scoring model is saved to ``args.model_dir``.
    """
    optimizer = Adam(
        itertools.chain(feature_extractor.parameters(), domain_adv.parameters()),
        lr=args.lr, weight_decay=args.weight_decay
    )

    npair_loss = NPairsLoss()  # n-pair metric-learning loss

    # All run configuration comes from `args`.
    epoch = args.epoch
    iter_per_epoch = args.iter_per_epoch
    writer = args.writer  # Summary Writer
    logger = args.logger
    device = args.device
    w_da = args.w_da
    model_dir = args.model_dir

    # loss meters (loss_rec is created for parity with the other meters
    # but never updated in this loop)
    loss_rec = AverageMeter('tot_loss', tb_tag='Loss/tot', writer=writer)
    loss_lb_rec = AverageMeter('lb_loss', tb_tag='Loss/lb', writer=writer)
    loss_lb_g_rec = AverageMeter('lb_g_loss', tb_tag='Loss/lb_g', writer=writer)
    loss_da_rec = AverageMeter('da_loss', tb_tag='Loss/da', writer=writer)

    # accuracy of the domain discriminator
    da_acc_rec = AverageMeter('da_acc', tb_tag='Acc/da', writer=writer)

    n_iter = 0
    best_nmi = 0
    for e_i in range(epoch):
        feature_extractor.train()
        domain_adv.train()
        progress = ProgressMeter(
            iter_per_epoch,
            [loss_lb_g_rec, loss_lb_rec, loss_da_rec, da_acc_rec],
            prefix="Epoch: [{}]".format(e_i),
            logger=logger
        )
        for i in range(iter_per_epoch):
            x_s, l_s = next(src_iter)
            x_t, l_t = next(tar_iter)
            x_s, l_s, x_t, l_t = x_s.to(device), l_s.to(device), x_t.to(device), l_t.to(device)

            # Forward source and target through the shared extractor in
            # one concatenated batch, then split the two heads back.
            x = torch.cat((x_s, x_t), dim=0)
            f, g = feature_extractor(x)
            f_s, f_t = f.chunk(2, dim=0)
            g_s, g_t = g.chunk(2, dim=0)

            # supervised n-pair loss on the source domain (both heads)
            loss_s = npair_loss(f_s, l_s)
            loss_s_g = npair_loss(g_s, l_s)
            loss_lb_rec.update(loss_s.item(), x_s.size(0), iter=n_iter)
            loss_lb_g_rec.update(loss_s_g.item(), x_s.size(0), iter=n_iter)

            # domain-adversarial alignment (g-head source vs f-head target)
            da_loss = domain_adv(g_s, f_t)
            domain_acc = domain_adv.domain_discriminator_accuracy
            loss_da_rec.update(da_loss.item(), f.size(0), iter=n_iter)
            da_acc_rec.update(domain_acc.item(), f.size(0), iter=n_iter)

            loss = 0.5 * (loss_s + loss_s_g) + w_da * da_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            n_iter += 1
            # Fix: `print_freq` was an undefined bare name here (NameError
            # unless a module global happened to exist); every other
            # hyper-parameter in this function is read from `args`.
            if i % args.print_freq == 0:
                progress.display(i)

        if e_i % 5 == 0:
            # NMI evaluation on both domains; keep the best target-side model.
            nmi = NMI_eval(feature_extractor, src_val_loader, 5, device, type='src')
            logger.info(f'test on train set nmi: {nmi}')
            nmi = NMI_eval(feature_extractor, tar_val_loader, 5, device, type='tar')
            logger.info(f'test on test set nmi: {nmi}')
            if nmi > best_nmi:
                logger.info(f"save best model to {model_dir}")
                # NOTE(review): 'minst_best_model.pth' looks like a typo for
                # 'mnist' — kept byte-identical for checkpoint compatibility.
                torch.save(feature_extractor.state_dict(), os.path.join(model_dir, 'minst_best_model.pth'))
                best_nmi = nmi
コード例 #24
0
ファイル: trn_pretrain.py プロジェクト: turian/simsiam
def trn(cfg, model):
    """Self-supervised pretraining loop (SimSiam / SimCLR).

    Seeds the RNGs, builds optimizer/dataset/LR policy and the
    architecture-specific contrastive criterion, auto-resumes from the
    newest checkpoint written by GPU 0 (if any), then trains for
    ``cfg.epochs`` epochs, periodically running a k-NN probe on the
    validation set and checkpointing.

    Raises:
        NotImplementedError: if ``cfg.arch`` is neither SimSiam nor SimCLR.
    """
    cfg.logger.info(cfg)

    # Make the run reproducible when a seed is given.
    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate_knn = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    optimizer = get_optimizer(cfg, model)
    cfg.logger.info(f"=> Getting {cfg.set} dataset")

    dataset = getattr(data, cfg.set)(cfg)

    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    if cfg.arch == 'SimSiam':
        # L = D(p1, z2) / 2 + D(p2, z1) / 2
        base_criterion = lambda bb1_z1_p1_emb, bb2_z2_p2_emb: simsiam.SimSaimLoss(bb1_z1_p1_emb[2], bb2_z2_p2_emb[1]) / 2 +\
                                                 simsiam.SimSaimLoss(bb2_z2_p2_emb[2], bb1_z1_p1_emb[1]) / 2
    elif cfg.arch == 'SimCLR':
        base_criterion = lambda z1,z2 : simclr.NT_XentLoss(z1, z2)
    else:
        # Fix: was `raise NotImplemented`, which raises a TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(cfg, cfg.gpu)
    _, zero_gpu_ckpt_base_dir, _ = path_utils.get_directories(cfg, 0)

    # Auto-resume from the most recent checkpoint saved by GPU 0, if any.
    saved_epochs = sorted(glob.glob(str(zero_gpu_ckpt_base_dir) + '/epoch_*.state'), key=os.path.getmtime)
    if len(saved_epochs) > 0:
        cfg.resume = saved_epochs[-1]
        resume(cfg, model, optimizer)

    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], cfg, prefix="Overall Timing"
    )

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0

    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    cfg.logger.info('Start Training: Model conv 1 initialization {}'.format(torch.sum(model.module.backbone.conv1.weight)))

    # Log every weight-carrying module once as a sanity check.
    for n, m in model.module.named_modules():
        if hasattr(m, "weight") and m.weight is not None:
            cfg.logger.info('{} ({}): {}'.format(n, type(m).__name__, m.weight.shape))

    criterion = base_criterion
    cfg.logger.info('Using Vanilla Criterion')

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        cur_lr = net_utils.get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train(dataset.trn_loader, model, criterion, optimizer, epoch, cfg, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate with a k-NN probe on the validation set
                start_validation = time.time()
                acc = validate_knn(dataset.trn_loader, dataset.val_loader, model.module, cfg, writer, epoch)
                validation_time.update((time.time() - start_validation) / 60)
                csv_utils.write_generic_result_to_csv(path=cfg.exp_dir, name=os.path.basename(cfg.exp_dir[:-1]),
                                                      epoch=epoch,
                                                      knn_acc=acc)

                save = (((epoch + 1) % cfg.save_every) == 0) and cfg.save_every > 0
                if save or epoch == cfg.epochs - 1:
                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "ACC": acc,
                            "optimizer": optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=ckpt_base_dir / f"epoch_{epoch:04d}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

                # Fix: the ETA/timing diagnostics below were indented under
                # the checkpoint-save branch and thus only ran on save
                # epochs.  De-indented to run every test interval, matching
                # the linear-evaluation trainer in this project.
                elapsed_time = time.time() - start_time
                seconds_todo = (cfg.epochs - epoch) * (elapsed_time / cfg.test_interval)
                estimated_time_complete = timedelta(seconds=int(seconds_todo))
                start_time = time.time()
                cfg.logger.info(
                    f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}")

                epoch_time.update((time.time() - end_epoch) / 60)
                progress_overall.display(epoch)
                progress_overall.write_to_tensorboard(
                    writer, prefix="diagnostics", global_step=epoch
                )

                writer.add_scalar("test/lr", cur_lr, epoch)
                end_epoch = time.time()

            if cfg.world_size > 1:
                # keep all workers in lock-step with the evaluating GPU
                dist.barrier()
コード例 #25
0
ファイル: crown-ibp.py プロジェクト: timko98/hydra
def train(
    model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer
):
    """Run one CROWN-IBP certified-training epoch.

    Converts the model to a ``BoundSequential``, ramps epsilon with a
    linear scheduler, and for each batch minimizes the robust
    cross-entropy built from class-margin lower bounds: pure IBP bounds
    once the schedule's beta has decayed, otherwise a beta-blend of
    CROWN (backward) and IBP (interval) bounds.

    Fix vs. original: removed two unused locals (``schedule_start`` and a
    ``batch_size`` computed from ``args.batch_size * 2``).
    """
    # NOTE(review): number of classes is hard-coded to 10 — confirm before
    # reusing with datasets other than MNIST/CIFAR-10.
    num_class = 10

    # sa[i] lists every class index except i; used below to scatter
    # per-margin lower bounds back into a (batch, num_class) layout.
    sa = np.zeros((num_class, num_class - 1), dtype = np.int32)
    for i in range(sa.shape[0]):
        for j in range(sa.shape[1]):
            if j < i:
                sa[i][j] = j
            else:
                sa[i][j] = j + 1
    sa = torch.LongTensor(sa)

    # Linear epsilon ramp from starting_epsilon to epsilon over the
    # configured schedule window (in optimizer steps).
    num_steps_per_epoch = len(train_loader)
    eps_scheduler = EpsilonScheduler("linear",
                args.schedule_start,
                ((args.schedule_start + args.schedule_length) - 1) *\
                num_steps_per_epoch, args.starting_epsilon,
                args.epsilon,
                num_steps_per_epoch)

    end_eps = eps_scheduler.get_eps(epoch+1, 0)
    start_eps = eps_scheduler.get_eps(epoch, 0)


    print(
        " ->->->->->->->->->-> One epoch with CROWN-IBP ({:.6f}-{:.6f})"
        " <-<-<-<-<-<-<-<-<-<-".format(start_eps, end_eps)
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ibp_losses = AverageMeter("IBP_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    ibp_acc1 = AverageMeter("IBP1", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, ibp_losses, top1, ibp_acc1],
        prefix="Epoch: [{}]".format(epoch),
    )

    # Wrap the model so interval/backward bound passes are available.
    model = BoundSequential.convert(model,\
                    {'same-slope': False, 'zero-lb': False,\
                    'one-lb': False}).to(device)

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        # With a second ("sm") loader, concatenate the paired batches.
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data (logged once per epoch)
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(images), torch.max(images)]}")

        output = model(images, method_opt="forward")
        ce = nn.CrossEntropyLoss()(output, target)

        eps = eps_scheduler.get_eps(epoch, i)
        # generate specifications: margin rows e_y - e_j for every j != y
        c = torch.eye(num_class).type_as(images)[target].unsqueeze(1) -\
                torch.eye(num_class).type_as(images).unsqueeze(0)
        # remove specifications to self
        I = (~(target.unsqueeze(1) ==\
            torch.arange(num_class).to(device).type_as(target).unsqueeze(0)))
        c = (c[I].view(images.size(0),num_class-1,num_class)).to(device)
        # scatter matrix to avoid computing the margin to self
        sa_labels = sa[target].to(device)
        # storage for computed lower bounds after scatter
        lb_s = torch.zeros(images.size(0), num_class).to(device)
        ub_s = torch.zeros(images.size(0), num_class).to(device)

        # epsilon-ball around the inputs, clipped to the data range
        data_ub = torch.min(images + eps, images.max()).to(device)
        data_lb = torch.max(images - eps, images.min()).to(device)

        ub, ilb, relu_activity, unstable, dead, alive =\
                model(norm=np.inf, x_U=data_ub, x_L=data_lb,\
                eps=eps, C=c, method_opt="interval_range")

        crown_final_beta = 0.
        beta = (args.epsilon - eps * (1.0 - crown_final_beta)) / args.epsilon

        if beta < 1e-5:
            # schedule fully ramped: pure IBP bounds
            lb = ilb
        else:
            # blend CROWN (backward) bounds with IBP interval bounds
            _, _, clb, bias = model(norm=np.inf, x_U=data_ub,\
                        x_L=data_lb, eps=eps, C=c,\
                        method_opt="backward_range")
            lb = clb * beta + ilb * (1 - beta)

        lb = lb_s.scatter(1, sa_labels, lb)
        robust_ce = criterion(-lb, target)

        # robust ("verified") accuracy from the negated margin bounds
        racc = accuracy(-lb, target, topk=(1,))

        loss = robust_ce

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1[0].item(), images.size(0))
        losses.update(ce.item(), images.size(0))
        ibp_losses.update(robust_ce.item(), images.size(0))
        ibp_acc1.update(racc[0].item(), images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(
                writer, "train", epoch * len(train_loader) + i
            )

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
コード例 #26
0
def trn(cfg, model):
    """Linear-evaluation training loop.

    Trains only the linear classifier head (``model.module[1]``) on top of
    a frozen backbone (``model.module[0]``), validating every
    ``cfg.test_interval`` epochs, tracking the best accuracies, and
    checkpointing.  A conv1 weight-sum is logged each epoch as a sanity
    check that the backbone is not being updated.
    """
    cfg.logger.info(cfg)

    # Make the run reproducible when a seed is given.
    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    # Only the classifier head's parameters go to the optimizer.
    linear_classifier_layer = model.module[1]
    optimizer = get_optimizer(cfg, linear_classifier_layer)
    cfg.logger.info(f"=> Getting {cfg.set} dataset")

    dataset = getattr(data, cfg.set)(cfg)

    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    softmax_criterion = nn.CrossEntropyLoss().cuda()

    # Fix: dropped the pointless identity lambda that merely forwarded
    # (output, target) to softmax_criterion.
    criterion = softmax_criterion

    # optionally resume from a checkpoint
    best_val_acc1 = 0.0
    best_val_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if cfg.resume:
        best_val_acc1 = resume(cfg, model, optimizer)

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(
        cfg, cfg.gpu)
    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     cfg,
                                     prefix="Overall Timing")

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0
    last_val_acc1 = None

    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        cfg.logger.info('Model conv 1 {} at epoch {}'.format(
            torch.sum(model.module[0].conv1.weight),
            epoch))  ##  make sure backbone is not updated
        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)
        lr_policy(epoch, iteration=None)

        cur_lr = net_utils.get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(dataset.trn_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       cfg,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on validation set
                start_validation = time.time()
                last_val_acc1, last_val_acc5 = validate(
                    dataset.val_loader, model.module, criterion, cfg, writer,
                    epoch)
                validation_time.update((time.time() - start_validation) / 60)

                # remember best acc@1 and save checkpoint
                is_best = last_val_acc1 > best_val_acc1
                best_val_acc1 = max(last_val_acc1, best_val_acc1)
                best_val_acc5 = max(last_val_acc5, best_val_acc5)
                best_train_acc1 = max(train_acc1, best_train_acc1)
                best_train_acc5 = max(train_acc5, best_train_acc5)

                save = (((epoch + 1) % cfg.save_every)
                        == 0) and cfg.save_every > 0
                if save or epoch == cfg.epochs - 1:
                    if is_best:
                        cfg.logger.info(
                            f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}"
                        )

                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "best_acc1": best_val_acc1,
                            "best_acc5": best_val_acc5,
                            "best_train_acc1": best_train_acc1,
                            "best_train_acc5": best_train_acc5,
                            "optimizer": optimizer.state_dict(),
                            "curr_acc1": last_val_acc1,
                            "curr_acc5": last_val_acc5,
                        },
                        is_best,
                        filename=ckpt_base_dir / f"epoch_{epoch}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

                # ETA estimate over the last test interval + timing logs
                elapsed_time = time.time() - start_time
                seconds_todo = (cfg.epochs - epoch) * (elapsed_time /
                                                       cfg.test_interval)
                estimated_time_complete = timedelta(seconds=int(seconds_todo))
                start_time = time.time()
                cfg.logger.info(
                    f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}"
                )

                epoch_time.update((time.time() - end_epoch) / 60)
                progress_overall.display(epoch)
                progress_overall.write_to_tensorboard(writer,
                                                      prefix="diagnostics",
                                                      global_step=epoch)

                writer.add_scalar("test/lr", cur_lr, epoch)
                end_epoch = time.time()

            if cfg.world_size > 1:
                # keep all workers in lock-step with the evaluating GPU
                dist.barrier()
コード例 #27
0
def main_worker(args):
    """Train and evaluate a model for one worker process.

    Builds the model, optimizer, dataset and LR policy from ``args``,
    optionally resumes from a checkpoint (or runs evaluation only), then
    executes the epoch loop: train -> validate -> checkpoint -> TensorBoard
    logging.  Finally writes a summary row to the results CSV.

    Args:
        args: parsed configuration namespace (arch, epochs, lr_policy,
            exp_mode, prune_rate, resume/evaluate flags, ...).
    """
    # GPU id is forced to None here, which makes the `if args.gpu is not
    # None` branch just below dead code.
    # NOTE(review): confirm whether per-worker GPU pinning was intended
    # before removing either line.
    args.gpu = None
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    # SJT modification: behaviour depends on the experiment mode
    # (pretraining / pruning / finetuning).
    if args.exp_mode:
        exp_mode = args.exp_mode
        if exp_mode == "pretraining":
            # YHT modification: pretraining trains dense weights, so the
            # prune rate is zeroed, weights unfrozen and subnet scores frozen.
            print(
                "Figure out your exp_mode is pretraining, setting prune-rate to 0"
            )
            args.prune_rate = 0
            unfreeze_model_weights(model)
            freeze_model_subnet(model)
    # End of SJT modification

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    # Label smoothing is optional; plain cross-entropy otherwise.
    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        # SJT modification: the resume path depends on the experiment mode.
        if args.exp_mode:
            if args.exp_mode == "pruning":
                optimizer = resume_pruning(args, model)
            else:  # Only can be "finetuning"
                if args.exp_mode != "finetuning":
                    print(
                        "resume method should be combined with pruning/finetuning exp_mode together!"
                    )
                    return
                else:
                    optimizer = resume_finetuning(args, model)
                    # YHT: not sure whether it is needed
                    #lr_policy = get_policy(args.lr_policy)(optimizer, args)
                    #print("#####################DEBUG PRINT : VALIDATE FIRST#####################")
                    #validate(data.val_loader, model, criterion, args, writer= None, epoch=args.start_epoch)
        else:
            best_acc1 = resume(args, model, optimizer)
        # End of SJT modification
    else:
        # YHT modification: finetuning without resume starts from the
        # initial prune-rate vector (initial finetune of the subnetwork).
        if args.exp_mode:
            if args.exp_mode == "finetuning":
                # here, we suppose the user wants to use the init prune-rate
                # vector to do the finetuning (subnetwork)
                print(
                    "Using finetuning mode without resume, which is supposed to be innit fientune."
                )
                optimizer = resume_finetuning(args, model)
                # YHT: not sure whether it is needed
                lr_policy = get_policy(args.lr_policy)(optimizer, args)
        # End of modification

    # Evaluation-only mode: run one validation pass and exit.
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader,
                              model,
                              criterion,
                              args,
                              writer=None,
                              epoch=args.start_epoch)

        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(1,
                                     [epoch_time, validation_time, train_time],
                                     prefix="Overall Timing")

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        # Fixed: plain string literal; the f-prefix had no placeholders.
        filename=ckpt_base_dir / "initial.state",
        save=False,
    )

    # gp_warm_up temporarily zeroes the prune rate each epoch, so remember
    # the configured one to restore after warm-up.
    if args.gp_warm_up:
        record_prune_rate = args.prune_rate
    if args.print_more:
        print_global_layerwise_prune_rate(model, args.prune_rate)

    # YHT modification May 20: when prandom is set, create the random mask
    # once here, after all prune rates are final.
    if args.prandom:
        make_prandom_mask(model)
    # End of modification

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)
        if args.print_more:
            # Fixed: missing f-prefix meant the placeholders were printed
            # literally instead of being interpolated.
            print(f"In epoch{epoch}, lr = {cur_lr}")
        # train for one epoch
        start_train = time.time()
        # WHN modification: global pruning needs a global score threshold;
        # during gp warm-up epochs the prune rate is held at 0.
        if args.pscale == "global":
            if args.gp_warm_up:
                if epoch < args.gp_warm_up_epochs:
                    args.prune_rate = 0
                else:
                    args.prune_rate = record_prune_rate
            if not args.prandom:
                args.score_threshold = get_global_score_threshold(
                    model, args.prune_rate)

        # YHT modification
        if args.print_more:
            print_global_layerwise_prune_rate(model, args.prune_rate)
        # End of modification

        train_acc1, train_acc5 = train(data.train_loader,
                                       model,
                                       criterion,
                                       optimizer,
                                       epoch,
                                       args,
                                       writer=writer)
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        # if random labeled, evaluate on training set (by yty)
        if args.shuffle:
            acc1, acc5 = train_acc1, train_acc5
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args,
                                  writer, epoch)

        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(
                    f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}"
                )

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer,
                                              prefix="diagnostics",
                                              global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # avg pr across 10 samples
                    pr = 0.0
                    for _ in range(10):
                        pr += ((torch.rand_like(m.clamped_scores) >=
                                m.clamped_scores).float().mean().item())
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1

            # Fixed: guard against ZeroDivisionError when the model contains
            # no SampleSubnetConv modules despite the conv_type setting.
            if count > 0:
                args.prune_rate = sum_pr / count
                writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    # Persist the final summary for this run.
    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )