def validate(val_loader, model, criterion, args, writer, epoch):
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(
        len(val_loader), [batch_time, losses, top1, top5], prefix="Test: "
    )

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in tqdm.tqdm(
            enumerate(val_loader), ascii=True, total=len(val_loader)
        ):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # YHT modification
            '''
            This will severely influence the generalization! drop this.
            if args.seed is not None and args.prandom:
                torch.manual_seed(args.seed)
                torch.cuda.manual_seed(args.seed)
                torch.cuda.manual_seed_all(args.seed)
            '''
            # End of modification

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        progress.display(len(val_loader))

    if writer is not None:
        progress.write_to_tensorboard(writer, prefix="test", global_step=epoch)

    return top1.avg, top5.avg
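# The snippets in this section assume AverageMeter / ProgressMeter / accuracy
# helpers in the style of the torchvision ImageNet reference script. A minimal
# sketch of `accuracy` (hypothetical; the actual helper in each repo may differ
# slightly, e.g. in whether it returns floats or 1-element tensors):
import torch

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # indices of the k highest logits per sample, shape (maxk, batch)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res  # e.g. acc1, acc5 = accuracy(output, target, topk=(1, 5))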
def validate(val_loader, model, criterion, args, writer, epoch):
    batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    progress = ProgressMeter(
        val_loader.num_batches, [batch_time, losses, top1, top5], args, prefix="Test: "
    )

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        # confusion_matrix = torch.zeros(args.num_cls, args.num_cls)
        for i, data in enumerate(val_loader):
            # images, target = data[0]['data'], data[0]['label'].long().squeeze()
            images, target = data[0].cuda(), data[1].long().squeeze().cuda()

            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            # print(target, torch.mean(images), acc1, acc5, loss, torch.mean(output))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # _, preds = torch.max(output, 1)
            # for t, p in zip(target.view(-1), preds.view(-1)):
            #     confusion_matrix[t.long(), p.long()] += 1

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        progress.display(val_loader.num_batches)

    if writer is not None:
        progress.write_to_tensorboard(writer, prefix="test", global_step=epoch)

    # torch.save(confusion_matrix, './conf_mat.pt')
    # print(top1.count)
    return top1.avg, top5.avg
def base(model, device, val_loader, criterion, args, writer, epoch=0):
    """
    Evaluating on unmodified validation set inputs.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(val_loader), [batch_time, losses, top1, top5], prefix="Test: "
    )

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)
                if writer:
                    progress.write_to_tensorboard(
                        writer, "test", epoch * len(val_loader) + i
                    )

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "test-images",
                    torchvision.utils.make_grid(images[0 : len(images) // 4]),
                )

    progress.display(i)  # print final results

    return top1.avg, top5.avg
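# A minimal sketch of the meter classes assumed above (hypothetical; the
# variants in these snippets take extra arguments such as write_val/write_avg,
# a cfg/args object, or a logger, and add a write_to_tensorboard method that is
# omitted here):
class AverageMeter:
    """Computes and stores the average and current value."""

    def __init__(self, name, fmt=":f"):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
        return fmtstr.format(**self.__dict__)


class ProgressMeter:
    """Formats and prints all tracked meters for a given batch index."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print("\t".join(entries))

    @staticmethod
    def _get_batch_fmtstr(num_batches):
        num_digits = len(str(num_batches))
        fmt = "{:" + str(num_digits) + "d}"
        return "[" + fmt + "/" + fmt.format(num_batches) + "]"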
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    for i, (images, target) in tqdm.tqdm(
        enumerate(train_loader), ascii=True, total=len(train_loader)
    ):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
            ### for MNIST
            # images = images.expand()
            # import pdb
            # pdb.set_trace()

        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer, prefix="train", global_step=t)

    return top1.avg, top5.avg
def validate(val_loader, model, criterion, args, writer, epoch):
    # batch_time = AverageMeter("Time", ":6.3f", write_val=False)
    losses = AverageMeter("Loss", ":.3f", write_val=False)
    top1 = AverageMeter("Acc@1", ":6.2f", write_val=False)
    top5 = AverageMeter("Acc@5", ":6.2f", write_val=False)
    # progress = ProgressMeter(
    #     len(val_loader), [batch_time, losses, top1, top5], prefix="Test: "
    # )
    progress = ProgressMeter(len(val_loader), [losses, top1, top5], prefix="Test: ")

    # switch to evaluate mode
    model.eval()
    printModelScore(model, args)

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in tqdm.tqdm(
            enumerate(val_loader), ascii=True, total=len(val_loader)
        ):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # measure elapsed time
            # batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        progress.display(len(val_loader))

    if writer is not None:
        progress.write_to_tensorboard(writer, prefix="test", global_step=epoch)

    return top1.avg, top5.avg, losses.avg
def train(train_loader, model, criterion, optimizer, epoch, cfg, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    progress = ProgressMeter(
        train_loader.num_batches,
        [batch_time, data_time, losses],
        cfg,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    # batch_size = train_loader.batch_size
    num_batches = train_loader.num_batches
    end = time.time()
    batch_size = train_loader.batch_size
    for i, data in enumerate(train_loader):
        imgs1, imgs2, target = (
            data[0][0].cuda(non_blocking=True),
            data[0][1].cuda(non_blocking=True),
            data[1].long().squeeze().cuda(non_blocking=True),
        )

        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        emb1, emb2 = model(imgs1, imgs2)
        loss = criterion(emb1, emb2)
        losses.update(loss.item(), batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % cfg.print_freq == 0 or i == num_batches - 1:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer, prefix="train", global_step=t)
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer):
    print(" ->->->->->->->->->-> One epoch with Natural training <-<-<-<-<-<-<-<-<-<-")

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(
                "Pixel range for training images : [{}, {}]".format(
                    torch.min(images).data.cpu().numpy(),
                    torch.max(images).data.cpu().numpy(),
                )
            )

        # stability-loss
        if args.dataset == "imagenet":
            std = (
                torch.tensor([0.229, 0.224, 0.225])
                .unsqueeze(0)
                .unsqueeze(-1)
                .unsqueeze(-1)
            ).to(device)
            noise = (torch.randn_like(images) / std).to(device) * args.noise_std
            output = model(images + noise)
            loss = nn.CrossEntropyLoss()(output, target)
        else:
            output = model(images)
            loss_natural = nn.CrossEntropyLoss()(output, target)
            loss_robust = (1.0 / len(images)) * nn.KLDivLoss(size_average=False)(
                F.log_softmax(
                    model(images + torch.randn_like(images).to(device) * args.noise_std),
                    dim=1,
                ),
                F.softmax(output, dim=1),
            )
            loss = loss_natural + args.beta * loss_robust

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train", epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
def main_worker(args):
    # NEW: equivalent to MPI init.
    print("world size ", os.environ['OMPI_COMM_WORLD_SIZE'])
    print("rank ", os.environ['OMPI_COMM_WORLD_RANK'])
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=int(os.environ['OMPI_COMM_WORLD_SIZE']),
        rank=int(os.environ['OMPI_COMM_WORLD_RANK']),
    )

    # NEW: lookup number of ranks in the job, and our rank
    args.world_size = torch.distributed.get_world_size()
    print("world size ", args.world_size)
    args.rank = torch.distributed.get_rank()
    print("rank ", args.rank)
    ngpus_per_node = torch.cuda.device_count()
    print("ngpus_per_node ", ngpus_per_node)
    local_rank = args.rank % ngpus_per_node
    print("local_rank ", local_rank)

    # NEW: Globalize variables
    global best_acc1
    global best_acc5
    global best_train_acc1
    global best_train_acc5

    # args.gpu = None
    # NEW: Specify gpu
    args.gpu = local_rank

    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)

    # NEW: Distributed data
    # if args.distributed:
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    # model = set_gpu(args, model)
    # NEW: Modified function for loading gpus on multinode setups
    model = lassen_set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        # criterion = nn.CrossEntropyLoss().cuda()
        # NEW: Specify gpu
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(
            data.val_loader, model, criterion, args, writer=None, epoch=args.start_epoch
        )
        return

    # Set up directories
    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
        args.ckpt_base_dir = ckpt_base_dir

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        writer = SummaryWriter(log_dir=log_base_dir)
    else:
        writer = None

    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        progress_overall = ProgressMeter(
            1, [epoch_time, validation_time, train_time], prefix="Overall Timing"
        )

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        save_checkpoint(
            {
                "epoch": 0,
                "arch": args.arch,
                "state_dict": model.state_dict(),
                "best_acc1": best_acc1,
                "best_acc5": best_acc5,
                "best_train_acc1": best_train_acc1,
                "best_train_acc5": best_train_acc5,
                "optimizer": optimizer.state_dict(),
                "curr_acc1": acc1 if acc1 else "Not evaluated",
            },
            False,
            filename=ckpt_base_dir / f"initial.state",
            save=False,
        )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        # NEW: Distributed data
        # if args.distributed:
        data.train_sampler.set_epoch(epoch)
        data.val_sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        # modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(
            data.train_loader, model, criterion, optimizer, epoch, args, writer=writer
        )
        # train_acc1, train_acc5 = train(
        #     data.train_loader, model, criterion, optimizer, epoch, args, writer=None
        # )
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        # NEW: Only write values to tensorboard for main processor (one with global rank 0)
        if args.rank == 0:
            acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args, None, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            if is_best or save or epoch == args.epochs - 1:
                if is_best:
                    print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")

                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "arch": args.arch,
                        "state_dict": model.state_dict(),
                        "best_acc1": best_acc1,
                        "best_acc5": best_acc5,
                        "best_train_acc1": best_train_acc1,
                        "best_train_acc5": best_train_acc5,
                        "optimizer": optimizer.state_dict(),
                        "curr_acc1": acc1,
                        "curr_acc5": acc5,
                    },
                    is_best,
                    filename=ckpt_base_dir / f"epoch_most_recent.state",
                    save=save,
                )
                # filename=ckpt_base_dir / f"epoch_{epoch}.state",

        epoch_time.update((time.time() - end_epoch) / 60)

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            progress_overall.display(epoch)
            progress_overall.write_to_tensorboard(
                writer, prefix="diagnostics", global_step=epoch
            )

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # avg pr across 10 samples
                    pr = 0.0
                    for _ in range(10):
                        pr += (
                            (torch.rand_like(m.clamped_scores) >= m.clamped_scores)
                            .float()
                            .mean()
                            .item()
                        )
                    pr /= 10.0
                    if writer is not None:  # writer exists only on rank 0
                        writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1

            args.prune_rate = sum_pr / count
            if writer is not None:
                writer.add_scalar("pr/average", args.prune_rate, epoch)

        # NEW: Only do for main processor (one with global rank 0)
        if args.rank == 0:
            writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    # NEW: Only do for main processor (one with global rank 0)
    if args.rank == 0:
        write_result_to_csv(
            best_acc1=best_acc1,
            best_acc5=best_acc5,
            best_train_acc1=best_train_acc1,
            best_train_acc5=best_train_acc5,
            prune_rate=args.prune_rate,
            curr_acc1=acc1,
            curr_acc5=acc5,
            base_config=args.config,
            name=args.name,
        )
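# `get_lr` above is assumed to read the current learning rate from the
# optimizer; a minimal sketch (hypothetical):
def get_lr(optimizer):
    return optimizer.param_groups[0]["lr"]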
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer):
    print(" ->->->->->->->->->-> One epoch with Natural training <-<-<-<-<-<-<-<-<-<-")

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(
                "Pixel range for training images : [{}, {}]".format(
                    torch.min(images).data.cpu().numpy(),
                    torch.max(images).data.cpu().numpy(),
                )
            )

        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train", epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
def main_worker(args):
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)
    wandb.watch(model)

    if args.pretrained:
        pretrained(args, model)

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(
            data.val_loader, model, criterion, args, writer=None, epoch=args.start_epoch
        )
        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], prefix="Overall Timing"
    )

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / f"initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(
            data.train_loader, model, criterion, optimizer, epoch, args, writer=writer
        )
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)
        save = ((epoch % args.save_every) == 0) and args.save_every > 0

        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )
            wandb.log({
                "curr_acc1": acc1,
                "curr_acc5": acc5,
            })

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics", global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # avg pr across 10 samples
                    pr = 0.0
                    for _ in range(10):
                        pr += (
                            (torch.rand_like(m.clamped_scores) >= m.clamped_scores)
                            .float()
                            .mean()
                            .item()
                        )
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1

            args.prune_rate = sum_pr / count
            writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
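# `LabelSmoothing` used by the main_worker variants above is assumed to be the
# usual NLL-with-label-smoothing criterion; a minimal sketch (hypothetical):
import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothing(nn.Module):
    """NLL loss with label smoothing."""

    def __init__(self, smoothing=0.0):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = F.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()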
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    # batch_time = AverageMeter("Time", ":6.3f")
    # data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    # meters = [batch_time, data_time, losses, top1, top5]
    meters = [losses, top1, top5]
    progress = ProgressMeter(
        len(train_loader),
        meters,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    image0, target0 = None, None
    for i, (images, target) in tqdm.tqdm(
        enumerate(train_loader), ascii=True, total=len(train_loader)
    ):
        # if i == 0:
        image0 = images
        target0 = target

        # measure data loading time
        # data_time.update(time.time() - end)

        if args.gpu is not None:
            image0 = image0.cuda(args.gpu, non_blocking=True)
            target0 = target0.cuda(args.gpu, non_blocking=True)

        # average the loss and accuracies over K stochastic forward passes
        avg_loss = 0
        a1 = 0
        a5 = 0
        for j in range(args.K):
            output = model(image0)
            loss = criterion(output, target0)
            acc1, acc5 = accuracy(output, target0, topk=(1, 5))
            avg_loss = avg_loss + loss
            a1 = a1 + acc1
            a5 = a5 + acc5
        avg_loss = avg_loss / args.K
        a1 = a1 / args.K
        a5 = a5 / args.K

        # measure accuracy and record loss
        # torch.Size([128, 3, 32, 32]) # 128
        losses.update(avg_loss.item(), image0.size(0))
        top1.update(a1.item(), image0.size(0))
        top5.update(a5.item(), image0.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if args.conv_type != "SFESubnetConv":
            avg_loss.backward()
        else:
            updateScoreDiff(model, avg_loss)
            # printModelScore(model, args)
        optimizer.step()

        # measure elapsed time
        # batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer, prefix="train", global_step=t)

    return top1.avg, top5.avg
def train(train_loader, model, criterion, optimizer, epoch, cfg, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        train_loader.num_batches,
        [batch_time, data_time, losses, top1, top5],
        cfg,
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = train_loader.num_batches
    end = time.time()
    for i, data in enumerate(train_loader):
        # images, target = data[0]['data'], data[0]['label'].long().squeeze()
        images, target = data[0].cuda(), data[1].long().squeeze().cuda()

        # measure data loading time
        data_time.update(time.time() - end)

        if cfg.cs_kd:
            batch_size = images.size(0)
            loss_batch_size = batch_size // 2
            targets_ = target[:batch_size // 2]
            outputs = model(images[:batch_size // 2])
            loss = torch.mean(criterion(outputs, targets_))
            # loss += loss.item()

            with torch.no_grad():
                outputs_cls = model(images[batch_size // 2:])
            cls_loss = kdloss(outputs[:batch_size // 2], outputs_cls.detach())
            lamda = 3
            loss += lamda * cls_loss
            acc1, acc5 = accuracy(outputs, targets_, topk=(1, 5))
        else:
            batch_size = images.size(0)
            loss_batch_size = batch_size
            # compute output
            output = model(images)
            loss = criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))

        # print(i, batch_size, loss)
        # measure accuracy and record loss
        losses.update(loss.item(), loss_batch_size)
        top1.update(acc1.item(), loss_batch_size)
        top5.update(acc5.item(), loss_batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % cfg.print_freq == 0 or i == num_batches - 1:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            progress.write_to_tensorboard(writer, prefix="train", global_step=t)

    # train_loader.reset()
    # print(top1.count)
    return top1.avg, top5.avg
def smooth(model, device, val_loader, criterion, args, writer, epoch=0):
    """
    Evaluating on validation set inputs with additive Gaussian noise
    (randomized smoothing).
    """
    batch_time = AverageMeter("Time", ":6.3f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    rad = AverageMeter("rad", ":6.2f")
    progress = ProgressMeter(
        len(val_loader), [batch_time, top1, top5, rad], prefix="Smooth (eval): "
    )

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # Default: evaluate on 10 random samples of additive gaussian noise.
            output = []
            for _ in range(10):
                # add noise
                if args.dataset == "imagenet":
                    std = (
                        torch.tensor([0.229, 0.224, 0.225])
                        .unsqueeze(0)
                        .unsqueeze(-1)
                        .unsqueeze(-1)
                    ).to(device)
                    noise = (torch.randn_like(images) / std).to(device) * args.noise_std
                else:
                    noise = torch.randn_like(images).to(device) * args.noise_std

                output.append(F.softmax(model(images + noise), -1))

            # average the softmax outputs so that p_max is a probability
            # (norm.ppf is only defined on (0, 1))
            output = torch.mean(torch.stack(output), axis=0)
            p_max, _ = output.max(dim=-1)
            radii = (args.noise_std + 1e-16) * norm.ppf(p_max.data.cpu().numpy())

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            rad.update(np.mean(radii))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)
                if writer:
                    progress.write_to_tensorboard(
                        writer, "test", epoch * len(val_loader) + i
                    )

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "Adv-test-images",
                    torchvision.utils.make_grid(images[0 : len(images) // 4]),
                )

    progress.display(i)  # print final results

    return top1.avg, rad.avg
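# The radius in `smooth` follows randomized smoothing (Cohen et al., 2019):
# with smoothed top-class probability p_max and noise level sigma, the
# certified L2 radius is sigma * Phi^{-1}(p_max). A minimal sketch of that
# relationship (hypothetical helper; `norm` is scipy.stats.norm, as in the
# function above):
from scipy.stats import norm

def certified_radius(sigma, p_max):
    # Phi^{-1} is the inverse standard-normal CDF; p_max must lie in (0, 1)
    return sigma * norm.ppf(p_max)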
def ibp(model, device, val_loader, criterion, args, writer, epoch=0):
    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ibp_losses = AverageMeter("IBP_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    ibp_top1 = AverageMeter("IBP-Acc_1", ":6.2f")
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, ibp_losses, top1, top5, ibp_top1],
        prefix="Test: ",
    )

    # switch to evaluation mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            images, target = data[0].to(device), data[1].to(device)

            # clean images: measure accuracy and record loss
            output = model(images)
            loss = criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # verified loss and error from interval bound propagation
            rce, rerr = naive_interval_analyze(
                model,
                args.epsilon,
                images,
                target,
                use_cuda=torch.cuda.is_available(),
                parallel=False,
            )
            ibp_losses.update(rce.item(), images.size(0))
            ibp_top1.update((1 - rerr) * 100.0, images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % args.print_freq == 0:
                progress.display(i)
                if writer:
                    progress.write_to_tensorboard(
                        writer, "test", epoch * len(val_loader) + i
                    )

            # write a sample of test images to tensorboard (helpful for debugging)
            if i == 0 and writer:
                writer.add_image(
                    "Adv-test-images",
                    torchvision.utils.make_grid(images[0 : len(images) // 4]),
                )

    progress.display(i)  # print final results

    return ibp_top1.avg, ibp_top1.avg
def train(
    model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer=None
):
    assert (
        not args.normalize
    ), "Explicit normalization is done in the training loop, Dataset should have [0, 1] dynamic range."

    global_noise_data = torch.zeros(
        [args.batch_size, 3, args.image_dim, args.image_dim]
    ).to(device)
    mean = torch.Tensor(np.array(args.mean)[:, np.newaxis, np.newaxis])
    mean = mean.expand(3, args.image_dim, args.image_dim).to(device)
    std = torch.Tensor(np.array(args.std)[:, np.newaxis, np.newaxis])
    std = std.expand(3, args.image_dim, args.image_dim).to(device)

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    # switch to train mode
    model.train()

    for i, (input, target) in enumerate(train_loader):
        end = time.time()
        input = input.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)
        data_time.update(time.time() - end)

        for _ in range(args.n_repeats):
            # Ascend on the global noise
            noise_batch = Variable(
                global_noise_data[0 : input.size(0)], requires_grad=True
            ).to(device)
            in1 = input + noise_batch
            in1.clamp_(0, 1.0)
            in1.sub_(mean).div_(std)
            output = model(in1)
            loss = criterion(output, target)

            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()

            # Update the noise for the next iteration
            pert = fgsm(noise_batch.grad, args.epsilon)
            global_noise_data[0 : input.size(0)] += pert.data
            global_noise_data.clamp_(-args.epsilon, args.epsilon)

            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train", epoch * len(train_loader) + i)

        if i == 0:
            print(
                in1.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(in1), torch.max(in1)]}")

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(input[0 : len(input) // 4]),
            )
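# `fgsm` above is assumed to be the one-step signed-gradient ascent used in
# "free" adversarial training; a minimal sketch (hypothetical):
import torch

def fgsm(gradz, step_size):
    # move in the direction that increases the loss, scaled by the step size
    return step_size * torch.sign(gradz)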
def evaluate(epoch, data, model, criterion):
    """Main evaluation procedure.

    Arguments:
        epoch -- current epoch
        data -- DataLoader which can provide validation batches
        model -- model to be evaluated
        criterion -- instance of loss function to measure performance
    """
    text_logger = logging.getLogger(__name__)
    model.eval()

    # initialize counters, etc.
    mcd, mcd_count = 0, 0
    cla, cla_count = 0, 0
    eval_losses = {}
    total_loss = AverageMeter('Total Loss', ':.4e')
    mel_pre_loss = AverageMeter('Mel Pre Loss', ':.4e')
    mel_post_loss = AverageMeter('Mel Post Loss', ':.4e')
    lang_class_acc = AverageMeter('Lang Class Acc', ':.4e')
    progress = ProgressMeter(len(data), total_loss, mel_pre_loss, mel_post_loss,
                             lang_class_acc, prefix="Epoch: [{}]".format(epoch),
                             logger=text_logger)

    # loop through epoch batches
    with torch.no_grad():
        for i, batch in enumerate(data):
            # parse batch
            batch = list(map(to_gpu, batch))
            src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

            # run the model (twice, with and without teacher forcing)
            post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 1.0)
            post_pred_0, _, stop_pred_0, alignment_0, _, _ = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 0.0)
            stop_pred_probs = torch.sigmoid(stop_pred_0)

            # evaluate loss function
            post_trg = trg_lin if hp.predict_linear else trg_mel
            classifier = model._reversal_classifier if hp.reversal_classifier else None
            loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                           post_pred, post_trg, stop_pred, stop_trg,
                                           alignment, spkrs, spkrs_pred, enc_output,
                                           classifier)
            total_loss.update(loss, src.size(0))
            mel_pre_loss.update(batch_losses['mel_pre'], src.size(0))
            mel_post_loss.update(batch_losses['mel_pos'], src.size(0))

            # compute mel cepstral distortion
            for j, (gen, ref, stop) in enumerate(
                    zip(post_pred_0, trg_mel, stop_pred_probs)):
                stop_idxes = np.where(stop.cpu().numpy() > 0.5)[0]
                stop_idx = min(
                    np.min(stop_idxes) + hp.stop_frames, gen.size()[1]
                ) if len(stop_idxes) > 0 else gen.size()[1]
                gen = gen[:, :stop_idx].data.cpu().numpy()
                ref = ref[:, :trg_len[j]].data.cpu().numpy()
                if hp.normalize_spectrogram:
                    gen = audio.denormalize_spectrogram(gen, not hp.predict_linear)
                    ref = audio.denormalize_spectrogram(ref, True)
                if hp.predict_linear:
                    gen = audio.linear_to_mel(gen)
                mcd = (mcd_count * mcd + audio.mel_cepstral_distorision(
                    gen, ref, 'dtw')) / (mcd_count + 1)
                mcd_count += 1

            # compute adversarial classifier accuracy
            if hp.reversal_classifier:
                input_mask = lengths_to_mask(src_len)
                trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
                for s in range(hp.speaker_number):
                    speaker_mask = (spkrs == s)
                    trg_spkrs[speaker_mask] = s
                matches = (trg_spkrs == torch.argmax(
                    torch.nn.functional.softmax(spkrs_pred, dim=-1), dim=-1))
                matches[~input_mask] = False
                cla = (cla_count * cla + torch.sum(matches).item() /
                       torch.sum(input_mask).item()) / (cla_count + 1)
                cla_count += 1
                lang_class_acc.update(cla, src.size(0))

            # add batch losses to epoch losses
            for k, v in batch_losses.items():
                eval_losses[k] = v + eval_losses[k] if k in eval_losses else v

    # normalize loss per batch
    for k in eval_losses.keys():
        eval_losses[k] /= len(data)

    # log evaluation
    progress.print(i)
    Logger.evaluation(epoch + 1, eval_losses, mcd, src_len, trg_len, src, post_trg,
                      post_pred, post_pred_0, stop_pred_probs, stop_trg, alignment_0,
                      cla)

    return sum(eval_losses.values())
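# `lengths_to_mask` above is assumed to turn a 1-D tensor of sequence lengths
# into a boolean (batch, max_len) mask; a minimal sketch (hypothetical):
import torch

def lengths_to_mask(lengths, max_len=None):
    max_len = max_len or int(torch.max(lengths).item())
    return torch.arange(max_len, device=lengths.device)[None, :] < lengths[:, None]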
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.3f")
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix=f"Epoch: [{epoch}]",
    )

    # switch to train mode
    model.train()

    batch_size = train_loader.batch_size
    num_batches = len(train_loader)
    end = time.time()
    for i, (images, target) in tqdm.tqdm(
        enumerate(train_loader), ascii=True, total=len(train_loader)
    ):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        # Write scores and weights to tensorboard at beginning of every other epoch
        if args.histograms:
            if (i % (num_batches * batch_size) == 0) and (epoch % 2 == 0):
                for param_name in model.state_dict():
                    # print(param_name)
                    # Only write scores for now (not weights and batch norm
                    # parameters since the pytorch parms don't actually change)
                    # if 'score' not in param_name:
                    # if 'score' in param_name or 'weight' in param_name:
                    # print(param_name, model.state_dict()[param_name])
                    writer.add_histogram(param_name, model.state_dict()[param_name], epoch)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        loss.backward()

        # EDITED
        # print(torch.norm(torch.cat([p.grad.view(-1) for p in model.parameters()])))
        if args.grad_clip:
            torch.nn.utils.clip_grad_value_(model.parameters(), 1)
        # print(torch.norm(torch.cat([p.grad.view(-1) for p in model.parameters()])))
        # for param_name in model.state_dict(): print(param_name, str(model.state_dict()[param_name])[:50])
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        # end

        optimizer.step()

        # Clamp updated scores to [-1,1] only when using binarized/quantized activations
        # for param_name in model.state_dict():
        #     if 'score' in param_name:
        #         # print(param_name)
        #         scores = model.state_dict()[param_name]
        #         # scores = torch.clamp(scores, min=-1.0, max=1.0)
        #         scores.clamp_(min=-1.0, max=1.0)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print(model.state_dict()['module.linear.3.scores'].grad)
        # params = list(model.parameters())
        # print(params[1].grad)

        if i % args.print_freq == 0:
            t = (num_batches * epoch + i) * batch_size
            progress.display(i)
            # _, predicted = torch.max(output, 1)
            progress.write_to_tensorboard(writer, prefix="train", global_step=t)

        # Write score gradients to tensorboard at end of every other epoch
        if args.histograms:
            if (i % (num_batches * batch_size) == 0) and (epoch % 2 == 0):
                # if ((i + 1) % (num_batches - 1) == 0) and (epoch % 2 == 0):
                params = list(model.parameters())
                param_names = list(model.state_dict())
                for j in range(len(params)):
                    if params[j].grad is not None:
                        # if 'score' in param_names[j] or 'weight' in param_names[j]:
                        # if 'score' not in param_name and params[j].grad is not None:
                        # print(param_names[j])
                        # print(params[j].grad)
                        writer.add_histogram(param_names[j] + '.grad', params[j].grad, epoch)
                    else:
                        writer.add_histogram(param_names[j] + '.grad', 0, epoch)
                # for param_name in model.state_dict():
                #     if 'score' in param_name:
                #         writer.add_histogram(param_name + '.grad',
                #                              model.state_dict()[param_name].grad, epoch)
                # params = list(model.parameters())
                # for j in range(len(params)):
                #     writer.add_histogram('Layer' + str(j) + 'grad', params[j].grad, epoch)

    # Write final scores and weights to tensorboard
    if args.histograms:
        for param_name in model.state_dict():
            # writer.add_histogram(param_name, model.state_dict()[param_name], epoch)
            # Only write scores for now (not weights and batch norm parameters
            # since the pytorch parms don't actually change)
            # if 'score' not in param_name:
            # print(param_name, model.state_dict()[param_name])
            writer.add_histogram(param_name, model.state_dict()[param_name], epoch)

    return top1.avg, top5.avg
def train(logging_start_epoch, epoch, data, model, criterion, optimizer):
    """Main training procedure.

    Arguments:
        logging_start_epoch -- number of the first epoch to be logged
        epoch -- current epoch
        data -- DataLoader which can provide batches for an epoch
        model -- model to be trained
        criterion -- instance of loss function to be optimized
        optimizer -- instance of optimizer which will be used for parameter updates
    """
    text_logger = logging.getLogger(__name__)
    model.train()

    # initialize counters, etc.
    learning_rate = optimizer.param_groups[0]['lr']
    cla = 0
    done, start_time = 0, time.time()
    total_loss = AverageMeter('Total Loss', ':.4e')
    mel_pre_loss = AverageMeter('Mel Pre Loss', ':.4e')
    mel_post_loss = AverageMeter('Mel Post Loss', ':.4e')
    lang_class_acc = AverageMeter('Lang Class Acc', ':.4e')
    progress = ProgressMeter(len(data), total_loss, mel_pre_loss, mel_post_loss,
                             lang_class_acc, prefix="Epoch: [{}]".format(epoch),
                             logger=text_logger)

    # loop through epoch batches
    for i, batch in enumerate(data):
        global_step = done + epoch * len(data)
        optimizer.zero_grad()

        # parse batch
        batch = list(map(to_gpu, batch))
        src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

        # get teacher forcing ratio
        if hp.constant_teacher_forcing:
            tf = hp.teacher_forcing
        else:
            tf = cos_decay(
                max(global_step - hp.teacher_forcing_start_steps, 0),
                hp.teacher_forcing_steps)

        # run the model
        post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
            src, src_len, trg_mel, trg_len, spkrs, langs, tf)

        # evaluate loss function
        post_trg = trg_lin if hp.predict_linear else trg_mel
        classifier = model._reversal_classifier if hp.reversal_classifier else None
        loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel, post_pred,
                                       post_trg, stop_pred, stop_trg, alignment,
                                       spkrs, spkrs_pred, enc_output, classifier)
        total_loss.update(loss, src.size(0))
        mel_pre_loss.update(batch_losses['mel_pre'], src.size(0))
        mel_post_loss.update(batch_losses['mel_pos'], src.size(0))

        # evaluate adversarial classifier accuracy, if present
        if hp.reversal_classifier:
            input_mask = lengths_to_mask(src_len)
            trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
            for s in range(hp.speaker_number):
                speaker_mask = (spkrs == s)
                trg_spkrs[speaker_mask] = s
            matches = (trg_spkrs == torch.argmax(
                torch.nn.functional.softmax(spkrs_pred, dim=-1), dim=-1))
            matches[~input_mask] = False
            cla = torch.sum(matches).item() / torch.sum(input_mask).item()
            lang_class_acc.update(cla, src.size(0))

        # compute gradients and make a step
        loss.backward()
        gradient = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                  hp.gradient_clipping)
        optimizer.step()

        # log training progress
        if epoch >= logging_start_epoch:
            Logger.training(global_step, batch_losses, gradient, learning_rate,
                            time.time() - start_time, cla)
        progress.print(i)

        # update criterion states (params and decay of the loss and so on ...)
        criterion.update_states()

        start_time = time.time()
        done += 1
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer):
    print(
        " ->->->->->->->->->-> One epoch with Adversarial training (TRADES) <-<-<-<-<-<-<-<-<-<-"
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(images), torch.max(images)]}")

        output = model(images)

        # calculate robust loss
        loss = pgd_loss(
            model=model,
            x_natural=images,
            y=target,
            device=device,
            optimizer=optimizer,
            step_size=args.step_size,
            epsilon=args.epsilon,
            perturb_steps=args.num_steps,
            beta=args.beta,
            clip_min=args.clip_min,
            clip_max=args.clip_max,
            distance=args.distance,
        )

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train", epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
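# `pgd_loss` above is used as a TRADES-style robust loss. A minimal sketch
# under the standard TRADES formulation (hypothetical; parameter names mirror
# the call site above, and only the l_inf case of `distance` is shown):
import torch
import torch.nn.functional as F
from torch import nn

def pgd_loss(model, x_natural, y, device, optimizer, step_size, epsilon,
             perturb_steps, beta, clip_min=0.0, clip_max=1.0, distance="l_inf"):
    criterion_kl = nn.KLDivLoss(reduction="sum")
    batch_size = len(x_natural)

    # inner maximization: find x_adv that maximizes KL to the natural output
    model.eval()
    x_adv = x_natural.detach() + 0.001 * torch.randn_like(x_natural).to(device)
    for _ in range(perturb_steps):
        x_adv.requires_grad_()
        with torch.enable_grad():
            loss_kl = criterion_kl(F.log_softmax(model(x_adv), dim=1),
                                   F.softmax(model(x_natural), dim=1))
        grad = torch.autograd.grad(loss_kl, [x_adv])[0]
        x_adv = x_adv.detach() + step_size * torch.sign(grad.detach())
        x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon)
        x_adv = torch.clamp(x_adv, clip_min, clip_max)

    # outer minimization: natural loss + beta * robustness (KL) term
    model.train()
    x_adv = torch.clamp(x_adv, clip_min, clip_max).detach()
    optimizer.zero_grad()
    logits = model(x_natural)
    loss_natural = F.cross_entropy(logits, y)
    loss_robust = (1.0 / batch_size) * criterion_kl(
        F.log_softmax(model(x_adv), dim=1), F.softmax(logits, dim=1))
    return loss_natural + beta * loss_robust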
def freeadv(model, device, val_loader, criterion, args, writer, epoch=0):
    assert (
        not args.normalize
    ), "Explicit normalization is done in the training loop, Dataset should have [0, 1] dynamic range."

    # Mean/Std for normalization
    mean = torch.Tensor(np.array(args.mean)[:, np.newaxis, np.newaxis])
    mean = mean.expand(3, args.image_dim, args.image_dim).to(device)
    std = torch.Tensor(np.array(args.std)[:, np.newaxis, np.newaxis])
    std = std.expand(3, args.image_dim, args.image_dim).to(device)

    batch_time = AverageMeter("Time", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    top5 = AverageMeter("Acc_5", ":6.2f")
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix="Test: ",
    )

    eps = args.epsilon
    K = args.num_steps
    step = args.step_size

    model.eval()
    end = time.time()
    print(" PGD eps: {}, num-steps: {}, step-size: {} ".format(eps, K, step))
    for i, (input, target) in enumerate(val_loader):
        input = input.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        orig_input = input.clone()
        randn = torch.FloatTensor(input.size()).uniform_(-eps, eps).to(device)
        input += randn
        input.clamp_(0, 1.0)
        for _ in range(K):
            invar = Variable(input, requires_grad=True)
            in1 = invar - mean
            in1.div_(std)
            output = model(in1)
            ascend_loss = criterion(output, target)
            ascend_grad = torch.autograd.grad(ascend_loss, invar)[0]
            pert = fgsm(ascend_grad, step)
            # Apply perturbation
            input += pert.data
            input = torch.max(orig_input - eps, input)
            input = torch.min(orig_input + eps, input)
            input.clamp_(0, 1.0)

        input.sub_(mean).div_(std)
        with torch.no_grad():
            # compute output
            output = model(input)
            loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if (i + 1) % args.print_freq == 0:
            progress.display(i)
            if writer:
                progress.write_to_tensorboard(writer, "test", epoch * len(val_loader) + i)

        # write a sample of test images to tensorboard (helpful for debugging)
        if i == 0 and writer:
            writer.add_image(
                "Adv-test-images",
                torchvision.utils.make_grid(input[0 : len(input) // 4]),
            )

    progress.display(i)  # print final results

    return top1.avg, top5.avg
def main_worker(args):
    args.gpu = None

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)

    # Loading pretrained model
    if args.pretrained:
        pretrained(args, model)

        # Saving a DenseConv (nn.Conv2d) compatible model
        if args.dense_conv_model:
            print(
                f"==> DenseConv compatible model, saving at {ckpt_base_dir / 'model_best.pth'}"
            )
            save_checkpoint(
                {
                    "epoch": 0,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                },
                True,
                filename=ckpt_base_dir / f"epoch_pretrained.state",
                save=True,
            )
            return

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        best_acc1 = resume(args, model, optimizer)

    # Evaluation of a model
    if args.evaluate:
        acc1, acc5 = validate(
            data.val_loader, model, criterion, args, writer=None, epoch=args.start_epoch
        )
        return

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], prefix="Overall Timing"
    )

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / f"initial.state",
        save=False,
    )

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        cur_lr = get_lr(optimizer)

        # Gradual pruning in GMP experiments
        if (
            args.conv_type == "GMPConv"
            and epoch >= args.init_prune_epoch
            and epoch <= args.final_prune_epoch
        ):
            total_prune_epochs = args.final_prune_epoch - args.init_prune_epoch + 1
            for n, m in model.named_modules():
                if hasattr(m, 'set_curr_prune_rate'):
                    prune_decay = (
                        1 - ((args.curr_prune_epoch - args.init_prune_epoch) / total_prune_epochs)
                    ) ** 3
                    curr_prune_rate = m.prune_rate - (m.prune_rate * prune_decay)
                    m.set_curr_prune_rate(curr_prune_rate)

        # train for one epoch
        start_train = time.time()
        train_acc1, train_acc5 = train(
            data.train_loader, model, criterion, optimizer, epoch, args, writer=writer
        )
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0

        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics", global_step=epoch)

        # Storing sparsity and threshold statistics for STRConv models
        if args.conv_type == "STRConv":
            count = 0
            sum_sparse = 0.0
            for n, m in model.named_modules():
                if isinstance(m, STRConv):
                    sparsity, total_params, thresh = m.getSparsity()
                    writer.add_scalar("sparsity/{}".format(n), sparsity, epoch)
                    writer.add_scalar("thresh/{}".format(n), thresh, epoch)
                    sum_sparse += int(((100 - sparsity) / 100) * total_params)
                    count += total_params
            total_sparsity = 100 - (100 * sum_sparse / count)
            writer.add_scalar("sparsity/total", total_sparsity, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )

    if args.conv_type == "STRConv":
        json_data = {}
        json_thres = {}
        # re-initialize totals; they would otherwise carry over from the last epoch
        count = 0
        sum_sparse = 0.0
        for n, m in model.named_modules():
            if isinstance(m, STRConv):
                sparsity = m.getSparsity()
                json_data[n] = sparsity[0]
                sum_sparse += int(((100 - sparsity[0]) / 100) * sparsity[1])
                count += sparsity[1]
                json_thres[n] = sparsity[2]
        json_data["total"] = 100 - (100 * sum_sparse / count)
        if not os.path.exists("runs/layerwise_sparsity"):
            os.mkdir("runs/layerwise_sparsity")
        if not os.path.exists("runs/layerwise_threshold"):
            os.mkdir("runs/layerwise_threshold")
        with open("runs/layerwise_sparsity/{}.json".format(args.name), "w") as f:
            json.dump(json_data, f)
        with open("runs/layerwise_threshold/{}.json".format(args.name), "w") as f:
            json.dump(json_thres, f)
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer):
    epsilon = set_epsilon(args, epoch)
    k = args.mixtraink
    alpha = 0.8
    iw = set_interval_weight(args, epoch)

    print(
        " ->->->->->->->->->-> One epoch with MixTrain{} (SYM {:.3f})"
        " <-<-<-<-<-<-<-<-<-<-".format(k, epsilon)
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    sym_losses = AverageMeter("Sym_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    sym1 = AverageMeter("Sym1", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, sym_losses, top1, sym1],
        prefix="Epoch: [{}]".format(epoch),
    )

    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(images), torch.max(images)]}")

        output = model(images)
        ce = nn.CrossEntropyLoss()(output, target)

        if np.random.uniform() <= alpha:
            r = np.random.randint(low=0, high=images.shape[0], size=k)
            rce, rerr = sym_interval_analyze(
                model,
                epsilon,
                images[r],
                target[r],
                use_cuda=torch.cuda.is_available(),
                parallel=False,
            )
            # print("sym:", rce.item(), ce.item())
            loss = iw * rce + ce
            sym_losses.update(rce.item(), k)
            sym1.update((1 - rerr) * 100., images.size(0))
        else:
            loss = ce

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1[0], images.size(0))
        losses.update(ce.item(), images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train", epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
def dann_train(feature_extractor: FeatureExtractor,
               domain_adv: DomainAdversarialLoss,
               src_iter: ForeverDataIterator,
               tar_iter: ForeverDataIterator,
               src_val_loader, tar_val_loader, args):
    optimizer = Adam(
        itertools.chain(feature_extractor.parameters(), domain_adv.parameters()),
        lr=args.lr, weight_decay=args.weight_decay
    )
    npair_loss = NPairsLoss()  # n pair loss
    epoch = args.epoch
    iter_per_epoch = args.iter_per_epoch
    writer = args.writer  # Summary Writer
    logger = args.logger
    device = args.device
    w_da = args.w_da
    model_dir = args.model_dir

    # loss
    loss_rec = AverageMeter('tot_loss', tb_tag='Loss/tot', writer=writer)
    loss_lb_rec = AverageMeter('lb_loss', tb_tag='Loss/lb', writer=writer)
    loss_lb_g_rec = AverageMeter('lb_g_loss', tb_tag='Loss/lb_g', writer=writer)
    loss_da_rec = AverageMeter('da_loss', tb_tag='Loss/da', writer=writer)
    # acc
    da_acc_rec = AverageMeter('da_acc', tb_tag='Acc/da', writer=writer)

    n_iter = 0
    best_nmi = 0
    for e_i in range(epoch):
        feature_extractor.train()
        domain_adv.train()
        progress = ProgressMeter(
            iter_per_epoch,
            [loss_lb_g_rec, loss_lb_rec, loss_da_rec, da_acc_rec],
            prefix="Epoch: [{}]".format(e_i),
            logger=logger
        )
        for i in range(iter_per_epoch):
            x_s, l_s = next(src_iter)
            x_t, l_t = next(tar_iter)
            # for obj in [x_s, x_t, l_s, l_t]:  # to device
            #     obj = obj.to(device)
            x_s, l_s, x_t, l_t = (
                x_s.to(device), l_s.to(device), x_t.to(device), l_t.to(device)
            )
            x = torch.cat((x_s, x_t), dim=0)
            f, g = feature_extractor(x)
            f_s, f_t = f.chunk(2, dim=0)
            g_s, g_t = g.chunk(2, dim=0)

            # source only part
            loss_s = npair_loss(f_s, l_s)  # get n-pair loss on source domain
            loss_s_g = npair_loss(g_s, l_s)  # get n-pair loss on source domain
            loss_lb_rec.update(loss_s.item(), x_s.size(0), iter=n_iter)
            loss_lb_g_rec.update(loss_s_g.item(), x_s.size(0), iter=n_iter)

            # dann
            # da_loss = domain_adv(f_s, f_t)
            da_loss = domain_adv(g_s, f_t)
            domain_acc = domain_adv.domain_discriminator_accuracy
            loss_da_rec.update(da_loss.item(), f.size(0), iter=n_iter)
            da_acc_rec.update(domain_acc.item(), f.size(0), iter=n_iter)

            loss = 0.5 * (loss_s + loss_s_g) + w_da * da_loss
            # loss = loss_s

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            n_iter += 1
            if i % args.print_freq == 0:
                progress.display(i)

        if e_i % 5 == 0:
            # logger.info(f"saving embedding in epoch{e_i}")
            # # show embedding
            # show_embedding(backbone, [src_val_loader], tag=f'src_{e_i}', epoch=e_i, writer=writer, device=device)
            # show_embedding(backbone, [tar_val_loader], tag=f'tar_{e_i}', epoch=e_i, writer=writer, device=device)
            nmi = NMI_eval(feature_extractor, src_val_loader, 5, device, type='src')
            logger.info(f'test on train set nmi: {nmi}')
            nmi = NMI_eval(feature_extractor, tar_val_loader, 5, device, type='tar')
            logger.info(f'test on test set nmi: {nmi}')
            if nmi > best_nmi:
                logger.info(f"save best model to {model_dir}")
                torch.save(feature_extractor.state_dict(),
                           os.path.join(model_dir, 'minst_best_model.pth'))
                best_nmi = nmi
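# `ForeverDataIterator` above is assumed to wrap a DataLoader and restart it
# when exhausted, so `next()` never raises StopIteration; a minimal sketch
# (hypothetical):
class ForeverDataIterator:
    """A data iterator that never stops producing batches."""

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.iter = iter(data_loader)

    def __next__(self):
        try:
            data = next(self.iter)
        except StopIteration:
            # restart the underlying loader and fetch the first batch again
            self.iter = iter(self.data_loader)
            data = next(self.iter)
        return data

    def __len__(self):
        return len(self.data_loader)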
def trn(cfg, model):
    cfg.logger.info(cfg)
    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate_knn = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    # if cfg.pretrained:
    #     net_utils.load_pretrained(cfg.pretrained, cfg.multigpu[0], model)

    optimizer = get_optimizer(cfg, model)
    cfg.logger.info(f"=> Getting {cfg.set} dataset")
    dataset = getattr(data, cfg.set)(cfg)
    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    if cfg.arch == 'SimSiam':
        # L = D(p1, z2) / 2 + D(p2, z1) / 2
        base_criterion = lambda bb1_z1_p1_emb, bb2_z2_p2_emb: \
            simsiam.SimSaimLoss(bb1_z1_p1_emb[2], bb2_z2_p2_emb[1]) / 2 + \
            simsiam.SimSaimLoss(bb2_z2_p2_emb[2], bb1_z1_p1_emb[1]) / 2
    elif cfg.arch == 'SimCLR':
        base_criterion = lambda z1, z2: simclr.NT_XentLoss(z1, z2)
    else:
        raise NotImplementedError

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(cfg, cfg.gpu)
    _, zero_gpu_ckpt_base_dir, _ = path_utils.get_directories(cfg, 0)

    # Resume from the most recent checkpoint saved by GPU 0, if one exists.
    saved_epochs = sorted(glob.glob(str(zero_gpu_ckpt_base_dir) + '/epoch_*.state'),
                          key=os.path.getmtime)
    # assert len(saved_epochs) < 2, 'Should be only one saved epoch -- the last one'
    if len(saved_epochs) > 0:
        cfg.resume = saved_epochs[-1]
        resume(cfg, model, optimizer)

    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], cfg, prefix="Overall Timing"
    )

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0
    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    cfg.logger.info('Start Training: Model conv 1 initialization {}'.format(
        torch.sum(model.module.backbone.conv1.weight)))

    # log the shape of every weight-carrying module
    for n, m in model.module.named_modules():
        if hasattr(m, "weight") and m.weight is not None:
            cfg.logger.info('{} ({}): {}'.format(n, type(m).__name__, m.weight.shape))

    criterion = base_criterion
    cfg.logger.info('Using Vanilla Criterion')

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)

        lr_policy(epoch, iteration=None)
        cur_lr = net_utils.get_lr(optimizer)

        start_train = time.time()
        train(dataset.trn_loader, model, criterion, optimizer, epoch, cfg, writer=writer)
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on validation set
                start_validation = time.time()
                acc = validate_knn(dataset.trn_loader, dataset.val_loader,
                                   model.module, cfg, writer, epoch)
                validation_time.update((time.time() - start_validation) / 60)

                csv_utils.write_generic_result_to_csv(
                    path=cfg.exp_dir,
                    name=os.path.basename(cfg.exp_dir[:-1]),
                    epoch=epoch,
                    knn_acc=acc,
                )

                save = (((epoch + 1) % cfg.save_every) == 0) and cfg.save_every > 0
                if save or epoch == cfg.epochs - 1:
                    # if is_best:
                    #     print(f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}")
                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "ACC": acc,
                            "optimizer": optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=ckpt_base_dir / f"epoch_{epoch:04d}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

            elapsed_time = time.time() - start_time
            seconds_todo = (cfg.epochs - epoch) * (elapsed_time / cfg.test_interval)
            estimated_time_complete = timedelta(seconds=int(seconds_todo))
            start_time = time.time()
            cfg.logger.info(
                f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}")

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics", global_step=epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

        if cfg.world_size > 1:
            # cfg.logger.info('GPU {} going into the barrier'.format(cfg.gpu))
            dist.barrier()
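# `simsiam.SimSaimLoss` above is imported from elsewhere in the repo. In the
# SimSiam paper, D(p, z) is the negative cosine similarity with a
# stop-gradient on z; a minimal sketch of that standard form (an assumption
# about how this codebase implements it):
import torch.nn.functional as F

def simsiam_loss_sketch(p, z):
    """D(p, z) = -cos(p, stopgrad(z)), averaged over the batch."""
    z = z.detach()  # stop-gradient: z is treated as a constant target
    return -F.cosine_similarity(p, z, dim=-1).mean()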
def train(model, device, train_loader, sm_loader, criterion, optimizer, epoch, args, writer):
    num_class = 10
    # sa[i] lists all class indices except i; it is used to scatter the
    # per-class margin lower bounds back into a (batch, num_class) tensor.
    sa = np.zeros((num_class, num_class - 1), dtype=np.int32)
    for i in range(sa.shape[0]):
        for j in range(sa.shape[1]):
            if j < i:
                sa[i][j] = j
            else:
                sa[i][j] = j + 1
    sa = torch.LongTensor(sa)

    batch_size = args.batch_size * 2
    schedule_start = 0  # unused; args.schedule_start is used below
    num_steps_per_epoch = len(train_loader)
    eps_scheduler = EpsilonScheduler(
        "linear",
        args.schedule_start,
        ((args.schedule_start + args.schedule_length) - 1) * num_steps_per_epoch,
        args.starting_epsilon,
        args.epsilon,
        num_steps_per_epoch,
    )
    end_eps = eps_scheduler.get_eps(epoch + 1, 0)
    start_eps = eps_scheduler.get_eps(epoch, 0)

    print(
        " ->->->->->->->->->-> One epoch with CROWN-IBP ({:.6f}-{:.6f})"
        " <-<-<-<-<-<-<-<-<-<-".format(start_eps, end_eps)
    )

    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ibp_losses = AverageMeter("IBP_Loss", ":.4f")
    top1 = AverageMeter("Acc_1", ":6.2f")
    ibp_acc1 = AverageMeter("IBP1", ":6.2f")
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, ibp_losses, top1, ibp_acc1],
        prefix="Epoch: [{}]".format(epoch),
    )

    model = BoundSequential.convert(
        model, {'same-slope': False, 'zero-lb': False, 'one-lb': False}
    ).to(device)
    model.train()
    end = time.time()

    dataloader = train_loader if sm_loader is None else zip(train_loader, sm_loader)

    for i, data in enumerate(dataloader):
        if sm_loader:
            images, target = (
                torch.cat([d[0] for d in data], 0).to(device),
                torch.cat([d[1] for d in data], 0).to(device),
            )
        else:
            images, target = data[0].to(device), data[1].to(device)

        # basic properties of training data
        if i == 0:
            print(
                images.shape,
                target.shape,
                f"Batch_size from args: {args.batch_size}",
                "lr: {:.5f}".format(optimizer.param_groups[0]["lr"]),
            )
            print(f"Training images range: {[torch.min(images), torch.max(images)]}")

        output = model(images, method_opt="forward")
        ce = nn.CrossEntropyLoss()(output, target)

        eps = eps_scheduler.get_eps(epoch, i)

        # generate specifications: margins between the true class and every other class
        c = torch.eye(num_class).type_as(images)[target].unsqueeze(1) - \
            torch.eye(num_class).type_as(images).unsqueeze(0)
        # remove specifications to self
        I = (~(target.unsqueeze(1) ==
               torch.arange(num_class).to(device).type_as(target).unsqueeze(0)))
        c = (c[I].view(images.size(0), num_class - 1, num_class)).to(device)
        # scatter matrix to avoid computing the margin to self
        sa_labels = sa[target].to(device)
        # storage for the computed lower bounds after scatter
        lb_s = torch.zeros(images.size(0), num_class).to(device)
        ub_s = torch.zeros(images.size(0), num_class).to(device)

        data_ub = torch.min(images + eps, images.max()).to(device)
        data_lb = torch.max(images - eps, images.min()).to(device)

        ub, ilb, relu_activity, unstable, dead, alive = model(
            norm=np.inf, x_U=data_ub, x_L=data_lb, eps=eps, C=c,
            method_opt="interval_range")

        crown_final_beta = 0.
        beta = (args.epsilon - eps * (1.0 - crown_final_beta)) / args.epsilon
        if beta < 1e-5:
            # pure IBP bound
            lb = ilb
        else:
            # CROWN-IBP: get the CROWN bound using interval bounds
            _, _, clb, bias = model(
                norm=np.inf, x_U=data_ub, x_L=data_lb, eps=eps, C=c,
                method_opt="backward_range")
            # how much better is the CROWN-IBP bound than the pure IBP one?
            # diff = (clb - ilb).sum().item()
            lb = clb * beta + ilb * (1 - beta)

        lb = lb_s.scatter(1, sa_labels, lb)
        robust_ce = criterion(-lb, target)
        # print(ce, robust_ce)
        racc = accuracy(-lb, target, topk=(1,))
        loss = robust_ce

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1[0].item(), images.size(0))
        losses.update(ce.item(), images.size(0))
        ibp_losses.update(robust_ce.item(), images.size(0))
        ibp_acc1.update(racc[0].item(), images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
            progress.write_to_tensorboard(writer, "train", epoch * len(train_loader) + i)

        # write a sample of training images to tensorboard (helpful for debugging)
        if i == 0:
            writer.add_image(
                "training-images",
                torchvision.utils.make_grid(images[0 : len(images) // 4]),
            )
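# `EpsilonScheduler` comes from the CROWN-IBP reference implementation. For
# the "linear" mode used above it ramps the perturbation budget from
# starting_epsilon up to the target epsilon between a start and end step. A
# simplified sketch of that behavior, assuming start/end are global step
# indices (the real class also supports a smoothed schedule):
class LinearEpsilonSchedulerSketch:
    def __init__(self, start_step, end_step, eps_start, eps_end, num_steps_per_epoch):
        self.start_step = start_step
        self.end_step = end_step
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.num_steps_per_epoch = num_steps_per_epoch

    def get_eps(self, epoch, step):
        t = epoch * self.num_steps_per_epoch + step
        if t <= self.start_step:
            return self.eps_start  # warm-up: smallest budget
        if t >= self.end_step:
            return self.eps_end    # schedule finished: full budget
        # linear interpolation between eps_start and eps_end
        frac = (t - self.start_step) / (self.end_step - self.start_step)
        return self.eps_start + frac * (self.eps_end - self.eps_start)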
def trn(cfg, model):
    cfg.logger.info(cfg)
    if cfg.seed is not None:
        random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        torch.cuda.manual_seed(cfg.seed)
        torch.cuda.manual_seed_all(cfg.seed)

    train, validate = get_trainer(cfg)

    if cfg.gpu is not None:
        cfg.logger.info("Use GPU: {} for training".format(cfg.gpu))

    linear_classifier_layer = model.module[1]
    optimizer = get_optimizer(cfg, linear_classifier_layer)

    cfg.logger.info(f"=> Getting {cfg.set} dataset")
    dataset = getattr(data, cfg.set)(cfg)
    lr_policy = get_policy(cfg.lr_policy)(optimizer, cfg)

    softmax_criterion = nn.CrossEntropyLoss().cuda()
    criterion = lambda output, target: softmax_criterion(output, target)

    # optionally resume from a checkpoint
    best_val_acc1 = 0.0
    best_val_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if cfg.resume:
        best_val_acc1 = resume(cfg, model, optimizer)

    run_base_dir, ckpt_base_dir, log_base_dir = path_utils.get_directories(cfg, cfg.gpu)
    cfg.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], cfg, prefix="Overall Timing"
    )

    end_epoch = time.time()
    cfg.start_epoch = cfg.start_epoch or 0
    last_val_acc1 = None

    start_time = time.time()
    gpu_info = gpu_utils.GPU_Utils(gpu_index=cfg.gpu)

    # Start training
    for epoch in range(cfg.start_epoch, cfg.epochs):
        ## make sure backbone is not updated
        cfg.logger.info('Model conv 1 {} at epoch {}'.format(
            torch.sum(model.module[0].conv1.weight), epoch))

        if cfg.world_size > 1:
            dataset.sampler.set_epoch(epoch)
        lr_policy(epoch, iteration=None)
        cur_lr = net_utils.get_lr(optimizer)

        start_train = time.time()
        train_acc1, train_acc5 = train(
            dataset.trn_loader, model, criterion, optimizer, epoch, cfg, writer=writer
        )
        train_time.update((time.time() - start_train) / 60)

        if (epoch + 1) % cfg.test_interval == 0:
            if cfg.gpu == cfg.base_gpu:
                # evaluate on validation set
                start_validation = time.time()
                last_val_acc1, last_val_acc5 = validate(
                    dataset.val_loader, model.module, criterion, cfg, writer, epoch
                )
                validation_time.update((time.time() - start_validation) / 60)

                # remember best acc@1 and save checkpoint
                is_best = last_val_acc1 > best_val_acc1
                best_val_acc1 = max(last_val_acc1, best_val_acc1)
                best_val_acc5 = max(last_val_acc5, best_val_acc5)
                best_train_acc1 = max(train_acc1, best_train_acc1)
                best_train_acc5 = max(train_acc5, best_train_acc5)

                save = (((epoch + 1) % cfg.save_every) == 0) and cfg.save_every > 0
                if save or epoch == cfg.epochs - 1:
                    if is_best:
                        cfg.logger.info(
                            f"==> best {last_val_acc1:.02f} saving at {ckpt_base_dir / 'model_best.pth'}"
                        )
                    net_utils.save_checkpoint(
                        {
                            "epoch": epoch + 1,
                            "arch": cfg.arch,
                            "state_dict": model.state_dict(),
                            "best_acc1": best_val_acc1,
                            "best_acc5": best_val_acc5,
                            "best_train_acc1": best_train_acc1,
                            "best_train_acc5": best_train_acc5,
                            "optimizer": optimizer.state_dict(),
                            "curr_acc1": last_val_acc1,
                            "curr_acc5": last_val_acc5,
                        },
                        is_best,
                        filename=ckpt_base_dir / f"epoch_{epoch}.state",
                        save=save or epoch == cfg.epochs - 1,
                    )

            elapsed_time = time.time() - start_time
            seconds_todo = (cfg.epochs - epoch) * (elapsed_time / cfg.test_interval)
            estimated_time_complete = timedelta(seconds=int(seconds_todo))
            start_time = time.time()
            cfg.logger.info(
                f"==> ETA: {estimated_time_complete}\tGPU-M: {gpu_info.gpu_mem_usage()}\tGPU-U: {gpu_info.gpu_utilization()}"
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics", global_step=epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

        if cfg.world_size > 1:
            dist.barrier()
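# `AverageMeter` is the usual running-average helper from the PyTorch
# ImageNet example; this repo's variant adds format strings and tensorboard
# options on top. A minimal sketch of the core behavior that every
# `update(val, n)` call above relies on:
class AverageMeterSketch:
    """Tracks the current value, running sum, count, and average."""

    def __init__(self, name, fmt=":f"):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `val` is a per-batch average, so weight it by the batch size `n`
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count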
def main_worker(args):
    args.gpu = None
    train, validate, modifier = get_trainer(args)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    model = get_model(args)
    model = set_gpu(args, model)

    if args.pretrained:
        pretrained(args, model)

    # SJT modification:
    if args.exp_mode:  # pretraining/pruning/finetuning
        exp_mode = args.exp_mode
        if exp_mode == "pretraining":
            # YHT modification: setting the pruning rate to 0
            print("Figure out your exp_mode is pretraining, setting prune-rate to 0")
            args.prune_rate = 0
            unfreeze_model_weights(model)
            freeze_model_subnet(model)
    # End of SJT modification

    optimizer = get_optimizer(args, model)
    data = get_dataset(args)
    lr_policy = get_policy(args.lr_policy)(optimizer, args)

    if args.label_smoothing is None:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = LabelSmoothing(smoothing=args.label_smoothing)

    # optionally resume from a checkpoint
    best_acc1 = 0.0
    best_acc5 = 0.0
    best_train_acc1 = 0.0
    best_train_acc5 = 0.0

    if args.resume:
        # SJT modification
        if args.exp_mode:
            if args.exp_mode == "pruning":
                optimizer = resume_pruning(args, model)
            else:  # can only be "finetuning"
                if args.exp_mode != "finetuning":
                    print("resume method should be combined with pruning/finetuning exp_mode together!")
                    return
                else:
                    optimizer = resume_finetuning(args, model)
                    # YHT: not sure whether it is needed
                    # lr_policy = get_policy(args.lr_policy)(optimizer, args)
                    # print("#####################DEBUG PRINT : VALIDATE FIRST#####################")
                    # validate(data.val_loader, model, criterion, args, writer=None, epoch=args.start_epoch)
        else:
            best_acc1 = resume(args, model, optimizer)
        # End of SJT modification
    else:
        # YHT modification
        if args.exp_mode:
            if args.exp_mode == "finetuning":
                # here we assume the user wants to finetune the subnetwork
                # starting from the initial prune-rate vector
                print("Using finetuning mode without resume, which is supposed to be init finetune.")
                optimizer = resume_finetuning(args, model)
                # YHT: not sure whether it is needed
                lr_policy = get_policy(args.lr_policy)(optimizer, args)
        # End of modification

    # Data loading code
    if args.evaluate:
        acc1, acc5 = validate(data.val_loader, model, criterion, args,
                              writer=None, epoch=args.start_epoch)
        return

    # Set up directories
    run_base_dir, ckpt_base_dir, log_base_dir = get_directories(args)
    args.ckpt_base_dir = ckpt_base_dir

    writer = SummaryWriter(log_dir=log_base_dir)
    epoch_time = AverageMeter("epoch_time", ":.4f", write_avg=False)
    validation_time = AverageMeter("validation_time", ":.4f", write_avg=False)
    train_time = AverageMeter("train_time", ":.4f", write_avg=False)
    progress_overall = ProgressMeter(
        1, [epoch_time, validation_time, train_time], prefix="Overall Timing"
    )

    end_epoch = time.time()
    args.start_epoch = args.start_epoch or 0
    acc1 = None

    # Save the initial state
    save_checkpoint(
        {
            "epoch": 0,
            "arch": args.arch,
            "state_dict": model.state_dict(),
            "best_acc1": best_acc1,
            "best_acc5": best_acc5,
            "best_train_acc1": best_train_acc1,
            "best_train_acc5": best_train_acc5,
            "optimizer": optimizer.state_dict(),
            "curr_acc1": acc1 if acc1 else "Not evaluated",
        },
        False,
        filename=ckpt_base_dir / "initial.state",
        save=False,
    )

    if args.gp_warm_up:
        record_prune_rate = args.prune_rate
    if args.print_more:
        print_global_layerwise_prune_rate(model, args.prune_rate)

    # YHT modification, May 20: by this point every prune rate is accurate,
    # so create the mask now if prandom is set.
    if args.prandom:
        make_prandom_mask(model)
    # End of modification

    # Start training
    for epoch in range(args.start_epoch, args.epochs):
        lr_policy(epoch, iteration=None)
        modifier(args, epoch, model)
        cur_lr = get_lr(optimizer)

        if args.print_more:
            print(f"In epoch {epoch}, lr = {cur_lr}")

        # train for one epoch
        start_train = time.time()

        # WHN modification: add global pruning
        if args.pscale == "global":
            if args.gp_warm_up:
                if epoch < args.gp_warm_up_epochs:
                    args.prune_rate = 0
                else:
                    args.prune_rate = record_prune_rate
            if not args.prandom:
                args.score_threshold = get_global_score_threshold(model, args.prune_rate)
            # YHT modification
            if args.print_more:
                print_global_layerwise_prune_rate(model, args.prune_rate)
            # End of modification

        train_acc1, train_acc5 = train(
            data.train_loader, model, criterion, optimizer, epoch, args, writer=writer
        )
        train_time.update((time.time() - start_train) / 60)

        # evaluate on validation set
        start_validation = time.time()
        # if labels are shuffled, evaluate on the training set instead (by yty)
        if args.shuffle:
            acc1, acc5 = train_acc1, train_acc5
        else:
            acc1, acc5 = validate(data.val_loader, model, criterion, args, writer, epoch)
        validation_time.update((time.time() - start_validation) / 60)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        best_acc5 = max(acc5, best_acc5)
        best_train_acc1 = max(train_acc1, best_train_acc1)
        best_train_acc5 = max(train_acc5, best_train_acc5)

        save = ((epoch % args.save_every) == 0) and args.save_every > 0
        if is_best or save or epoch == args.epochs - 1:
            if is_best:
                print(f"==> New best, saving at {ckpt_base_dir / 'model_best.pth'}")

            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "best_acc5": best_acc5,
                    "best_train_acc1": best_train_acc1,
                    "best_train_acc5": best_train_acc5,
                    "optimizer": optimizer.state_dict(),
                    "curr_acc1": acc1,
                    "curr_acc5": acc5,
                },
                is_best,
                filename=ckpt_base_dir / f"epoch_{epoch}.state",
                save=save,
            )

        epoch_time.update((time.time() - end_epoch) / 60)
        progress_overall.display(epoch)
        progress_overall.write_to_tensorboard(writer, prefix="diagnostics", global_step=epoch)

        if args.conv_type == "SampleSubnetConv":
            count = 0
            sum_pr = 0.0
            for n, m in model.named_modules():
                if isinstance(m, SampleSubnetConv):
                    # average prune rate across 10 mask samples
                    pr = 0.0
                    for _ in range(10):
                        pr += ((torch.rand_like(m.clamped_scores) >= m.clamped_scores)
                               .float().mean().item())
                    pr /= 10.0
                    writer.add_scalar("pr/{}".format(n), pr, epoch)
                    sum_pr += pr
                    count += 1
            args.prune_rate = sum_pr / count
            writer.add_scalar("pr/average", args.prune_rate, epoch)

        writer.add_scalar("test/lr", cur_lr, epoch)
        end_epoch = time.time()

    write_result_to_csv(
        best_acc1=best_acc1,
        best_acc5=best_acc5,
        best_train_acc1=best_train_acc1,
        best_train_acc5=best_train_acc5,
        prune_rate=args.prune_rate,
        curr_acc1=acc1,
        curr_acc5=acc5,
        base_config=args.config,
        name=args.name,
    )
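# `LabelSmoothing` is constructed in main_worker but defined elsewhere. A
# minimal sketch of a standard label-smoothing cross-entropy; the class name
# and `smoothing` argument match the call site, while the body is an
# assumption about how this codebase implements it:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothingSketch(nn.Module):
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, output, target):
        n_class = output.size(-1)
        log_probs = F.log_softmax(output, dim=-1)
        # smoothed target: 1 - smoothing on the true class,
        # smoothing / (n_class - 1) spread across the remaining classes
        true_dist = torch.full_like(log_probs, self.smoothing / (n_class - 1))
        true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))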