def train_copy(proc_ind, trainloader, model, criterion, use_cuda, device, e,
               args, testloader, receiveQueue, sendQueue):
    # gpu0/gpu1 are module-level globals naming the peer and local GPUs.
    cuda_p2p.operatingGPUs(gpu0, gpu1)
    cuda_p2p.enablePeerAccess()
    print("\nTraining on device " + str(device) + " begins")
    device_name = "cuda:" + str(device)
    model.cuda(device_name)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # switch to train mode
        model.train()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        end = time.time()

        bar = Bar('Processing', max=len(trainloader))
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            if batch_idx == 1:
                # Rendezvous with the peer trainer before the gradient
                # exchange starts.
                print("\nGPU1 waiting for signal")
                e.wait()
                print("\nGPU1 caught signal, acknowledging receipt")
                e.clear()
                cuda_p2p.cudaSync()

            # measure data loading time
            data_time.update(time.time() - end)

            if use_cuda:
                # Split the batch in half: the first half goes to the peer
                # GPU (reachable through P2P), the second half stays local.
                inputs_remote = inputs[:len(inputs) // 2, :, :, :].cuda(
                    "cuda:" + str(gpu0))
                inputs_local = inputs[len(inputs) // 2:, :, :, :].cuda(
                    device_name)
                targets = targets.cuda(device_name, non_blocking=True)

            # compute output: run the two half-batch forward passes on
            # separate threads so they overlap.
            cuda_p2p.cudaSync()
            outputs_remote_f, outputs_remote_res = work_warpper(
                lambda: model(inputs_remote))
            outputs_local_f, outputs_local_res = work_warpper(
                lambda: model(inputs_local))
            t1 = Thread(target=outputs_remote_f)
            t2 = Thread(target=outputs_local_f)
            t1.start()
            t2.start()
            t1.join()
            t2.join()
            outputs_remote = outputs_remote_res()
            outputs_local = outputs_local_res()
            # outputs always end up on the model's GPU
            outputs = torch.cat((outputs_remote, outputs_local), dim=0)
            loss = criterion(outputs, targets)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.data.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

            # compute gradient and do SGD step
            cuda_p2p.cudaSync()
            optimizer.zero_grad()
            cuda_p2p.cudaSync()

            # All-reduce step 1: receive model 0's gradients and preload them
            # into .grad, so that backward() below accumulates the local
            # gradients on top of them.
            if batch_idx >= 1:
                for idx, param in enumerate(model.parameters()):
                    uuid, grad = receiveQueue.get()
                    # The cross-device placement here relies on the peer
                    # access enabled above.
                    grad = grad.cuda("cuda:" + str(gpu0))
                    print("\n--- GPU 1 received uuid", uuid, "grad", grad)
                    param.grad.data = grad.clone()
                # print("After receive, 0->1 queue size is ", receiveQueue.qsize())
            cuda_p2p.cudaSync()

            loss.backward()

            # All-reduce step 2: halve the summed gradients to average them.
            if batch_idx >= 1:
                for param in model.parameters():
                    param.grad.data /= 2
            cuda_p2p.cudaSync()

            # All-reduce step 3: share model 1's averaged gradients with the peer.
            if batch_idx >= 1:
                # remote_pdb.set_trace()
                for idx, param in enumerate(model.parameters()):
                    sendQueue.put((idx, param.grad.clone().cpu()))
                # print("current 1->0 queue size is ", sendQueue.qsize())
            cuda_p2p.cudaSync()

            optimizer.step()
            cuda_p2p.cudaSync()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress
            # bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            #     batch=batch_idx + 1,
            #     size=len(trainloader),
            #     data=data_time.avg,
            #     bt=batch_time.avg,
            #     total=bar.elapsed_td,
            #     eta=bar.eta_td,
            #     loss=losses.avg,
            #     top1=top1.avg,
            #     top5=top5.avg,
            # )
            # bar.next()
        bar.finish()

        # validation
        test_loss, test_acc = test(testloader, model, criterion,
                                   args.start_epoch, use_cuda, device)
        print("model 1 loss, acc:", test_loss, test_acc)
        save_model(model, 1)

    print("\nTraining on device " + str(device) + " ends")
    sendQueue.close()  # release the gradient queue this process writes to
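
# `work_warpper` is not defined in this section; from its call sites in
# train_copy it takes a thunk and returns a (thread target, result getter)
# pair. A minimal sketch consistent with that usage (an assumption, not the
# repo's actual implementation) would be:
#
#     def work_warpper(fn):
#         result = []
#         def work():
#             result.append(fn())          # run the wrapped forward pass
#         return work, lambda: result[0]   # (thread target, result getter)
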
def main():
    global best_acc
    cuda_p2p.operatingGPUs(2, 3)
    cuda_p2p.enablePeerAccess()
    # a = torch.tensor(3).int().cuda('cuda:0')
    # b = torch.tensor(4).int().cuda('cuda:1')
    # cuda_p2p.add_test(a, b)
    # print(a)
    # print(b)

    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root='./data', train=True, download=True,
                          transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.train_batch,
                                  shuffle=True, num_workers=args.workers)
    testset = dataloader(root='./data', train=False, download=False,
                         transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.test_batch,
                                 shuffle=False, num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format(args.arch))
    if args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            cardinality=args.cardinality,
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.startswith('densenet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            growthRate=args.growthRate,
            compressionRate=args.compressionRate,
            dropRate=args.drop,
        )
    elif args.arch.startswith('wrn'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.endswith('resnet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            block_name=args.block_name,
        )
    else:
        model = models.__dict__[args.arch](num_classes=num_classes)

    # model = torch.nn.DataParallel(model).cuda()
    model = model.cuda("cuda:2")  # matches cuda_p2p.operatingGPUs(2, 3) above
    print("Initialized linear weight: ", list(model.classifier.weight)[0][:10])
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Resume
    title = 'cifar-10-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title, resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss',
            'Train Acc.', 'Valid Acc.',
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion,
                                   start_epoch, use_cuda)
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.epochs, state['lr']))

        train_loss, train_acc = train(trainloader, model, criterion,
                                      optimizer, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   use_cuda)

        # append logger file
        # logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        print('Best acc:')
        print(best_acc)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model.state_dict(),
        #     'acc': test_acc,
        #     'best_acc': best_acc,
        #     'optimizer': optimizer.state_dict(),
        # }, is_best, checkpoint=args.checkpoint)

    # e.set()
    # e2.set()
    # th.join()
    # th2.join()

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))
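
# The process wiring for train_copy lives outside this section. A hedged
# sketch of the assumed setup, based on train_copy's signature and the
# commented-out e.set()/th.join() calls above (all of these names are
# hypothetical):
#
#     import torch.multiprocessing
#     ctx = torch.multiprocessing.get_context('spawn')
#     e = ctx.Event()                      # batch-1 rendezvous signal
#     q01, q10 = ctx.Queue(), ctx.Queue()  # gradient queues, 0 -> 1 and 1 -> 0
#     th = ctx.Process(target=train_copy,
#                      args=(1, trainloader, model, criterion, use_cuda,
#                            3, e, args, testloader, q01, q10))
#     th.start()
#     ...                                  # run the GPU-2 trainer in this process
#     th.join()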