def train_copy(proc_ind, trainloader, model, criterion, use_cuda, device, e,
               args, testloader, receiveQueue, sendQueue):
    cuda_p2p.operatingGPUs(gpu0, gpu1)
    cuda_p2p.enablePeerAccess()
    print("\nTraining on device " + str(device) + " begins")
    device_name = "cuda:" + str(device)
    model.cuda(device_name)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # switch to train mode
        model.train()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        end = time.time()

        bar = Bar('Processing', max=len(trainloader))
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            if batch_idx == 1:
                print("\nGPU1 waiting for signal")
                e.wait()
                print("\nGPU1 caught signal, sending receipt signal")
                e.clear()
            cuda_p2p.cudaSync()

            # measure data loading time
            data_time.update(time.time() - end)

            if use_cuda:
                # split the batch: first half runs on the remote GPU,
                # second half on this process's local GPU
                inputs_remote = inputs[:len(inputs) // 2, :, :, :].cuda(
                    "cuda:" + str(gpu0))
                inputs_local = inputs[len(inputs) // 2:, :, :, :].cuda(
                    device_name)
                targets = targets.cuda(device_name, non_blocking=True)

            # compute output: run the two half-batches in parallel threads
            cuda_p2p.cudaSync()
            outputs_remote_f, outputs_remote_res = work_warpper(
                lambda: model(inputs_remote))
            outputs_local_f, outputs_local_res = work_warpper(
                lambda: model(inputs_local))
            t1 = Thread(target=outputs_remote_f)
            t2 = Thread(target=outputs_local_f)
            t1.start()
            t2.start()
            t1.join()
            t2.join()
            outputs_remote = outputs_remote_res()
            outputs_local = outputs_local_res()
            # outputs are always gathered on the model's GPU
            outputs = torch.cat((outputs_remote, outputs_local), dim=0)
            loss = criterion(outputs, targets)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.data.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

            # compute gradient and do SGD step
            cuda_p2p.cudaSync()
            optimizer.zero_grad()
            cuda_p2p.cudaSync()

            # All-reduce step 1: receive model 0's gradients and load them
            # into param.grad, so the backward pass below accumulates the
            # local gradients on top of them
            if batch_idx >= 1:
                for idx, param in enumerate(model.parameters()):
                    uuid, grad = receiveQueue.get()
                    grad = grad.cuda("cuda:" + str(gpu0))
                    print("\n--- GPU 1 received uuid", uuid, "grad", grad)
                    param.grad.data = grad.clone()
                # print("After receive, 0->1 queue size is ", receiveQueue.qsize())
            cuda_p2p.cudaSync()
            loss.backward()

            # all-reduce averaging step: halve the summed gradients
            if batch_idx >= 1:
                for param in model.parameters():
                    param.grad.data /= 2
            cuda_p2p.cudaSync()

            # All-reduce step 2: share model 1's averaged gradients
            if batch_idx >= 1:
                # remote_pdb.set_trace()
                for idx, param in enumerate(model.parameters()):
                    sendQueue.put((idx, param.grad.clone().cpu()))
                # print("current 1->0 queue size is ", sendQueue.qsize())
            cuda_p2p.cudaSync()

            optimizer.step()
            cuda_p2p.cudaSync()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress (disabled in this variant)
            # bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            #     batch=batch_idx + 1,
            #     size=len(trainloader),
            #     data=data_time.avg,
            #     bt=batch_time.avg,
            #     total=bar.elapsed_td,
            #     eta=bar.eta_td,
            #     loss=losses.avg,
            #     top1=top1.avg,
            #     top5=top5.avg,
            # )
            # bar.next()
        bar.finish()

        # validation
        test_loss, test_acc = test(testloader, model, criterion,
                                   args.start_epoch, use_cuda, device)
        print("model 1 loss, acc:", test_loss, test_acc)
        save_model(model, 1)

    print("\nTraining on device " + str(device) + " ends")
    sendQueue.close()
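
# `work_warpper` is called in train_copy above but is not defined in this
# section. Below is a minimal sketch of what it is assumed to do, kept under
# its original (misspelled) name so the call sites above still match: wrap a
# zero-argument job into a callable suitable for Thread(target=...) plus a
# getter for the job's return value. This is an assumption inferred from the
# call sites, not the repo's actual definition.
def work_warpper(job):
    result = []

    def work():
        # run the wrapped job on the calling thread and stash its output
        result.append(job())

    def get_result():
        # read the stashed output; call only after the worker thread joins
        return result[0]

    return work, get_result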
def train(trainloader, model, criterion, optimizer, epoch, use_cuda):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    bar = Bar('Processing', max=len(trainloader))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        cuda_p2p.cudaSync()

        # measure data loading time
        data_time.update(time.time() - end)

        if use_cuda:
            # first half of the batches runs on cuda:3, the rest on cuda:2
            if batch_idx < len(trainloader) // 2:
                inputs, targets = inputs.cuda("cuda:3"), targets.cuda(
                    "cuda:3", non_blocking=True)
            else:
                inputs, targets = inputs.cuda("cuda:2"), targets.cuda(
                    "cuda:2", non_blocking=True)

        # compute output
        cuda_p2p.cudaSync()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.data.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        cuda_p2p.cudaSync()
        optimizer.zero_grad()
        # pdb.set_trace()
        cuda_p2p.cudaSync()
        loss.backward()
        cuda_p2p.cudaSync()
        optimizer.step()
        cuda_p2p.cudaSync()
        # print(list(model.classifier.weight)[0][:10])

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            batch=batch_idx + 1,
            size=len(trainloader),
            data=data_time.avg,
            bt=batch_time.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg,
            top5=top5.avg,
        )
        bar.next()
    bar.finish()
    return (losses.avg, top1.avg)
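
# The gradient exchange in train_copy is a hand-rolled two-worker all-reduce:
# each side puts its per-parameter gradients on one queue, pulls the peer's
# gradients off the other, sums, and halves. The self-contained, thread-based
# sketch below (illustrative names, CPU tensors, and a symmetric ordering
# rather than train_copy's staggered receive/backward/send) demonstrates just
# that averaging step; it is not part of the training path and is never
# called.
def _queue_allreduce_demo():
    import queue
    from threading import Thread as _Thread

    import torch

    # q01 carries worker 0's gradients to worker 1; q10 the reverse
    q01, q10 = queue.Queue(), queue.Queue()

    def worker(grads, send_q, recv_q):
        # share local gradients with the peer
        for idx, g in enumerate(grads):
            send_q.put((idx, g.clone()))
        # receive the peer's gradients and average elementwise
        for _ in range(len(grads)):
            idx, peer = recv_q.get()
            grads[idx] = (grads[idx] + peer) / 2

    g0, g1 = [torch.ones(2)], [torch.zeros(2)]
    t0 = _Thread(target=worker, args=(g0, q01, q10))
    t1 = _Thread(target=worker, args=(g1, q10, q01))
    t0.start()
    t1.start()
    t0.join()
    t1.join()
    # both sides now hold the mean gradient, tensor([0.5, 0.5])
    assert torch.equal(g0[0], torch.tensor([0.5, 0.5]))
    assert torch.equal(g1[0], torch.tensor([0.5, 0.5]))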