Example #1
0
def train_copy(proc_ind, trainloader, model, criterion, use_cuda, device, e,
               args, testloader, receiveQueue, sendQueue):
    """Train ``model`` on GPU ``device`` while swapping gradients with a peer.

    For every batch the input is split in half along dim 0; the first half is
    copied to ``cuda:<gpu0>`` and the second half to ``cuda:<device>``. Both
    halves are pushed through ``model`` on two threads, the outputs are
    concatenated and the loss is computed against the full target batch.
    From batch index 1 onwards, one ``(uuid, grad)`` item per model parameter
    is read from ``receiveQueue`` and written into ``param.grad`` before
    ``loss.backward()``, the resulting gradients are halved, and one
    ``(idx, grad_on_cpu)`` item per parameter is put on ``sendQueue`` before
    the optimizer step. After each epoch ``test`` is run and its result
    printed; after the last epoch ``save_model(model, 1)`` is called.

    Args:
        proc_ind: Not referenced anywhere in the body.
        trainloader: Sized iterable of ``(inputs, targets)`` batches; inputs
            are indexed with four axes.
        model: Module that is moved to ``cuda:<device>`` and trained in place.
        criterion: Callable applied as ``criterion(outputs, targets)``.
        use_cuda: Gates the batch split / device transfer. The rest of the
            function is not gated on it (``model.cuda`` is unconditional).
        device: GPU index; formatted into ``"cuda:<device>"``.
        e: Object exposing ``wait()`` and ``clear()``; waited on at batch
            index 1 of every epoch.
        args: Namespace read for ``lr``, ``momentum``, ``weight_decay``,
            ``start_epoch`` and ``epochs``.
        testloader: Passed through to ``test``.
        receiveQueue: Queue whose ``get()`` yields ``(uuid, grad)`` tuples.
        sendQueue: Queue that receives ``(idx, grad)`` tuples via ``put()``.
    """
    # NOTE(review): gpu0 / gpu1 are not bound in this function; presumably
    # module-level globals -- confirm against the rest of the file.
    cuda_p2p.operatingGPUs(gpu0, gpu1)
    cuda_p2p.enablePeerAccess()
    print("\nTraining on device " + str(device) + " begins")
    device_name = "cuda:" + str(device)
    model.cuda(device_name)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # switch to train mode
        model.train()
        # Five fresh meters plus the wall-clock start used for timing deltas.
        batch_time, data_time, losses, top1, top5, end = AverageMeter(
        ), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(
        ), time.time()
        # The bar is created and finished, but never advanced: the
        # bar.next() call further down is commented out.
        bar = Bar('Processing', max=len(trainloader))
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            # Handshake on the second batch of every epoch: block until the
            # event is set, then reset it.
            if batch_idx == 1:
                print("\nGPU1 waiting for signal")
                e.wait()
                print("\nGPU1 caught signal, sending receival signal")
                e.clear()

            cuda_p2p.cudaSync()
            # measure data loading time
            data_time.update(time.time() - end)
            # First half of the batch goes to cuda:<gpu0>, second half and the
            # full targets go to this worker's device.
            # NOTE(review): when use_cuda is false, inputs_remote and
            # inputs_local are never bound and the lambdas below would raise
            # NameError.
            if use_cuda:
                inputs_remote = inputs[:len(inputs) //
                                       2, :, :, :].cuda("cuda:" + str(gpu0))
                inputs_local = inputs[len(inputs) //
                                      2:, :, :, :].cuda(device_name)
                targets = targets.cuda(device_name, non_blocking=True)
            # compute output
            # print("Running train_copy")
            cuda_p2p.cudaSync()

            # work_warpper is defined elsewhere; from its use here it returns
            # a (callable to run, callable returning the result) pair --
            # TODO confirm against its definition.
            outputs_remote_f, outputs_remote_res = work_warpper(
                lambda: model(inputs_remote))
            outputs_local_f, outputs_local_res = work_warpper(
                lambda: model(inputs_local))
            # Run both forward passes on separate threads and wait for both.
            t1, t2 = Thread(target=outputs_remote_f), Thread(
                target=outputs_local_f)
            t1.start()
            t2.start()
            t1.join()
            t2.join()
            outputs_remote = outputs_remote_res()
            outputs_local = outputs_local_res()

            # Re-assemble the batch in its original order (remote half first).
            outputs = torch.cat(
                (outputs_remote, outputs_local),
                dim=0)  # original author's note: outputs are always saved on the model GPU
            loss = criterion(outputs, targets)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.data.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))
            # compute gradient and do SGD step
            cuda_p2p.cudaSync()
            optimizer.zero_grad()
            cuda_p2p.cudaSync()

            # All Reduce 1, receiving model 0 grad
            # One queue item is consumed per parameter, in parameter order.
            # NOTE(review): `param.grad.data = ...` requires param.grad to be
            # a tensor after zero_grad(); it fails if zero_grad() resets
            # gradients to None -- confirm against the PyTorch version in use.
            # NOTE(review): the received gradient is moved to cuda:<gpu0>
            # while the model was moved to cuda:<device> -- confirm these are
            # meant to differ.
            if batch_idx >= 1:
                # print("GPU1 getting")
                for idx, param in enumerate(list(model.parameters())):
                    uuid, grad = receiveQueue.get()
                    grad = grad.cuda("cuda:" + str(gpu0))
                    # Prints the full gradient tensor for every parameter.
                    print("\n--- GPU 1 received uuid", uuid, "grad", grad)

                    param.grad.data = grad.clone()
                # print("After receive, 0->1 queue size is ", receiveQueue.qsize())
                # print("GPU1 Received")

            cuda_p2p.cudaSync()
            # backward() runs after the received gradients were written into
            # param.grad above.
            loss.backward()
            # all reduce /2 part
            if batch_idx >= 1:
                for param in model.parameters():
                    param.grad.data /= 2
            cuda_p2p.cudaSync()

            # All Reduce 2, sharing model 1 grad
            # Each gradient is cloned and copied to CPU before being queued,
            # tagged with its parameter index.
            if batch_idx >= 1:
                # print("GPU1 putting")
                # remote_pdb.set_trace()
                for idx, param in enumerate(list(model.parameters())):
                    # print("\n GPU 1 put in uuid", idx, "grad", param.grad)
                    sendQueue.put((idx, param.grad.clone().cpu()))
                # print("current 1->0 queue size is ", sendQueue.qsize())
                # print("GPU1 all reduce Share")

            cuda_p2p.cudaSync()
            optimizer.step()
            cuda_p2p.cudaSync()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            #plot progress
            # bar.suffix  = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            #             batch=batch_idx + 1,
            #             size=len(trainloader),
            #             data=data_time.avg,
            #             bt=batch_time.avg,
            #             total=bar.elapsed_td,
            #             eta=bar.eta_td,
            #             loss=losses.avg,
            #             top1=top1.avg,
            #             top5=top5.avg,
            #             )
            # bar.next()
        bar.finish()

        # validation
        # NOTE(review): args.start_epoch is passed here rather than the loop
        # variable `epoch` -- confirm whether test() uses this argument.
        test_loss, test_acc = test(testloader, model, criterion,
                                   args.start_epoch, use_cuda, device)
        print("model 1 loss, acc:", test_loss, test_acc)

    save_model(model, 1)
    print("\nTraining on device " + str(device) + " ends")
    # NOTE(review): `queue` is neither a parameter nor a local of this
    # function; unless a module-level `queue` object with close() exists this
    # raises. Possibly sendQueue / receiveQueue was intended -- confirm.
    queue.close()
Example #2
0
def main():
    """Prepare CIFAR data, build the model on GPU 2 and run training or eval.

    Reads its configuration from the module-level ``args`` namespace and
    updates the module-level ``best_acc``. Steps, in order:

    1. Select GPUs 2 and 3 through ``cuda_p2p`` and enable peer access.
    2. Create ``args.checkpoint`` if it is not an existing directory.
    3. Build CIFAR-10 (``args.dataset == 'cifar10'``) or CIFAR-100 train and
       test loaders.
    4. Instantiate ``models.__dict__[args.arch]`` with architecture-specific
       keyword arguments and move it to ``cuda:2``.
    5. Optionally restore model/optimizer state from ``args.resume``.
    6. If ``args.evaluate`` is set, run ``test`` once, print the result,
       close the logger and return.
    7. Otherwise run ``train`` and ``test`` for every epoch in
       ``[start_epoch, args.epochs)``, tracking the best test accuracy, then
       close and plot the logger and save the figure to ``log.eps``.

    Raises:
        AssertionError: If ``args.resume`` is set but is not an existing file.
    """
    global best_acc

    # The GPU pair registered for peer access; the model lives on the first.
    model_gpu, peer_gpu = 2, 3
    cuda_p2p.operatingGPUs(model_gpu, peer_gpu)
    cuda_p2p.enablePeerAccess()

    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))
    # Augmentation (random crop + flip) is applied to the training set only.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    # Anything other than 'cifar10' falls back to CIFAR-100.
    if args.dataset == 'cifar10':
        dataset_cls = datasets.CIFAR10
        num_classes = 10
    else:
        dataset_cls = datasets.CIFAR100
        num_classes = 100

    trainset = dataset_cls(root='./data',
                           train=True,
                           download=True,
                           transform=transform_train)
    trainloader = data.DataLoader(trainset,
                                  batch_size=args.train_batch,
                                  shuffle=True,
                                  num_workers=args.workers)

    testset = dataset_cls(root='./data',
                          train=False,
                          download=False,
                          transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=args.test_batch,
                                 shuffle=False,
                                 num_workers=args.workers)

    # Model
    arch = args.arch
    print("==> creating model '{}'".format(arch))
    # Every architecture takes num_classes; the extras depend on the family.
    model_kwargs = {'num_classes': num_classes}
    if arch.startswith('resnext'):
        model_kwargs.update(
            cardinality=args.cardinality,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif arch.startswith('densenet'):
        model_kwargs.update(
            depth=args.depth,
            growthRate=args.growthRate,
            compressionRate=args.compressionRate,
            dropRate=args.drop,
        )
    elif arch.startswith('wrn'):
        model_kwargs.update(
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif arch.endswith('resnet'):
        model_kwargs.update(
            depth=args.depth,
            block_name=args.block_name,
        )
    model = models.__dict__[arch](**model_kwargs)

    model = model.cuda("cuda:%d" % model_gpu)

    # Debug print of the first classifier row. Guarded because nothing here
    # guarantees the selected architecture exposes `classifier.weight`.
    classifier_weight = getattr(getattr(model, 'classifier', None), 'weight',
                                None)
    if classifier_weight is not None:
        print("Initialized linear weight: ",
              list(classifier_weight)[0][:10])
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    # Resume
    title = 'cifar-10-' + arch
    log_path_name = 'log.txt'
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        # Logs continue in the directory the checkpoint was loaded from.
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, log_path_name),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, log_path_name),
                        title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
            'Valid Acc.'
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch,
                                   use_cuda)
        print(' Test Loss:  %.8f, Test Acc:  %.2f' % (test_loss, test_acc))
        # Close the logger on this early exit as well, so it is released on
        # every path out of the function.
        logger.close()
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.epochs, state['lr']))
        train_loss, train_acc = train(trainloader, model, criterion, optimizer,
                                      epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   use_cuda)
        # NOTE: per-epoch logger.append(...) and save_checkpoint(...) are
        # disabled in this version, so nothing is written to the log or to
        # disk per epoch; `is_best` is kept for when checkpointing returns.
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        print('Best acc:')
        print(best_acc)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))