Code example #1
File: cifar_mem.py Project: lxjhk/wavelet
def train_copy(proc_ind, trainloader, model, criterion, use_cuda, device, e,
               args, testloader, receiveQueue, sendQueue):
    cuda_p2p.operatingGPUs(gpu0, gpu1)  # gpu0 / gpu1 are GPU indices defined elsewhere in this file
    cuda_p2p.enablePeerAccess()  # enable direct peer-to-peer access between the pair
    print("\nTraining on device " + str(device) + " begins")
    device_name = "cuda:" + str(device)
    model.cuda(device_name)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # switch to train mode
        model.train()
        batch_time, data_time = AverageMeter(), AverageMeter()
        losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
        end = time.time()
        bar = Bar('Processing', max=len(trainloader))
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            if batch_idx == 1:
                # block until the peer process signals readiness, then clear the event
                print("\nGPU1 waiting for signal")
                e.wait()
                print("\nGPU1 caught signal, sending receipt signal")
                e.clear()

            cuda_p2p.cudaSync()
            # measure data loading time
            data_time.update(time.time() - end)
            if use_cuda:
                # split the batch: the first half goes to the peer GPU (gpu0), the second half stays local
                half = len(inputs) // 2
                inputs_remote = inputs[:half, :, :, :].cuda("cuda:" + str(gpu0))
                inputs_local = inputs[half:, :, :, :].cuda(device_name)
                targets = targets.cuda(device_name, non_blocking=True)
            # compute output
            # print("Running train_copy")
            cuda_p2p.cudaSync()

            # run the two half-batches through the model concurrently, one worker thread each
            outputs_remote_f, outputs_remote_res = work_warpper(
                lambda: model(inputs_remote))
            outputs_local_f, outputs_local_res = work_warpper(
                lambda: model(inputs_local))
            t1 = Thread(target=outputs_remote_f)
            t2 = Thread(target=outputs_local_f)
            t1.start()
            t2.start()
            t1.join()
            t2.join()
            outputs_remote = outputs_remote_res()
            outputs_local = outputs_local_res()

            # the concatenated outputs always live on the model's GPU
            outputs = torch.cat((outputs_remote, outputs_local), dim=0)
            loss = criterion(outputs, targets)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.data.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))
            # compute gradient and do SGD step
            cuda_p2p.cudaSync()
            optimizer.zero_grad()
            cuda_p2p.cudaSync()
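            # Gradient exchange implied by the code below: from the second batch
            # onward, pull the peer process's gradients from receiveQueue, let
            # backward() accumulate the local gradients on top, halve the sum to
            # average, then push the averaged gradients back through sendQueue.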

            # All Reduce 1, receiving model 0 grad
            if batch_idx >= 1:
                # print("GPU1 getting")
                for idx, param in enumerate(list(model.parameters())):
                    uuid, grad = receiveQueue.get()
                    grad = grad.cuda("cuda:" + str(gpu0))
                    print("\n--- GPU 1 received uuid", uuid, "grad", grad)

                    param.grad.data = grad.clone()
                # print("After receive, 0->1 queue size is ", receiveQueue.qsize())
                # print("GPU1 Received")

            cuda_p2p.cudaSync()
            loss.backward()
            # all reduce /2 part
            if batch_idx >= 1:
                for param in model.parameters():
                    param.grad.data /= 2
            cuda_p2p.cudaSync()

            # All Reduce 2, sharing model 1 grad
            if batch_idx >= 1:
                # print("GPU1 putting")
                # remote_pdb.set_trace()
                for idx, param in enumerate(list(model.parameters())):
                    # print("\n GPU 1 put in uuid", idx, "grad", param.grad)
                    sendQueue.put((idx, param.grad.clone().cpu()))
                # print("current 1->0 queue size is ", sendQueue.qsize())
                # print("GPU1 all reduce Share")

            cuda_p2p.cudaSync()
            optimizer.step()
            cuda_p2p.cudaSync()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            #plot progress
            # bar.suffix  = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            #             batch=batch_idx + 1,
            #             size=len(trainloader),
            #             data=data_time.avg,
            #             bt=batch_time.avg,
            #             total=bar.elapsed_td,
            #             eta=bar.eta_td,
            #             loss=losses.avg,
            #             top1=top1.avg,
            #             top5=top5.avg,
            #             )
            # bar.next()
        bar.finish()

        # validation
        test_loss, test_acc = test(testloader, model, criterion,
                                   args.start_epoch, use_cuda, device)
        print("model 1 loss, acc:", test_loss, test_acc)

    save_model(model, 1)
    print("\nTraining on device " + str(device) + " ends")
    # close the inter-process queues used for the gradient exchange
    sendQueue.close()
    receiveQueue.close()
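
The listing above calls work_warpper without showing its definition. From the way it is used, the contract appears to be: take a thunk and return a zero-argument worker plus a getter for the worker's result, so that two forward passes can run on separate threads and be collected after join(). Below is a minimal sketch of such a wrapper under that assumption; the name work_wrapper_sketch and its internals are illustrative, not the project's actual implementation.

from threading import Thread

def work_wrapper_sketch(fn):
    # Hypothetical stand-in for work_warpper: run fn in a worker and expose
    # its return value through a separate getter, as the training loop expects.
    box = {}

    def worker():
        box["out"] = fn()

    def result():
        return box["out"]

    return worker, result

# Usage mirroring the training loop: launch both workers, join, then collect.
f1, r1 = work_wrapper_sketch(lambda: "remote forward")
f2, r2 = work_wrapper_sketch(lambda: "local forward")
t1, t2 = Thread(target=f1), Thread(target=f2)
t1.start(); t2.start()
t1.join(); t2.join()
print(r1(), r2())
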
Code example #2
File: cifar_twin.py Project: lxjhk/wavelet
def train(trainloader, model, criterion, optimizer, epoch, use_cuda):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    bar = Bar('Processing', max=len(trainloader))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        cuda_p2p.cudaSync()
        # measure data loading time
        data_time.update(time.time() - end)
        if use_cuda:
            # the first half of the batches runs on cuda:3, the second half on cuda:2
            if batch_idx < len(trainloader) // 2:
                inputs = inputs.cuda("cuda:3")
                targets = targets.cuda("cuda:3", non_blocking=True)
            else:
                inputs = inputs.cuda("cuda:2")
                targets = targets.cuda("cuda:2", non_blocking=True)

        # compute output
        cuda_p2p.cudaSync()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.data.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        cuda_p2p.cudaSync()
        optimizer.zero_grad()

        #pdb.set_trace()

        cuda_p2p.cudaSync()
        loss.backward()
        cuda_p2p.cudaSync()

        optimizer.step()
        cuda_p2p.cudaSync()
        #print(list(model.classifier.weight)[0][:10])
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        #plot progress
        bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            batch=batch_idx + 1,
            size=len(trainloader),
            data=data_time.avg,
            bt=batch_time.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg,
            top5=top5.avg,
        )
        bar.next()
    bar.finish()
    return (losses.avg, top1.avg)
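
Both examples rely on AverageMeter, which is not reproduced on this page. In CIFAR training scripts of this style it is usually a small running-average tracker; the sketch below is a minimal version written under that assumption, not necessarily the project's exact class.

class AverageMeter:
    # Tracks a weighted running average the way the loops above use it:
    # update(value, n) folds in a batch of size n, and .avg holds the mean so far.
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count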