Code Example #1
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    ngpus_per_node = torch.cuda.device_count()
    print("Use GPU: {} for training".format(args.gpu))

    args.rank = args.rank * ngpus_per_node + gpu
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)

    print('==> Making model..')
    net = pyramidnet()
    torch.cuda.set_device(args.gpu)
    net.cuda(args.gpu)
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.num_workers = int(args.num_workers / ngpus_per_node)
    net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.gpu])
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root='../data',
                            train=True,
                            download=True,
                            transform=transforms_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    train_loader = DataLoader(dataset_train,
                              batch_size=args.batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=args.num_workers,
                              sampler=train_sampler)

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4)

    train(net, criterion, optimizer, train_loader, args.gpu)
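
How main_worker gets launched is not shown in this excerpt. As a point of reference only, a minimal launcher sketch in the style of the standard PyTorch DDP examples (assuming the same argparse fields args.world_size, args.dist_url and args.dist_backend used above, with args.world_size initially set to the number of nodes) could look like this:

import torch
import torch.multiprocessing as mp

def main(args):
    # one worker process per local GPU
    ngpus_per_node = torch.cuda.device_count()
    # total number of processes across all nodes
    args.world_size = ngpus_per_node * args.world_size
    # spawn main_worker once per GPU; each process receives its local GPU index as the first argument
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))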
Code Example #2
File: train.py / Project: alisure-ml/pytorch-multigpu
def main_worker(gpu_id, ngpus, root, args):
    batch_size = int(args.batch_size / ngpus)
    num_workers = int(args.num_workers / ngpus)

    Tools.print("Use GPU: {} for training".format(gpu_id))
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=ngpus,
                            rank=gpu_id)

    Tools.print('==> Making model..')
    net = pyramidnet()
    torch.cuda.set_device(gpu_id)
    net.cuda(gpu_id)
    net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[gpu_id])

    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    Tools.print('The number of parameters of model is {}'.format(num_params))

    Tools.print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root=root,
                            train=True,
                            download=True,
                            transform=transforms_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    train_loader = DataLoader(dataset_train,
                              batch_size=batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=num_workers,
                              sampler=train_sampler)

    criterion = nn.CrossEntropyLoss().cuda(gpu_id)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4)
    cudnn.benchmark = True

    for _ in range(10):
        Tools.print("epoch {}".format(_))
        train(net, criterion, optimizer, train_loader, gpu_id)
    pass
Code Example #3
def main():
    if args.gpu_nums > 1:
        raise ValueError("gpu nums must be equal to 1.")

    # set run env
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root=args.dataset_dir,
                            train=True,
                            download=True,
                            transform=transforms_train)

    train_loader = DataLoader(dataset_train,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)

    print('==> Making model..')

    model = pyramidnet()
    model = model.to(device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.learning_rate,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    for epoch in range(1, args.epochs + 1):
        train(epoch, model, criterion, optimizer, train_loader, device)

    if args.save_model:
        if not path.exists(args.train_dir):
            mkdir(args.train_dir)

        torch.save(model.state_dict(),
                   path.join(args.train_dir, "single_gpu_model.pth"))
        print("single gpu model has been saved.")
Code Example #4
def main(root="/mnt/4T/Data/data/CIFAR"):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    Tools.print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root=root,
                            train=True,
                            download=True,
                            transform=transforms_train)
    train_loader = DataLoader(dataset_train,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_worker)

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    Tools.print('==> Making model..')

    net = pyramidnet()

    #############################################################
    net = nn.DataParallel(net)
    cudnn.benchmark = True
    #############################################################

    net = net.to(device)
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    Tools.print('The number of parameters of model is {}'.format(num_params))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4)
    # optimizer = optim.Adam(net.parameters(), lr=args.lr)

    for _ in range(10):
        Tools.print("epoch {}".format(_))
        train(net, criterion, optimizer, train_loader, device)
    pass
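
For context, nn.DataParallel as used above replicates the wrapped module on every visible GPU and splits the input batch along dimension 0 during forward(). A tiny self-contained sketch (independent of the code above) showing this behavior:

import torch
import torch.nn as nn

if torch.cuda.is_available():
    layer = nn.DataParallel(nn.Linear(32, 10)).cuda()  # replicas on all visible GPUs
    x = torch.randn(8, 32).cuda()   # the batch of 8 is scattered across the replicas
    out = layer(x)                  # per-replica outputs are gathered back onto GPU 0
    print(out.shape)                # torch.Size([8, 10])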
Code Example #5
File: train.py / Project: zzapzzap/hello
def main():
    best_acc = 0

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root='../data',
                            train=True,
                            download=True,
                            transform=transforms_train)

    train_loader = DataLoader(dataset_train,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_worker)

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    print('==> Making model..')

    net = pyramidnet()
    net = parallel.DataParallelModel(net)
    net = net.to(device)

    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    criterion = nn.CrossEntropyLoss()
    criterion = parallel.DataParallelCriterion(criterion, device_ids=[0, 1])
    # criterion = criterion.to(device)
    # optimizer = optim.Adam(net.parameters(),weight_decay=1e-4, lr=args.lr)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4)

    train(net, criterion, optimizer, train_loader, device)
Code Example #6
def main():
    # set run env
    if args.gpu_nums > 1:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        gpu_ids = ','.join([str(id) for id in range(args.gpu_nums)])
        environ["CUDA_VISIBLE_DEVICES"] = gpu_ids
    else:
        raise ValueError("gpu-nums must be greater than 1.")

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    dataset_train = CIFAR10(root='/home/zhaopp5', train=True, download=True,
                            transform=transforms_train)

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)

    print('==> Making model..')

    model = pyramidnet()
    if args.gpu_nums > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate,
                          momentum=args.momentum, weight_decay=args.weight_decay)

    for epoch in range(1, args.epochs + 1):
        train(epoch, model, criterion, optimizer, train_loader, device)

    if args.save_model:
        if not path.exists(args.train_dir):
            mkdir(args.train_dir)

        torch.save(
            model.state_dict(),
            path.join(args.train_dir, "data_parallel_model.pth")
        )
        print("data parallel model has been saved.")
Code Example #7
                       download=True,
                       transform=transforms_test)

train_loader = DataLoader(dataset_train,
                          batch_size=args.batch_size,
                          shuffle=True,
                          num_workers=args.num_worker)
test_loader = DataLoader(dataset_test,
                         batch_size=args.batch_size_test,
                         shuffle=False,
                         num_workers=args.num_worker)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
           'ship', 'truck')

net = pyramidnet()
net = net.to('cuda')
num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)

if args.resume is not None:
    checkpoint = torch.load('./save_model/' + args.resume)
    net.load_state_dict(checkpoint['net'])

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(),
                      lr=0.1,
                      momentum=0.9,
                      weight_decay=1e-4)

decay_epoch = [32000, 48000]
step_lr_scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=decay_epoch,
                                             gamma=0.1)
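
The milestones above (32000 and 48000) read like iteration counts rather than epoch counts, which suggests the scheduler is stepped once per batch rather than once per epoch. A hedged sketch of that pattern (args.epochs is assumed; the other names match the snippet above):

for epoch in range(args.epochs):
    for inputs, targets in train_loader:
        inputs, targets = inputs.to('cuda'), targets.to('cuda')
        loss = criterion(net(inputs), targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step_lr_scheduler.step()  # decays lr by gamma at iterations 32000 and 48000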
Code Example #8
def main():
    best_acc = 0  # unused here; presumably meant for tracking the best accuracy

    device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use CUDA if available, otherwise fall back to the CPU

    print('==> Preparing data..')  # data preprocessing uses torchvision.transforms
    transforms_train = transforms.Compose([  # transforms_train chains the augmentations below with Compose
        transforms.RandomCrop(32, padding=4),  # random crop (with padding) and horizontal flip augment the training data
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    dataset_train = CIFAR10(root='../data', train=True, download=True,  # the actual data: CIFAR10 from torchvision, downloaded if missing
                            transform=transforms_train)  # the test split can be loaded separately the same way

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_worker)  # batch, shuffle, and load with num_worker CPU worker processes

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')  # the ten CIFAR-10 class names

    print('==> Making model..')

    net = pyramidnet()  # build the PyramidNet architecture, imported from model.py in the same folder
    net = net.to(device)  # move the model onto the chosen device
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)  # count all trainable parameters
    print('The number of parameters of model is', num_params)  # report the parameter count

    criterion = nn.CrossEntropyLoss()  # the criterion (loss function) is cross-entropy from torch.nn
    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=0.9, weight_decay=1e-4)  # SGD with the configured learning rate, momentum, and weight decay

    train(net, criterion, optimizer, train_loader, device)  # run the training loop defined below
            

def train(net, criterion, optimizer, train_loader, device):
    net.train()

    train_loss = 0
    correct = 0
    total = 0
    
    epoch_start = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        start = time.time()
        
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        acc = 100 * correct / total
        
        batch_time = time.time() - start
        
        if batch_idx % 20 == 0:
            print('Epoch: [{}/{}]| loss: {:.3f} | acc: {:.3f} | batch time: {:.3f}s '.format(
                batch_idx, len(train_loader), train_loss/(batch_idx+1), acc, batch_time))
    
    elapse_time = time.time() - epoch_start
    elapse_time = datetime.timedelta(seconds=elapse_time)
    print("Training time {}".format(elapse_time))
    

if __name__=='__main__':
    main()
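
Like most of the snippets on this page, the example above reads its hyperparameters from a module-level args object that the excerpt does not show. A plausible, minimal argparse setup matching the fields it uses (the defaults here are assumptions):

import argparse

parser = argparse.ArgumentParser(description='CIFAR-10 PyramidNet training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--batch_size', default=128, type=int, help='training batch size')
parser.add_argument('--num_worker', default=4, type=int, help='number of DataLoader worker processes')
args = parser.parse_args()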
Code Example #9
def main_worker(gpu, ngpus_per_node, args):
    # init the process group
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.init_method,
                            world_size=args.world_size,
                            rank=args.rank)

    torch.cuda.set_device(gpu)

    print("From Rank: {}, Use GPU: {} for training".format(args.rank, gpu))

    print('From Rank: {}, ==> Making model..'.format(args.rank))
    net = pyramidnet()
    net.cuda(gpu)
    args.batch_size = int(args.batch_size / ngpus_per_node)
    print("batch_size: ", args.batch_size)

    net = torch.nn.parallel.DistributedDataParallel(net,
                                                    device_ids=[gpu],
                                                    output_device=gpu)
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print(
        'From Rank: {}, The number of parameters of model is'.format(
            args.rank), num_params)

    print('From Rank: {}, ==> Preparing data..'.format(args.rank))
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root=args.dataset_dir,
                            train=True,
                            download=True,
                            transform=transforms_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    train_loader = DataLoader(dataset_train,
                              batch_size=args.batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=args.num_workers,
                              sampler=train_sampler)

    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.learning_rate,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

    for epoch in range(1, args.epochs + 1):
        train(epoch, net, criterion, optimizer, train_loader, args.rank)
        scheduler.step()

    if args.save_model:
        if not path.exists(args.train_dir):
            mkdir(args.train_dir)

        # if args.rank == 0:
        torch.save(
            net.module.state_dict(),
            path.join(args.train_dir,
                      "distributed_data_parallel_{}.pth".format(args.rank)))
        print("From Rank: {}, model saved.".format(args.rank))
Code Example #10
def main():
    args = parser.parse_args()

    #init the process group
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.init_method,
                            world_size=args.world_size,
                            rank=args.rank)

    #set cuda device for use
    gpu_devices = ','.join([str(id) for id in args.gpu_devices])
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_devices
    print("From Rank: {}, Use GPU: {} for training".format(
        args.rank, gpu_devices))

    print('From Rank: {}, ==> Making model..'.format(args.rank))
    net = pyramidnet()
    net.cuda()
    args.batch_size = int(args.batch_size / args.world_size)
    args.num_workers = int(args.num_workers / args.world_size)
    net = torch.nn.parallel.DistributedDataParallel(
        net, device_ids=args.gpu_devices, output_device=args.gpu_devices[0])
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print(
        'From Rank: {}, The number of parameters of model is'.format(
            args.rank), num_params)

    print('From Rank: {}, ==> Preparing data..'.format(args.rank))
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    dataset_train = CIFAR10(root='./data',
                            train=True,
                            download=True,
                            transform=transforms_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    train_loader = DataLoader(dataset_train,
                              batch_size=args.batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=args.num_workers,
                              sampler=train_sampler)

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

    for epoch in range(args.max_epochs):
        train(epoch, net, criterion, optimizer, train_loader, args.rank)
        scheduler.step()

    # if args.rank == 0:
    torch.save(net.module.state_dict(),
               "final_model_rank_{}.pth".format(args.rank))
    print("From Rank: {}, model saved.".format(args.rank))