import torch
import torch.nn as nn
import torch.distributed as dist
from torch.backends import cudnn
from torch.utils.data import DataLoader
from torch.utils.data import distributed

# AlexNet, LoadClassifyDataSets, collate_fn and train() are assumed to be defined elsewhere in the project.

def run(gpu, config):
    cudnn.benchmark = True
    if config['distribute']:
        # global rank = node rank * GPUs per node + local GPU index
        rank = config['rank'] * config['last_node_gpus'] + gpu
        print("world_size: {}, rank: {}".format(config['world_size'], rank))
        dist.init_process_group(backend=config['backend'], init_method=config['ip'],
                                world_size=config['world_size'], rank=rank)

    # create model
    model = AlexNet(10)
    if config['distribute']:
        # pin this process to its GPU and wrap the model for distributed training
        torch.cuda.set_device(gpu)
        model.cuda(gpu)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), config['lr'],
                                momentum=config['momentum'],
                                weight_decay=config['weight_decay'])

    # load data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if config['distribute']:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, config['batch_size'],
                              shuffle=(train_sampler is None),
                              num_workers=config['num_workers'], pin_memory=True,
                              sampler=train_sampler, collate_fn=collate_fn)

    for epo in range(config['epoch']):
        if config['distribute']:
            # reshuffle the distributed sampler at the start of every epoch
            train_sampler.set_epoch(epo)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epo, gpu)
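run() is written as a per-GPU worker: it receives the local GPU index plus a config dict, so it is typically started once per local GPU with torch.multiprocessing.spawn. The launcher below is only a minimal sketch of that assumption; the config keys mirror the ones run() reads, and the concrete values (address, node rank, GPU count, world size) are placeholders, not values from the original code.

import torch.multiprocessing as mp

if __name__ == '__main__':
    config = {
        'distribute': True,
        'backend': 'nccl',
        'ip': 'tcp://172.16.117.110:1234',  # placeholder address of the rank-0 node
        'rank': 0,                          # index of this node
        'last_node_gpus': 2,                # GPUs per node, used to compute the global rank
        'world_size': 4,                    # total number of processes across all nodes
        'lr': 0.01,
        'momentum': 0.9,
        'weight_decay': 1e-4,
        'batch_size': 128,
        'num_workers': 4,
        'epoch': 100,
    }
    # spawn one process per local GPU; each process calls run(gpu, config)
    mp.spawn(run, nprocs=config['last_node_gpus'], args=(config,))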
# tail of a weight-initialization helper (the snippet starts mid-function)
m.weight.data.fill_(1)
m.bias.data.zero_()

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                        transform=transform_train)
trainloader = DataLoader(trainset, batch_size=100, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True,
                                       transform=transform_test)
testloader = DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

n_output = 10
net = AlexNet(n_output)

# use the GPU if one is available
if use_cuda:
    # move parameters and buffers to the GPU
    net.cuda()
    # replicate the model across all visible GPUs
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    # speeds things up slightly
    cudnn.benchmark = True

# define the loss and the optimizer
criterion = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9,
                      weight_decay=5e-4)  # stochastic gradient descent

# training phase
def train(epoch):
    print('\nEpoch: %d' % epoch)
    # switch to train mode
    net.train()
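The snippet breaks off right after net.train(), so the rest of the epoch loop is not shown. A typical continuation for this DataParallel setup, sketched here from the names defined above rather than taken from the original code, would look roughly like this:

    train_loss, correct, total = 0.0, 0, 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # track running loss and accuracy for logging
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    print('train loss: %.3f | acc: %.3f%%'
          % (train_loss / (batch_idx + 1), 100. * correct / total))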
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import distributed

# AlexNet, LoadClassifyDataSets and collate_fn are assumed to come from the project code.

def main(is_distributed, rank, ip):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl', init_method=ip,
                                             world_size=world_size, rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    print("Connect")

    # set hyper-parameters
    batch_size = 128
    lr = 0.01  # based on a batch size of 256
    momentum = 0.9
    weight_decay = 0.0001
    epoch = 100

    # scale the learning rate with the world size
    lr = lr * world_size

    # create model
    model = AlexNet(10)
    model = model.cuda()
    if is_distributed:
        # wrap the model for distributed training
        model = nn.parallel.DistributedDataParallel(model)

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, batch_size, shuffle=(train_sampler is None),
                              num_workers=4, pin_memory=True, sampler=train_sampler,
                              collate_fn=collate_fn)

    for epo in range(epoch):
        if is_distributed:
            # reshuffle the distributed sampler at the start of every epoch
            train_sampler.set_epoch(epo)
        model.train()
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        print("Epoch is {}".format(epo))
        while inputs is not None:
            step += 1
            print("Step is {}".format(step))
            if not is_distributed:
                # in the distributed case DDP (wrapped without device_ids) scatters
                # CPU inputs to the GPUs itself, so inputs are only moved manually here
                inputs = inputs.cuda()

            # time the forward pass
            time_model_1 = time.time()
            output = model(inputs)
            time_model_2 = time.time()
            print("model time: {}".format(time_model_2 - time_model_1))

            # time the loss computation
            time_loss_1 = time.time()
            loss = criterion(output, target.cuda())
            time_loss_2 = time.time()
            print("loss time: {}".format(time_loss_2 - time_loss_1))

            optimizer.zero_grad()

            # time the backward pass
            time_back_1 = time.time()
            loss.backward()
            time_back_2 = time.time()
            print("back time: {}".format(time_back_2 - time_back_1))

            optimizer.step()
            if step % 10 == 0:
                print("loss is : {}".format(loss.item()))
            inputs, target = next(train_iter, (None, None))
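These scripts all pass a collate_fn (and use LoadClassifyDataSets) that is defined elsewhere in the project. For a classification loader like this one, a collate function usually just stacks the batch's images and labels into tensors; the version below is a hypothetical stand-in to make the DataLoader calls concrete, not the project's actual implementation.

import torch

def collate_fn(batch):
    # batch is a list of (image_tensor, label) pairs produced by the dataset
    images = torch.stack([sample[0] for sample in batch], dim=0)
    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.long)
    return images, labels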
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import distributed

def main(is_distributed, sync_bn, rank):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='tcp://172.16.117.110:1234',
                                             world_size=world_size, rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # set hyper-parameters
    batch_size = 30
    lr = 0.01  # based on a batch size of 256
    momentum = 0.9
    weight_decay = 0.0001
    epoch = 100

    # scale the learning rate with the world size
    lr = lr * world_size

    # create model
    model = AlexNet(10)
    # use apex to synchronize BatchNorm statistics across GPUs
    # if sync_bn:
    #     model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)

    # initialize Amp
    # model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0')

    if is_distributed:
        # for distributed training
        # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        model = nn.parallel.DistributedDataParallel(model)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set, world_size, rank=rank)
        # train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, batch_size, shuffle=(train_sampler is None),
                              num_workers=4, pin_memory=True, sampler=train_sampler,
                              collate_fn=collate_fn)

    for epo in range(epoch):
        if is_distributed:
            # reshuffle the distributed sampler at the start of every epoch
            train_sampler.set_epoch(epo)
        model.train()
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        print("Epoch is {}".format(epo))
        while inputs is not None:
            step += 1
            temp = inputs.cuda()
            output = model(temp)
            loss = criterion(output, target.cuda())

            optimizer.zero_grad()
            loss.backward()
            # with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
            #     scaled_loss.backward()
            optimizer.step()

            if step % 10 == 0:
                print("loss is : ", loss.item())
            inputs, target = next(train_iter, (None, None))
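main(is_distributed, sync_bn, rank) still has to be started once per node with the right rank; since the init_method hard-codes tcp://172.16.117.110:1234, rank 0 would run on that machine and rank 1 on the second node. An assumed entry point (not part of the original code) could look like this:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--distributed', action='store_true', help='enable 2-node DDP training')
    parser.add_argument('--sync_bn', action='store_true', help='convert BatchNorm to SyncBatchNorm via apex')
    parser.add_argument('--rank', type=int, default=0, help='rank of this node (0 or 1)')
    args = parser.parse_args()
    main(args.distributed, args.sync_bn, args.rank)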