# Imports used by the MNIST example functions below (log_grad_norm_sum, test, train, main).
import argparse
import time

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

import dadt.pytorch as dadt


def log_grad_norm_sum(optimizer):
    # Collect the norm of every trainable parameter's gradient and print the sum
    # together with this process's rank.
    tensors = []

    for param_group in optimizer.param_groups:
        for p in param_group['params']:
            if p.requires_grad:
                tensors.append(torch.norm(p.grad))

    t = torch.stack(tensors)

    print('Rank:', dadt.rank(), 'Grad norm sum:', torch.sum(t))
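# Usage note (an illustration, not part of the original example): because
# log_grad_norm_sum reads p.grad and dadt.rank(), call it only after dadt.init()
# has run and after loss.backward() has populated the gradients, for example
# between loss.backward() and the optimizer step in the train() function below,
# to compare gradient norms across ranks.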
def test(model, device, test_loader):
    model.eval()

    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nRank:{}, Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
          .format(dadt.rank(), test_loss, correct, len(test_loader.dataset),
                  100. * correct / len(test_loader.dataset)))
def train(args, model, device, train_loader, distributed_optimizer, epoch):
    model.train()

    for batch_idx, (data, target) in enumerate(train_loader):
        start_time = int(time.time() * 1000)

        data, target = data.to(device), target.to(device)

        distributed_optimizer.zero_grad()

        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()

        distributed_optimizer.step()

        end_time = int(time.time() * 1000)

        if batch_idx % args.log_interval == 0:
            print('Rank:{}, Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, cost time:{} ms'
                  .format(dadt.rank(), epoch, batch_idx * len(data),
                          len(train_loader.dataset),
                          100. * batch_idx / len(train_loader), loss.item(),
                          end_time - start_time))
# coding=utf-8
import torch

import dadt.pytorch as dadt

# A minimal all_reduce example: each rank contributes its own tensor under the
# name "x" and prints what the collective returns.
dadt.init(broad_cast_executor='nccl', all_reduce_executor='nccl')

device = torch.device('cuda:{}'.format(dadt.local_rank()))
# device = torch.device('cpu')

if 0 == dadt.rank():
    x = torch.tensor([1, 2, 3, 4], device=device, dtype=torch.float)
else:
    x = torch.tensor([1, 1, 1, 1], device=device, dtype=torch.float)

# the same tensor name "x" is reduced twice
y = dadt.all_reduce(x, "x")
print(dadt.rank(), y)

y = dadt.all_reduce(x, "x")
print(dadt.rank(), y)

dadt.shutdown()
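# Launch note (an assumption about the launcher, not stated in this example):
# dadt scripts are typically started with one process per GPU via an MPI-style
# launcher, e.g. `mpirun -np 2 python all_reduce_example.py` (the script name
# here is hypothetical). Each process then picks its own CUDA device through
# dadt.local_rank(), as done above.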
import os
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms

import dadt.pytorch as dadt


def train():
    # init dadt
    dadt.init(cycle_duration_ms=5,
              broad_cast_executor='nccl',
              all_reduce_executor='nccl',
              group_buffer_size=0)

    # Data augmentation and normalization for training
    # Just normalization for validation
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    data_dir = 'hymenoptera_data'

    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in ['train', 'val']}
    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x],
                                                  batch_size=4,
                                                  shuffle=True,
                                                  num_workers=4)
                   for x in ['train', 'val']}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes

    device = torch.device("cuda:{}".format(dadt.local_rank()))

    model_ft = models.resnet101(pretrained=False)

    num_ftrs = model_ft.fc.in_features
    # Here the size of each output sample is set to 2.
    # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
    model_ft.fc = nn.Linear(num_ftrs, 2)
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.002, momentum=0.9)

    # init distributed optimizer
    d_optimizer = dadt.DistributedOptimizer(optimizer=optimizer_ft)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    model_ft.train()

    total_cost_time = 0.0
    total_count = 0.0

    for epoch in range(2500000):
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in dataloaders['train']:
            start_time = time.time()

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            d_optimizer.zero_grad()

            outputs = model_ft(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()

            d_optimizer.step()

            cost_time = int(round((time.time() - start_time) * 1000))
            total_cost_time += cost_time
            total_count += 1

            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

            print('Rank:{}, cost time:{} ms, avg time:{:.1f} ms, epoch:{}, loss:{}'.format(
                dadt.rank(), cost_time, total_cost_time / total_count, epoch,
                loss.item()))

        print('--------------------------------------------------------------------------')

        epoch_loss = running_loss / dataset_sizes['train']
        epoch_acc = running_corrects.double() / dataset_sizes['train']

        print('Rank:{}, {} Loss: {:.4f} Acc: {:.4f}'.format(
            dadt.rank(), 'train', epoch_loss, epoch_acc))
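# A minimal entry point, assuming this transfer-learning example is saved and
# run as a standalone script (one process per GPU):
if __name__ == '__main__':
    train()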
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    # initialize dadt
    dadt.init(broad_cast_executor='nccl',
              all_reduce_executor='nccl',
              all_reduce_buffer_size=64 * 1024 * 1024)

    torch.manual_seed(args.seed)

    # get device by rank
    device = torch.device("cuda:{}".format(dadt.local_rank()))

    train_kwargs = {'batch_size': args.batch_size,
                    'num_workers': 1,
                    'pin_memory': True,
                    'shuffle': True}
    test_kwargs = {'batch_size': args.test_batch_size,
                   'num_workers': 1,
                   'pin_memory': True,
                   'shuffle': True}

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    dataset1 = datasets.MNIST('./data{}'.format(dadt.local_rank()),
                              train=True,
                              download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./data{}'.format(dadt.local_rank()),
                              train=False,
                              transform=transform)

    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    # Net is the CNN model class defined with the rest of this example (not shown here)
    model = Net().to(device)

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # wrap the local optimizer with dadt's DistributedOptimizer
    distributed_optimizer = dadt.DistributedOptimizer(optimizer=optimizer)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, distributed_optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model and 0 == dadt.rank():
        torch.save(model.state_dict(), "mnist_cnn.pt")

    # shut down background thread
    dadt.shutdown()
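# A minimal entry point, assuming the MNIST example (log_grad_norm_sum, test,
# train, main and the Net model defined elsewhere) lives in one script:
if __name__ == '__main__':
    main()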