def run(rank, size, epochs, batch_size):
    torch.manual_seed(0)
    numpy.random.seed(0)
    # split the global batch size evenly across workers
    batch_size = int(batch_size / float(dist.get_world_size()))
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="./data", train=True,
                                    download=True, transform=transform_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_set, num_replicas=size, rank=rank)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=2,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               shuffle=False,
                                               pin_memory=True)
    # len(train_loader) is the number of mini-batches this worker will see
    print('Number of training batches per worker: {}'.format(len(train_loader)))
    test_set = datasets.CIFAR10(root="./data", train=False,
                                download=True, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    print('Number of test batches: {}'.format(len(test_loader)))
    training_criterion = torch.nn.CrossEntropyLoss().to(device)
    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    # run training for the requested number of epochs
    for epoch in range(epochs):
        start_time = time.time()
        train_model(model, train_loader, optimizer, training_criterion, rank)
        print('Training time after {} epoch is {}'.format(
            epoch + 1, (time.time() - start_time)))
        test_model(model, test_loader, training_criterion)
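# run() above assumes the torch.distributed process group is already
# initialized (it calls dist.get_world_size()).  The following launcher is a
# minimal, hypothetical sketch of how it could be started on a single machine
# with torch.multiprocessing.spawn and the gloo backend; the MASTER_ADDR /
# MASTER_PORT values, the world size of 2, and the 4-epoch / 256 global batch
# settings are illustrative assumptions, not part of the original script.
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def init_and_run(rank, size, epochs, batch_size):
    # every process joins the same process group before training starts
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=rank, world_size=size)
    run(rank, size, epochs, batch_size)
    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = 2
    # spawn one worker process per rank; each receives its rank as the first argument
    mp.spawn(init_and_run, args=(world_size, 4, 256), nprocs=world_size)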
def vgg_model(rank, size):
    torch.manual_seed(5000)
    np.random.seed(5000)
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="../../data", train=True,
                                    download=True, transform=transform_train)
    # shard the training set across ranks with a DistributedSampler
    sampler_d = DistributedSampler(
        training_set) if torch.distributed.is_available() else None
    # batch_size is expected to be defined at module level;
    # shuffle must stay off when a sampler is supplied
    train_loader = torch.utils.data.DataLoader(
        training_set,
        num_workers=2,
        batch_size=batch_size,
        sampler=sampler_d,
        pin_memory=True)
    test_set = datasets.CIFAR10(root="../../data", train=False,
                                download=True, transform=transform_test)
    # the test set is not sharded; every rank evaluates the full test set
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)
    model = mdl.VGG11()
    model.to(device)
    # DistributedDataParallel synchronizes gradients across ranks automatically
    ddp_model = DDP(model)
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    # run training for one epoch
    for epoch in range(1):
        train_model(ddp_model, train_loader, optimizer, training_criterion,
                    epoch, rank)
        test_model(ddp_model, test_loader, training_criterion)
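# train_model() and test_model() are called above but not defined in this
# snippet.  A minimal sketch of what they might look like for the DDP case,
# assuming the same `device` global used above; with a DDP-wrapped model the
# gradient averaging happens automatically inside loss.backward(), so the body
# is the plain single-process training loop.
def train_model(model, train_loader, optimizer, criterion, epoch, rank):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()  # DDP all-reduces gradients during the backward pass
        optimizer.step()
        running_loss += loss.item()
        if batch_idx % 20 == 19:
            print('rank {} epoch {} batch {} loss {:.3f}'.format(
                rank, epoch, batch_idx + 1, running_loss / 20))
            running_loss = 0.0


def test_model(model, test_loader, criterion):
    model.eval()
    correct, total, test_loss = 0, 0, 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            correct += (output.argmax(dim=1) == target).sum().item()
            total += target.size(0)
    print('test loss {:.3f}, accuracy {:.2f}%'.format(
        test_loss / len(test_loader), 100.0 * correct / total))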
def main():
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="../../data", train=True,
                                    download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=2,
                                               batch_size=batch_size,
                                               sampler=None,
                                               shuffle=True,
                                               pin_memory=True)
    test_set = datasets.CIFAR10(root="../../data", train=False,
                                download=True, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)
    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    # run training for one epoch
    for epoch in range(1):
        train_model(model, train_loader, optimizer, training_criterion, epoch)
        test_model(model, test_loader, training_criterion)
def run(rank, size, outputfile):
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    """set random seed"""
    torch.manual_seed(randomseed)
    print('manual_seed=', randomseed)
    """set up data"""
    train_set, bsz = partition_dataset(normalize)
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    test_set = datasets.CIFAR10(root="./data", train=False,
                                download=True, transform=transform_test)
    # per-worker batch size: global batch size divided by the world size
    bsz = int(batch_size / float(size))
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=bsz,
                                              shuffle=False,
                                              pin_memory=True)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    """set up model"""
    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    num_batches = math.ceil(len(train_set.dataset) / float(bsz))
    """write output to a per-rank file"""
    out_name = (outputfile + "_r" + str(dist.get_rank()) +
                "_size" + str(dist.get_world_size()))
    if os.path.exists(out_name):
        os.remove(out_name)
    fp = open(out_name, "a")
    """start training"""
    total_epoch = 1
    for epoch in range(total_epoch):
        # training starts here
        running_loss = 0.0
        # remember to exit the train loop at the end of the epoch
        for batch_idx, (data, target) in enumerate(train_set):
            # time the first 10 batches individually
            if batch_idx < 10:
                start = timeit.default_timer()
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(data)
            loss = criterion(outputs, target)
            running_loss += loss.item()
            loss.backward()
            # manually average gradients across ranks before stepping
            average_gradients(model)
            optimizer.step()
            if batch_idx % 20 == 19:
                # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, batch_idx + 1, running_loss / 20))
                fp.write('[%d, %5d] loss: %.3f\n' %
                         (epoch + 1, batch_idx + 1, running_loss / 20))
                running_loss = 0.0
            if batch_idx == 0:
                fp.write("Batch\trunning time\n")
            if batch_idx < 10:
                end = timeit.default_timer() - start
                print("Batch " + str(batch_idx) + " running time: " + str(end))
                fp.write('%d\t%.5f\n' % (batch_idx, end))
        # training stops here
    fp.close()
    test_model(model, test_loader, criterion, outputfile)
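# average_gradients() is called above after every backward pass but is not
# defined in this snippet.  A minimal sketch of the usual implementation:
# all-reduce each parameter's gradient and divide by the world size so every
# rank steps with the same averaged gradient.  `dist` is assumed to be the
# same torch.distributed alias used above; the partition_dataset() helper is
# also assumed but not reproduced here.
def average_gradients(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size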
def main():
    # pull in parameters and set variables needed for initialization
    args = parse_args()
    print('main.py launched with master ip: ' + str(args.master_ip) +
          ' num nodes: ' + str(args.num_nodes) + ' rank: ' + str(args.rank) +
          ' port: ' + str(args.port))
    world_size = int(args.num_nodes)
    os.environ["GLOO_SOCKET_IFNAME"] = 'eth1'
    # the global batch size is 256 across all nodes, so divide by the number
    # of nodes (integer division so DataLoader receives an int)
    batch_size = 256 // world_size
    random_seed = 314
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    # initialize the distributed process group
    td.init_process_group('gloo',
                          init_method='tcp://' + args.master_ip + ':' + args.port,
                          world_size=world_size,
                          rank=int(args.rank))
    # start model setup
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="./data", train=True,
                                    download=True, transform=transform_train)
    # added for distributed training: shard the training set across nodes
    dist_train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_set)
    # add the sampler to the train_loader (shuffle must stay off with a sampler)
    train_loader = torch.utils.data.DataLoader(
        training_set,
        num_workers=2,
        batch_size=batch_size,
        sampler=dist_train_sampler,
        pin_memory=True)
    test_set = datasets.CIFAR10(root="./data", train=False,
                                download=True, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)
    model = mdl.VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    # save off the model to check that it is initialized the same across nodes
    # torch.save(model.state_dict(), './node' + args.rank + '_' + 'initial_model_weights')
    # run training for one epoch
    for epoch in range(1):
        train_model(model, train_loader, optimizer, training_criterion,
                    epoch, world_size)
        test_model(model, test_loader, training_criterion)
# MNIST digits are upscaled to 224x224 so they can be fed to VGG11
train_imageData = mnist_data.train.images.reshape(
    mnist_data.train.images.shape[0], 28, 28)
print("start expand data")
train_imageData = utils.BatchImageExpand(train_imageData[:SampleSize], 224)
print("expand data done")
train_labels = mnist_data.train.labels[:SampleSize]
train_imageData = train_imageData.reshape(-1, 224, 224, 1)
dataset = utils.DataSet(train_imageData, train_labels,
                        testRate=0.05, batchSize=batchSize)
learnCurve = utils.learnCurve()
vgg = model.VGG11()
# data = dataset.getNextBatch()
with tf.Session() as sess:
    writer = tf.summary.FileWriter("name_scope", sess.graph)
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    for ep in range(1, 11):
        data = dataset.getNextBatch()
        while data is not None:
            # for i, d in enumerate(data[0]):
            #     utils.plotDigit(d.reshape(224, 224))
            #     print(np.argmax(data[1][i]))
            # print(data[1])
def main():
    args = parser.parse_args()
    # set a fixed seed for generating random numbers
    seed = 314
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # the global batch size of 256 is split evenly across nodes
    # (integer division so DataLoader receives an int)
    batch_size = 256 // args.num_nodes
    # distributed setup
    setup(args)
    model = mdl.VGG11()
    print("Setup Finished")
    # Distributed data-parallel training using the built-in module on CPU.
    # DDP uses collective communications from the torch.distributed package to
    # synchronize gradients and buffers transparently: gradient synchronization
    # takes place during the backward pass and overlaps with the backward
    # computation.
    ddp_model = DDP(model)
    ddp_model.to(device)
    training_criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    # data loading and preprocessing
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    training_set = datasets.CIFAR10(root="./data", train=True,
                                    download=True, transform=transform_train)
    print("Data Loaded")
    # sampler that restricts each rank to its own subset of the dataset
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_set)
    train_loader = torch.utils.data.DataLoader(training_set,
                                               num_workers=args.num_nodes,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               shuffle=(train_sampler is None),
                                               pin_memory=True)
    test_set = datasets.CIFAR10(root="./data", train=False,
                                download=True, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=args.num_nodes,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    print("Training Started")
    start = time.time()
    for epoch in range(Epoch):
        # set_epoch() must be called every epoch so shuffling works properly
        # across multiple epochs
        train_sampler.set_epoch(epoch)
        e_s = time.time()
        train_model(ddp_model, train_loader, optimizer, training_criterion,
                    epoch)
        e_d = time.time()
        print("Duration of Epoch " + str(epoch) + " :", e_d - e_s)
        test_model(ddp_model, test_loader, training_criterion)
    print("Total Duration after " + str(epoch + 1) + " epochs :", e_d - start)
    cleanup()
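# setup() and cleanup() are called in main() above but not shown.  A minimal
# sketch, assuming args carries master_ip, port, rank and num_nodes fields as
# in the other snippets and that the gloo backend is used; the
# `torch.distributed as dist` import is an assumption of this sketch.
import torch.distributed as dist


def setup(args):
    # join the shared process group so DDP can communicate between nodes
    dist.init_process_group(
        backend='gloo',
        init_method='tcp://' + args.master_ip + ':' + str(args.port),
        rank=int(args.rank),
        world_size=int(args.num_nodes))


def cleanup():
    # tear the process group down once training is finished
    dist.destroy_process_group()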