def _test_reduce_helper(
    self,
    group,
    group_id,
    rank,
    op,
    master_value,
    worker_value,
    expected_value,
    cuda=False,
    rank_to_GPU=None,
):
    """Run ``dist.reduce`` once per member of ``group`` and check the root's result.

    For every ``src`` in ``group`` a tensor is created with
    ``_build_tensor(src + 1)``.  The process whose ``rank`` equals ``src``
    fills it with ``master_value``; every other process fills it with
    ``worker_value``.  All processes then call
    ``dist.reduce(tensor, src, op, group_id)``, and only the root (``src``)
    asserts that its tensor equals ``_build_tensor(src + 1, expected_value)``.

    When ``cuda`` is true the tensor is moved to the device
    ``rank_to_GPU[rank][0]`` before the collective call.  A final
    ``self._barrier()`` is issued after the loop.
    """
    for src in group:
        is_root = rank == src
        initial_value = master_value if is_root else worker_value

        tensor = _build_tensor(src + 1).fill_(initial_value)
        if cuda:
            tensor = tensor.cuda(rank_to_GPU[rank][0])

        dist.reduce(tensor, src, op, group_id)

        # Only the destination of the reduction checks the result.
        if is_root:
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))

    self._barrier()
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that the process with
    rank 0 has the averaged results.

    Args:
        loss_dict: mapping from loss name to a tensor.  The values are
            combined with ``torch.stack``, so they must all have the same
            shape.  Keys must be mutually orderable (e.g. all strings)
            because they are sorted before stacking.

    Returns:
        A dict with the same fields as ``loss_dict``, after reduction.  When
        the world size is below 2 the input dict is returned unchanged.
        Only rank 0 divides by the world size; per the ``dist.reduce``
        contract only the destination rank receives the final result, so
        the values returned on other ranks should not be relied upon.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        # Sort the keys so that every process stacks its losses in the same
        # order.  Dict iteration order follows insertion order, which is not
        # guaranteed to be identical across processes; without sorting, the
        # element-wise reduction could sum unrelated loss terms together.
        loss_names = sorted(loss_dict.keys())
        all_losses = torch.stack([loss_dict[name] for name in loss_names], dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = dict(zip(loss_names, all_losses))
    return reduced_losses
def run():
    """Serve model parameters to the process group in an endless loop.

    Builds a ``model.CNN`` and a process group containing every rank in
    ``range(world_size)``.  Each iteration of the loop then:

    1. broadcasts every parameter tensor from rank 0 to the group;
    2. for every parameter, reduces (SUM) a zero-initialised buffer to
       rank 0 and overwrites the parameter with that sum divided by
       ``size - 1``.

    The loop has no exit condition, so this function never returns.

    NOTE(review): the zero buffer and the ``size - 1`` divisor suggest this
    is meant to run on rank 0 acting as an aggregator that averages the
    other ranks' parameters -- confirm against the launcher.
    """
    modell = model.CNN()

    size = dist.get_world_size()
    rank = dist.get_rank()  # not used below; call retained as in the original
    group = dist.new_group(list(range(size)))

    while True:
        # Phase 1: push the current parameters out from rank 0.
        for param in modell.parameters():
            dist.broadcast(param.data, src=0, group=group)

        # Phase 2: collect the summed contributions and average them.
        for param in modell.parameters():
            accumulated = torch.zeros_like(param.data)
            dist.reduce(accumulated, dst=0, op=dist.reduce_op.SUM, group=group)
            param.data = accumulated / (size - 1)
def run(size, rank):
    """Train a ``model.CNN`` on this rank's data and reduce its weights to rank 0.

    Args:
        size: number of processes; ranks ``0 .. size - 1`` form the
            process group used for the collectives.
        rank: rank of the calling process.  In non-IID mode it selects the
            data split through ``Mnist_noniid().get_train_data<rank>()`` and
            ``Mnist_noniid().get_test_data<rank>()``.

    Raises:
        ValueError: in non-IID mode, when ``rank`` is not positive or
            ``Mnist_noniid`` provides no loaders for it.  Previously such a
            rank failed later with an ``UnboundLocalError``.

    Each epoch: fetch the current model through ``get_new_model``, evaluate
    it on the held-out batch, train for one pass over ``train_loader``,
    SUM-reduce every parameter to rank 0, then append one status line to
    ``./test.txt`` and echo it to stdout.
    """
    modell = model.CNN()
    optimizer = torch.optim.Adam(modell.parameters(), lr=LR)
    loss_func = torch.nn.CrossEntropyLoss()

    # Kept as an equality test so non-bool values of IID behave as before.
    if IID == True:  # noqa: E712
        train_loader = Mnist().get_train_data()
        test_data = Mnist().get_test_data()
    else:
        if rank <= 0:
            raise ValueError(
                'non-IID mode has no data split for rank %r' % (rank,))
        # Resolve the per-rank loaders by name instead of a copy-pasted
        # if-ladder; a separate Mnist_noniid instance is used for each
        # loader, as before.
        load_train = getattr(Mnist_noniid(), 'get_train_data%d' % rank, None)
        load_test = getattr(Mnist_noniid(), 'get_test_data%d' % rank, None)
        if load_train is None or load_test is None:
            raise ValueError(
                'non-IID mode has no data split for rank %r' % (rank,))
        train_loader = load_train()
        test_data = load_test()

    # Keep the last batch yielded by the test loader as the evaluation set.
    for b_x, b_y in test_data:
        test_x, test_y = b_x, b_y

    group = dist.new_group(list(range(size)))

    for epoch in range(MAX_EPOCH):
        # NOTE(review): the optimizer was built from the initial model's
        # parameters; if get_new_model returns a different module object the
        # optimizer would no longer update the live weights -- confirm.
        modell = get_new_model(modell, group)

        # Accuracy is measured on the freshly fetched model, before this
        # epoch's local training.
        test_output, _ = modell(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy()
        correct = (pred_y == test_y.data.numpy()).astype(int).sum()
        accuracy = float(correct) / float(test_y.size(0))

        for b_x, b_y in train_loader:
            output = modell(b_x)[0]
            loss = loss_func(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        for param in modell.parameters():
            dist.reduce(param.data, dst=0, op=dist.reduce_op.SUM, group=group)

        loss_value = loss.data.numpy()
        # The context manager closes the log even if formatting or writing
        # fails.
        with open('./test.txt', 'a') as f:
            print('Epoch: ', epoch, ' Rank: ', rank,
                  '| train loss: %.4f' % loss_value,
                  '| test accuracy: %.2f' % accuracy, file=f)
        print('Epoch: ', epoch, ' Rank: ', rank,
              '| train loss: %.4f' % loss_value,
              '| test accuracy: %.2f' % accuracy)
def run(size, rank):
    """Train a ``model.CNN`` on this rank's data and reduce its weights to rank 0.

    Args:
        size: number of processes; ranks ``0 .. size - 1`` form the
            process group used for the reduction.
        rank: rank of the calling process.  In non-IID mode it selects the
            data split through ``Mnist_noniid().get_train_data<rank>()`` and
            ``Mnist_noniid().get_test_data<rank>()``.

    Raises:
        ValueError: in non-IID mode, when ``rank`` is not positive or
            ``Mnist_noniid`` provides no loaders for it.  Previously such a
            rank failed later with an ``UnboundLocalError``.

    Each epoch: fetch the current model through ``get_new_model``, train for
    one pass over ``train_loader``, SUM-reduce every parameter to rank 0,
    then evaluate on the test set and print one status line.
    """
    modell = model.CNN()
    optimizer = torch.optim.Adam(modell.parameters(), lr=LR)
    loss_func = torch.nn.CrossEntropyLoss()

    # Kept as an equality test so non-bool values of IID behave as before.
    if IID == True:  # noqa: E712
        train_loader = Mnist().get_train_data()
        test_data = Mnist().get_test_data()
    else:
        if rank <= 0:
            raise ValueError(
                'non-IID mode has no data split for rank %r' % (rank,))
        # Resolve the per-rank loaders by name instead of a copy-pasted
        # if-ladder; a separate Mnist_noniid instance is used for each
        # loader, as before.
        load_train = getattr(Mnist_noniid(), 'get_train_data%d' % rank, None)
        load_test = getattr(Mnist_noniid(), 'get_test_data%d' % rank, None)
        if load_train is None or load_test is None:
            raise ValueError(
                'non-IID mode has no data split for rank %r' % (rank,))
        train_loader = load_train()
        test_data = load_test()

    # Test-set preparation is identical for every data source, so it is done
    # once here: add a dimension at index 1, cast to float and scale by 1/255.
    # (Original note: shape (2000, 28, 28) -> (2000, 1, 28, 28), values in
    # range(0,1) -- TODO confirm against the dataset.)
    test_x = torch.unsqueeze(test_data.test_data, dim=1).type(
        torch.FloatTensor) / 255.
    test_y = test_data.test_labels

    group = dist.new_group(list(range(size)))

    for epoch in range(MAX_EPOCH):
        # NOTE(review): the optimizer was built from the initial model's
        # parameters; if get_new_model returns a different module object the
        # optimizer would no longer update the live weights -- confirm.
        modell = get_new_model(modell)

        for b_x, b_y in train_loader:
            output = modell(b_x)[0]
            loss = loss_func(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        for param in modell.parameters():
            # Reduce the underlying tensor (param.data) rather than the
            # autograd-tracked Parameter; both share the same storage, so
            # the collective sees and writes the same values.
            dist.reduce(param.data, dst=0, op=dist.reduce_op.SUM, group=group)

        test_output, _ = modell(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy()
        correct = (pred_y == test_y.data.numpy()).astype(int).sum()
        accuracy = float(correct) / float(test_y.size(0))
        print('Epoch: ', epoch, ' Rank: ', rank,
              '| train loss: %.4f' % loss.data.numpy(),
              '| test accuracy: %.2f' % accuracy)