def test_dynamic_win_put_optimizer(device, kwargs):
    error_threshold = kwargs.get("error_threshold", 1.5)
    window_prefix = kwargs.get("window_prefix", None)
    problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup()
    isCUDA = pin_model_to_device(device, model)

    optimizer = bf.DistributedWinPutOptimizer(optimizer, model=model,
                                              window_prefix=window_prefix)
    # Train and test
    train_mse = []
    test_mse = []
    for epoch in range(num_epochs):
        dynamic_win_put_train(
            model, optimizer, train_dataloader, isCUDA, epoch)
        train_mse.append(evaluation(model, train_dataloader, isCUDA))
        test_mse.append(evaluation(model, test_dataloader, isCUDA))
    train_mse = np.array(train_mse)
    test_mse = np.array(test_mse)

    # Check if the MSEs in the last three epochs are small enough
    assert (
        train_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Train MSE in the last three epochs doesn't converge."
    assert (
        test_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Test MSE in the last three epochs doesn't converge."
    optimizer.unregister_window()
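
# A minimal, hedged smoke-test sketch of the same wrapping pattern used above:
# wrap a plain SGD optimizer with bf.DistributedWinPutOptimizer, take one step,
# and release the windows. The tiny nn.Linear model, batch size, and learning
# rate are illustrative assumptions, not part of the test suite's actual setup.
def _win_put_wrapping_sketch():
    import torch
    import torch.nn as nn

    net = nn.Linear(8, 1)
    opt = torch.optim.SGD(net.parameters(), lr=0.01)
    opt = bf.DistributedWinPutOptimizer(opt, model=net)
    loss = net(torch.randn(4, 8)).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()               # step() also pushes parameters into neighbor windows
    opt.unregister_window()  # always free the MPI windows when done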
def test_optimizer_local_aggregation(device, communication_type, kwargs):
    atc_style = kwargs.get("ATC", False)
    error_threshold = kwargs.get("error_threshold", 1.5)
    mini_batch_size = kwargs.get("mini_batch_size", 16)
    window_prefix = kwargs.get("window_prefix", None)
    problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup()
    isCUDA = pin_model_to_device(device, model)

    J = train_dataloader.batch_size // mini_batch_size
    if isinstance(communication_type, bf.CommunicationType):
        base_dist_optimizer = (bf.DistributedAdaptThenCombineOptimizer if atc_style
                               else bf.DistributedAdaptWithCombineOptimizer)
        optimizer = base_dist_optimizer(optimizer, model=model,
                                        communication_type=communication_type,
                                        num_steps_per_communication=J)
    elif communication_type == "win.put":
        optimizer = bf.DistributedWinPutOptimizer(
            optimizer, model=model, window_prefix=window_prefix,
            num_steps_per_communication=J)
    elif communication_type == "gradient.allreduce":
        optimizer = bf.DistributedGradientAllreduceOptimizer(
            optimizer, model=model, num_steps_per_communication=J)
    else:
        raise ValueError("Communication_type under test is not expected.")

    # Train and test
    train_mse = []
    test_mse = []
    for _ in range(num_epochs):
        local_aggregation_train(model, optimizer, train_dataloader, isCUDA,
                                mini_batch_size)
        train_mse.append(evaluation(model, train_dataloader, isCUDA))
        test_mse.append(evaluation(model, test_dataloader, isCUDA))
    train_mse = np.array(train_mse)
    test_mse = np.array(test_mse)

    # Check if the MSEs in the last three epochs are small enough
    assert (
        train_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Train MSE in the last three epochs doesn't converge."
    assert (
        test_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Test MSE in the last three epochs doesn't converge."

    if communication_type == "win.put":
        optimizer.unregister_window()
def test_optimizer_local_aggregation_duplicated(device, communication_type, kwargs):
    # Accuracy doesn't matter here, mainly to test if there is warning thrown
    # for local aggregation.
    atc_style = kwargs.get("ATC", False)
    mini_batch_size = kwargs.get("mini_batch_size", 16)
    window_prefix = kwargs.get("window_prefix", None)
    _, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup(DuplicatedLinearNet)
    isCUDA = pin_model_to_device(device, model)

    mini_batch_size = train_dataloader.batch_size
    J = train_dataloader.batch_size // mini_batch_size
    if isinstance(communication_type, bf.CommunicationType):
        base_dist_optimizer = (bf.DistributedAdaptThenCombineOptimizer if atc_style
                               else bf.DistributedAdaptWithCombineOptimizer)
        optimizer = base_dist_optimizer(optimizer, model=model,
                                        communication_type=communication_type,
                                        num_steps_per_communication=J)
    elif communication_type == "win.put":
        optimizer = bf.DistributedWinPutOptimizer(
            optimizer, model=model, window_prefix=window_prefix,
            num_steps_per_communication=J)
    elif communication_type == "gradient.allreduce":
        optimizer = bf.DistributedGradientAllreduceOptimizer(
            optimizer, model=model, num_steps_per_communication=J)
    else:
        raise ValueError("Communication_type under test is not expected.")

    # Train and test
    for _ in range(num_epochs):
        local_aggregation_train(model, optimizer, train_dataloader, isCUDA,
                                mini_batch_size)
        evaluation(model, train_dataloader, isCUDA)
        evaluation(model, test_dataloader, isCUDA)

    if communication_type == "win.put":
        optimizer.unregister_window()
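
# A hedged sketch of how the local-aggregation tests above might be driven with
# pytest across communication types; the "cpu" device string, the parametrized
# values, and the kwargs below are illustrative assumptions rather than the
# suite's real fixtures or parametrization.
import pytest

@pytest.mark.parametrize("communication_type", ["win.put", "gradient.allreduce"])
def test_local_aggregation_cpu_sketch(communication_type):
    kwargs = {"mini_batch_size": 16, "error_threshold": 1.5,
              "window_prefix": "local_agg."}
    test_optimizer_local_aggregation("cpu", communication_type, kwargs)
    test_optimizer_local_aggregation_duplicated("cpu", communication_type, kwargs)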
print("using cuda.") # Move model to GPU. model.cuda() # Bluefog: scale learning rate by the number of GPUs. optimizer = optim.SGD(model.parameters(), lr=args.lr * bf.size(), momentum=args.momentum) # Bluefog: broadcast parameters & optimizer state. bf.broadcast_parameters(model.state_dict(), root_rank=0) bf.broadcast_optimizer_state(optimizer, root_rank=0) # Bluefog: wrap optimizer with DistributedOptimizer. if args.dist_optimizer == 'win_put': optimizer = bf.DistributedWinPutOptimizer(optimizer, model=model) elif args.dist_optimizer == 'neighbor_allreduce': optimizer = optimizer = bf.DistributedNeighborAllreduceOptimizer( optimizer, model=model) elif args.dist_optimizer == 'allreduce': optimizer = optimizer = bf.DistributedAllreduceOptimizer(optimizer, model=model) elif args.dist_optimizer == 'gradient_allreduce': optimizer = optimizer = bf.DistributedGradientAllreduceOptimizer( optimizer, model=model) elif args.dist_optimizer == 'hierarchical_neighbor_allreduce': optimizer = optimizer = bf.DistributedHierarchicalNeighborAllreduceOptimizer( optimizer, model=model) elif args.dist_optimizer == 'horovod': optimizer = optimizer = bf.DistributedOptimizer( optimizer, named_parameters=model.named_parameters())