Example #1
def test_dynamic_win_put_optimizer(device, kwargs):
    error_threshold = kwargs.get("error_threshold", 1.5)
    window_prefix = kwargs.get("window_prefix", None)

    problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup()

    isCUDA = pin_model_to_device(device, model)

    optimizer = bf.DistributedWinPutOptimizer(optimizer, model=model, window_prefix=window_prefix)
    
    # Train and test
    train_mse = []
    test_mse = []
    for epoch in range(num_epochs):
        dynamic_win_put_train(
            model, optimizer, train_dataloader, isCUDA, epoch)
        train_mse.append(evaluation(model, train_dataloader, isCUDA))
        test_mse.append(evaluation(model, test_dataloader, isCUDA))
    train_mse = np.array(train_mse)
    test_mse = np.array(test_mse)

    # Check if the MSEs in the last three epochs are small enough
    assert (
        train_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Train MSE in the last three epochs doesn't converge."
    assert (
        test_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Test MSE in the last three epochs doesn't converge."
    optimizer.unregister_window()
def test_optimizer_local_aggregation(device, communication_type, kwargs):
    atc_style = kwargs.get("ATC", False)
    error_threshold = kwargs.get("error_threshold", 1.5)
    mini_batch_size = kwargs.get("mini_batch_size", 16)
    window_prefix = kwargs.get("window_prefix", None)

    problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup()

    isCUDA = pin_model_to_device(device, model)

    J = train_dataloader.batch_size // mini_batch_size

    if isinstance(communication_type, bf.CommunicationType):
        base_dist_optimizer = (bf.DistributedAdaptThenCombineOptimizer
                               if atc_style else
                               bf.DistributedAdaptWithCombineOptimizer)
        optimizer = base_dist_optimizer(optimizer,
                                        model=model,
                                        communication_type=communication_type,
                                        num_steps_per_communication=J)
    elif communication_type == "win.put":
        optimizer = bf.DistributedWinPutOptimizer(
            optimizer,
            model=model,
            window_prefix=window_prefix,
            num_steps_per_communication=J)
    elif communication_type == "gradient.allreduce":
        optimizer = bf.DistributedGradientAllreduceOptimizer(
            optimizer, model=model, num_steps_per_communication=J)
    else:
        raise ValueError("Unexpected communication_type under test.")

    # Train and test
    train_mse = []
    test_mse = []
    for _ in range(num_epochs):
        local_aggregation_train(model, optimizer, train_dataloader, isCUDA,
                                mini_batch_size)
        train_mse.append(evaluation(model, train_dataloader, isCUDA))
        test_mse.append(evaluation(model, test_dataloader, isCUDA))
    train_mse = np.array(train_mse)
    test_mse = np.array(test_mse)

    # Check if the MSEs in the last three epochs are small enough
    assert (
        train_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Train MSE in the last three epochs doesn't converge."
    assert (
        test_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Test MSE in the last three epochs doesn't converge."

    if communication_type == "win.put":
        optimizer.unregister_window()
def test_optimizer_local_aggregation_duplicated(device, communication_type,
                                                kwargs):
    # Accuracy doesn't matter here; this mainly tests whether a warning is
    # thrown for local aggregation.
    atc_style = kwargs.get("ATC", False)
    mini_batch_size = kwargs.get("mini_batch_size", 16)
    window_prefix = kwargs.get("window_prefix", None)

    _, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup(DuplicatedLinearNet)

    isCUDA = pin_model_to_device(device, model)

    mini_batch_size = train_dataloader.batch_size
    J = train_dataloader.batch_size // mini_batch_size

    if isinstance(communication_type, bf.CommunicationType):
        base_dist_optimizer = (bf.DistributedAdaptThenCombineOptimizer
                               if atc_style else
                               bf.DistributedAdaptWithCombineOptimizer)
        optimizer = base_dist_optimizer(optimizer,
                                        model=model,
                                        communication_type=communication_type,
                                        num_steps_per_communication=J)
    elif communication_type == "win.put":
        optimizer = bf.DistributedWinPutOptimizer(
            optimizer,
            model=model,
            window_prefix=window_prefix,
            num_steps_per_communication=J)
    elif communication_type == "gradient.allreduce":
        optimizer = bf.DistributedGradientAllreduceOptimizer(
            optimizer, model=model, num_steps_per_communication=J)
    else:
        raise ValueError("Unexpected communication_type under test.")

    # Train and test
    for _ in range(num_epochs):
        local_aggregation_train(model, optimizer, train_dataloader, isCUDA,
                                mini_batch_size)
        evaluation(model, train_dataloader, isCUDA)
        evaluation(model, test_dataloader, isCUDA)

    if communication_type == "win.put":
        optimizer.unregister_window()
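
The tests above drive the optimizer-wrapping pattern through shared helpers (problem_setup, local_aggregation_train, evaluation). As a rough standalone illustration only, the sketch below isolates the win.put wrapping and teardown used in the first test; the toy linear model, the synthetic data, and the "demo" window prefix are assumptions, not part of the original test file.

# Minimal sketch of the win.put pattern above: wrap the local optimizer,
# train as usual, then release the one-sided communication windows.
import torch
import torch.nn.functional as F
import bluefog.torch as bf

bf.init()

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = bf.DistributedWinPutOptimizer(
    optimizer, model=model, window_prefix="demo")

x, y = torch.randn(32, 8), torch.randn(32, 1)
for _ in range(5):
    optimizer.zero_grad()
    F.mse_loss(model(x), y).backward()
    optimizer.step()

optimizer.unregister_window()  # free the windows registered by the wrapper
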
Example #4
    print("using cuda.")
    # Move model to GPU.
    model.cuda()

# Bluefog: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(),
                      lr=args.lr * bf.size(),
                      momentum=args.momentum)

# Bluefog: broadcast parameters & optimizer state.
bf.broadcast_parameters(model.state_dict(), root_rank=0)
bf.broadcast_optimizer_state(optimizer, root_rank=0)

# Bluefog: wrap optimizer with DistributedOptimizer.
if args.dist_optimizer == 'win_put':
    optimizer = bf.DistributedWinPutOptimizer(optimizer, model=model)
elif args.dist_optimizer == 'neighbor_allreduce':
    optimizer = bf.DistributedNeighborAllreduceOptimizer(
        optimizer, model=model)
elif args.dist_optimizer == 'allreduce':
    optimizer = bf.DistributedAllreduceOptimizer(
        optimizer, model=model)
elif args.dist_optimizer == 'gradient_allreduce':
    optimizer = bf.DistributedGradientAllreduceOptimizer(
        optimizer, model=model)
elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
    optimizer = bf.DistributedHierarchicalNeighborAllreduceOptimizer(
        optimizer, model=model)
elif args.dist_optimizer == 'horovod':
    optimizer = bf.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
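
The snippet above only scales the learning rate, broadcasts the initial state, and wraps the optimizer. As a usage note, the sketch below shows how the wrapped optimizer is typically driven afterwards; the train_loader, args.epochs, and the use of F.nll_loss for a classification model are assumptions in the style of an MNIST script, not part of the original example.

import torch.nn.functional as F

def train(epoch):
    # The distributed wrapper replaces the local optimizer in place, so the
    # training loop itself is unchanged.
    model.train()
    for data, target in train_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()

for epoch in range(1, args.epochs + 1):
    train(epoch)

if args.dist_optimizer == 'win_put':
    optimizer.unregister_window()  # win.put registers windows that must be freed
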