Example #1
 def _test_zero_empty_partition(args):
     hidden_dim = 1
     model = SimpleModel(hidden_dim)
     # Ensure model has 2 parameters, to cause empty partition with DP=3
     assert len(list(model.parameters())) == 2
     model, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           model_parameters=model.parameters())
     model.step()
Example #2
 def _test_zero_allow_untested_optimizer(args):
     hidden_dim = 10
     model = SimpleModel(hidden_dim, empty_grad=True)
     optimizer = SimpleOptimizer(model.parameters())
     with pytest.raises(AssertionError):
         model, optim, _, _ = deepspeed.initialize(args=args,
                                                 model=model,
                                                 optimizer=optimizer,
                                                 model_parameters=model.parameters())
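The expected AssertionError comes from pairing ZeRO with a client optimizer DeepSpeed has not vetted while the escape hatch is left off. A minimal sketch of the kind of config args_from_dict would wrap for this case (batch size and stage are illustrative assumptions):

config_dict = {
    'train_batch_size': 4,
    'fp16': {'enabled': True},
    'zero_optimization': {'stage': 2},
    # Leaving this False (the default) makes deepspeed.initialize assert on
    # unsupported client optimizers such as SimpleOptimizer above.
    'zero_allow_untested_optimizer': False,
}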
Example #3
    def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
        model = SimpleModel(hidden_dim)

        client_optimizer = optimizer_constructor(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=client_optimizer)
Example #4
def test_client_optimizer(tmpdir, optimizer_type):
    def _optimizer_callable(params) -> Optimizer:
        return AdamW(params=params)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}
    if optimizer_type is None:
        client_optimizer = None
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_optimizer(args, model, client_optimizer):
        _, ds_optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=list(model.parameters()),
            optimizer=client_optimizer)
        if client_optimizer is None:
            assert isinstance(ds_optimizer, FusedAdam)
        elif isinstance(client_optimizer, Optimizer):
            assert ds_optimizer == client_optimizer
        else:
            assert isinstance(ds_optimizer, AdamW)

    _test_client_optimizer(args=args,
                           model=model,
                           client_optimizer=client_optimizer)
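For the optimizer_type is None branch, the engine builds its own FusedAdam from the optimizer block in the config. A minimal sketch of what that block could look like (the learning rate is an illustrative assumption; 'Adam' is the optimizer name the ADAM_OPTIMIZER constant stands for):

config_dict = {
    'train_batch_size': 1,
    'optimizer': {
        'type': 'Adam',           # matched against ADAM_OPTIMIZER
        'params': {'lr': 1e-3},   # illustrative value
    },
}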
Example #5
    def _test_zero_empty_partition(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        # Ensure model has 2 parameters, to cause empty partition with DP=3
        assert len(list(model.parameters())) == 2
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=1,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
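The empty-partition case only arises when ZeRO splits the model's two parameters across three data-parallel ranks, so one rank receives nothing. A plausible config for such a run (all values are illustrative assumptions):

config_dict = {
    'train_batch_size': 3,            # one sample per rank with DP=3
    'fp16': {'enabled': True},
    'zero_optimization': {'stage': 2},
}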
Example #6
 def helper(args):
     model = SimpleModel(10)
     model, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           model_parameters=model.parameters())
     # get base optimizer under zero
     ds_optimizer = model.optimizer.optimizer
     opt_class, adam_w_mode = resulting_optimizer
     assert isinstance(ds_optimizer, opt_class)
     if adam_w_mode in [True, False]:
         assert ds_optimizer.adam_w_mode == adam_w_mode
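The helper reaches through the ZeRO wrapper (model.optimizer.optimizer) to check which base optimizer was built and whether it runs in AdamW mode. A sketch of the kind of config the enclosing test would parametrize over (optimizer type, lr, and stage are illustrative assumptions):

ds_config = {
    'train_batch_size': 1,
    'zero_optimization': {'stage': 1},
    'optimizer': {
        'type': 'Adam',
        'params': {'lr': 1e-3, 'adam_w_mode': True},  # drives the adam_w_mode assertion
    },
}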
Example #7
    def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage,
                                                    hidden_dim):
        model = SimpleModel(hidden_dim)

        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
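As the test name says, this combines Adam, fp16, ZeRO, and a OneCycle schedule. A plausible config feeding it (the scheduler parameters are illustrative assumptions):

config_dict = {
    'train_batch_size': 1,
    'fp16': {'enabled': True},
    'zero_optimization': {'stage': zero_stage},
    'optimizer': {'type': 'Adam', 'params': {'lr': 1e-3}},
    'scheduler': {
        'type': 'OneCycle',
        'params': {
            'cycle_min_lr': 1e-4,
            'cycle_max_lr': 1e-3,
            'cycle_first_step_size': 10,
        },
    },
}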
Example #8
    def _go(args):
        model = SimpleModel(hidden_dim)

        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for _, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
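Several examples here lean on the test suite's random_dataloader helper, which this page does not show. A minimal sketch of what it plausibly does, assuming SimpleModel takes (inputs, labels) and the engine exposes train_micro_batch_size_per_gpu():

import torch
from torch.utils.data import DataLoader, TensorDataset

def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half):
    # Random features and integer labels shaped for SimpleModel's forward pass.
    batch_size = model.train_micro_batch_size_per_gpu()
    data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype)
    labels = torch.randint(0, hidden_dim, (total_samples,), device=device)
    return DataLoader(TensorDataset(data, labels), batch_size=batch_size)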
Example #9
    def _test_fused_all_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        expected_loss_scale = 2**4
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale

        overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
        for i, value in enumerate(overflow_gradients):
            run_model_step(model, [value])
            expected_loss_scale = max(expected_loss_scale / 2, 1)
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
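run_model_step is another helper from the surrounding test module. Judging from the assertions, it injects the given gradient values directly and steps the engine so the dynamic loss scaler reacts; a sketch under that assumption:

import torch

def run_model_step(model, gradient_list):
    # Fill every parameter's gradient with the given value, then step, so the
    # loss scaler sees either finite gradients or an overflow (inf/nan).
    for value in gradient_list:
        for p in model.parameters():
            p.grad = torch.full_like(p, value)
        model.step()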
Example #10
 def _go(hidden_dim):
     with deepspeed.zero.Init(enabled=zero_stage == 3,
                              config_dict_or_path=ds_config):
         model = SimpleModel(hidden_dim, nlayers=78)
     print('total number of parameters:',
           sum([p.numel() for p in model.parameters()]))
     see_memory_usage('pre-init', force=True)
     model, _, _, _ = deepspeed.initialize(model=model, config=ds_config)
     see_memory_usage('post-init', force=True)
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device,
                                     dtype=torch.half)
     print(f"optimizer={model.optimizer}")
     for batch in data_loader:
         model(batch[0], batch[1])
     see_memory_usage('post-fwds', force=True)
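deepspeed.zero.Init only partitions parameters at construction time when ZeRO stage 3 is configured, which is what the enabled=zero_stage == 3 guard reflects. A plausible shape for the ds_config it reads (values are illustrative assumptions):

ds_config = {
    'train_micro_batch_size_per_gpu': 1,
    'fp16': {'enabled': True},
    'zero_optimization': {'stage': 3},
    'optimizer': {'type': 'Adam', 'params': {'lr': 1e-3}},
}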
Example #11
    def _test_zero_static_scale(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
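The two assertions pin down the fp16 section of the config: a fixed, non-zero loss_scale turns dynamic scaling off. A minimal sketch consistent with them (the batch size is an illustrative assumption):

config_dict = {
    'train_batch_size': 1,
    'fp16': {
        'enabled': True,
        'loss_scale': 138.0,   # non-zero value selects static scaling
    },
}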
Example #12
    def _test_unfused_no_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())
        expected_loss_scale = 2**8
        expected_scale_window = 2
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)):
            run_model_step(model, [value])
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
            if optim.cur_iter % expected_scale_window == 0:
                expected_loss_scale *= 2
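Here the assertions imply dynamic scaling that starts at 2**8 and doubles after every 2 overflow-free steps; in config terms, a minimal sketch (the batch size is an illustrative assumption):

config_dict = {
    'train_batch_size': 1,
    'fp16': {
        'enabled': True,
        'loss_scale': 0,            # 0 selects dynamic scaling
        'initial_scale_power': 8,   # initial scale of 2**8
        'loss_scale_window': 2,
    },
}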
Example #13
    def _test_unfused_some_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        expected_loss_scale = 2**8
        expected_scale_window = 2
        expected_iteration = 0
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf'), float('nan')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model scale_window + 1 times to increase scale once
        normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
        expected_iteration += len(normal_gradients)
        run_model_step(model, normal_gradients)
        expected_loss_scale *= 2
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration
Example #14
#             x = nn.ReLU()(x)
#         out = self.last_fc(x)
#         return nn.Sigmoid()(out)

# Set device to be used
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device used:', device)
model_pytorch = SimpleModel(input_size=input_size,
                            hidden_sizes=hidden_sizes,
                            output_size=output_size)
model_pytorch = model_pytorch.to(device)

# Set loss and optimizer
# Set binary cross entropy loss since 2 classes only
criterion = nn.BCELoss()
optimizer = optim.Adam(model_pytorch.parameters(), lr=1e-3)

num_epochs = 20

# Train model
time_start = time.time()

for epoch in range(num_epochs):
    model_pytorch.train()

    train_loss_total = 0

    for data, target in train_loader:
        data, target = data.to(device), target.float().to(device)
        optimizer.zero_grad()
        output = model_pytorch(data)
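        # The excerpt breaks off mid-step; what follows is only a plausible
        # continuation of a standard BCELoss training step (an assumption,
        # not part of the original source):
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss_total += loss.item() * data.size(0)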
Example #15
def test_client_lr_scheduler(tmpdir, scheduler_type, optimizer_type):
    def _my_lambda(epoch):
        return epoch // 10

    def _optimizer_callable(params) -> Optimizer:
        return torch.optim.AdamW(params=params)

    def _lr_scheduler_callable(optimizer) -> _LRScheduler:
        return LambdaLR(optimizer, _my_lambda)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}

    client_optimizer = None
    client_scheduler = None

    if optimizer_type is None:
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = torch.optim.Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    if scheduler_type is None:
        config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}}
    elif scheduler_type == _LRScheduler:
        if isinstance(client_optimizer, Optimizer):
            client_scheduler = LambdaLR(client_optimizer, _my_lambda)
        else:
            # Verify invalid combination is correctly handled
            client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()),
                                        _my_lambda)
    else:
        client_scheduler = _lr_scheduler_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_lr_scheduler(args, model, optimizer, lr_scheduler):
        if isinstance(lr_scheduler,
                      _LRScheduler) and not isinstance(optimizer, Optimizer):
            with pytest.raises(AssertionError):
                _, _, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=list(
                                                      model.parameters()),
                                                  optimizer=optimizer,
                                                  lr_scheduler=lr_scheduler)
        else:
            _, _, _, ds_lr_scheduler = deepspeed.initialize(
                args=args,
                model=model,
                model_parameters=list(model.parameters()),
                optimizer=optimizer,
                lr_scheduler=lr_scheduler)
            if lr_scheduler is None:
                assert isinstance(ds_lr_scheduler, WarmupLR)
            elif isinstance(lr_scheduler, _LRScheduler):
                assert ds_lr_scheduler == lr_scheduler
            else:
                assert isinstance(ds_lr_scheduler, LambdaLR)

    _test_client_lr_scheduler(args=args,
                              model=model,
                              optimizer=client_optimizer,
                              lr_scheduler=client_scheduler)
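When scheduler_type is None, the engine builds a WarmupLR from the config; the test gets away with empty params, but a fuller block would look roughly like this (the values are illustrative assumptions):

config_dict['scheduler'] = {
    'type': 'WarmupLR',
    'params': {
        'warmup_min_lr': 0,
        'warmup_max_lr': 1e-3,
        'warmup_num_steps': 100,
    },
}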