Example #1
def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            },
        },
        "scheduler": {
            "type": LR_RANGE_TEST,
            "params": {
                LR_RANGE_TEST_MIN_LR: min_lr,
                LR_RANGE_TEST_STEP_RATE: step_rate,
                LR_RANGE_TEST_STEP_SIZE: step_size,
                LR_RANGE_TEST_STAIRCASE: staircase
            }
        },
        "gradient_clipping": 1.0
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size,
                            staircase):
        model, _, _, lr_scheduler = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=max(50, step_size * 2),
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.float)

        step_lrs = []
        for _, batch in enumerate(data_loader):
            step_lrs.append(lr_scheduler.get_lr())
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

        # Verify starting lr
        assert step_lrs[0] == min_lr

        if staircase:
            # Verify staircase increasing lr
            _verify_staircase_increase(step_lrs, step_size)
        else:
            # Verify continuous increasing lr
            _verify_continuous_increase(step_lrs)

    _test_lr_range_test(args=args,
                        model=model,
                        hidden_dim=hidden_dim,
                        min_lr=[min_lr],
                        step_size=step_size,
                        staircase=staircase)
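
Example #1 calls two helpers, _verify_staircase_increase and _verify_continuous_increase, that are not included in the snippet. Below is a minimal sketch of what such checks could look like, assuming step_lrs is the list of per-step values returned by lr_scheduler.get_lr(); the real helpers live elsewhere in the test suite and may differ.

# Hypothetical verification helpers; not the test suite's actual implementations.
def _verify_continuous_increase(step_lrs):
    # Every step's learning rate should be at least as large as the previous one.
    for prev, curr in zip(step_lrs, step_lrs[1:]):
        assert curr >= prev


def _verify_staircase_increase(step_lrs, step_size):
    # Within each window of step_size steps the learning rate stays constant;
    # increases only happen at window boundaries.
    num_windows = len(step_lrs) // step_size
    for i in range(num_windows):
        window = step_lrs[i * step_size:(i + 1) * step_size]
        assert all(lr == window[0] for lr in window)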
Example #2
def test_lamb_fp16_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[2])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
Example #3
def test_zero_empty_partition(tmpdir):
    config_dict = {
        "train_batch_size": 3,
        "fp16": {
            "enabled": True
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "zero_optimization": True
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[3])
    def _test_zero_empty_partition(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        # Ensure model has 2 parameters, to cause empty partition with DP=3
        assert len(list(model.parameters())) == 2
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        model.step()

    _test_zero_empty_partition(args)
Example #4
def test_dataloader_drop_last(tmpdir, train_batch_size, drop_last):
    config_dict = {
        "train_batch_size": train_batch_size,
        "dataloader_drop_last": drop_last,
        "steps_per_print": 1
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_dataloader_drop_last(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        # TODO: Figure out why this breaks with cuda device
        train_dataset = random_dataset(total_samples=50,
                                       hidden_dim=hidden_dim,
                                       device=torch.device('cpu'),
                                       dtype=torch.float32)
        model, _, training_dataloader, _ = deepspeed.initialize(
            args=args,
            model=model,
            training_data=train_dataset,
            optimizer=optimizer)
        for n, batch in enumerate(training_dataloader):
            x = batch[0].to(torch.cuda.current_device())
            y = batch[1].to(torch.cuda.current_device())
            loss = model(x, y)
            model.backward(loss)
            model.step()

    _test_dataloader_drop_last(args=args, model=model, hidden_dim=hidden_dim)
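
The dataloader_drop_last flag exercised above controls whether the final, partial batch is discarded when the sample count is not divisible by the batch size. A small illustration of the resulting batch counts; the concrete numbers below are assumptions for the sketch, not values taken from the test.

# Illustration of dataloader_drop_last semantics with hypothetical sizes.
total_samples = 50
train_batch_size = 4

batches_with_drop_last = total_samples // train_batch_size          # 12 full batches
batches_without_drop_last = -(-total_samples // train_batch_size)   # 13; the last batch holds 2 samples

assert batches_with_drop_last == 12
assert batches_without_drop_last == 13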
Example #5
def test_non_elastic_batch_params_w_override(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "elasticity": {
            "enabled": True,
            "max_train_batch_size": 4,
            "micro_batch_sizes": [1, 2, 3, 4],
            "min_gpus": 1,
            "max_gpus": 4,
            "min_time": 20,
            "version": 0.1,
            "ignore_non_elastic_batch_info": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1, 2])
    def _test_elastic(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

    _test_elastic(args=args, model=model, hidden_dim=hidden_dim)
Example #6
def test_stage2_ignore_unused_parameters(tmpdir, ignore_unused_parameters):
    use_cpu_offload = True

    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[
            CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 2,
        "steps_per_print": 1,
        "zero_optimization": {
            "stage": 2,
            "cpu_offload": use_cpu_offload,
            "ignore_unused_parameters": ignore_unused_parameters
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 1e-3
            }
        },
        "fp16": {
            "enabled": True,
            "initial_scale_power": 8
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 4

    model = UnusedParametersModel(hidden_dim=hidden_dim)

    @distributed_test(world_size=[1])
    def _test_stage2_ignore_unused_parameters(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        def _loop():
            for n, batch in enumerate(data_loader):
                loss = model(batch[0], batch[1])
                model.backward(loss)
                model.step()

        if ignore_unused_parameters:
            _loop()
        else:
            with pytest.raises(AssertionError) as e:
                _loop()
        assert e.value.args and 'ignore_unused_parameters' in e.value.args[0]

    _test_stage2_ignore_unused_parameters(args=args,
                                          model=model,
                                          hidden_dim=hidden_dim)
Example #7
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage,
                                               use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[
            CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 16000,
                "cycle_first_stair_count": 8000,
                "decay_step_size": 16000,
                "cycle_min_lr": 1e-06,
                "cycle_max_lr": 3e-05,
                "decay_lr_rate": 1e-07,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage,
                                                    hidden_dim):
        model = SimpleModel(hidden_dim)

        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp16_zero_onecycle_compatibility(args=args,
                                                zero_stage=zero_stage,
                                                hidden_dim=hidden_dim)
Example #8
def test_zero_allow_untested_optimizer(tmpdir, zero_stage):
    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": {
            "stage": zero_stage
        },
        "zero_allow_untested_optimizer": False
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args):
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=True)
        optimizer = SimpleOptimizer(model.parameters())
        with pytest.raises(AssertionError):
            model, optim, _, _ = deepspeed.initialize(
                args=args,
                model=model,
                optimizer=optimizer,
                model_parameters=model.parameters())

    _test_zero_allow_untested_optimizer(args)
Example #9
def test_client_optimizer(tmpdir, optimizer_type):
    def _optimizer_callable(params) -> Optimizer:
        return AdamW(params=params)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}
    if optimizer_type is None:
        client_optimizer = None
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_optimizer(args, model, client_optimizer):
        _, ds_optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=list(model.parameters()),
            optimizer=client_optimizer)
        if client_optimizer is None:
            assert isinstance(ds_optimizer, FusedAdam)
        elif isinstance(client_optimizer, Optimizer):
            assert ds_optimizer == client_optimizer
        else:
            assert isinstance(ds_optimizer, AdamW)

    _test_client_optimizer(args=args,
                           model=model,
                           client_optimizer=client_optimizer)
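
test_client_optimizer branches on an optimizer_type argument that is presumably supplied by a pytest parametrization not shown in the snippet. A plausible sketch of such a decorator (an assumption for illustration, not copied from the test suite):

# Hypothetical parametrization for test_client_optimizer; the real decorator may differ.
from typing import Callable

import pytest
from torch.optim import Optimizer


@pytest.mark.parametrize('optimizer_type', [None, Optimizer, Callable])
def test_client_optimizer(tmpdir, optimizer_type):
    ...  # body as shown above: None -> config Adam, Optimizer -> instance, otherwise a callable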
Example #10
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[
            CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True,
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        },
        "zero_allow_untested_optimizer": False
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args, zero_stage):
        hidden_dim = 10
        model = SimpleModel(hidden_dim)
        optimizer = SimpleOptimizer(model.parameters())
        with pytest.raises(AssertionError):
            model, optim, _, _ = deepspeed.initialize(
                args=args,
                model=model,
                optimizer=optimizer,
                model_parameters=model.parameters())

    _test_zero_allow_untested_optimizer(args, zero_stage)
Example #11
def test_zero_supported_client_optimizer(tmpdir, zero_stage,
                                         optimizer_constructor):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _test_zero_supported_client_optimizer(args, zero_stage,
                                              optimizer_constructor):
        model = SimpleModel(hidden_dim)

        client_optimizer = optimizer_constructor(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=client_optimizer)

    _test_zero_supported_client_optimizer(
        args=args,
        zero_stage=zero_stage,
        optimizer_constructor=optimizer_constructor)
Example #12
def test_adam_amp_basic(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "amp": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adam_amp_basic(args, model, hidden_dim):
        optimizer = torch.optim.Adam(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
Example #13
def test_checkpoint_fp32_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "betas": [0.8,
                          0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7
            }
        },
        "fp16": {
            "enabled": False
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_fp32_optimizer(args, model, hidden_dim):
        checkpoint_correctness_verification(args, model, hidden_dim, tmpdir, fp16=False)

    _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim)
Example #14
def test_unfused_some_overflow(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 8,
            "loss_scale_window": 2
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=1)
    def _test_unfused_some_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        expected_loss_scale = 2**8
        expected_scale_window = 2
        expected_iteration = 0
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf'), float('nan')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model scale_window + 1 times to increase scale once
        normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
        expected_iteration += len(normal_gradients)
        run_model_step(model, normal_gradients)
        expected_loss_scale *= 2
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

    _test_unfused_some_overflow(args)
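
run_model_step is referenced above but not defined in the snippet. A minimal stand-in, assuming the helper writes a fixed gradient value into every parameter so that inf/nan entries trigger the loss scaler's overflow handling; the actual helper may differ.

# Hypothetical stand-in for run_model_step; not the test suite's actual helper.
import torch


def run_model_step(model, gradient_list):
    for value in gradient_list:
        for p in model.parameters():
            # inf/nan gradients force an overflow step; small values count as normal steps.
            p.grad = torch.full_like(p, float(value))
        model.step()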
Example #15
def test_lamb_optimizer_gradnorm_for_moe(tmpdir, monkeypatch, fused_lamb_legacy: bool):
    if not required_torch_version():
        pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    def mock_unscale_and_clip_grads(norm_groups, apply_scale=True):
        total_norm = 0.0
        for norm in norm_groups:
            total_norm += norm**2.0
        total_norm = math.sqrt(total_norm)
        torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
        all_gather_results = [
            torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(all_gather_results, torch_norm_tensor)
        assert len(set([x.item() for x in all_gather_results])) == 1
        return 1.0

    @distributed_test(world_size=[2])
    def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy):
        # initialize MoE
        groups.initialize_model_parallel(1)
        groups.initialize_expert_parallel(2)
        model = SimpleMoEModel(hidden_dim)
        engine, optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
            dist_init_required=False)
        monkeypatch.setattr(optimizer,
                            'unscale_and_clip_grads',
                            mock_unscale_and_clip_grads)
        optimizer.fused_lamb_legacy = fused_lamb_legacy
        data_loader = sequence_dataloader(model=engine,
                                          total_samples=50,
                                          hidden_dim=hidden_dim,
                                          device=engine.device)
        for n, batch in enumerate(data_loader):
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_lamb_legacy_optimizer_step(args=args,
                                     hidden_dim=hidden_dim,
                                     fused_lamb_legacy=fused_lamb_legacy)
Example #16
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
    if not bf16_required_version_check():
        pytest.skip(
            "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
        )

    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[
            CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    if zero_stage == 3:
        pytest.skip("skip for now")

    config_dict = {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,
        "fp16": {
            "enabled": False
        },
        "bfloat16": {
            "enabled": True
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload,
            "reduce_bucket_size": 100,
            "allgather_bucket_size": 100
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[3])
    def _test_zero_empty_partition(args, zero_stage):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)

        # Ensure model has 2 parameters, to cause empty partition with DP=3
        assert len(list(model.parameters())) == 2
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=1,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.bfloat16)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_zero_empty_partition(args=args, zero_stage=zero_stage)
Example #17
def test_zero3_repeat_forward_loop(tmpdir, zero_stage):

    # force all params to be partitioned by forcing threshold=0
    config_dict = {
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 2,
        "steps_per_print": 1,
        "zero_optimization": {
            "stage": zero_stage,
            "stage3_param_persistence_threshold": 0
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 1e-3
            }
        },
        "fp16": {
            "enabled": True,
            "initial_scale_power": 8
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 4

    class AlbertLikeModel(torch.nn.Module):
        def __init__(self, hidden_dim):
            super().__init__()
            self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
            self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, y):
            # run the same layer multiple times in a loop - to test a stack of forwards, followed by a stack of backwards
            hidden = x
            for i in range(3):
                hidden = hidden + self.linear(hidden)
            return self.cross_entropy_loss(hidden, y)

    model = AlbertLikeModel(hidden_dim=hidden_dim)

    @distributed_test(world_size=[1])
    def _test_zero3_repeat_forward_loop(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=16,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for i, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_zero3_repeat_forward_loop(args=args,
                                    model=model,
                                    hidden_dim=hidden_dim)
Example #18
def test_checkpoint_unfused_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015,
                "max_grad_norm": 1.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 1000,
                "cycle_first_stair_count": 500,
                "cycle_second_step_size": 1000,
                "cycle_second_stair_count": 500,
                "decay_step_size": 1000,
                "cycle_min_lr": 0.0001,
                "cycle_max_lr": 0.0010,
                "decay_lr_rate": 0.001,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_unfused_optimizer(args,
                                           model,
                                           hidden_dim,
                                           load_optimizer_states):
        checkpoint_correctness_verification(args,
                                            model,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=load_optimizer_states)

    _test_checkpoint_unfused_optimizer(args=args,
                                       model=model,
                                       hidden_dim=hidden_dim,
                                       load_optimizer_states=True)
    _test_checkpoint_unfused_optimizer(args=args,
                                       model=model,
                                       hidden_dim=hidden_dim,
                                       load_optimizer_states=False)
Example #19
def test_curriculum_scheduler_fixed_discrete(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        },
        "curriculum_learning": {
            "enabled": True,
            "curriculum_type": "seqlen",
            "min_difficulty": 1,
            "max_difficulty": 5,
            "schedule_type": "fixed_discrete",
            "schedule_config": {
                "difficulty": [1, 2, 3, 4, 5],
                "max_step": [2, 4, 6, 8]
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4}
    model = Curriculum_SimpleModel(hidden_dim)

    @distributed_test(world_size=[1, 2])
    def _test_curriculum_scheduler_fixed_discrete(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=20,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss, seqlen = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
            true_seqlen = 5
            if n + 1 in ground_truths:
                true_seqlen = ground_truths[n + 1]
            print('at step {} the seqlen is {}'.format(n + 1, seqlen))
            assert seqlen == true_seqlen, f"Incorrect curriculum seqlen {seqlen} at step {n + 1}, expected {true_seqlen}"

    _test_curriculum_scheduler_fixed_discrete(args=args,
                                              model=model,
                                              hidden_dim=hidden_dim)
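
The fixed_discrete schedule above maps training steps to difficulties: with difficulty [1, 2, 3, 4, 5] and max_step [2, 4, 6, 8], steps 1-2 use difficulty 1, steps 3-4 use 2, and so on, with the last difficulty applying beyond the final boundary, which is exactly what ground_truths encodes. An illustrative lookup of the intended semantics (a sketch, not DeepSpeed's scheduler code):

# Illustrative fixed_discrete lookup matching the ground_truths table above.
def fixed_discrete_difficulty(step, difficulty, max_step):
    # difficulty has one more entry than max_step; the final entry covers all
    # steps past the last boundary.
    for boundary, d in zip(max_step, difficulty):
        if step <= boundary:
            return d
    return difficulty[-1]


assert fixed_discrete_difficulty(4, [1, 2, 3, 4, 5], [2, 4, 6, 8]) == 2
assert fixed_discrete_difficulty(9, [1, 2, 3, 4, 5], [2, 4, 6, 8]) == 5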
Example #20
def test_onebitlamb_checkpointing_overflow(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01,
                "max_coeff": 0.3,
                "min_coeff": 0.01,
                "freeze_step": 2,
                "cuda_aware": False,
                "comm_backend_name": "nccl",
                "coeff_beta": 0.9,
                "factor_max": 1.0,
                "factor_min": 0.5,
                "factor_threshold": 0.1
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=100,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        save_folder = os.path.join(tmpdir, 'saved_checkpoint')
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            if dist.get_rank() == 0 and n >= 10:
                loss = loss * 1000000.0
            model.backward(loss)
            dist.barrier()
            model.step()
            dist.barrier()
            model.save_checkpoint(save_folder, tag=None)

    _test_onebitlamb_checkpointing_overflow(args=args,
                                            model=model,
                                            hidden_dim=hidden_dim)
Example #21
def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "betas": [0.8,
                          0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": 0.001,
                "warmup_num_steps": 1000
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_lr_scheduler(args,
                                      model,
                                      hidden_dim,
                                      load_optimizer_states,
                                      load_lr_scheduler_states):
        checkpoint_correctness_verification(
            args,
            model,
            hidden_dim,
            tmpdir,
            load_optimizer_states=load_optimizer_states,
            load_lr_scheduler_states=load_lr_scheduler_states)

    _test_checkpoint_lr_scheduler(args=args,
                                  model=model,
                                  hidden_dim=hidden_dim,
                                  load_optimizer_states=False,
                                  load_lr_scheduler_states=True)
Example #22
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 138.
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=2)
    def _test_zero_static_scale(args, zero_stage, hidden_dim):
        # hidden_dim is parameterized so it may not be divisible by the DP world size (see the two calls below)
        model = SimpleModel(hidden_dim)

        model, optim, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    # test when hidden_dim is not divisible by the world size
    _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9)
    # test when hidden_dim is divisible by the world size
    _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10)
Example #23
def test_zero2_reduce_scatter_off(tmpdir):
    if not bf16_required_version_check():
        pytest.skip(
            "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
        )

    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "allgather_bucket_size": 2000000000,
            "reduce_bucket_size": 200000000,
            "overlap_comm": False,
            "reduce_scatter": False
        },
        "fp16": {
            "enabled": False
        },
        "bfloat16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _helper(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.bfloat16)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _helper(args=args, model=model, hidden_dim=hidden_dim)
Example #24
def test_onebitlamb_fp16_pipeline(topo, tmpdir):
    config_dict = {
        "train_batch_size": 16,
        "train_micro_batch_size_per_gpu": 4,
        "steps_per_print": 20,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00001,
                "betas": [0.9, 0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7,
                "freeze_step": 200,
                "cuda_aware": False,
                "comm_backend_name": "nccl"
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 0
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        },
        "pipeline": {
            "seed_layers": True,
            "activation_checkpoint_interval": 1
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    # Allocate model for consistent initial weights.
    init_net = AlexNetPipe()

    @distributed_test(world_size=4)
    def _helper(topo, tmpdir, steps=500):
        assert steps >= 100

        test_net = copy.deepcopy(init_net)
        test_model = PipelineModule(layers=test_net.to_layers(),
                                    topology=topo,
                                    loss_fn=nn.CrossEntropyLoss())

        test_losses = train_cifar(test_model,
                                  args,
                                  num_steps=steps,
                                  fp16=config_dict['fp16']['enabled'])

    _helper(topo, tmpdir)
Example #25
def test_three_output_model(tmpdir):
    gradient_accumulation_steps = 3
    micro_batch_size = 1
    world_size = 1
    config_dict = create_config_dict(micro_batch_size,
                                     gradient_accumulation_steps, world_size)

    hidden_dim = 10
    weight_value = 0.1
    args = args_from_dict(tmpdir, config_dict)

    model = MultiOutputModel(hidden_dim, weight_value)

    @distributed_test(world_size=[1])
    def _test_three_output_model(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        total_samples = gradient_accumulation_steps * micro_batch_size * 2
        data_loader = multi_output_dataloader(model=model,
                                              total_samples=total_samples,
                                              hidden_dim=hidden_dim,
                                              device=model.device,
                                              inputs=[1.0, 2.0, 3.0],
                                              targets=[1, 2, 3])
        for n, batch in enumerate(data_loader):
            assert len(batch) % 2 == 0, \
                "multi_output_dataloader failed to return an even number of data samples (input+target)"

            midpoint = len(batch) // 2
            inputs, targets = batch[:midpoint], batch[midpoint:]
            loss_tuple = model(inputs, targets)
            assert len(loss_tuple) == 3

            expected_loss = torch.tensor(2.302734375,
                                         dtype=torch.half,
                                         device=model.device)

            for loss in loss_tuple:
                assert loss.shape == torch.Size([])
                assert loss.item() == approx(expected_loss.item())

            summed_loss = sum(loss_tuple)
            scaled_loss = model.backward(summed_loss)
            expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps
            assert scaled_loss.item() == approx(expected_scaled_loss.item())

            model.step()

    _test_three_output_model(args=args, model=model, hidden_dim=hidden_dim)
Example #26
def test_pld_model(tmpdir, theta):
    gamma = 0.001
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": 'Adam',
            "params": {
                "lr": 0.0001
            }
        },
        "fp16": {
            "enabled": True
        },
        "progressive_layer_drop": {
            "enabled": True,
            "theta": theta,
            "gamma": gamma
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = PLD_SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_pld_model(args, model, hidden_dim, theta, gamma):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for i, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

            expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
            actual_theta = model.get_pld_theta()
            assert expected_theta == actual_theta

    _test_pld_model(args=args,
                    model=model,
                    hidden_dim=hidden_dim,
                    theta=theta,
                    gamma=gamma)
Example #27
def test_adam_fp16_onecycle_compatibility(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 16000,
                "cycle_first_stair_count": 8000,
                "decay_step_size": 16000,
                "cycle_min_lr": 1e-06,
                "cycle_max_lr": 3e-05,
                "decay_lr_rate": 1e-07,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": False
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_adam_fp16_onecycle_compatibility(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp16_onecycle_compatibility(args=args, model=model, hidden_dim=hidden_dim)
Example #28
def test_flops_profiler_in_ds_trainning(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.001,
            }
        },
        "zero_optimization": {
            "stage": 0
        },
        "fp16": {
            "enabled": True,
        },
        "flops_profiler": {
            "enabled": True,
            "start_step": 2,
            "end_step": 3,
            "module_depth": -1,
            "top_modules": 3,
        },
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.half)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
            if n == 3: break
        assert model.flops_profiler.flops == 100
        assert model.flops_profiler.params == 110

    _test_flops_profiler_in_ds_trainning(args, model, hidden_dim)
Example #29
def test_moe(tmpdir, ep_size):
    if not required_torch_version():
        pytest.skip(
            "DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    config_dict = {
        "train_batch_size": 8,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 16

    @distributed_test(world_size=[4])
    def _test_moe(args, hidden_dim, ep_size):
        # E+D -- ep_size = 2
        # E only -- ep_size = 4
        #groups.initialize_model_parallel(1)
        #groups.initialize_expert_parallel(2)
        groups.initialize(ep_size=ep_size)
        model = SimpleMoEModel(hidden_dim)
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer,
                                              dist_init_required=False)
        #dist_init_required=False -- parameterize to True/False?

        assert dist.get_world_size() == groups.get_data_parallel_world_size(), \
            "incorrect data parallel world size"
        assert ep_size == groups.get_expert_parallel_world_size(), \
            "incorrect expert parallel world size"

        data_loader = sequence_dataloader(model=model,
                                          total_samples=50,
                                          hidden_dim=hidden_dim,
                                          device=model.device)

        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_moe(args=args, hidden_dim=hidden_dim, ep_size=ep_size)
Example #30
    def get_deepspeed_model(self, model, tmpdir):
        ds_config_dict = {
            "train_micro_batch_size_per_gpu": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
        }

        ds_args = args_from_dict(tmpdir, ds_config_dict)
        dist.barrier()

        model, _, _, _ = deepspeed.initialize(
            args=ds_args, model=model, model_parameters=model.parameters())
        return model.cuda()