Example 1
def setup_model_and_optimizer(neox_args, inference=False, get_key_value=True):
    """Setup model and optimizer."""
    model = get_model(neox_args=neox_args,
                      inference=inference,
                      get_key_value=get_key_value)
    optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args)
    lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer,
                                               neox_args=neox_args)

    if neox_args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        if neox_args.no_load_optim:
            assert optimizer is None
            _model_params = None
            _lr_scheduler = None
        else:
            _model_params = param_groups if optimizer is None else None
            _lr_scheduler = lr_scheduler

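        # Hand the model, optimizer, and LR scheduler to DeepSpeed. When the
        # optimizer is left for DeepSpeed to build from its config, the raw
        # parameter groups are passed instead so it knows what to optimize.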
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=neox_args,
            lr_scheduler=_lr_scheduler,
            dist_init_required=False,
            model_parameters=_model_params,
            config_params=neox_args.deepspeed_config,
            mpu=mpu if not neox_args.is_pipe_parallel else None)
        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {"{:,}".format(model.total_params)}')

        if neox_args.is_pipe_parallel:
            model.set_has_attention_mask(True)
            model.set_batch_fn(model.module._megatron_batch_fn)
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if neox_args.load is not None:
        neox_args.iteration = load_checkpoint(neox_args=neox_args,
                                              model=model,
                                              optimizer=optimizer,
                                              lr_scheduler=lr_scheduler,
                                              inference=inference)
        print_rank_0(
            f'Loading checkpoint and starting from iteration {neox_args.iteration}'
        )
    else:
        neox_args.iteration = 0

    return model, optimizer, lr_scheduler
Example 2
 def _test(model, optimizer):
     engine, _, _, _ = deepspeed.initialize(model=model,
                                            optimizer=optimizer,
                                            config=config_dict)
     loss = torch.nn.BCEWithLogitsLoss()
     x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9],
                      dtype=torch.long,
                      device=engine.device)
     offsets = torch.tensor([0, 4], dtype=torch.long, device=engine.device)
     y = torch.tensor([[1.0], [0.0]], device=engine.device)
     res = engine(x, offsets)
     with pytest.raises(AssertionError):
         engine.backward(loss(res, y))
     engine.step()
Example 3
    def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage,
                                                    hidden_dim):
        model = SimpleModel(hidden_dim)

        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
Example 4
 def _go(model, hidden_dim):
     model, _, _, _ = deepspeed.initialize(
         model=model,
         model_parameters=model.parameters(),
         config=config_dict)
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device)
     dist.barrier()
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
Example 5
 def _test_dist_init_true(args, model, hidden_dim):
     model, _, _, _ = deepspeed.initialize(
         args=args,
         model=model,
         model_parameters=model.parameters(),
         dist_init_required=True)
     data_loader = random_dataloader(model=model,
                                     total_samples=5,
                                     hidden_dim=hidden_dim,
                                     device=model.device)
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
Example 6
def test_ext_param_return():
    setup_serial_env()

    net = DanglingExt()

    args = SimpleNamespace(local_rank=0)
    engine, optim, _, _ = deepspeed.initialize(
        args=args, model=net, model_parameters=net.parameters(), config=config)

    for _ in range(5):
        input = torch.rand(net.dim).to(engine.device).half()
        loss = engine(input)
        engine.backward(loss)
        engine.step()
Example 7
    def _test_partition_nccl_alignment(model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(config=config_dict,
                                              model=model,
                                              model_parameters=model.parameters())

        # get nccl all-gather send buffers alignment factor
        nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor

        parallel_partitioned_bit16_groups = (
            model.optimizer.parallel_partitioned_bit16_groups
            if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups)
        for data_parallel_partitions in parallel_partitioned_bit16_groups:
            for partition_id, partitioned_data in enumerate(data_parallel_partitions):
                # verify that data partition start locations are 4-byte aligned
                assert (partitioned_data.data_ptr() %
                        (2 * nccl_start_alignment_factor) == 0)
Example 8
 def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
     optimizer = torch.optim.AdamW(params=model.parameters())
     model, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           optimizer=optimizer,
                                           config_params=config_dict)
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device)
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
Example 9
    def _test_fp16_adam_types(args, model, hidden_dim):

        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for _, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
Example 10
 def _test_get_lr_before_train(args, model, hidden_dim):
     model, _, _, lr_scheduler = deepspeed.initialize(
         args=args, model=model, model_parameters=model.parameters())
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device,
                                     dtype=torch.float)
     for n, batch in enumerate(data_loader):
         # get lr before training starts
         lr_scheduler.get_lr()
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
Example 11
 def _test_scheduler_optimizer_parity(args, model, hidden_dim):
     model, _, _, lr_scheduler = deepspeed.initialize(args=args,
                                                      model=model,
                                                      model_parameters=model.parameters())
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device,
                                     dtype=torch.float)
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
         assert lr_scheduler.get_lr() == model.get_lr()
Example 12
    def _test_zero3_repeat_forward_loop(model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            config=config_dict,
            model=model,
            model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=16,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for i, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
Example 13
 def _go(args, model, hidden_dim):
     optimizer = torch.optim.Adam(model.parameters())
     model, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           optimizer=optimizer)
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device,
                                     dtype=torch.bfloat16)
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
Example 14
def checkpoint_correctness_verification(args,
                                        model,
                                        hidden_dim,
                                        load_optimizer_states=True):

    ds_model, _, _, _ = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())
    data_loader = random_dataloader(model=ds_model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=ds_model.device)
    for n, batch in enumerate(data_loader):
        loss = ds_model(batch[0], batch[1])
        ds_model.backward(loss)
        ds_model.step()

    trained_model = ds_model

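    # Save a checkpoint from the trained engine, then reload it into a fresh
    # engine and compare the resulting states.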
    save_folder = 'saved_checkpoint'
    save_tag = '1'

    trained_model.save_checkpoint(save_folder, save_tag)

    loaded_model, _, _, _ = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())

    loaded_model.load_checkpoint(save_folder,
                                 save_tag,
                                 load_optimizer_states=load_optimizer_states)

    if load_optimizer_states:
        compare_optimizer_states(trained_model, loaded_model, hidden_dim)
    else:
        compare_model_states(trained_model, loaded_model)
Example 15
 def _test_curriculum_scheduler_fixed_linear(args, model, hidden_dim):
     model, _, _, _ = deepspeed.initialize(
         args=args, model=model, model_parameters=model.parameters())
     data_loader = random_dataloader(model=model,
                                     total_samples=20,
                                     hidden_dim=hidden_dim,
                                     device=model.device)
     for n, batch in enumerate(data_loader):
         loss, seqlen = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
         if n + 1 in ground_truths:
             true_seqlen = ground_truths[n + 1]
             print('at step {} the seqlen is {}'.format(n + 1, seqlen))
             assert seqlen == true_seqlen, f"Incorrect curriculum schedule"
Example 16
 def test_already_init(self, dist_init_required):
     torch.distributed.init_process_group('nccl')
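     # The default process group already exists, so deepspeed.initialize() is
     # expected to cope with either value of dist_init_required.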
     model = SimpleModel(4)
     config_dict = {
         "train_micro_batch_size_per_gpu": 1,
         "optimizer": {
             "type": "Adam",
             "params": {}
         }
     }
     engine, *_ = deepspeed.initialize(
         model=model,
         config=config_dict,
         model_parameters=model.parameters(),
         dist_init_required=dist_init_required)
Example 17
    def test_overflow(self, tmpdir):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "OneBitLamb",
                "params": {
                    "lr": 0.00015,
                    "weight_decay": 0.01,
                    "max_coeff": 0.3,
                    "min_coeff": 0.01,
                    "freeze_step": 2,
                    "cuda_aware": False,
                    "comm_backend_name": "nccl",
                    "coeff_beta": 0.9,
                    "factor_max": 1.0,
                    "factor_min": 0.5,
                    "factor_threshold": 0.1,
                },
            },
            "gradient_clipping": 1.0,
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 16
            },
        }
        hidden_dim = 10

        model = SimpleModel(hidden_dim)
        model, _, _, _ = deepspeed.initialize(
            config=config_dict,
            model=model,
            model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=100,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        save_folder = os.path.join(tmpdir, "saved_checkpoint")
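        # From step 10 onward, rank 0 inflates its loss to force an fp16
        # gradient overflow; a checkpoint is still written every step.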
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            if dist.get_rank() == 0 and n >= 10:
                loss = loss * 1000000.0
            model.backward(loss)
            dist.barrier()
            model.step()
            dist.barrier()
            model.save_checkpoint(save_folder, tag=None)
Example 18
    def _test_flops_profiler_in_ds_training(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.half)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
            if n == 3: break
        assert model.flops_profiler.flops == 100
        assert model.flops_profiler.params == 110
Example 19
 def _test_onebitlamb_exp_avg_mask(args, model, hidden_dim):
     model, optimizer, _, _ = deepspeed.initialize(args=args,
                                                   model=model,
                                                   model_parameters=optimizer_grouped_parameters)
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device)
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
     # Test whether the momentum mask works
     for v in optimizer.state.values():
         if v['exp_avg'].size() == mask1.size():
             assert torch.allclose(
                 v['exp_avg'],
                 v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)),
                 atol=1e-07), "Momentum mask is not working properly"
Example 20
    def launch(self, model: torch.nn.Module,
               optimizer: Union[str, torch.optim.Optimizer], local_rank: int,
               serialization_dir: str, batch_size: int,
               gradient_accumulation_steps: int):
        path = self._to_temp_file(
            serialization_dir,
            train_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps)
        ds = deepspeed.initialize(
            args=self.build_deepspeed_args(path, local_rank),
            model=model,
            model_parameters=model.parameters(),
            dist_init_required=False)

        os.remove(path)
        return ds
Example 21
    def test(self):
        if not bf16_required_version_check():
            pytest.skip(
                " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
            )

        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 0.00015
                }
            },
            "gradient_clipping": 1.0,
            "zero_optimization": {
                "stage": 2,
                "contiguous_gradients": True,
                "allgather_bucket_size": 2000000000,
                "reduce_bucket_size": 200000000,
                "overlap_comm": False,
                "reduce_scatter": False
            },
            "fp16": {
                "enabled": False
            },
            "bf16": {
                "enabled": True
            }
        }
        hidden_dim = 10

        model = SimpleModel(hidden_dim)
        model, _, _, _ = deepspeed.initialize(
            config=config_dict,
            model=model,
            model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.bfloat16)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
Example 22
    def _test_pld_model(args, model, hidden_dim, theta, gamma):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)

        for i, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

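            # Progressive layer drop keep-probability: starts at 1.0 and
            # decays exponentially toward theta at rate gamma.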
            expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
            actual_theta = model.get_pld_theta()
            assert expected_theta == actual_theta
Example 23
def test_ext_param_returnobj():
    setup_serial_env()
    print()

    net = ModelContainer(return_obj=True)

    args = SimpleNamespace(local_rank=0)
    engine, optim, _, _ = deepspeed.initialize(
        args=args, model=net, model_parameters=net.parameters(), config=config)

    for _ in range(5):
        input = torch.rand(net.dim).to(engine.device).half()
        loss = engine(input)
        assert len(net._external_params) == 1
        assert len(net.dangler._external_params) == 0
        engine.backward(loss)
        engine.step()
Example 24
    def get_deepspeed_model(self, model, tmpdir):
        ds_config_dict = {
            "train_micro_batch_size_per_gpu": 1,
            "optimizer": {
                "type": "Lamb",
                "params": {
                    "lr": 0.00015
                }
            },
        }
        dist.barrier()

        model, _, _, _ = deepspeed.initialize(
            model=model,
            model_parameters=model.parameters(),
            config=ds_config_dict)
        return model.cuda()
Example 25
 def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim):
     model, _, _, _ = deepspeed.initialize(
         args=args, model=model, model_parameters=model.parameters())
     data_loader = random_dataloader(model=model,
                                     total_samples=100,
                                     hidden_dim=hidden_dim,
                                     device=model.device)
     save_folder = os.path.join(tmpdir, 'saved_checkpoint')
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
         if dist.get_rank() == 0 and n >= 10:
             loss = loss * 1000000.0
         model.backward(loss)
         dist.barrier()
         model.step()
         dist.barrier()
         model.save_checkpoint(save_folder, tag=None)
Example 26
    def _test_zero_empty_partition(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        # Ensure model has 2 parameters, to cause empty partition with DP=3
        assert len(list(model.parameters())) == 2
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=1,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
Example 27
    def __init__(self, config_filepath) -> None:
        super().__init__()
        self.config = Config(config_filepath)
        set_seed(self.config.seed)

        self.loader = SemEvalDataLoader(self.config)
        model = BertCls(self.config)
        optimiser = optim.Adam(model.parameters(), lr=self.config.lr)
        model_engine, optimizer, _, _ = deepspeed.initialize(
            config=self.config.__dict__,
            model=model,
            optimizer=optimiser,
            model_parameters=model.parameters())
        self.model_engine = model_engine
        self.model_engine = self.model_engine.to(self.config.device)
        self.optimiser = optimizer
        self.criterion = nn.BCELoss()
Example 28
    def _test_fused_all_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)
        model, optim, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

        expected_loss_scale = 2**4
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale

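        # Every overflowing step should halve the dynamic loss scale,
        # bottoming out at a scale of 1.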
        overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
        for i, value in enumerate(overflow_gradients):
            run_model_step(model, [value])
            expected_loss_scale = max(expected_loss_scale / 2, 1)
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
Example 29
    def test(self, dtype):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": "OneBitLamb",
                "params": {
                    "lr": 0.00015,
                    "weight_decay": 0.01,
                    "max_coeff": 0.3,
                    "min_coeff": 0.01,
                    "freeze_step": 2,
                    "cuda_aware": False,
                    "comm_backend_name": "nccl",
                    "coeff_beta": 0.9,
                    "factor_max": 1.0,
                    "factor_min": 0.5,
                    "factor_threshold": 0.1,
                },
            },
            "gradient_clipping": 1.0,
            "fp16": {
                "enabled": (dtype == torch.float16),
                "loss_scale": 0,
                "initial_scale_power": 16,
            },
        }
        hidden_dim = 10

        model = SimpleModel(hidden_dim)
        model, _, _, _ = deepspeed.initialize(
            config=config_dict,
            model=model,
            model_parameters=model.parameters())
        data_loader = random_dataloader(
            model=model,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=model.device,
            dtype=dtype,
        )
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
Example 30
    def test(self):
        config_dict = {
            "train_batch_size": 1,
            "steps_per_print": 1,
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 0.001,
                }
            },
            "zero_optimization": {
                "stage": 0
            },
            "fp16": {
                "enabled": True,
            },
            "flops_profiler": {
                "enabled": True,
                "step": 1,
                "module_depth": -1,
                "top_modules": 3,
            },
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim, empty_grad=False)

        model, _, _, _ = deepspeed.initialize(
            config=config_dict,
            model=model,
            model_parameters=model.parameters())

        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.half)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
            if n == 3: break
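        # With hidden_dim=10 the profiler should report roughly 200 FLOPs and
        # exactly 110 parameters (consistent with a single 10x10 linear layer
        # plus bias).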
        assert within_range(model.flops_profiler.flops,
                            200,
                            tolerance=TOLERANCE)
        assert model.flops_profiler.params == 110