Example #1
    def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
        model = SimpleModel(hidden_dim)

        client_optimizer = optimizer_constructor(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=client_optimizer)
Example #2
def test_client_optimizer(tmpdir, optimizer_type):
    def _optimizer_callable(params) -> Optimizer:
        return AdamW(params=params)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}
    if optimizer_type is None:
        client_optimizer = None
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_optimizer(args, model, client_optimizer):
        _, ds_optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=list(model.parameters()),
            optimizer=client_optimizer)
        if client_optimizer is None:
            assert isinstance(ds_optimizer, FusedAdam)
        elif isinstance(client_optimizer, Optimizer):
            assert ds_optimizer == client_optimizer
        else:
            assert isinstance(ds_optimizer, AdamW)

    _test_client_optimizer(args=args,
                           model=model,
                           client_optimizer=client_optimizer)
Example #3
def simple(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=123)
    proxy = SimpleModel()
    x_train, y_train_array, scalar_x, scalar_y = proxy.get_data(
        x_train, y_train)
    proxy.evaluate_simple(x_train, y_train)
Example #4
 def _test_zero_allow_untested_optimizer(args):
     hidden_dim = 10
     model = SimpleModel(hidden_dim, empty_grad=True)
     optimizer = SimpleOptimizer(model.parameters())
     with pytest.raises(AssertionError):
         model, optim, _,_ = deepspeed.initialize(args=args,
                                                 model=model,
                                                 optimizer=optimizer,
                                                 model_parameters=model.parameters())
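
For contrast, a minimal sketch (not part of the scraped example) of how the same untested optimizer could be accepted: DeepSpeed's zero_allow_untested_optimizer config flag relaxes the assertion exercised above. SimpleModel and SimpleOptimizer are assumed to be the same test helpers as in Example #4.

def _sketch_zero_allow_untested_optimizer():
    # Hypothetical companion config: the top-level flag below tells ZeRO to
    # accept client optimizers that DeepSpeed has not explicitly validated.
    ds_config = {
        "train_batch_size": 2,
        "zero_allow_untested_optimizer": True,
        "zero_optimization": {
            "stage": 2
        },
        "fp16": {
            "enabled": True
        }
    }
    model = SimpleModel(10, empty_grad=True)
    optimizer = SimpleOptimizer(model.parameters())
    # With the flag set, initialize() wraps the optimizer instead of asserting.
    model, optim, _, _ = deepspeed.initialize(config=ds_config,
                                              model=model,
                                              optimizer=optimizer,
                                              model_parameters=model.parameters())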
Example #5
 def helper(args):
     model = SimpleModel(10)
     model, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           model_parameters=model.parameters())
     # get base optimizer under zero
     ds_optimizer = model.optimizer.optimizer
     opt_class, adam_w_mode = resulting_optimizer
     assert isinstance(ds_optimizer, opt_class)
     if adam_w_mode in [True, False]:
         assert ds_optimizer.adam_w_mode == adam_w_mode
Example #6
def test_checkpoint_fp32_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "betas": [0.8,
                          0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7
            }
        },
        "fp16": {
            "enabled": False
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_fp32_optimizer(args, model, hidden_dim):
        checkpoint_correctness_verification(args, model, hidden_dim, tmpdir, fp16=False)

    _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim)
Example #7
def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            },
        },
        "scheduler": {
            "type": LR_RANGE_TEST,
            "params": {
                LR_RANGE_TEST_MIN_LR: min_lr,
                LR_RANGE_TEST_STEP_RATE: step_rate,
                LR_RANGE_TEST_STEP_SIZE: step_size,
                LR_RANGE_TEST_STAIRCASE: staircase
            }
        },
        "gradient_clipping": 1.0
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size,
                            staircase):
        model, _, _, lr_scheduler = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=max(50, step_size * 2),
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.float)

        step_lrs = []
        for _, batch in enumerate(data_loader):
            step_lrs.append(lr_scheduler.get_lr())
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

        # Verify starting lr
        assert step_lrs[0] == min_lr

        if staircase:
            # Verify staircase increasing lr
            _verify_staircase_increase(step_lrs, step_size)
        else:
            # Verify continuous increasing lr
            _verify_continuous_increase(step_lrs)

    _test_lr_range_test(args=args,
                        model=model,
                        hidden_dim=hidden_dim,
                        min_lr=[min_lr],
                        step_size=step_size,
                        staircase=staircase)
Example #8
def test_dict_config_adamw_fp16_basic():
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = create_deepspeed_args()
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer,
                                              config_params=config_dict)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_basic(args=args,
                           model=model,
                           hidden_dim=hidden_dim,
                           config_dict=config_dict)
Example #9
def test_lamb_fp16_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[2])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-fit", nargs='?', default=False)
    parser.add_argument("-model", nargs='?', default="simple_model")
    parser.add_argument("-visualize", nargs='?', default=False)
    parser.add_argument("-evaluate", nargs='?')
    parser.add_argument("-dataset")
    parser.add_argument("-model_path", nargs='?')
    parser.add_argument("-visualize_heatmap", nargs='?')

    args = parser.parse_args()
    if args.dataset:
        if args.dataset == 'fashion_mnist':
            train_loader, test_loader = fashion_mnist_dataset.get_data_loaders(
            )
            visualize = fashion_mnist_dataset.visualize_dataset
        elif args.dataset == "dogs_cats":
            train_loader, test_loader = dogs_cats_dataset.get_data_loaders()
            visualize = dogs_cats_dataset.visualize_dataset
    if args.model:
        if args.model == 'simple_model':
            model = SimpleModel()
        if args.model == 'explain_model':
            model = ExplainModel()
    if args.fit:
        fit_classifier(model, train_loader, test_loader, args.model)
    elif args.visualize:
        visualize(train_loader)
    elif args.evaluate and args.model_path:
        model = load_model(args.model_path)
        evaluate(model, test_loader)
    elif args.visualize_heatmap and args.model_path:
        load_visualize_heatmap(args.model_path, test_loader)
Example #11
    def _helper():
        model = SimpleModel(hidden_dim=10)
        with pytest.raises(AssertionError):
            model, _, _, _ = deepspeed.initialize(model=None, config=config)

        with pytest.raises(AssertionError):
            model, _, _, _ = deepspeed.initialize(model, config=config)
Example #12
def test_non_elastic_batch_params_w_override(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "elasticity": {
            "enabled": True,
            "max_train_batch_size": 4,
            "micro_batch_sizes": [1, 2, 3, 4],
            "min_gpus": 1,
            "max_gpus": 4,
            "min_time": 20,
            "version": 0.1,
            "ignore_non_elastic_batch_info": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1, 2])
    def _test_elastic(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())

    _test_elastic(args=args, model=model, hidden_dim=hidden_dim)
Example #13
def test_adam_amp_basic(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "amp": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adam_amp_basic(args, model, hidden_dim):
        optimizer = torch.optim.Adam(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
Example #14
 def __init__(self, no_agents: int):
     self.model: SimpleModel = SimpleModel(no_agents)
     self.states_dictionary: Dict[hash, int] = {}
     self.state_number: int = 0
     self.epistemic_states_dictionaries: List[Dict[str, Set[int]]] = []
     self.no_agents: int = no_agents
     self.prepare_epistemic_dictionaries()
Example #15
def test_dataloader_drop_last(tmpdir, train_batch_size, drop_last):
    config_dict = {
        "train_batch_size": train_batch_size,
        "dataloader_drop_last": drop_last,
        "steps_per_print": 1
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_dataloader_drop_last(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        #TODO: Figure out why this breaks with cuda device
        train_dataset = random_dataset(total_samples=50,
                                       hidden_dim=hidden_dim,
                                       device=torch.device('cpu'),
                                       dtype=torch.float32)
        model, _, training_dataloader, _ = deepspeed.initialize(
            args=args,
            model=model,
            training_data=train_dataset,
            optimizer=optimizer)
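        # The dataset was built on CPU (see the TODO above), so each batch is
        # moved to the current CUDA device before the forward pass.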
        for n, batch in enumerate(training_dataloader):
            x = batch[0].to(torch.cuda.current_device())
            y = batch[1].to(torch.cuda.current_device())
            loss = model(x, y)
            model.backward(loss)
            model.step()

    _test_dataloader_drop_last(args=args, model=model, hidden_dim=hidden_dim)
Example #16
def test_zero_supported_client_optimizer(tmpdir, zero_stage,
                                         optimizer_constructor):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_zero_supported_client_optimizer(args, model,
                                              optimizer_constructor):
        client_optimizer = optimizer_constructor(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=client_optimizer)

    _test_zero_supported_client_optimizer(
        args=args, model=model, optimizer_constructor=optimizer_constructor)
Example #17
def main():
    prefix = './data/{}/{}'.format('baby', 'baby')
    dim = 100
    epochs = 50
    layer_size = [64, 32, 16, 1]
    lr = 0.0001
    alpha = 0.5
    num_users, num_items = get_user_item_info(prefix + '_umap.csv',
                                              prefix + '_itmap.csv')
    train_users, train_items, train_revs, train_rs, word_dict, maxlen = load_train_data(
        prefix + '_train.csv')
    test_users, test_items, test_rs = load_test_data(prefix + '_test.csv')
    vocab = get_vocab(word_dict)
    #w2v = Word2Vec.load(prefix+'_w2v.emb')
    with open(prefix + '_w2v.emb', 'rb') as fp:
        w2v = pickle.load(fp)
    word_emb = get_word_emb(vocab, dim, w2v)
    sess = tf.Session()
    model = SimpleModel(sess, num_users, num_items, maxlen, len(vocab), dim,
                        layer_size, lr, alpha)
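    # Seed the model's word-embedding table with the pretrained word2vec vectors.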
    sess.run(model.word_emb.assign(tf.constant(word_emb, dtype=tf.float32)))
    best_mse, best_epoch = 10, 0
    for epoch in range(epochs):
        loss, mse = 0, 0
        for batch_u, batch_i, batch_rev, batch_len, batch_r in tqdm(
                generate_train_batch(train_users, train_items, train_revs,
                                     train_rs, vocab, maxlen)):
            batch_mse, batch_loss = model.train(batch_u, batch_i, batch_rev,
                                                batch_len, batch_r)
            loss += batch_loss * len(batch_u)
            mse += batch_mse * len(batch_u)
        mse = mse / len(train_users)
        loss = loss / len(train_users)
        print('train epoch:{},mse:{},loss:{}'.format(epoch + 1, mse, loss))
        mse = 0
        for batch_u, batch_i, batch_r in tqdm(
                generate_test_batch(test_users, test_items, test_rs)):
            y, batch_mse = model.test(batch_u, batch_i, batch_r)
            mse += len(batch_u) * batch_mse
        mse = mse / len(test_users)
        if mse < best_mse:
            best_mse = mse
            best_epoch = epoch + 1
        print('test epoch:{},mse:{}'.format(epoch + 1, mse))
    print('best mse:{},at epoch:{}'.format(best_mse, best_epoch))
Example #18
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage,
                                               use_cpu_offload):
    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
    #    pytest.skip("cpu-adam is not installed")
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 16000,
                "cycle_first_stair_count": 8000,
                "decay_step_size": 16000,
                "cycle_min_lr": 1e-06,
                "cycle_max_lr": 3e-05,
                "decay_lr_rate": 1e-07,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp16_zero_onecycle_compatibility(args=args,
                                                model=model,
                                                hidden_dim=hidden_dim)
Example #19
 def _helper():
     model = SimpleModel(hidden_dim=10)
     model, _, _, _ = deepspeed.initialize(model=model, config=config)
     data_loader = random_dataloader(model=model,
                                     total_samples=5,
                                     hidden_dim=10,
                                     device=model.device)
     for n, batch in enumerate(data_loader):
         loss = model(batch[0], batch[1])
Example #20
    def _test_fused_all_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        expected_loss_scale = 2**4
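        # 2**4 assumes the (unshown) fp16 config sets initial_scale_power to 4.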
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale

        overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
        for i, value in enumerate(overflow_gradients):
            run_model_step(model, [value])
            expected_loss_scale = max(expected_loss_scale / 2, 1)
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
Example #21
 def _go(hidden_dim):
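     # zero.Init (enabled only for ZeRO stage 3) partitions parameters across
     # ranks as the model is constructed, so the full 78-layer model is never
     # materialized on a single device.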
     with deepspeed.zero.Init(enabled=zero_stage == 3,
                              config_dict_or_path=ds_config):
         model = SimpleModel(hidden_dim, nlayers=78)
     print('total number of parameters:',
           sum([p.numel() for p in model.parameters()]))
     see_memory_usage('pre-init', force=True)
     model, _, _, _ = deepspeed.initialize(model=model, config=ds_config)
     see_memory_usage('post-init', force=True)
     data_loader = random_dataloader(model=model,
                                     total_samples=50,
                                     hidden_dim=hidden_dim,
                                     device=model.device,
                                     dtype=torch.half)
     print(f"optimizer={model.optimizer}")
     for batch in data_loader:
         model(batch[0], batch[1])
     see_memory_usage('post-fwds', force=True)
Example #22
class SimpleModelTest(TestCase):
    def setUp(self):
        R = np.diag([0.1, 0.1, 0.1, 0.1, 0.1])
        Q = np.array([[0.01]])
        Q_b = np.array([[10]])

        self.model = SimpleModel(0.1, R, Q, Q_b)

    def test_accessor(self):
        self.model.mu = [1, 2, 3, 4, 5]

        self.assertEqual(1, self.model.x)
        self.assertEqual(2, self.model.y)
        self.assertEqual(3, self.model.theta)
        self.assertEqual(4, self.model.vx)
        self.assertEqual(5, self.model.vy)

    def test_update(self):
        """
        Smoke test for the predictor.
        """
        self.model.predict(np.array([1, 0]))
        self.model.predict(np.array([1, 0]))
        self.assertAlmostEqual(self.model.x, 0.01, places=4)

    def test_correct_angle(self):
        """
        Smoke test for the angle corrector.
        """
        self.model.correct_angle(1)
        self.assertAlmostEqual(self.model.theta, 0.5, places=3)

    def test_correct_beacon(self):
        """
        Smoke test for the beacon corrector.
        """
        x, y = 1, 1  # ground truth passed to the model for computing beacon pos

        def distance(bx, by):
            return np.sqrt((bx - x) ** 2 + (by - y) ** 2)

        beacons = [
            (3, 3),
            (0, 3),
            (3, 0),
        ]

        for _ in range(100):
            # we run a prediction step to increase variance
            self.model.predict((0, 0))
            for bx, by in beacons:
                self.model.correct_beacon(bx, by, distance(bx, by))

        self.assertAlmostEqual(self.model.x, x, places=2)
Example #23
def test_checkpoint_unfused_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015,
                "max_grad_norm": 1.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 1000,
                "cycle_first_stair_count": 500,
                "cycle_second_step_size": 1000,
                "cycle_second_stair_count": 500,
                "decay_step_size": 1000,
                "cycle_min_lr": 0.0001,
                "cycle_max_lr": 0.0010,
                "decay_lr_rate": 0.001,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        }
    }

    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_unfused_optimizer(args,
                                           model,
                                           hidden_dim,
                                           load_optimizer_states):
        checkpoint_correctness_verification(args,
                                            model,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=load_optimizer_states)

    _test_checkpoint_unfused_optimizer(args=args,
                                       model=model,
                                       hidden_dim=hidden_dim,
                                       load_optimizer_states=True)
    _test_checkpoint_unfused_optimizer(args=args,
                                       model=model,
                                       hidden_dim=hidden_dim,
                                       load_optimizer_states=False)
Example #24
    def _test_unfused_no_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())
        expected_loss_scale = 2**8
        expected_scale_window = 2
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

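        # With only small, finite gradients there are no overflows, so the loss
        # scale never shrinks; it doubles after every scale_window consecutive
        # successful steps instead.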
        for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)):
            run_model_step(model, [value])
            assert optim.cur_scale == expected_loss_scale
            assert optim.cur_iter == (i + 1)
            if optim.cur_iter % expected_scale_window == 0:
                expected_loss_scale *= 2
Example #25
def test_onebitlamb_checkpointing_overflow(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01,
                "max_coeff": 0.3,
                "min_coeff": 0.01,
                "freeze_step": 2,
                "cuda_aware": False,
                "comm_backend_name": "nccl",
                "coeff_beta": 0.9,
                "factor_max": 1.0,
                "factor_min": 0.5,
                "factor_threshold": 0.1
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=100,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        save_folder = os.path.join(tmpdir, 'saved_checkpoint')
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
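            # From step 10 on, inflate the loss on rank 0 so the fp16 loss
            # scaler hits an overflow while checkpoints are being saved.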
            if dist.get_rank() == 0 and n >= 10:
                loss = loss * 1000000.0
            model.backward(loss)
            dist.barrier()
            model.step()
            dist.barrier()
            model.save_checkpoint(save_folder, tag=None)

    _test_onebitlamb_checkpointing_overflow(args=args,
                                            model=model,
                                            hidden_dim=hidden_dim)
Example #26
def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "betas": [0.8,
                          0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": 0.001,
                "warmup_num_steps": 1000
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_lr_scheduler(args,
                                      model,
                                      hidden_dim,
                                      load_optimizer_states,
                                      load_lr_scheduler_states):
        checkpoint_correctness_verification(
            args,
            model,
            hidden_dim,
            tmpdir,
            load_optimizer_states=load_optimizer_states,
            load_lr_scheduler_states=load_lr_scheduler_states)

    _test_checkpoint_lr_scheduler(args=args,
                                  model=model,
                                  hidden_dim=hidden_dim,
                                  load_optimizer_states=False,
                                  load_lr_scheduler_states=True)
Example #27
def export_for_serving(export_path: str) -> None:
    mock_input = tf.keras.Input(shape=4,
                                batch_size=None,
                                name='main_input',
                                dtype=tf.float32)
    model = SimpleModel()
    model(mock_input)

    # This call fails with
    # Inputs to eager execution function cannot be Keras symbolic tensors
    # suggesting that some transformation is being done by tf.keras.Model.__call__
    #  model.call2(mock_input)

    model.summary()

    tf.keras.models.save_model(model,
                               export_path,
                               overwrite=True,
                               include_optimizer=False,
                               save_format=None,
                               signatures=None,
                               options=None)
Example #28
def test_zero2_reduce_scatter_off(tmpdir):
    if not bf16_required_version_check():
        pytest.skip(
            "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
        )

    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "allgather_bucket_size": 2000000000,
            "reduce_bucket_size": 200000000,
            "overlap_comm": False,
            "reduce_scatter": False
        },
        "fp16": {
            "enabled": False
        },
        "bfloat16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10

    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _helper(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(
            args=args, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.bfloat16)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _helper(args=args, model=model, hidden_dim=hidden_dim)
Example #29
    def _test_unfused_some_overflow(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim, empty_grad=True)
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        expected_loss_scale = 2**8
        expected_scale_window = 2
        expected_iteration = 0
        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == expected_loss_scale
        assert optim.scale_window == expected_scale_window

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf'), float('nan')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model scale_window + 1 times to increase scale once
        normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
        expected_iteration += len(normal_gradients)
        run_model_step(model, normal_gradients)
        expected_loss_scale *= 2
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration

        # Run model with overflows to decrease scale
        overflow_gradients = [float('inf')]
        expected_iteration += len(overflow_gradients)
        run_model_step(model, overflow_gradients)
        expected_loss_scale /= (2**len(overflow_gradients))
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == expected_iteration
Example #30
 def _test_zero_empty_partition(args):
     hidden_dim = 1
     model = SimpleModel(hidden_dim)
     # Ensure model has 2 parameters, to cause empty partition with DP=3
     assert len(list(model.parameters())) == 2
     model, _, _, _ = deepspeed.initialize(args=args,
                                           model=model,
                                           model_parameters=model.parameters())
     model.step()