def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
    model = SimpleModel(hidden_dim)
    client_optimizer = optimizer_constructor(params=model.parameters())
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          optimizer=client_optimizer)
def test_client_optimizer(tmpdir, optimizer_type):
    def _optimizer_callable(params) -> Optimizer:
        return AdamW(params=params)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}
    if optimizer_type is None:
        client_optimizer = None
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_optimizer(args, model, client_optimizer):
        _, ds_optimizer, _, _ = deepspeed.initialize(args=args,
                                                     model=model,
                                                     model_parameters=list(model.parameters()),
                                                     optimizer=client_optimizer)
        if client_optimizer is None:
            assert isinstance(ds_optimizer, FusedAdam)
        elif isinstance(client_optimizer, Optimizer):
            assert ds_optimizer == client_optimizer
        else:
            assert isinstance(ds_optimizer, AdamW)

    _test_client_optimizer(args=args, model=model, client_optimizer=client_optimizer)
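# Note: test_client_optimizer above branches on optimizer_type being None, the Optimizer
# class, or anything else (treated as a callable). A hedged guess at the pytest
# parametrization that drives it (assumed, not shown in this snippet):
#
# @pytest.mark.parametrize('optimizer_type', [None, Optimizer, Callable])
# def test_client_optimizer(tmpdir, optimizer_type):
#     ...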
def simple(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)
    proxy = SimpleModel()
    x_train, y_train_array, scalar_x, scalar_y = proxy.get_data(x_train, y_train)
    proxy.evaluate_simple(x_train, y_train)
def _test_zero_allow_untested_optimizer(args):
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=True)
    optimizer = SimpleOptimizer(model.parameters())
    with pytest.raises(AssertionError):
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  optimizer=optimizer,
                                                  model_parameters=model.parameters())
def helper(args):
    model = SimpleModel(10)
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    # get base optimizer under zero
    ds_optimizer = model.optimizer.optimizer
    opt_class, adam_w_mode = resulting_optimizer
    assert isinstance(ds_optimizer, opt_class)
    if adam_w_mode in [True, False]:
        assert ds_optimizer.adam_w_mode == adam_w_mode
def test_checkpoint_fp32_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "betas": [0.8, 0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7
            }
        },
        "fp16": {
            "enabled": False
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_fp32_optimizer(args, model, hidden_dim):
        checkpoint_correctness_verification(args, model, hidden_dim, tmpdir, fp16=False)

    _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim)
def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            },
        },
        "scheduler": {
            "type": LR_RANGE_TEST,
            "params": {
                LR_RANGE_TEST_MIN_LR: min_lr,
                LR_RANGE_TEST_STEP_RATE: step_rate,
                LR_RANGE_TEST_STEP_SIZE: step_size,
                LR_RANGE_TEST_STAIRCASE: staircase
            }
        },
        "gradient_clipping": 1.0
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size, staircase):
        model, _, _, lr_scheduler = deepspeed.initialize(args=args,
                                                         model=model,
                                                         model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=max(50, step_size * 2),
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.float)
        step_lrs = []
        for _, batch in enumerate(data_loader):
            step_lrs.append(lr_scheduler.get_lr())
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

        # Verify starting lr
        assert step_lrs[0] == min_lr

        if staircase:
            # Verify staircase increasing lr
            _verify_staircase_increase(step_lrs, step_size)
        else:
            # Verify continuous increasing lr
            _verify_continuous_increase(step_lrs)

    _test_lr_range_test(args=args,
                        model=model,
                        hidden_dim=hidden_dim,
                        min_lr=[min_lr],
                        step_size=step_size,
                        staircase=staircase)
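# _verify_staircase_increase and _verify_continuous_increase are not defined in this snippet.
# A minimal sketch of what they are assumed to check, based on how test_lr_range_test uses
# them (hypothetical helpers, not the originals):

def _verify_staircase_increase(step_lrs, step_size):
    # In staircase mode the lr should stay constant within each window of step_size steps.
    num_windows = len(step_lrs) // step_size
    for i in range(num_windows):
        window = step_lrs[i * step_size:(i + 1) * step_size]
        assert all(lr == window[0] for lr in window)


def _verify_continuous_increase(step_lrs):
    # Without staircase mode the lr should grow from one step to the next.
    assert all(step_lrs[i + 1] > step_lrs[i] for i in range(len(step_lrs) - 1))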
def test_dict_config_adamw_fp16_basic():
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        }
    }
    args = create_deepspeed_args()
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer,
                                              config_params=config_dict)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim, config_dict=config_dict)
def test_lamb_fp16_empty_grad(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[2])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-fit", nargs='?', default=False)
    parser.add_argument("-model", nargs='?', default="simple_model")
    parser.add_argument("-visualize", nargs='?', default=False)
    parser.add_argument("-evaluate", nargs='?')
    parser.add_argument("-dataset")
    parser.add_argument("-model_path", nargs='?')
    parser.add_argument("-visualize_heatmap", nargs='?')
    args = parser.parse_args()

    if args.dataset:
        if args.dataset == 'fashion_mnist':
            train_loader, test_loader = fashion_mnist_dataset.get_data_loaders()
            visualize = fashion_mnist_dataset.visualize_dataset
        elif args.dataset == "dogs_cats":
            train_loader, test_loader = dogs_cats_dataset.get_data_loaders()
            visualize = dogs_cats_dataset.visualize_dataset

    if args.model:
        if args.model == 'simple_model':
            model = SimpleModel()
        if args.model == 'explain_model':
            model = ExplainModel()

    if args.fit:
        fit_classifier(model, train_loader, test_loader, args.model)
    elif args.visualize:
        visualize(train_loader)
    elif args.evaluate and args.model_path:
        model = load_model(args.model_path)
        evaluate(model, test_loader)
    elif args.visualize_heatmap and args.model_path:
        load_visualize_heatmap(args.model_path, test_loader)
def _helper():
    model = SimpleModel(hidden_dim=10)
    with pytest.raises(AssertionError):
        model, _, _, _ = deepspeed.initialize(model=None, config=config)
    with pytest.raises(AssertionError):
        model, _, _, _ = deepspeed.initialize(model, config=config)
def test_non_elastic_batch_params_w_override(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "elasticity": {
            "enabled": True,
            "max_train_batch_size": 4,
            "micro_batch_sizes": [1, 2, 3, 4],
            "min_gpus": 1,
            "max_gpus": 4,
            "min_time": 20,
            "version": 0.1,
            "ignore_non_elastic_batch_info": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1, 2])
    def _test_elastic(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())

    _test_elastic(args=args, model=model, hidden_dim=hidden_dim)
def test_adam_amp_basic(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "amp": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adam_amp_basic(args, model, hidden_dim):
        optimizer = torch.optim.Adam(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer)
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
def __init__(self, no_agents: int):
    self.model: SimpleModel = SimpleModel(no_agents)
    self.states_dictionary: Dict[hash, int] = {}
    self.state_number: int = 0
    self.epistemic_states_dictionaries: List[Dict[str, Set[int]]] = []
    self.no_agents: int = no_agents
    self.prepare_epistemic_dictionaries()
def test_dataloader_drop_last(tmpdir, train_batch_size, drop_last):
    config_dict = {
        "train_batch_size": train_batch_size,
        "dataloader_drop_last": drop_last,
        "steps_per_print": 1
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_dataloader_drop_last(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        # TODO: Figure out why this breaks with cuda device
        train_dataset = random_dataset(total_samples=50,
                                       hidden_dim=hidden_dim,
                                       device=torch.device('cpu'),
                                       dtype=torch.float32)
        model, _, training_dataloader, _ = deepspeed.initialize(args=args,
                                                                model=model,
                                                                training_data=train_dataset,
                                                                optimizer=optimizer)
        for n, batch in enumerate(training_dataloader):
            x = batch[0].to(torch.cuda.current_device())
            y = batch[1].to(torch.cuda.current_device())
            loss = model(x, y)
            model.backward(loss)
            model.step()

    _test_dataloader_drop_last(args=args, model=model, hidden_dim=hidden_dim)
def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
        client_optimizer = optimizer_constructor(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=client_optimizer)

    _test_zero_supported_client_optimizer(args=args,
                                          model=model,
                                          optimizer_constructor=optimizer_constructor)
def main():
    prefix = './data/{}/{}'.format('baby', 'baby')
    dim = 100
    epochs = 50
    layer_size = [64, 32, 16, 1]
    lr = 0.0001
    alpha = 0.5
    num_users, num_items = get_user_item_info(prefix + '_umap.csv', prefix + '_itmap.csv')
    train_users, train_items, train_revs, train_rs, word_dict, maxlen = load_train_data(prefix + '_train.csv')
    test_users, test_items, test_rs = load_test_data(prefix + '_test.csv')
    vocab = get_vocab(word_dict)
    # w2v = Word2Vec.load(prefix + '_w2v.emb')
    with open(prefix + '_w2v.emb', 'rb') as fp:
        w2v = pickle.load(fp)
    word_emb = get_word_emb(vocab, dim, w2v)

    sess = tf.Session()
    model = SimpleModel(sess, num_users, num_items, maxlen, len(vocab), dim, layer_size, lr, alpha)
    sess.run(model.word_emb.assign(tf.constant(word_emb, dtype=tf.float32)))

    best_mse, best_epoch = 10, 0
    for epoch in range(epochs):
        loss, mse = 0, 0
        for batch_u, batch_i, batch_rev, batch_len, batch_r in tqdm(
                generate_train_batch(train_users, train_items, train_revs, train_rs, vocab, maxlen)):
            batch_mse, batch_loss = model.train(batch_u, batch_i, batch_rev, batch_len, batch_r)
            loss += batch_loss * len(batch_u)
            mse += batch_mse * len(batch_u)
        mse = mse / len(train_users)
        loss = loss / len(train_users)
        print('train epoch:{},mse:{},loss:{}'.format(epoch + 1, mse, loss))

        mse = 0
        for batch_u, batch_i, batch_r in tqdm(generate_test_batch(test_users, test_items, test_rs)):
            y, batch_mse = model.test(batch_u, batch_i, batch_r)
            mse += len(batch_u) * batch_mse
        mse = mse / len(test_users)
        if mse < best_mse:
            best_mse = mse
            best_epoch = epoch + 1
        print('test epoch:{},mse:{}'.format(epoch + 1, mse))

    print('best mse:{},at epoch:{}'.format(best_mse, best_epoch))
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
    #    pytest.skip("cpu-adam is not installed")
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 16000,
                "cycle_first_stair_count": 8000,
                "decay_step_size": 16000,
                "cycle_min_lr": 1e-06,
                "cycle_max_lr": 3e-05,
                "decay_lr_rate": 1e-07,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _test_adam_fp16_zero_onecycle_compatibility(args=args, model=model, hidden_dim=hidden_dim)
def _helper():
    model = SimpleModel(hidden_dim=10)
    model, _, _, _ = deepspeed.initialize(model=model, config=config)
    data_loader = random_dataloader(model=model,
                                    total_samples=5,
                                    hidden_dim=10,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
def _test_fused_all_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim, empty_grad=True)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**4

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale

    overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
    for i, value in enumerate(overflow_gradients):
        run_model_step(model, [value])
        expected_loss_scale = max(expected_loss_scale / 2, 1)
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == (i + 1)
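# run_model_step is not defined in these snippets. A minimal sketch of what it is assumed to
# do, based on how the overflow tests use it (hypothetical helper, not the original): fill
# every parameter's gradient with each given scalar and take an optimizer step, so inf/nan
# values force the dynamic loss scaler to back off.

def run_model_step(model, gradient_list):
    for value in gradient_list:
        for p in model.parameters():
            p.grad = torch.empty_like(p, dtype=p.dtype)
            p.grad.fill_(value)
        model.step()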
def _go(hidden_dim):
    with deepspeed.zero.Init(enabled=zero_stage == 3, config_dict_or_path=ds_config):
        model = SimpleModel(hidden_dim, nlayers=78)
    print('total number of parameters:', sum([p.numel() for p in model.parameters()]))
    see_memory_usage('pre-init', force=True)
    model, _, _, _ = deepspeed.initialize(model=model, config=ds_config)
    see_memory_usage('post-init', force=True)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.half)
    print(f"optimizer={model.optimizer}")
    for batch in data_loader:
        model(batch[0], batch[1])
    see_memory_usage('post-fwds', force=True)
class SimpleModelTest(TestCase):
    def setUp(self):
        R = np.diag([0.1, 0.1, 0.1, 0.1, 0.1])
        Q = np.array([[0.01]])
        Q_b = np.array([[10]])
        self.model = SimpleModel(0.1, R, Q, Q_b)

    def test_accessor(self):
        self.model.mu = [1, 2, 3, 4, 5]
        self.assertEqual(1, self.model.x)
        self.assertEqual(2, self.model.y)
        self.assertEqual(3, self.model.theta)
        self.assertEqual(4, self.model.vx)
        self.assertEqual(5, self.model.vy)

    def test_update(self):
        """Smoke test for the predictor."""
        self.model.predict(np.array([1, 0]))
        self.model.predict(np.array([1, 0]))
        self.assertAlmostEqual(self.model.x, 0.01, places=4)

    def test_correct_angle(self):
        """Smoke test for the angle corrector."""
        self.model.correct_angle(1)
        self.assertAlmostEqual(self.model.theta, 0.5, places=3)

    def test_correct_beacon(self):
        """Smoke test for the beacon corrector."""
        x, y = 1, 1  # ground truth passed to the model for computing beacon pos

        def distance(bx, by):
            return np.sqrt((bx - x) ** 2 + (by - y) ** 2)

        beacons = [
            (3, 3),
            (0, 3),
            (3, 0),
        ]

        for _ in range(100):
            # we run a prediction step to increase variance
            self.model.predict((0, 0))
            for bx, by in beacons:
                self.model.correct_beacon(bx, by, distance(bx, by))

        self.assertAlmostEqual(self.model.x, x, places=2)
def test_checkpoint_unfused_optimizer(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015,
                "max_grad_norm": 1.0
            }
        },
        "fp16": {
            "enabled": True
        },
        "scheduler": {
            "type": "OneCycle",
            "params": {
                "cycle_first_step_size": 1000,
                "cycle_first_stair_count": 500,
                "cycle_second_step_size": 1000,
                "cycle_second_stair_count": 500,
                "decay_step_size": 1000,
                "cycle_min_lr": 0.0001,
                "cycle_max_lr": 0.0010,
                "decay_lr_rate": 0.001,
                "cycle_min_mom": 0.85,
                "cycle_max_mom": 0.99,
                "decay_mom_rate": 0.0
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_unfused_optimizer(args, model, hidden_dim, load_optimizer_states):
        checkpoint_correctness_verification(args,
                                            model,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=load_optimizer_states)

    _test_checkpoint_unfused_optimizer(args=args,
                                       model=model,
                                       hidden_dim=hidden_dim,
                                       load_optimizer_states=True)
    _test_checkpoint_unfused_optimizer(args=args,
                                       model=model,
                                       hidden_dim=hidden_dim,
                                       load_optimizer_states=False)
def _test_unfused_no_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim, empty_grad=True)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**8
    expected_scale_window = 2

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale
    assert optim.scale_window == expected_scale_window

    for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)):
        run_model_step(model, [value])
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == (i + 1)
        if optim.cur_iter % expected_scale_window == 0:
            expected_loss_scale *= 2
def test_onebitlamb_checkpointing_overflow(tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01,
                "max_coeff": 0.3,
                "min_coeff": 0.01,
                "freeze_step": 2,
                "cuda_aware": False,
                "comm_backend_name": "nccl",
                "coeff_beta": 0.9,
                "factor_max": 1.0,
                "factor_min": 0.5,
                "factor_threshold": 0.1
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=100,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        save_folder = os.path.join(tmpdir, 'saved_checkpoint')
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            if dist.get_rank() == 0 and n >= 10:
                loss = loss * 1000000.0
            model.backward(loss)
            dist.barrier()
            model.step()
            dist.barrier()
            model.save_checkpoint(save_folder, tag=None)

    _test_onebitlamb_checkpointing_overflow(args=args, model=model, hidden_dim=hidden_dim)
def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015,
                "betas": [0.8, 0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7
            }
        },
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": zero_stage
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": 0.001,
                "warmup_num_steps": 1000
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_lr_scheduler(args, model, hidden_dim, load_optimizer_states,
                                      load_lr_scheduler_states):
        checkpoint_correctness_verification(args,
                                            model,
                                            hidden_dim,
                                            tmpdir,
                                            load_optimizer_states=load_optimizer_states,
                                            load_lr_scheduler_states=load_lr_scheduler_states)

    _test_checkpoint_lr_scheduler(args=args,
                                  model=model,
                                  hidden_dim=hidden_dim,
                                  load_optimizer_states=False,
                                  load_lr_scheduler_states=True)
def export_for_serving(export_path: str) -> None:
    mock_input = tf.keras.Input(shape=4, batch_size=None, name='main_input', dtype=tf.float32)
    model = SimpleModel()
    model(mock_input)
    # This call fails with
    #   "Inputs to eager execution function cannot be Keras symbolic tensors"
    # suggesting that some transformation is being done by tf.keras.Model.__call__
    # model.call2(mock_input)
    model.summary()
    tf.keras.models.save_model(model,
                               export_path,
                               overwrite=True,
                               include_optimizer=False,
                               save_format=None,
                               signatures=None,
                               options=None)
def test_zero2_reduce_scatter_off(tmpdir):
    if not bf16_required_version_check():
        pytest.skip(
            "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
        )
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "allgather_bucket_size": 2000000000,
            "reduce_bucket_size": 200000000,
            "overlap_comm": False,
            "reduce_scatter": False
        },
        "fp16": {
            "enabled": False
        },
        "bfloat16": {
            "enabled": True
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _helper(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.bfloat16)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    _helper(args=args, model=model, hidden_dim=hidden_dim)
def _test_unfused_some_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim, empty_grad=True)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**8
    expected_scale_window = 2
    expected_iteration = 0

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale
    assert optim.scale_window == expected_scale_window

    # Run model with overflows to decrease scale
    overflow_gradients = [float('inf'), float('nan')]
    expected_iteration += len(overflow_gradients)
    run_model_step(model, overflow_gradients)
    expected_loss_scale /= (2**len(overflow_gradients))
    assert optim.cur_scale == expected_loss_scale
    assert optim.cur_iter == expected_iteration

    # Run model scale_window + 1 times to increase scale once
    normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
    expected_iteration += len(normal_gradients)
    run_model_step(model, normal_gradients)
    expected_loss_scale *= 2
    assert optim.cur_scale == expected_loss_scale
    assert optim.cur_iter == expected_iteration

    # Run model with overflows to decrease scale
    overflow_gradients = [float('inf')]
    expected_iteration += len(overflow_gradients)
    run_model_step(model, overflow_gradients)
    expected_loss_scale /= (2**len(overflow_gradients))
    assert optim.cur_scale == expected_loss_scale
    assert optim.cur_iter == expected_iteration
def _test_zero_empty_partition(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim)
    # Ensure model has 2 parameters, to cause empty partition with DP=3
    assert len(list(model.parameters())) == 2
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    model.step()
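# Hypothetical driver for the helper above (a sketch, not from the original source): the
# "empty partition with DP=3" comment implies the helper runs across three ranks, presumably
# via the same @distributed_test decorator used by the other tests in this collection.

@distributed_test(world_size=[3])
def _run_zero_empty_partition(args):
    _test_zero_empty_partition(args)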