def __init__(self) -> None:
    """Initialize NestedTinyModel with three TinyModel submodules."""
    super().__init__()
    # setattr on an nn.Module registers submodules the same way direct
    # attribute assignment does.
    for index in (1, 2, 3):
        setattr(self, f'tiny{index}', TinyModel())
def test_preconditioner_logging(caplog: Any) -> None:
    """Test KFACPreconditioner logs relevant info."""
    caplog.set_level(logging.INFO)

    # DEBUG-level construction is below the captured level: nothing logged.
    KFACPreconditioner(TinyModel(), loglevel=logging.DEBUG)
    assert len(caplog.records) == 0

    caplog.clear()
    KFACPreconditioner(TinyModel(), loglevel=logging.INFO)
    messages = [record.getMessage() for record in caplog.records]

    # TinyModel should produce two registered layers, one record each.
    registration_msgs = [m for m in messages if 'Registered' in m]
    assert len(registration_msgs) == 2
    # The KAISAAssignment summary should be logged exactly once.
    assignment_msgs = [m for m in messages if 'KAISAAssignment' in m]
    assert len(assignment_msgs) == 1
def test_input_check(
    preconditioner_type: type[BaseKFACPreconditioner],
    preconditioner_kwargs: dict[str, Any],
) -> None:
    """Test raises ValueError if preconditioner was already passed lambda.

    LambdaParamScheduler must refuse to schedule a hyperparameter that
    the preconditioner already received as a callable. Each case builds
    a preconditioner with a callable hyperparameter and expects the
    scheduler to raise ValueError for the matching ``*_lambda`` kwarg.
    """
    # First case exercises the parametrized preconditioner type/kwargs.
    preconditioner = preconditioner_type(
        **preconditioner_kwargs,
        factor_update_steps=factor_func(1),
    )
    with pytest.raises(ValueError):
        LambdaParamScheduler(
            preconditioner,
            factor_update_steps_lambda=factor_func(1),
        )

    # The remaining hyperparameters follow an identical pattern, so drive
    # them from a single loop instead of five copy-pasted stanzas.
    for name in ('inv_update_steps', 'damping', 'factor_decay', 'kl_clip', 'lr'):
        preconditioner = KFACPreconditioner(
            TinyModel(),
            **{name: factor_func(1)},
        )
        with pytest.raises(ValueError):
            LambdaParamScheduler(
                preconditioner,
                **{f'{name}_lambda': factor_func(1)},
            )
def _f() -> None:
    """Check grad_worker_fraction / distributed strategy equivalences."""
    # Numeric fraction 1 and COMM_OPT should normalize identically.
    numeric = KFACPreconditioner(TinyModel(), grad_worker_fraction=1)
    symbolic = KFACPreconditioner(
        TinyModel(),
        grad_worker_fraction=DistributedStrategy.COMM_OPT,
    )
    assert numeric.distributed_strategy == symbolic.distributed_strategy
    assert numeric.grad_worker_fraction == symbolic.grad_worker_fraction

    hybrid = KFACPreconditioner(
        TinyModel(),
        grad_worker_fraction=DistributedStrategy.HYBRID_OPT,
    )
    assert hybrid.grad_worker_fraction == 0.5

    mem_opt = KFACPreconditioner(
        TinyModel(),
        grad_worker_fraction=DistributedStrategy.MEM_OPT,
    )
    assert mem_opt.grad_worker_fraction == 0.25

    # A fraction of 0 presumably rounds up to one grad worker
    # (1/world_size == 0.25 here) and maps to MEM_OPT.
    zero = KFACPreconditioner(TinyModel(), grad_worker_fraction=0)
    assert zero.grad_worker_fraction == 0.25
    assert zero.distributed_strategy == DistributedStrategy.MEM_OPT

    half = KFACPreconditioner(TinyModel(), grad_worker_fraction=0.5)
    assert half.distributed_strategy == DistributedStrategy.HYBRID_OPT
def test_preconditioner_init() -> None:
    """Test KFACPreconditioner initialization."""
    # String and enum forms of assignment_strategy are interchangeable.
    by_string = KFACPreconditioner(TinyModel(), assignment_strategy='memory')
    by_enum = KFACPreconditioner(
        TinyModel(),
        assignment_strategy=AssignmentStrategy.MEMORY,
    )
    assert by_string.assignment_strategy == by_enum.assignment_strategy

    # Same for compute_method.
    by_string = KFACPreconditioner(TinyModel(), compute_method='inverse')
    by_enum = KFACPreconditioner(
        TinyModel(),
        compute_method=ComputeMethod.INVERSE,
    )
    assert by_string.compute_method == by_enum.compute_method

    @distributed_test(world_size=4)
    def _f() -> None:
        # Fraction 1 and COMM_OPT should normalize identically.
        numeric = KFACPreconditioner(TinyModel(), grad_worker_fraction=1)
        symbolic = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.COMM_OPT,
        )
        assert numeric.distributed_strategy == symbolic.distributed_strategy
        assert numeric.grad_worker_fraction == symbolic.grad_worker_fraction

        hybrid = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.HYBRID_OPT,
        )
        assert hybrid.grad_worker_fraction == 0.5

        mem_opt = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.MEM_OPT,
        )
        assert mem_opt.grad_worker_fraction == 0.25

        # Fraction 0 presumably rounds up to one grad worker out of four.
        zero = KFACPreconditioner(TinyModel(), grad_worker_fraction=0)
        assert zero.grad_worker_fraction == 0.25
        assert zero.distributed_strategy == DistributedStrategy.MEM_OPT

        half = KFACPreconditioner(TinyModel(), grad_worker_fraction=0.5)
        assert half.distributed_strategy == DistributedStrategy.HYBRID_OPT

    _f()

    # A positive bucket cap selects bucketed allreduce; zero disables it.
    bucketed = KFACPreconditioner(TinyModel(), allreduce_bucket_cap_mb=25)
    assert bucketed.allreduce_method == AllreduceMethod.ALLREDUCE_BUCKETED
    unbucketed = KFACPreconditioner(TinyModel(), allreduce_bucket_cap_mb=0)
    assert unbucketed.allreduce_method == AllreduceMethod.ALLREDUCE
def train(grad_worker_frac: float) -> None:
    """Train TinyModel with KFAC on random data."""
    batch_size = 4
    in_features = 10
    out_features = 10
    steps = 20

    inputs = torch.rand(batch_size, in_features)
    targets = torch.rand(batch_size, out_features)
    if torch.distributed.is_initialized():
        # Sum data across ranks so every worker trains on identical tensors.
        torch.distributed.all_reduce(inputs)
        torch.distributed.all_reduce(targets)

    model: torch.nn.Module = TinyModel()
    if torch.distributed.is_initialized():
        model = torch.nn.parallel.DistributedDataParallel(model)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    preconditioner = KFACPreconditioner(
        model,
        factor_update_steps=5,
        inv_update_steps=10,
        grad_worker_fraction=grad_worker_frac,
        allreduce_bucket_cap_mb=0,
        update_factors_in_hook=False,
    )
    criterion = torch.nn.MSELoss(reduction='sum')

    losses = []
    for _ in range(steps):
        loss = criterion(model(inputs), targets)
        losses.append(loss.item())
        loss.backward()
        preconditioner.step()
        optimizer.step()
        optimizer.zero_grad()

    # Training on fixed data should reduce the loss.
    assert losses[0] > losses[-1]
def test_preconditioner_init_raises() -> None:
    """Test KFACPreconditioner argument validation."""
    with pytest.raises(ValueError):
        KFACPreconditioner(TinyModel(), allreduce_bucket_cap_mb=-1)

    # This combination is accepted (no exception expected).
    KFACPreconditioner(
        TinyModel(),
        compute_eigenvalue_outer_product=True,
        compute_method=ComputeMethod.INVERSE,
        colocate_factors=False,
    )
    # But the same flags with the eigen method are rejected.
    with pytest.raises(ValueError):
        KFACPreconditioner(
            TinyModel(),
            compute_eigenvalue_outer_product=True,
            compute_method=ComputeMethod.EIGEN,
            colocate_factors=False,
        )

    # Fractions outside [0, 1] are invalid.
    for invalid_fraction in (2, -1):
        with pytest.raises(ValueError):
            KFACPreconditioner(
                TinyModel(),
                grad_worker_fraction=invalid_fraction,
            )

    @distributed_test(world_size=8)
    def _f() -> None:
        # presumably invalid because 0.33 * 8 workers is not an integer
        # — confirm against KFACPreconditioner's validation.
        with pytest.raises(ValueError):
            KFACPreconditioner(TinyModel(), grad_worker_fraction=0.33)

    _f()

    # This combination only warns rather than raising.
    with pytest.warns():
        KFACPreconditioner(
            TinyModel(),
            compute_method=ComputeMethod.INVERSE,
            colocate_factors=False,
            grad_worker_fraction=DistributedStrategy.MEM_OPT,
        )
def e2e() -> None:
    """Helper to run training in simulated distributed environment.

    NOTE(review): this helper closes over ``broadcast``,
    ``accumulation_steps``, and ``kfac_args`` from an enclosing scope
    that is not visible in this chunk — confirm against the caller.
    """
    batch_size = 2
    model = TinyModel()
    criterion = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    # Build a BaseKFACPreconditioner directly from registered inverse
    # layers rather than via the KFACPreconditioner convenience wrapper.
    tdc = TorchDistributedCommunicator()
    layers = register_modules(
        model,
        KFACInverseLayer,
        allreduce_method=AllreduceMethod.ALLREDUCE,
        grad_scaler=None,
        factor_dtype=None,
        inv_dtype=torch.float32,
        skip_layers=[],
        symmetry_aware=False,
        tdc=tdc,
    )
    preconditioner = BaseKFACPreconditioner(
        layers=layers,
        assignment=LazyAssignment(broadcast=broadcast),
        tdc=tdc,
        accumulation_steps=accumulation_steps,
        **kfac_args,
    )

    # Train for a few steps, only stepping the optimizer/preconditioner
    # on iterations that complete a gradient-accumulation window.
    for i in range(1, 10):
        x = torch.rand(batch_size, 10)
        y = torch.rand(batch_size, 10)
        y_pred = model(x)
        if i % accumulation_steps == 0:
            loss = criterion(y_pred, y)
            loss.backward()
            # Capture raw gradients before preconditioning so we can
            # assert that preconditioner.step() actually modified them.
            grad_weight_linear2 = model.linear2.weight.grad
            grad_bias_linear2 = model.linear2.bias.grad
            preconditioner.step()
            # Verify gradient was preconditioned
            assert not torch.equal(
                grad_weight_linear2,
                model.linear2.weight.grad,
            )
            assert not torch.equal(
                grad_bias_linear2,
                model.linear2.bias.grad,
            )
            optimizer.step()
            optimizer.zero_grad()

    # Test state dict computes inverses
    state_dict = preconditioner.state_dict()
    # NOTE(review): _layers values appear to be (name, layer) pairs
    # given the tuple unpacking below — confirm against BaseKFACPreconditioner.
    for _, layer in preconditioner._layers.values():
        layer = cast(KFACInverseLayer, layer)
        # Clear factors and inverses so load_state_dict must restore them.
        layer.a_factor = None
        layer.g_factor = None
        layer.a_inv = None
        layer.g_inv = None
    preconditioner.load_state_dict(state_dict)
    for _, layer in preconditioner._layers.values():
        layer = cast(KFACInverseLayer, layer)
        assert isinstance(layer.a_inv, torch.Tensor)
        assert isinstance(layer.g_inv, torch.Tensor)

    # Test grad hook supports tensor input rather than tuple
    preconditioner._save_grad_output(
        model.linear1,
        torch.rand(batch_size, 10),
        torch.rand(batch_size, 20),
    )

    # Test hook additional functionality
    if preconditioner._update_factors_in_hook:
        # Reset preconditioner to ensure hooks trigger
        preconditioner._steps = 0
        preconditioner._mini_steps = defaultdict(int)
        # Large accumulation window so factors stay in their temporary
        # per-batch buffers during the next forward/backward pass.
        preconditioner._accumulation_steps = 100
        # Do forward/backward pass to verify hooks trigger and we
        # have temp factors for batch
        x = torch.rand(batch_size, 10)
        y = torch.rand(batch_size, 10)
        loss = criterion(model(x), y)
        loss.backward()
        mem_usage = preconditioner.memory_usage()
        for mem in mem_usage.values():
            assert mem > 0
        preconditioner.reset_batch()

        # Make sure hooks do not trigger when model is not in training mode
        model.eval()
        x = torch.rand(batch_size, 10)
        y = torch.rand(batch_size, 10)
        loss = criterion(model(x), y)
        loss.backward()
        mem_usage = preconditioner.memory_usage()
        for key, mem in mem_usage.items():
            if 'batch' in key:
                assert mem == 0
def _f() -> None:
    """Expect rejection of an invalid grad_worker_fraction value."""
    bad_fraction = 0.33
    with pytest.raises(ValueError):
        KFACPreconditioner(TinyModel(), grad_worker_fraction=bad_fraction)
class NestedTinyModel(torch.nn.Module): """Nested model for testing recursive module discovery.""" def __init__(self) -> None: """Init NestedTinyModel.""" super().__init__() self.tiny1 = TinyModel() self.tiny2 = TinyModel() self.tiny3 = TinyModel() @pytest.mark.parametrize( 'module,expected', ( ( TinyModel(), [ ('linear1', torch.nn.Linear), ('activation', torch.nn.ReLU), ('linear2', torch.nn.Linear), ('softmax', torch.nn.Softmax), ], ), ( NestedTinyModel(), [ ('tiny1.linear1', torch.nn.Linear), ('tiny1.activation', torch.nn.ReLU), ('tiny1.linear2', torch.nn.Linear), ('tiny1.softmax', torch.nn.Softmax), ('tiny2.linear1', torch.nn.Linear),
from testing.models import TinyModel def factor_func(scale: int, constant: bool = True) -> Callable[..., int]: """Get function which returns scale given step.""" def factor(step: int = 1) -> int: """Scale function.""" return scale if constant else scale * step return factor @pytest.mark.parametrize( 'preconditioner_type,preconditioner_kwargs', ((KFACPreconditioner, { 'model': TinyModel() }), ), ) def test_input_check( preconditioner_type: type[BaseKFACPreconditioner], preconditioner_kwargs: dict[str, Any], ) -> None: """Test raises ValueError if preconditioner was already passed lambda.""" preconditioner = preconditioner_type( **preconditioner_kwargs, factor_update_steps=factor_func(1), ) with pytest.raises(ValueError): LambdaParamScheduler( preconditioner, factor_update_steps_lambda=factor_func(1),