Example #1
    def __init__(self) -> None:
        """Init NestedTinyModel."""
        super().__init__()

        self.tiny1 = TinyModel()
        self.tiny2 = TinyModel()
        self.tiny3 = TinyModel()
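
For reference, this snippet and the examples below all build on TinyModel from testing.models. The following is a minimal sketch of what that helper plausibly looks like: the layer names come from Example #10, and the 10 -> 20 -> 10 dimensions are inferred from the tensors used in Examples #6 and #8, so the real module may differ.

import torch


class TinyModel(torch.nn.Module):
    """Sketch of the two-layer test model (layer names per Example #10)."""

    def __init__(self) -> None:
        """Init TinyModel."""
        super().__init__()
        # Dimensions are an assumption based on the 10-dim inputs/targets in
        # Example #6 and the (batch, 20) grad output tensor in Example #8.
        self.linear1 = torch.nn.Linear(10, 20)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(20, 10)
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass."""
        return self.softmax(self.linear2(self.activation(self.linear1(x))))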
Example #2
def test_preconditioner_logging(caplog: Any) -> None:
    """Test KFACPreconditioner logs relevant info."""
    caplog.set_level(logging.INFO)

    # DEBUG is below the INFO capture level, so nothing should be recorded
    KFACPreconditioner(TinyModel(), loglevel=logging.DEBUG)
    assert len(caplog.records) == 0
    caplog.clear()

    KFACPreconditioner(TinyModel(), loglevel=logging.INFO)
    messages = [r.getMessage() for r in caplog.records]
    # Should register two layers in TinyModel and have a record for each
    assert sum('Registered' in msg for msg in messages) == 2
    # Should print KAISAAssignment once
    assert sum('KAISAAssignment' in msg for msg in messages) == 1
Example #3
def test_input_check(
    preconditioner_type: type[BaseKFACPreconditioner],
    preconditioner_kwargs: dict[str, Any],
) -> None:
    """Test raises ValueError if preconditioner was already passed lambda."""
    preconditioner = preconditioner_type(
        **preconditioner_kwargs,
        factor_update_steps=factor_func(1),
    )
    with pytest.raises(ValueError):
        LambdaParamScheduler(
            preconditioner,
            factor_update_steps_lambda=factor_func(1),
        )

    preconditioner = KFACPreconditioner(
        TinyModel(),
        inv_update_steps=factor_func(1),
    )
    with pytest.raises(ValueError):
        LambdaParamScheduler(
            preconditioner,
            inv_update_steps_lambda=factor_func(1),
        )

    preconditioner = KFACPreconditioner(TinyModel(), damping=factor_func(1))
    with pytest.raises(ValueError):
        LambdaParamScheduler(preconditioner, damping_lambda=factor_func(1))

    preconditioner = KFACPreconditioner(
        TinyModel(),
        factor_decay=factor_func(1),
    )
    with pytest.raises(ValueError):
        LambdaParamScheduler(
            preconditioner,
            factor_decay_lambda=factor_func(1),
        )

    preconditioner = KFACPreconditioner(TinyModel(), kl_clip=factor_func(1))
    with pytest.raises(ValueError):
        LambdaParamScheduler(preconditioner, kl_clip_lambda=factor_func(1))

    preconditioner = KFACPreconditioner(TinyModel(), lr=factor_func(1))
    with pytest.raises(ValueError):
        LambdaParamScheduler(preconditioner, lr_lambda=factor_func(1))
Example #4
    def _f() -> None:
        p1 = KFACPreconditioner(TinyModel(), grad_worker_fraction=1)
        p2 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.COMM_OPT,
        )
        assert p1.distributed_strategy == p2.distributed_strategy
        assert p1.grad_worker_fraction == p2.grad_worker_fraction

        p1 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.HYBRID_OPT,
        )
        assert p1.grad_worker_fraction == 0.5

        p1 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.MEM_OPT,
        )
        assert p1.grad_worker_fraction == 0.25

        p1 = KFACPreconditioner(TinyModel(), grad_worker_fraction=0)
        assert p1.grad_worker_fraction == 0.25
        assert p1.distributed_strategy == DistributedStrategy.MEM_OPT

        p1 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=0.5,
        )
        assert p1.distributed_strategy == DistributedStrategy.HYBRID_OPT
Example #5
def test_preconditioner_init() -> None:
    """Test KFACPreconditioner initialization."""
    p1 = KFACPreconditioner(TinyModel(), assignment_strategy='memory')
    p2 = KFACPreconditioner(
        TinyModel(),
        assignment_strategy=AssignmentStrategy.MEMORY,
    )
    assert p1.assignment_strategy == p2.assignment_strategy

    p1 = KFACPreconditioner(TinyModel(), compute_method='inverse')
    p2 = KFACPreconditioner(TinyModel(), compute_method=ComputeMethod.INVERSE)
    assert p1.compute_method == p2.compute_method

    @distributed_test(world_size=4)
    def _f() -> None:
        p1 = KFACPreconditioner(TinyModel(), grad_worker_fraction=1)
        p2 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.COMM_OPT,
        )
        assert p1.distributed_strategy == p2.distributed_strategy
        assert p1.grad_worker_fraction == p2.grad_worker_fraction

        p1 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.HYBRID_OPT,
        )
        assert p1.grad_worker_fraction == 0.5

        p1 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=DistributedStrategy.MEM_OPT,
        )
        assert p1.grad_worker_fraction == 0.25

        p1 = KFACPreconditioner(TinyModel(), grad_worker_fraction=0)
        assert p1.grad_worker_fraction == 0.25
        assert p1.distributed_strategy == DistributedStrategy.MEM_OPT

        p1 = KFACPreconditioner(
            TinyModel(),
            grad_worker_fraction=0.5,
        )
        assert p1.distributed_strategy == DistributedStrategy.HYBRID_OPT

    _f()

    p1 = KFACPreconditioner(TinyModel(), allreduce_bucket_cap_mb=25)
    assert p1.allreduce_method == AllreduceMethod.ALLREDUCE_BUCKETED

    p1 = KFACPreconditioner(TinyModel(), allreduce_bucket_cap_mb=0)
    assert p1.allreduce_method == AllreduceMethod.ALLREDUCE
Example #6
def train(grad_worker_frac: float) -> None:
    """Train TinyModel with KFAC on random data."""
    batch_size = 4
    in_features = 10
    out_features = 10
    steps = 20

    x = torch.rand(batch_size, in_features)
    y = torch.rand(batch_size, out_features)
    if torch.distributed.is_initialized():
        # All-reduce so every rank trains on identical data
        torch.distributed.all_reduce(x)
        torch.distributed.all_reduce(y)

    model: torch.nn.Module = TinyModel()
    if torch.distributed.is_initialized():
        model = torch.nn.parallel.DistributedDataParallel(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    preconditioner = KFACPreconditioner(
        model,
        factor_update_steps=5,
        inv_update_steps=10,
        grad_worker_fraction=grad_worker_frac,
        allreduce_bucket_cap_mb=0,
        update_factors_in_hook=False,
    )
    criterion = torch.nn.MSELoss(reduction='sum')

    losses = []
    for _ in range(steps):
        y_pred = model(x)
        loss = criterion(y_pred, y)
        losses.append(loss.item())
        loss.backward()
        preconditioner.step()
        optimizer.step()
        optimizer.zero_grad()

    assert losses[0] > losses[-1]
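
The train() function above has no caller in this snippet. Below is a minimal, hypothetical single-process driver (the test name is illustrative); exercising fractional values such as 0.5 or 0.25 would additionally require a multi-rank launch like the @distributed_test decorator used in Examples #5 and #7.

def test_train_single_process() -> None:
    """Hypothetical driver: one process acts as the only gradient worker."""
    train(grad_worker_frac=1.0)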
Example #7
def test_preconditioner_init_raises() -> None:
    """Test KFACPreconditioner argument validation."""
    with pytest.raises(ValueError):
        KFACPreconditioner(TinyModel(), allreduce_bucket_cap_mb=-1)

    # Should not raise: these options are accepted with the inverse method
    KFACPreconditioner(
        TinyModel(),
        compute_eigenvalue_outer_product=True,
        compute_method=ComputeMethod.INVERSE,
        colocate_factors=False,
    )
    with pytest.raises(ValueError):
        KFACPreconditioner(
            TinyModel(),
            compute_eigenvalue_outer_product=True,
            compute_method=ComputeMethod.EIGEN,
            colocate_factors=False,
        )

    with pytest.raises(ValueError):
        KFACPreconditioner(TinyModel(), grad_worker_fraction=2)

    with pytest.raises(ValueError):
        KFACPreconditioner(TinyModel(), grad_worker_fraction=-1)

    @distributed_test(world_size=8)
    def _f() -> None:
        with pytest.raises(ValueError):
            KFACPreconditioner(TinyModel(), grad_worker_fraction=0.33)

    _f()

    with pytest.warns():
        KFACPreconditioner(
            TinyModel(),
            compute_method=ComputeMethod.INVERSE,
            colocate_factors=False,
            grad_worker_fraction=DistributedStrategy.MEM_OPT,
        )
Example #8
    def e2e() -> None:
        """Helper to run training in simulated distributed environment."""
        batch_size = 2
        model = TinyModel()
        criterion = torch.nn.MSELoss(reduction='sum')
        optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

        tdc = TorchDistributedCommunicator()
        layers = register_modules(
            model,
            KFACInverseLayer,
            allreduce_method=AllreduceMethod.ALLREDUCE,
            grad_scaler=None,
            factor_dtype=None,
            inv_dtype=torch.float32,
            skip_layers=[],
            symmetry_aware=False,
            tdc=tdc,
        )
        preconditioner = BaseKFACPreconditioner(
            layers=layers,
            assignment=LazyAssignment(broadcast=broadcast),
            tdc=tdc,
            accumulation_steps=accumulation_steps,
            **kfac_args,
        )

        for i in range(1, 10):
            x = torch.rand(batch_size, 10)
            y = torch.rand(batch_size, 10)
            y_pred = model(x)
            if i % accumulation_steps == 0:
                loss = criterion(y_pred, y)
                loss.backward()
                grad_weight_linear2 = model.linear2.weight.grad
                grad_bias_linear2 = model.linear2.bias.grad
                preconditioner.step()
                # Verify gradient was preconditioned
                assert not torch.equal(
                    grad_weight_linear2,
                    model.linear2.weight.grad,
                )
                assert not torch.equal(
                    grad_bias_linear2,
                    model.linear2.bias.grad,
                )
                optimizer.step()
                optimizer.zero_grad()

        # Test state dict computes inverses
        state_dict = preconditioner.state_dict()
        for _, layer in preconditioner._layers.values():
            layer = cast(KFACInverseLayer, layer)
            layer.a_factor = None
            layer.g_factor = None
            layer.a_inv = None
            layer.g_inv = None
        preconditioner.load_state_dict(state_dict)
        for _, layer in preconditioner._layers.values():
            layer = cast(KFACInverseLayer, layer)
            assert isinstance(layer.a_inv, torch.Tensor)
            assert isinstance(layer.g_inv, torch.Tensor)

        # Test grad hook supports tensor input rather than tuple
        preconditioner._save_grad_output(
            model.linear1,
            torch.rand(batch_size, 10),
            torch.rand(batch_size, 20),
        )

        # Test hook additional functionality
        if preconditioner._update_factors_in_hook:
            # Reset preconditioner to ensure hooks trigger
            preconditioner._steps = 0
            preconditioner._mini_steps = defaultdict(int)
            preconditioner._accumulation_steps = 100

            # Do forward/backward pass to verify hooks trigger and we
            # have temp factors for batch
            x = torch.rand(batch_size, 10)
            y = torch.rand(batch_size, 10)
            loss = criterion(model(x), y)
            loss.backward()
            mem_usage = preconditioner.memory_usage()
            for mem in mem_usage.values():
                assert mem > 0
            preconditioner.reset_batch()

            # Make sure hooks do not trigger when model is not in training mode
            model.eval()
            x = torch.rand(batch_size, 10)
            y = torch.rand(batch_size, 10)
            loss = criterion(model(x), y)
            loss.backward()
            mem_usage = preconditioner.memory_usage()
            for key, mem in mem_usage.items():
                if 'batch' in key:
                    assert mem == 0
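
The e2e() helper above closes over broadcast, accumulation_steps, and kfac_args, which are bound by an enclosing test that is cut off in this snippet. The wrapper below is a hypothetical illustration of how those names could be bound; the test name, parameter values, and kfac_args contents are assumptions, not the library's actual parametrization.

from typing import Any

import pytest


@pytest.mark.parametrize(
    'broadcast,accumulation_steps,kfac_args',
    (
        (True, 1, {'update_factors_in_hook': True}),
        (False, 2, {'update_factors_in_hook': False}),
    ),
)
def test_e2e(
    broadcast: bool,
    accumulation_steps: int,
    kfac_args: dict[str, Any],
) -> None:
    """Hypothetical enclosing test that binds the closure variables."""

    def e2e() -> None:
        ...  # body exactly as shown above in Example #8

    e2e()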
Example #9
    def _f() -> None:
        with pytest.raises(ValueError):
            KFACPreconditioner(TinyModel(), grad_worker_fraction=0.33)
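
The snippet does not show why 0.33 is rejected at world_size=8. One plausible (hypothetical, not the library's actual code) validation is that grad_worker_fraction must translate into a whole number of gradient workers, which is consistent with the values accepted and rejected in Examples #5 and #7:

def fraction_is_valid(grad_worker_fraction: float, world_size: int) -> bool:
    """Hypothetical check consistent with Examples #5 and #7."""
    if not 0 <= grad_worker_fraction <= 1:
        return False  # e.g. -1 and 2 raise in Example #7
    if grad_worker_fraction == 0:
        return True  # coerced to 1 / world_size (MEM_OPT) in Example #5
    grad_workers = grad_worker_fraction * world_size
    return grad_workers == round(grad_workers)


assert fraction_is_valid(0.5, world_size=4)       # HYBRID_OPT in Example #5
assert fraction_is_valid(0.25, world_size=4)      # MEM_OPT in Example #5
assert not fraction_is_valid(0.33, world_size=8)  # rejected in Example #7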
Example #10
class NestedTinyModel(torch.nn.Module):
    """Nested model for testing recursive module discovery."""
    def __init__(self) -> None:
        """Init NestedTinyModel."""
        super().__init__()

        self.tiny1 = TinyModel()
        self.tiny2 = TinyModel()
        self.tiny3 = TinyModel()


@pytest.mark.parametrize(
    'module,expected',
    (
        (
            TinyModel(),
            [
                ('linear1', torch.nn.Linear),
                ('activation', torch.nn.ReLU),
                ('linear2', torch.nn.Linear),
                ('softmax', torch.nn.Softmax),
            ],
        ),
        (
            NestedTinyModel(),
            [
                ('tiny1.linear1', torch.nn.Linear),
                ('tiny1.activation', torch.nn.ReLU),
                ('tiny1.linear2', torch.nn.Linear),
                ('tiny1.softmax', torch.nn.Softmax),
                ('tiny2.linear1', torch.nn.Linear),
Example #11
from testing.models import TinyModel


def factor_func(scale: int, constant: bool = True) -> Callable[..., int]:
    """Get function which returns scale given step."""
    def factor(step: int = 1) -> int:
        """Scale function."""
        return scale if constant else scale * step

    return factor


@pytest.mark.parametrize(
    'preconditioner_type,preconditioner_kwargs',
    ((KFACPreconditioner, {'model': TinyModel()}),),
)
def test_input_check(
    preconditioner_type: type[BaseKFACPreconditioner],
    preconditioner_kwargs: dict[str, Any],
) -> None:
    """Test raises ValueError if preconditioner was already passed lambda."""
    preconditioner = preconditioner_type(
        **preconditioner_kwargs,
        factor_update_steps=factor_func(1),
    )
    with pytest.raises(ValueError):
        LambdaParamScheduler(
            preconditioner,
            factor_update_steps_lambda=factor_func(1),