Example #1
class ModelAPPNP(torch.nn.Module):

    def __init__(self, K, alpha, hidden, activation, data):
        super(ModelAPPNP, self).__init__()
        self.linear_1 = Linear(data.num_features, hidden)
        self.conv = APPNP(K, alpha)
        self.linear_2 = Linear(hidden, data.num_class)
        if activation == "relu":
            self.activation = relu
        elif activation == "leaky_relu":
            self.activation = leaky_relu
        self.reg_params = list(self.linear_1.parameters()) + list(self.conv.parameters()) + list(
            self.linear_2.parameters())

    def reset_parameters(self):
        self.linear_1.reset_parameters()
        self.linear_2.reset_parameters()

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weight
        edge_index, edge_weight = dropout_adj(edge_index, edge_attr=edge_weight, p=0.8, training=self.training)
        x = self.linear_1(x)
        x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        x = self.conv(x, edge_index, edge_weight=edge_weight)
        x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        x = self.linear_2(x)
        return log_softmax(x, dim=-1)
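A minimal training sketch for the ModelAPPNP class above, run on a tiny random graph. The data container, graph sizes, and hyper-parameters below are illustrative assumptions, not part of the original snippet; it assumes the class and its torch_geometric imports are available.

# Hypothetical usage sketch (assumed sizes and hyper-parameters).
import torch
from types import SimpleNamespace
from torch.nn.functional import nll_loss

num_nodes, num_features, num_class = 100, 16, 3
data = SimpleNamespace(
    x=torch.randn(num_nodes, num_features),
    edge_index=torch.randint(0, num_nodes, (2, 500)),
    edge_weight=None,
    y=torch.randint(0, num_class, (num_nodes,)),
    num_features=num_features,
    num_class=num_class,
)

model = ModelAPPNP(K=10, alpha=0.1, hidden=64, activation="relu", data=data)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    loss = nll_loss(model(data), data.y)  # forward returns log-probabilities
    loss.backward()
    optimizer.step()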
Example #2
    def __init__(self,
                 in_channels,
                 out_channels,
                 hiddens=[],
                 activations=[],
                 dropout=0.5,
                 l2_norm=5e-5,
                 lr=0.2,
                 use_bias=False):
        super().__init__()

        if len(hiddens) != len(activations):
            raise RuntimeError(
                f"Arguments 'hiddens' and 'activations' should have the same length."
                " Or you can set both of them to `[]`.")

        self.layers = ModuleList()

        paras = []
        inc = in_channels
        for hidden, activation in zip(hiddens, activations):
            layer = Linear(inc, hidden, bias=use_bias)
            paras.append(dict(params=layer.parameters(), weight_decay=l2_norm))
            self.layers.append(layer)
            inc = hidden

        layer = Linear(inc, out_channels, bias=use_bias)
        self.layers.append(layer)
        # do not use weight_decay in the final layer
        paras.append(dict(params=layer.parameters(), weight_decay=0.0))

        self.dropout = Dropout(dropout)
        self.optimizer = optim.Adam(paras, lr=lr)
        self.loss_fn = torch.nn.CrossEntropyLoss()
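A small sketch of the per-layer weight-decay pattern used above: each Linear layer contributes its own parameter group, so Adam can apply a different weight_decay per group. The layer sizes and values here are illustrative assumptions.

# Hypothetical illustration of per-parameter-group weight decay.
import torch
from torch import optim
from torch.nn import Linear

hidden_layer = Linear(16, 32)
output_layer = Linear(32, 7)
paras = [
    dict(params=hidden_layer.parameters(), weight_decay=5e-5),  # regularized
    dict(params=output_layer.parameters(), weight_decay=0.0),   # no decay
]
optimizer = optim.Adam(paras, lr=0.2)
print([group["weight_decay"] for group in optimizer.param_groups])  # [5e-05, 0.0]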
Example #3
class ModelGAT(torch.nn.Module):
    def __init__(self, num_layers, hidden_list, activation, data):
        super(ModelGAT, self).__init__()
        assert len(hidden_list) == num_layers + 1
        self.linear_1 = Linear(data.num_features, hidden_list[0])
        self.convs = torch.nn.ModuleList()
        for i in range(num_layers):
            self.convs.append(GATConv(hidden_list[i], hidden_list[i + 1]))
        self.linear_2 = Linear(hidden_list[-1], data.num_class)
        if activation == "relu":
            self.activation = relu
        elif activation == "leaky_relu":
            self.activation = leaky_relu
        self.reg_params = list(self.linear_1.parameters()) + list(
            self.convs.parameters()) + list(self.linear_2.parameters())

    def reset_parameters(self):
        self.linear_1.reset_parameters()
        for conv in self.convs:
            conv.reset_parameters()
        self.linear_2.reset_parameters()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.linear_1(x)
        x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        for i in range(len(self.convs)):
            x = self.convs[i](x, edge_index)
            if i != len(self.convs) - 1:
                x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        x = self.linear_2(x)
        return log_softmax(x, dim=-1)
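A quick shape check for the ModelGAT class above on a tiny random graph; the data container and sizes are illustrative assumptions (GATConv is used with its default single attention head).

# Hypothetical smoke test with assumed sizes.
import torch
from types import SimpleNamespace

num_nodes, num_features, num_class = 50, 8, 4
data = SimpleNamespace(
    x=torch.randn(num_nodes, num_features),
    edge_index=torch.randint(0, num_nodes, (2, 200)),
    y=torch.randint(0, num_class, (num_nodes,)),
    num_features=num_features,
    num_class=num_class,
)

model = ModelGAT(num_layers=2, hidden_list=[32, 16, 16], activation="relu", data=data)
out = model(data)
print(out.shape)  # [50, 4] log-probabilities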
Example #4
def _test_basic_func(rank,
                     world_size,
                     tempfile_name,
                     test_case,
                     oss,
                     model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2)
        model.bias.data.fill_(0.0)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    assert oss in ["none", "ada-oss", "wrapper-oss", "oss-wrapper"]
    if oss == "ada-oss":
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    elif oss == "wrapper-oss":
        optim = AdaScaleWrapper(model.parameters(),
                                optim_cls=OSS,
                                optim=SGD,
                                lr=0.1)
    elif oss == "oss-wrapper":
        optim = OSS(model.parameters(), AdaScaleWrapper, optim_cls=SGD, lr=0.1)
    else:
        assert oss == "none"
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    if "expected_gain" in test_case:
        assert np.allclose(optim.gain(),
                           test_case["expected_gain"]), "{} vs {}".format(
                               optim.gain(), test_case["expected_gain"])

    if "expected_mean_weight" in test_case:
        mean_weight = mean(
            [model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight,
                           test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
Example #5
class SimpleDDPGAgent(Module):
    def __init__(self, **kwargs):
        super(SimpleDDPGAgent, self).__init__()

        hidden_size = kwargs['hidden_size']
        # actor
        self.actor_linears = ModuleList(
            [Linear(kwargs['state_dim'], hidden_size[0])])
        self.actor_linears.extend([
            Linear(hidden_size[i - 1], hidden_size[i])
            for i in range(1, len(hidden_size))
        ])
        self.action = Linear(hidden_size[-1], kwargs['action_dim'])

        # critic
        self.critic_linears = ModuleList([
            Linear(kwargs['state_dim'] + kwargs['action_dim'], hidden_size[0])
        ])
        self.critic_linears.extend([
            Linear(hidden_size[i - 1], hidden_size[i])
            for i in range(1, len(hidden_size))
        ])
        self.q = Linear(hidden_size[-1], 1)

        self.relu = ReLU()
        self.sigmoid = Sigmoid()
        self.tanh = Tanh()

        self.apply(init_weights)  # xavier uniform init

    def act(self, state):
        x = state
        for l in self.actor_linears:
            x = l(x)
            x = self.relu(x)
        action = self.tanh(self.action(x))
        return action

    def Q(self, state, action):
        x = torch.cat([state, action], dim=1)
        for l in self.critic_linears:
            x = l(x)
            x = self.relu(x)
        q = self.q(x)
        return q

    def get_actor_parameters(self):
        return list(self.actor_linears.parameters()) + list(
            self.action.parameters())

    def get_critic_parameters(self):
        return list(self.critic_linears.parameters()) + list(
            self.q.parameters())
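A quick smoke test of the SimpleDDPGAgent above; the kwargs, batch size, and dimensions are illustrative assumptions, and init_weights must be defined as in the source module.

# Hypothetical smoke test with assumed dimensions.
import torch

agent = SimpleDDPGAgent(state_dim=8, action_dim=2, hidden_size=[64, 64])
state = torch.randn(5, 8)         # batch of 5 states
action = agent.act(state)         # tanh-squashed actions, shape [5, 2]
q_value = agent.Q(state, action)  # critic estimate, shape [5, 1]
print(action.shape, q_value.shape)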
Example #6
def test_save_load_checkpoints():
    experts = {}
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), 0.0)
    expert_name = 'test_expert'
    args_schema = (BatchTensorDescriptor(1), )
    experts[expert_name] = ExpertBackend(
        name=expert_name,
        expert=expert,
        opt=opt,
        args_schema=args_schema,
        outputs_schema=BatchTensorDescriptor(1),
        max_batch_size=1,
    )
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        expert.weight.data[0] = 1
        store_experts(experts, tmp_path)
        expert.weight.data[0] = 2
        store_experts(experts, tmp_path)
        expert.weight.data[0] = 3
        store_experts(experts, tmp_path)

        checkpoints_dir = tmp_path / expert_name

        assert checkpoints_dir.exists()
        assert len(list(checkpoints_dir.iterdir())) == 3

        expert.weight.data[0] = 4

        load_weights(experts, tmp_path)
        assert expert.weight.data[0] == 3
Example #7
    def test_one_iteration(self):
        """Test FSDP with uneven divide of parameter shards."""
        model = Linear(3, 3, bias=False)
        input = torch.rand(8, 3)
        my_lr = 0.1

        ref_forward_output_my_rank, ref_weight_out = self._get_ref_results(
            model, input, my_lr
        )

        model.to(self.rank)
        model = FSDP(model)
        optim = SGD(model.parameters(), lr=my_lr)
        self.assertTrue(len(input) >= self.world_size)
        in_data = torch.Tensor(input[self.rank]).to(self.rank)
        out = model(in_data)
        out.float().sum().backward()
        optim.step()
        optim.zero_grad()

        with model._summon_full_params():
            torch.cuda.synchronize()  # TODO: This is here because it was
            # originally part of get_full_params(), debug why it is needed here.
            weight_out = model.module.weight.T.clone()
            self.assertEqual(ref_forward_output_my_rank, out)
            self.assertEqual(ref_weight_out, weight_out)
Example #8
def train(data_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Linear(3, 1).to(device)
    lr = 1e-3
    optimizer = AdamW(net.parameters(), lr)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
    criterion = CrossEntropyLoss()
    writer = SummaryWriter(
        log_dir=r'C:\Users\andre\OneDrive\Рабочий стол\train\checkpoints',
        comment="Batch loss")
    max_epochs = 40
    for ep_id in range(max_epochs):
        net.train()
        for b_id, batch in enumerate(data_loader):
            optimizer.zero_grad()
            output = net(batch['data'].to(device))
            loss = criterion(output, batch['ground_truth'].to(device))
            writer.add_scalar("Batch loss", loss.item(),
                              ep_id * len(data_loader) + b_id)
            loss.backward()
            optimizer.step()
        scheduler.step()
        torch.save(
            {
                'epoch': ep_id,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, '/checkpoints/checkpoints.txt')
    writer.close()
Example #9
def test_grad_accum_cpu(cpu=True):
    """Test the basic functionality on CPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    for expected_gain in [2.0, 2.0]:  # test 2 iterations to catch more corner cases.
        # grad pass 1
        in_data = Tensor([0.0, 1.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # grad pass 2
        in_data = Tensor([1.0, 0.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # Stepping the optimizer. Note that if we did more than the 2 passes promised
        # by the num_gradients_to_accumulate argument above, AdaScale is not able to
        # detect that mistake for now. The result would simply be wrong in that case.
        assert np.allclose(optim.gain(), expected_gain), optim.gain()
        optim.step()
        optim.zero_grad()
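A condensed, stand-alone version of the pattern tested above: accumulate two backward passes per optimizer step and read the AdaScale gain before stepping. The fairscale import path and the inputs mirror the test; treat this as a sketch rather than a canonical recipe.

# Hypothetical stand-alone sketch of AdaScale with gradient accumulation.
import torch
from torch.nn import Linear
from torch.optim import SGD
from fairscale.optim import AdaScale

model = Linear(2, 2, bias=False)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

for _ in range(2):
    model(torch.tensor([0.0, 1.0])).sum().backward()  # accumulation pass 1
    model(torch.tensor([1.0, 0.0])).sum().backward()  # accumulation pass 2
    print(optim.gain())  # ~2.0 for these orthogonal gradients
    optim.step()
    optim.zero_grad()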
Example #10
def test_grad_accum(test_case, cpu):
    """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        if torch.cuda.device_count() < 1:
            pytest.skip("1 GPU is required")
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    expected_gain = test_case["expected_gain"]
    if "input" in test_case:
        data = [test_case["input"]] * 2
        gains = [expected_gain] * 2
    else:
        data = test_case["inputs"]
        gains = [None, expected_gain]
    for in_data, exp_gain in zip(data, gains):  # test 2 iterations to catch more corner cases.
        # grad pass 1
        in_data_0 = Tensor(in_data[0])
        if not cpu:
            in_data_0 = in_data_0.cuda()
        out = model(in_data_0)
        out.sum().backward()
        # grad pass 2
        in_data_1 = Tensor(in_data[1])
        if not cpu:
            in_data_1 = in_data_1.cuda()
        out = model(in_data_1)
        out.sum().backward()
        if exp_gain is not None:
            assert np.allclose(optim.gain(), exp_gain), optim.gain()
        # Stepping the optimizer. Note that if we did more than the 2 passes promised
        # by the num_gradients_to_accumulate argument above, AdaScale is not able to
        # detect that mistake for now. The result would simply be wrong in that case.
        optim.step()
        optim.zero_grad()
Example #11
def test_restore_update_count():
    experts = {}
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), 0.0)
    expert_name = 'test_expert'
    args_schema = (BatchTensorDescriptor(1),)
    expert_backend = ExpertBackend(name=expert_name, expert=expert, opt=opt,
                                   args_schema=args_schema,
                                   outputs_schema=BatchTensorDescriptor(1),
                                   max_batch_size=1,
                                   )
    experts[expert_name] = expert_backend

    batch = torch.randn(1, 1)
    loss_grad = torch.randn(1, 1)

    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        for _ in range(BACKWARD_PASSES_BEFORE_SAVE):
            expert_backend.backward(batch, loss_grad)

        store_experts(experts, tmp_path)

        for _ in range(BACKWARD_PASSES_AFTER_SAVE):
            expert_backend.backward(batch, loss_grad)

        load_weights(experts, tmp_path)
        assert experts[expert_name].update_count == BACKWARD_PASSES_BEFORE_SAVE
Example #12
def test_unhook():
    """Test unhook that frees the tensor from CUDA memory."""
    model = Linear(123, 456,
                   bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2)

    def find_tensor():
        """ Find the weight tensor from the heap

            Return True if found.
        """
        for obj in gc.get_objects():
            try:
                # Only need to check parameter type objects
                if "torch.nn.parameter.Parameter" not in str(type(obj)):
                    continue
                if torch.is_tensor(obj) or (hasattr(obj, "data")
                                            and torch.is_tensor(obj.data)):
                    if obj.shape == (456, 123):
                        return True
            except Exception:
                pass
        return False

    torch.cuda.empty_cache()
    assert find_tensor(
    ), "something wrong with gc-based method to find the tensor"

    optim.unhook()
    del model
    del optim
    torch.cuda.empty_cache()
    assert not find_tensor(), "tensor should have been released"
Example #13
def test_create_supervised():
    model = Linear(1, 1)
    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)
    trainer = create_supervised(model, optimizer, mse_loss)

    x = torch.FloatTensor([[1.0], [2.0]])
    y = torch.FloatTensor([[3.0], [5.0]])
    data = [(x, y)]

    trainer.validate(data)
    y_pred, y = trainer.validation_history[0]

    assert y_pred[0, 0] == approx(0.0)
    assert y_pred[1, 0] == approx(0.0)
    assert y[0, 0] == approx(3.0)
    assert y[1, 0] == approx(5.0)

    assert model.weight.data[0, 0] == approx(0.0)
    assert model.bias.data[0] == approx(0.0)

    trainer.run(data)
    loss = trainer.training_history[0]

    assert loss == approx(17.0)
    assert model.weight.data[0, 0] == approx(1.3)
    assert model.bias.data[0] == approx(0.8)
Example #14
def _test_create_supervised_trainer(
    model_device: Optional[str] = None,
    trainer_device: Optional[str] = None,
    trace: bool = False,
    amp_mode: Optional[str] = None,
    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
):
    model = Linear(1, 1)

    if model_device:
        model.to(model_device)

    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)

    if trace:
        example_input = torch.randn(1, 1)
        model = torch.jit.trace(model, example_input)

    if amp_mode == "apex" and model_device == trainer_device == "cuda":
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    trainer = create_supervised_trainer(
        model,
        optimizer,
        mse_loss,
        device=trainer_device,
        output_transform=lambda x, y, y_pred, loss: (y_pred, loss.item()),
        amp_mode=amp_mode,
        scaler=scaler,
    )

    x = torch.tensor([[0.1], [0.2]])
    y = torch.tensor([[0.3], [0.5]])
    data = [(x, y)]

    assert model.weight.data[0, 0].item() == approx(0.0)
    assert model.bias.item() == approx(0.0)

    if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
        state = trainer.run(data)

        assert state.output[-1] == approx(0.17), state.output[-1]
        assert round(model.weight.data[0, 0].item(), 3) == approx(0.013), model.weight.item()
        assert round(model.bias.item(), 3) == approx(0.08), model.bias.item()

        if amp_mode == "amp":
            assert state.output[0].dtype is torch.half
            if scaler and isinstance(scaler, bool):
                assert hasattr(state, "scaler")
            else:
                assert not hasattr(state, "scaler")
    else:
        if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
            # This is broken in 1.6.0 but will probably be fixed in 1.7.0
            with pytest.raises(RuntimeError, match=r"is on CPU, but expected them to be on GPU"):
                trainer.run(data)
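For reference, a minimal end-to-end use of ignite's create_supervised_trainer, mirroring the test helper above without the amp/trace branches; the data and learning rate are illustrative assumptions.

# Hypothetical minimal usage of create_supervised_trainer.
import torch
from torch.nn import Linear
from torch.nn.functional import mse_loss
from torch.optim import SGD
from ignite.engine import create_supervised_trainer

model = Linear(1, 1)
optimizer = SGD(model.parameters(), lr=0.1)
trainer = create_supervised_trainer(model, optimizer, mse_loss)

data = [(torch.tensor([[0.1], [0.2]]), torch.tensor([[0.3], [0.5]]))]
state = trainer.run(data, max_epochs=1)
print(state.output)  # loss value of the last iteration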
Example #15
def _test_basic_func(rank, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        assert np.allclose(optim.gain(),
                           test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        assert np.allclose(optim.gain(),
                           test_case["expected_gain"]), optim.gain()

    dist.destroy_process_group()
Example #16
def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2)

    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad

    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad

    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(),
                       [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad
Example #17
def test_save_load_checkpoints():
    experts = {}
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), 0.0)
    expert_name = 'test_expert'
    args_schema = (BatchTensorDescriptor(1),)
    experts[expert_name] = ExpertBackend(name=expert_name, expert=expert, opt=opt,
                                         args_schema=args_schema,
                                         outputs_schema=BatchTensorDescriptor(1),
                                         max_batch_size=1,
                                         )
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        for i in range(1, EXPERT_WEIGHT_UPDATES + 1):
            expert.weight.data[0] = i
            store_experts(experts, tmp_path)

        checkpoints_dir = tmp_path / expert_name

        assert checkpoints_dir.exists()
        # include checkpoint_last.pt
        assert len(list(checkpoints_dir.iterdir())) == EXPERT_WEIGHT_UPDATES + 1

        expert.weight.data[0] = 0

        load_weights(experts, tmp_path)
        assert expert.weight.data[0] == EXPERT_WEIGHT_UPDATES
Example #18
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]

    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2)
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()

    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()

    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()
Example #19
def _test_grad_accum_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="gloo")  # Covers gloo

    model = Linear(4, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    with model.no_sync():
        # iter 1: input vectors point along dim0 and dim1
        in_data = Tensor([0.0] * 4)
        in_data[rank] = 1.0
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
    # iter 2: input vectors point along dim2 and dim3
    in_data = Tensor([0.0] * 4)
    in_data[rank + 2] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    # since all inputs are orthogonal, the gain should be exactly 4.0.
    assert np.allclose(optim.gain(), 4.0), optim.gain()
    optim.step()
    optim.zero_grad()

    dist.destroy_process_group()
Example #20
def _test_apex_average(device, amp_mode, opt_level):
    assert amp_mode == "apex"
    assert device == "cuda"

    model = Linear(1, 1)

    if device:
        model.to(device)

    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)

    from apex import amp

    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

    mean_var = VariableAccumulation(lambda a, x: a + x)
    y_true = torch.rand(100).float().to(device)

    for y in y_true:
        mean_var.update(y)

    a, n = mean_var.compute()
    assert a.item() == pytest.approx(y_true.sum().item())
    assert n == len(y_true)
Example #21
def _test_create_mocked_supervised_trainer(
    model_device: Optional[str] = None,
    trainer_device: Optional[str] = None,
    trace: bool = False,
    amp_mode: Optional[str] = None,
    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
):
    with mock.patch("ignite.engine.supervised_training_step_amp") as training_step_amp_mock:
        with mock.patch("ignite.engine.supervised_training_step_apex") as training_step_apex_mock:
            with mock.patch("ignite.engine.supervised_training_step_tpu") as training_step_tpu_mock:
                with mock.patch("ignite.engine.supervised_training_step") as training_step_mock:
                    model = Linear(1, 1)

                    if model_device:
                        model.to(model_device)

                    model.weight.data.zero_()
                    model.bias.data.zero_()
                    optimizer = SGD(model.parameters(), 0.1)

                    if trace:
                        example_input = torch.randn(1, 1)
                        model = torch.jit.trace(model, example_input)

                    if amp_mode == "apex" and model_device == trainer_device == "cuda":
                        from apex import amp

                        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

                    trainer = create_supervised_trainer(
                        model,
                        optimizer,
                        mse_loss,
                        device=trainer_device,
                        output_transform=lambda x, y, y_pred, loss: (y_pred, loss.item()),
                        amp_mode=amp_mode,
                        scaler=scaler,
                    )

                    x = torch.tensor([[0.1], [0.2]])
                    y = torch.tensor([[0.3], [0.5]])
                    data = [(x, y)]

                    assert model.weight.data[0, 0].item() == approx(0.0)
                    assert model.bias.item() == approx(0.0)

                    on_tpu = "xla" in trainer_device if trainer_device is not None else False
                    mode, _ = _check_arg(on_tpu, amp_mode, scaler)

                    if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
                        trainer.run(data)

                        if mode == "amp":
                            assert training_step_amp_mock.called
                        elif mode == "apex":
                            assert training_step_apex_mock.called
                        elif mode == "tpu":
                            assert training_step_tpu_mock.called
                        else:
                            assert training_step_mock.called
Example #22
def test_basic_cpu():
    """Test single batch behavior on CPU"""
    model = Linear(2, 2, bias=False)
    try:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    except RuntimeError:
        return
    assert False, "Single batch AdaScale should not be suppported"
Example #23
class ModelGCN(torch.nn.Module):
    def __init__(self, num_layers, hidden_list, activation, data):
        super(ModelGCN, self).__init__()
        assert len(hidden_list) == num_layers + 1
        self.linear_1 = Linear(data.num_features, hidden_list[0])
        self.convs = torch.nn.ModuleList()
        for i in range(num_layers):
            self.convs.append(GCNConv(hidden_list[i], hidden_list[i + 1]))
        self.JK = JumpingKnowledge(mode='max')
        self.linear_2 = Linear(hidden_list[-1], data.num_class)
        if activation == "relu":
            self.activation = relu
        elif activation == "leaky_relu":
            self.activation = leaky_relu
        self.reg_params = list(self.linear_1.parameters()) + list(
            self.convs.parameters()) + list(self.JK.parameters()) + list(
                self.linear_2.parameters())

    def reset_parameters(self):
        self.linear_1.reset_parameters()
        for conv in self.convs:
            conv.reset_parameters()
        self.linear_2.reset_parameters()

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weight
        edge_index, edge_weight = dropout_adj(edge_index,
                                              edge_attr=edge_weight,
                                              p=0.8,
                                              training=self.training)
        x_jk = []
        x = self.linear_1(x)
        x = self.activation(x)
        x_jk.append(dropout(x, p=0.5, training=self.training))
        for i in range(len(self.convs)):
            x = self.convs[i](x_jk[-1], edge_index, edge_weight=edge_weight)
            if i != len(self.convs) - 1:
                x_jk.append(self.activation(x))
            else:
                x_jk.append(dropout(x, p=0.5, training=self.training))
        x = self.JK(x_jk)
        x = self.linear_2(x)
        return log_softmax(x, dim=-1)
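A minimal illustration of the JumpingKnowledge('max') aggregation used above: it takes the element-wise maximum across the per-layer node representations. The shapes are illustrative assumptions.

# Hypothetical illustration of max-mode jumping knowledge.
import torch
from torch_geometric.nn import JumpingKnowledge

jk = JumpingKnowledge(mode='max')
xs = [torch.randn(5, 8) for _ in range(3)]  # three layer outputs: 5 nodes, 8 dims
out = jk(xs)                                # shape [5, 8], element-wise max
assert torch.allclose(out, torch.stack(xs, dim=-1).max(dim=-1)[0])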
Example #24
def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2)
    model.to("cuda")
    if ddp_cls is DDP:
        model = ddp_cls(model, device_ids=[rank])
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    elif ddp_cls is SDP:
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
        model = ddp_cls(model, sharded_optimizer=optim)
    else:
        assert ddp_cls is FSDP, ddp_cls
        # Two cases:
        #    flatten=True : AdaScale wrapper must be after FSDP and it receives
        #                   a single grad tensor. It won't receive grad if
        #                   wrapped before.
        #    flatten=False: AdaScale can be either before or after FSDP.
        # So, it is better to do AdaScale after FSDP.
        model = ddp_cls(model, flatten_parameters=False)
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    dist.destroy_process_group()
Example #25
    def test_zero_model(self):
        model = Linear(3, 1)
        init.constant_(model.weight, 0)
        init.constant_(model.bias, 0)
        optim = torch.optim.SGD(model.parameters(), lr=0.01)

        trial = torchbearer.Trial(model, optim, loss)
        trial.with_test_data(torch.rand(10, 3), batch_size=3)
        preds = trial.predict()

        for i in range(len(preds)):
            self.assertAlmostEqual(preds[i], 0)
Example #26
def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()
Example #27
def _test_basic_func(rank,
                     world_size,
                     tempfile_name,
                     test_case,
                     oss,
                     model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    if oss:
        # For now, we can only wrap AdaScale over OSS. If we do it the other way around,
        # AdaScale needs to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean(
            [model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight,
                           test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
Example #28
def test_mc_loss():

    num_batches = 2
    num_classes = 4
    chi = 2
    # 4 classes
    x = torch.ones(num_batches, chi * num_classes)
    x[:, 0, ...] = 10
    target = torch.zeros(num_batches, dtype=torch.long)

    mod = Linear(chi * num_classes, chi * num_classes)

    # Check backprop
    for reduction in ['mean', 'sum', 'none']:
        for p in mod.parameters():
            p.grad = None
        train_loss = F.mutual_channel_loss(mod(x),
                                           target,
                                           ignore_index=0,
                                           reduction=reduction)
        if reduction == 'none':
            assert train_loss.shape == (num_batches, )
            train_loss = train_loss.sum()
        train_loss.backward()
        assert isinstance(mod.weight.grad, torch.Tensor)

    # Check type casting of weights
    for p in mod.parameters():
        p.grad = None
    class_weights = torch.ones(num_classes, dtype=torch.float16)
    ignore_index = 0

    criterion = nn.MutualChannelLoss(weight=class_weights,
                                     ignore_index=ignore_index,
                                     chi=chi)
    train_loss = criterion(mod(x), target)
    train_loss.backward()
    assert isinstance(mod.weight.grad, torch.Tensor)
    assert repr(criterion
                ) == f"MutualChannelLoss(reduction='mean', chi={chi}, alpha=1)"
Example #29
    def __init__(self,
                 in_channels,
                 out_channels,
                 hiddens=[],
                 activations=[],
                 dropout=0.5,
                 weight_decay=5e-5,
                 lr=0.2,
                 use_bias=False):
        super().__init__()

        if len(hiddens) != len(activations):
            raise RuntimeError(
                f"Arguments 'hiddens' and 'activations' should have the same length."
                " Or you can set both of them to `[]`.")

        layers = ModuleList()
        acts = []
        paras = []
        inc = in_channels
        for hidden, activation in zip(hiddens, activations):
            layer = Linear(inc, hidden, bias=use_bias)
            paras.append(
                dict(params=layer.parameters(), weight_decay=weight_decay))
            layers.append(layer)
            inc = hidden
            acts.append(get_activation(activation))

        layer = Linear(inc, out_channels, bias=use_bias)
        layers.append(layer)
        paras.append(dict(params=layer.parameters(),
                          weight_decay=weight_decay))

        self.layers = layers
        self.acts = acts
        self.dropout = Dropout(dropout)
        self.compile(loss=torch.nn.CrossEntropyLoss(),
                     optimizer=optim.Adam(paras, lr=lr),
                     metrics=[Accuracy()])
Example #30
def example_experts():
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), PEAK_LR)

    args_schema = (BatchTensorDescriptor(1),)
    expert_backend = ExpertBackend(name=EXPERT_NAME, expert=expert, optimizer=opt,
                                   scheduler=get_linear_schedule_with_warmup,
                                   num_warmup_steps=BACKWARD_PASSES_BEFORE_SAVE,
                                   num_total_steps=BACKWARD_PASSES_BEFORE_SAVE + BACKWARD_PASSES_AFTER_SAVE,
                                   args_schema=args_schema, outputs_schema=BatchTensorDescriptor(1), max_batch_size=1,
                                   )
    experts = {EXPERT_NAME: expert_backend}
    yield experts