Example #1
def test_client_anomaly_detection():
    """End-to-end check of detect_anomalies: a NaN input and an infinite
    gradient must both raise ValueError, and a NaN-producing expert must
    not make the mixture output non-finite."""
    hidden_dim = 16

    def _make_backend(uid):
        # one ffn module per expert, wrapped in an ExpertBackend for serving
        module = layers.name_to_block['ffn'](hidden_dim)
        return hivemind.ExpertBackend(
            name=uid,
            expert=module,
            optimizer=torch.optim.Adam(module.parameters()),
            args_schema=(hivemind.BatchTensorDescriptor(hidden_dim), ),
            outputs_schema=hivemind.BatchTensorDescriptor(hidden_dim),
            max_batch_size=16,
        )

    experts = {f'expert.{i}': _make_backend(f'expert.{i}') for i in range(4)}

    # poison expert.3 so its forward pass produces NaN
    experts['expert.3'].expert.ffn.weight.data[0, 0] = float('nan')

    dht = hivemind.DHT(start=True)
    server = hivemind.Server(dht, experts, num_connection_handlers=1)
    server.start()
    try:
        server.ready.wait()

        # grid of 3: the poisoned expert.3 stays outside this mixture
        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(3, ),
                                               dht=dht,
                                               k_best=3,
                                               uid_prefix='expert.',
                                               detect_anomalies=True)

        batch = torch.randn(1, 16)
        batch[0, 0] = float('nan')

        # a NaN in the input must be rejected
        with pytest.raises(ValueError):
            dmoe(batch)

        batch[0, 0] = 0
        output = dmoe(batch)

        # an infinite gradient flowing backward must also be rejected
        inf_loss = float('inf') * output.sum()
        with pytest.raises(ValueError):
            inf_loss.backward()

        # widen the grid to include the poisoned expert.3; its anomalous
        # response should be filtered out, so the output stays finite
        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(4, ),
                                               dht=dht,
                                               k_best=4,
                                               uid_prefix='expert.',
                                               detect_anomalies=True)
        output = dmoe(batch)
        assert output.isfinite().all()

    finally:
        server.shutdown()
Example #2
def test_moe():
    """Smoke test: a RemoteMixtureOfExperts backed by a background server
    with 20 randomly-named ffn experts must survive 10 forward/backward
    iterations."""
    all_expert_uids = []
    for _ in range(20):
        i, j, k = (np.random.randint(0, 3) for _ in range(3))
        all_expert_uids.append(f'ffn.{i}.{j}.{k}')

    with background_server(expert_uids=all_expert_uids,
                           device='cpu',
                           expert_cls='ffn',
                           num_handlers=1,
                           hidden_dim=16) as (server_endpoint, dht_endpoint):
        dht = hivemind.DHT(start=True,
                           expiration=999,
                           initial_peers=[dht_endpoint])
        # the server would declare its experts eventually; declare them
        # eagerly here so the test does not have to wait for that
        assert all(
            dht.declare_experts(all_expert_uids, endpoint=server_endpoint))

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(32, 32, 32),
                                               dht=dht,
                                               k_best=3,
                                               uid_prefix='ffn')

        for _ in range(10):
            dmoe(torch.randn(10, 16)).sum().backward()
Example #3
def test_moe_beam_search():
    """Beam search over a declared expert grid must select exactly the
    experts that exhaustive scoring ranks highest."""
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}'
        for i in range(10) for j in range(10) for k in range(10)
    ]
    dht = hivemind.DHT(start=True, expiration=999)
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32,
                                           grid_size=(32, 32, 32),
                                           dht=dht,
                                           k_best=4,
                                           uid_prefix='ffn')

    for _ in range(25):
        sample = torch.randn(32)
        grid_scores = dmoe.proj(sample).split_with_sizes(dmoe.grid_size,
                                                         dim=-1)
        # add a batch dimension of 1 for compute_expert_scores
        batched_scores = [dim_scores[None] for dim_scores in grid_scores]

        chosen_experts = dmoe.loop.run_until_complete(
            dmoe.beam_search(grid_scores, k_best=dmoe.k_best))
        chosen_scores = dmoe.compute_expert_scores(batched_scores,
                                                   [chosen_experts])[0]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())

        # reference: score every declared expert and take the top-k
        all_scores = dmoe.compute_expert_scores(
            batched_scores,
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(),
                                  reverse=True)[:len(chosen_experts)]
        assert np.allclose(true_best_scores, our_best_scores)
Example #4
def test_beam_search_correctness():
    """dht.find_best_experts must agree with an exhaustive ranking of
    every declared expert."""
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}'
        for i in range(10) for j in range(10) for k in range(10)
    ]
    dht = hivemind.DHT(start=True, expiration=999)
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32,
                                           grid_size=(32, 32, 32),
                                           dht=dht,
                                           k_best=4,
                                           uid_prefix='ffn.')

    for _ in range(25):
        vector = torch.randn(32)
        grid_scores = dmoe.proj(vector).split_with_sizes(dmoe.grid_size,
                                                         dim=-1)

        beam = dht.find_best_experts(
            dmoe.uid_prefix,
            [tensor.detach().numpy() for tensor in grid_scores],
            beam_size=dmoe.k_best)
        beam_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores], [beam])[0]
        our_best_scores = list(beam_scores.cpu().detach().numpy())

        # reference: independently find :beam_size: best experts with exhaustive search
        every_expert = [
            hivemind.RemoteExpert(uid, '') for uid in all_expert_uids
        ]
        all_scores = dmoe.compute_expert_scores(
            [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
            [every_expert])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(),
                                  reverse=True)[:len(beam)]

        assert np.allclose(true_best_scores, our_best_scores)
Example #5
def test_moe():
    """Short smoke test: three forward/backward passes through a remote
    mixture of experts served in the background."""
    all_expert_uids = []
    for _ in range(10):
        i, j, k = (np.random.randint(0, 3) for _ in range(3))
        all_expert_uids.append(f'ffn.{i}.{j}.{k}')

    with background_server(expert_uids=all_expert_uids,
                           device='cpu',
                           expert_cls='ffn',
                           num_handlers=1,
                           hidden_dim=16) as (server_endpoint, dht_endpoint):
        dht = hivemind.DHT(start=True, initial_peers=[dht_endpoint])

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(4, 4, 4),
                                               dht=dht,
                                               k_best=3,
                                               uid_prefix='ffn.')

        for _ in range(3):
            dmoe(torch.randn(10, 16)).sum().backward()