Example #1
0
def test_store_get_experts():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    you: hivemind.dht.DHT = random.choice(peers)
    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)

    expert_uids = [f"my_expert.{i}" for i in range(110)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        you.declare_experts(expert_uids[batch_start: batch_start + batch_size], 'localhost:1234')

    found = theguyshetoldyounottoworryabout.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    that_guys_expert, that_guys_port = "my_other_expert.1337", random.randint(1000, 9999)
    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], f'that_host:{that_guys_port}')
    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
    assert isinstance(you_found, hivemind.RemoteExpert)
    assert you_found.endpoint == f'that_host:{that_guys_port}'

    for peer in peers:
        peer.shutdown()
Example #2
0
def test_beam_search(dht_size=20, total_experts=128, batch_size=32, initial_peers=3, beam_size=4, parallel_rpc=16,
                     grid_dims=(32, 32, 32)):
    dht = []
    for i in range(dht_size):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(dht, min(initial_peers, len(dht)))]
        dht.append(hivemind.DHT(start=True, expiration=999999, initial_peers=neighbors_i, parallel_rpc=parallel_rpc))

    real_experts = sorted({
        'expert.' + '.'.join([str(random.randint(0, dim - 1)) for dim in grid_dims])
        for _ in range(total_experts)
    })
    for batch_start in range(0, len(real_experts), batch_size):
        random.choice(dht).declare_experts(
            real_experts[batch_start: batch_start + batch_size], wait=True,
            endpoint=f"host{batch_start // batch_size}:{random.randint(0, 65536)}")

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(dht, min(initial_peers, len(dht)))]
    you = hivemind.DHT(start=True, expiration=999999, initial_peers=neighbors_i, parallel_rpc=parallel_rpc)

    for i in range(50):
        topk_experts = you.find_best_experts('expert.', [np.random.randn(dim) for dim in grid_dims], beam_size=beam_size)
        assert all(isinstance(e, hivemind.RemoteExpert) for e in topk_experts)
        assert len(topk_experts) == beam_size

    for i in range(10):
        batch_experts = you.batch_find_best_experts('expert.', [np.random.randn(batch_size, dim) for dim in grid_dims],
                                                    beam_size=beam_size)
        assert isinstance(batch_experts, list) and len(batch_experts) == batch_size
        assert all(isinstance(e, hivemind.RemoteExpert) for experts in batch_experts for e in experts)
        assert all(len(experts) == beam_size for experts in batch_experts)
Example #3
0
def test_negative_caching():
    test_success = mp.Event()
    peers = []
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, negative_caching=False, cache_locally=False, start=True))

    normal_peer, writer_peer = random.sample(peers, 2)

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
    neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i, negative_caching=True, cache_locally=False, start=True)

    assert all(writer_peer.declare_experts(['ffn.1.2.3', 'ffn.3.4.5'], 'myaddr:1234').values())
    # get prefixes by the peer with negative caching. Cache "no data" entries for ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
    assert len(neg_caching_peer.get_initial_beam(prefix='ffn.', scores=[.1, .2, .3, .4, .5, .6], beam_size=3)) == 2

    async def _tester():
        node = await hivemind.DHTNode.create(initial_peers=neighbors_i)
        fetched = await asyncio.gather(*(node.get(f'ffn.{i}.') for i in range(10)))
        for i in range(6):
            assert fetched[i] is not None, f"node should have cached ffn.{i}."
        for i in range(6, len(fetched)):
            assert fetched[i] is None, f"node shouldn't have cached ffn.{i}."
        test_success.set()

    proc = mp.Process(target=lambda: asyncio.run(_tester()))
    proc.start()
    proc.join()
    assert test_success.is_set()
Example #4
0
def test_sending_validator_instance_between_processes():
    alice = hivemind.DHT(start=True)
    bob = hivemind.DHT(start=True, initial_peers=[f"{LOCALHOST}:{alice.port}"])

    alice.add_validators([SchemaValidator(SampleSchema)])
    bob.add_validators([SchemaValidator(SampleSchema)])

    assert bob.store('experiment_name', b'foo_bar', get_dht_time() + 10)
    assert not bob.store('experiment_name', 777, get_dht_time() + 10)
    assert alice.get('experiment_name', latest=True).value == b'foo_bar'
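SampleSchema itself is not shown in this example; SchemaValidator wraps a pydantic model whose fields constrain DHT keys. A minimal hypothetical schema consistent with the assertions above (raw bytes accepted for 'experiment_name', the integer 777 rejected) might look roughly like this; the field name comes from the test, everything else is an assumption:

from pydantic import BaseModel, StrictBytes

class SampleSchema(BaseModel):
    # the key 'experiment_name' must hold raw bytes, so storing 777 fails validation
    experiment_name: StrictBytes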
Example #5
0
def test_hivemind_dht():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [
            f'{LOCALHOST}:{node.port}'
            for node in random.sample(peers, min(3, len(peers)))
        ]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    you: hivemind.dht.DHT = random.choice(peers)
    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)

    expert_uids = [str(uuid.uuid4()) for _ in range(110)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        you.declare_experts(expert_uids[batch_start:batch_start + batch_size],
                            'localhost', 1234)

    found = theguyshetoldyounottoworryabout.get_experts(
        random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None
               for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    that_guys_expert, that_guys_port = str(uuid.uuid4()), random.randint(
        1000, 9999)
    theguyshetoldyounottoworryabout.declare_experts(
        [that_guys_expert], f'that_host:{that_guys_port}')
    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
    assert isinstance(you_found, hivemind.RemoteExpert)
    assert you_found.endpoint == f'that_host:{that_guys_port}'

    # test first_k_active
    assert list(
        theguyshetoldyounottoworryabout.first_k_active(
            expert_uids, k=10)) == expert_uids[:10]

    some_permuted_experts = random.sample(expert_uids, k=32)
    assert list(
        theguyshetoldyounottoworryabout.first_k_active(
            some_permuted_experts, k=32)) == some_permuted_experts
    assert list(
        theguyshetoldyounottoworryabout.first_k_active(
            some_permuted_experts, k=1)) == some_permuted_experts[:1]
    fake_and_real_experts = list(
        chain(*zip([str(uuid.uuid4())
                    for _ in some_permuted_experts], some_permuted_experts)))
    assert list(
        theguyshetoldyounottoworryabout.first_k_active(
            fake_and_real_experts, k=9)) == some_permuted_experts[:9]

    for peer in peers:
        peer.shutdown()
Example #6
0
def test_dht_get_address(addr=LOCALHOST, dummy_endpoint='123.45.67.89:*'):
    node1 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*")
    node2 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*", initial_peers=[f"{addr}:{node1.port}"])
    node3 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*", initial_peers=[f"{addr}:{node2.port}"])
    assert addr in node3.get_visible_address(num_peers=2)

    node4 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*")
    with pytest.raises(ValueError):
        node4.get_visible_address()
    assert node4.get_visible_address(peers=[f'{addr}:{node1.port}']).endswith(addr)

    node5 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*", endpoint=f"{dummy_endpoint}")
    assert node5.get_visible_address() == strip_port(dummy_endpoint)
Example #7
0
def test_allreduce_grid():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [
        hivemind.DecentralizedAverager(averaged_tensors=[torch.randn(3)],
                                       dht=dht,
                                       target_group_size=2,
                                       prefix='mygroup',
                                       initial_group_bits=bin(
                                           i // 2)[2:].rjust(2, '0'),
                                       start=True) for i in range(8)
    ]

    [means0], [stds0] = compute_mean_std(averagers)
    assert not torch.allclose(stds0, torch.zeros_like(stds0))

    prev_means, prev_stds = means0, stds0

    for i in range(5):
        step_futures = [averager.step(wait=False) for averager in averagers]
        groups = [future.result() for future in step_futures]
        [means], [stds] = compute_mean_std(averagers)
        assert torch.allclose(means, prev_means, atol=1e-6, rtol=0)
        assert all(len(group) == 2 for group in groups)

        if i <= 2:
            assert torch.all(torch.le(stds, prev_stds))
        else:
            assert torch.allclose(stds,
                                  torch.zeros_like(stds),
                                  atol=1e-6,
                                  rtol=0)

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
Example #8
0
def test_compute_expert_scores():
    try:
        dht = hivemind.DHT(start=True)
        moe = hivemind.client.moe.RemoteMixtureOfExperts(dht=dht,
                                                         in_features=16,
                                                         grid_size=(40, ),
                                                         k_best=4,
                                                         k_min=1,
                                                         timeout_after_k_min=1,
                                                         uid_prefix='expert.')
        gx, gy = torch.randn(4, 5, requires_grad=True), torch.randn(
            4, 3, requires_grad=True)
        ii = [[4, 0, 2], [3, 1, 1, 1, 3], [0], [3, 2]]
        jj = [[2, 2, 1], [0, 1, 2, 0, 1], [0], [1, 2]]
        batch_experts = [[
            hivemind.RemoteExpert(
                uid=f'expert.{ii[batch_i][expert_i]}.{jj[batch_i][expert_i]}',
                endpoint="[::]:1337") for expert_i in range(len(ii[batch_i]))
        ] for batch_i in range(
            len(ii)
        )]  # note: these experts do not exist on the server; we only use them to test compute_expert_scores
        logits = moe.compute_expert_scores([gx, gy], batch_experts)
        torch.softmax(logits, dim=-1).norm(dim=-1).mean().backward()
        assert gx.grad.norm().item() > 0 and gy.grad.norm().item() > 0, \
            "compute_expert_scores didn't backprop"

        for batch_i in range(len(ii)):
            for expert_i in range(len(ii[batch_i])):
                assert torch.allclose(logits[batch_i, expert_i],
                                      gx[batch_i, ii[batch_i][expert_i]] + gy[batch_i, jj[batch_i][expert_i]]), \
                    "compute_expert_scores returned incorrect score"
    finally:
        dht.shutdown()
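The nested assertions above fully pin down the expected output for this setup: each expert's logit is the sum of the per-grid-dimension scores at that expert's coordinates. A standalone reference computation over the names defined in the test (a sketch of what the assertions check, not hivemind's implementation) would be:

def reference_expert_scores(gx, gy, ii, jj):
    # score of expert e in batch item b: gx[b, ii[b][e]] + gy[b, jj[b][e]]
    return [[gx[b, ii[b][e]] + gy[b, jj[b][e]] for e in range(len(ii[b]))]
            for b in range(len(ii))]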
Example #9
0
def test_allgather():
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
    averagers = [
        hivemind.DecentralizedAverager(torch.ones(1),
                                       dht=dht,
                                       target_group_size=4,
                                       averaging_expiration=15,
                                       prefix='mygroup',
                                       initial_group_bits='000',
                                       listen_on='127.0.0.1:*',
                                       start=True) for _ in range(8)
    ]

    futures = []
    for i, averager in enumerate(averagers):
        futures.append(
            averager.step(wait=False,
                          gather=dict(batch_size=123 + i, foo='bar')))

    assert len(set(repr(sorted(future.result())) for future in futures)) == 2

    reference_metadata = {
        averager.endpoint: dict(batch_size=123 + i, foo='bar')
        for i, averager in enumerate(averagers)
    }
    for future in futures:
        gathered = future.result()

        assert len(gathered) == 4

        for endpoint in gathered:
            assert gathered[endpoint] == reference_metadata[endpoint]
Example #10
0
def test_allreduce_once():
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')

    tensors1 = [torch.randn(123), torch.zeros(3)]
    tensors2 = [torch.rand(123), torch.ones(3)]
    tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
    tensors4 = [torch.randn(123)**3, torch.arange(3).to(torch.float32) / 2]

    reference = [(tensors1[i] + tensors2[i] + tensors3[i] + tensors4[i]) / 4
                 for i in range(len(tensors1))]

    averagers = [
        hivemind.DecentralizedAverager(tensors,
                                       dht=dht,
                                       target_group_size=4,
                                       averaging_expiration=15,
                                       prefix='mygroup',
                                       initial_group_bits='0110',
                                       listen_on='127.0.0.1:*',
                                       start=True)
        for tensors in [tensors1, tensors2, tensors3, tensors4]
    ]

    futures = []
    for averager in averagers:
        futures.append(averager.step(wait=False))
    for future in futures:
        result = future.result()
        for averager in averagers:
            assert averager.endpoint in result

    for averager in averagers:
        with averager.get_tensors() as averaged_tensors:
            for ref, our in zip(reference, averaged_tensors):
                assert torch.allclose(ref, our, atol=1e-6)
Example #11
0
def test_get_store():
    peers = []
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    node1, node2 = random.sample(peers, 2)
    assert node1.store('key1', 'value1', expiration_time=hivemind.get_dht_time() + 30)
    assert node1.get('key1').value == 'value1'
    assert node2.get('key1').value == 'value1'
    assert node2.get('key2') is None

    future = node1.get('foo', return_future=True)
    assert future.result() is None

    future = node1.get('foo', return_future=True)
    future.cancel()

    assert node2.store('key1', 123, expiration_time=hivemind.get_dht_time() + 31)
    assert node2.store('key2', 456, expiration_time=hivemind.get_dht_time() + 32)
    assert node1.get('key1', latest=True).value == 123
    assert node1.get('key2').value == 456

    assert node1.store('key2', subkey='subkey1', value=789, expiration_time=hivemind.get_dht_time() + 32)
    assert node2.store('key2', subkey='subkey2', value='pew', expiration_time=hivemind.get_dht_time() + 32)
    found_dict = node1.get('key2', latest=True).value
    assert isinstance(found_dict, dict) and len(found_dict) == 2
    assert found_dict['subkey1'].value == 789 and found_dict['subkey2'].value == 'pew'

    for peer in peers:
        peer.shutdown()
Example #12
0
def test_dht_add_validators(validators_for_app):
    # One app may create a DHT with its validators
    dht = hivemind.DHT(start=False, record_validators=validators_for_app['A'])

    # While the DHT process is not started, you can't send a command to append new validators
    with pytest.raises(RuntimeError):
        dht.add_validators(validators_for_app['B'])
    dht.run_in_background(await_ready=True)

    # After starting the process, other apps may add new validators to the existing DHT
    dht.add_validators(validators_for_app['B'])

    assert dht.store('field_a', b'bytes_value', hivemind.get_dht_time() + 10)
    assert dht.get('field_a', latest=True).value == b'bytes_value'

    assert not dht.store('field_a', 666, hivemind.get_dht_time() + 10)
    assert dht.get('field_a', latest=True).value == b'bytes_value'

    local_public_key = validators_for_app['A'][0].local_public_key
    assert dht.store('field_b', 777, hivemind.get_dht_time() + 10, subkey=local_public_key)
    dictionary = dht.get('field_b', latest=True).value
    assert (len(dictionary) == 1 and
            dictionary[local_public_key].value == 777)

    assert not dht.store('unknown_key', 666, hivemind.get_dht_time() + 10)
    assert dht.get('unknown_key', latest=True) is None
Example #13
0
def test_multiple_peers(num_processes, wait_seconds):
    """Test to ensure that if we have two running processes with the same peers, they connect and train
    successfully."""
    dht_root = hivemind.DHT(start=True)
    barrier = mp.Barrier(num_processes)
    initial_peers = dht_root.get_visible_maddrs()

    with mp.Manager() as manager:
        # allows processes to return their recorded peers/steps
        recorded_process_peers = manager.list()
        recorded_process_steps = manager.list()
        processes = [
            mp.Process(
                target=_run_collab_training_fn,
                kwargs=dict(
                    initial_peers=initial_peers,
                    wait_seconds=wait_seconds,
                    barrier=barrier,
                    recorded_process_peers=recorded_process_peers,
                    recorded_process_steps=recorded_process_steps,
                ),
            ) for x in range(num_processes)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
        # assert that peers increase as expected and we run at least 1 global step.
        for process_peers, process_steps in zip(recorded_process_peers,
                                                recorded_process_steps):
            assert any(num_peer == num_processes for num_peer in process_peers)
            assert any(global_step > 0 for global_step in process_steps)
Example #14
0
def test_beam_search_correctness():
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10)
        for k in range(10)
    ]
    dht = hivemind.DHT(start=True, expiration=999)
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32,
                                           grid_size=(32, 32, 32),
                                           dht=dht,
                                           k_best=4,
                                           uid_prefix='ffn.')

    for i in range(25):
        input = torch.randn(32)
        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.grid_size, dim=-1)

        chosen_experts = dht.find_best_experts(
            dmoe.uid_prefix,
            [tensor.detach().numpy() for tensor in grid_scores],
            beam_size=dmoe.k_best)
        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores],
            [chosen_experts])[0]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())

        # reference: independently find :beam_size: best experts with exhaustive search
        all_scores = dmoe.compute_expert_scores(
            [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(),
                                  reverse=True)[:len(chosen_experts)]

        assert np.allclose(true_best_scores, our_best_scores)
Example #15
0
async def test_key_manager():
    key_manager = GroupKeyManager(hivemind.DHT(start=True), endpoint='localhvost',
                                  prefix='test_averaging', initial_group_bits='10110',
                                  target_group_size=2)

    t = hivemind.get_dht_time()
    key = key_manager.current_key
    await key_manager.declare_averager(key, 'localhvost', expiration_time=t + 60)
    await key_manager.declare_averager(key, 'localhvost2', expiration_time=t + 61)

    q1 = await key_manager.get_averagers(key, only_active=True)

    await key_manager.declare_averager(key, 'localhvost', expiration_time=t + 66)
    q2 = await key_manager.get_averagers(key, only_active=True)

    await key_manager.declare_averager(key, 'localhvost2', expiration_time=t + 61, looking_for_group=False)
    q3 = await key_manager.get_averagers(key, only_active=True)
    q4 = await key_manager.get_averagers(key, only_active=False)

    q5 = await key_manager.get_averagers('nonexistent_key.0b0101', only_active=False)

    assert len(q1) == 2 and ('localhvost', t + 60) in q1 and ('localhvost2', t + 61) in q1
    assert len(q2) == 2 and ('localhvost', t + 66) in q2 and ('localhvost2', t + 61) in q2
    assert len(q3) == 1 and ('localhvost', t + 66) in q3
    assert len(q4) == 2 and ('localhvost', t + 66) in q4 and ('localhvost2', t + 61) in q4
    assert len(q5) == 0
Example #16
0
def test_getset_averagers():
    dht = hivemind.DHT(start=True)

    t = hivemind.get_dht_time()
    dht.declare_averager(group_key='bucket.0b10110',
                         endpoint='localhvost',
                         expiration_time=t + 60)
    dht.declare_averager(group_key='bucket.0b10110',
                         endpoint='localhvost2',
                         expiration_time=t + 61)

    q1 = dht.get_averagers('bucket.0b10110', only_active=True)

    dht.declare_averager(group_key='bucket.0b10110',
                         endpoint='localhvost',
                         expiration_time=t + 66)
    q2 = dht.get_averagers('bucket.0b10110', only_active=True)

    dht.declare_averager(group_key='bucket.0b10110',
                         endpoint='localhvost2',
                         looking_for_group=False,
                         expiration_time=t + 61)
    q3 = dht.get_averagers('bucket.0b10110', only_active=True)
    q4 = dht.get_averagers('bucket.0b10110', only_active=False)

    assert len(q1) == 2 and ('localhvost', t + 60) in q1 and ('localhvost2',
                                                              t + 61) in q1
    assert len(q2) == 2 and ('localhvost', t + 66) in q2 and ('localhvost2',
                                                              t + 61) in q2
    assert len(q3) == 1 and ('localhvost', t + 66) in q3
    assert len(q4) == 2 and ('localhvost', t + 66) in q4 and ('localhvost2',
                                                              t + 61) in q4
Example #17
0
def test_moe():
    all_expert_uids = [
        f'ffn.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}'
        for _ in range(20)
    ]
    with background_server(expert_uids=all_expert_uids,
                           device='cpu',
                           expert_cls='ffn',
                           num_handlers=1,
                           hidden_dim=16) as (server_endpoint, dht_endpoint):
        dht = hivemind.DHT(start=True,
                           expiration=999,
                           initial_peers=[dht_endpoint])
        # declare expert uids. Server *should* declare them by itself, but it takes time.
        assert all(
            dht.declare_experts(all_expert_uids, endpoint=server_endpoint))

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(32, 32, 32),
                                               dht=dht,
                                               k_best=3,
                                               uid_prefix='ffn')

        for i in range(10):
            out = dmoe(torch.randn(10, 16))
            out.sum().backward()
Example #18
0
def test_moe_beam_search():
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10)
        for k in range(10)
    ]
    dht = hivemind.DHT(start=True, expiration=999)
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32,
                                           grid_size=(32, 32, 32),
                                           dht=dht,
                                           k_best=4,
                                           uid_prefix='ffn')

    for i in range(25):
        input = torch.randn(32)
        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.grid_size, dim=-1)

        chosen_experts = dmoe.loop.run_until_complete(
            dmoe.beam_search(grid_scores, k_best=dmoe.k_best))

        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores],
            [chosen_experts])[0]

        all_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(),
                                  reverse=True)[:len(chosen_experts)]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())
        assert np.allclose(true_best_scores, our_best_scores)
Example #19
0
def test_load_state_from_peers():
    num_calls = 0
    super_metadata = dict(x=123)
    super_tensors = (torch.randn(3), torch.randint(0, 5, (3, )))

    class TestAverager(hivemind.DecentralizedAverager):
        def get_current_state(self):
            """
            Get current state and send it to a peer. Executed in the host process. Meant to be overridden.
            :returns: a tuple of (serializable_small_metadata, sequence of torch tensors)
            """
            nonlocal num_calls, super_metadata, super_tensors
            num_calls += 1
            return super_metadata, super_tensors

    dht_root = hivemind.DHT(start=True)
    initial_peers = [f'{hivemind.LOCALHOST}:{dht_root.port}']
    dht1 = hivemind.DHT(initial_peers=initial_peers, start=True)
    averager1 = TestAverager([torch.randn(3), torch.rand(5)],
                             dht=dht1,
                             start=True,
                             prefix='demo-run',
                             target_group_size=2)

    dht2 = hivemind.DHT(initial_peers=initial_peers, start=True)
    dht2.get('demo-run.all_averagers')
    averager2 = TestAverager([torch.randn(3), torch.rand(5)],
                             dht=dht2,
                             start=True,
                             prefix='demo-run',
                             target_group_size=2)

    assert num_calls == 0
    got_metadata, got_tensors = averager2.load_state_from_peers()
    assert num_calls == 1
    assert got_metadata == super_metadata
    assert all(map(torch.allclose, got_tensors, super_tensors))

    super_metadata['y'] = 123
    super_tensors[1][2] = 9
    assert num_calls == 1
    assert got_metadata != super_metadata
    assert not all(map(torch.allclose, got_tensors, super_tensors))
    got_metadata, got_tensors = averager2.load_state_from_peers()
    assert num_calls == 2
    assert got_metadata == super_metadata
    assert all(map(torch.allclose, got_tensors, super_tensors))
Example #20
0
def test_getset_bits():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averager = hivemind.DecentralizedAverager([torch.randn(3)],
                                              dht=dht,
                                              start=True,
                                              prefix='test_prefix',
                                              target_group_size=2)
    averager.set_group_bits('00101011101010')
    assert averager.get_group_bits() == '00101011101010'
Example #21
0
def test_client_anomaly_detection():
    HID_DIM = 16

    experts = {}
    for i in range(4):
        expert = layers.name_to_block['ffn'](HID_DIM)
        experts[f'expert.{i}'] = hivemind.ExpertBackend(
            name=f'expert.{i}',
            expert=expert,
            optimizer=torch.optim.Adam(expert.parameters()),
            args_schema=(hivemind.BatchTensorDescriptor(HID_DIM), ),
            outputs_schema=hivemind.BatchTensorDescriptor(HID_DIM),
            max_batch_size=16,
        )

    experts['expert.3'].expert.ffn.weight.data[0, 0] = float('nan')

    dht = hivemind.DHT(start=True)
    server = hivemind.Server(dht, experts, num_connection_handlers=1)
    server.start()
    try:
        server.ready.wait()

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(3, ),
                                               dht=dht,
                                               k_best=3,
                                               uid_prefix='expert.',
                                               detect_anomalies=True)

        input = torch.randn(1, 16)
        input[0, 0] = float('nan')

        with pytest.raises(ValueError):
            dmoe(input)

        input[0, 0] = 0
        output = dmoe(input)

        inf_loss = float('inf') * output.sum()
        with pytest.raises(ValueError):
            inf_loss.backward()

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(4, ),
                                               dht=dht,
                                               k_best=4,
                                               uid_prefix='expert.',
                                               detect_anomalies=True)
        output = dmoe(input)
        assert output.isfinite().all()

    finally:
        server.shutdown()
Example #22
0
async def test_negative_caching():
    peers = []
    for i in range(10):
        neighbors_i = [
            f'{LOCALHOST}:{node.port}'
            for node in random.sample(peers, min(3, len(peers)))
        ]
        peers.append(
            hivemind.DHT(initial_peers=neighbors_i,
                         cache_locally=False,
                         start=True))

    writer_peer = random.choice(peers)
    assert all(
        hivemind.declare_experts(writer_peer, ['ffn.1.2.3', 'ffn.3.4.5'],
                                 'myaddr:1234').values())

    neighbors_i = [
        f'{LOCALHOST}:{node.port}'
        for node in random.sample(peers, min(3, len(peers)))
    ]
    neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i,
                                    cache_locally=False,
                                    start=True)
    beam_search = MoEBeamSearcher(neg_caching_peer,
                                  uid_prefix='ffn.',
                                  grid_size=(10, 10, 10),
                                  negative_caching=True)
    # get prefixes by the peer with negative caching. Cache "no data" entries for ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
    assert len(
        beam_search.get_initial_beam(scores=[.1, .2, .3, .4, .5, .6],
                                     beam_size=3)) == 2

    node = await hivemind.DHTNode.create(initial_peers=neighbors_i)
    fetched = await asyncio.gather(*(node.get(f'ffn.{i}.') for i in range(10)))
    for i in range(6):
        assert fetched[i] is not None, f"node should have cached ffn.{i}."
    for i in range(6, len(fetched)):
        assert fetched[i] is None, f"node shouldn't have cached ffn.{i}."
Example #23
0
    def initialize_dht_and_averager(self, collaboration_args: CollaborationArguments):
        collaboration_args.initial_peers = list(map(str.strip, collaboration_args.initial_peers.split(',')))
        logger.info(f"Found {len(collaboration_args.initial_peers)} initial peers: {collaboration_args.initial_peers}")
        if len(collaboration_args.initial_peers) == 0:
            raise ValueError("Please specify at least one network endpoint in initial peers.")

        dht = hivemind.DHT(initial_peers=list(collaboration_args.initial_peers), start=True)
        averager = SimpleAverager(self, dht=dht, prefix=self.matchmaking_prefix,
                                  target_group_size=collaboration_args.target_group_size,
                                  throughput=collaboration_args.bandwidth,
                                  compression_type=hivemind.utils.CompressionType.FLOAT16,
                                  averaging_expiration=collaboration_args.averaging_expiration, start=True)
        return dht, averager
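CollaborationArguments and SimpleAverager are defined outside this snippet; the method only touches a few fields of the former. A hypothetical stand-in with just those fields (names taken from the attribute accesses above, defaults invented) is enough to sketch how it would be called:

from dataclasses import dataclass

@dataclass
class CollaborationArguments:
    # only the attributes that initialize_dht_and_averager reads
    initial_peers: str                 # comma-separated "host:port" endpoints
    target_group_size: int = 16        # forwarded to SimpleAverager
    bandwidth: float = 100.0           # forwarded as `throughput`
    averaging_expiration: float = 5.0  # forwarded as `averaging_expiration`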
Example #24
0
def benchmark_averaging(num_peers: int, target_group_size: int,
                        num_rounds: int, averaging_expiration: float,
                        request_timeout: float, round_timeout: float,
                        hid_size: int, num_layers: int, spawn_dtime: float):
    dht_root = hivemind.DHT(listen_on=f'{LOCALHOST}:*', start=True)
    num_groups = 2**int(round(math.log2(num_peers / target_group_size)))
    nbits = int(round(math.log2(num_groups)))
    peer_tensors = [
        sample_tensors(hid_size, num_layers) for _ in range(num_peers)
    ]
    processes = {dht_root}

    def run_averager(index):
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                           initial_peers=[f"{LOCALHOST}:{dht_root.port}"],
                           start=True)
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        averager = hivemind.DecentralizedAverager(
            peer_tensors[index],
            dht,
            prefix='my_tensor',
            initial_group_bits=initial_bits,
            listen_on=f"{LOCALHOST}:*",
            compression_type=runtime_pb2.CompressionType.FLOAT16,
            target_group_size=target_group_size,
            averaging_expiration=averaging_expiration,
            request_timeout=request_timeout,
            start=True)
        processes.update({dht, averager})

        print(end=f'<started {index}>\n', flush=True)
        for _ in range(num_rounds):
            success = averager.step(timeout=round_timeout)
            print(end=('+' if success else '-'), flush=True)
        print(end=f'<finished {index}>\n', flush=True)

    threads = []
    for i in range(num_peers):
        thread = threading.Thread(target=run_averager, args=[i])
        threads.append(thread)
        thread.start()
        time.sleep(spawn_dtime)

    t = time.time()
    for thread in threads:
        thread.join()

    print(f"\ntest run took {time.time() - t:.3f} seconds")

    for process in processes:
        process.terminate()
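To make the group-size arithmetic above concrete: with, say, 16 peers and a target group size of 4 (values chosen for illustration, not the benchmark's defaults), the peers are split into 4 groups addressed by 2-bit prefixes:

import math

num_peers, target_group_size = 16, 4                                     # illustrative values
num_groups = 2 ** int(round(math.log2(num_peers / target_group_size)))   # -> 4
nbits = int(round(math.log2(num_groups)))                                # -> 2
assert [bin(i % num_groups)[2:].rjust(nbits, '0') for i in range(4)] == ['00', '01', '10', '11']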
Example #25
0
def test_too_few_peers():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [hivemind.DecentralizedAverager(
        averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2,
        averaging_expiration=1, request_timeout=0.5,
        prefix='mygroup', initial_group_bits=bin(i)[2:].rjust(3, '0'), start=True)
        for i in range(4)]
    step_futures = [averager.step(wait=False) for averager in averagers]
    for future in step_futures:
        assert len(future.result()) == 2

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
Example #26
0
def test_overcrowded(num_peers=16):
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [hivemind.DecentralizedAverager(
        averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2,
        averaging_expiration=1, request_timeout=0.5,
        prefix='mygroup', initial_group_bits='', start=True)
        for _ in range(num_peers)]
    for t in range(5):
        step_futures = [averager.step(wait=False, timeout=5) for averager in averagers]
        assert sum(len(future.result() or []) == 2 for future in step_futures) >= len(averagers) - 1

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
Example #27
0
def benchmark_averaging(num_peers: int, target_group_size: int, num_rounds: int,
                        averaging_expiration: float, request_timeout: float, round_timeout: float,
                        hid_size: int, num_layers: int, spawn_dtime: float):
    dht_root = hivemind.DHT(listen_on=f'{LOCALHOST}:*', start=True)
    num_groups = 2 ** int(round(math.log2(num_peers / target_group_size)))
    nbits = int(round(math.log2(num_groups)))
    peer_tensors = [sample_tensors(hid_size, num_layers)
                    for _ in range(num_peers)]
    processes = {dht_root}
    lock_stats = threading.Lock()
    successful_steps = total_steps = 0

    def run_averager(index):
        nonlocal successful_steps, total_steps, lock_stats
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                           initial_peers=[f"{LOCALHOST}:{dht_root.port}"],
                           start=True)
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        averager = hivemind.DecentralizedAverager(
            peer_tensors[index], dht, prefix='my_tensor', initial_group_bits=initial_bits, listen_on=f"{LOCALHOST}:*",
            compression_type=runtime_pb2.CompressionType.FLOAT16, target_group_size=target_group_size,
            averaging_expiration=averaging_expiration, request_timeout=request_timeout, start=True)
        processes.update({dht, averager})

        logger.info(f'Averager {index}: started on endpoint {averager.endpoint}, group_bits: {averager.get_group_bits()}')
        for step in range(num_rounds):
            try:
                success = averager.step(timeout=round_timeout) is not None
            except Exception:
                success = False
            with lock_stats:
                successful_steps += int(success)
                total_steps += 1
            logger.info(f"Averager {index}: {'finished' if success else 'failed'} step {step}")
        logger.info(f"Averager {index}: done.")

    threads = []
    for i in range(num_peers):
        thread = threading.Thread(target=run_averager, args=[i])
        threads.append(thread)
        thread.start()
        time.sleep(spawn_dtime)

    t = time.time()
    for thread in threads:
        thread.join()

    logger.info(f"Benchmark finished in {time.time() - t:.3f} seconds.")
    logger.info(f"Success rate: {successful_steps / total_steps} ({successful_steps} out of {total_steps} attempts)")
Example #28
0
def test_store_get_experts():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [
            f'{LOCALHOST}:{node.port}'
            for node in random.sample(peers, min(3, len(peers)))
        ]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    first_peer = random.choice(peers)
    other_peer = random.choice(peers)

    expert_uids = [f"my_expert.{i}" for i in range(50)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        hivemind.declare_experts(
            first_peer, expert_uids[batch_start:batch_start + batch_size],
            'localhost:1234')

    found = other_peer.get_experts(
        random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None
               for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    other_expert, other_port = "my_other_expert.1337", random.randint(
        1000, 9999)
    hivemind.declare_experts(other_peer, [other_expert],
                             f'that_host:{other_port}')
    first_notfound, first_found = hivemind.get_experts(
        first_peer, ['foobar', other_expert])
    assert isinstance(first_found, hivemind.RemoteExpert)
    assert first_found.endpoint == f'that_host:{other_port}'

    for peer in peers:
        peer.shutdown()
Example #29
0
    def run_averager(index):
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                           initial_peers=[f"{LOCALHOST}:{dht_root.port}"],
                           start=True)
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        averager = hivemind.DecentralizedAverager(
            peer_tensors[index], dht, prefix='my_tensor', initial_group_bits=initial_bits, listen_on=f"{LOCALHOST}:*",
            compression_type=runtime_pb2.CompressionType.FLOAT16, target_group_size=target_group_size,
            averaging_expiration=averaging_expiration, request_timeout=request_timeout, start=True)
        processes.update({dht, averager})

        print(end=f'<started {index}>\n', flush=True)
        for _ in range(num_rounds):
            success = averager.step(timeout=round_timeout)
            print(end=('+' if success else '-'), flush=True)
        print(end=f'<finished {index}>\n', flush=True)
Example #30
0
def test_dht_single_node():
    node = hivemind.DHT(start=True, expiration=999)
    beam_search = MoEBeamSearcher(node, 'expert.')

    assert all(
        node.declare_experts(['expert.1', 'expert.2', 'expert.3'],
                             f"{hivemind.LOCALHOST}:1337").values())
    assert len(node.declare_experts(["ffn.1", "ffn.2"],
                                    endpoint="that_place")) == 4
    assert len(
        node.declare_experts(['e.1.2.3', 'e.1.2.5', 'e.2.0'],
                             f"{hivemind.LOCALHOST}:42")) == 7

    for expert in node.get_experts(['expert.3', 'expert.2']):
        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"

    assert all(
        node.declare_experts(['expert.5', 'expert.2'],
                             f"{hivemind.LOCALHOST}:1337").values())
    found_experts = beam_search.find_best_experts(
        [(0., 1., 2., 3., 4., 5., 6., 7., 8.)], beam_size=2)
    assert len(found_experts) == 2 and [
        expert.uid for expert in found_experts
    ] == ['expert.5', 'expert.3']

    successors = beam_search.get_active_successors(
        ['e.1.2.', 'e.2.', 'e.4.5.'])
    assert len(successors['e.1.2.']) == 2
    assert successors['e.1.2.'][3] == UidEndpoint('e.1.2.3', f'{LOCALHOST}:42')
    assert successors['e.1.2.'][5] == UidEndpoint('e.1.2.5', f'{LOCALHOST}:42')
    assert len(
        successors['e.2.']) == 1 and successors['e.2.'][0] == UidEndpoint(
            'e.2.0', f'{LOCALHOST}:42')
    assert successors['e.4.5.'] == {}

    initial_beam = beam_search.get_initial_beam((3, 2, 1, 0, -1, -2, -3),
                                                beam_size=3)
    assert len(initial_beam) == 3
    assert initial_beam[0][:2] == (2.0, 'expert.1.')
    assert initial_beam[1][:2] == (1.0, 'expert.2.')
    assert initial_beam[2][:2] == (0.0, 'expert.3.')

    with pytest.raises(AssertionError):
        beam_search = MoEBeamSearcher(node, 'expert.1.ffn')

    with pytest.raises(AssertionError):
        beam_search.get_active_successors(['e.1.2.', 'e.2', 'e.4.5.'])
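For reference, the initial-beam assertions above follow from simple bookkeeping: the scores are indexed by the first grid coordinate, and only coordinates with declared experts (expert.1, expert.2, expert.3, expert.5) are active, so the top-3 active prefixes are expert.1. (score 2), expert.2. (score 1) and expert.3. (score 0). A small sketch of that arithmetic (an illustration of the expected result, not hivemind's beam search):

scores = (3, 2, 1, 0, -1, -2, -3)
active_first_coords = [1, 2, 3, 5]   # coordinates with experts declared above
beam = sorted(((scores[c], f'expert.{c}.') for c in active_first_coords), reverse=True)[:3]
assert beam == [(2, 'expert.1.'), (1, 'expert.2.'), (0, 'expert.3.')]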