Example #1
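All snippets below are drawn from the hivemind test and benchmark suite. They assume imports along the following lines (a sketch based on the hivemind 0.9.x layout; the exact module paths for MoEBeamSearcher and UidEndpoint are an assumption and may differ between versions):

import asyncio
import random
import time

import numpy as np
import pytest
import torch

import hivemind
from hivemind import LOCALHOST, declare_experts, get_experts
# assumed locations in hivemind 0.9.x; adjust for your version
from hivemind.client.beam_search import MoEBeamSearcher
from hivemind.client.expert_uid import UidEndpoint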
def test_beam_search(dht_size=20,
                     total_experts=128,
                     batch_size=32,
                     initial_peers=3,
                     beam_size=4,
                     parallel_rpc=4,
                     grid_dims=(32, 32, 32)):
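    # bootstrap a small DHT swarm: each new node joins via a few random existing peers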
    dht = []
    for i in range(dht_size):
        neighbors_i = [
            f'{LOCALHOST}:{node.port}'
            for node in random.sample(dht, min(initial_peers, len(dht)))
        ]
        dht.append(
            hivemind.DHT(start=True,
                         initial_peers=neighbors_i,
                         parallel_rpc=parallel_rpc))

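    # sample unique expert UIDs on the grid and declare them to the DHT in batches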
    real_experts = sorted({
        'expert.' +
        '.'.join([str(random.randint(0, dim - 1)) for dim in grid_dims])
        for _ in range(total_experts)
    })
    for batch_start in range(0, len(real_experts), batch_size):
        declare_experts(
            random.choice(dht),
            real_experts[batch_start:batch_start + batch_size],
            wait=True,
            endpoint=f"host{batch_start // batch_size}:{random.randint(0, 65535)}")

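    # create one more client peer and run beam search over the expert grid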
    neighbors_i = [
        f'{LOCALHOST}:{node.port}'
        for node in random.sample(dht, min(initial_peers, len(dht)))
    ]
    you = hivemind.DHT(start=True,
                       initial_peers=neighbors_i,
                       parallel_rpc=parallel_rpc)
    beam_search = MoEBeamSearcher(you, 'expert.', grid_dims)

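    # single-input search must return exactly beam_size RemoteExpert instances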
    for i in range(10):
        topk_experts = beam_search.find_best_experts(
            [np.random.randn(dim) for dim in grid_dims], beam_size)
        assert all(isinstance(e, hivemind.RemoteExpert) for e in topk_experts)
        assert len(topk_experts) == beam_size

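    # batched search: one list of beam_size experts per row of the input batch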
    for i in range(10):
        batch_experts = beam_search.batch_find_best_experts(
            [np.random.randn(batch_size, dim) for dim in grid_dims],
            beam_size=beam_size)
        assert isinstance(batch_experts, list)
        assert len(batch_experts) == batch_size
        assert all(isinstance(e, hivemind.RemoteExpert)
                   for experts in batch_experts for e in experts)
        assert all(len(experts) == beam_size for experts in batch_experts)
Example #2
def test_dht_single_node():
    node = hivemind.DHT(start=True)
    beam_search = MoEBeamSearcher(node, 'expert.', grid_size=(10, ))

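    # declare experts under several prefixes; the result maps stored DHT keys to success flags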
    assert all(declare_experts(node, ['expert.1', 'expert.2', 'expert.3'],
                               f"{hivemind.LOCALHOST}:1337").values())
    assert len(declare_experts(node, ["ffn.1", "ffn.2"],
                               endpoint="that_place")) == 4
    assert len(declare_experts(node, ['e.1.2.3', 'e.1.2.5', 'e.2.0'],
                               f"{hivemind.LOCALHOST}:42")) == 7

    for expert in get_experts(node, ['expert.3', 'expert.2']):
        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"

    assert all(declare_experts(node, ['expert.5', 'expert.2'],
                               f"{hivemind.LOCALHOST}:1337").values())
    found_experts = beam_search.find_best_experts(
        [(0., 1., 2., 3., 4., 5., 6., 7., 8.)], beam_size=2)
    assert len(found_experts) == 2
    assert [expert.uid for expert in found_experts] == ['expert.5', 'expert.3']

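    # active successors: for each prefix, the next-level coordinates that actually hold experts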
    successors = beam_search.get_active_successors(
        ['e.1.2.', 'e.2.', 'e.4.5.'])
    assert len(successors['e.1.2.']) == 2
    assert successors['e.1.2.'][3] == UidEndpoint('e.1.2.3', f'{LOCALHOST}:42')
    assert successors['e.1.2.'][5] == UidEndpoint('e.1.2.5', f'{LOCALHOST}:42')
    assert len(successors['e.2.']) == 1
    assert successors['e.2.'][0] == UidEndpoint('e.2.0', f'{LOCALHOST}:42')
    assert successors['e.4.5.'] == {}

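    # initial beam: the top beam_size first-dimension prefixes, ranked by score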
    initial_beam = beam_search.get_initial_beam((3, 2, 1, 0, -1, -2, -3),
                                                beam_size=3)
    assert len(initial_beam) == 3
    assert initial_beam[0][:2] == (2.0, 'expert.1.')
    assert initial_beam[1][:2] == (1.0, 'expert.2.')
    assert initial_beam[2][:2] == (0.0, 'expert.3.')

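    # malformed prefixes (missing the trailing dot) must be rejected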
    with pytest.raises(AssertionError):
        beam_search = MoEBeamSearcher(node, 'expert.1.ffn', (2, 2))

    with pytest.raises(AssertionError):
        beam_search.get_active_successors(['e.1.2.', 'e.2', 'e.4.5.'])
Example #3
def test_store_get_experts():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [
            f'{LOCALHOST}:{node.port}'
            for node in random.sample(peers, min(3, len(peers)))
        ]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    first_peer = random.choice(peers)
    other_peer = random.choice(peers)

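    # declare 50 experts from one peer, then read them back through another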
    expert_uids = [f"my_expert.{i}" for i in range(50)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        hivemind.declare_experts(
            first_peer, expert_uids[batch_start:batch_start + batch_size],
            'localhost:1234')

    found = hivemind.get_experts(other_peer,
                                 random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None
               for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

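    # declarations made by any peer must be visible to every other peer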
    other_expert = "my_other_expert.1337"
    other_port = random.randint(1000, 9999)
    hivemind.declare_experts(other_peer, [other_expert],
                             f'that_host:{other_port}')
    first_notfound, first_found = hivemind.get_experts(
        first_peer, ['foobar', other_expert])
    assert first_notfound is None
    assert isinstance(first_found, hivemind.RemoteExpert)
    assert first_found.endpoint == f'that_host:{other_port}'

    for peer in peers:
        peer.shutdown()
Example #4
@pytest.mark.asyncio  # assumption: coroutine tests run under pytest-asyncio
async def test_negative_caching():
    peers = []
    for i in range(10):
        neighbors_i = [
            f'{LOCALHOST}:{node.port}'
            for node in random.sample(peers, min(3, len(peers)))
        ]
        peers.append(
            hivemind.DHT(initial_peers=neighbors_i,
                         cache_locally=False,
                         start=True))

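    # the writer declares experts only under the ffn.1.* and ffn.3.* prefixes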
    writer_peer = random.choice(peers)
    assert all(
        hivemind.declare_experts(writer_peer, ['ffn.1.2.3', 'ffn.3.4.5'],
                                 'myaddr:1234').values())

    neighbors_i = [
        f'{LOCALHOST}:{node.port}'
        for node in random.sample(peers, min(3, len(peers)))
    ]
    neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i,
                                    cache_locally=False,
                                    start=True)
    beam_search = MoEBeamSearcher(neg_caching_peer,
                                  uid_prefix='ffn.',
                                  grid_size=(10, 10, 10),
                                  negative_caching=True)
    # query prefixes via the negative-caching peer; this caches "no data"
    # entries for the empty prefixes ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
    assert len(beam_search.get_initial_beam(scores=[.1, .2, .3, .4, .5, .6],
                                            beam_size=3)) == 2

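    # a fresh node should see cached entries (real or negative) for ffn.0. through ffn.5., and nothing beyond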
    node = await hivemind.DHTNode.create(initial_peers=neighbors_i)
    fetched = await asyncio.gather(*(node.get(f'ffn.{i}.') for i in range(10)))
    for i in range(6):
        assert fetched[i] is not None, f"node should have cached ffn.{i}."
    for i in range(6, len(fetched)):
        assert fetched[i] is None, f"node shouldn't have cached ffn.{i}."
Example #5
def test_beam_search_correctness():
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10)
        for k in range(10)
    ]
    dht = hivemind.DHT(start=True)
    assert all(
        hivemind.declare_experts(dht, all_expert_uids,
                                 endpoint='fake-endpoint').values())

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32,
                                           grid_size=(32, 32, 32),
                                           dht=dht,
                                           k_best=4,
                                           uid_prefix='ffn.')

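    # compare beam search against exhaustive scoring of all declared experts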
    for i in range(25):
        x = torch.randn(32)
        grid_scores = dmoe.proj(x).split_with_sizes(
            dmoe.beam_search.grid_size, dim=-1)

        chosen_experts = dmoe.beam_search.find_best_experts(
            [tensor.detach().numpy() for tensor in grid_scores],
            beam_size=dmoe.k_best)
        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores],
            [chosen_experts])[0]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())

        # reference: independently find :beam_size: best experts with exhaustive search
        all_scores = dmoe.compute_expert_scores(
            [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(),
                                  reverse=True)[:len(chosen_experts)]

        assert np.allclose(true_best_scores, our_best_scores)
Example #6
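# Assumes `trange` from tqdm, a module-level `logger`, and a `random_endpoint()`
# helper defined elsewhere in the benchmark script.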
def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int,
                  expert_batch_size: int, random_seed: int,
                  wait_after_request: float, wait_before_read: float,
                  wait_timeout: float, expiration: float):
    random.seed(random_seed)

    print("Creating peers...")
    peers = []
    for _ in trange(num_peers):
        neighbors = [
            f'0.0.0.0:{node.port}'
            for node in random.sample(peers, min(initial_peers, len(peers)))
        ]
        peer = hivemind.DHT(initial_peers=neighbors,
                            start=True,
                            wait_timeout=wait_timeout,
                            listen_on='0.0.0.0:*')
        peers.append(peer)

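    # use the last two peers as a dedicated writer and a dedicated reader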
    store_peer, get_peer = peers[-2:]

    expert_uids = list(
        set(f"expert.{random.randint(0, 999)}.{random.randint(0, 999)}.{random.randint(0, 999)}"
            for _ in range(num_experts)))
    print(f"Sampled {len(expert_uids)} unique ids (after deduplication)")
    random.shuffle(expert_uids)

    print(f"Storing experts to dht in batches of {expert_batch_size}...")
    successful_stores = total_stores = total_store_time = 0
    benchmark_started = time.perf_counter()
    endpoints = []

    for start in trange(0, len(expert_uids), expert_batch_size):
        store_start = time.perf_counter()
        endpoints.append(random_endpoint())
        store_ok = hivemind.declare_experts(
            store_peer, expert_uids[start:start + expert_batch_size],
            endpoints[-1], expiration=expiration)
        successes = store_ok.values()
        total_store_time += time.perf_counter() - store_start

        total_stores += len(successes)
        successful_stores += sum(successes)
        time.sleep(wait_after_request)

    print(
        f"Store success rate: {successful_stores / total_stores * 100:.1f}% ({successful_stores} / {total_stores})"
    )
    print(
        f"Mean store time: {total_store_time / total_stores:.5f}, Total: {total_store_time:.5f}"
    )
    time.sleep(wait_before_read)

    if time.perf_counter() - benchmark_started > expiration:
        logger.warning(
            "All keys expired before the benchmark started reading them. Consider increasing the expiration parameter"
        )

    successful_gets = total_get_time = 0

    for start in trange(0, len(expert_uids), expert_batch_size):
        get_start = time.perf_counter()
        get_result = hivemind.get_experts(
            get_peer, expert_uids[start:start + expert_batch_size])
        total_get_time += time.perf_counter() - get_start

        for i, expert in enumerate(get_result):
            if expert is not None and expert.uid == expert_uids[start + i] \
                    and expert.endpoint == endpoints[start // expert_batch_size]:
                successful_gets += 1

    if time.perf_counter() - benchmark_started > expiration:
        logger.warning(
            "Keys expired midway through the get requests. If that is not desired, increase the expiration parameter"
        )

    print(
        f"Get success rate: {successful_gets / len(expert_uids) * 100:.1f}% ({successful_gets} / {len(expert_uids)})"
    )
    print(
        f"Mean get time: {total_get_time / len(expert_uids):.5f}, Total: {total_get_time:.5f}"
    )

    alive_peers = [peer for peer in peers if peer.is_alive()]
    print(f"Node survival rate: {len(alive_peers) / len(peers) * 100:.3f}%")
Example #7
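# Deprecated convenience method of hivemind.DHT: it forwards to the module-level function.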
def declare_experts(self, uids, endpoint, wait: bool = True):
    logger.warning(
        "dht.declare_experts is scheduled for removal in 0.9.8, please use hivemind.declare_experts."
    )
    return hivemind.declare_experts(self, uids, endpoint, wait=wait)