def test_beam_search(dht_size=20, total_experts=128, batch_size=32, initial_peers=3, beam_size=4,
                     parallel_rpc=4, grid_dims=(32, 32, 32)):
    dht = []
    for i in range(dht_size):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(dht, min(initial_peers, len(dht)))]
        dht.append(hivemind.DHT(start=True, initial_peers=neighbors_i, parallel_rpc=parallel_rpc))

    real_experts = sorted({
        'expert.' + '.'.join([str(random.randint(0, dim - 1)) for dim in grid_dims])
        for _ in range(total_experts)
    })
    for batch_start in range(0, len(real_experts), batch_size):
        declare_experts(random.choice(dht), real_experts[batch_start: batch_start + batch_size], wait=True,
                        endpoint=f"host{batch_start // batch_size}:{random.randint(0, 65535)}")

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(dht, min(initial_peers, len(dht)))]
    you = hivemind.DHT(start=True, initial_peers=neighbors_i, parallel_rpc=parallel_rpc)
    beam_search = MoEBeamSearcher(you, 'expert.', grid_dims)

    for i in range(10):
        topk_experts = beam_search.find_best_experts([np.random.randn(dim) for dim in grid_dims], beam_size)
        assert all(isinstance(e, hivemind.RemoteExpert) for e in topk_experts)
        assert len(topk_experts) == beam_size

    for i in range(10):
        batch_experts = beam_search.batch_find_best_experts([np.random.randn(batch_size, dim) for dim in grid_dims],
                                                            beam_size=beam_size)
        assert isinstance(batch_experts, list) and len(batch_experts) == batch_size
        assert all(isinstance(e, hivemind.RemoteExpert) for experts in batch_experts for e in experts)
        assert all(len(experts) == beam_size for experts in batch_experts)
def test_dht_single_node():
    node = hivemind.DHT(start=True)
    beam_search = MoEBeamSearcher(node, 'expert.', grid_size=(10,))

    assert all(declare_experts(node, ['expert.1', 'expert.2', 'expert.3'], f"{hivemind.LOCALHOST}:1337").values())
    assert len(declare_experts(node, ["ffn.1", "ffn.2"], endpoint="that_place")) == 4
    assert len(declare_experts(node, ['e.1.2.3', 'e.1.2.5', 'e.2.0'], f"{hivemind.LOCALHOST}:42")) == 7

    for expert in get_experts(node, ['expert.3', 'expert.2']):
        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"

    assert all(declare_experts(node, ['expert.5', 'expert.2'], f"{hivemind.LOCALHOST}:1337").values())
    found_experts = beam_search.find_best_experts([(0., 1., 2., 3., 4., 5., 6., 7., 8.)], beam_size=2)
    assert len(found_experts) == 2 and [expert.uid for expert in found_experts] == ['expert.5', 'expert.3']

    successors = beam_search.get_active_successors(['e.1.2.', 'e.2.', 'e.4.5.'])
    assert len(successors['e.1.2.']) == 2
    assert successors['e.1.2.'][3] == UidEndpoint('e.1.2.3', f'{LOCALHOST}:42')
    assert successors['e.1.2.'][5] == UidEndpoint('e.1.2.5', f'{LOCALHOST}:42')
    assert len(successors['e.2.']) == 1 and successors['e.2.'][0] == UidEndpoint('e.2.0', f'{LOCALHOST}:42')
    assert successors['e.4.5.'] == {}

    initial_beam = beam_search.get_initial_beam((3, 2, 1, 0, -1, -2, -3), beam_size=3)
    assert len(initial_beam) == 3
    assert initial_beam[0][:2] == (2.0, 'expert.1.')
    assert initial_beam[1][:2] == (1.0, 'expert.2.')
    assert initial_beam[2][:2] == (0.0, 'expert.3.')

    with pytest.raises(AssertionError):
        beam_search = MoEBeamSearcher(node, 'expert.1.ffn', (2, 2))

    with pytest.raises(AssertionError):
        beam_search.get_active_successors(['e.1.2.', 'e.2', 'e.4.5.'])
def test_store_get_experts():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    first_peer = random.choice(peers)
    other_peer = random.choice(peers)

    expert_uids = [f"my_expert.{i}" for i in range(50)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        hivemind.declare_experts(first_peer, expert_uids[batch_start: batch_start + batch_size], 'localhost:1234')

    found = other_peer.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    other_expert, other_port = "my_other_expert.1337", random.randint(1000, 9999)
    hivemind.declare_experts(other_peer, [other_expert], f'that_host:{other_port}')
    first_notfound, first_found = hivemind.get_experts(first_peer, ['foobar', other_expert])
    assert isinstance(first_found, hivemind.RemoteExpert)
    assert first_found.endpoint == f'that_host:{other_port}'

    for peer in peers:
        peer.shutdown()
# note: running this coroutine as a test assumes the pytest-asyncio plugin is available
@pytest.mark.asyncio
async def test_negative_caching():
    peers = []
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, cache_locally=False, start=True))

    writer_peer = random.choice(peers)
    assert all(hivemind.declare_experts(writer_peer, ['ffn.1.2.3', 'ffn.3.4.5'], 'myaddr:1234').values())

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
    neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i, cache_locally=False, start=True)
    beam_search = MoEBeamSearcher(neg_caching_peer, uid_prefix='ffn.', grid_size=(10, 10, 10),
                                  negative_caching=True)
    # get prefixes via the peer with negative caching; this caches "no data" entries for ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
    assert len(beam_search.get_initial_beam(scores=[.1, .2, .3, .4, .5, .6], beam_size=3)) == 2

    node = await hivemind.DHTNode.create(initial_peers=neighbors_i)
    fetched = await asyncio.gather(*(node.get(f'ffn.{i}.') for i in range(10)))
    for i in range(6):
        assert fetched[i] is not None, f"node should have cached ffn.{i}."
    for i in range(6, len(fetched)):
        assert fetched[i] is None, f"node shouldn't have cached ffn.{i}."
def test_beam_search_correctness():
    all_expert_uids = [f'ffn.{5 + i}.{10 + j}.{15 + k}'
                       for i in range(10) for j in range(10) for k in range(10)]
    dht = hivemind.DHT(start=True)
    assert all(hivemind.declare_experts(dht, all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4,
                                           uid_prefix='ffn.')

    for i in range(25):
        input = torch.randn(32)
        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.beam_search.grid_size, dim=-1)

        chosen_experts = dmoe.beam_search.find_best_experts(
            [tensor.detach().numpy() for tensor in grid_scores], beam_size=dmoe.k_best)

        chosen_scores = dmoe.compute_expert_scores([dim_scores[None] for dim_scores in grid_scores],
                                                   [chosen_experts])[0]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())

        # reference: independently find :beam_size: best experts with exhaustive search
        all_scores = dmoe.compute_expert_scores([dim_scores.unsqueeze(0) for dim_scores in grid_scores],
                                                [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[:len(chosen_experts)]

        assert np.allclose(true_best_scores, our_best_scores)
def benchmark_dht(num_peers: int, initial_peers: int, num_experts: int, expert_batch_size: int, random_seed: int,
                  wait_after_request: float, wait_before_read: float, wait_timeout: float, expiration: float):
    random.seed(random_seed)

    print("Creating peers...")
    peers = []
    for _ in trange(num_peers):
        neighbors = [f'0.0.0.0:{node.port}' for node in random.sample(peers, min(initial_peers, len(peers)))]
        peer = hivemind.DHT(initial_peers=neighbors, start=True, wait_timeout=wait_timeout, listen_on='0.0.0.0:*')
        peers.append(peer)

    store_peer, get_peer = peers[-2:]

    expert_uids = list(set(f"expert.{random.randint(0, 999)}.{random.randint(0, 999)}.{random.randint(0, 999)}"
                           for _ in range(num_experts)))
    print(f"Sampled {len(expert_uids)} unique ids (after deduplication)")
    random.shuffle(expert_uids)

    print(f"Storing experts to dht in batches of {expert_batch_size}...")
    successful_stores = total_stores = total_store_time = 0
    benchmark_started = time.perf_counter()
    endpoints = []

    for start in trange(0, num_experts, expert_batch_size):
        store_start = time.perf_counter()
        endpoints.append(random_endpoint())
        store_ok = hivemind.declare_experts(store_peer, expert_uids[start: start + expert_batch_size],
                                            endpoints[-1], expiration=expiration)
        successes = store_ok.values()
        total_store_time += time.perf_counter() - store_start

        total_stores += len(successes)
        successful_stores += sum(successes)
        time.sleep(wait_after_request)

    print(f"Store success rate: {successful_stores / total_stores * 100:.1f}% "
          f"({successful_stores} / {total_stores})")
    print(f"Mean store time: {total_store_time / total_stores:.5f}, Total: {total_store_time:.5f}")
    time.sleep(wait_before_read)

    if time.perf_counter() - benchmark_started > expiration:
        logger.warning("All keys expired before benchmark started getting them. Consider increasing expiration_time")

    successful_gets = total_get_time = 0

    for start in trange(0, len(expert_uids), expert_batch_size):
        get_start = time.perf_counter()
        get_result = hivemind.get_experts(get_peer, expert_uids[start: start + expert_batch_size])
        total_get_time += time.perf_counter() - get_start

        for i, expert in enumerate(get_result):
            if expert is not None and expert.uid == expert_uids[start + i] \
                    and expert.endpoint == endpoints[start // expert_batch_size]:
                successful_gets += 1

    if time.perf_counter() - benchmark_started > expiration:
        logger.warning("keys expired midway during get requests. If that isn't desired, increase expiration_time param")

    print(f"Get success rate: {successful_gets / len(expert_uids) * 100:.1f}% "
          f"({successful_gets} / {len(expert_uids)})")
    print(f"Mean get time: {total_get_time / len(expert_uids):.5f}, Total: {total_get_time:.5f}")

    # count peers that are still alive (the original list comprehension of booleans always yielded 100%)
    alive_peers = [peer for peer in peers if peer.is_alive()]
    print(f"Node survival rate: {len(alive_peers) / len(peers) * 100:.3f}%")
def declare_experts(self, uids, endpoint, wait: bool = True):
    logger.warning("dht.declare_experts is scheduled for removal in 0.9.8, please use hivemind.declare_experts.")
    return hivemind.declare_experts(self, uids, endpoint, wait=wait)