def test_store_get_experts():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    you: hivemind.dht.DHT = random.choice(peers)
    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)

    expert_uids = [f"my_expert.{i}" for i in range(110)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        you.declare_experts(expert_uids[batch_start: batch_start + batch_size], 'localhost:1234')

    found = theguyshetoldyounottoworryabout.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    that_guys_expert, that_guys_port = "my_other_expert.1337", random.randint(1000, 9999)
    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], f'that_host:{that_guys_port}')
    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
    assert isinstance(you_found, hivemind.RemoteExpert)
    assert you_found.endpoint == f'that_host:{that_guys_port}'

    for peer in peers:
        peer.shutdown()

def test_beam_search(dht_size=20, total_experts=128, batch_size=32, initial_peers=3, beam_size=4,
                     parallel_rpc=16, grid_dims=(32, 32, 32)):
    dht = []
    for i in range(dht_size):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(dht, min(initial_peers, len(dht)))]
        dht.append(hivemind.DHT(start=True, expiration=999999, initial_peers=neighbors_i, parallel_rpc=parallel_rpc))

    real_experts = sorted({
        'expert.' + '.'.join([str(random.randint(0, dim - 1)) for dim in grid_dims])
        for _ in range(total_experts)
    })
    for batch_start in range(0, len(real_experts), batch_size):
        random.choice(dht).declare_experts(
            real_experts[batch_start: batch_start + batch_size], wait=True,
            endpoint=f"host{batch_start // batch_size}:{random.randint(0, 65536)}")

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(dht, min(initial_peers, len(dht)))]
    you = hivemind.DHT(start=True, expiration=999999, initial_peers=neighbors_i, parallel_rpc=parallel_rpc)

    for i in range(50):
        topk_experts = you.find_best_experts('expert.', [np.random.randn(dim) for dim in grid_dims],
                                             beam_size=beam_size)
        assert all(isinstance(e, hivemind.RemoteExpert) for e in topk_experts)
        assert len(topk_experts) == beam_size

    for i in range(10):
        batch_experts = you.batch_find_best_experts('expert.', [np.random.randn(batch_size, dim) for dim in grid_dims],
                                                    beam_size=beam_size)
        assert isinstance(batch_experts, list) and len(batch_experts) == batch_size
        assert all(isinstance(e, hivemind.RemoteExpert) for experts in batch_experts for e in experts)
        assert all(len(experts) == beam_size for experts in batch_experts)

def test_negative_caching():
    test_success = mp.Event()

    peers = []
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, negative_caching=False, cache_locally=False, start=True))

    normal_peer, writer_peer = random.sample(peers, 2)

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
    neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i, negative_caching=True, cache_locally=False, start=True)
    assert all(writer_peer.declare_experts(['ffn.1.2.3', 'ffn.3.4.5'], 'myaddr:1234').values())
    # get prefixes by the peer with negative caching. Cache "no data" entries for ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
    assert len(neg_caching_peer.get_initial_beam(prefix='ffn.', scores=[.1, .2, .3, .4, .5, .6], beam_size=3)) == 2

    async def _tester():
        node = await hivemind.DHTNode.create(initial_peers=neighbors_i)
        fetched = await asyncio.gather(*(node.get(f'ffn.{i}.') for i in range(10)))
        for i in range(6):
            assert fetched[i] is not None, f"node should have cached ffn.{i}."
        for i in range(6, len(fetched)):
            assert fetched[i] is None, f"node shouldn't have cached ffn.{i}."
        test_success.set()

    proc = mp.Process(target=lambda: asyncio.run(_tester()))
    proc.start()
    proc.join()
    assert test_success.is_set()

def test_sending_validator_instance_between_processes():
    alice = hivemind.DHT(start=True)
    bob = hivemind.DHT(start=True, initial_peers=[f"{LOCALHOST}:{alice.port}"])

    alice.add_validators([SchemaValidator(SampleSchema)])
    bob.add_validators([SchemaValidator(SampleSchema)])

    assert bob.store('experiment_name', b'foo_bar', get_dht_time() + 10)
    assert not bob.store('experiment_name', 777, get_dht_time() + 10)
    assert alice.get('experiment_name', latest=True).value == b'foo_bar'

def test_hivemind_dht():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    you: hivemind.dht.DHT = random.choice(peers)
    theguyshetoldyounottoworryabout: hivemind.dht.DHT = random.choice(peers)

    expert_uids = [str(uuid.uuid4()) for _ in range(110)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        you.declare_experts(expert_uids[batch_start:batch_start + batch_size], 'localhost', 1234)

    found = theguyshetoldyounottoworryabout.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    that_guys_expert, that_guys_port = str(uuid.uuid4()), random.randint(1000, 9999)
    theguyshetoldyounottoworryabout.declare_experts([that_guys_expert], f'that_host:{that_guys_port}')
    you_notfound, you_found = you.get_experts(['foobar', that_guys_expert])
    assert isinstance(you_found, hivemind.RemoteExpert)
    assert you_found.endpoint == f'that_host:{that_guys_port}'

    # test first_k_active
    assert list(theguyshetoldyounottoworryabout.first_k_active(expert_uids, k=10)) == expert_uids[:10]

    some_permuted_experts = random.sample(expert_uids, k=32)
    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=32)) == some_permuted_experts
    assert list(theguyshetoldyounottoworryabout.first_k_active(some_permuted_experts, k=1)) == some_permuted_experts[:1]

    fake_and_real_experts = list(chain(
        *zip([str(uuid.uuid4()) for _ in some_permuted_experts], some_permuted_experts)))
    assert list(theguyshetoldyounottoworryabout.first_k_active(fake_and_real_experts, k=9)) == some_permuted_experts[:9]

    for peer in peers:
        peer.shutdown()

def test_dht_get_address(addr=LOCALHOST, dummy_endpoint='123.45.67.89:*'):
    node1 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*")
    node2 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*", initial_peers=[f"{addr}:{node1.port}"])
    node3 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*", initial_peers=[f"{addr}:{node2.port}"])
    assert addr in node3.get_visible_address(num_peers=2)

    node4 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*")
    with pytest.raises(ValueError):
        node4.get_visible_address()
    assert node4.get_visible_address(peers=[f'{addr}:{node1.port}']).endswith(addr)

    node5 = hivemind.DHT(start=True, listen_on=f"0.0.0.0:*", endpoint=f"{dummy_endpoint}")
    assert node5.get_visible_address() == strip_port(dummy_endpoint)

def test_allreduce_grid():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [
        hivemind.DecentralizedAverager(
            averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2, prefix='mygroup',
            initial_group_bits=bin(i // 2)[2:].rjust(2, '0'), start=True)
        for i in range(8)
    ]

    [means0], [stds0] = compute_mean_std(averagers)
    assert not torch.allclose(stds0, torch.zeros_like(stds0))

    prev_means, prev_stds = means0, stds0
    for i in range(5):
        step_futures = [averager.step(wait=False) for averager in averagers]
        groups = [future.result() for future in step_futures]
        [means], [stds] = compute_mean_std(averagers)
        assert torch.allclose(means, prev_means, atol=1e-6, rtol=0)
        assert all(len(group) == 2 for group in groups)

        if i <= 2:
            assert torch.all(torch.le(stds, prev_stds))
        else:
            assert torch.allclose(stds, torch.zeros_like(stds), atol=1e-6, rtol=0)

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()

def test_compute_expert_scores():
    try:
        dht = hivemind.DHT(start=True)
        moe = hivemind.client.moe.RemoteMixtureOfExperts(
            dht=dht, in_features=16, grid_size=(40,), k_best=4, k_min=1, timeout_after_k_min=1, uid_prefix='expert.')

        gx, gy = torch.randn(4, 5, requires_grad=True), torch.randn(4, 3, requires_grad=True)
        ii = [[4, 0, 2], [3, 1, 1, 1, 3], [0], [3, 2]]
        jj = [[2, 2, 1], [0, 1, 2, 0, 1], [0], [1, 2]]
        batch_experts = [
            [hivemind.RemoteExpert(uid=f'expert.{ii[batch_i][expert_i]}.{jj[batch_i][expert_i]}', endpoint="[::]:1337")
             for expert_i in range(len(ii[batch_i]))]
            for batch_i in range(len(ii))
        ]  # note: these experts do not exist on a server; we only use them to test moe.compute_expert_scores

        logits = moe.compute_expert_scores([gx, gy], batch_experts)
        torch.softmax(logits, dim=-1).norm(dim=-1).mean().backward()
        assert gx.grad.norm().item() > 0 and gy.grad.norm().item() > 0, "compute_expert_scores didn't backprop"

        for batch_i in range(len(ii)):
            for expert_i in range(len(ii[batch_i])):
                assert torch.allclose(logits[batch_i, expert_i],
                                      gx[batch_i, ii[batch_i][expert_i]] + gy[batch_i, jj[batch_i][expert_i]]), \
                    "compute_expert_scores returned incorrect score"
    finally:
        dht.shutdown()

def test_allgather():
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
    averagers = [
        hivemind.DecentralizedAverager(
            torch.ones(1), dht=dht, target_group_size=4, averaging_expiration=15, prefix='mygroup',
            initial_group_bits='000', listen_on='127.0.0.1:*', start=True)
        for _ in range(8)
    ]

    futures = []
    for i, averager in enumerate(averagers):
        futures.append(averager.step(wait=False, gather=dict(batch_size=123 + i, foo='bar')))

    assert len(set(repr(sorted(future.result())) for future in futures)) == 2

    reference_metadata = {
        averager.endpoint: dict(batch_size=123 + i, foo='bar') for i, averager in enumerate(averagers)
    }
    for future in futures:
        gathered = future.result()
        assert len(gathered) == 4
        for endpoint in gathered:
            assert gathered[endpoint] == reference_metadata[endpoint]

def test_allreduce_once():
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')

    tensors1 = [torch.randn(123), torch.zeros(3)]
    tensors2 = [torch.rand(123), torch.ones(3)]
    tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
    tensors4 = [torch.randn(123) ** 3, torch.arange(3).to(torch.float32) / 2]
    reference = [(tensors1[i] + tensors2[i] + tensors3[i] + tensors4[i]) / 4 for i in range(len(tensors1))]

    averagers = [
        hivemind.DecentralizedAverager(
            tensors, dht=dht, target_group_size=4, averaging_expiration=15, prefix='mygroup',
            initial_group_bits='0110', listen_on='127.0.0.1:*', start=True)
        for tensors in [tensors1, tensors2, tensors3, tensors4]
    ]

    futures = []
    for averager in averagers:
        futures.append(averager.step(wait=False))
    for future in futures:
        result = future.result()
        for averager in averagers:
            assert averager.endpoint in result

    for averager in averagers:
        with averager.get_tensors() as averaged_tensors:
            for ref, our in zip(reference, averaged_tensors):
                assert torch.allclose(ref, our, atol=1e-6)

def test_get_store():
    peers = []
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    node1, node2 = random.sample(peers, 2)
    assert node1.store('key1', 'value1', expiration_time=hivemind.get_dht_time() + 30)
    assert node1.get('key1').value == 'value1'
    assert node2.get('key1').value == 'value1'
    assert node2.get('key2') is None

    future = node1.get('foo', return_future=True)
    assert future.result() is None

    future = node1.get('foo', return_future=True)
    future.cancel()

    assert node2.store('key1', 123, expiration_time=hivemind.get_dht_time() + 31)
    assert node2.store('key2', 456, expiration_time=hivemind.get_dht_time() + 32)
    assert node1.get('key1', latest=True).value == 123
    assert node1.get('key2').value == 456

    assert node1.store('key2', subkey='subkey1', value=789, expiration_time=hivemind.get_dht_time() + 32)
    assert node2.store('key2', subkey='subkey2', value='pew', expiration_time=hivemind.get_dht_time() + 32)
    found_dict = node1.get('key2', latest=True).value
    assert isinstance(found_dict, dict) and len(found_dict) == 2
    assert found_dict['subkey1'].value == 789 and found_dict['subkey2'].value == 'pew'

    for peer in peers:
        peer.shutdown()

def test_dht_add_validators(validators_for_app):
    # One app may create a DHT with its validators
    dht = hivemind.DHT(start=False, record_validators=validators_for_app['A'])

    # While the DHT process is not started, you can't send a command to append new validators
    with pytest.raises(RuntimeError):
        dht.add_validators(validators_for_app['B'])
    dht.run_in_background(await_ready=True)

    # After starting the process, other apps may add new validators to the existing DHT
    dht.add_validators(validators_for_app['B'])

    assert dht.store('field_a', b'bytes_value', hivemind.get_dht_time() + 10)
    assert dht.get('field_a', latest=True).value == b'bytes_value'

    assert not dht.store('field_a', 666, hivemind.get_dht_time() + 10)
    assert dht.get('field_a', latest=True).value == b'bytes_value'

    local_public_key = validators_for_app['A'][0].local_public_key
    assert dht.store('field_b', 777, hivemind.get_dht_time() + 10, subkey=local_public_key)
    dictionary = dht.get('field_b', latest=True).value
    assert len(dictionary) == 1 and dictionary[local_public_key].value == 777

    assert not dht.store('unknown_key', 666, hivemind.get_dht_time() + 10)
    assert dht.get('unknown_key', latest=True) is None

def test_multiple_peers(num_processes, wait_seconds):
    """Test that several processes sharing the same initial peers connect and train successfully."""
    dht_root = hivemind.DHT(start=True)
    barrier = mp.Barrier(num_processes)
    initial_peers = dht_root.get_visible_maddrs()

    with mp.Manager() as manager:
        # allows processes to return their recorded logged peers/steps
        recorded_process_peers = manager.list()
        recorded_process_steps = manager.list()

        processes = [
            mp.Process(
                target=_run_collab_training_fn,
                kwargs=dict(
                    initial_peers=initial_peers,
                    wait_seconds=wait_seconds,
                    barrier=barrier,
                    recorded_process_peers=recorded_process_peers,
                    recorded_process_steps=recorded_process_steps,
                ),
            )
            for _ in range(num_processes)
        ]

        for process in processes:
            process.start()
        for process in processes:
            process.join()

        # assert that peers increase as expected and we run at least 1 global step
        for process_peers, process_steps in zip(recorded_process_peers, recorded_process_steps):
            assert any(num_peer == num_processes for num_peer in process_peers)
            assert any(global_step > 0 for global_step in process_steps)

def test_beam_search_correctness():
    all_expert_uids = [f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10) for k in range(10)]
    dht = hivemind.DHT(start=True, expiration=999)
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4,
                                           uid_prefix='ffn.')

    for i in range(25):
        input = torch.randn(32)
        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.grid_size, dim=-1)

        chosen_experts = dht.find_best_experts(
            dmoe.uid_prefix, [tensor.detach().numpy() for tensor in grid_scores], beam_size=dmoe.k_best)
        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores], [chosen_experts])[0]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())

        # reference: independently find :beam_size: best experts with exhaustive search
        all_scores = dmoe.compute_expert_scores(
            [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[:len(chosen_experts)]

        assert np.allclose(true_best_scores, our_best_scores)

async def test_key_manager():
    key_manager = GroupKeyManager(hivemind.DHT(start=True), endpoint='localhvost', prefix='test_averaging',
                                  initial_group_bits='10110', target_group_size=2)

    t = hivemind.get_dht_time()
    key = key_manager.current_key
    await key_manager.declare_averager(key, 'localhvost', expiration_time=t + 60)
    await key_manager.declare_averager(key, 'localhvost2', expiration_time=t + 61)

    q1 = await key_manager.get_averagers(key, only_active=True)

    await key_manager.declare_averager(key, 'localhvost', expiration_time=t + 66)
    q2 = await key_manager.get_averagers(key, only_active=True)

    await key_manager.declare_averager(key, 'localhvost2', expiration_time=t + 61, looking_for_group=False)
    q3 = await key_manager.get_averagers(key, only_active=True)
    q4 = await key_manager.get_averagers(key, only_active=False)

    q5 = await key_manager.get_averagers('nonexistent_key.0b0101', only_active=False)

    assert len(q1) == 2 and ('localhvost', t + 60) in q1 and ('localhvost2', t + 61) in q1
    assert len(q2) == 2 and ('localhvost', t + 66) in q2 and ('localhvost2', t + 61) in q2
    assert len(q3) == 1 and ('localhvost', t + 66) in q3
    assert len(q4) == 2 and ('localhvost', t + 66) in q4 and ('localhvost2', t + 61) in q4
    assert len(q5) == 0

def test_getset_averagers():
    dht = hivemind.DHT(start=True)

    t = hivemind.get_dht_time()
    dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost', expiration_time=t + 60)
    dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost2', expiration_time=t + 61)
    q1 = dht.get_averagers('bucket.0b10110', only_active=True)

    dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost', expiration_time=t + 66)
    q2 = dht.get_averagers('bucket.0b10110', only_active=True)

    dht.declare_averager(group_key='bucket.0b10110', endpoint='localhvost2', looking_for_group=False,
                         expiration_time=t + 61)
    q3 = dht.get_averagers('bucket.0b10110', only_active=True)
    q4 = dht.get_averagers('bucket.0b10110', only_active=False)

    assert len(q1) == 2 and ('localhvost', t + 60) in q1 and ('localhvost2', t + 61) in q1
    assert len(q2) == 2 and ('localhvost', t + 66) in q2 and ('localhvost2', t + 61) in q2
    assert len(q3) == 1 and ('localhvost', t + 66) in q3
    assert len(q4) == 2 and ('localhvost', t + 66) in q4 and ('localhvost2', t + 61) in q4

def test_moe():
    all_expert_uids = [
        f'ffn.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}' for _ in range(20)
    ]
    with background_server(expert_uids=all_expert_uids, device='cpu', expert_cls='ffn',
                           num_handlers=1, hidden_dim=16) as (server_endpoint, dht_endpoint):
        dht = hivemind.DHT(start=True, expiration=999, initial_peers=[dht_endpoint])

        # declare expert uids. Server *should* declare them by itself, but it takes time.
        assert all(dht.declare_experts(all_expert_uids, endpoint=server_endpoint))

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(32, 32, 32), dht=dht, k_best=3,
                                               uid_prefix='ffn')

        for i in range(10):
            out = dmoe(torch.randn(10, 16))
            out.sum().backward()

def test_moe_beam_search():
    all_expert_uids = [f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10) for k in range(10)]
    dht = hivemind.DHT(start=True, expiration=999)
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4,
                                           uid_prefix='ffn')

    for i in range(25):
        input = torch.randn(32)
        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.grid_size, dim=-1)

        chosen_experts = dmoe.loop.run_until_complete(dmoe.beam_search(grid_scores, k_best=dmoe.k_best))
        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores], [chosen_experts])[0]

        all_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[:len(chosen_experts)]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())
        assert np.allclose(true_best_scores, our_best_scores)

def test_load_state_from_peers():
    num_calls = 0
    super_metadata = dict(x=123)
    super_tensors = (torch.randn(3), torch.randint(0, 5, (3,)))

    class TestAverager(hivemind.DecentralizedAverager):
        def get_current_state(self):
            """
            Get current state and send it to a peer. Executed in the host process. Meant to be overridden.
            :returns: a tuple of (serializable_small_metadata, sequence of torch tensors)
            """
            nonlocal num_calls, super_metadata, super_tensors
            num_calls += 1
            return super_metadata, super_tensors

    dht_root = hivemind.DHT(start=True)
    initial_peers = [f'{hivemind.LOCALHOST}:{dht_root.port}']
    dht1 = hivemind.DHT(initial_peers=initial_peers, start=True)
    averager1 = TestAverager([torch.randn(3), torch.rand(5)], dht=dht1, start=True,
                             prefix='demo-run', target_group_size=2)

    dht2 = hivemind.DHT(initial_peers=initial_peers, start=True)
    dht2.get('demo-run.all_averagers')
    averager2 = TestAverager([torch.randn(3), torch.rand(5)], dht=dht2, start=True,
                             prefix='demo-run', target_group_size=2)

    assert num_calls == 0
    got_metadata, got_tensors = averager2.load_state_from_peers()
    assert num_calls == 1
    assert got_metadata == super_metadata
    assert all(map(torch.allclose, got_tensors, super_tensors))

    super_metadata['y'] = 123
    super_tensors[1][2] = 9
    assert num_calls == 1
    assert got_metadata != super_metadata
    assert not all(map(torch.allclose, got_tensors, super_tensors))

    got_metadata, got_tensors = averager2.load_state_from_peers()
    assert num_calls == 2
    assert got_metadata == super_metadata
    assert all(map(torch.allclose, got_tensors, super_tensors))

def test_getset_bits():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averager = hivemind.DecentralizedAverager([torch.randn(3)], dht=dht, start=True,
                                              prefix='test_prefix', target_group_size=2)
    averager.set_group_bits('00101011101010')
    assert averager.get_group_bits() == '00101011101010'

def test_client_anomaly_detection():
    HID_DIM = 16

    experts = {}
    for i in range(4):
        expert = layers.name_to_block['ffn'](HID_DIM)
        experts[f'expert.{i}'] = hivemind.ExpertBackend(
            name=f'expert.{i}', expert=expert, optimizer=torch.optim.Adam(expert.parameters()),
            args_schema=(hivemind.BatchTensorDescriptor(HID_DIM),),
            outputs_schema=hivemind.BatchTensorDescriptor(HID_DIM),
            max_batch_size=16,
        )

    experts['expert.3'].expert.ffn.weight.data[0, 0] = float('nan')

    dht = hivemind.DHT(start=True)
    server = hivemind.Server(dht, experts, num_connection_handlers=1)
    server.start()
    try:
        server.ready.wait()

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(3,), dht=dht, k_best=3,
                                               uid_prefix='expert.', detect_anomalies=True)

        input = torch.randn(1, 16)
        input[0, 0] = float('nan')
        with pytest.raises(ValueError):
            dmoe(input)

        input[0, 0] = 0
        output = dmoe(input)

        inf_loss = float('inf') * output.sum()
        with pytest.raises(ValueError):
            inf_loss.backward()

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(4,), dht=dht, k_best=4,
                                               uid_prefix='expert.', detect_anomalies=True)
        output = dmoe(input)
        assert output.isfinite().all()
    finally:
        server.shutdown()

async def test_negative_caching():
    peers = []
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, cache_locally=False, start=True))

    writer_peer = random.choice(peers)
    assert all(hivemind.declare_experts(writer_peer, ['ffn.1.2.3', 'ffn.3.4.5'], 'myaddr:1234').values())

    neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
    neg_caching_peer = hivemind.DHT(initial_peers=neighbors_i, cache_locally=False, start=True)
    beam_search = MoEBeamSearcher(neg_caching_peer, uid_prefix='ffn.', grid_size=(10, 10, 10), negative_caching=True)
    # get prefixes by the peer with negative caching. Cache "no data" entries for ffn.0.*, ffn.2.*, ffn.4.*, ffn.5.*
    assert len(beam_search.get_initial_beam(scores=[.1, .2, .3, .4, .5, .6], beam_size=3)) == 2

    node = await hivemind.DHTNode.create(initial_peers=neighbors_i)
    fetched = await asyncio.gather(*(node.get(f'ffn.{i}.') for i in range(10)))
    for i in range(6):
        assert fetched[i] is not None, f"node should have cached ffn.{i}."
    for i in range(6, len(fetched)):
        assert fetched[i] is None, f"node shouldn't have cached ffn.{i}."

def initialize_dht_and_averager(self, collaboration_args: CollaborationArguments):
    collaboration_args.initial_peers = list(map(str.strip, collaboration_args.initial_peers.split(',')))
    logger.info(f"Found {len(collaboration_args.initial_peers)} initial peers: {collaboration_args.initial_peers}")
    if len(collaboration_args.initial_peers) == 0:
        raise ValueError("Please specify at least one network endpoint in initial peers.")

    dht = hivemind.DHT(initial_peers=list(collaboration_args.initial_peers), start=True)
    averager = SimpleAverager(self, dht=dht, prefix=self.matchmaking_prefix,
                              target_group_size=collaboration_args.target_group_size,
                              throughput=collaboration_args.bandwidth,
                              compression_type=hivemind.utils.CompressionType.FLOAT16,
                              averaging_expiration=collaboration_args.averaging_expiration,
                              start=True)
    return dht, averager

def benchmark_averaging(num_peers: int, target_group_size: int, num_rounds: int, averaging_expiration: float,
                        request_timeout: float, round_timeout: float, hid_size: int, num_layers: int,
                        spawn_dtime: float):
    dht_root = hivemind.DHT(listen_on=f'{LOCALHOST}:*', start=True)
    num_groups = 2 ** int(round(math.log2(num_peers / target_group_size)))
    nbits = int(round(math.log2(num_groups)))
    peer_tensors = [sample_tensors(hid_size, num_layers) for _ in range(num_peers)]
    processes = {dht_root}

    def run_averager(index):
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*', initial_peers=[f"{LOCALHOST}:{dht_root.port}"], start=True)
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        # use this thread's own index (not the launcher's loop variable) to pick the peer's tensors
        averager = hivemind.DecentralizedAverager(
            peer_tensors[index], dht, prefix='my_tensor', initial_group_bits=initial_bits,
            listen_on=f"{LOCALHOST}:*", compression_type=runtime_pb2.CompressionType.FLOAT16,
            target_group_size=target_group_size, averaging_expiration=averaging_expiration,
            request_timeout=request_timeout, start=True)
        processes.update({dht, averager})

        print(end=f'<started {index}>\n', flush=True)
        for _ in range(num_rounds):
            success = averager.step(timeout=round_timeout)
            print(end=('+' if success else '-'), flush=True)
        print(end=f'<finished {index}>\n', flush=True)

    threads = []
    for i in range(num_peers):
        thread = threading.Thread(target=run_averager, args=[i])
        threads.append(thread)
        thread.start()
        time.sleep(spawn_dtime)

    t = time.time()
    for thread in threads:
        thread.join()

    print(f"\ntest run took {time.time() - t:.3f} seconds")

    for process in processes:
        process.terminate()

def test_too_few_peers():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [
        hivemind.DecentralizedAverager(
            averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2, averaging_expiration=1,
            request_timeout=0.5, prefix='mygroup', initial_group_bits=bin(i)[2:].rjust(3, '0'), start=True)
        for i in range(4)
    ]
    step_futures = [averager.step(wait=False) for averager in averagers]
    for future in step_futures:
        assert len(future.result()) == 2

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()

def test_overcrowded(num_peers=16):
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [
        hivemind.DecentralizedAverager(
            averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2, averaging_expiration=1,
            request_timeout=0.5, prefix='mygroup', initial_group_bits='', start=True)
        for _ in range(num_peers)
    ]
    for t in range(5):
        step_futures = [averager.step(wait=False, timeout=5) for averager in averagers]
        assert sum(len(future.result() or []) == 2 for future in step_futures) >= len(averagers) - 1

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()

def benchmark_averaging(num_peers: int, target_group_size: int, num_rounds: int, averaging_expiration: float,
                        request_timeout: float, round_timeout: float, hid_size: int, num_layers: int,
                        spawn_dtime: float):
    dht_root = hivemind.DHT(listen_on=f'{LOCALHOST}:*', start=True)
    num_groups = 2 ** int(round(math.log2(num_peers / target_group_size)))
    nbits = int(round(math.log2(num_groups)))
    peer_tensors = [sample_tensors(hid_size, num_layers) for _ in range(num_peers)]
    processes = {dht_root}
    lock_stats = threading.Lock()
    successful_steps = total_steps = 0

    def run_averager(index):
        nonlocal successful_steps, total_steps, lock_stats
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*', initial_peers=[f"{LOCALHOST}:{dht_root.port}"], start=True)
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        # use this thread's own index (not the launcher's loop variable) to pick the peer's tensors
        averager = hivemind.DecentralizedAverager(
            peer_tensors[index], dht, prefix='my_tensor', initial_group_bits=initial_bits,
            listen_on=f"{LOCALHOST}:*", compression_type=runtime_pb2.CompressionType.FLOAT16,
            target_group_size=target_group_size, averaging_expiration=averaging_expiration,
            request_timeout=request_timeout, start=True)
        processes.update({dht, averager})

        logger.info(f'Averager {index}: started on endpoint {averager.endpoint}, '
                    f'group_bits: {averager.get_group_bits()}')
        for step in range(num_rounds):
            try:
                success = averager.step(timeout=round_timeout) is not None
            except:
                success = False
            with lock_stats:
                successful_steps += int(success)
                total_steps += 1
            logger.info(f"Averager {index}: {'finished' if success else 'failed'} step {step}")
        logger.info(f"Averager {index}: done.")

    threads = []
    for i in range(num_peers):
        thread = threading.Thread(target=run_averager, args=[i])
        threads.append(thread)
        thread.start()
        time.sleep(spawn_dtime)

    t = time.time()
    for thread in threads:
        thread.join()

    logger.info(f"Benchmark finished in {time.time() - t:.3f} seconds.")
    logger.info(f"Success rate: {successful_steps / total_steps} ({successful_steps} out of {total_steps} attempts)")

def test_store_get_experts():
    peers = [hivemind.DHT(start=True)]
    for i in range(10):
        neighbors_i = [f'{LOCALHOST}:{node.port}' for node in random.sample(peers, min(3, len(peers)))]
        peers.append(hivemind.DHT(initial_peers=neighbors_i, start=True))

    first_peer = random.choice(peers)
    other_peer = random.choice(peers)

    expert_uids = [f"my_expert.{i}" for i in range(50)]
    batch_size = 10
    for batch_start in range(0, len(expert_uids), batch_size):
        hivemind.declare_experts(first_peer, expert_uids[batch_start:batch_start + batch_size], 'localhost:1234')

    found = other_peer.get_experts(random.sample(expert_uids, 5) + ['foo', 'bar'])
    assert all(res is not None for res in found[:-2]), "Could not find some existing experts"
    assert all(res is None for res in found[-2:]), "Found non-existing experts"

    other_expert, other_port = "my_other_expert.1337", random.randint(1000, 9999)
    hivemind.declare_experts(other_peer, [other_expert], f'that_host:{other_port}')
    first_notfound, first_found = hivemind.get_experts(first_peer, ['foobar', other_expert])
    assert isinstance(first_found, hivemind.RemoteExpert)
    assert first_found.endpoint == f'that_host:{other_port}'

    for peer in peers:
        peer.shutdown()

def test_dht_single_node():
    node = hivemind.DHT(start=True, expiration=999)
    beam_search = MoEBeamSearcher(node, 'expert.')

    assert all(node.declare_experts(['expert.1', 'expert.2', 'expert.3'], f"{hivemind.LOCALHOST}:1337").values())
    assert len(node.declare_experts(["ffn.1", "ffn.2"], endpoint="that_place")) == 4
    assert len(node.declare_experts(['e.1.2.3', 'e.1.2.5', 'e.2.0'], f"{hivemind.LOCALHOST}:42")) == 7

    for expert in node.get_experts(['expert.3', 'expert.2']):
        assert expert.endpoint == f"{hivemind.LOCALHOST}:1337"

    assert all(node.declare_experts(['expert.5', 'expert.2'], f"{hivemind.LOCALHOST}:1337").values())
    found_experts = beam_search.find_best_experts([(0., 1., 2., 3., 4., 5., 6., 7., 8.)], beam_size=2)
    assert len(found_experts) == 2 and [expert.uid for expert in found_experts] == ['expert.5', 'expert.3']

    successors = beam_search.get_active_successors(['e.1.2.', 'e.2.', 'e.4.5.'])
    assert len(successors['e.1.2.']) == 2
    assert successors['e.1.2.'][3] == UidEndpoint('e.1.2.3', f'{LOCALHOST}:42')
    assert successors['e.1.2.'][5] == UidEndpoint('e.1.2.5', f'{LOCALHOST}:42')
    assert len(successors['e.2.']) == 1 and successors['e.2.'][0] == UidEndpoint('e.2.0', f'{LOCALHOST}:42')
    assert successors['e.4.5.'] == {}

    initial_beam = beam_search.get_initial_beam((3, 2, 1, 0, -1, -2, -3), beam_size=3)
    assert len(initial_beam) == 3
    assert initial_beam[0][:2] == (2.0, 'expert.1.')
    assert initial_beam[1][:2] == (1.0, 'expert.2.')
    assert initial_beam[2][:2] == (0.0, 'expert.3.')

    with pytest.raises(AssertionError):
        beam_search = MoEBeamSearcher(node, 'expert.1.ffn')

    with pytest.raises(AssertionError):
        beam_search.get_active_successors(['e.1.2.', 'e.2', 'e.4.5.'])