import random
import threading

import pytest
import torch

import hivemind
from hivemind import LOCALHOST
from hivemind.proto import runtime_pb2
from hivemind.utils import get_logger

logger = get_logger(__name__)


def test_allgather():
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
    averagers = [hivemind.DecentralizedAverager(torch.ones(1), dht=dht, target_group_size=4,
                                                averaging_expiration=15, prefix='mygroup',
                                                initial_group_bits='000', listen_on='127.0.0.1:*', start=True)
                 for _ in range(8)]

    futures = []
    for i, averager in enumerate(averagers):
        futures.append(averager.step(wait=False, gather=dict(batch_size=123 + i, foo='bar')))

    # 8 peers with target_group_size=4 should form exactly two distinct groups
    assert len(set(repr(sorted(future.result())) for future in futures)) == 2

    reference_metadata = {averager.endpoint: dict(batch_size=123 + i, foo='bar')
                          for i, averager in enumerate(averagers)}
    for future in futures:
        gathered = future.result()
        assert len(gathered) == 4
        # each peer should receive the metadata gathered from everyone in its group
        for endpoint in gathered:
            assert gathered[endpoint] == reference_metadata[endpoint]
def test_allreduce_grid():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [hivemind.DecentralizedAverager(averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2,
                                                prefix='mygroup', initial_group_bits=bin(i // 2)[2:].rjust(2, '0'),
                                                start=True)
                 for i in range(8)]

    [means0], [stds0] = compute_mean_std(averagers)
    assert not torch.allclose(stds0, torch.zeros_like(stds0))

    prev_means, prev_stds = means0, stds0
    for i in range(5):
        step_futures = [averager.step(wait=False) for averager in averagers]
        groups = [future.result() for future in step_futures]
        [means], [stds] = compute_mean_std(averagers)

        # pairwise averaging preserves the global mean and shrinks the spread
        assert torch.allclose(means, prev_means, atol=1e-6, rtol=0)
        assert all(len(group) == 2 for group in groups)

        if i <= 2:
            assert torch.all(torch.le(stds, prev_stds))
        else:
            # after enough rounds over the 2-bit grid, all peers converge to the same tensor
            assert torch.allclose(stds, torch.zeros_like(stds), atol=1e-6, rtol=0)

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
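# test_allreduce_grid relies on a compute_mean_std helper that is not defined in
# this file. A minimal sketch of what it has to return, assuming every averager
# holds the same number of identically-shaped tensors: per-tensor means and
# standard deviations taken across all peers.
def compute_mean_std(averagers, unbiased=True):
    snapshots = []
    for averager in averagers:
        with averager.get_tensors() as tensors:
            snapshots.append([tensor.clone() for tensor in tensors])
    # regroup: one stack of shape (num_averagers, *tensor_shape) per tensor position
    stacked_per_tensor = [torch.stack(group) for group in zip(*snapshots)]
    means = [stacked.mean(dim=0) for stacked in stacked_per_tensor]
    stds = [stacked.std(dim=0, unbiased=unbiased) for stacked in stacked_per_tensor]
    return means, stds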
def test_allreduce_once():
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')

    tensors1 = [torch.randn(123), torch.zeros(3)]
    tensors2 = [torch.rand(123), torch.ones(3)]
    tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
    tensors4 = [torch.randn(123) ** 3, torch.arange(3).to(torch.float32) / 2]
    reference = [(tensors1[i] + tensors2[i] + tensors3[i] + tensors4[i]) / 4 for i in range(len(tensors1))]

    averagers = [hivemind.DecentralizedAverager(tensors, dht=dht, target_group_size=4, averaging_expiration=15,
                                                prefix='mygroup', initial_group_bits='0110',
                                                listen_on='127.0.0.1:*', start=True)
                 for tensors in [tensors1, tensors2, tensors3, tensors4]]

    futures = []
    for averager in averagers:
        futures.append(averager.step(wait=False))
    for future in futures:
        result = future.result()
        for averager in averagers:
            assert averager.endpoint in result

    for averager in averagers:
        with averager.get_tensors() as averaged_tensors:
            for ref, our in zip(reference, averaged_tensors):
                assert torch.allclose(ref, our, atol=1e-6)
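# In an actual training loop, peers would refresh the averager's local tensors
# between rounds. A minimal sketch of that pattern (push_local_values and
# new_values are illustrative names, not part of hivemind's API); writes go
# through get_tensors(), which yields the tensors under the averager's lock,
# so in-place updates do not race with a concurrent averaging round:
def push_local_values(averager, new_values):
    with averager.get_tensors() as averaged_tensors:
        for tensor, new_value in zip(averaged_tensors, new_values):
            tensor.copy_(new_value)  # in-place update, preserving shape and dtype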
def test_getset_bits():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averager = hivemind.DecentralizedAverager([torch.randn(3)], dht=dht, start=True,
                                              prefix='test_prefix', target_group_size=2)
    averager.set_group_bits('00101011101010')
    assert averager.get_group_bits() == '00101011101010'
def test_overcrowded(num_peers=16):
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [hivemind.DecentralizedAverager(averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2,
                                                averaging_expiration=1, request_timeout=0.5,
                                                prefix='mygroup', initial_group_bits='', start=True)
                 for _ in range(num_peers)]
    for _ in range(5):
        step_futures = [averager.step(wait=False, timeout=5) for averager in averagers]
        # with empty initial group bits, all peers compete for the same groups;
        # in each round, at most one peer may be left without a partner
        assert sum(len(future.result() or []) == 2 for future in step_futures) >= len(averagers) - 1
    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
def test_too_few_peers():
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [hivemind.DecentralizedAverager(averaged_tensors=[torch.randn(3)], dht=dht, target_group_size=2,
                                                averaging_expiration=1, request_timeout=0.5, prefix='mygroup',
                                                initial_group_bits=bin(i)[2:].rjust(3, '0'), start=True)
                 for i in range(4)]
    step_futures = [averager.step(wait=False) for averager in averagers]
    # each peer starts alone in its own 3-bit bucket, yet every peer should still
    # end up in a group of two once the short averaging_expiration forces regrouping
    for future in step_futures:
        assert len(future.result()) == 2
    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
def run_averager(index):
    # benchmark worker: closes over dht_root, peer_tensors, processes and the
    # round settings from an enclosing function (a sketch of that driver
    # follows the logging variant below)
    dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                       initial_peers=[f"{LOCALHOST}:{dht_root.port}"], start=True)
    initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
    averager = hivemind.DecentralizedAverager(
        peer_tensors[index], dht, prefix='my_tensor', initial_group_bits=initial_bits,
        listen_on=f"{LOCALHOST}:*", compression_type=runtime_pb2.CompressionType.FLOAT16,
        target_group_size=target_group_size, averaging_expiration=averaging_expiration,
        request_timeout=request_timeout, start=True)
    processes.update({dht, averager})

    print(end=f'<started {index}>\n', flush=True)
    for _ in range(num_rounds):
        success = averager.step(timeout=round_timeout)
        print(end=('+' if success else '-'), flush=True)
    print(end=f'<finished {index}>\n', flush=True)
# parametrization values below are an assumption; the test body only requires
# 0 <= n_client_mode_peers < n_peers so that at least one peer can listen
@pytest.mark.parametrize('n_client_mode_peers', [0, 2])
def test_allreduce_once(n_client_mode_peers):
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
    n_peers = 4
    should_listen = [False] * n_client_mode_peers + [True] * (n_peers - n_client_mode_peers)
    random.shuffle(should_listen)

    tensors1 = [torch.randn(123), torch.zeros(3)]
    tensors2 = [torch.rand(123), torch.ones(3)]
    tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
    tensors4 = [torch.randn(123) ** 3, torch.arange(3).to(torch.float32) / 2]
    reference = [(tensors1[i] + tensors2[i] + tensors3[i] + tensors4[i]) / 4 for i in range(len(tensors1))]

    averagers = [hivemind.DecentralizedAverager(tensors, dht=dht, target_group_size=4, averaging_expiration=15,
                                                prefix='mygroup', listen=listen, listen_on='127.0.0.1:*',
                                                start=True)
                 for tensors, listen in zip([tensors1, tensors2, tensors3, tensors4], should_listen)]

    futures = []
    for averager in averagers:
        futures.append(averager.step(wait=False))
    for future in futures:
        result = future.result()
        for averager in averagers:
            assert averager.endpoint in result

    for averager in averagers:
        with averager.get_tensors() as averaged_tensors:
            for ref, our in zip(reference, averaged_tensors):
                assert torch.allclose(ref, our, atol=1e-6)

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
def run_averager(index):
    nonlocal successful_steps, total_steps, lock_stats
    dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                       initial_peers=[f"{LOCALHOST}:{dht_root.port}"], start=True)
    initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
    averager = hivemind.DecentralizedAverager(
        peer_tensors[index], dht, prefix='my_tensor', initial_group_bits=initial_bits,
        listen_on=f"{LOCALHOST}:*", compression_type=runtime_pb2.CompressionType.FLOAT16,
        target_group_size=target_group_size, averaging_expiration=averaging_expiration,
        request_timeout=request_timeout, start=True)
    processes.update({dht, averager})

    logger.info(f'Averager {index}: started on endpoint {averager.endpoint}, '
                f'group_bits: {averager.get_group_bits()}')
    for step in range(num_rounds):
        try:
            success = averager.step(timeout=round_timeout) is not None
        except Exception:
            success = False
        with lock_stats:
            successful_steps += int(success)
            total_steps += 1
        logger.info(f"Averager {index}: {'finished' if success else 'failed'} step {step}")
    logger.info(f"Averager {index}: done.")
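# Both run_averager variants above are closures: they read dht_root, peer_tensors,
# processes, num_groups, nbits, num_rounds, round_timeout, the expiration/timeout
# settings and the step counters from an enclosing benchmark function that is not
# shown here. A minimal sketch of such a driver, with assumed names and defaults
# (this is not the actual benchmark's signature):
def benchmark_averaging(num_peers=16, target_group_size=4, num_rounds=5,
                        averaging_expiration=5.0, request_timeout=3.0, round_timeout=30.0):
    nbits = 3
    num_groups = 2 ** nbits
    peer_tensors = [[torch.randn(123)] for _ in range(num_peers)]
    dht_root = hivemind.DHT(listen_on=f'{LOCALHOST}:*', start=True)
    processes = {dht_root}
    lock_stats = threading.Lock()
    successful_steps = total_steps = 0

    # ... define run_averager(index) here, exactly as in the variant above ...

    # one thread per simulated peer; join them all before reporting stats
    threads = [threading.Thread(target=run_averager, args=(i,)) for i in range(num_peers)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    logger.info(f"Successful steps: {successful_steps} / {total_steps}")
    for process in processes:
        process.shutdown()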