Ejemplo n.º 1
0
def test_allgather():
    """Check that step(gather=...) distributes each peer's metadata to its whole group.

    Spawns 8 averagers with target_group_size=4; after one step they must have
    split into exactly 2 groups, and every peer must receive the
    endpoint -> metadata mapping of all 4 members of its own group.
    """
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')
    averagers = [
        hivemind.DecentralizedAverager(torch.ones(1),
                                       dht=dht,
                                       target_group_size=4,
                                       averaging_expiration=15,
                                       prefix='mygroup',
                                       initial_group_bits='000',
                                       listen_on='127.0.0.1:*',
                                       start=True) for _ in range(8)
    ]

    futures = []
    for i, averager in enumerate(averagers):
        # each peer contributes unique metadata (batch_size differs by index)
        futures.append(
            averager.step(wait=False,
                          gather=dict(batch_size=123 + i, foo='bar')))

    # 8 peers / group size 4 => exactly two distinct gathered dictionaries
    assert len(set(repr(sorted(future.result())) for future in futures)) == 2

    reference_metadata = {
        averager.endpoint: dict(batch_size=123 + i, foo='bar')
        for i, averager in enumerate(averagers)
    }
    for future in futures:
        gathered = future.result()

        assert len(gathered) == 4

        for endpoint in gathered:
            assert gathered[endpoint] == reference_metadata[endpoint]

    # fix: release processes/sockets — the original test leaked them
    # (sibling tests in this file all shut down their averagers and DHT)
    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
Ejemplo n.º 2
0
def test_allreduce_grid():
    """Run several all-reduce rounds over a 2-bit group grid.

    Invariants checked each round: the global mean of the averaged tensors is
    preserved, every formed group has exactly 2 peers, and the per-coordinate
    std decreases monotonically until the tensors converge to the mean.
    """
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [
        hivemind.DecentralizedAverager(averaged_tensors=[torch.randn(3)],
                                       dht=dht,
                                       target_group_size=2,
                                       prefix='mygroup',
                                       initial_group_bits=bin(
                                           i // 2)[2:].rjust(2, '0'),
                                       start=True) for i in range(8)
    ]

    [means0], [stds0] = compute_mean_std(averagers)
    # peers start with different random tensors, so std must be nonzero
    assert not torch.allclose(stds0, torch.zeros_like(stds0))

    prev_means, prev_stds = means0, stds0

    for i in range(5):
        step_futures = [averager.step(wait=False) for averager in averagers]
        groups = [future.result() for future in step_futures]
        [means], [stds] = compute_mean_std(averagers)
        # averaging must preserve the global mean
        assert torch.allclose(means, prev_means, atol=1e-6, rtol=0)
        assert all(len(group) == 2 for group in groups)

        if i <= 2:
            assert torch.all(torch.le(stds, prev_stds))
        else:
            # after enough rounds all peers should hold (nearly) equal tensors
            assert torch.allclose(stds,
                                  torch.zeros_like(stds),
                                  atol=1e-6,
                                  rtol=0)

        # fix: advance the reference statistics — the original never updated
        # them, so the monotonic-decrease check above always compared against
        # the *initial* stds instead of the previous round's
        prev_means, prev_stds = means, stds

    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
Ejemplo n.º 3
0
def test_allreduce_once():
    """One full all-reduce round over 4 peers with known tensors.

    After step(), every peer's tensors must equal the elementwise mean of the
    four initial tensor sets, and the returned group must contain all peers.
    """
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')

    tensors1 = [torch.randn(123), torch.zeros(3)]
    tensors2 = [torch.rand(123), torch.ones(3)]
    tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
    tensors4 = [torch.randn(123)**3, torch.arange(3).to(torch.float32) / 2]

    # expected result of averaging: elementwise mean over the 4 peers
    reference = [(tensors1[i] + tensors2[i] + tensors3[i] + tensors4[i]) / 4
                 for i in range(len(tensors1))]

    averagers = [
        hivemind.DecentralizedAverager(tensors,
                                       dht=dht,
                                       target_group_size=4,
                                       averaging_expiration=15,
                                       prefix='mygroup',
                                       initial_group_bits='0110',
                                       listen_on='127.0.0.1:*',
                                       start=True)
        for tensors in [tensors1, tensors2, tensors3, tensors4]
    ]

    futures = []
    for averager in averagers:
        futures.append(averager.step(wait=False))
    for future in futures:
        result = future.result()
        # with target_group_size == number of peers, everyone is in one group
        for averager in averagers:
            assert averager.endpoint in result

    for averager in averagers:
        with averager.get_tensors() as averaged_tensors:
            for ref, our in zip(reference, averaged_tensors):
                assert torch.allclose(ref, our, atol=1e-6)

    # fix: shut down averagers and DHT — the original test leaked them
    # (the extended variant of this test in this file does shut them down)
    for averager in averagers:
        averager.shutdown()
    dht.shutdown()
Ejemplo n.º 4
0
def test_getset_bits():
    """Check that group bits written via set_group_bits are read back verbatim."""
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averager = hivemind.DecentralizedAverager([torch.randn(3)],
                                              dht=dht,
                                              start=True,
                                              prefix='test_prefix',
                                              target_group_size=2)
    averager.set_group_bits('00101011101010')
    assert averager.get_group_bits() == '00101011101010'

    # fix: release the background processes — the original test leaked them
    averager.shutdown()
    dht.shutdown()
Ejemplo n.º 5
0
def test_overcrowded(num_peers=16):
    """Many peers compete for tiny groups: with 16 peers, group size 2 and a
    short expiration, at least all-but-one peer must still pair up each round."""
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = [
        hivemind.DecentralizedAverager(averaged_tensors=[torch.randn(3)],
                                       dht=dht,
                                       target_group_size=2,
                                       averaging_expiration=1,
                                       request_timeout=0.5,
                                       prefix='mygroup',
                                       initial_group_bits='',
                                       start=True)
        for _ in range(num_peers)
    ]

    for _ in range(5):
        pending = [peer.step(wait=False, timeout=5) for peer in averagers]
        # step() may return None on failure, hence the `or []` guard
        paired = sum(len(future.result() or []) == 2 for future in pending)
        assert paired >= len(averagers) - 1

    for peer in averagers:
        peer.shutdown()
    dht.shutdown()
Ejemplo n.º 6
0
def test_too_few_peers():
    """Four peers start in four *distinct* 3-bit groups; matchmaking must still
    find every peer a partner, so each step() returns a group of exactly 2."""
    dht = hivemind.DHT(start=True, endpoint='127.0.0.1:*')
    averagers = []
    for index in range(4):
        averagers.append(
            hivemind.DecentralizedAverager(
                averaged_tensors=[torch.randn(3)],
                dht=dht,
                target_group_size=2,
                averaging_expiration=1,
                request_timeout=0.5,
                prefix='mygroup',
                # '000', '001', '010', '011' — one unique leaf per peer
                initial_group_bits=bin(index)[2:].rjust(3, '0'),
                start=True))

    pending = [peer.step(wait=False) for peer in averagers]
    for future in pending:
        assert len(future.result()) == 2

    for peer in averagers:
        peer.shutdown()
    dht.shutdown()
Ejemplo n.º 7
0
    def run_averager(index):
        """Run one simulated peer: start a DHT node joined to `dht_root`,
        create an averager, and perform `num_rounds` averaging steps,
        printing '+' or '-' per round depending on step() success.

        Relies on outer-scope names not visible in this chunk:
        dht_root, num_groups, nbits, peer_tensors, target_group_size,
        averaging_expiration, request_timeout, processes, num_rounds,
        round_timeout.
        """
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                           initial_peers=[f"{LOCALHOST}:{dht_root.port}"],
                           start=True)
        # spread peers round-robin across the group leaves by index
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        # NOTE(review): `peer_tensors[i]` reads the free variable `i`, not the
        # `index` parameter — looks like a late-binding closure bug; confirm
        # against the enclosing function before changing.
        averager = hivemind.DecentralizedAverager(
            peer_tensors[i], dht, prefix='my_tensor', initial_group_bits=initial_bits, listen_on=f"{LOCALHOST}:*",
            compression_type=runtime_pb2.CompressionType.FLOAT16, target_group_size=target_group_size,
            averaging_expiration=averaging_expiration, request_timeout=request_timeout, start=True)
        # register both processes in the outer set so the caller can clean up
        processes.update({dht, averager})

        print(end=f'<started {index}>\n', flush=True)
        for _ in range(num_rounds):
            success = averager.step(timeout=round_timeout)
            print(end=('+' if success else '-'), flush=True)
        print(end=f'<finished {index}>\n', flush=True)
Ejemplo n.º 8
0
def test_allreduce_once(n_client_mode_peers):
    """All-reduce over 4 peers where `n_client_mode_peers` of them run in
    client mode (listen=False); every peer must end up with the elementwise
    mean of the four initial tensor sets."""
    dht = hivemind.DHT(start=True, endpoint=f'{hivemind.LOCALHOST}:*')

    n_peers = 4
    should_listen = [False] * n_client_mode_peers + [True] * (
        n_peers - n_client_mode_peers)
    random.shuffle(should_listen)

    tensors1 = [torch.randn(123), torch.zeros(3)]
    tensors2 = [torch.rand(123), torch.ones(3)]
    tensors3 = [-torch.rand(123), torch.arange(3).to(torch.float32)]
    tensors4 = [torch.randn(123)**3, torch.arange(3).to(torch.float32) / 2]

    peer_tensors = [tensors1, tensors2, tensors3, tensors4]
    # expected outcome: per-position elementwise mean over all four peers
    reference = [sum(group) / len(peer_tensors)
                 for group in zip(*peer_tensors)]

    averagers = []
    for tensors, listen in zip(peer_tensors, should_listen):
        averagers.append(
            hivemind.DecentralizedAverager(tensors,
                                           dht=dht,
                                           target_group_size=4,
                                           averaging_expiration=15,
                                           prefix='mygroup',
                                           listen=listen,
                                           listen_on='127.0.0.1:*',
                                           start=True))

    pending = [peer.step(wait=False) for peer in averagers]
    for future in pending:
        group = future.result()
        # with target_group_size == n_peers, everyone joins the same group
        for peer in averagers:
            assert peer.endpoint in group

    for peer in averagers:
        with peer.get_tensors() as averaged_tensors:
            for expected, actual in zip(reference, averaged_tensors):
                assert torch.allclose(expected, actual, atol=1e-6)

    for peer in averagers:
        peer.shutdown()
    dht.shutdown()
Ejemplo n.º 9
0
    def run_averager(index):
        nonlocal successful_steps, total_steps, lock_stats
        dht = hivemind.DHT(listen_on=f'{LOCALHOST}:*',
                           initial_peers=[f"{LOCALHOST}:{dht_root.port}"],
                           start=True)
        initial_bits = bin(index % num_groups)[2:].rjust(nbits, '0')
        averager = hivemind.DecentralizedAverager(
            peer_tensors[i], dht, prefix='my_tensor', initial_group_bits=initial_bits, listen_on=f"{LOCALHOST}:*",
            compression_type=runtime_pb2.CompressionType.FLOAT16, target_group_size=target_group_size,
            averaging_expiration=averaging_expiration, request_timeout=request_timeout, start=True)
        processes.update({dht, averager})

        logger.info(f'Averager {index}: started on endpoint {averager.endpoint}, group_bits: {averager.get_group_bits()}')
        for step in range(num_rounds):
            try:
                success = averager.step(timeout=round_timeout) is not None
            except:
                success = False
            with lock_stats:
                successful_steps += int(success)
                total_steps += 1
            logger.info(f"Averager {index}: {'finished' if success else 'failed'} step {step}")
        logger.info(f"Averager {index}: done.")