Example #1
def speedup():
    print(f'speedup test {RANK}')
    examples = list(range(4))

    ranks = dlp_mpi.gather(dlp_mpi.RANK)
    if dlp_mpi.IS_MASTER:
        assert ranks == [0, 1, 2], ranks

    sleep_time = 0.1

    def bar(i):
        time.sleep(sleep_time)
        # print(f'Callback from {RANK}')
        assert dlp_mpi.RANK in [1, 2], (dlp_mpi.RANK, dlp_mpi.SIZE)

    start = time.perf_counter()
    for i in dlp_mpi.map_unordered(bar, examples):
        # print(f'Loop body from {RANK}')
        assert dlp_mpi.RANK in [0], (dlp_mpi.RANK, dlp_mpi.SIZE)
    elapsed = time.perf_counter() - start

    serial_time = sleep_time * len(examples)

    # Two workers and one manager (3 MPI processes) halve the wall time:
    # each worker handles 2 of the 4 examples, so the ideal elapsed time is
    # 0.5 * serial_time. Allow up to 20% Python/MPI overhead on top of that.
    assert elapsed < 0.6 * serial_time, (elapsed, serial_time)
    assert elapsed >= 0.5 * serial_time, (elapsed, serial_time)
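Note that the `ranks == [0, 1, 2]` assert requires exactly three MPI processes, so these tests are presumably launched along the lines of the command in Example #7, e.g. `mpiexec -np 3 python <test_file>.py` (the actual test file name is not given here).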
Example #2
def pbar():
    print(f'pbar test {RANK}')

    examples = list(range(5))

    ranks = dlp_mpi.gather(dlp_mpi.RANK)
    if dlp_mpi.IS_MASTER:
        assert ranks == [0, 1, 2], ranks

    def bar(i):
        time.sleep(0.04)
        assert dlp_mpi.RANK in [1, 2], (dlp_mpi.RANK, dlp_mpi.SIZE)

    class MockPbar:
        call_history = []

        def __init__(self):
            self.i = 0

        def set_description(self, text):
            self.call_history.append(text)

        def update(self, inc=1):
            self.i += 1
            self.call_history.append(f'update {self.i}')

    import contextlib

    @contextlib.contextmanager
    def mock_pbar(total, disable):
        assert disable is False, disable
        yield MockPbar()

    import mock

    with mock.patch('tqdm.tqdm', mock_pbar):

        dlp_mpi.barrier()
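        # Delay rank 2 so that rank 1 deterministically picks up the first
        # example and the recorded progress-bar history is reproducible.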
        if RANK == 2:
            time.sleep(0.02)

        for i in dlp_mpi.map_unordered(
                bar,
                examples,
                progress_bar=True,
        ):
            assert dlp_mpi.RANK in [0], (dlp_mpi.RANK, dlp_mpi.SIZE)

    if RANK == 0:
        assert MockPbar.call_history == [
            'busy: 2', 'update 1', 'update 2', 'update 3', 'update 4',
            'busy: 1', 'update 5', 'busy: 0'
        ], MockPbar.call_history
    else:
        assert MockPbar.call_history == [], MockPbar.call_history
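The mock above only records what `map_unordered` feeds into the progress bar: one `update` per finished example and a `busy: N` description counting the workers that are still processing. In normal use you simply enable the real tqdm bar; a minimal sketch (the work function is illustrative):

import time
import dlp_mpi

def work(i):
    time.sleep(0.1)  # stand-in for real per-example work on the workers
    return i

# progress_bar=True drives a tqdm bar on the master process.
for result in dlp_mpi.map_unordered(work, range(100), progress_bar=True):
    pass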
Example #3
def executable():
    print(f'executable test {RANK}')

    examples = list(range(5))

    ranks = dlp_mpi.gather(dlp_mpi.RANK)
    if dlp_mpi.IS_MASTER:
        assert ranks == [0, 1, 2], ranks

    def bar(i):
        assert dlp_mpi.RANK in [1, 2], (dlp_mpi.RANK, dlp_mpi.SIZE)

    for i in dlp_mpi.map_unordered(bar, examples):
        assert dlp_mpi.RANK in [0], (dlp_mpi.RANK, dlp_mpi.SIZE)
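Example #3 is the minimal contract: the callback runs only on the worker ranks, while the loop body runs only on the master. The typical pattern is therefore to compute on the workers and collect on the master; a sketch with illustrative names:

import dlp_mpi

def compute(example):
    return example ** 2  # executed on the worker ranks

results = []
for r in dlp_mpi.map_unordered(compute, range(10)):
    results.append(r)  # executed only on the master

if dlp_mpi.IS_MASTER:
    print(sorted(results))  # arrival order is not guaranteed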
Example #4
def overhead():
    """

    This test shows the overhead of map_unordered.
    A simple for loop with map cam process around 1 000 000 examples per
    second. When using map_unordered, obviously the number should decrease.

    When your code processes less than 1000 examples per second, you can expect
    a gain from map_unordered. When you process serial more than 10000 examples
    per second, it is unlikely to get a gain from map_unordered.
    Thing about chunking to get less than 100 example per second.
    """
    print(f'executable test {RANK}')

    examples = list(range(10000))

    ranks = dlp_mpi.gather(dlp_mpi.RANK)
    if dlp_mpi.IS_MASTER:
        assert ranks == [0, 1, 2], ranks

    total = 0

    def bar(i):
        nonlocal total
        total += i
        return i

    start = time.perf_counter()
    for i in dlp_mpi.map_unordered(bar, examples):
        total += i
    elapsed = time.perf_counter() - start

    time_per_example = elapsed / len(examples)
    mpi_examples_per_second = 1 / time_per_example

    assert mpi_examples_per_second >= 10_000, mpi_examples_per_second
    assert mpi_examples_per_second <= 300_000, mpi_examples_per_second

    start = time.perf_counter()
    for i in map(bar, examples):
        total += i
    elapsed = time.perf_counter() - start

    time_per_example = elapsed / len(examples)
    py_examples_per_second = 1 / time_per_example

    assert py_examples_per_second >= 250_000, py_examples_per_second
    assert py_examples_per_second <= 9_000_000, py_examples_per_second
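The chunking advice from the docstring above, as a hedged sketch (the chunk size and helper are illustrative): batch cheap examples so that each task sent over MPI carries enough work to amortize the communication round trip.

import dlp_mpi

examples = list(range(1_000_000))
chunk_size = 10_000  # illustrative; choose it so one chunk takes >~10 ms
chunks = [
    examples[i:i + chunk_size]
    for i in range(0, len(examples), chunk_size)
]

def process_chunk(chunk):
    # Cheap per-example work, batched to amortize the MPI overhead.
    return sum(x * x for x in chunk)

total = 0
for partial in dlp_mpi.map_unordered(process_chunk, chunks):
    total += partial  # runs on the master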
Example #5
def worker_fails():
    print(f'worker_fails test {RANK}')

    examples = list(range(5))

    ranks = dlp_mpi.gather(dlp_mpi.RANK)
    if dlp_mpi.IS_MASTER:
        assert ranks == [0, 1, 2], ranks

    def bar(i):
        if RANK == 1:
            print(f'let {RANK} fail for data {i}')
            raise ValueError('failed')
        assert dlp_mpi.RANK in [1, 2], (dlp_mpi.RANK, dlp_mpi.SIZE)
        return i, RANK

    processed = []
    try:
        dlp_mpi.barrier()
        if RANK == 2:
            # Delay rank 2 to ensure that rank 1 gets the first example
            time.sleep(0.1)
        for i, worker_rank in dlp_mpi.map_unordered(bar, examples):
            print(
                f'Loop body from {RANK} for data {i} that was processed by {worker_rank}'
            )
            assert dlp_mpi.RANK in [0], (dlp_mpi.RANK, dlp_mpi.SIZE)
            processed.append(i)
    except ValueError:
        assert RANK in [1], RANK
    except AssertionError:
        assert RANK in [0], RANK
        # Example 0 failed on worker 1, but the master only raises at the
        # end of the for loop, so examples 1 to 4 are still processed
        assert processed == [1, 2, 3, 4], processed
    else:
        assert RANK in [2], RANK
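As the example shows, an exception on a worker loses only the failing example and eventually aborts the master's loop. If failures should be skipped instead, catch them inside the callback; a sketch (the sentinel convention is illustrative):

import dlp_mpi

def expensive_work(i):
    if i == 0:
        raise ValueError('failed')  # mimic the failure above
    return i * i

def safe_bar(i):
    try:
        return i, expensive_work(i)
    except Exception as e:
        print(f'rank {dlp_mpi.RANK} failed on example {i}: {e!r}')
        return i, None  # report a sentinel instead of raising

processed = []
for i, result in dlp_mpi.map_unordered(safe_bar, range(5)):
    if result is not None:  # the master skips failed examples
        processed.append(result)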
Example #6
def bottleneck():
    print(f'bottleneck test {RANK}')
    examples = list(range(4))

    ranks = dlp_mpi.gather(dlp_mpi.RANK)
    if dlp_mpi.IS_MASTER:
        assert ranks == [0, 1, 2], ranks

    sleep_time = 0.1

    def bar(i):
        # print(f'Callback from {RANK} for data {i}')
        assert dlp_mpi.RANK in [1, 2], (dlp_mpi.RANK, dlp_mpi.SIZE)
        return i, RANK

    start = time.perf_counter()
    for i, worker_rank in dlp_mpi.map_unordered(bar, examples):
        time.sleep(sleep_time)
        # print(f'Loop body from {RANK} for data {i} that was processed by {worker_rank}')
        assert dlp_mpi.RANK in [0], (dlp_mpi.RANK, dlp_mpi.SIZE)
    elapsed = time.perf_counter() - start

    # Two workers and one manager (3 MPI processes) could halve the time,
    # but the load is in the loop body, which runs serially on the master
    # -> no speedup. Allow for some Python overhead.
    if dlp_mpi.IS_MASTER:
        serial_time = sleep_time * len(examples)
        assert elapsed < 1.1 * serial_time, (elapsed, serial_time)
        assert elapsed >= 0.9 * serial_time, (elapsed, serial_time)
    else:
        # The workers finish while the master is still processing the last
        # two examples: one worker finishes two steps earlier, the other
        # one step earlier.
        serial_time_high = sleep_time * (len(examples) - 1)
        serial_time_low = sleep_time * (len(examples) - 2)
        assert elapsed < 1.1 * serial_time_high, (elapsed, serial_time_high)
        assert elapsed >= 0.9 * serial_time_low, (elapsed, serial_time_low)
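The remedy for this bottleneck is the pattern from Example #1: move the heavy work into the callback so it runs on the workers in parallel, and keep the master's serial loop body cheap. A minimal sketch:

import time
import dlp_mpi

def bar(i):
    time.sleep(0.1)  # heavy work now runs on the workers in parallel
    return i

for i in dlp_mpi.map_unordered(bar, range(4)):
    pass  # keep the master's loop body cheap; it runs serially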
Example #7
"""
Run on a single CPU:
python mpi_5_map_unordered.py

Run on multiple CPUs:
mpiexec -np 5 python mpi_5_map_unordered.py
"""

from dlp_mpi import COMM, RANK, SIZE, MASTER, IS_MASTER, map_unordered
import time
import numpy as np


def fn(example_id):
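    # Simulate work of random duration, then return one letter of 'hello'.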
    time.sleep(np.random.uniform(0, 1))
    example = 'hello'
    print(RANK, example_id, example[example_id])
    return example[example_id]


if __name__ == '__main__':
    if IS_MASTER:
        print('### Unordered map scattered around processes:')

    result = list(map_unordered(fn, range(5)))

    if IS_MASTER:
        print(result)

    if IS_MASTER:
        print('### Map function run only on master:')
        print(list(map(fn, range(5))))
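Based on the asserts in the examples above, `map_unordered` yields results only on the master, and in completion order rather than input order; the workers' loop bodies never run. A hedged check that could follow the `result = ...` line above:

if IS_MASTER:
    # Completion order is random here, but the multiset of results is fixed.
    assert sorted(result) == sorted('hello'), result
else:
    # Workers receive no results from map_unordered.
    assert result == [], result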