Esempio n. 1
0
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)

    yield n._start(0)
    with rpc(n.address) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2

        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()
Esempio n. 2
0
def test_nanny(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)

    yield n._start(0)
    with rpc(ip=n.ip, port=n.port) as nn:
        assert isalive(n.process)  # alive
        assert s.ncores[n.worker_address] == 2

        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.kill()
        assert not n.process
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info

        yield nn.kill()
        assert n.worker_address not in s.ncores
        assert n.worker_address not in s.worker_info
        assert not n.process

        yield nn.instantiate()
        assert isalive(n.process)
        assert s.ncores[n.worker_address] == 2
        assert s.worker_info[n.worker_address]['services']['nanny'] > 1024

        yield nn.terminate()
        assert not n.process

    yield n._close()
Esempio n. 3
0
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield _wait(L)

    original_process = a.process
    a.process.terminate()
    start = time()
    while a.process is original_process and not isalive(a.process):
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield _wait(L)

    L2 = c.map(inc, range(10, 20))
    yield _wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = s.ncores.copy()

    yield c._restart()

    L = c.map(inc, range(10))
    yield _wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap
Esempio n. 4
0
def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield _wait(L)

    original_process = a.process
    a.process.terminate()
    start = time()
    while a.process is original_process and not isalive(a.process):
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield _wait(L)

    L2 = c.map(inc, range(10, 20))
    yield _wait(L2)
    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = s.ncores.copy()

    yield c._restart()

    L = c.map(inc, range(10))
    yield _wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap
Esempio n. 5
0
def test_no_reconnect(nanny, loop):
    with popen(['dask-worker', '127.0.0.1:8786', '--no-reconnect', nanny]) as worker:
        with popen(['dask-scheduler']) as sched:
            sleep(1)
        start = time()
        while isalive(worker):
            sleep(0.1)
            assert time() < start + 10
Esempio n. 6
0
def test_no_reconnect(nanny, loop):
    with popen(['dask-scheduler', '--no-bokeh']) as sched:
        wait_for_port('127.0.0.1:8786')
        with popen(['dask-worker', '127.0.0.1:8786', '--no-reconnect', nanny,
                    '--no-bokeh']) as worker:
            sleep(2)
            terminate_process(sched)
        start = time()
        while isalive(worker):
            sleep(0.1)
            assert time() < start + 10
Esempio n. 7
0
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)

    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)

    assert 'timestamp' in d

    comm = yield connect(n.address)
    yield comm.write({'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield comm.read()
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    yield comm.close()
    yield n._close()
    s.stop()
Esempio n. 8
0
def test_monitor_resources(s):
    pytest.importorskip('psutil')
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)

    yield n._start()
    assert isalive(n.process)
    d = n.resource_collect()
    assert {'cpu_percent', 'memory_percent'}.issubset(d)

    assert 'timestamp' in d

    stream = yield connect(ip=n.ip, port=n.port)
    yield write(stream, {'op': 'monitor_resources', 'interval': 0.01})

    for i in range(3):
        msg = yield read(stream)
        assert isinstance(msg, dict)
        assert {'cpu_percent', 'memory_percent'}.issubset(msg)

    close(stream)
    yield n._close()
    s.stop()
Esempio n. 9
0
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()
Esempio n. 10
0
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(CommClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Esempio n. 11
0
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit), args=dumps((0, )), key='z')

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Esempio n. 12
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    try:
        scheduler_host, scheduler_port = scheduler.split(':')
        scheduler_ip = socket.gethostbyname(scheduler_host)
        scheduler_port = int(scheduler_port)
    except IndexError:
        logger.info("Usage:  dask-worker scheduler_host:scheduler_port")

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    loop = IOLoop.current()

    if memory_limit == 'auto':
        import psutil
        memory_limit = psutil.virtual_memory().total * 0.60

    if memory_limit:
        memory_limit = float(memory_limit)
        if memory_limit < 1.0:
            import psutil
            memory_limit = psutil.virtual_memory().total * memory_limit
        memory_limit /= nprocs
        memory_limit = int(memory_limit)

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if host is not None:
        ip = socket.gethostbyname(host)
    else:
        # lookup the ip address of a local interface on a network that
        # reach the scheduler
        ip = get_ip(scheduler_ip, scheduler_port)
    nannies = [
        t(scheduler_ip,
          scheduler_port,
          ncores=nthreads,
          ip=ip,
          services=services,
          name=name,
          loop=loop,
          memory_limit=memory_limit,
          reconnect=reconnect,
          **kwargs) for i in range(nprocs)
    ]

    for n in nannies:
        n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:

        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {
                'port': nannies[0].port,
                'local_directory': nannies[0].local_dir
            }
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        scheduler = rpc(ip=nannies[0].scheduler.ip,
                        port=nannies[0].scheduler.port)
        if nanny:
            yield gen.with_timeout(timedelta(seconds=2),
                                   All([
                                       scheduler.unregister(
                                           address=n.worker_address,
                                           close=True) for n in nannies
                                       if n.process and n.worker_port
                                   ]),
                                   io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies)
               and time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()
Esempio n. 13
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect,
         resources, bokeh, bokeh_port, local_directory, scheduler_file,
         interface, death_timeout, preload):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            services[('bokeh', bokeh_port)] = BokehWorker

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    nannies = [
        t(scheduler,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          **kwargs) for i in range(nprocs)
    ]

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    for n in nannies:
        if host:
            n.start((host, port))
        else:
            n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:

        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {
                'port': nannies[0].port,
                'local_directory': nannies[0].local_dir
            }
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    # Clean exit: unregister all workers from scheduler

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        with rpc(nannies[0].scheduler.address) as scheduler:
            if nanny:
                yield gen.with_timeout(timeout=timedelta(seconds=2),
                                       future=All([
                                           scheduler.unregister(
                                               address=n.worker_address,
                                               close=True) for n in nannies
                                           if n.process and n.worker_address
                                       ]),
                                       io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies)
               and time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()
Esempio n. 14
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
        no_nanny, name, memory_limit, pid_file, temp_filename):
    if no_nanny:
        port = worker_port
    else:
        port = nanny_port

    try:
        scheduler_host, scheduler_port = scheduler.split(':')
        scheduler_ip = socket.gethostbyname(scheduler_host)
        scheduler_port = int(scheduler_port)
    except IndexError:
        logger.info("Usage:  dask-worker scheduler_host:scheduler_port")

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker.  You cannot use the --port argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker.  You cannot use the --name argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    loop = IOLoop.current()

    if memory_limit == 'auto':
        import psutil
        memory_limit = psutil.virtual_memory().total * 0.60

    if memory_limit:
        memory_limit = float(memory_limit)
        if memory_limit < 1.0:
            import psutil
            memory_limit = psutil.virtual_memory().total * memory_limit
        memory_limit /= nprocs
        memory_limit = int(memory_limit)

    if no_nanny:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker
    else:
        kwargs = {'worker_port': worker_port}
        t = Nanny

    if host is not None:
        ip = socket.gethostbyname(host)
    else:
        # lookup the ip address of a local interface on a network that
        # reach the scheduler
        ip = get_ip(scheduler_ip, scheduler_port)
    nannies = [t(scheduler_ip, scheduler_port, ncores=nthreads, ip=ip,
                 services=services, name=name, loop=loop,
                 memory_limit=memory_limit, **kwargs)
               for i in range(nprocs)]

    for nanny in nannies:
        nanny.start(port)
        if t is Nanny:
            global_nannies.append(nanny)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)
        loop.add_callback(f)

    loop.start()
    logger.info("End worker")
    loop.close()

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        scheduler = rpc(ip=nannies[0].scheduler.ip,
                        port=nannies[0].scheduler.port)
        if not no_nanny:
            yield gen.with_timeout(timedelta(seconds=2),
                    All([scheduler.unregister(address=n.worker_address, close=True)
                        for n in nannies if n.process and n.worker_port]), io_loop=loop2)

    loop2.run_sync(f)

    if not no_nanny:
        for n in nannies:
            n.process.terminate()

    if not no_nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies)
                and time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()
Esempio n. 15
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
        nanny, name, memory_limit, pid_file, temp_filename, reconnect,
        resources, bokeh, bokeh_port):
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    try:
        scheduler_host, scheduler_port = scheduler.split(':')
        scheduler_ip = socket.gethostbyname(scheduler_host)
        scheduler_port = int(scheduler_port)
    except IndexError:
        logger.info("Usage:  dask-worker scheduler_host:scheduler_port")

    if nprocs > 1 and worker_port != 0:
        logger.error("Failed to launch worker.  You cannot use the --port argument when nprocs > 1.")
        exit(1)

    if nprocs > 1 and name:
        logger.error("Failed to launch worker.  You cannot use the --name argument when nprocs > 1.")
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            services[('bokeh', bokeh_port)] = BokehWorker

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if host is not None:
        ip = socket.gethostbyname(host)
    else:
        # lookup the ip address of a local interface on a network that
        # reach the scheduler
        ip = get_ip(scheduler_ip, scheduler_port)
    nannies = [t(scheduler_ip, scheduler_port, ncores=nthreads, ip=ip,
                 services=services, name=name, loop=loop, resources=resources,
                 memory_limit=memory_limit, reconnect=reconnect, **kwargs)
               for i in range(nprocs)]

    for n in nannies:
        n.start(port)
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {'port': nannies[0].port,
                   'local_directory': nannies[0].local_dir}
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)
        loop.add_callback(f)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    loop2 = IOLoop()

    @gen.coroutine
    def f():
        scheduler = rpc(ip=nannies[0].scheduler.ip,
                        port=nannies[0].scheduler.port)
        if nanny:
            yield gen.with_timeout(timedelta(seconds=2),
                    All([scheduler.unregister(address=n.worker_address, close=True)
                        for n in nannies if n.process and n.worker_port]), io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies)
                and time() < start + 1):
            sleep(0.1)

    for nanny in nannies:
        nanny.stop()