Example #1
0
    def testHolder(self):
        """Run ``CacheTestActor.run_test_cache`` inside a local actor pool.

        A single-process gevent pool is started on a port obtained from
        ``get_next_port()`` and the supporting actors are created before the
        test actor. Exception info reported by the test actor is re-raised
        here so the test fails with the original error. The chunk holder
        actor is always destroyed at the end.
        """
        address = '127.0.0.1:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent',
                               address=address) as pool:
            pool.create_actor(
                PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_name())
            pool.create_actor(
                WorkerClusterInfoActor, schedulers=[address],
                uid=WorkerClusterInfoActor.default_name())
            # These actors take nothing but their default uid.
            for actor_cls in (KVStoreActor, DispatchActor):
                pool.create_actor(actor_cls, uid=actor_cls.default_name())
            quota_size = 1024 * 1024 * 10
            pool.create_actor(
                QuotaActor, quota_size, uid=MemQuotaActor.default_name())
            holder_ref = pool.create_actor(
                ChunkHolderActor, self.plasma_storage_size,
                uid=ChunkHolderActor.default_name())
            pool.create_actor(SpillActor)

            try:
                tester = pool.create_actor(CacheTestActor)
                tester.run_test_cache()
                # Poll until the first element of get_exc_info() is truthy.
                while True:
                    if tester.get_exc_info()[0]:
                        break
                    pool.sleep(0.1)
                failure = tester.get_exc_info()[1]
                if failure:
                    six.reraise(*failure)
            finally:
                pool.destroy_actor(holder_ref)
Example #2
0
    def testEnsureTimeout(self, *_):
        """Check that ``run_test_ensure_timeout`` ends with ``PromiseTimeout``.

        ``options.worker.prepare_data_timeout`` is lowered to 2 while the
        test actor runs. The value that was configured before the test is
        restored afterwards, so other tests are not affected by this one.

        Extra positional arguments are accepted and ignored.
        """
        from mars.errors import PromiseTimeout

        pool_address = '127.0.0.1:%d' % get_next_port()
        with create_actor_pool(n_process=1,
                               backend='gevent',
                               address=pool_address) as pool:
            pool.create_actor(PlasmaKeyMapActor,
                              uid=PlasmaKeyMapActor.default_name())
            pool.create_actor(WorkerClusterInfoActor,
                              schedulers=[pool_address],
                              uid=WorkerClusterInfoActor.default_name())
            pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
            pool.create_actor(QuotaActor,
                              1024 * 1024 * 10,
                              uid=MemQuotaActor.default_name())
            pool.create_actor(SpillActor, uid=SpillActor.default_name())
            cache_ref = pool.create_actor(ChunkHolderActor,
                                          self.plasma_storage_size,
                                          uid=ChunkHolderActor.default_name())

            # Remember the configured timeout instead of assuming a default,
            # so the restore below cannot clobber a customised value.
            old_timeout = options.worker.prepare_data_timeout
            try:
                options.worker.prepare_data_timeout = 2
                test_ref = pool.create_actor(CacheTestActor)
                test_ref.run_test_ensure_timeout()
                # Poll until the first element of get_exc_info() is truthy.
                while not test_ref.get_exc_info()[0]:
                    pool.sleep(0.1)
                exc_info = test_ref.get_exc_info()[1]
                self.assertIsNotNone(exc_info)
                self.assertIsInstance(exc_info[1], PromiseTimeout)
            finally:
                options.worker.prepare_data_timeout = old_timeout
                pool.destroy_actor(cache_ref)
Example #3
0
    def testKVStoreActor(self):
        """Exercise write, batch write, read, batch read and delete.

        ``options.kv_store`` is pointed at a local etcd URL for the duration
        of the test and restored afterwards, so the setting does not leak
        into tests that run later in the same process.
        """
        etcd_port = get_next_port()
        proc_helper = EtcdProcessHelper(port_range_start=etcd_port)
        old_kv_store = options.kv_store
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        try:
            with proc_helper.run(), create_actor_pool(n_process=1,
                                                      backend='gevent') as pool:
                store_ref = pool.create_actor(KVStoreActor,
                                              uid=KVStoreActor.default_name())

                store_ref.write('/node/v1', 'value1')
                store_ref.write('/node/v2', 'value2')
                store_ref.write_batch([
                    ('/node/v2', 'value2'),
                    ('/node/v3', 'value3'),
                ])

                self.assertEqual(store_ref.read('/node/v1').value, 'value1')
                self.assertListEqual([
                    v.value
                    for v in store_ref.read_batch(['/node/v2', '/node/v3'])
                ], ['value2', 'value3'])

                store_ref.delete('/node', dir=True, recursive=True)
                # Deleting again must fail unless silent=True is passed.
                with self.assertRaises(KeyError):
                    store_ref.delete('/node', dir=True, recursive=True)
                store_ref.delete('/node', dir=True, recursive=True,
                                 silent=True)
        finally:
            options.kv_store = old_kv_store
Example #4
0
 def post_create(self):
     """Connect to the remote plasma socket and wrap it in a chunk store.

     After the parent hook has run, the plasma client is stored on
     ``_remote_plasma_client`` and a ``PlasmaChunkStore`` built from it and
     a reference to the KV store actor is stored on ``_remote_store``.
     """
     super(TransferTestActor, self).post_create()
     client = plasma.connect(self._remote_plasma_socket, '', 0)
     self._remote_plasma_client = client
     kv_store_ref = self.ctx.actor_ref(KVStoreActor.default_name())
     self._remote_store = PlasmaChunkStore(client, kv_store_ref)
Example #5
0
    def testExecute(self):
        """Run ``ExecuteTestActor.run_test`` inside a local actor pool.

        The supporting actors are created first, then the test actor is
        started and polled. Exception info it reports is re-raised here. The
        chunk holder actor is always destroyed at the end.
        """
        address = '127.0.0.1:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent',
                               address=address) as pool:
            pool.create_actor(
                ClusterInfoActor, schedulers=[address],
                uid=ClusterInfoActor.default_name())
            holder_ref = pool.create_actor(
                ChunkHolderActor, self._plasma_helper._size,
                uid='ChunkHolderActor')
            pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
            pool.create_actor(DispatchActor, uid='DispatchActor')
            quota_size = 1024 * 1024
            pool.create_actor(QuotaActor, quota_size, uid='MemQuotaActor')
            pool.create_actor(CpuCalcActor)
            pool.create_actor(ExecutionActor, uid='ExecutionActor')

            try:
                tester = pool.create_actor(ExecuteTestActor)
                tester.run_test()
                # Poll until the first element of get_exc_info() is truthy.
                while True:
                    if tester.get_exc_info()[0]:
                        break
                    gevent.sleep(0.1)
                failure = tester.get_exc_info()[1]
                if failure:
                    six.reraise(*failure)
            finally:
                pool.destroy_actor(holder_ref)
Example #6
0
def run_transfer_worker(pool_address, session_id, chunk_keys, spill_dir,
                        msg_queue):
    """Run a worker actor pool backed by its own plasma store.

    Starts a plasma store, creates the worker actors in a two-process gevent
    pool at ``pool_address``, registers ``chunk_keys`` for ``session_id``
    through ``WorkerRegistrationTestActor`` and then publishes the plasma
    socket path on ``msg_queue``. After that it waits until
    ``HolderActor.obtain()`` returns a truthy value.

    :param pool_address: address the actor pool listens on; also passed as
        the only scheduler address
    :param session_id: session id handed to the registration actor
    :param chunk_keys: chunk keys handed to the registration actor
    :param spill_dir: value assigned to ``options.worker.spill_directory``
    :param msg_queue: queue that receives the plasma socket path once
        registration has finished
    :raises SystemError: when either wait exceeds 60 seconds
    """
    from mars.config import options

    options.worker.spill_directory = spill_dir
    plasma_size = 1024 * 1024 * 10

    # Keep the two with-statements separate: the plasma socket option must
    # be set before the pool is created so that it is forked with it.
    with plasma.start_plasma_store(plasma_size) as store_args:
        options.worker.plasma_socket = plasma_socket = store_args[0]

        with create_actor_pool(n_process=2,
                               backend='gevent',
                               distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            # Bound up front so the cleanup below cannot fail with an
            # UnboundLocalError (hiding the real error) when actor creation
            # raises before the chunk holder exists.
            chunk_holder_ref = None
            try:
                pool.create_actor(ClusterInfoActor,
                                  schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor,
                                  uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor,
                                  uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor,
                                  1024 * 1024 * 20,
                                  uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(
                    ChunkHolderActor,
                    plasma_size,
                    uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                # Two senders followed by two receivers, each with a random
                # uid.
                for actor_cls in (SenderActor, SenderActor,
                                  ReceiverActor, ReceiverActor):
                    pool.create_actor(actor_cls,
                                      uid='%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # Tell the parent process where the plasma store lives.
                msg_queue.put(plasma_socket)
                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                if chunk_holder_ref is not None:
                    pool.destroy_actor(chunk_holder_ref)
Example #7
0
    def testKVStoreActor(self):
        """Write keys through ``KVStoreActor`` and read them back.

        Runs with the etcd helper process started and a single-process
        gevent actor pool.
        """
        etcd_helper = EtcdProcessHelper(port_range_start=54131)
        with etcd_helper.run(), \
                create_actor_pool(n_process=1, backend='gevent') as pool:
            kv_ref = pool.create_actor(
                KVStoreActor, uid=KVStoreActor.default_name())

            for key, value in (('/node/v1', 'value1'), ('/node/v2', 'value2')):
                kv_ref.write(key, value)
            batch = [
                ('/node/v2', 'value2'),
                ('/node/v3', 'value3'),
            ]
            kv_ref.write_batch(batch)

            single = kv_ref.read('/node/v1')
            self.assertEqual(single.value, 'value1')
            batch_values = [
                item.value
                for item in kv_ref.read_batch(['/node/v2', '/node/v3'])
            ]
            self.assertListEqual(batch_values, ['value2', 'value3'])
Example #8
0
    def testSimpleTransfer(self):
        """Drive ``TransferTestActor.do_transfer_test`` against a child worker.

        ``run_transfer_worker`` is started in a separate process with its
        own pool address, plasma socket and spill directory. Once it signals
        readiness on the queue, a local pool is created and the transfer
        test is run for two of the remote chunk keys. Exception info
        reported by the test actor is re-raised here.

        :raises SystemError: when the worker process dies or a result does
            not arrive within 60 seconds
        """
        import tempfile
        session_id = str(uuid.uuid4())

        local_pool_addr = 'localhost:%d' % get_next_port()
        remote_pool_addr = 'localhost:%d' % get_next_port()
        remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
        msg_queue = multiprocessing.Queue()

        remote_plasma_socket = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(run_transfer_worker))
        remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                        'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

        proc = multiprocessing.Process(
            target=run_transfer_worker,
            args=(remote_pool_addr, session_id, remote_plasma_socket,
                  remote_chunk_keys, remote_spill_dir, msg_queue)
        )
        proc.start()
        try:
            # The first positional argument of Queue.get() is ``block``, so
            # the timeout has to be passed by keyword to take effect.
            msg_queue.get(timeout=30)
        except BaseException:
            # Do not leave the worker running when the wait fails or is
            # interrupted; the error itself is re-raised unchanged.
            if proc.is_alive():
                proc.terminate()
            raise

        with create_actor_pool(n_process=1, distributor=WorkerDistributor(3),
                               backend='gevent', address=local_pool_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
            pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
            cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                          uid=ChunkHolderActor.default_name())
            pool.create_actor(SpillActor)

            sender_refs = [
                pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
            ]

            receiver_refs = [
                pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            ]

            test_ref = pool.create_actor(TransferTestActor, local_pool_addr, remote_pool_addr,
                                         remote_plasma_socket, remote_spill_dir)
            try:
                for data_id in (-1, 1):
                    chunk_key = remote_chunk_keys[data_id]

                    test_ref.do_transfer_test(session_id, chunk_key)

                    check_time = time.time()
                    # Wait until the test actor reports this chunk key.
                    while test_ref.get_results()[0] != chunk_key:
                        gevent.sleep(0.5)
                        if not proc.is_alive():
                            raise SystemError('Transfer worker dead. exit code %s' % proc.exitcode)
                        if time.time() - check_time > 60:
                            raise SystemError('Wait result timeout')
                    exc = test_ref.get_results()[1]
                    if exc:
                        six.reraise(*exc)

                # Let the remote worker leave its final wait loop.
                remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
                remote_holder_ref.trigger()
            finally:
                try:
                    for ref in sender_refs:
                        pool.destroy_actor(ref)
                    for ref in receiver_refs:
                        pool.destroy_actor(ref)
                    pool.destroy_actor(cache_ref)
                    pool.destroy_actor(test_ref)

                    os.unlink(remote_plasma_socket)
                finally:
                    # Always stop the worker, even when the cleanup above
                    # raises (for example when the socket file is missing).
                    if proc.is_alive():
                        proc.terminate()
Example #9
0
def run_transfer_worker(pool_address, session_id, plasma_socket, chunk_keys,
                        spill_dir, msg_queue):
    """Run a worker actor pool backed by a helper-managed plasma store.

    Starts a plasma store through ``PlasmaProcessHelper`` on
    ``plasma_socket``, creates the worker actors in a two-process gevent
    pool at ``pool_address``, registers ``chunk_keys`` for ``session_id``
    through ``WorkerRegistrationTestActor`` and then puts ``1`` on
    ``msg_queue``. After that it waits until ``HolderActor.obtain()``
    returns a truthy value. The plasma helper is always stopped on exit.

    :param pool_address: address the actor pool listens on; also passed as
        the only scheduler address
    :param session_id: session id handed to the registration actor
    :param plasma_socket: value assigned to ``options.worker.plasma_socket``
    :param chunk_keys: chunk keys handed to the registration actor
    :param spill_dir: value assigned to ``options.worker.spill_directory``
    :param msg_queue: queue that receives ``1`` once registration finished
    :raises SystemError: when either wait exceeds 60 seconds
    """
    from mars.config import options
    from mars.utils import PlasmaProcessHelper

    options.worker.plasma_socket = plasma_socket
    options.worker.spill_directory = spill_dir

    plasma_helper = PlasmaProcessHelper(size=1024 * 1024 * 10,
                                        socket=options.worker.plasma_socket)
    try:
        plasma_helper.run()

        with create_actor_pool(n_process=2,
                               backend='gevent',
                               distributor=BaseDistributor(2),
                               address=pool_address) as pool:
            # Bound up front so the cleanup below cannot fail with an
            # UnboundLocalError (hiding the real error) when actor creation
            # raises before the chunk holder exists.
            chunk_holder_ref = None
            try:
                pool.create_actor(ClusterInfoActor,
                                  schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor,
                                  uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid='DispatchActor')
                pool.create_actor(QuotaActor,
                                  1024 * 1024 * 20,
                                  uid='MemQuotaActor')
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(ChunkHolderActor,
                                                     plasma_helper._size,
                                                     uid='ChunkHolderActor')
                pool.create_actor(SpillActor)

                # Two senders followed by two receivers, each with a random
                # 'w:'-prefixed uid.
                for actor_cls in (SenderActor, SenderActor,
                                  ReceiverActor, ReceiverActor):
                    pool.create_actor(actor_cls,
                                      uid='w:%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # Signal the parent process that registration is complete.
                msg_queue.put(1)
                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                if chunk_holder_ref is not None:
                    pool.destroy_actor(chunk_holder_ref)
    finally:
        plasma_helper.stop()
Example #10
0
    def testSimpleTransfer(self):
        """Send chunks from a child worker process and compare both copies.

        ``run_transfer_worker`` is started in a separate process and reports
        its plasma socket path through the queue. For two of the remote
        chunk keys, a promise chain asks the remote dispatcher for a free
        sender slot, asks that sender to send the chunk to the local pool,
        and then checks that the local and remote data are equal. Data is
        read from the chunk store, or from the spill file when the store
        raises ``KeyError``.

        The worker process is always shut down at the end: SIGINT first,
        then ``terminate()`` if it is still alive after about 5 seconds.
        """
        import tempfile
        session_id = str(uuid.uuid4())

        local_pool_addr = 'localhost:%d' % get_next_port()
        remote_pool_addr = 'localhost:%d' % get_next_port()
        remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
        msg_queue = multiprocessing.Queue()

        remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                        'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

        proc = multiprocessing.Process(
            target=run_transfer_worker,
            args=(remote_pool_addr, session_id, remote_chunk_keys, remote_spill_dir, msg_queue)
        )
        proc.start()
        try:
            # The first positional argument of Queue.get() is ``block``, so
            # the timeout has to be passed by keyword to take effect.
            remote_plasma_socket = msg_queue.get(timeout=30)
        except BaseException:
            # Do not leave the worker running when the wait fails or is
            # interrupted; the error itself is re-raised unchanged.
            if proc.is_alive():
                proc.terminate()
            raise

        with create_actor_pool(n_process=1, distributor=WorkerDistributor(1),
                               backend='gevent', address=local_pool_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                              uid=ClusterInfoActor.default_name())
            kv_store_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
            pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
            cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                          uid=ChunkHolderActor.default_name())
            pool.create_actor(SpillActor)

            sender_refs = [
                pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
            ]

            receiver_refs = [
                pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            ]

            try:
                for data_id in (-1, 1):
                    chunk_key = remote_chunk_keys[data_id]

                    with self.run_actor_test(pool) as test_actor:
                        from mars.worker.spill import build_spill_file_name
                        from mars.serialize import dataserializer
                        from numpy.testing import assert_array_equal

                        remote_dispatch_ref = test_actor.promise_ref(
                            DispatchActor.default_name(), address=remote_pool_addr)
                        remote_plasma_client = plasma.connect(remote_plasma_socket, '', 0)
                        remote_store = PlasmaChunkStore(remote_plasma_client, kv_store_ref)

                        def _call_send_data(sender_uid):
                            # Ask the remote sender to push the chunk to the
                            # local pool.
                            sender_ref = test_actor.promise_ref(sender_uid, address=remote_pool_addr)
                            return sender_ref.send_data(session_id, chunk_key, local_pool_addr, _promise=True)

                        def _test_data_exist(*_):
                            # Read each copy from its chunk store, falling
                            # back to the spill file when the key is absent.
                            try:
                                local_data = test_actor._chunk_store.get(session_id, chunk_key)
                            except KeyError:
                                with open(build_spill_file_name(chunk_key), 'rb') as spill_file:
                                    local_data = dataserializer.load(spill_file)

                            try:
                                remote_data = remote_store.get(session_id, chunk_key)
                            except KeyError:
                                with open(build_spill_file_name(chunk_key, remote_spill_dir), 'rb') as spill_file:
                                    remote_data = dataserializer.load(spill_file)
                            assert_array_equal(local_data, remote_data)

                            del local_data, remote_data

                        remote_dispatch_ref.get_free_slot('sender', _promise=True) \
                            .then(_call_send_data) \
                            .then(_test_data_exist) \
                            .then(
                            lambda *_: test_actor.set_result(chunk_key),
                            lambda *exc: test_actor.set_result(exc, False),
                        )
                    self.assertEqual(self.get_result(60), chunk_key)

                # Let the remote worker leave its final wait loop.
                remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
                remote_holder_ref.trigger()
            finally:
                try:
                    for ref in sender_refs:
                        pool.destroy_actor(ref)
                    for ref in receiver_refs:
                        pool.destroy_actor(ref)
                    pool.destroy_actor(cache_ref)

                    os.unlink(remote_plasma_socket)
                finally:
                    # Always stop the worker, even when the cleanup above
                    # raises (for example when the socket file is missing).
                    # SIGINT first; terminate() only if it stays alive.
                    if proc.is_alive():
                        os.kill(proc.pid, signal.SIGINT)
                    t = time.time()
                    while proc.is_alive() and time.time() < t + 5:
                        time.sleep(1)
                    if proc.is_alive():
                        proc.terminate()
Example #11
0
    def testExecuteWorker(self):
        """Start a real worker process and have it execute a small graph.

        A mock scheduler pool is created locally, then ``mars.worker`` is
        launched as a subprocess pointing at it. Once the worker has written
        ``/workers/meta_timestamp`` to the KV store, a tiled ``dot`` graph is
        submitted to its ``ExecutionActor`` and the test waits for the reply
        callback.

        The worker is always shut down at the end: SIGINT first, then
        ``kill()`` if it has not exited after about 5 seconds.

        :raises SystemError: when the worker dies, or when registration or
            the reply does not arrive within 20 seconds
        """
        import mars.tensor as mt
        mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
        # Bound before the try block so the cleanup cannot fail with an
        # UnboundLocalError (hiding the real error) when pool or actor
        # setup raises before the worker process is started.
        proc = None
        try:

            session_id = str(uuid.uuid4())
            with create_actor_pool(n_process=1,
                                   backend='gevent',
                                   address=mock_scheduler_addr) as pool:
                pool.create_actor(ClusterInfoActor,
                                  schedulers=[mock_scheduler_addr],
                                  uid=ClusterInfoActor.default_name())
                kv_ref = pool.create_actor(KVStoreActor,
                                           uid=KVStoreActor.default_name())
                pool.create_actor(ResourceActor,
                                  uid=ResourceActor.default_name())

                proc = subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--schedulers', mock_scheduler_addr, '--cpu-procs', '1',
                    '--cache-mem', '10m', '--ignore-avail-mem'
                ])
                worker_ips = []

                def waiter():
                    # Wait for the worker to publish its meta timestamp,
                    # then collect the worker addresses from the meta keys.
                    check_time = time.time()
                    while kv_ref.read('/workers/meta_timestamp',
                                      silent=True) is None:
                        gevent.sleep(0.5)
                        if proc.poll() is not None:
                            raise SystemError('Worker dead. exit code %s' %
                                              proc.poll())
                        if time.time() - check_time > 20:
                            raise SystemError(
                                'Check meta_timestamp timeout')
                    val = kv_ref.read('/workers/meta')
                    worker_ips.extend(
                        [c.key.rsplit('/', 1)[-1] for c in val.children])

                gl = gevent.spawn(waiter)
                # get() waits like join() but re-raises an exception from
                # the greenlet, so a failed wait is reported as itself
                # rather than as an IndexError on worker_ips[0] below.
                gl.get()

                a = mt.ones((100, 50), chunks=30)
                b = mt.ones((50, 200), chunks=30)
                result = a.dot(b)

                graph = result.build_graph(tiled=True)

                reply_ref = pool.create_actor(PromiseReplyTestActor)
                reply_callback = ((reply_ref.uid, reply_ref.address), 'reply')

                executor_ref = pool.actor_ref(ExecutionActor.default_name(),
                                              address=worker_ips[0])
                io_meta = dict(chunks=[c.key for c in result.chunks])
                executor_ref.execute_graph(session_id,
                                           str(id(graph)),
                                           serialize_graph(graph),
                                           io_meta,
                                           None,
                                           callback=reply_callback)

                check_time = time.time()
                while not reply_ref.get_reply():
                    gevent.sleep(0.1)
                    if time.time() - check_time > 20:
                        raise SystemError('Check reply timeout')
        finally:
            if proc is not None and proc.poll() is None:
                proc.send_signal(signal.SIGINT)
                check_time = time.time()
                while True:
                    time.sleep(1)
                    if proc.poll(
                    ) is not None or time.time() - check_time >= 5:
                        break
                if proc.poll() is None:
                    proc.kill()
            if os.path.exists(options.worker.plasma_socket):
                os.unlink(options.worker.plasma_socket)