Example #1
0
    def testSendTargets(self):
        """
        Execute ``ones(4) + mock_data`` through ``ExecutionActor`` with a send
        address registered for the result chunk, then verify the value that
        ends up in the test actor's chunk store.
        """
        session_id = str(uuid.uuid4())
        mock_data = np.array([1, 2, 3, 4])
        pool_address = '127.0.0.1:%d' % get_next_port()

        with create_actor_pool(n_process=1, backend='gevent', address=pool_address,
                               distributor=WorkerDistributor(2)) as pool:
            self.create_standard_actors(
                pool, pool_address, with_daemon=False, with_status=False)
            pool.create_actor(CpuCalcActor)

            # kept inside the pool context, exactly where the import originally ran
            import mars.tensor as mt
            ones_tensor = mt.ones((4,), chunk_size=4)
            data_tensor = mt.array(mock_data)
            result_tensor = ones_tensor + data_tensor
            graph = result_tensor.build_graph(compose=False, tiled=True)
            result_key = result_tensor.chunks[0].key

            pool.create_actor(
                MockSenderActor, mock_data + np.ones((4,)), 'out', uid='w:mock_sender')

            with self.run_actor_test(pool) as test_actor:
                def _validate(_):
                    # the stored chunk must equal the element-wise sum
                    stored = test_actor._chunk_store.get(
                        session_id, result_tensor.chunks[0].key)
                    assert_array_equal(stored, mock_data + np.ones((4,)))

                graph_key = str(uuid.uuid4())
                execution_ref = test_actor.promise_ref(ExecutionActor.default_name())

                def _start(*_):
                    return execution_ref.start_execution(
                        session_id, graph_key, _promise=True)

                def _report_success(*_):
                    return test_actor.set_result(None)

                def _report_failure(*exc):
                    return test_actor.set_result(exc, False)

                # enqueue -> start -> validate -> report, each step chained on the previous one
                enqueued = execution_ref.enqueue_graph(
                    session_id, graph_key, serialize_graph(graph),
                    {'chunks': [result_tensor.chunks[0].key]}, None,
                    send_addresses={result_key: (pool_address,)}, _promise=True)
                started = enqueued.then(_start)
                validated = started.then(_validate)
                validated.then(_report_success).catch(_report_failure)

            self.get_result()
Example #2
0
    def testCalcProcessFailure(self):
        """
        Kill the process hosting a mock calc actor after a simple calc has been
        submitted and check the failure is reported.

        Expects ``wait_for_result`` to raise ``WorkerProcessStopped`` and the
        dispatcher to list exactly one ``'cpu'`` slot afterwards.
        """
        pool_address = '127.0.0.1:%d' % get_next_port()
        session_id = str(uuid.uuid4())
        mock_data = np.array([1, 2, 3, 4])
        with create_actor_pool(n_process=2,
                               backend='gevent',
                               address=pool_address,
                               distributor=WorkerDistributor(2)) as pool:
            self.create_standard_actors(pool, pool_address, with_status=False)

            daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name())
            dispatch_ref = pool.actor_ref(DispatchActor.default_name())
            calc_ref = daemon_ref.create_actor(MockCpuCalcActor,
                                               session_id,
                                               mock_data,
                                               10,
                                               uid='w:1:cpu-calc-a')
            daemon_ref.create_actor(ProcessHelperActor,
                                    uid='w:1:proc-helper-a')

            test_actor = pool.create_actor(ExecutionTestActor,
                                           uid='w:test_actor')
            test_actor.run_simple_calc(session_id, _tell=True)

            # presumably gives the calc time to start before the process is killed
            pool.sleep(2)
            proc_id = pool.distributor.distribute(calc_ref.uid)
            daemon_ref.kill_actor_process(calc_ref)
            # use the TestCase assertion: a bare ``assert`` is removed under ``-O``
            self.assertFalse(daemon_ref.is_actor_process_alive(calc_ref))
            pool.restart_process(proc_id)
            daemon_ref.handle_process_down([proc_id])

            with self.assertRaises(WorkerProcessStopped):
                self.wait_for_result(pool, test_actor)
            self.assertEqual(len(dispatch_ref.get_slots('cpu')), 1)
Example #3
0
    def testStopGraphCalc(self):
        """
        Stop a graph after a simple calc has been submitted and expect
        ``wait_for_result`` to raise ``ExecutionInterrupted``.
        """
        session_id = str(uuid.uuid4())
        mock_data = np.array([1, 2, 3, 4])
        pool_address = '127.0.0.1:%d' % get_next_port()

        with create_actor_pool(n_process=2, backend='gevent', address=pool_address,
                               distributor=WorkerDistributor(2)) as pool:
            self.create_standard_actors(pool, pool_address, with_status=False)

            daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name())
            execution_ref = pool.actor_ref(ExecutionActor.default_name())

            calc_ref = daemon_ref.create_actor(MockCpuCalcActor,
                                               session_id,
                                               mock_data,
                                               10,
                                               uid='w:1:cpu-calc-a')
            daemon_ref.create_actor(ProcessHelperActor,
                                    uid='w:1:proc-helper-a')

            test_actor = pool.create_actor(ExecutionTestActor,
                                           uid='w:test_actor')
            test_actor.run_simple_calc(session_id, _tell=True)

            pool.sleep(2)
            calc_proc_id = pool.distributor.distribute(calc_ref.uid)
            graph_key = test_actor.get_graph_key()
            execution_ref.stop_execution(session_id, graph_key, _tell=True)

            # poll until the daemon reports the calc process as gone
            # NOTE(review): this wait has no upper bound
            while True:
                if not daemon_ref.is_actor_process_alive(calc_ref):
                    break
                pool.sleep(0.1)

            pool.restart_process(calc_proc_id)
            daemon_ref.handle_process_down([calc_proc_id])

            with self.assertRaises(ExecutionInterrupted):
                self.wait_for_result(pool, test_actor)
Example #4
0
    def testDaemon(self):
        """
        Exercise ``WorkerDaemonActor``: kill the process of a sleeping actor,
        restart it, and check the pending call fails with
        ``WorkerProcessStopped`` while a later call succeeds.
        """
        addr = '127.0.0.1:%d' % get_next_port()
        restarted_proc = 1

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=WorkerDistributor(2), address=addr) as pool:
            daemon_ref = pool.create_actor(WorkerDaemonActor,
                                           uid=WorkerDaemonActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
            sleeper_ref = daemon_ref.create_actor(
                DaemonSleeperActor, uid='w:1:DaemonSleeperActor')
            daemon_ref.create_actor(ProcessHelperActor, uid='w:1:ProcHelper')
            test_actor = pool.create_actor(DaemonTestActor)
            daemon_ref.register_callback(test_actor, 'handle_process_down')

            test_actor.run_test_sleep(sleeper_ref, 10, _tell=True)
            alive_before_kill = daemon_ref.is_actor_process_alive(sleeper_ref)
            self.assertTrue(alive_before_kill)

            pool.sleep(0.5)

            # kill twice: the second, repeated kill must not produce errors
            for _ in range(2):
                daemon_ref.kill_actor_process(sleeper_ref)
            alive_after_kill = daemon_ref.is_actor_process_alive(sleeper_ref)
            self.assertFalse(alive_after_kill)

            pool.restart_process(restarted_proc)
            daemon_ref.handle_process_down([restarted_proc])
            pool.sleep(1)
            self.assertTrue(pool.has_actor(sleeper_ref))
            with self.assertRaises(WorkerProcessStopped):
                test_actor.get_result()

            # after the restart a fresh sleep is expected to complete
            test_actor.run_test_sleep(sleeper_ref, 1)
            pool.sleep(1.5)
            test_actor.get_result()
Example #5
0
def run_transfer_worker(pool_address, session_id, chunk_keys, spill_dir,
                        msg_queue):
    """
    Entry point of the remote worker process used by the transfer test.

    Starts a plasma store and a two-process actor pool at ``pool_address``,
    creates the worker actors, registers ``chunk_keys`` under ``session_id``
    through ``WorkerRegistrationTestActor``, publishes the plasma socket on
    ``msg_queue`` and then waits until ``HolderActor.obtain()`` is truthy.

    :param pool_address: address the actor pool is created on
    :param session_id: session id handed to the registration actor
    :param chunk_keys: chunk keys handed to the registration actor
    :param spill_dir: value stored into ``options.worker.spill_directory``
    :param msg_queue: queue receiving the plasma socket once registration finished
    :raises SystemError: when either wait loop exceeds 60 seconds
    """
    from mars.config import options

    options.worker.spill_directory = spill_dir
    plasma_size = 1024 * 1024 * 10

    # don't merge the two with-statements: the options must be set
    # before the pool is created so that they are forked with it
    with plasma.start_plasma_store(plasma_size) as store_args:
        options.worker.plasma_socket = plasma_socket = store_args[0]

        with create_actor_pool(n_process=2,
                               backend='gevent',
                               distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            # Bound before the ``try`` so the ``finally`` clause never hits an
            # unbound name (and masks the real error) when setup fails early.
            chunk_holder_ref = None
            try:
                pool.create_actor(ClusterInfoActor,
                                  schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(ChunkMetaActor,
                                  uid=ChunkMetaActor.default_name())
                pool.create_actor(DispatchActor,
                                  uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor,
                                  1024 * 1024 * 20,
                                  uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(
                    ChunkHolderActor,
                    plasma_size,
                    uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))

                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                # wait for the registration actor to report completion
                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # tell the parent which plasma socket to connect to
                msg_queue.put(plasma_socket)

                # stay alive until the holder actor has been triggered
                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                if chunk_holder_ref is not None:
                    pool.destroy_actor(chunk_holder_ref)
Example #6
0
    def testFetchRemoteData(self):
        """
        Execute ``ones(4) + mock_data`` where the data chunk's op has been
        replaced by a ``TensorFetch`` and its meta lists two workers, then
        verify the result found in the test actor's chunk store.
        """
        session_id = str(uuid.uuid4())
        mock_data = np.array([1, 2, 3, 4])
        pool_address = '127.0.0.1:%d' % get_next_port()

        with create_actor_pool(n_process=1, backend='gevent', address=pool_address,
                               distributor=WorkerDistributor(2)) as pool:
            self.create_standard_actors(
                pool, pool_address, with_daemon=False, with_status=False)
            pool.create_actor(CpuCalcActor)
            pool.create_actor(MockSenderActor, mock_data, 'in', uid='w:mock_sender')
            chunk_meta_ref = pool.actor_ref(ChunkMetaActor.default_name())

            # kept inside the pool context, exactly where the imports originally ran
            import mars.tensor as mt
            from mars.tensor.expressions.datasource import TensorFetch
            ones_tensor = mt.ones((4, ), chunk_size=4)
            data_tensor = mt.array(mock_data)
            result_tensor = ones_tensor + data_tensor
            graph = result_tensor.build_graph(compose=False, tiled=True)

            # the op is swapped only after the graph has been built
            modified_chunk = data_tensor.chunks[0]
            fetch_op = TensorFetch(
                dtype=modified_chunk.dtype,
                _outputs=[weakref.ref(out) for out in modified_chunk.op.outputs],
                _key=modified_chunk.op.key)
            data_tensor.chunks[0]._op = fetch_op

            chunk_workers = ('0.0.0.0:1234', pool_address)
            chunk_meta_ref.set_chunk_meta(
                session_id, modified_chunk.key,
                size=mock_data.nbytes, shape=mock_data.shape, workers=chunk_workers)

            with self.run_actor_test(pool) as test_actor:
                def _validate(_):
                    stored = test_actor._chunk_store.get(
                        session_id, result_tensor.chunks[0].key)
                    assert_array_equal(stored, mock_data + np.ones((4, )))

                graph_key = str(uuid.uuid4())
                execution_ref = test_actor.promise_ref(ExecutionActor.default_name())

                def _start(*_):
                    return execution_ref.start_execution(
                        session_id, graph_key, _promise=True)

                def _report_success(*_):
                    return test_actor.set_result(None)

                def _report_failure(*exc):
                    return test_actor.set_result(exc, False)

                # enqueue -> start -> validate -> report, each step chained on the previous one
                enqueued = execution_ref.enqueue_graph(
                    session_id, graph_key, serialize_graph(graph),
                    {'chunks': [result_tensor.chunks[0].key]}, None, _promise=True)
                started = enqueued.then(_start)
                validated = started.then(_validate)
                validated.then(_report_success).catch(_report_failure)

            self.get_result()
Example #7
0
def run_transfer_worker(pool_address, session_id, plasma_socket, chunk_keys,
                        spill_dir, msg_queue):
    """
    Entry point of the remote worker process used by the transfer test.

    Runs a ``PlasmaProcessHelper`` on ``plasma_socket`` and a two-process actor
    pool at ``pool_address``, creates the worker actors, registers
    ``chunk_keys`` under ``session_id`` through ``WorkerRegistrationTestActor``,
    puts ``1`` on ``msg_queue`` as a ready signal and then waits until
    ``HolderActor.obtain()`` is truthy.

    :param pool_address: address the actor pool is created on
    :param session_id: session id handed to the registration actor
    :param plasma_socket: value stored into ``options.worker.plasma_socket``
    :param chunk_keys: chunk keys handed to the registration actor
    :param spill_dir: value stored into ``options.worker.spill_directory``
    :param msg_queue: queue receiving ``1`` once registration finished
    :raises SystemError: when either wait loop exceeds 60 seconds
    """
    from mars.config import options
    from mars.utils import PlasmaProcessHelper

    options.worker.plasma_socket = plasma_socket
    options.worker.spill_directory = spill_dir

    plasma_helper = PlasmaProcessHelper(size=1024 * 1024 * 10, socket=options.worker.plasma_socket)
    try:
        plasma_helper.run()

        with create_actor_pool(n_process=2, backend='gevent', distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            # Bound before the ``try`` so the ``finally`` clause never hits an
            # unbound name (and masks the real error) when setup fails early.
            chunk_holder_ref = None
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(ChunkHolderActor, plasma_helper._size,
                                                     uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))

                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                # wait for the registration actor to report completion
                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # signal the parent process that the worker is ready
                msg_queue.put(1)

                # stay alive until the holder actor has been triggered
                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                if chunk_holder_ref is not None:
                    pool.destroy_actor(chunk_holder_ref)
    finally:
        plasma_helper.stop()
Example #8
0
    def testSimpleTransfer(self):
        """
        Transfer chunks from a worker running in a separate process to a local
        pool and check that local and remote copies are equal.

        The remote worker is started with ``run_transfer_worker`` and reports
        its plasma socket through a queue. For two of the remote chunk keys a
        remote sender is asked to send the chunk to the local pool; the data
        is then read back on both sides (chunk store first, spill file as
        fallback) and compared.
        """
        import tempfile
        session_id = str(uuid.uuid4())

        local_pool_addr = 'localhost:%d' % get_next_port()
        remote_pool_addr = 'localhost:%d' % get_next_port()
        remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
        msg_queue = multiprocessing.Queue()

        remote_spill_dir = os.path.join(
            tempfile.gettempdir(),
            'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

        proc = multiprocessing.Process(target=run_transfer_worker,
                                       args=(remote_pool_addr, session_id,
                                             remote_chunk_keys,
                                             remote_spill_dir, msg_queue))
        proc.start()
        try:
            # ``timeout`` must be passed by keyword: the first positional
            # argument of ``Queue.get`` is ``block``, so ``get(30)`` would
            # wait forever if the worker never reports back.
            remote_plasma_socket = msg_queue.get(timeout=30)
        except BaseException:
            # do not leave the worker process behind when startup failed
            if proc.is_alive():
                proc.terminate()
            raise

        with create_actor_pool(n_process=1,
                               distributor=WorkerDistributor(1),
                               backend='gevent',
                               address=local_pool_addr) as pool:
            pool.create_actor(ClusterInfoActor,
                              schedulers=[local_pool_addr],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor,
                              uid=ChunkMetaActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
            pool.create_actor(QuotaActor,
                              1024 * 1024 * 20,
                              uid=MemQuotaActor.default_name())
            cache_ref = pool.create_actor(ChunkHolderActor,
                                          self.plasma_storage_size,
                                          uid=ChunkHolderActor.default_name())
            pool.create_actor(SpillActor)

            sender_refs = [
                pool.create_actor(SenderActor,
                                  uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(SenderActor,
                                  uid='w:2:%s' % str(uuid.uuid4())),
            ]

            receiver_refs = [
                pool.create_actor(ReceiverActor,
                                  uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor,
                                  uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor,
                                  uid='w:2:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor,
                                  uid='w:2:%s' % str(uuid.uuid4())),
            ]

            try:
                for data_id in (-1, 1):
                    chunk_key = remote_chunk_keys[data_id]

                    with self.run_actor_test(pool) as test_actor:
                        from mars.worker.spill import build_spill_file_name
                        from mars.serialize import dataserializer
                        from numpy.testing import assert_array_equal

                        remote_dispatch_ref = test_actor.promise_ref(
                            DispatchActor.default_name(),
                            address=remote_pool_addr)
                        remote_plasma_client = plasma.connect(
                            remote_plasma_socket, '', 0)
                        remote_store = PlasmaChunkStore(remote_plasma_client)

                        def _call_send_data(sender_uid):
                            # ask the remote sender to push the chunk to the local pool
                            sender_ref = test_actor.promise_ref(
                                sender_uid, address=remote_pool_addr)
                            return sender_ref.send_data(session_id,
                                                        chunk_key,
                                                        local_pool_addr,
                                                        _promise=True)

                        def _test_data_exist(*_):
                            # read each copy from its chunk store, falling back
                            # to the spill file when the key is not in the store
                            try:
                                local_data = test_actor._chunk_store.get(
                                    session_id, chunk_key)
                            except KeyError:
                                with open(build_spill_file_name(chunk_key),
                                          'rb') as spill_file:
                                    local_data = dataserializer.load(
                                        spill_file)

                            try:
                                remote_data = remote_store.get(
                                    session_id, chunk_key)
                            except KeyError:
                                with open(
                                        build_spill_file_name(
                                            chunk_key, remote_spill_dir),
                                        'rb') as spill_file:
                                    remote_data = dataserializer.load(
                                        spill_file)
                            assert_array_equal(local_data, remote_data)

                            del local_data, remote_data

                        remote_dispatch_ref.get_free_slot('sender', _promise=True) \
                            .then(_call_send_data) \
                            .then(_test_data_exist) \
                            .then(
                            lambda *_: test_actor.set_result(chunk_key),
                            lambda *exc: test_actor.set_result(exc, False),
                        )
                    self.assertEqual(self.get_result(60), chunk_key)

                remote_holder_ref = pool.actor_ref('HolderActor',
                                                   address=remote_pool_addr)
                remote_holder_ref.trigger()
            finally:
                try:
                    for ref in sender_refs:
                        pool.destroy_actor(ref)
                    for ref in receiver_refs:
                        pool.destroy_actor(ref)
                    pool.destroy_actor(cache_ref)

                    os.unlink(remote_plasma_socket)
                finally:
                    # Always stop the worker process, even when the cleanup
                    # above raised; otherwise it would be leaked.
                    if proc.is_alive():
                        os.kill(proc.pid, signal.SIGINT)
                    kill_time = time.time()
                    while proc.is_alive() and time.time() < kill_time + 5:
                        time.sleep(1)
                    # escalate when SIGINT was not enough within five seconds
                    if proc.is_alive():
                        proc.terminate()
Example #9
0
    def testSimpleTransfer(self):
        """
        Transfer chunks from a worker running in a separate process to a local
        pool, delegating the transfer and comparison to ``TransferTestActor``.

        The remote worker is started with ``run_transfer_worker`` and signals
        readiness through a queue. For two of the remote chunk keys the test
        actor is asked to run the transfer test, and its results are polled
        until the chunk key shows up; a recorded exception is re-raised.

        :raises SystemError: when the worker process dies or a result does not
            arrive within 60 seconds
        """
        import tempfile
        session_id = str(uuid.uuid4())

        local_pool_addr = 'localhost:%d' % get_next_port()
        remote_pool_addr = 'localhost:%d' % get_next_port()
        remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
        msg_queue = multiprocessing.Queue()

        remote_plasma_socket = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(run_transfer_worker))
        remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                        'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

        proc = multiprocessing.Process(
            target=run_transfer_worker,
            args=(remote_pool_addr, session_id, remote_plasma_socket,
                  remote_chunk_keys, remote_spill_dir, msg_queue)
        )
        proc.start()
        try:
            # ``timeout`` must be passed by keyword: the first positional
            # argument of ``Queue.get`` is ``block``, so ``get(30)`` would
            # wait forever if the worker never reports back.
            msg_queue.get(timeout=30)
        except BaseException:
            # do not leave the worker process behind when startup failed
            if proc.is_alive():
                proc.terminate()
            raise

        with create_actor_pool(n_process=1, distributor=WorkerDistributor(3),
                               backend='gevent', address=local_pool_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
            pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
            cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                          uid=ChunkHolderActor.default_name())
            pool.create_actor(SpillActor)

            sender_refs = [
                pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
            ]

            receiver_refs = [
                pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
                pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            ]

            test_ref = pool.create_actor(TransferTestActor, local_pool_addr, remote_pool_addr,
                                         remote_plasma_socket, remote_spill_dir)
            try:
                for data_id in (-1, 1):
                    chunk_key = remote_chunk_keys[data_id]

                    test_ref.do_transfer_test(session_id, chunk_key)

                    # poll until the test actor reports this chunk key
                    check_time = time.time()
                    while test_ref.get_results()[0] != chunk_key:
                        gevent.sleep(0.5)
                        if not proc.is_alive():
                            raise SystemError('Transfer worker dead. exit code %s' % proc.exitcode)
                        if time.time() - check_time > 60:
                            raise SystemError('Wait result timeout')
                    # second element holds exception info when the transfer failed
                    exc = test_ref.get_results()[1]
                    if exc:
                        six.reraise(*exc)

                remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
                remote_holder_ref.trigger()
            finally:
                try:
                    for ref in sender_refs:
                        pool.destroy_actor(ref)
                    for ref in receiver_refs:
                        pool.destroy_actor(ref)
                    pool.destroy_actor(cache_ref)
                    pool.destroy_actor(test_ref)

                    os.unlink(remote_plasma_socket)
                finally:
                    # Always stop the worker process, even when the cleanup
                    # above raised; otherwise it would be leaked.
                    if proc.is_alive():
                        proc.terminate()