Exemple #1
0
def run_transfer_worker(pool_address, session_id, chunk_keys, spill_dir, msg_queue):
    options.worker.spill_directory = spill_dir
    plasma_size = 1024 * 1024 * 10

    # don't use multiple with-statement as we need the options be forked
    with plasma.start_plasma_store(plasma_size) as store_args:
        options.worker.plasma_socket = plasma_socket = store_args[0]
        plasma_client = plasma.connect(plasma_socket, '', 0)

        with start_transfer_test_pool(address=pool_address, plasma_size=plasma_size) as pool:
            chunk_holder_ref = pool.actor_ref(ChunkHolderActor.default_name())
            mapper_ref = pool.actor_ref(PlasmaKeyMapActor.default_name())
            plasma_store = PlasmaChunkStore(plasma_client, mapper_ref)

            for _ in range(2):
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))

            for idx in range(0, len(chunk_keys) - 7):
                data = np.ones((640 * 1024,), dtype=np.int16) * idx
                write_spill_file(chunk_keys[idx], data)
            for idx in range(len(chunk_keys) - 7, len(chunk_keys)):
                data = np.ones((640 * 1024,), dtype=np.int16) * idx
                plasma_store.put(session_id, chunk_keys[idx], data)
                chunk_holder_ref.register_chunk(session_id, chunk_keys[idx])

            msg_queue.put(plasma_socket)
            t = time.time()
            while True:
                try:
                    msg_queue.get_nowait()
                except Empty:
                    if time.time() > t + 60:
                        raise SystemError('Transfer finish timed out.')
                    pool.sleep(0.1)
Exemple #2
0
    def testPrepareSpilled(self):
        from mars.worker.spill import write_spill_file

        pool_address = '127.0.0.1:%d' % get_next_port()
        session_id = str(uuid.uuid4())
        mock_data = np.array([1, 2, 3, 4])

        options.worker.spill_directory = tempfile.mkdtemp(prefix='mars_worker_prep_spilled-')

        with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
            self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
            pool.create_actor(SpillActor)
            pool.create_actor(CpuCalcActor)
            cluster_info_ref = pool.actor_ref(WorkerClusterInfoActor.default_uid())
            chunk_meta_client = ChunkMetaClient(pool, cluster_info_ref)
            pool.actor_ref(ChunkHolderActor.default_uid())

            import mars.tensor as mt
            from mars.tensor.fetch import TensorFetch
            arr = mt.ones((4,), chunk_size=4)
            arr_add = mt.array(mock_data)
            result_tensor = arr + arr_add
            graph = result_tensor.build_graph(compose=False, tiled=True)

            modified_chunk = arr_add.chunks[0]
            arr_add.chunks[0]._op = TensorFetch(
                dtype=modified_chunk.dtype, _outputs=[weakref.ref(o) for o in modified_chunk.op.outputs],
                _key=modified_chunk.op.key)

            # test meta missing
            with self.run_actor_test(pool) as test_actor:
                graph_key = str(uuid.uuid4())
                execution_ref = test_actor.promise_ref(ExecutionActor.default_uid())
                execution_ref.execute_graph(session_id, graph_key, serialize_graph(graph),
                                            dict(chunks=[result_tensor.chunks[0].key]), None, _promise=True) \
                    .then(lambda *_: test_actor.set_result(None)) \
                    .catch(lambda *exc: test_actor.set_result(exc, False))

            with self.assertRaises(DependencyMissing):
                self.get_result()

            chunk_meta_client.set_chunk_meta(session_id, modified_chunk.key, size=mock_data.nbytes,
                                             shape=mock_data.shape, workers=('0.0.0.0:1234', pool_address))
            write_spill_file(modified_chunk.key, mock_data)

            # test read from spilled file
            with self.run_actor_test(pool) as test_actor:
                def _validate(_):
                    data = test_actor._chunk_store.get(session_id, result_tensor.chunks[0].key)
                    assert_array_equal(data, mock_data + np.ones((4,)))

                graph_key = str(uuid.uuid4())
                execution_ref = test_actor.promise_ref(ExecutionActor.default_uid())
                execution_ref.execute_graph(session_id, graph_key, serialize_graph(graph),
                                            dict(chunks=[result_tensor.chunks[0].key]), None, _promise=True) \
                    .then(_validate) \
                    .then(lambda *_: test_actor.set_result(None)) \
                    .catch(lambda *exc: test_actor.set_result(exc, False))

            self.get_result()
Exemple #3
0
    def testReceiver(self):
        pool_addr = 'localhost:%d' % get_next_port()
        options.worker.spill_directory = os.path.join(
            tempfile.gettempdir(), 'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        serialized_mock_data = dataserializer.dumps(mock_data)
        serialized_crc32 = zlib.crc32(serialized_mock_data)

        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())

        with start_transfer_test_pool(address=pool_addr, plasma_size=self.plasma_storage_size) as pool:
            chunk_holder_ref = pool.actor_ref(ChunkHolderActor.default_name())
            mapper_ref = pool.actor_ref(PlasmaKeyMapActor.default_name())
            receiver_ref = pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

            store = PlasmaChunkStore(self._plasma_client, mapper_ref)

            # check_status on receiving and received
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.NOT_STARTED)

            write_spill_file(chunk_key1, mock_data)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)
            os.unlink(build_spill_file_name(chunk_key1))

            ref = store.put(session_id, chunk_key1, mock_data)
            data_size = store.get_actual_size(session_id, chunk_key1)
            chunk_holder_ref.register_chunk(session_id, chunk_key1)
            del ref
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)

            with self.run_actor_test(pool) as test_actor:
                receiver_ref_p = test_actor.promise_ref(receiver_ref)

                # cancel on an un-run / missing result will result in nothing
                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # start creating writer
                receiver_ref_p.create_data_writer(session_id, chunk_key1, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVED))

                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVING))

                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                                 ReceiveStatus.NOT_STARTED)

                # test checksum error on receive_data_part
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data, 0)

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                # test checksum error on finish_receive
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data, serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key2, 0)

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # test intermediate cancellation
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data[64:],
                                                 serialized_crc32)
                with self.assertRaises(ExecutionInterrupted):
                    self.get_result(5)

                # test transfer in memory
                receiver_ref_p.register_finish_callback(session_id, chunk_key3, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.receive_data_part(session_id, chunk_key3, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.receive_data_part(session_id, chunk_key3, serialized_mock_data[64:], serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key3, serialized_crc32)

                self.assertTupleEqual((), self.get_result(5))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVED))

                # test transfer in spill file
                def mocked_store_create(*_):
                    raise StoreFull

                with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                    # test receive aborted
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False))
                    self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                    receiver_ref_p.receive_data_part(session_id, chunk_key4, serialized_mock_data[:64],
                                                     zlib.crc32(serialized_mock_data[:64]))
                    receiver_ref_p.cancel_receive(session_id, chunk_key4)
                    with self.assertRaises(ExecutionInterrupted):
                        self.get_result(5)

                    # test receive into spill
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False))
                    self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                    receiver_ref_p.receive_data_part(session_id, chunk_key4, serialized_mock_data, serialized_crc32)
                    receiver_ref_p.finish_receive(session_id, chunk_key4, serialized_crc32)

                    self.assertTupleEqual((), self.get_result(5))

                # test intermediate error
                def mocked_store_create(*_):
                    raise SpillNotConfigured

                with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key5, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False),
                              lambda *s: test_actor.set_result(s, accept=False, destroy=False))

                    with self.assertRaises(SpillNotConfigured):
                        self.get_result(5)

                # test receive timeout
                receiver_ref_p.register_finish_callback(session_id, chunk_key6, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.create_data_writer(session_id, chunk_key6, data_size, test_actor,
                                                  timeout=2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))
                receiver_ref_p.receive_data_part(session_id, chunk_key6, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))

                with self.assertRaises(TimeoutError):
                    self.get_result(5)
Exemple #4
0
    def testSender(self):
        send_pool_addr = 'localhost:%d' % get_next_port()
        recv_pool_addr = 'localhost:%d' % get_next_port()
        recv_pool_addr2 = 'localhost:%d' % get_next_port()

        options.worker.spill_directory = os.path.join(
            tempfile.gettempdir(), 'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())

        @contextlib.contextmanager
        def start_send_recv_pool():
            with start_transfer_test_pool(
                    address=send_pool_addr, plasma_size=self.plasma_storage_size) as sp:
                sp.create_actor(SenderActor, uid=SenderActor.default_name())
                with start_transfer_test_pool(
                        address=recv_pool_addr, plasma_size=self.plasma_storage_size) as rp:
                    rp.create_actor(MockReceiverActor, uid=ReceiverActor.default_name())
                    yield sp, rp

        with start_send_recv_pool() as (send_pool, recv_pool):
            chunk_holder_ref = send_pool.actor_ref(ChunkHolderActor.default_name())
            sender_ref = send_pool.actor_ref(SenderActor.default_name())
            receiver_ref = recv_pool.actor_ref(ReceiverActor.default_name())

            sender_mapper_ref = send_pool.actor_ref(PlasmaKeyMapActor.default_name())
            store = PlasmaChunkStore(self._plasma_client, sender_mapper_ref)

            with self.run_actor_test(send_pool) as test_actor:
                # send when data missing
                sender_ref_p = test_actor.promise_ref(sender_ref)
                sender_ref_p.send_data(session_id, str(uuid.uuid4()), recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                with self.assertRaises(DependencyMissing):
                    self.get_result(5)

                # send data in spill
                write_spill_file(chunk_key1, mock_data)

                sender_ref_p.send_data(session_id, chunk_key1, recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                self.get_result(5)
                assert_array_equal(mock_data, receiver_ref.get_result_data(session_id, chunk_key1))
                os.unlink(build_spill_file_name(chunk_key1))

                # send data in plasma store
                store.put(session_id, chunk_key1, mock_data)
                chunk_holder_ref.register_chunk(session_id, chunk_key1)

                sender_ref_p.send_data(session_id, chunk_key1, recv_pool_addr, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                self.get_result(5)
                assert_array_equal(mock_data, receiver_ref.get_result_data(session_id, chunk_key1))

                # send data to multiple targets
                with start_transfer_test_pool(
                        address=recv_pool_addr2, plasma_size=self.plasma_storage_size) as rp2:
                    recv_ref2 = rp2.create_actor(MockReceiverActor, uid=ReceiverActor.default_name())

                    sender_ref_p.send_data(session_id, chunk_key1,
                                           [recv_pool_addr, recv_pool_addr2], _promise=True)
                    # send data to already transferred / transferring
                    sender_ref_p.send_data(session_id, chunk_key1,
                                           [recv_pool_addr, recv_pool_addr2], _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                    self.get_result(5)
                    assert_array_equal(mock_data, recv_ref2.get_result_data(session_id, chunk_key1))

                # send data to non-exist endpoint which causes error
                store.put(session_id, chunk_key2, mock_data)
                chunk_holder_ref.register_chunk(session_id, chunk_key2)

                sender_ref_p.send_data(session_id, chunk_key2, recv_pool_addr2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                with self.assertRaises(BrokenPipeError):
                    self.get_result(5)

                def mocked_receive_data_part(*_):
                    raise ChecksumMismatch

                with patch_method(MockReceiverActor.receive_data_part, new=mocked_receive_data_part):
                    sender_ref_p.send_data(session_id, chunk_key2, recv_pool_addr, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
                    with self.assertRaises(ChecksumMismatch):
                        self.get_result(5)