Exemple #1
0
    def testAssignerActor(self):
        """Verify the worker chosen by ``AssignerActor`` for a three-input operand.

        Two mock workers with identical resources are registered. Two of the
        operand's input chunks are recorded on the first worker and one on the
        second; the test asserts that the first entry returned by
        ``get_worker_assignments`` is the first worker.
        """
        scheduler_address = '127.0.0.1:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent',
                               address=scheduler_address) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resources = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            chunk_metas = pool.create_actor(
                ChunkMetaActor, uid=ChunkMetaActor.default_name())

            majority_worker = 'localhost:12345'
            minority_worker = 'localhost:23456'
            worker_resource = {'hardware': {'cpu': 4, 'memory': 4096}}

            def register_workers():
                for endpoint in (majority_worker, minority_worker):
                    resources.set_worker_meta(endpoint, worker_resource)

            # the registration runs through gevent.spawn and is joined before
            # the assigner actor is created
            gevent.spawn(register_workers).join()

            assigner = pool.create_actor(
                AssignerActor, uid=AssignerActor.default_name())

            session_id = str(uuid.uuid4())
            input_keys = [str(uuid.uuid4()) for _ in range(3)]

            op_info = dict(
                op_name='test_op',
                io_meta={'input_chunks': list(input_keys)},
                output_size=512,
                retries=0,
                optimize=dict(
                    depth=0,
                    demand_depths=(),
                    successor_size=1,
                    descendant_size=0,
                ),
            )

            # first two inputs live on the first worker, the last on the second
            holders = (majority_worker, majority_worker, minority_worker)
            for chunk_key, holder in zip(input_keys, holders):
                chunk_metas.set_chunk_meta(session_id, chunk_key,
                                           size=512, workers=(holder,))

            assigned = assigner.get_worker_assignments(session_id, op_info)
            self.assertEqual(assigned[0], majority_worker)
Exemple #2
0
    def _prepare_test_graph(self, session_id, graph_key, mock_workers):
        """Yield ``(pool, graph_ref)`` for a small prepared two-output graph.

        The graph is the sum of two random 100-element tensors split in two.
        Inside a fresh actor pool the scheduler actors and a ``GraphActor``
        are created, every address in *mock_workers* is registered with
        ``cpu_total=4``, and the graph is prepared, analyzed and its operand
        actors created with ``_start=False`` before yielding.

        This is a generator; it is presumably wrapped as a context manager by
        a decorator outside this view -- TODO confirm.
        """
        pool_address = '127.0.0.1:%d' % get_next_port()
        lhs = mt.random.random((100,))
        rhs = mt.random.random((100,))
        first_half, second_half = mt.split(lhs + rhs, 2)

        graph = DAG()
        for part in (first_half, second_half):
            part.build_graph(graph=graph, compose=False)

        with create_actor_pool(n_process=1, backend='gevent',
                               address=pool_address) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resources = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
            graph_ref = pool.create_actor(
                GraphActor, session_id, graph_key, serialize_graph(graph),
                uid=GraphActor.gen_name(session_id, graph_key))

            for worker in mock_workers:
                resources.set_worker_meta(worker, {'hardware': {'cpu_total': 4}})

            graph_ref.prepare_graph()
            graph_ref.analyze_graph()
            graph_ref.create_operand_actors(_start=False)

            yield pool, graph_ref
Exemple #3
0
    def testErrorOnPrepare(self, *_):
        """Check the final graph state when ``execute_graph`` fails or is cancelled.

        Three scenarios run against separate ``GraphActor`` instances that
        share one serialized graph and one session:

        1. ``GraphActor.create_operand_actors`` raises ``RuntimeError``;
           the state is expected to be ``FAILED``.
        2. ``GraphActor.create_operand_actors`` sets the graph meta state to
           ``CANCELLING``; the state is expected to be ``CANCELLED``.
        3. ``GraphAnalyzer.calc_operand_assignments`` sets the graph meta
           state to ``CANCELLING``; the state is expected to be ``CANCELLED``.

        Extra positional arguments are ignored (presumably mocks injected by
        patch decorators outside this view -- TODO confirm).
        """
        session_id = str(uuid.uuid4())

        addr = '127.0.0.1:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            pool.create_actor(AssignerActor, uid=AssignerActor.default_name())

            # register two mock workers
            resource_ref.set_worker_meta('localhost:12345', dict(hardware=dict(cpu_total=4)))
            resource_ref.set_worker_meta('localhost:23456', dict(hardware=dict(cpu_total=4)))

            # error occurred in create_operand_actors
            graph_key = str(uuid.uuid4())
            expr = mt.random.random((8, 2), chunk_size=2) + 1
            graph = expr.build_graph(compose=False)
            # the same serialized graph is reused by all three scenarios
            serialized_graph = serialize_graph(graph)

            graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                          uid=GraphActor.gen_name(session_id, graph_key))

            def _mock_raises(*_, **__):
                raise RuntimeError

            with patch_method(GraphActor.create_operand_actors, new=_mock_raises):
                with self.assertRaises(RuntimeError):
                    graph_ref.execute_graph()
            self.assertEqual(graph_ref.get_state(), GraphState.FAILED)
            graph_ref.destroy()

            # interrupted during create_operand_actors
            graph_key = str(uuid.uuid4())
            graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                          uid=GraphActor.gen_name(session_id, graph_key))

            def _mock_cancels(*_, **__):
                # graph_key is looked up when the mock is called, i.e. it is
                # the key assigned just before this definition
                graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
                graph_meta_ref.set_state(GraphState.CANCELLING)

            with patch_method(GraphActor.create_operand_actors, new=_mock_cancels):
                graph_ref.execute_graph()
            self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)

            # interrupted during previous steps
            graph_key = str(uuid.uuid4())
            graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                          uid=GraphActor.gen_name(session_id, graph_key))

            # redefinition: this variant also returns an empty dict in place
            # of the result of calc_operand_assignments
            def _mock_cancels(*_, **__):
                graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
                graph_meta_ref.set_state(GraphState.CANCELLING)
                return dict()

            with patch_method(GraphAnalyzer.calc_operand_assignments, new=_mock_cancels):
                graph_ref.execute_graph()
            self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)
Exemple #4
0
    def _run_operand_case(session_id, graph_key, tensor, execution_creator):
        """Run *tensor*'s graph in a fresh actor pool until a terminal state.

        The scheduler actors and a ``GraphActor`` are created, the mocked
        ``OperandActor._get_raw_execution_ref`` is pointed at a callback that
        returns one ``execution_creator(pool)`` result per distinct address,
        and two mock workers are registered. The graph is then prepared,
        analyzed and its operand actors created. The graph meta state is
        polled every 0.1 seconds until it is ``SUCCEEDED``, ``FAILED`` or
        ``CANCELLED``.

        Raises:
            SystemError: if no terminal state is seen within 30 seconds.
        """
        graph = tensor.build_graph(compose=False)

        with create_actor_pool(n_process=1, backend='gevent') as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resources = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
            graph_ref = pool.create_actor(
                GraphActor, session_id, graph_key, serialize_graph(graph),
                uid=GraphActor.gen_name(session_id, graph_key))
            refs_by_address = {}

            def _execution_ref_for(uid=None, address=None):
                # one execution ref per address, created on first request
                if address not in refs_by_address:
                    refs_by_address[address] = execution_creator(pool)
                return refs_by_address[address]

            # handle mock objects
            OperandActor._get_raw_execution_ref.side_effect = _execution_ref_for

            worker_resource = {
                'hardware': {'cpu': 4, 'cpu_total': 4, 'memory': 512},
            }
            for endpoint in ('localhost:12345', 'localhost:23456'):
                resources.set_worker_meta(endpoint, worker_resource)

            graph_ref.prepare_graph()
            chunk_graph = graph_ref.get_chunk_graph()

            graph_ref.analyze_graph()

            # keys of operands whose chunks have no successors
            final_keys = {
                chunk.op.key for chunk in chunk_graph
                if chunk_graph.count_successors(chunk) == 0
            }

            graph_ref.create_operand_actors()

            graph_meta_ref = pool.actor_ref(
                GraphMetaActor.gen_name(session_id, graph_key))
            terminal_states = (GraphState.SUCCEEDED,
                               GraphState.FAILED,
                               GraphState.CANCELLED)
            start_time = time.time()
            while True:
                pool.sleep(0.1)
                if time.time() - start_time > 30:
                    raise SystemError('Wait for execution finish timeout')
                if graph_meta_ref.get_state() in terminal_states:
                    break
Exemple #5
0
    def testFailoverMessage(self):
        """Check dead-worker detection, graph notification and re-admission.

        A worker whose ``update_time`` is older than
        ``options.scheduler.status_timeout`` is registered, then
        ``detect_dead_workers`` is triggered. The test asserts that the
        (mocked) graph actor was told about the removed worker and its lost
        chunk, that the worker cannot be re-registered shortly afterwards,
        and that it can be registered again after a further wait.

        ``options.scheduler.worker_blacklist_time`` is lowered to 0.5 for the
        test and restored afterwards so that the global option does not leak
        into other tests.
        """
        mock_session_id = str(uuid.uuid4())
        mock_graph_key = str(uuid.uuid4())
        mock_chunk_key = str(uuid.uuid4())
        addr = '127.0.0.1:%d' % get_next_port()
        mock_worker_addr = '127.0.0.1:54132'

        # remember the global setting so it can be put back in ``finally``
        old_blacklist_time = options.scheduler.worker_blacklist_time
        options.scheduler.worker_blacklist_time = 0.5

        try:
            with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
                pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
                session_manager_ref = pool.create_actor(
                    SessionManagerActor, uid=SessionManagerActor.default_name())
                resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
                chunk_meta_ref = pool.create_actor(
                    ChunkMetaActor, uid=ChunkMetaActor.default_name())

                session_ref = pool.actor_ref(session_manager_ref.create_session(mock_session_id))
                # the only copy of the chunk is recorded on the mock worker
                chunk_meta_ref.set_chunk_meta(mock_session_id, mock_chunk_key,
                                              size=80, shape=(10,), workers=(mock_worker_addr,))

                with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__, new=MockGraphActor):
                    session_ref.submit_tensor_graph(None, mock_graph_key)
                    graph_ref = pool.actor_ref(GraphActor.gen_name(mock_session_id, mock_graph_key))

                    # register the worker with an update time already past the timeout
                    expire_time = time.time() - options.scheduler.status_timeout - 1
                    resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=expire_time))

                    resource_ref.detect_dead_workers(_tell=True)
                    pool.sleep(0.2)

                    _, removes, lost_chunks = graph_ref.get_worker_change_args()
                    self.assertListEqual(removes, [mock_worker_addr])
                    self.assertListEqual(lost_chunks, [mock_chunk_key])

                    # about 0.2s after detection: a fresh heartbeat is expected
                    # to be ignored (presumably the worker is still blacklisted)
                    self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())
                    resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
                    self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())

                    # after another 0.4s (more than 0.5s in total) the worker
                    # is expected to be accepted again
                    pool.sleep(0.4)
                    resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
                    self.assertIn(mock_worker_addr, resource_ref.get_workers_meta())
        finally:
            options.scheduler.worker_blacklist_time = old_blacklist_time
Exemple #6
0
            def execute_case():
                """Run the graph, request a stop after 0.8s, wait for a terminal state.

                Uses ``pool``, ``session_id``, ``graph_key`` and ``graph``
                from the enclosing scope (not visible in this excerpt).

                Raises ``SystemError`` if the graph meta state is not
                ``SUCCEEDED``, ``FAILED`` or ``CANCELLED`` within 30 seconds.
                """
                pool.create_actor(ClusterInfoActor,
                                  [pool.cluster_info.address],
                                  uid=ClusterInfoActor.default_name())
                resource_ref = pool.create_actor(
                    ResourceActor, uid=ResourceActor.default_name())
                pool.create_actor(ChunkMetaActor,
                                  uid=ChunkMetaActor.default_name())
                pool.create_actor(AssignerActor,
                                  uid=AssignerActor.default_name())
                graph_ref = pool.create_actor(GraphActor,
                                              session_id,
                                              graph_key,
                                              serialize_graph(graph),
                                              uid=GraphActor.gen_name(
                                                  session_id, graph_key))
                # cache of fake execution actor refs, keyed by address
                addr_dict = dict()

                def _build_mock_ref(uid=None, address=None):
                    # return the cached ref for this address, creating a
                    # FakeExecutionActor on first use
                    if address in addr_dict:
                        return addr_dict[address]
                    else:
                        r = addr_dict[address] = pool.create_actor(
                            FakeExecutionActor, sleep=1)
                        return r

                # handle mock objects
                OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

                mock_resource = dict(
                    hardware=dict(cpu=4, cpu_total=4, memory=512))

                def write_mock_meta():
                    # register two mock workers sharing the same resource dict
                    resource_ref.set_worker_meta('localhost:12345',
                                                 mock_resource)
                    resource_ref.set_worker_meta('localhost:23456',
                                                 mock_resource)

                # run the registration via gevent.spawn and wait for it
                v = gevent.spawn(write_mock_meta)
                v.join()

                graph_ref.prepare_graph()
                fetched_graph = graph_ref.get_chunk_graph()

                graph_ref.scan_node()
                graph_ref.place_initial_chunks()

                # keys of operands whose chunks have no successors
                # NOTE(review): final_keys is not used in the visible code
                final_keys = set()
                for c in fetched_graph:
                    if fetched_graph.count_successors(c) == 0:
                        final_keys.add(c.op.key)

                graph_ref.create_operand_actors()
                graph_meta_ref = pool.actor_ref(
                    GraphMetaActor.gen_name(session_id, graph_key))
                start_time = time.time()
                cancel_called = False
                # poll every 0.1s; stop_graph is sent exactly once, after
                # more than 0.8s have elapsed
                while True:
                    gevent.sleep(0.1)
                    if not cancel_called and time.time() > start_time + 0.8:
                        cancel_called = True
                        graph_ref.stop_graph(_tell=True)
                    if time.time() - start_time > 30:
                        raise SystemError('Wait for execution finish timeout')
                    if graph_meta_ref.get_state() in (GraphState.SUCCEEDED,
                                                      GraphState.FAILED,
                                                      GraphState.CANCELLED):
                        break
Exemple #7
0
    def testOperandActorWithCancel(self, *_):
        """Check that a graph stopped during execution ends up ``CANCELLED``.

        The sum of two random integer tensors is submitted to a
        ``GraphActor`` backed by ``FakeExecutionActor`` instances and twenty
        mock workers. ``stop_graph`` is sent once, after more than 0.3
        seconds of polling, and the final graph meta state is asserted to be
        ``CANCELLED``.

        Extra positional arguments are ignored (presumably mocks injected by
        patch decorators outside this view -- TODO confirm).

        Raises:
            SystemError: if no terminal state is seen within 30 seconds.
        """
        import logging
        logging.basicConfig(level=logging.DEBUG)

        lhs = mt.random.randint(10, size=(10, 8), chunk_size=4)
        rhs = mt.random.randint(10, size=(10, 8), chunk_size=4)
        total = lhs + rhs

        session_id = str(uuid.uuid4())
        graph_key = str(uuid.uuid4())

        graph = total.build_graph(compose=False)

        with create_actor_pool(n_process=1, backend='gevent') as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resources = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
            graph_ref = pool.create_actor(
                GraphActor, session_id, graph_key, serialize_graph(graph),
                uid=GraphActor.gen_name(session_id, graph_key))
            refs_by_address = {}

            def _execution_ref_for(uid=None, address=None):
                # one fake execution actor per address, created on first use
                if address not in refs_by_address:
                    refs_by_address[address] = pool.create_actor(
                        FakeExecutionActor, exec_delay=0.2)
                return refs_by_address[address]

            # handle mock objects
            OperandActor._get_raw_execution_ref.side_effect = _execution_ref_for

            worker_resource = {
                'hardware': {'cpu': 4, 'cpu_total': 4, 'memory': 512},
            }

            # twenty mock workers on consecutive ports starting at 12345
            for port in range(12345, 12345 + 20):
                resources.set_worker_meta('localhost:%d' % port, worker_resource)

            graph_ref.prepare_graph(compose=False)
            chunk_graph = graph_ref.get_chunk_graph()

            graph_ref.analyze_graph()

            # keys of operands whose chunks have no successors
            final_keys = {
                chunk.op.key for chunk in chunk_graph
                if chunk_graph.count_successors(chunk) == 0
            }

            graph_ref.create_operand_actors()
            graph_meta_ref = pool.actor_ref(
                GraphMetaActor.gen_name(session_id, graph_key))
            terminal_states = (GraphState.SUCCEEDED,
                               GraphState.FAILED,
                               GraphState.CANCELLED)
            start_time = time.time()
            stop_sent = False
            while True:
                pool.sleep(0.05)
                if not stop_sent and time.time() > start_time + 0.3:
                    stop_sent = True
                    graph_ref.stop_graph(_tell=True)
                if time.time() - start_time > 30:
                    raise SystemError('Wait for execution finish timeout')
                if graph_meta_ref.get_state() in terminal_states:
                    break
            self.assertEqual(graph_meta_ref.get_state(), GraphState.CANCELLED)
Exemple #8
0
    def prepare_graph_in_pool(self, expr, clean_io_meta=True, compose=False):
        """Prepare *expr* in a fresh actor pool, verify it, and yield the actors.

        Steps, each followed by assertions:

        * prepare the graph and compare its size with a locally tiled graph;
        * analyze without placement and check that every operand has
          ``depth``, ``successor_size`` and ``descendant_size``;
        * register two mock workers, analyze again and check that every
          chunk without predecessors has a ``target_worker``;
        * create operand actors and, when *clean_io_meta* is false, compare
          each operand's ``op_name`` and ``io_meta`` with values derived from
          the fetched chunk graph.

        Finally ``(pool, graph_ref)`` is yielded. This is a generator; it is
        presumably wrapped as a context manager by a decorator outside this
        view -- TODO confirm.
        """
        session_id = str(uuid.uuid4())
        graph_key = str(uuid.uuid4())

        serialized_graph = serialize_graph(expr.build_graph(compose=compose))
        tiled_graph = expr.build_graph(compose=compose, tiled=True)

        pool_address = '127.0.0.1:%d' % get_next_port()
        with create_actor_pool(n_process=1, backend='gevent',
                               address=pool_address) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            resources = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
            graph_ref = pool.create_actor(
                GraphActor, session_id, graph_key, serialized_graph,
                uid=GraphActor.gen_name(session_id, graph_key))

            graph_ref.prepare_graph(compose=compose)
            chunk_graph = graph_ref.get_chunk_graph()
            self.assertIsNotNone(chunk_graph)
            self.assertEqual(len(tiled_graph), len(chunk_graph))

            # analysis without placement must still fill the optimize fields
            graph_ref.analyze_graph(do_placement=False)
            op_infos = graph_ref.get_operand_info()
            for node in chunk_graph:
                for field in ('depth', 'successor_size', 'descendant_size'):
                    self.assertIsNotNone(op_infos[node.op.key]['optimize'][field])

            for endpoint in ('localhost:12345', 'localhost:23456'):
                resources.set_worker_meta(endpoint, {'hardware': {'cpu_total': 4}})

            graph_ref.analyze_graph()
            op_infos = graph_ref.get_operand_info()

            # only chunks without predecessors are checked for a target worker
            for node in chunk_graph:
                if chunk_graph.count_predecessors(node) == 0:
                    self.assertIsNotNone(op_infos[node.op.key]['target_worker'])

            graph_ref.create_operand_actors(_clean_io_meta=clean_io_meta)
            op_infos = graph_ref.get_operand_info()

            if not clean_io_meta:
                # expected io_meta per operand key, rebuilt from the chunk graph
                expected_metas = {}
                for node in chunk_graph:
                    op_key = node.op.key
                    if op_key not in expected_metas:
                        expected_metas[op_key] = {
                            'predecessors': set(),
                            'successors': set(),
                            'input_chunks': set(),
                            'chunks': set(),
                        }
                    expected = expected_metas[op_key]
                    expected['predecessors'].update(
                        pred.op.key for pred in chunk_graph.iter_predecessors(node))
                    expected['successors'].update(
                        succ.op.key for succ in chunk_graph.iter_successors(node))
                    expected['input_chunks'].update(
                        pred.key for pred in chunk_graph.iter_predecessors(node))
                    expected['chunks'].update(out.key for out in node.op.outputs)

                compared_fields = ('predecessors', 'successors',
                                   'input_chunks', 'chunks')
                for node in chunk_graph:
                    op_info = op_infos[node.op.key]
                    self.assertEqual(op_info['op_name'], type(node.op).__name__)

                    actual = op_info['io_meta']
                    expected = expected_metas[node.op.key]
                    for field in compared_fields:
                        self.assertSetEqual(set(actual[field]),
                                            set(expected[field]))

            yield pool, graph_ref