def testAssignerActor(self):
    """Check the worker order returned by ``AssignerActor.get_worker_assignments``.

    Two mock workers are registered with identical resources.  Three input
    chunks of 512 bytes each are recorded in the chunk meta actor: two on
    the first endpoint and one on the second.  The test then asks the
    assigner for assignments of an operand consuming all three chunks and
    expects the first endpoint to be listed first.
    """
    scheduler_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent',
                           address=scheduler_address) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        chunk_meta_ref = pool.create_actor(
            ChunkMetaActor, uid=ChunkMetaActor.default_name())

        endpoint1, endpoint2 = 'localhost:12345', 'localhost:23456'
        worker_resource = {'hardware': {'cpu': 4, 'memory': 4096}}

        def write_mock_meta():
            for endpoint in (endpoint1, endpoint2):
                resource_ref.set_worker_meta(endpoint, worker_resource)

        # Register the mock workers from a separate greenlet and wait for it.
        gevent.spawn(write_mock_meta).join()

        assigner_ref = pool.create_actor(
            AssignerActor, uid=AssignerActor.default_name())

        session_id = str(uuid.uuid4())
        input_keys = [str(uuid.uuid4()) for _ in range(3)]

        op_info = {
            'op_name': 'test_op',
            'io_meta': {'input_chunks': list(input_keys)},
            'output_size': 512,
            'retries': 0,
            'optimize': {
                'depth': 0,
                'demand_depths': (),
                'successor_size': 1,
                'descendant_size': 0,
            },
        }

        # First two chunks live on endpoint1, the last one on endpoint2.
        holders = (endpoint1, endpoint1, endpoint2)
        for chunk_key, holder in zip(input_keys, holders):
            chunk_meta_ref.set_chunk_meta(session_id, chunk_key, size=512,
                                          workers=(holder, ))

        workers = assigner_ref.get_worker_assignments(session_id, op_info)
        self.assertEqual(workers[0], endpoint1)
def _prepare_test_graph(self, session_id, graph_key, mock_workers):
    """Build a small tensor graph and yield a pool with a prepared GraphActor.

    The graph is the sum of two random 100-element tensors split in two.
    Inside a fresh single-process gevent actor pool the scheduler actors are
    created, every address in ``mock_workers`` is registered with
    ``cpu_total=4``, and the graph is prepared, analyzed and its operand
    actors created with ``_start=False``.

    Yields:
        ``(pool, graph_ref)`` while the pool context is still open.

    NOTE(review): this is a generator; presumably it is wrapped by a
    context-manager decorator that is not visible here -- confirm.
    """
    pool_address = '127.0.0.1:%d' % get_next_port()

    lhs = mt.random.random((100,))
    rhs = mt.random.random((100,))
    halves = mt.split(lhs + rhs, 2)

    graph = DAG()
    for half in halves:
        half.build_graph(graph=graph, compose=False)

    with create_actor_pool(n_process=1, backend='gevent',
                           address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())

        graph_uid = GraphActor.gen_name(session_id, graph_key)
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key,
                                      serialize_graph(graph), uid=graph_uid)

        for worker in mock_workers:
            resource_ref.set_worker_meta(
                worker, {'hardware': {'cpu_total': 4}})

        graph_ref.prepare_graph()
        graph_ref.analyze_graph()
        graph_ref.create_operand_actors(_start=False)

        yield pool, graph_ref
def testErrorOnPrepare(self, *_):
    """Check the graph state after failures and cancellations in ``execute_graph``.

    Three scenarios run against the same serialized graph, each with a fresh
    graph key and GraphActor:

    1. ``GraphActor.create_operand_actors`` raises ``RuntimeError``: the
       error propagates and the graph state becomes ``FAILED``.
    2. ``GraphActor.create_operand_actors`` sets the graph meta state to
       ``CANCELLING``: the graph ends up ``CANCELLED``.
    3. ``GraphAnalyzer.calc_operand_assignments`` sets the state to
       ``CANCELLING`` and returns an empty dict: the graph ends up
       ``CANCELLED``.
    """
    session_id = str(uuid.uuid4())
    pool_address = '127.0.0.1:%d' % get_next_port()

    with create_actor_pool(n_process=1, backend='gevent',
                           address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())

        for worker in ('localhost:12345', 'localhost:23456'):
            resource_ref.set_worker_meta(
                worker, {'hardware': {'cpu_total': 4}})

        expr = mt.random.random((8, 2), chunk_size=2) + 1
        serialized_graph = serialize_graph(expr.build_graph(compose=False))

        def _new_graph_actor():
            # Every scenario gets its own graph key and GraphActor.
            key = str(uuid.uuid4())
            ref = pool.create_actor(
                GraphActor, session_id, key, serialized_graph,
                uid=GraphActor.gen_name(session_id, key))
            return key, ref

        def _request_cancel(key):
            meta_ref = pool.actor_ref(
                GraphMetaActor.gen_name(session_id, key))
            meta_ref.set_state(GraphState.CANCELLING)

        # error occurred in create_operand_actors
        graph_key, graph_ref = _new_graph_actor()

        def _mock_raises(*_, **__):
            raise RuntimeError

        with patch_method(GraphActor.create_operand_actors,
                          new=_mock_raises):
            with self.assertRaises(RuntimeError):
                graph_ref.execute_graph()
        self.assertEqual(graph_ref.get_state(), GraphState.FAILED)
        graph_ref.destroy()

        # interrupted during create_operand_actors
        graph_key, graph_ref = _new_graph_actor()

        def _cancel_on_create(*_, **__):
            _request_cancel(graph_key)

        with patch_method(GraphActor.create_operand_actors,
                          new=_cancel_on_create):
            graph_ref.execute_graph()
        self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)

        # interrupted during previous steps
        graph_key, graph_ref = _new_graph_actor()

        def _cancel_on_assign(*_, **__):
            _request_cancel(graph_key)
            return dict()

        with patch_method(GraphAnalyzer.calc_operand_assignments,
                          new=_cancel_on_assign):
            graph_ref.execute_graph()
        self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)
def _run_operand_case(session_id, graph_key, tensor, execution_creator):
    """Run ``tensor``'s graph in a fresh actor pool until it reaches a final state.

    Args:
        session_id: session id passed to the GraphActor.
        graph_key: graph key passed to the GraphActor.
        tensor: object whose ``build_graph(compose=False)`` result is
            serialized and submitted.
        execution_creator: callable invoked as ``execution_creator(pool)``
            the first time an address is requested; its result is cached
            per address and returned from the mocked
            ``OperandActor._get_raw_execution_ref``.

    Raises:
        SystemError: if the graph has not reached ``SUCCEEDED``, ``FAILED``
            or ``CANCELLED`` within 30 seconds.

    NOTE(review): assigning ``side_effect`` assumes
    ``OperandActor._get_raw_execution_ref`` has already been replaced by a
    mock outside this function -- confirm against the caller.
    """
    graph = tensor.build_graph(compose=False)
    terminal_states = (GraphState.SUCCEEDED, GraphState.FAILED,
                       GraphState.CANCELLED)

    with create_actor_pool(n_process=1, backend='gevent') as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(
            GraphActor, session_id, graph_key, serialize_graph(graph),
            uid=GraphActor.gen_name(session_id, graph_key))

        execution_refs = {}

        def _build_mock_ref(uid=None, address=None):
            # One execution ref per worker address, created lazily.
            if address not in execution_refs:
                execution_refs[address] = execution_creator(pool)
            return execution_refs[address]

        # handle mock objects
        OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

        mock_resource = {'hardware': {'cpu': 4, 'cpu_total': 4, 'memory': 512}}
        for worker in ('localhost:12345', 'localhost:23456'):
            resource_ref.set_worker_meta(worker, mock_resource)

        graph_ref.prepare_graph()
        fetched_graph = graph_ref.get_chunk_graph()
        graph_ref.analyze_graph()

        # Keys of operands whose chunks have no successors in the chunk graph.
        final_keys = {
            chunk.op.key for chunk in fetched_graph
            if fetched_graph.count_successors(chunk) == 0
        }

        graph_ref.create_operand_actors()

        graph_meta_ref = pool.actor_ref(
            GraphMetaActor.gen_name(session_id, graph_key))
        start_time = time.time()
        while True:
            pool.sleep(0.1)
            if time.time() - start_time > 30:
                raise SystemError('Wait for execution finish timeout')
            if graph_meta_ref.get_state() in terminal_states:
                break
def testFailoverMessage(self):
    """Check that a dead worker is reported to the graph and temporarily blacklisted.

    A chunk is recorded as living on a mock worker, then the worker's
    ``update_time`` is set older than ``options.scheduler.status_timeout``
    and dead-worker detection is triggered.  The test verifies that:

    * the (mocked) graph actor receives the worker in its removal list and
      the chunk in its lost-chunk list;
    * the worker is gone from the resource actor's worker meta;
    * re-registering the worker right away is ignored, while re-registering
      after a further 0.4 s sleep succeeds (presumably because the 0.5 s
      ``worker_blacklist_time`` has elapsed by then).

    ``options.scheduler.worker_blacklist_time`` is a process-wide option, so
    its previous value is restored when the test finishes, whether it passes
    or fails, to avoid leaking the shortened blacklist time into other tests.
    """
    mock_session_id = str(uuid.uuid4())
    mock_graph_key = str(uuid.uuid4())
    mock_chunk_key = str(uuid.uuid4())
    addr = '127.0.0.1:%d' % get_next_port()
    mock_worker_addr = '127.0.0.1:54132'

    # Remember the global setting so it can be put back in ``finally``.
    previous_blacklist_time = options.scheduler.worker_blacklist_time
    options.scheduler.worker_blacklist_time = 0.5

    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=addr) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(AssignerActor,
                              uid=AssignerActor.default_name())
            session_manager_ref = pool.create_actor(
                SessionManagerActor, uid=SessionManagerActor.default_name())
            resource_ref = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            chunk_meta_ref = pool.create_actor(
                ChunkMetaActor, uid=ChunkMetaActor.default_name())

            session_ref = pool.actor_ref(
                session_manager_ref.create_session(mock_session_id))
            chunk_meta_ref.set_chunk_meta(
                mock_session_id, mock_chunk_key, size=80, shape=(10,),
                workers=(mock_worker_addr,))

            # Replace GraphActor in its defining module with the mock.
            with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__,
                            new=MockGraphActor):
                session_ref.submit_tensor_graph(None, mock_graph_key)
                graph_ref = pool.actor_ref(
                    GraphActor.gen_name(mock_session_id, mock_graph_key))

                # Make the worker look stale, then trigger dead-worker detection.
                expire_time = time.time() - options.scheduler.status_timeout - 1
                resource_ref.set_worker_meta(
                    mock_worker_addr, dict(update_time=expire_time))

                resource_ref.detect_dead_workers(_tell=True)
                pool.sleep(0.2)

                _, removes, lost_chunks = graph_ref.get_worker_change_args()
                self.assertListEqual(removes, [mock_worker_addr])
                self.assertListEqual(lost_chunks, [mock_chunk_key])

                # Immediate re-registration must be ignored.
                self.assertNotIn(mock_worker_addr,
                                 resource_ref.get_workers_meta())
                resource_ref.set_worker_meta(
                    mock_worker_addr, dict(update_time=time.time()))
                self.assertNotIn(mock_worker_addr,
                                 resource_ref.get_workers_meta())

                # After waiting longer, the worker can register again.
                pool.sleep(0.4)
                resource_ref.set_worker_meta(
                    mock_worker_addr, dict(update_time=time.time()))
                self.assertIn(mock_worker_addr,
                              resource_ref.get_workers_meta())
    finally:
        options.scheduler.worker_blacklist_time = previous_blacklist_time
def execute_case():
    """Run the enclosing scope's graph with slow fake executors and stop it mid-run.

    Relies on ``pool``, ``session_id``, ``graph_key`` and ``graph`` from the
    enclosing scope.  Scheduler actors are created, two mock workers are
    registered from a separate greenlet, and the graph is prepared, scanned
    and placed before its operand actors are created.  Once more than
    0.8 seconds have passed, ``stop_graph`` is sent once; the loop then waits
    for a final graph state.

    Raises:
        SystemError: if no final state is reached within 30 seconds.
    """
    pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                      uid=ClusterInfoActor.default_name())
    resource_ref = pool.create_actor(
        ResourceActor, uid=ResourceActor.default_name())
    pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
    pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
    graph_ref = pool.create_actor(
        GraphActor, session_id, graph_key, serialize_graph(graph),
        uid=GraphActor.gen_name(session_id, graph_key))

    execution_refs = {}

    def _build_mock_ref(uid=None, address=None):
        # One fake execution actor per worker address, created lazily.
        if address not in execution_refs:
            execution_refs[address] = pool.create_actor(
                FakeExecutionActor, sleep=1)
        return execution_refs[address]

    # handle mock objects
    OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

    mock_resource = {'hardware': {'cpu': 4, 'cpu_total': 4, 'memory': 512}}

    def write_mock_meta():
        for worker in ('localhost:12345', 'localhost:23456'):
            resource_ref.set_worker_meta(worker, mock_resource)

    # Register the workers from a separate greenlet and wait for it.
    gevent.spawn(write_mock_meta).join()

    graph_ref.prepare_graph()
    fetched_graph = graph_ref.get_chunk_graph()

    graph_ref.scan_node()
    graph_ref.place_initial_chunks()

    # Keys of operands whose chunks have no successors in the chunk graph.
    final_keys = {
        chunk.op.key for chunk in fetched_graph
        if fetched_graph.count_successors(chunk) == 0
    }

    graph_ref.create_operand_actors()

    graph_meta_ref = pool.actor_ref(
        GraphMetaActor.gen_name(session_id, graph_key))
    terminal_states = (GraphState.SUCCEEDED, GraphState.FAILED,
                       GraphState.CANCELLED)
    start_time = time.time()
    cancel_called = False
    while True:
        gevent.sleep(0.1)
        # Send the stop request exactly once, after the graph has run a while.
        if not cancel_called and time.time() > start_time + 0.8:
            cancel_called = True
            graph_ref.stop_graph(_tell=True)
        if time.time() - start_time > 30:
            raise SystemError('Wait for execution finish timeout')
        if graph_meta_ref.get_state() in terminal_states:
            break
def testOperandActorWithCancel(self, *_):
    """Check that stopping a running graph leaves it in the ``CANCELLED`` state.

    The graph adds two random integer tensors of shape ``(10, 8)`` with
    ``chunk_size=4``.  Twenty mock workers are registered and execution is
    served by ``FakeExecutionActor`` instances created with
    ``exec_delay=0.2``.  Once more than 0.3 seconds have passed,
    ``stop_graph`` is sent once; the test then waits up to 30 seconds for a
    final state and asserts that it is ``CANCELLED``.
    """
    # NOTE(review): this configures logging at DEBUG level for the whole
    # process; it looks like debugging residue -- confirm before removing.
    import logging
    logging.basicConfig(level=logging.DEBUG)

    lhs = mt.random.randint(10, size=(10, 8), chunk_size=4)
    rhs = mt.random.randint(10, size=(10, 8), chunk_size=4)
    total = lhs + rhs

    session_id = str(uuid.uuid4())
    graph_key = str(uuid.uuid4())
    graph = total.build_graph(compose=False)

    terminal_states = (GraphState.SUCCEEDED, GraphState.FAILED,
                       GraphState.CANCELLED)

    with create_actor_pool(n_process=1, backend='gevent') as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(
            GraphActor, session_id, graph_key, serialize_graph(graph),
            uid=GraphActor.gen_name(session_id, graph_key))

        execution_refs = {}

        def _build_mock_ref(uid=None, address=None):
            # One fake execution actor per worker address, created lazily.
            if address not in execution_refs:
                execution_refs[address] = pool.create_actor(
                    FakeExecutionActor, exec_delay=0.2)
            return execution_refs[address]

        # handle mock objects
        OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

        mock_resource = {'hardware': {'cpu': 4, 'cpu_total': 4, 'memory': 512}}

        # Twenty workers on consecutive ports starting at 12345.
        for port in range(12345, 12345 + 20):
            resource_ref.set_worker_meta('localhost:%d' % port, mock_resource)

        graph_ref.prepare_graph(compose=False)
        fetched_graph = graph_ref.get_chunk_graph()

        graph_ref.analyze_graph()

        # Keys of operands whose chunks have no successors in the chunk graph.
        final_keys = {
            chunk.op.key for chunk in fetched_graph
            if fetched_graph.count_successors(chunk) == 0
        }

        graph_ref.create_operand_actors()

        graph_meta_ref = pool.actor_ref(
            GraphMetaActor.gen_name(session_id, graph_key))
        start_time = time.time()
        cancel_called = False
        while True:
            pool.sleep(0.05)
            # Send the stop request exactly once, shortly after start.
            if not cancel_called and time.time() > start_time + 0.3:
                cancel_called = True
                graph_ref.stop_graph(_tell=True)
            if time.time() - start_time > 30:
                raise SystemError('Wait for execution finish timeout')
            if graph_meta_ref.get_state() in terminal_states:
                break

        self.assertEqual(graph_meta_ref.get_state(), GraphState.CANCELLED)
def prepare_graph_in_pool(self, expr, clean_io_meta=True, compose=False):
    """Prepare ``expr``'s graph inside a fresh actor pool, checking each stage.

    Stages verified along the way:

    * after ``prepare_graph`` the fetched chunk graph is not ``None`` and
      has the same length as a locally tiled graph of ``expr``;
    * after ``analyze_graph(do_placement=False)`` every operand info has
      non-``None`` ``depth``, ``successor_size`` and ``descendant_size``;
    * after registering two workers and calling ``analyze_graph()``, every
      chunk without predecessors has a non-``None`` ``target_worker``;
    * after ``create_operand_actors``, and only when ``clean_io_meta`` is
      false, each operand's ``op_name`` and ``io_meta`` match values
      recomputed from the fetched chunk graph.

    Args:
        expr: object exposing ``build_graph``; the graph under test.
        clean_io_meta: forwarded as ``_clean_io_meta`` to
            ``create_operand_actors``; also disables the io-meta checks
            when true.
        compose: forwarded to ``build_graph`` and ``prepare_graph``.

    Yields:
        ``(pool, graph_ref)`` while the pool context is still open.

    NOTE(review): this is a generator; presumably it is wrapped by a
    context-manager decorator that is not visible here -- confirm.
    """
    session_id = str(uuid.uuid4())
    graph_key = str(uuid.uuid4())

    serialized_graph = serialize_graph(expr.build_graph(compose=compose))
    chunked_graph = expr.build_graph(compose=compose, tiled=True)

    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent',
                           address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(
            GraphActor, session_id, graph_key, serialized_graph,
            uid=GraphActor.gen_name(session_id, graph_key))

        graph_ref.prepare_graph(compose=compose)
        fetched_graph = graph_ref.get_chunk_graph()
        self.assertIsNotNone(fetched_graph)
        self.assertEqual(len(chunked_graph), len(fetched_graph))

        # Analysis without placement: only the optimize metrics are checked.
        graph_ref.analyze_graph(do_placement=False)
        op_infos = graph_ref.get_operand_info()
        for node in fetched_graph:
            for metric in ('depth', 'successor_size', 'descendant_size'):
                self.assertIsNotNone(
                    op_infos[node.op.key]['optimize'][metric])

        for worker in ('localhost:12345', 'localhost:23456'):
            resource_ref.set_worker_meta(
                worker, {'hardware': {'cpu_total': 4}})

        # Full analysis: chunks without predecessors must get a target worker.
        graph_ref.analyze_graph()
        op_infos = graph_ref.get_operand_info()
        for node in fetched_graph:
            if fetched_graph.count_predecessors(node) == 0:
                self.assertIsNotNone(op_infos[node.op.key]['target_worker'])

        graph_ref.create_operand_actors(_clean_io_meta=clean_io_meta)
        op_infos = graph_ref.get_operand_info()

        if not clean_io_meta:
            # Recompute the expected io meta per operand key from the graph.
            orig_metas = {}
            for node in fetched_graph:
                op_key = node.op.key
                if op_key not in orig_metas:
                    orig_metas[op_key] = {
                        'predecessors': set(),
                        'successors': set(),
                        'input_chunks': set(),
                        'chunks': set(),
                    }
                meta = orig_metas[op_key]
                meta['predecessors'].update(
                    [pn.op.key for pn in fetched_graph.iter_predecessors(node)])
                meta['successors'].update(
                    [sn.op.key for sn in fetched_graph.iter_successors(node)])
                meta['input_chunks'].update(
                    [pn.key for pn in fetched_graph.iter_predecessors(node)])
                meta['chunks'].update([c.key for c in node.op.outputs])

            io_fields = ('predecessors', 'successors', 'input_chunks',
                         'chunks')
            for node in fetched_graph:
                op_info = op_infos[node.op.key]
                self.assertEqual(op_info['op_name'], type(node.op).__name__)

                io_meta = op_info['io_meta']
                orig_io_meta = orig_metas[node.op.key]
                for field in io_fields:
                    self.assertSetEqual(set(io_meta[field]),
                                        set(orig_io_meta[field]))

        yield pool, graph_ref