def testChunkBroadcast(self, *_):
    proc_count = 2
    endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
    keys = []

    def _mock_get_scheduler(key):
        return endpoints[keys.index(key[1]) % len(endpoints)]

    ChunkMetaActor.get_scheduler.side_effect = _mock_get_scheduler

    session_id = str(uuid.uuid4())
    with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
        pool1.create_actor(ClusterInfoActor, endpoints,
                           uid=ClusterInfoActor.default_name())
        pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
            pool2.create_actor(ClusterInfoActor, endpoints,
                               uid=ClusterInfoActor.default_name())
            pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

            client = new_client()
            ref1 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
            ref2 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[1])
            local_ref1 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[0])
            local_ref2 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[1])

            key1 = str(uuid.uuid4())
            key2 = str(uuid.uuid4())
            keys = [key1, key2]

            # key1 is stored on endpoints[0] and broadcast to endpoints[1];
            # key2 goes the other way round
            ref1.set_chunk_broadcasts(session_id, key1, [endpoints[1]])
            ref1.set_chunk_size(session_id, key1, 512)
            ref1.set_chunk_shape(session_id, key1, (10,) * 2)
            ref1.add_worker(session_id, key1, 'abc')
            ref2.set_chunk_broadcasts(session_id, key2, [endpoints[0]])
            ref2.set_chunk_size(session_id, key2, 512)
            ref1.set_chunk_shape(session_id, key2, (10,) * 2)
            ref2.add_worker(session_id, key2, 'def')
            pool2.sleep(0.1)

            self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_size, 512)
            self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
            self.assertEqual(local_ref1.get_chunk_broadcasts(session_id, key1), [endpoints[1]])
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_size, 512)
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
            self.assertEqual(local_ref2.get_chunk_broadcasts(session_id, key2), [endpoints[0]])

            # deleting the meta also removes the broadcast copies
            ref1.delete_meta(session_id, key1)
            pool2.sleep(0.1)

            self.assertIsNone(local_ref1.get_chunk_meta(session_id, key1))
            self.assertIsNone(local_ref2.get_chunk_meta(session_id, key1))
            self.assertIsNone(local_ref1.get_chunk_broadcasts(session_id, key1))

            local_ref1.remove_workers_in_session(session_id, ['def'])
            local_ref2.remove_workers_in_session(session_id, ['def'])
            pool2.sleep(0.1)

            self.assertIsNone(local_ref1.get_chunk_meta(session_id, key2))
            self.assertIsNone(local_ref2.get_chunk_meta(session_id, key2))
            self.assertIsNone(local_ref2.get_chunk_broadcasts(session_id, key2))
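# Hedged illustration (not part of the original tests): how the _mock_get_scheduler
# stub above routes chunk-meta requests. The key handed to get_scheduler is assumed
# to be a (session_id, chunk_key) pair, so chunk keys are mapped to scheduler
# endpoints round-robin by their position in `keys`. Endpoints and keys below are
# hypothetical values chosen only for the demonstration.
endpoints = ['127.0.0.1:10001', '127.0.0.1:10002']
keys = ['chunk-a', 'chunk-b', 'chunk-c']

def _mock_get_scheduler(key):
    return endpoints[keys.index(key[1]) % len(endpoints)]

assert _mock_get_scheduler(('session', 'chunk-a')) == endpoints[0]
assert _mock_get_scheduler(('session', 'chunk-b')) == endpoints[1]
assert _mock_get_scheduler(('session', 'chunk-c')) == endpoints[0]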
def wait_scheduler_worker_start(self):
    # treat ordinary exceptions in the gevent hub as non-fatal while services start up
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)

    actor_client = new_client()
    time.sleep(1)

    # wait until the ResourceActor is reachable on the scheduler
    check_time = time.time()
    while True:
        try:
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_uid(),
                address='127.0.0.1:' + self.scheduler_port)
            if actor_client.has_actor(resource_ref):
                break
            else:
                raise SystemError('Check meta_timestamp timeout')
        except:  # noqa: E722
            if time.time() - check_time > 10:
                raise
            time.sleep(0.1)

    # wait until at least one worker has registered with the scheduler
    check_time = time.time()
    while not resource_ref.get_worker_count():
        if self.proc_scheduler.poll() is not None:
            raise SystemError('Scheduler not started. exit code %s'
                              % self.proc_scheduler.poll())
        if self.proc_worker.poll() is not None:
            raise SystemError('Worker not started. exit code %s'
                              % self.proc_worker.poll())
        if time.time() - check_time > 20:
            raise SystemError('Check meta_timestamp timeout')
        time.sleep(0.1)

    gevent.hub.Hub.NOT_ERROR = old_not_errors
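# Hedged sketch (not part of the original suite): the "poll until an actor is reachable"
# loop above also appears in _start_service and setUp further below. It could be factored
# into a hypothetical helper such as this one, which uses only calls already present in
# these tests (actor_client.actor_ref / actor_client.has_actor); the helper name and its
# default arguments are assumptions.
import time

def _wait_for_actor(actor_client, uid, address, timeout=10, interval=0.1):
    check_time = time.time()
    while True:
        try:
            ref = actor_client.actor_ref(uid, address=address)
            if actor_client.has_actor(ref):
                return ref
            raise SystemError('Actor %r not found at %s' % (uid, address))
        except:  # noqa: E722
            if time.time() - check_time > timeout:
                raise
            time.sleep(interval)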
def start(self):
    from mars.actors import new_client

    super(CupidSchedulerServiceMain, self).start()

    # create process helper on every process
    proc_helper_refs = []
    for proc_id in range(self.pool.cluster_info.n_process):
        uid = 's:%d:mars-process-helper' % proc_id
        actor_ref = self.pool.create_actor(
            CupidSchedulerProcessHelperActor, uid=uid)
        proc_helper_refs.append(actor_ref)

    if self.args.cupid_scheduler_key:
        cupid_scheduler_key, scheduler_keys = self.args.cupid_scheduler_key.split(';')
        self.write_cupid_service_info(cupid_scheduler_key)
        self.wait_for_all_ready(scheduler_keys)
    self.create_scheduler_discoverer()

    actor_client = new_client()
    for proc_helper_actor in proc_helper_refs:
        envs = self.cupid_context.prepare_channel()
        proc_helper_ref = actor_client.actor_ref(proc_helper_actor)
        new_envs = dict((env.name, env.value) for env in envs)
        proc_helper_ref.start_channel(new_envs)
def start(self):
    from mars.actors import new_client
    from cupid import context

    self.cupid_context = context()
    self.read_cupid_service_info(self.args.cupid_scheduler_key)
    self.create_scheduler_discoverer()
    super(CupidWorkerServiceMain, self).start()

    actor_client = new_client()
    proc_helpers = self._service._process_helper_actors
    for proc_helper_actor in proc_helpers:
        envs = self.cupid_context.prepare_channel()
        proc_helper_ref = actor_client.actor_ref(proc_helper_actor)
        new_envs = dict((env.name, env.value) for env in envs)
        proc_helper_ref.start_channel(new_envs)
def start(self):
    from mars.actors import new_client
    from cupid import context

    self.cupid_context = context()
    self.read_cupid_service_info(self.args.cupid_scheduler_key)
    self.create_scheduler_discoverer()
    super(CupidWorkerServiceMain, self).start()

    actor_client = new_client()
    proc_helpers = self._service._process_helper_actors
    for proc_helper_actor in proc_helpers:
        logger.info('Starting channel for subprocess %s.', proc_helper_actor.uid)
        envs = self.cupid_context.prepare_channel()
        proc_helper_ref = actor_client.actor_ref(proc_helper_actor)
        new_envs = dict((env.name, env.value) for env in envs)
        proc_helper_ref.start_channel(new_envs)

    logger.info('All channels ready, uploading worker status now.')
    self._service._status_ref.enable_status_upload(channel_ready=True, _tell=True)
def _start_service(self):
    # start worker and scheduler processes on random free ports
    worker_port = self.worker_port = str(get_next_port())
    scheduler_port = self.scheduler_port = str(get_next_port())
    proc_worker = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                    '-a', '127.0.0.1',
                                    '-p', worker_port,
                                    '--cpu-procs', '2',
                                    '--cache-mem', '10m',
                                    '--schedulers', '127.0.0.1:' + scheduler_port,
                                    '--log-level', 'debug',
                                    '--log-format', 'WOR %(asctime)-15s %(message)s',
                                    '--ignore-avail-mem'])
    proc_scheduler = subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                                       '--nproc', '1',
                                       '-H', '127.0.0.1',
                                       '-p', scheduler_port,
                                       '-Dscheduler.default_cpu_usage=0',
                                       '--log-level', 'debug',
                                       '--log-format', 'SCH %(asctime)-15s %(message)s'])
    self.proc_worker = proc_worker
    self.proc_scheduler = proc_scheduler

    actor_client = new_client()
    time.sleep(1)

    # wait until the ResourceActor is reachable on the scheduler
    check_time = time.time()
    while True:
        try:
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_uid(),
                address='127.0.0.1:' + self.scheduler_port)
            if actor_client.has_actor(resource_ref):
                break
            else:
                raise SystemError('Check meta_timestamp timeout')
        except:  # noqa: E722
            if time.time() - check_time > 10:
                raise
            time.sleep(0.1)

    # wait until the worker has registered with the scheduler
    check_time = time.time()
    while not resource_ref.get_worker_count():
        if self.proc_scheduler.poll() is not None:
            raise SystemError('Scheduler not started. exit code %s'
                              % self.proc_scheduler.poll())
        if self.proc_worker.poll() is not None:
            raise SystemError('Worker not started. exit code %s'
                              % self.proc_worker.poll())
        if time.time() - check_time > 30:
            raise SystemError('Check meta_timestamp timeout')
        time.sleep(0.1)

    # start the web service and wait until its REST API answers
    web_port = self.web_port = str(get_next_port())
    proc_web = subprocess.Popen([sys.executable, '-m', 'mars.web',
                                 '-H', '127.0.0.1',
                                 '--log-level', 'debug',
                                 '--log-format', 'WEB %(asctime)-15s %(message)s',
                                 '-p', web_port,
                                 '-s', '127.0.0.1:' + self.scheduler_port])
    self.proc_web = proc_web

    service_ep = 'http://127.0.0.1:' + self.web_port
    check_time = time.time()
    while True:
        if time.time() - check_time > 30:
            raise SystemError('Wait for service start timeout')
        try:
            resp = requests.get(service_ep + '/api', timeout=1)
        except (requests.ConnectionError, requests.Timeout):
            time.sleep(0.1)
            continue
        if resp.status_code >= 400:
            time.sleep(0.1)
            continue
        break
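# Hedged sketch (illustrative only): the web-endpoint readiness loop at the end of
# _start_service (and again in setUp below) could be expressed as a hypothetical helper.
# It uses only requests/time calls already present in these tests; the helper name and
# its default arguments are assumptions.
import time
import requests

def _wait_for_web_api(service_ep, timeout=30, interval=0.1):
    check_time = time.time()
    while True:
        if time.time() - check_time > timeout:
            raise SystemError('Wait for service start timeout')
        try:
            resp = requests.get(service_ep + '/api', timeout=1)
        except (requests.ConnectionError, requests.Timeout):
            time.sleep(interval)
            continue
        if resp.status_code >= 400:
            time.sleep(interval)
            continue
        return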
def testWebApi(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        self.assertEqual(sess.count_workers(), 1)

        a = mt.ones((100, 100), chunk_size=30)
        b = mt.ones((100, 100), chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        assert_array_equal(value, np.ones((100, 100)) * 100)

        # check resubmission
        value2 = sess.run(c, timeout=timeout)
        assert_array_equal(value, value2)

        # check when local compression libs are missing
        from mars.serialize import dataserializer
        try:
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((10, 10)) * 10)

            dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None
            assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
        finally:
            dataserializer.decompressors[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompress
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompressobj
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = dataserializer.lz4_open

        # check serialization by pickle
        try:
            sess._sess._serial_type = SerialType.PICKLE
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((10, 10)) * 10)
        finally:
            sess._sess._serial_type = SerialType.ARROW

        va = np.random.randint(0, 10000, (100, 100))
        vb = np.random.randint(0, 10000, (100, 100))
        a = mt.array(va, chunk_size=30)
        b = mt.array(vb, chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        assert_array_equal(value, va.dot(vb))

        graphs = sess.get_graph_states()

        # make sure status got uploaded
        time.sleep(1.5)

        # check web UI requests
        res = requests.get(service_ep)
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/scheduler' % (service_ep,))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/scheduler/127.0.0.1:%s' % (service_ep, self.scheduler_port))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/worker' % (service_ep,))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/worker/127.0.0.1:%s' % (service_ep, self.worker_port))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/worker/127.0.0.1:%s/timeline' % (service_ep, self.worker_port))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/session' % (service_ep,))
        self.assertEqual(res.status_code, 200)

        task_id = next(iter(graphs.keys()))
        res = requests.get('%s/session/%s/graph/%s'
                           % (service_ep, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/session/%s/graph/%s/running_nodes'
                           % (service_ep, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)

        from mars.web.task_pages import PROGRESS_APP_NAME
        res = requests.get('%s/%s?session_id=%s&task_id=%s'
                           % (service_ep, PROGRESS_APP_NAME, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)

        from mars.web.worker_pages import TIMELINE_APP_NAME
        res = requests.get('%s/%s?endpoint=127.0.0.1:%s'
                           % (service_ep, TIMELINE_APP_NAME, self.worker_port))
        self.assertEqual(res.status_code, 200)

    # make sure all chunks freed when session quits
    from mars.worker.storage import StorageManagerActor
    actor_client = new_client()
    storage_manager_ref = actor_client.actor_ref(
        StorageManagerActor.default_uid(),
        address='127.0.0.1:' + str(self.worker_port))
    self.assertSetEqual(set(storage_manager_ref.dump_keys()), set())
def testWebApi(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        session_id = sess._session_id
        self.assertEqual(sess.count_workers(), 1)

        a = mt.ones((100, 100), chunk_size=30)
        b = mt.ones((100, 100), chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        np.testing.assert_array_equal(value, np.ones((100, 100)) * 100)

        # check resubmission
        value2 = sess.run(c, timeout=timeout)
        np.testing.assert_array_equal(value, value2)

        # check when local compression libs are missing
        from mars.serialize import dataserializer
        try:
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

            dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None
            np.testing.assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
        finally:
            dataserializer.decompressors[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompress
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompressobj
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = dataserializer.lz4_open

        # check serialization by pickle
        try:
            sess._sess._serial_type = SerialType.PICKLE
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

            raw = pd.DataFrame(np.random.rand(10, 5), columns=list('ABCDE'),
                               index=pd.RangeIndex(10, 0, -1))
            data = md.DataFrame(raw).astype({'E': 'arrow_string'})
            ret_data = data.execute(session=sess).fetch(session=sess)
            self.assertEqual(ret_data.dtypes['E'], np.dtype('O'))
            pd.testing.assert_frame_equal(ret_data.astype({'E': 'float'}), raw,
                                          check_less_precise=True)

            raw = pd.Series(np.random.rand(10), index=pd.RangeIndex(10, 0, -1), name='r')
            data = md.Series(raw).astype('Arrow[string]')
            ret_data = data.execute(session=sess).fetch(session=sess)
            self.assertEqual(ret_data.dtype, np.dtype('O'))
            pd.testing.assert_series_equal(ret_data.astype('float'), raw)
        finally:
            sess._sess._serial_type = SerialType.ARROW

        va = np.random.randint(0, 10000, (100, 100))
        vb = np.random.randint(0, 10000, (100, 100))
        a = mt.array(va, chunk_size=30)
        b = mt.array(vb, chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        np.testing.assert_array_equal(value, va.dot(vb))

        # test fetch log
        def f():
            print('test')

        r = mr.spawn(f).execute(session=sess, timeout=timeout)
        self.assertEqual(str(r.fetch_log()).strip(), 'test')
        self.assertEqual(str(r.fetch_log(offsets=0)).strip(), 'test')
        self.assertEqual(str(r.fetch_log()).strip(), '')
        self.assertEqual(str(r.fetch_log(offsets='-0.003k', sizes=2)).strip(), 'st')

        graphs = sess.get_graph_states()

        # make sure status got uploaded
        time.sleep(1.5)

        # check web UI requests
        res = requests.get(service_ep)
        self.assertEqual(res.status_code, 200)

        res = requests.get(f'{service_ep}/scheduler')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/scheduler/127.0.0.1:{self.scheduler_port}')
        self.assertEqual(res.status_code, 200)

        res = requests.get(f'{service_ep}/worker')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/worker/127.0.0.1:{self.worker_port}')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/worker/127.0.0.1:{self.worker_port}/timeline')
        self.assertEqual(res.status_code, 200)

        res = requests.get(f'{service_ep}/session')
        self.assertEqual(res.status_code, 200)

        task_id = next(iter(graphs.keys()))
        res = requests.get(f'{service_ep}/session/{session_id}/graph/{task_id}')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/session/{session_id}/graph/{task_id}/running_nodes')
        self.assertEqual(res.status_code, 200)

        from mars.web.task_pages import PROGRESS_APP_NAME
        res = requests.get(f'{service_ep}/{PROGRESS_APP_NAME}?session_id={session_id}&task_id={task_id}')
        self.assertEqual(res.status_code, 200)

        from mars.web.worker_pages import TIMELINE_APP_NAME
        res = requests.get(f'{service_ep}/{TIMELINE_APP_NAME}?endpoint=127.0.0.1:{self.worker_port}')
        self.assertEqual(res.status_code, 200)

    # make sure all chunks freed when session quits
    from mars.worker.storage import StorageManagerActor
    actor_client = new_client()
    storage_manager_ref = actor_client.actor_ref(
        StorageManagerActor.default_uid(),
        address='127.0.0.1:' + str(self.worker_port))
    self.assertSetEqual(set(storage_manager_ref.dump_keys()), set())
def testChunkMetaActors(self, *_):
    proc_count = 2
    endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
    keys = []

    def _mock_get_scheduler(key):
        return endpoints[keys.index(key[1]) % len(endpoints)]

    ChunkMetaClient.get_scheduler.side_effect = _mock_get_scheduler

    session1 = str(uuid.uuid4())
    session2 = str(uuid.uuid4())
    with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
        cluster_info1 = pool1.create_actor(
            SchedulerClusterInfoActor, endpoints,
            uid=SchedulerClusterInfoActor.default_name())
        pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
            cluster_info2 = pool2.create_actor(
                SchedulerClusterInfoActor, endpoints,
                uid=SchedulerClusterInfoActor.default_name())
            pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

            actor_client = new_client()
            client1 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info1))
            client2 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info2))
            loc_ref1 = actor_client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
            loc_ref2 = actor_client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[1])

            key1 = (str(uuid.uuid4()), str(uuid.uuid4()))
            key2 = str(uuid.uuid4())
            key3 = str(uuid.uuid4())
            key4 = (str(uuid.uuid4()), str(uuid.uuid4()))
            key5 = str(uuid.uuid4())
            key6 = str(uuid.uuid4())
            keys = [key1, key2, key3, key4, key5, key6]

            client1.set_chunk_size(session1, key1, 512)
            client2.set_chunk_size(session1, key2, 1024)
            client2.set_chunk_size(session2, key3, 1024)

            self.assertEqual(client1.get_chunk_size(session1, key1), 512)
            self.assertEqual(client2.get_chunk_size(session1, key2), 1024)
            self.assertEqual(client1.get_chunk_size(session1, key2), 1024)
            self.assertEqual(client2.get_chunk_size(session1, key1), 512)

            self.assertListEqual(client1.batch_get_chunk_size(session1, [key1, key2]), [512, 1024])
            self.assertListEqual(client2.batch_get_chunk_size(session1, [key1, key2]), [512, 1024])

            client1.set_chunk_shape(session1, key1, (10,))
            client2.set_chunk_shape(session1, key2, (10,) * 2)
            client2.set_chunk_shape(session2, key3, (10,) * 2)

            self.assertEqual(client1.get_chunk_shape(session1, key1), (10,))
            self.assertEqual(client2.get_chunk_shape(session1, key2), (10,) * 2)
            self.assertEqual(client1.get_chunk_shape(session1, key2), (10,) * 2)
            self.assertEqual(client2.get_chunk_shape(session1, key1), (10,))

            self.assertListEqual(client1.batch_get_chunk_shape(session1, [key1, key2]),
                                 [(10,), (10,) * 2])
            self.assertListEqual(client2.batch_get_chunk_shape(session1, [key1, key2]),
                                 [(10,), (10,) * 2])

            mock_endpoint = '127.0.0.1:%d' % get_next_port()
            with create_actor_pool(n_process=1, backend='gevent', address=mock_endpoint) as pool3:
                cluster_info3 = pool3.create_actor(
                    SchedulerClusterInfoActor, endpoints,
                    uid=SchedulerClusterInfoActor.default_name())
                client3 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info3))
                self.assertListEqual(client3.batch_get_chunk_shape(session1, [key1, key2]),
                                     [(10,), (10,) * 2])

            client1.add_worker(session1, key1, 'abc')
            client1.add_worker(session1, key1, 'def')
            client2.add_worker(session1, key2, 'ghi')
            client1.add_worker(session2, key3, 'ghi')

            self.assertEqual(sorted(client1.get_workers(session1, key1)), sorted(('abc', 'def')))
            self.assertEqual(sorted(client2.get_workers(session1, key2)), sorted(('ghi',)))

            batch_result = client1.batch_get_workers(session1, [key1, key2])
            self.assertEqual(sorted(batch_result[0]), sorted(('abc', 'def')))
            self.assertEqual(sorted(batch_result[1]), sorted(('ghi',)))

            affected = []
            for loc_ref in (loc_ref1, loc_ref2):
                affected.extend(loc_ref.remove_workers_in_session(session2, ['ghi']))
            self.assertEqual(affected, [key3])
            self.assertEqual(sorted(client1.get_workers(session1, key2)), sorted(('ghi',)))
            self.assertIsNone(client1.get_workers(session2, key3))

            client1.delete_meta(session1, key1)
            self.assertIsNone(client1.get_workers(session1, key1))
            self.assertIsNone(client1.batch_get_chunk_size(session1, [key1, key2])[0])
            self.assertIsNone(client1.batch_get_workers(session1, [key1, key2])[0])

            client2.batch_delete_meta(session1, [key1, key2])
            self.assertIsNone(client1.get_workers(session1, key2))
            self.assertIsNone(client1.batch_get_chunk_size(session1, [key1, key2])[1])
            self.assertIsNone(client1.batch_get_workers(session1, [key1, key2])[1])

            meta4 = WorkerMeta(chunk_size=512, chunk_shape=(10,) * 2, workers=(endpoints[0],))
            loc_ref2.batch_set_chunk_meta(session1, [key4], [meta4])
            self.assertEqual(loc_ref2.get_chunk_meta(session1, key4).chunk_size, 512)
            self.assertEqual(loc_ref2.get_chunk_meta(session1, key4).chunk_shape, (10,) * 2)

            meta5 = WorkerMeta(chunk_size=512, chunk_shape=(10,) * 2, workers=(endpoints[0],))
            meta6 = WorkerMeta(chunk_size=512, chunk_shape=(10,) * 2, workers=(endpoints[0],))
            client1.batch_set_chunk_meta(session1, [key5, key6], [meta5, meta6])
            self.assertEqual(loc_ref1.get_chunk_meta(session1, key5).chunk_size, 512)
            self.assertEqual(loc_ref2.get_chunk_meta(session1, key6).chunk_size, 512)
def testChunkBroadcast(self, *_):
    proc_count = 2
    endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
    keys = []

    def _mock_get_scheduler(key):
        return endpoints[keys.index(key[1]) % len(endpoints)]

    ChunkMetaClient.get_scheduler.side_effect = _mock_get_scheduler

    session_id = str(uuid.uuid4())
    with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
        cluster_info1 = pool1.create_actor(
            SchedulerClusterInfoActor, endpoints,
            uid=SchedulerClusterInfoActor.default_uid())
        pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
            cluster_info2 = pool2.create_actor(
                SchedulerClusterInfoActor, endpoints,
                uid=SchedulerClusterInfoActor.default_uid())
            pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

            actor_client = new_client()
            client1 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info1))
            client2 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info2))
            local_ref1 = actor_client.actor_ref(ChunkMetaActor.default_uid(), address=endpoints[0])
            local_ref2 = actor_client.actor_ref(ChunkMetaActor.default_uid(), address=endpoints[1])

            key1 = str(uuid.uuid4())
            key2 = str(uuid.uuid4())
            key3 = str(uuid.uuid4())
            keys = [key1, key2, key3]

            client1.set_chunk_broadcasts(session_id, key1, [endpoints[1]])
            client1.set_chunk_size(session_id, key1, 512)
            client1.set_chunk_shape(session_id, key1, (10,) * 2)
            client1.add_worker(session_id, key1, 'abc')
            client2.set_chunk_broadcasts(session_id, key2, [endpoints[0]])
            client2.set_chunk_size(session_id, key2, 512)
            client1.set_chunk_shape(session_id, key2, (10,) * 2)
            client2.add_worker(session_id, key2, 'def')
            pool2.sleep(0.1)

            self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_size, 512)
            self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
            self.assertEqual(local_ref1.get_chunk_broadcasts(session_id, key1), [endpoints[1]])
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_size, 512)
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
            self.assertEqual(local_ref2.get_chunk_broadcasts(session_id, key2), [endpoints[0]])

            client1.batch_set_chunk_broadcasts(session_id, [key3], [[endpoints[1]]])
            meta3 = WorkerMeta(chunk_size=512, chunk_shape=(10,) * 2, workers=(endpoints[0],))
            local_ref1.batch_set_chunk_meta(session_id, [key3], [meta3])
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key3).chunk_size, 512)
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key3).chunk_shape, (10,) * 2)

            client1.delete_meta(session_id, key1)
            pool2.sleep(0.1)

            self.assertIsNone(local_ref1.get_chunk_meta(session_id, key1))
            self.assertIsNone(local_ref2.get_chunk_meta(session_id, key1))
            self.assertIsNone(local_ref1.get_chunk_broadcasts(session_id, key1))

            local_ref1.remove_workers_in_session(session_id, ['def'])
            local_ref2.remove_workers_in_session(session_id, ['def'])
            pool2.sleep(0.1)

            self.assertIsNone(local_ref1.get_chunk_meta(session_id, key2))
            self.assertIsNone(local_ref2.get_chunk_meta(session_id, key2))
            self.assertIsNone(local_ref2.get_chunk_broadcasts(session_id, key2))
def setUp(self):
    # start worker and scheduler processes on a random scheduler port
    scheduler_port = str(get_next_port())
    proc_worker = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                    '-a', '127.0.0.1',
                                    '--level', 'debug',
                                    '--cpu-procs', '2',
                                    '--cache-mem', '10m',
                                    '--schedulers', '127.0.0.1:' + scheduler_port,
                                    '--ignore-avail-mem'])
    proc_scheduler = subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                                       '--nproc', '1',
                                       '--level', 'debug',
                                       '-H', '127.0.0.1',
                                       '-p', scheduler_port,
                                       '--format', '%(asctime)-15s %(message)s'])
    self.scheduler_port = scheduler_port
    self.proc_worker = proc_worker
    self.proc_scheduler = proc_scheduler

    actor_client = new_client()
    time.sleep(2)

    # wait until the KVStoreActor is reachable on the scheduler
    check_time = time.time()
    while True:
        try:
            kv_ref = actor_client.actor_ref(KVStoreActor.default_name(),
                                            address='127.0.0.1:' + scheduler_port)
            if actor_client.has_actor(kv_ref):
                break
            else:
                raise SystemError('Check meta_timestamp timeout')
        except:  # noqa: E722
            if time.time() - check_time > 10:
                raise
            time.sleep(1)

    # wait until the worker has written its meta timestamp
    check_time = time.time()
    while True:
        content = kv_ref.read('/workers/meta_timestamp', silent=True)
        if self.proc_scheduler.poll() is not None:
            raise SystemError('Scheduler not started. exit code %s'
                              % self.proc_scheduler.poll())
        if self.proc_worker.poll() is not None:
            raise SystemError('Worker not started. exit code %s'
                              % self.proc_worker.poll())
        if time.time() - check_time > 20:
            raise SystemError('Check meta_timestamp timeout')
        if not content:
            time.sleep(0.5)
        else:
            break

    # start the web service and wait until its REST API answers
    web_port = str(get_next_port())
    self.web_port = web_port
    proc_web = subprocess.Popen([sys.executable, '-m', 'mars.web',
                                 '-H', '127.0.0.1',
                                 '--level', 'debug',
                                 '--ui-port', web_port,
                                 '-s', '127.0.0.1:' + self.scheduler_port])
    self.proc_web = proc_web

    service_ep = 'http://127.0.0.1:' + self.web_port
    check_time = time.time()
    while True:
        if time.time() - check_time > 30:
            raise SystemError('Wait for service start timeout')
        try:
            resp = requests.get(service_ep + '/api', timeout=1)
        except (requests.ConnectionError, requests.Timeout):
            time.sleep(1)
            continue
        if resp.status_code >= 400:
            time.sleep(1)
            continue
        break

    # treat ordinary exceptions in the gevent hub as non-fatal during the test
    self.exceptions = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)
def testChunkMetaActors(self, *_):
    proc_count = 2
    endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
    keys = []

    def _mock_get_scheduler(key):
        return endpoints[keys.index(key[1]) % len(endpoints)]

    ChunkMetaActor.get_scheduler.side_effect = _mock_get_scheduler

    session1 = str(uuid.uuid4())
    session2 = str(uuid.uuid4())
    with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
        pool1.create_actor(ClusterInfoActor, endpoints,
                           uid=ClusterInfoActor.default_name())
        pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
            pool2.create_actor(ClusterInfoActor, endpoints,
                               uid=ClusterInfoActor.default_name())
            pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

            client = new_client()
            ref1 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
            ref2 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[1])
            loc_ref1 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[0])
            loc_ref2 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[1])

            key1 = str(uuid.uuid4())
            key2 = str(uuid.uuid4())
            key3 = str(uuid.uuid4())
            keys = [key1, key2, key3]

            ref1.set_chunk_size(session1, key1, 512)
            ref2.set_chunk_size(session1, key2, 1024)
            ref2.set_chunk_size(session2, key3, 1024)

            self.assertEqual(ref1.get_chunk_size(session1, key1), 512)
            self.assertEqual(ref2.get_chunk_size(session1, key2), 1024)
            self.assertEqual(ref1.get_chunk_size(session1, key2), 1024)
            self.assertEqual(ref2.get_chunk_size(session1, key1), 512)

            self.assertListEqual(ref1.batch_get_chunk_size(session1, [key1, key2]), [512, 1024])
            self.assertListEqual(ref2.batch_get_chunk_size(session1, [key1, key2]), [512, 1024])

            ref1.add_worker(session1, key1, 'abc')
            ref1.add_worker(session1, key1, 'def')
            ref2.add_worker(session1, key2, 'ghi')
            ref1.add_worker(session2, key3, 'ghi')

            self.assertEqual(sorted(ref1.get_workers(session1, key1)), sorted(('abc', 'def')))
            self.assertEqual(sorted(ref2.get_workers(session1, key2)), sorted(('ghi',)))

            batch_result = ref1.batch_get_workers(session1, [key1, key2])
            self.assertEqual(sorted(batch_result[0]), sorted(('abc', 'def')))
            self.assertEqual(sorted(batch_result[1]), sorted(('ghi',)))

            affected = []
            for loc_ref in (loc_ref1, loc_ref2):
                affected.extend(loc_ref.remove_workers_in_session(session2, ['ghi']))
            self.assertEqual(affected, [key3])
            self.assertEqual(sorted(ref1.get_workers(session1, key2)), sorted(('ghi',)))
            self.assertIsNone(ref1.get_workers(session2, key3))

            ref1.delete_meta(session1, key1)
            self.assertIsNone(ref1.get_workers(session1, key1))
            self.assertIsNone(ref1.batch_get_chunk_size(session1, [key1, key2])[0])
            self.assertIsNone(ref1.batch_get_workers(session1, [key1, key2])[0])

            ref2.batch_delete_meta(session1, [key1, key2])
            self.assertIsNone(ref1.get_workers(session1, key2))
            self.assertIsNone(ref1.batch_get_chunk_size(session1, [key1, key2])[1])
            self.assertIsNone(ref1.batch_get_workers(session1, [key1, key2])[1])