def create_actors(self):
    """Connect to the cluster and start one sampling thread per remote actor.

    A dedicated output queue is created per actor in
    ``self.predict_output_queues``; each thread runs
    ``self.run_remote_sample(ident)`` with the actor's index.
    """
    parl.connect(self.config['master_address'])
    logger.info('Waiting for {} remote actors to connect.'.format(
        self.config['actor_num']))
    self.predict_output_queues = []
    # The original kept a separate `ident` counter that merely mirrored the
    # loop index; use the index directly (plain range — six is unneeded on Py3).
    for ident in range(self.config['actor_num']):
        self.remote_count += 1
        logger.info('Remote simulator count: {}'.format(self.remote_count))
        if self.start_time is None:
            self.start_time = time.time()
        self.predict_output_queues.append(queue.Queue())
        # daemon=True (modern spelling of the deprecated setDaemon(True)) so
        # sampling threads never block interpreter shutdown.
        remote_thread = threading.Thread(
            target=self.run_remote_sample, args=(ident, ), daemon=True)
        remote_thread.start()
def test_sync_config_file(self):
    """Distributed files remain visible to the actor after local deletion."""
    master = Master(port=1335)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(1)
    worker = Worker('localhost:1335', 1)

    npy_path = 'random.npy'
    data = np.random.randn(3, 5)
    np.save(npy_path, data)
    expected_sum = data.sum()

    cfg = {'test': 1000}
    with open('config.json', 'w') as fh:
        json.dump(cfg, fh)

    parl.connect('localhost:1335', ['random.npy', 'config.json'])
    actor = Actor('random.npy', 'config.json')
    time.sleep(5)

    # The files were shipped with the job; the local copies can go.
    os.remove('./random.npy')
    os.remove('./config.json')

    self.assertEqual(actor.random_sum(), expected_sum)
    time.sleep(10)
    self.assertEqual(cfg['test'], actor.read_config())

    del actor
    worker.exit()
    master.exit()
def _connect_and_create_actor(cluster_addr):
    """Connect to `cluster_addr`, exercise two actors, then disconnect."""
    parl.connect(cluster_addr)
    attempt = 0
    while attempt < 2:
        remote_actor = Actor()
        assert remote_actor.add_one(1) == 2
        attempt += 1
    disconnect()
def test_max_memory(self):
    """An actor that exceeds the memory cap should be reclaimed by the worker."""
    port = 3001
    master = Master(port=port)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(5)
    cluster_addr = 'localhost:{}'.format(port)
    worker = Worker(cluster_addr, 1)
    cluster_monitor = ClusterMonitor(cluster_addr)
    time.sleep(5)

    parl.connect(cluster_addr)
    actor = Actor()
    time.sleep(20)
    self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num'])
    del actor
    time.sleep(10)

    # Separate client process whose actor keeps allocating memory.
    proc = Process(target=self.actor, args=(cluster_addr, ))
    proc.start()

    # Poll for up to a minute for the over-limit actor to be killed.
    for _ in range(6):
        alive = cluster_monitor.data['clients'][0]['actor_num']
        if alive == 0:
            break
        time.sleep(10)
    if alive == 1:
        raise ValueError("Actor max memory test failed.")
    self.assertEqual(0, cluster_monitor.data['clients'][0]['actor_num'])

    proc.terminate()
    worker.exit()
    master.exit()
def test_send_file(self):
    """Files listed in `distributed_files` must arrive in the job's workspace."""
    port = 1239
    master = Master(port=port)
    th = threading.Thread(target=master.run)
    th.start()
    worker = Worker('localhost:{}'.format(port), 1)
    time.sleep(2)

    tmp_dir = 'rom_files'
    tmp_file = os.path.join(tmp_dir, 'pong.bin')
    # Create the empty file portably instead of shelling out with
    # `mkdir`/`touch` (the original needed a separate Windows branch and
    # failed if the directory already existed).
    os.makedirs(tmp_dir, exist_ok=True)
    open(tmp_file, 'a').close()
    assert os.path.exists(tmp_file)

    parl.connect('localhost:{}'.format(port), distributed_files=[tmp_file])
    time.sleep(5)
    actor = Actor()
    # The file may take a while to show up remotely; poll before asserting.
    for _ in range(10):
        if actor.check_local_file():
            break
        time.sleep(10)
    self.assertEqual(True, actor.check_local_file())

    del actor
    time.sleep(10)
    worker.exit()
    master.exit()
def test_create_actor_in_multiprocessing(self):
    """Child processes can create actors while the main process stays connected."""
    master = Master(port=8240)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(1)
    worker1 = Worker('localhost:8240', 4)
    parl.connect('localhost:8240')

    if not _IS_WINDOWS:
        # In windows, fork process cannot access client created in main process.
        children = [
            multiprocessing.Process(target=self._create_actor)
            for _ in range(2)
        ]
        for child in children:
            child.start()
        for child in children:
            child.join()
        print("[test_create_actor_in_multiprocessing] Join")

    # make sure that the client of the main process still works
    self._create_actor()

    worker1.exit()
    master.exit()
def test_job_exit_exceptionally(self):
    # Purpose: after every job process is forcibly killed, the worker should
    # respawn jobs so the cluster regains its cpu count and can still serve
    # a newly connected client.
    master = Master(port=1334)
    th = threading.Thread(target=master.run)
    th.start()
    time.sleep(1)
    worker1 = Worker('localhost:1334', 4)
    time.sleep(10)
    # All 4 job slots should be filled before we start killing.
    self.assertEqual(worker1.job_buffer.full(), True)
    time.sleep(1)
    self.assertEqual(master.cpu_num, 4)
    print("We are going to kill all the jobs.")
    if _IS_WINDOWS:
        # wmic lists processes whose command line matches remote\job.py in CSV
        # form; the for-loop extracts each pid and taskkill force-kills it.
        command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%remote\\job.py%'" get processid^,status /format:csv') do taskkill /F /T /pid %a'''
        print(os.popen(command).read())
    else:
        # POSIX equivalent: grep the job processes and kill -9 them.
        command = (
            "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9"
        )
        subprocess.call([command], shell=True)
    # A fresh client should still get a working actor after the jobs restart.
    parl.connect('localhost:1334')
    actor = Actor()
    self.assertEqual(actor.add_one(1), 2)
    time.sleep(20)
    master.exit()
    worker1.exit()
def test_actor_exception_2(self):
    """After a remote call raises, the replacement actor still works."""
    logger.info("running: test_actor_exception_2")
    master = Master(port=8236)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(3)
    worker1 = Worker('localhost:8236', 1)
    self.assertEqual(1, master.cpu_num)

    parl.connect('localhost:8236')
    faulty = Actor()
    try:
        faulty.will_raise_exception_func()
    except:
        pass
    healthy = Actor()
    # Wait until the cluster shows the single cpu as occupied.
    for _ in range(5):
        if master.cpu_num == 0:
            break
        time.sleep(10)
    self.assertEqual(healthy.add_one(1), 2)
    self.assertEqual(0, master.cpu_num)

    del faulty
    del healthy
    worker1.exit()
    master.exit()
def test_actor_exception(self):
    """Bad constructor kwargs raise RemoteError; the cpu goes to the next actor."""
    logger.info("running:test_actor_exception")
    master = Master(port=8235)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(3)
    worker1 = Worker('localhost:8235', 1)
    # Wait for the worker's single cpu to register with the master.
    for _ in range(3):
        if master.cpu_num == 1:
            break
        time.sleep(10)
    self.assertEqual(1, master.cpu_num)

    logger.info("running:test_actor_exception: 0")
    parl.connect('localhost:8235')
    logger.info("running:test_actor_exception: 1")
    with self.assertRaises(exceptions.RemoteError):
        actor = Actor(abcd='a bug')
    logger.info("running:test_actor_exception: 2")

    good_actor = Actor()
    # The surviving actor should now occupy the only cpu.
    for _ in range(3):
        if master.cpu_num == 0:
            break
        time.sleep(10)
    self.assertEqual(good_actor.add_one(1), 2)
    self.assertEqual(0, master.cpu_num)

    master.exit()
    worker1.exit()
def create_actors(self):
    """Connect to the cluster, launch the sampling threads, and keep the
    actor pool at full strength.

    There is a memory-leak problem in the osim-rl package, so remote actors
    may be killed for excessive memory usage; this method then dynamically
    spawns replacement sampling threads.
    """
    parl.connect(args.cluster_address, ['official_obs_scaler.npz'])
    for i in range(args.actor_num):
        logger.info('Remote actor count: {}'.format(i + 1))
        # daemon=True replaces the deprecated setDaemon(True).
        remote_thread = threading.Thread(
            target=self.run_remote_sample, daemon=True)
        remote_thread.start()

    # Give the initial actors time to come up before monitoring the pool.
    time.sleep(10 * 60)
    parl_client = get_global_client()
    while True:
        if parl_client.actor_num < args.actor_num:
            # Fixed log typo: "acotr" -> "actor".
            logger.info(
                'Dynamic adding actor, current actor num:{}'.format(
                    parl_client.actor_num))
            remote_thread = threading.Thread(
                target=self.run_remote_sample, daemon=True)
            remote_thread.start()
        time.sleep(5)
def test_connect_and_create_actor_in_multiprocessing_with_connected_in_main_process(
        self):
    """Child processes may connect on their own while the main process is
    already connected to the same cluster."""
    master = Master(port=8238)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(1)
    worker1 = Worker('localhost:8238', 4)
    parl.connect('localhost:8238')

    children = []
    for _ in range(2):
        child = multiprocessing.Process(
            target=self._connect_and_create_actor, args=('localhost:8238', ))
        children.append(child)
        child.start()
    for child in children:
        child.join()

    # make sure that the client of the main process still works
    self._create_actor()

    worker1.exit()
    master.exit()
def _connect_and_create_actor(cluster_addr):
    """Create two actors on the cluster and collect their simulator outputs."""
    parl.connect(cluster_addr)
    results = []
    for idx in range(2):
        sim_actor = Actor(number=idx)
        output = sim_actor.sim_output(1, 4)
        assert output != ""
        results.append(output)
    return results
def __init__(self, env_name, xparl_addr, train_envs_params):
    """Spin up one remote Carla env per entry in `train_envs_params` and
    initialize per-env bookkeeping."""
    parl.connect(xparl_addr)
    self.env_list = []
    for params in train_envs_params:
        self.env_list.append(CarlaRemoteEnv(env_name=env_name, params=params))
    self.env_num = len(self.env_list)
    # Per-env accumulators, reset as episodes finish.
    self.episode_reward_list = [0 for _ in range(self.env_num)]
    self.episode_steps_list = [0 for _ in range(self.env_num)]
    self._max_episode_steps = train_envs_params[0]['max_time_episode']
    self.total_steps = 0
def _create_remote_actors(self):
    """Submit jobs to the xparl cluster, one worker thread per actor.

    Each actor gets its own signal queue (appended to
    ``self.remote_actors_signal_queues``) consumed by ``_run_remote_tasks``.
    """
    # connect to xparl cluster to submit jobs
    parl.connect(self.args.master_address)
    for _ in range(self.args.actors_num):
        signal_queue = queue.Queue()
        self.remote_actors_signal_queues.append(signal_queue)
        # daemon=True replaces the deprecated setDaemon(True) so these
        # threads do not keep the process alive at shutdown.
        remote_thread = threading.Thread(
            target=self._run_remote_tasks, args=(signal_queue, ), daemon=True)
        remote_thread.start()
def create_actors(self):
    """Connect to the cluster and start one sampling thread per remote actor."""
    parl.connect(self.config['master_address'])
    logger.info('Waiting for {} remote actors to connect.'.format(
        self.config['actor_num']))
    for _ in range(self.config['actor_num']):
        self.remote_count += 1
        logger.info('Remote actor count: {}'.format(self.remote_count))
        # Record the wall-clock start the first time an actor is launched.
        if self.start_time is None:
            self.start_time = time.time()
        # daemon=True replaces the deprecated setDaemon(True).
        remote_thread = threading.Thread(
            target=self.run_remote_sample, daemon=True)
        remote_thread.start()
def test_cluster_status(self):
    """The cluster monitor reports used/vacant cpus around actor creation."""
    port = 4321
    master = Master(port=port)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(5)
    worker = Worker('localhost:{}'.format(port), 1)
    time.sleep(5)

    self.assertEqual(master.cluster_monitor.get_status_info(),
                     'has 0 used cpus, 1 vacant cpus.')

    parl.connect('localhost:{}'.format(port))
    actor = Actor()  # keep a reference so the actor stays alive
    time.sleep(50)
    self.assertEqual(master.cluster_monitor.get_status_info(),
                     'has 1 used cpus, 0 vacant cpus.')

    worker.exit()
    master.exit()
def test_get_attribute(self):
    """Attributes set by the remote constructor are readable via the proxy."""
    # Fixed log-message typo: "attirbute" -> "attribute".
    logger.info("running:test_get_attribute")
    master = Master(port=8507)
    th = threading.Thread(target=master.run)
    th.start()
    time.sleep(3)
    worker1 = Worker('localhost:8507', 1)

    arg1 = np.random.randint(100)
    arg2 = np.random.randn()
    arg3 = np.random.randn(3, 3)
    parl.connect('localhost:8507')
    actor = Actor(arg1, arg2, arg3)

    self.assertTrue(arg1 == actor.arg1)
    self.assertTrue(arg2 == actor.arg2)
    self.assertTrue((arg3 == actor.arg3).all())

    master.exit()
    worker1.exit()
def create_actors(self):
    """Create actors for parallel training.

    One thread per actor; each actor gets a signal input queue and an output
    queue so the learner can drive it and harvest its results.
    """
    parl.connect(self.config['master_address'])
    self.remote_count = 0
    for _ in range(self.config['actor_num']):
        signal_queue = queue.Queue()
        output_queue = queue.Queue()
        self.actors_signal_input_queues.append(signal_queue)
        self.actors_output_queues.append(output_queue)
        self.remote_count += 1
        # daemon=True replaces the deprecated setDaemon(True).
        remote_thread = threading.Thread(
            target=self.run_remote_sample,
            args=(signal_queue, output_queue),
            daemon=True)
        remote_thread.start()
    logger.info('All remote actors are ready, begin to learn.')
def create_actors(self):
    """Connect to the cluster and launch one sampling thread per actor.

    Each thread reads model parameters from its own queue in
    ``self.params_queues``.
    """
    parl.connect(self.config['master_address'])
    logger.info('Waiting for {} remote actors to connect.'.format(
        self.config['actor_num']))
    # Plain range() replaces six.moves.range — this code only runs on Py3.
    for _ in range(self.config['actor_num']):
        params_queue = queue.Queue()
        self.params_queues.append(params_queue)
        self.remote_count += 1
        logger.info('Remote actor count: {}'.format(self.remote_count))
        # daemon=True replaces the deprecated setDaemon(True).
        remote_thread = threading.Thread(
            target=self.run_remote_sample, args=(params_queue, ), daemon=True)
        remote_thread.start()
    logger.info('All remote actors are ready, begin to learn.')
    self.start_time = time.time()
def test_acor_exit_exceptionally(self):
    """Killing a simulated client frees its cpu for a newly connected client."""
    port = 1337
    master = Master(port)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(1)
    worker1 = Worker('localhost:{}'.format(port), 1)

    # Launch the companion script that connects and occupies the only cpu.
    script = __file__.replace('reset_job_test', 'simulate_client')
    proc = subprocess.Popen([sys.executable, script])

    # Wait up to a minute for the simulated client to claim the cpu.
    for _ in range(6):
        if master.cpu_num == 0:
            break
        time.sleep(10)
    self.assertEqual(master.cpu_num, 0)

    proc.kill()
    parl.connect('localhost:{}'.format(port))
    actor = Actor()  # the freed cpu should now serve this client
    master.exit()
    worker1.exit()
    disconnect()
def test_reset_actor(self):
    """Repeatedly creating and deleting actors must return cpus to the pool."""
    logger.info("running: test_reset_actor")
    master = Master(port=8237)
    master_thread = threading.Thread(target=master.run)
    master_thread.start()
    time.sleep(3)
    worker1 = Worker('localhost:8237', 4)
    parl.connect('localhost:8237')

    for _ in range(10):
        one_shot = Actor()
        self.assertEqual(one_shot.add_one(1), 2)
        del one_shot

    # All four cpus should eventually become vacant again.
    for _ in range(10):
        if master.cpu_num == 4:
            break
        time.sleep(10)
    self.assertEqual(master.cpu_num, 4)

    worker1.exit()
    master.exit()
def actor(cluster_addr):
    """Client process body: create one actor and make it allocate 500MB."""
    parl.connect(cluster_addr)
    greedy_actor = Actor()
    time.sleep(10)
    greedy_actor.add_500mb()
def train():
    """Simulated client: hold one actor and stay connected indefinitely."""
    parl.connect('localhost:1337')
    held_actor = Actor()
    held_actor.add_one(1)
    time.sleep(100000)
from parl.remote.client import disconnect
from parl.remote.master import Master
from parl.remote.worker import Worker
import time
import threading

# Module-level global captured by the remote Actor class below; used to check
# that globals are serialized along with the decorated class.
c = 10
port = 3002

# NOTE(review): `parl` and `unittest` are used below but not imported in the
# visible lines — presumably imported elsewhere in the file; confirm.
if __name__ == '__main__':
    # Stand up a one-cpu local cluster before the tests run.
    master = Master(port=port)
    th = threading.Thread(target=master.run)
    th.setDaemon(True)
    th.start()
    time.sleep(5)
    cluster_addr = 'localhost:{}'.format(port)
    parl.connect(cluster_addr)
    worker = Worker(cluster_addr, 1)

    @parl.remote_class
    class Actor(object):
        def add(self, a, b):
            # Reads the module-level global `c` defined above.
            return a + b + c

    actor = Actor()


class TestRecursive_actor(unittest.TestCase):
    def tearDown(self):
        # Drop the client connection after each test.
        disconnect()
@parl.remote_class
class Actor(object):
    def hello_world(self):
        print("Hello world.")

    def add(self, a, b, f, l):
        # Simulate work, invoke the callback `f`, record `b` in the shared
        # list `l`, then return the sum.
        time.sleep(2)
        f()
        l.append(b)
        return a + b


# Connect to the master node.
parl.connect("localhost:8010")


def thread(i, a, f, l):
    # Thread body: call the remote `add` and print its result.
    ans = a.add(1, i, f, l)
    print(ans)


def f():
    # Callback passed into the remote call above.
    print('f')


def main():
    # NOTE(review): this function is truncated at the visible source
    # boundary — the rest of its body is not shown here.
    ts = []
    #l = []