def play(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    structure = params['structure']
    task_selection = params['task_selection']
    goal_selection = params['goal_selection']
    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
        'render': render
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

def main(policy_file, seed, n_test_rollouts, render):
    """
    run HER from a saved policy

    :param policy_file: (str) pickle path to a saved policy
    :param seed: (int) initial seed
    :param n_test_rollouts: (int) the number of test rollouts
    :param render: (bool) if rendering should be done
    """
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as file_handler:
        policy = pickle.load(file_handler)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

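# A minimal command-line wrapper for the evaluation routine above, given as a
# sketch: it assumes the click library (which the upstream baselines play script
# uses for this purpose) and hypothetical defaults; adjust to the real CLI.
import click


@click.command()
@click.argument('policy_file', type=str)
@click.option('--seed', type=int, default=0, help='seed used during evaluation')
@click.option('--n_test_rollouts', type=int, default=10, help='number of test rollouts')
@click.option('--render', type=int, default=1, help='whether to render the rollouts')
def cli(policy_file, seed, n_test_rollouts, render):
    # Forward the parsed arguments to main() defined above.
    main(policy_file, seed, n_test_rollouts, bool(render))


if __name__ == '__main__':
    cli()
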
def main(policy_file, seed, n_test_rollouts, render, record):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    if record:
        make_env = params['make_env']

        def video_callable(episode_id):
            return True

        def make_record_env():
            env = make_env()
            return gym.wrappers.Monitor(env, '../../../results/video/' + env_name,
                                        force=True, video_callable=video_callable)

        params['make_env'] = make_record_env

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

def main(policy_file, seed, n_test_rollouts, render, with_forces):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['with_forces'] = with_forces
    params['plot_forces'] = False
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'with_forces': with_forces,
        'plot_forces': False,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, k, n, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, k=k, n=n,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          threshold=params['threshold'])

def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)

def main(policy_file, seed, n_test_rollouts, render, exploit, compute_q, collect_data,
         goal_generation, note):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
            params.update(**override_params)
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    goal_generation = params['goal_generation']
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': exploit,                            # eval: True, train: False
        'use_target_net': params['test_with_polyak'],  # eval/train: False
        'compute_Q': compute_q,                        # eval: True, train: False
        'rollout_batch_size': 1,
        'render': render,
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()

    num_skills = params['num_skills']
    if goal_generation == 'Zero':
        generated_goal = np.zeros(evaluator.g.shape)
    else:
        generated_goal = False

    for z in range(num_skills):
        assert (evaluator.rollout_batch_size == 1)
        z_s_onehot = np.zeros([evaluator.rollout_batch_size, num_skills])
        z_s_onehot[0, z] = 1

        base = os.path.splitext(policy_file)[0]
        for i_test_rollouts in range(n_test_rollouts):
            if render == 'rgb_array' or render == 'human':
                imgs, episode = evaluator.generate_rollouts(generated_goal=generated_goal,
                                                            z_s_onehot=z_s_onehot)
                end = '_test_{:02d}_exploit_{}_compute_q_{}_skill_{}.avi'.format(
                    i_test_rollouts, exploit, compute_q, z)
                test_filename = base + end
                save_video(imgs[0], test_filename, lib='cv2')
            else:
                episode = evaluator.generate_rollouts(generated_goal=generated_goal,
                                                      z_s_onehot=z_s_onehot)

            if collect_data:
                end = '_test_{:02d}_exploit_{}_compute_q_{}_skill_{}.txt'.format(
                    i_test_rollouts, exploit, compute_q, z)
                test_filename = base + end
                with open(test_filename, 'w') as file:
                    file.write(json.dumps(episode['o'].tolist()))

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

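# Illustration of the skill one-hot used in the loop above: for a hypothetical
# num_skills = 4 and z = 2, z_s_onehot has shape (rollout_batch_size, num_skills)
# with a single 1 in column z, selecting which skill the policy is conditioned on.
import numpy as np

num_skills, z = 4, 2                    # hypothetical values for illustration
z_s_onehot = np.zeros([1, num_skills])  # rollout_batch_size is asserted to be 1 above
z_s_onehot[0, z] = 1
print(z_s_onehot)                       # [[0. 0. 1. 0.]]
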
def launch(env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           temperature, prioritization, binding, logging, version, dump_buffer, n_cycles,
           rank_method, w_potential, w_linear, w_rotational, clip_energy,
           override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if logging:
        logdir = ('logs/' + str(env_name) + '-temperature' + str(temperature) +
                  '-prioritization' + str(prioritization) + '-replay_strategy' + str(replay_strategy) +
                  '-n_epochs' + str(n_epochs) + '-num_cpu' + str(num_cpu) + '-seed' + str(seed) +
                  '-n_cycles' + str(n_cycles) + '-rank_method' + str(rank_method) +
                  '-w_potential' + str(w_potential) + '-w_linear' + str(w_linear) +
                  '-w_rotational' + str(w_rotational) +
                  '-clip_energy' + str(clip_energy) +
                  '-version' + str(version))
    else:
        logdir = osp.join(tempfile.gettempdir(),
                          datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['temperature'] = temperature
    params['prioritization'] = prioritization
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params['n_batches'] * num_cpu
    params['version'] = version
    params['dump_buffer'] = dump_buffer
    params['n_cycles'] = n_cycles
    params['rank_method'] = rank_method
    params['w_potential'] = w_potential
    params['w_linear'] = w_linear
    params['w_rotational'] = w_rotational
    params['clip_energy'] = clip_energy
    params['n_epochs'] = n_epochs
    params['num_cpu'] = num_cpu

    if params['dump_buffer']:
        params['alpha'] = 0

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          num_cpu=num_cpu, dump_buffer=dump_buffer, w_potential=params['w_potential'],
          w_linear=params['w_linear'], w_rotational=params['w_rotational'],
          rank_method=rank_method, clip_energy=clip_energy)

def launch(env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           binding, logging, version, n_cycles, note, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if logging:
        logdir = ('logs/' + str(env_name) + '-replay_strategy' + str(replay_strategy) +
                  '-n_epochs' + str(n_epochs) + '-num_cpu' + str(num_cpu) + '-seed' + str(seed) +
                  '-n_cycles' + str(n_cycles) + '-version' + str(version) +
                  '-T-' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    else:
        logdir = osp.join(tempfile.gettempdir(),
                          datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()  # use temp folder for other rank
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params['n_batches'] * num_cpu
    params['version'] = version
    params['n_cycles'] = n_cycles
    params['num_cpu'] = num_cpu
    params['note'] = note or params['note']

    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
            params.update(**override_params)

    if params['load_weight']:
        if type(params['load_weight']) is list:
            params['load_weight'] = params['load_weight'][seed]
        base = os.path.splitext(params['load_weight'])[0]
        policy_weight_file = open(base + '_weight.pkl', 'rb')
        pretrain_weights = pickle.load(policy_weight_file)
        policy_weight_file.close()
    else:
        pretrain_weights = None

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, pretrain_weights=pretrain_weights,
                                   clip_return=clip_return)

    render = False
    if params['collect_video']:
        render = 'rgb_array'

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'render': render,
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          num_cpu=num_cpu, collect_data=params['collect_data'],
          collect_video=params['collect_video'], goal_generation=params['goal_generation'],
          num_skills=params['num_skills'], use_skill_n=params['use_skill_n'],
          batch_size=params['_batch_size'], mi_r_scale=params['mi_r_scale'],
          mi_end_epoch=params['mi_end_epoch'], sk_r_scale=params['sk_r_scale'],
          no_train_mi=params['no_train_mi'])

def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    # if rank == 0:
    #     if logdir or logger.get_dir() is None:
    #         logger.configure(dir=logdir)
    # else:
    #     logger.configure()
    # logdir = logger.get_dir()
    # assert logdir is not None
    # os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    # with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
    #     json.dump(params, f)
    # {'Q_lr': 0.001, 'action_l2': 1.0, 'batch_size': 256, 'buffer_size': 1000000, 'clip_obs': 200.0,
    #  'env_name': u'GazeboSmartBotPinc...Kinect-v0', 'hidden': 256, 'layers': 3, 'max_u': 1.0,
    #  'n_batches': 40, 'n_cycles': 50, 'n_test_rollouts': 10,
    #  'network_class': 'baselines.her.actor...torCritic', 'noise_eps': 0.2, ...}
    params = config.prepare_params(params)
    # {'T': 200, '_Q_lr': 0.001, '_action_l2': 1.0, '_batch_size': 256, '_buffer_size': 1000000,
    #  '_clip_obs': 200.0, '_hidden': 256, '_layers': 3, '_max_u': 1.0,
    #  '_network_class': 'baselines.her.actor...torCritic', '_norm_clip': 5, '_norm_eps': 0.01,
    #  '_pi_lr': 0.001, '_polyak': 0.95, ...}
    config.log_params(params, logger=logger)
    # T: 200
    # _Q_lr: 0.001
    # _action_l2: 1.0
    # _batch_size: 256
    # _buffer_size: 1000000
    # _clip_obs: 200.0
    # _hidden: 256
    # _layers: 3
    # _max_u: 1.0
    # _network_class: baselines.her.actor_critic:ActorCritic
    # _norm_clip: 5
    # _norm_eps: 0.01
    # _pi_lr: 0.001
    # _polyak: 0.95
    # _relative_goals: False
    # _scope: ddpg
    # ddpg_params: {'layers': 3, 'norm_clip': 5, 'action_l2': 1.0, 'pi_lr': 0.001, 'norm_eps': 0.01,
    #               'batch_size': 256, 'polyak': 0.95, 'clip_obs': 200.0,
    #               'network_class': 'baselines.her.actor_critic:ActorCritic', 'max_u': 1.0,
    #               'Q_lr': 0.001, 'scope': 'ddpg', 'buffer_size': 1000000, 'hidden': 256,
    #               'relative_goals': False}
    # env_name: GazeboSmartBotPincherKinect-v0
    # gamma: 0.995
    # make_env: <function make_env at 0x7f7de3cb2050>
    # n_batches: 40
    # n_cycles: 50
    # n_test_rollouts: 10
    # noise_eps: 0.2
    # random_eps: 0.3
    # replay_k: 4
    # replay_strategy: future
    # rollout_batch_size: 2
    # test_with_polyak: False

    # if num_cpu == 1:
    #     logger.warn()
    #     logger.warn('*** Warning ***')
    #     logger.warn(
    #         'You are running HER with just a single MPI worker. This will work, but the ' +
    #         'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
    #         'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
    #         'are looking to reproduce those results, be aware of this. Please also refer to ' +
    #         'https://github.com/openai/baselines/issues/314 for further details.')
    #     logger.warn('****************')
    #     logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)

def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params=None, save_policies=True):
    """
    launch training with mpi

    :param env: (str) environment ID
    :param logdir: (str) the log directory
    :param n_epochs: (int) the number of training epochs
    :param num_cpu: (int) the number of CPUs to run on
    :param seed: (int) the initial random seed
    :param replay_strategy: (str) the type of replay strategy ('future' or 'none')
    :param policy_save_interval: (int) the interval with which policy pickles are saved.
        If set to 0, only the best and latest policy will be pickled.
    :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
    :param override_params: (dict) override any parameter for training
    :param save_policies: (bool) whether or not to save the policies
    """
    if override_params is None:
        override_params = {}

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        tf_util.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(folder=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as file_handler:
        json.dump(params, file_handler)
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        # 'use_demo_states': True,
        'compute_q': False,
        'time_horizon': params['time_horizon'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        # 'use_demo_states': False,
        'compute_q': True,
        'time_horizon': params['time_horizon'],
    }

    for name in ['time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'],
          n_batches=params['n_batches'], policy_save_interval=policy_save_interval,
          save_policies=save_policies)

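# Sketch of a command-line entry point for the launcher above. The argument names
# mirror the launch() signature; the environment id and all defaults shown here are
# hypothetical placeholders rather than the original script's CLI definition.
import argparse


def _parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='FetchReach-v1')  # hypothetical env id
    parser.add_argument('--logdir', type=str, default=None)
    parser.add_argument('--n_epochs', type=int, default=50)
    parser.add_argument('--num_cpu', type=int, default=1)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--replay_strategy', type=str, default='future', choices=['future', 'none'])
    parser.add_argument('--policy_save_interval', type=int, default=5)
    parser.add_argument('--clip_return', type=float, default=50.)
    return parser.parse_args()


if __name__ == '__main__':
    args = _parse_args()
    launch(**vars(args))
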
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return,
           normalize_obs, structure, task_selection, goal_selection, goal_replay, task_replay,
           perturb, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure,
    # otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers,
                                        clip_return=clip_return, t_id=i)
                  for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers,
                                       clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger,
                                        unique_task=i, **rollout_params)
                          for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          perturbation_study=perturb, policy_save_interval=policy_save_interval,
          save_policies=save_policies, structure=structure, task_selection=task_selection,
          params=params)

def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)

def run_task(vv, log_dir=None, exp_name=None):
    override_params = {}

    # Fork for multi-CPU MPI implementation.
    if vv['num_cpu'] > 1:
        whoami = mpi_fork(vv['num_cpu'])
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # hack for now, fix later
    log_dir = '/media/part/cmu_ri/deep/deep_RL/data/local/square2d-debug/square2d_debug_2018_06_17/'

    # Configure logging
    if rank == 0:
        if log_dir or logger.get_dir() is None:
            from pathlib import Path
            logger.configure(dir=log_dir, exp_name=exp_name)
    else:
        if log_dir or logger.get_dir() is None:
            from pathlib import Path
            logger.configure(dir=log_dir, exp_name=exp_name)
    logdir = logger.get_dir()
    # logdir = ''  # a quick hack, fix later
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = vv['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = vv['env_name']
    params['replay_strategy'] = vv['replay_strategy']
    params['replay_sample_strategy'] = vv['replay_sample_strategy']
    params['reward_type'] = vv['reward_type']
    params['replay_k'] = vv['replay_k']
    if vv['network'] == 'fc':
        params['network_class'] = 'baselines.her.actor_critic:ActorCritic'
    elif vv['network'] == 'cnn_fc':
        params['network_class'] = 'baselines.her.cnn_actor_critic:CNNActorCritic'
    if vv['env_name'] in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[vv['env_name']])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'variant.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    shapes = config.configure_shapes(params)
    dims = shapes_to_dims(shapes)
    policy = config.configure_ddpg(dims=dims, shapes=shapes, params=params,
                                   clip_return=vv['clip_return'])

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=vv['n_epochs'], n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=vv['policy_save_interval'], save_policies=vv['save_policies'])

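# Sketch of a variant dictionary accepted by run_task() above. The keys are the
# ones the function reads from vv; the values are hypothetical examples, not the
# settings used in the original experiment.
example_vv = {
    'num_cpu': 1,
    'seed': 0,
    'env_name': 'FetchReach-v1',          # hypothetical environment id
    'replay_strategy': 'future',
    'replay_sample_strategy': 'uniform',  # hypothetical value
    'reward_type': 'sparse',
    'replay_k': 4,
    'network': 'fc',                      # 'fc' or 'cnn_fc', as handled above
    'clip_return': 50.,
    'n_epochs': 50,
    'policy_save_interval': 5,
    'save_policies': True,
}
# run_task(example_vv, log_dir=None, exp_name='square2d-debug')  # hypothetical call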