def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy,
           policy_save_interval, clip_return, override_params={},
           save_policies=True, render=False, max_test=True,
           policy_file="", weight_file="", level=0, train_render=False):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging.
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    if policy_file == "":
        policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    else:
        # Load a pickled policy and re-attach the pieces that do not survive pickling.
        with open(policy_file, 'rb') as f:
            policy = pickle.load(f)
        fn = config.configure_her(params)
        policy.set_sample_transitions(fn)
        policy.set_obs_size(dims)
    if weight_file != "":
        policy.load_weights(weight_file)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }
    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }
    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutStudent(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutStudent(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs,
        n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies,
        render=render, level=level, max_test=max_test,
        train_render=train_render)
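# Usage sketch (hypothetical, not from the source): resuming training from a
# pickled policy with the launch() variant above. The HER transition-sampling
# function is a closure that does not survive pickling, which is why launch()
# re-attaches it with config.configure_her() after loading. The env id, paths,
# and file names below are assumptions for illustration only.
def _resume_from_pickled_policy_sketch():
    launch(
        env_name='FetchReach-v1',        # assumed env id
        logdir='/tmp/her_student',       # hypothetical log directory
        n_epochs=50,
        num_cpu=1,
        seed=0,
        replay_strategy='future',
        policy_save_interval=5,
        clip_return=True,
        policy_file='policy_best.pkl',   # hypothetical pickled-policy path
        weight_file='',                  # or a separate weight file to load
    )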
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy,
           policy_save_interval, clip_return, bc_loss, q_filter, num_demo,
           override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging.
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)
    # Raise the open-file limit; many parallel rollout workers can exhaust the default.
    resource.setrlimit(resource.RLIMIT_NOFILE, (65536, 65536))

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return,
                                   bc_loss=bc_loss, q_filter=q_filter, num_demo=num_demo)

    if params['env_name'] == 'GazeboWAMemptyEnv-v2':
        # Machine-specific demonstration file for the WAM arm.
        demoFileName = '/home/rjangir/wamObjectDemoData/data_wam_double_random_100_40_25.npz'
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
        }
        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            'compute_Q': True,
            'T': params['T'],
        }
        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]
        # Evaluate one rollout at a time; assigned after the loop so the copy
        # from params above does not clobber it.
        eval_params['rollout_batch_size'] = 1

        madeEnv = config.cached_make_env(params['make_env'])
        rollout_worker = RolloutWorker(madeEnv, params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)
        evaluator = RolloutWorker(madeEnv, params['make_env'], policy, dims, logger, **eval_params)
        evaluator.seed(rank_seed)
    else:
        # Machine-specific demonstration file for the Fetch tasks.
        demoFileName = '/home/rjangir/fetchDemoData/data_fetch_random_100.npz'
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
        }
        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            'compute_Q': True,
            'T': params['T'],
        }
        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]

        rollout_worker = RolloutWorkerOriginal(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)
        evaluator = RolloutWorkerOriginal(params['make_env'], policy, dims, logger, **eval_params)
        evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs,
        n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies,
        demo_file_name=demoFileName)
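# Minimal sketch (an assumption, not the baselines implementation) of the
# fork-with-fallback pattern used above: first try relaunching under mpirun
# with extra binding flags, and let the caller fall back to a plain relaunch
# when the local MPI build rejects them. mpi_fork_sketch and the IN_MPI guard
# variable are illustrative names.
import os
import sys
from subprocess import check_call  # raises CalledProcessError on failure

def mpi_fork_sketch(n, extra_mpi_args=()):
    """Relaunch the current script under mpirun with n workers."""
    if n <= 1 or os.environ.get('IN_MPI'):
        return 'child'
    env = os.environ.copy()
    env['IN_MPI'] = '1'  # children see this and skip re-forking
    # A nonzero exit status (e.g. an unknown '--bind-to' flag) raises
    # CalledProcessError, which launch() catches to retry without the flags.
    check_call(['mpirun', '-np', str(n), *extra_mpi_args,
                sys.executable, *sys.argv], env=env)
    return 'parent'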
def learn(*, network, env, total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        # Fall back to single-process values so rank/num_cpu are always defined.
        rank, num_cpu = 0, 1

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.specs[0].id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs
    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    # Prepare params for HAC.
    FLAGS = parse_options()
    FLAGS.layers = 2       # number of levels in the agent hierarchy
    FLAGS.time_scale = 10  # max sequence length in which each policy specializes

    # Max number of atomic actions. This would typically be
    # FLAGS.time_scale ** FLAGS.layers, but the UR5 reacher task uses a
    # shorter episode length.
    max_actions = 1000
    timesteps_per_action = 15  # number of time steps per atomic action

    agent_params = {}
    # Fraction of actions for which a subgoal level (i.e. level i > 0) tests
    # its subgoal actions.
    agent_params["subgoal_test_perc"] = 0.3
    # Penalty for missing a subgoal. By default the Q-value target for a missed
    # subgoal does not include the Q-value of the next state (i.e. discount
    # rate = 0), so the target simply equals the penalty; e.g. in a 3-level UR5
    # setup, a missed subgoal gets a Q target of -10. To incorporate the next
    # state into the penalty, see the "penalize_subgoal" method in layer.py.
    agent_params["subgoal_penalty"] = -FLAGS.time_scale
    # Exploration noise added to both subgoal and atomic actions:
    # Gaussian N(0, noise_percentage * action_dim_range).
    agent_params["atomic_noise"] = [0.1 for i in range(3)]
    agent_params["subgoal_noise"] = [0.03 for i in range(6)]
    # Number of episodes of transitions stored by each level of the hierarchy.
    agent_params["episodes_to_store"] = 500
    # Training alternates between exploration and testing; this is the number
    # of exploration episodes. Testing runs for 100 episodes (see run_HAC.py
    # to change it).
    agent_params["num_exploration_episodes"] = 50

    policy = config.configure_ddpg(dims=dims, params=params, FLAGS=FLAGS,
                                   agent_params=agent_params, reuse=False,
                                   use_mpi=True, clip_return=clip_return)

    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }
    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }
    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    # Build the hierarchical agent; it owns one rollout worker per layer.
    agent = design_agent_and_env(FLAGS=FLAGS, env=env, policy=policy, dims=dims,
                                 logger=logger, rollout_params=rollout_params,
                                 eval_params=eval_params,
                                 agent_params=agent_params, monitor=True)

    # Convert the timestep budget into epochs using the bottom layer's
    # rollout horizon and batch size.
    n_cycles = params['n_cycles']
    n_epochs = (total_timesteps // n_cycles
                // agent.layers[0].rollout_worker.T
                // agent.layers[0].rollout_worker.rollout_batch_size)

    return HAC_train(
        env=env,
        agent=agent,
        policy=policy,
        n_epochs=n_epochs,
        n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'],
        n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval,
        save_path=save_path,
        demo_file=demo_file,
        FLAGS=FLAGS)
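# Worked example (illustrative numbers, not from the source) of the epoch
# arithmetic above: with T = 50 steps per rollout, rollout_batch_size = 2
# parallel envs, and n_cycles = 50 cycles per epoch, one epoch consumes
# 50 * 50 * 2 = 5000 timesteps, so a budget of 1,000,000 timesteps yields
# 1_000_000 // 50 // 50 // 2 = 200 epochs.
def epochs_for_budget(total_timesteps, n_cycles, T, rollout_batch_size):
    return total_timesteps // n_cycles // T // rollout_batch_size

assert epochs_for_budget(1_000_000, 50, 50, 2) == 200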
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy,
           policy_save_interval, clip_return, demo_file,
           override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging.
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)
    # Raise the open-file limit; many parallel rollout workers can exhaust the default.
    resource.setrlimit(resource.RLIMIT_NOFILE, (65536, 65536))

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    if params['env_name'] == 'FetchPickAndPlace-v0':
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
            'render': 1,
        }
        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            'compute_Q': True,
            'T': params['T'],
            'render': 1,
        }
        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]
        # Evaluate one rollout at a time; assigned after the loop so the copy
        # from params above does not clobber it.
        eval_params['rollout_batch_size'] = 1

        madeEnv = config.cached_make_env(params['make_env'])
        rollout_worker = RolloutWorker(madeEnv, params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)
        evaluator = RolloutWorker(madeEnv, params['make_env'], policy, dims, logger, **eval_params)
        evaluator.seed(rank_seed)
    else:
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
            'render': 1,
        }
        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            'compute_Q': True,
            'T': params['T'],
            'render': 1,
        }
        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]

        rollout_worker = RolloutWorkerOriginal(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)
        evaluator = RolloutWorkerOriginal(params['make_env'], policy, dims, logger, **eval_params)
        evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs,
        n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies,
        demo_file=demo_file)
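# Side note (illustration with hypothetical values): why the eval batch size is
# assigned after the copy loop in the branches above. The per-key loop runs
# last, so any key it shares with the dict literal silently wins; putting
# 'rollout_batch_size': 1 in the literal and then copying the same key from
# params would discard the 1.
params = {'rollout_batch_size': 2}
eval_params = {'rollout_batch_size': 1}
for name in ['rollout_batch_size']:
    eval_params[name] = params[name]
assert eval_params['rollout_batch_size'] == 2  # the literal's 1 was clobbered
eval_params['rollout_batch_size'] = 1          # re-assigning restores the intent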