def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from those of off-policy

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # touch the module so the import (which registers the PyBullet envs) is kept

    "TotalStep: 1e5, TargetReturn: 18, UsedTime: 3ks, ReacherBulletEnv-v0, PPO"
    "TotalStep: 1e6, TargetReturn: 18, UsedTime: 30ks, ReacherBulletEnv-v0, PPO"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep: 3e6,  TargetReturn: 1500, UsedTime: 2ks,  AntBulletEnv-v0, PPO"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime: 6ks,  AntBulletEnv-v0, PPO"
    "TotalStep: 46e6, TargetReturn: 3017, UsedTime: 25ks, AntBulletEnv-v0, PPO"
    "TotalStep: 5e6,  TargetReturn: 1500, UsedTime: 3ks,  AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 15e6, TargetReturn: 2500, UsedTime: 10ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 60e6, TargetReturn: 2949, UsedTime: 34ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 2e6,  TargetReturn: 1500, UsedTime: 2ks,  AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime: 7ks,  AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 53e6, TargetReturn: 2834, UsedTime: 35ks, AntBulletEnv-v0, PPO if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # RewardRange: (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # or 2 ** 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 8  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
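# The PPO runs above enable Generalized Advantage Estimation (GAE) via
# `args.agent.if_use_gae = True`, with `lambda_gae_adv` playing the role of
# lambda in GAE(gamma, lambda). Below is a minimal illustrative sketch of the
# estimator, not AgentPPO's actual implementation; the function name and the
# 1-D tensor layout are assumptions.
import torch


def compute_gae_advantage(rewards, dones, values, next_value, gamma=0.96, lam=0.97):
    """rewards, dones, values: 1-D float tensors of length T; next_value: V(s_T)."""
    advantages = torch.zeros_like(rewards)
    last_adv = 0.0
    for t in reversed(range(rewards.shape[0])):
        mask = 1.0 - dones[t]  # zero out bootstrapping at episode ends
        delta = rewards[t] + gamma * next_value * mask - values[t]  # one-step TD error
        last_adv = delta + gamma * lam * mask * last_adv  # exponentially weighted sum of TD errors
        advantages[t] = last_adv
        next_value = values[t]
    return advantages  # critic targets are then advantages + values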
def demo2_continuous_action_space_on_policy():
    """DEMO 2.1: Continuous action env (on-policy)"""
    import elegantrl.agent as agent
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from those of off-policy
    args.agent = agent.AgentGaePPO()  # AgentPPO()

    '''choose environment'''
    # env = gym.make('Pendulum-v0')
    # env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    # args.env = PreprocessEnv(env=env)
    # args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    # "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"

    args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s"

    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96
    # "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
def demo2_continuous_action_space_off_policy():
    """DEMO 2.1: Continuous action env (off-policy)"""
    import elegantrl.agent as agent
    args = Arguments(if_on_policy=False)
    args.agent = agent.AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()

    '''choose environment'''
    # env = gym.make('Pendulum-v0')
    # env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    # args.env = PreprocessEnv(env=env)
    # args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    # "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"

    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"

    args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    args.reward_scale = 2 ** -1  # RewardRange: -200 < -150 < 300 < 334
    args.gamma = 0.95
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 3500s"

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 2
    train_and_evaluate__multiprocessing(args)
def demo2_continuous_action_space_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from those of off-policy

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    '''choose environment'''
    "PPO TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    "PPO TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()

    '''choose environment'''
    "TD3    TotalStep: 3e4, TargetReward: -200, UsedTime: 300s, Pendulum-v0"
    "ModSAC TotalStep: 4e4, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    "TD3    TotalStep: 9e4,  TargetReward: 100, UsedTime: 3ks, LunarLanderContinuous-v2"
    "TD3    TotalStep: 20e4, TargetReward: 200, UsedTime: 5ks, LunarLanderContinuous-v2"
    "SAC    TotalStep: 9e4,  TargetReward: 200, UsedTime: 3ks, LunarLanderContinuous-v2"
    "ModSAC TotalStep: 5e4,  TargetReward: 200, UsedTime: 1ks, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # args.eval_times2 = 2 ** 4  # set a large eval_times to get a precise learning curve

    "ModSAC TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = True  # allow breaking training when the goal is reached (early termination)
    # args.break_step = int(2e5 * 4)  # break training after 'total_step > break_step'

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
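# The off-policy agents above (DDPG, TD3, SAC, ModSAC) all maintain target
# networks that slowly track the current networks via Polyak averaging. A
# minimal sketch of that soft update; the function name and tau value are
# assumptions for illustration, not the library's API.
import torch


def soft_update(target_net: torch.nn.Module, current_net: torch.nn.Module, tau: float = 2 ** -8):
    for tgt, cur in zip(target_net.parameters(), current_net.parameters()):
        tgt.data.copy_(tau * cur.data + (1.0 - tau) * tgt.data)  # theta' <- tau*theta + (1-tau)*theta'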
def demo1_discrete_action_space():
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for hyper-parameters

    '''choose a DRL algorithm'''
    # from elegantrl.agent import AgentD3QN  # AgentDQN, AgentDoubleDQN
    # args.agent = AgentD3QN()
    from elegantrl.agent import AgentDuelingDQN  # AgentDQN, AgentDoubleDQN
    args.agent = AgentDuelingDQN()

    '''choose environment'''
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 20s, CartPole-v0"
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 30s, CartPole-v0 rollout_num = 2"
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7
    # args.target_step = 2 ** 8
    # args.eval_gap = 2 ** 0

    "TotalStep: 6e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, D3QN"
    "TotalStep: 4e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, DuelingDQN"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8

    '''train and evaluate'''
    train_and_evaluate(args)
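# AgentDuelingDQN above splits the Q-network head into a state-value stream and
# an advantage stream: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A minimal
# sketch of such a head; the class name and layer sizes are assumptions for
# illustration, not the library's actual network.
import torch
import torch.nn as nn


class DuelingHead(nn.Module):
    def __init__(self, mid_dim: int, action_dim: int):
        super().__init__()
        self.value = nn.Linear(mid_dim, 1)  # state value V(s)
        self.advantage = nn.Linear(mid_dim, action_dim)  # action advantages A(s, a)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        adv = self.advantage(hidden)
        # subtracting the mean advantage keeps V and A identifiable
        return self.value(hidden) + adv - adv.mean(dim=1, keepdim=True)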
def demo4_bullet_mujoco_off_policy():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # touch the module so the import (which registers the PyBullet envs) is kept

    "TotalStep: 5e4,  TargetReturn: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReturn: 25, UsedTime: s, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.env.max_step = 2 ** 10  # important, default env.max_step = 150
    args.reward_scale = 2 ** 0  # RewardRange: -80 < -30 < 18 < 28
    args.gamma = 0.96
    args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5
    args.if_per = True
    train_and_evaluate(args)

    "TotalStep: 3e5,  TargetReward: 1500, UsedTime: 4ks,  AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep: 4e5,  TargetReward: 2500, UsedTime: 6ks,  AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep: 10e5, TargetReward: 2879, UsedTime: ks,   AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep: 3e5,  TargetReward: 1500, UsedTime: 8ks,  AntBulletEnv-v0 ModSAC if_use_cn"
    "TotalStep: 7e5,  TargetReward: 2500, UsedTime: 18ks, AntBulletEnv-v0 ModSAC if_use_cn"
    "TotalStep: 16e5, TargetReward: 2923, UsedTime: ks,   AntBulletEnv-v0 ModSAC if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2 ** -2  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2 ** 21
    args.batch_size = 2 ** 8
    args.repeat_times = 2 ** 1
    args.eval_gap = 2 ** 9  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
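# `args.if_per = True` above switches the ReplayBuffer to Prioritized
# Experience Replay. A minimal sketch of proportional PER sampling to show the
# idea; alpha, beta, and the flat-array storage are assumptions for
# illustration, not the library's buffer (which would typically use a sum-tree
# for efficiency).
import numpy as np


def per_sample(td_errors, batch_size, alpha=0.6, beta=0.4, eps=1e-6):
    priorities = (np.abs(td_errors) + eps) ** alpha  # larger TD error -> sampled more often
    probs = priorities / priorities.sum()
    indices = np.random.choice(len(td_errors), size=batch_size, p=probs)
    weights = (len(td_errors) * probs[indices]) ** (-beta)  # importance-sampling correction
    weights /= weights.max()  # normalize so weights <= 1 for stable updates
    return indices, weights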
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from those of off-policy

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # touch the module so the import (which registers the PyBullet envs) is kept

    "TotalStep: 1e5, TargetReward: 18, UsedTime: 3ks,  ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep: 2e6,  TargetReward: 1500, UsedTime: 3ks,  AntBulletEnv-v0"
    "TotalStep: 13e6, TargetReward: 2400, UsedTime: 21ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.env.max_step = 2 ** 10
    args.break_step = int(2e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # RewardRange: (-50) 0 ~ 2500 (3340)
    args.max_memo = 2 ** 11
    args.repeat_times = 2 ** 3
    args.batch_size = 2 ** 10
    args.net_dim = 2 ** 9
    args.eval_gap = 2 ** 8  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
def demo1_discrete_action_space():
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for hyper-parameters

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentD3QN  # AgentDQN, AgentDuelingDQN, AgentDoubleDQN
    args.agent = AgentD3QN()

    '''choose environment'''
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7
    "TotalStep: 2e3, TargetReward: , UsedTime: 10s"

    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8
    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s"

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
def demo1_discrete_action_space():
    """DEMO 1: Discrete action env of gym"""
    import elegantrl.agent as agent
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for hyper-parameters

    '''choose a DRL algorithm'''
    # args.agent = agent.AgentDuelingDQN()  # AgentDQN()
    args.agent = agent.AgentD3QN()  # AgentDoubleDQN()

    '''choose environment'''
    args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    # args.net_dim = 2 ** 8

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
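# Every demo wraps the raw gym env in PreprocessEnv, and train_and_evaluate()
# below relies on the attributes that wrapper attaches (env_name, state_dim,
# action_dim, if_discrete, max_step, target_reward). A minimal stand-in showing
# just those attributes; the class name and the attribute-derivation details
# are assumptions for illustration, not ElegantRL's actual wrapper.
import gym


class MinimalPreprocessEnv(gym.Wrapper):  # hypothetical stand-in for PreprocessEnv
    def __init__(self, env):
        super().__init__(env)
        self.env_name = env.unwrapped.spec.id
        self.state_dim = env.observation_space.shape[0]
        self.if_discrete = isinstance(env.action_space, gym.spaces.Discrete)
        self.action_dim = (env.action_space.n if self.if_discrete
                           else env.action_space.shape[0])
        self.max_step = getattr(env, '_max_episode_steps', 2 ** 10)
        self.target_reward = getattr(env.spec, 'reward_threshold', None)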
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()

    '''choose environment'''
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"

    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"

    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = False
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s"

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
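# train_and_evaluate() below calls explore_before_training() to pre-fill the
# replay buffer with random actions before off-policy training starts. A
# minimal sketch matching that call signature; the append_buffer() method and
# the (scaled_reward, gamma_mask, *action) record layout are assumptions about
# ReplayBuffer, not guaranteed library behavior.
import numpy.random as rd


def explore_before_training(env, buffer, target_step, reward_scale, gamma):
    state = env.reset()
    steps = 0
    while steps < target_step:
        if env.if_discrete:
            action = rd.randint(env.action_dim)  # uniform random discrete action
            next_state, reward, done, _ = env.step(action)
            other = (reward * reward_scale, 0.0 if done else gamma, action)
        else:
            action = rd.uniform(-1, 1, size=env.action_dim)  # uniform random continuous action
            next_state, reward, done, _ = env.step(action)
            other = (reward * reward_scale, 0.0 if done else gamma, *action)
        buffer.append_buffer(state, other)  # assumed buffer API
        state = env.reset() if done else next_state
        steps += 1
    return steps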
def train_and_evaluate(args):
    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id  # used by Evaluator as its agent_id

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    if_per = args.if_per
    gamma = args.gamma
    reward_scale = args.reward_scale

    '''evaluating arguments'''
    eval_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if args.env_eval is not None:
        env_eval = args.env_eval
    elif env.env_name in set(gym.envs.registry.env_specs.keys()):
        env_eval = PreprocessEnv(gym.make(env.env_name))
    else:
        env_eval = deepcopy(env)

    del args  # in order to show these hyper-parameters clearly, they are unpacked above

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, if_per)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step, state_dim=state_dim,
                          action_dim=1 if if_discrete else action_dim,
                          if_on_policy=if_on_policy, if_per=if_per, if_gpu=True)

    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_gap=eval_gap, eval_times1=eval_times1, eval_times2=eval_times2)

    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        if getattr(agent, 'act_target', None):
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps

    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        steps = agent.explore_env(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)

        if_reach_goal = evaluator.evaluate_save(agent.act, steps, obj_a, obj_c)
        evaluator.draw_plot()

    print(f'| SavedDir: {cwd}\n| UsedTime: {time.time() - evaluator.start_time:.0f}')
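# Example entry point: pick one demo and run it. The particular demo launched
# here is just a usage sketch, not part of the library.
if __name__ == '__main__':
    demo1_discrete_action_space()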