def demo2_continuous_action_space_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones

    '''choose a DRL algorithm'''
    from agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    '''choose environment'''
    "PPO TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    "PPO TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

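# A quick smoke test before committing to a long run: reset and step the wrapped
# env once. This is a sketch that assumes PreprocessEnv keeps the standard gym
# API (reset/step/action_space), as its use above suggests; '_check_env' is a
# hypothetical helper, not part of the library.
def _check_env(env):
    state = env.reset()
    action = env.action_space.sample()  # gym's built-in random action
    state, reward, done, info = env.step(action)
    print(f'state shape: {state.shape}, reward: {reward:.3f}, done: {done}')
    # usage: _check_env(PreprocessEnv(env=gym.make('Pendulum-v0')))
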
def demo3_custom_env_fin_rl():
    '''choose a DRL algorithm'''
    from agent import AgentPPO
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReward: 1.25, UsedTime: 20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReward: 1.50, UsedTime: 80s, FinanceStock-v2"
    from env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eval_len = 1699 - train_len
    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2 ** 8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.repeat_times = 2 ** 4
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 4
    args.if_allow_break = True

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 8
    train_and_evaluate_mp(args)

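# FinanceStockEnv above is passed to the trainer directly (no PreprocessEnv),
# so a "standard env for ElegantRL" must already carry the metadata the trainer
# reads. A minimal skeleton, inferred from the usage above (args.env.max_step)
# plus the attributes such wrappers conventionally expose -- treat the exact
# attribute set as an assumption, and check it against env.py:
class CustomEnvSkeleton:
    def __init__(self):
        self.env_name = 'CustomEnv-v0'
        self.state_dim = 16       # length of the observation vector
        self.action_dim = 4       # length of the action vector
        self.if_discrete = False  # continuous action space
        self.target_reward = 1.5  # lets the evaluator stop training early
        self.max_step = 1024      # read above via args.env.max_step

    def reset(self):
        ...  # return the initial state as a 1D numpy array

    def step(self, action):
        ...  # return (state, reward, done, info_dict) like a gym env
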
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)

    '''choose a DRL algorithm'''
    from agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()

    '''choose environment'''
    "TD3    TotalStep: 3e4, TargetReward: -200, UsedTime: 300s, Pendulum-v0"
    "ModSAC TotalStep: 4e4, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    "TD3    TotalStep:  9e4, TargetReward: 100, UsedTime: 3ks, LunarLanderContinuous-v2"
    "TD3    TotalStep: 20e4, TargetReward: 200, UsedTime: 5ks, LunarLanderContinuous-v2"
    "SAC    TotalStep:  9e4, TargetReward: 200, UsedTime: 3ks, LunarLanderContinuous-v2"
    "ModSAC TotalStep:  5e4, TargetReward: 200, UsedTime: 1ks, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # args.eval_times2 = 2 ** 4  # set a large eval_times to get a precise learning curve

    "ModSAC TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = True  # allow training to break early once the target reward is reached
    # args.break_step = int(2e5 * 4)  # break training after 'total_step > break_step'

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

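# The timing strings above compare TD3, SAC, and ModSAC on the same envs.
# Swapping algorithms only means replacing the agent instance; a sketch,
# assuming the classes named in the import comment take no constructor
# arguments (as AgentModSAC does above):
#     from agent import AgentTD3
#     args.agent = AgentTD3()
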
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones
    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # keep the import from being flagged as unused

    "TotalStep: 1e5, TargetReward: 18, UsedTime:  3ks, ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep: 30e5, TargetReward: 1500, UsedTime:  6ks, AntBulletEnv-v0"
    "TotalStep: 75e5, TargetReward: 2500, UsedTime: 14ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05  # 0.02
    args.agent.lambda_gae_adv = 0.97  # 0.98
    args.agent.if_use_dn = True

    args.net_dim = 2 ** 8
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # alternative: 2 ** 10
    args.repeat_times = 2 ** 3
    args.show_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

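# Note the buffer sizing above: the on-policy PPO run sets
# args.max_memo = args.env.max_step * 4 (a small per-update rollout buffer),
# while the off-policy runs below keep args.max_memo = 2 ** 20 (a persistent
# replay buffer). On-policy data is discarded after each update, so the
# buffer only needs to hold a few episodes' worth of transitions.
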
def demo4_bullet_mujoco_off_policy():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # keep the import from being flagged as unused

    "TotalStep:  5e4, TargetReward: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReward: 25, UsedTime:     s, ReacherBulletEnv-v0"
    # args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    # args.env.max_step = 2 ** 10  # important, default env.max_step=150
    # args.reward_scale = 2 ** 0  # -80 < -30 < 18 < 28
    # args.gamma = 0.96
    # args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    # args.eval_times1 = 2 ** 2
    # args.eval_times2 = 2 ** 5
    # args.if_per = True
    # train_and_evaluate(args)

    "TotalStep:  3e5, TargetReward: 1500, UsedTime:   8ks, AntBulletEnv-v0"
    "TotalStep:  6e5, TargetReward: 2500, UsedTime:  18ks, AntBulletEnv-v0"
    "TotalStep: 20e5, TargetReward: 3000, UsedTime:  80ks, AntBulletEnv-v0"
    "TotalStep: 48e5, TargetReward: 3186, UsedTime: 175ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2 ** -2  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2 ** 20
    args.batch_size = 2 ** 9
    args.show_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

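# Rough replay-memory budget for the Ant run above, assuming float32 storage of
# (state, action, reward, mask) per transition and AntBulletEnv-v0's 28-dim
# state / 8-dim action (values from pybullet_envs; verify locally):
#     (28 + 8 + 2) floats * 4 bytes * 2 ** 20 slots ~= 160 MB
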
def demo4_bullet_mujoco_mpo():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from agent import AgentMPO
    args.agent = AgentMPO()

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # keep the import from being flagged as unused

    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2 ** 0  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2 ** 20
    args.batch_size = 2 ** 9
    args.show_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

def demo4_bullet_mujoco_off_policy_per():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # keep the import from being flagged as unused

    "TotalStep:  5e4, TargetReward: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReward: 25, UsedTime:     s, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.env.max_step = 2 ** 10  # important, default env.max_step=150
    args.reward_scale = 2 ** 0  # -80 < -30 < 18 < 28
    args.gamma = 0.96
    args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5
    args.if_per = True  # use Prioritized Experience Replay

    args.rollout_num = 2
    train_and_evaluate_mp(args)

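# A minimal entry point so the demos can be run from the command line; pick one.
# train_and_evaluate_mp presumably spawns args.rollout_num worker processes via
# multiprocessing, so the __main__ guard matters on platforms that use spawn
# (Windows, macOS).
if __name__ == '__main__':
    demo2_continuous_action_space_off_policy()
    # demo2_continuous_action_space_on_policy()
    # demo3_custom_env_fin_rl()
    # demo4_bullet_mujoco_on_policy()
    # demo4_bullet_mujoco_off_policy()
    # demo4_bullet_mujoco_mpo()
    # demo4_bullet_mujoco_off_policy_per()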