import sys

import gym

# NOTE: the import paths below are assumed from the `elegantrl2` package layout
# used in these demos; adjust them if Arguments/PreprocessEnv/train_and_evaluate
# live elsewhere in your copy.
from elegantrl2.run import Arguments, train_and_evaluate, train_and_evaluate_mp, train_and_evaluate_mg
from elegantrl2.env import PreprocessEnv


def demo_continuous_action_off_policy():
    args = Arguments()
    args.gpu_id = sys.argv[-1][-4]  # e.g. run as `demo_0.py`: the char before `.py` selects the GPU
    from elegantrl2.tutorial.agent import AgentSAC  # AgentDDPG AgentTD3
    args.agent = AgentSAC()

    '''choose environment'''
    if_train_pendulum = 1
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2**7
        args.batch_size = args.net_dim
        args.target_step = args.env.max_step * 4

    if_train_lunar_lander = 0
    if if_train_lunar_lander:
        "TotalStep: 4e5, TargetReward: 200, UsedTime: 900s"
        args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
        args.reward_scale = 2**0  # RewardRange: -800 < -200 < 200 < 302

    if_train_bipedal_walker = 0
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
        args.reward_scale = 2**0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.if_per_or_gae = True

    '''train and evaluate'''
    train_and_evaluate(args)
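# A minimal sketch of what `reward_scale` does, assuming the usual ElegantRL
# convention that the raw environment reward is multiplied by `reward_scale`
# before being stored in the replay buffer, so that Q-value targets stay in a
# small range. This helper is illustrative only, not part of the library.
def _scale_reward_example():
    reward_scale = 2**-3        # the value chosen above for Pendulum-v0
    raw_reward = -16.0          # a typical worst-case per-step Pendulum reward (illustrative)
    stored_reward = raw_reward * reward_scale
    return stored_reward        # -2.0, a magnitude that SAC's critics handle comfortably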
def demo_discrete_action_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy
    from elegantrl2.tutorial.agent import AgentDiscretePPO
    args.agent = AgentDiscretePPO()

    '''choose environment'''
    if_train_cart_pole = 0
    if if_train_cart_pole:
        "TotalStep: 5e4, TargetReward: 200, UsedTime: 60s"
        args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
        args.net_dim = 2**7
        args.batch_size = args.net_dim * 2
        args.repeat_times = 2**4
        args.target_step = args.env.max_step * 8
        args.if_per_or_gae = True

    if_train_lunar_lander = 1
    if if_train_lunar_lander:
        "TotalStep: 2e5, TargetReturn: 200, UsedTime: 400s, LunarLander-v2, PPO"
        args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
        args.agent.cri_target = False
        args.reward_scale = 2**-1
        args.net_dim = 2**8
        args.batch_size = args.net_dim * 4
        args.target_step = args.env.max_step * 4
        args.repeat_times = 2**5
        args.if_per_or_gae = True

    '''train and evaluate'''
    train_and_evaluate(args)
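# Rough relationship between the on-policy hyper-parameters above, assuming
# ElegantRL's PPO-style update loop (exact details may differ between versions):
# each round collects about `target_step` transitions, then performs roughly
# `target_step * repeat_times / batch_size` minibatch gradient steps on them.
def _ppo_update_budget_example():
    max_step = 1000                          # LunarLander-v2 episode cap (illustrative)
    target_step = max_step * 4               # transitions collected per round, as set above
    batch_size = 2**8 * 4                    # net_dim * 4, as set above
    repeat_times = 2**5
    gradient_steps = target_step * repeat_times // batch_size
    return gradient_steps                    # -> 125 minibatch updates per collection round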
def demo_continuous_action_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy
    from elegantrl2.tutorial.agent import AgentPPO
    args.agent = AgentPPO()

    '''choose environment'''
    if_train_pendulum = 0
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2**7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    if_train_lunar_lander = 0
    if if_train_lunar_lander:
        "TotalStep: 4e5, TargetReward: 200, UsedTime: 900s"
        args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
        args.reward_scale = 2**0  # RewardRange: -800 < -200 < 200 < 302

    if_train_bipedal_walker = 1
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
        args.reward_scale = 2**0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.if_per_or_gae = True
        # args.agent.lambda_entropy = 0.05

    '''train and evaluate'''
    train_and_evaluate(args)
def demo_discrete_action_off_policy():
    args = Arguments()
    from elegantrl2.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()

    '''choose environment'''
    if_train_cart_pole = 0
    if if_train_cart_pole:
        "TotalStep: 5e4, TargetReward: 200, UsedTime: 60s"
        args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
        args.net_dim = 2**7
        args.target_step = args.env.max_step * 2

    if_train_lunar_lander = 1
    if if_train_lunar_lander:
        "TotalStep: 2e5, TargetReturn: 200, UsedTime: 400s, LunarLander-v2, PPO"
        args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
        args.net_dim = 2**8
        args.batch_size = args.net_dim

    '''train and evaluate'''
    train_and_evaluate(args)
def demo_discrete_action_off_policy_d3qn():  # D3QN variant of the discrete off-policy demo above
    args = Arguments()
    from elegantrl2.agent import AgentD3QN as Agent  # AgentDQN
    args.agent = Agent()

    '''choose environment'''
    if_train_cart_pole = 0
    if if_train_cart_pole:
        "TotalStep: 5e4, TargetReward: 200, UsedTime: 60s"
        args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
        args.net_dim = 2**7
        args.target_step = args.env.max_step * 2
        train_and_evaluate(args)

    if_train_lunar_lander = 1
    if if_train_lunar_lander:
        "TotalStep: 2e5, TargetReturn: 200, UsedTime: 400s, LunarLander-v2, PPO"
        args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
        args.net_dim = 2**8
        args.batch_size = args.net_dim
        train_and_evaluate(args)

    if_train_lunar_lander_mp = 0
    if if_train_lunar_lander_mp:
        "TotalStep: 36e4, EpisodeReturn: 200, UsedTime: 1181s, LunarLander-v2, D3QN"
        "TotalStep: 52e4, EpisodeReturn: 260, UsedTime: 2148s, LunarLander-v2, D3QN"
        args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
        args.net_dim = 2**8
        args.batch_size = args.net_dim
        args.target_step = args.env.max_step * 1
        args.worker_num = 4
        args.reward_scale = 2**-4
        args.max_memo = 2**19
        args.break_step = 2**19
        args.if_allow_break = False
        args.gpu_id = sys.argv[-1]
        train_and_evaluate_mp(args)
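# A rough sense of scale for the multi-process branch above, assuming each of the
# `worker_num` workers contributes on the order of `target_step` transitions per
# collection round (the exact split depends on the train_and_evaluate_mp version):
def _d3qn_mp_budget_example():
    max_step = 1000                              # LunarLander-v2 episode cap (illustrative)
    target_step = max_step * 1
    worker_num = 4
    steps_per_round = target_step * worker_num   # ~4_000 new transitions per round
    max_memo = 2**19                             # replay buffer holds ~524_288 transitions
    break_step = 2**19                           # training stops after ~524_288 total env steps
    rounds_to_break = break_step // steps_per_round
    return steps_per_round, rounds_to_break      # ~131 collection rounds before the break_step cap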
def demo_custom_env_finance_rl_nas89():  # 1.7+ 2.0+
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy
    args.random_seed = 19430

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.lambda_entropy = 0.02

    from envs.FinRL.StockTrading import StockEnvNAS89
    args.gamma = 0.999
    args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma, turbulence_thresh=30)
    args.eval_env = StockEnvNAS89(if_eval=True, gamma=args.gamma, turbulence_thresh=15)

    args.net_dim = 2**9
    args.repeat_times = 2**4
    args.learning_rate = 2**-14
    args.batch_size = args.net_dim * 4

    args.eval_gap = 2**8
    args.eval_times1 = 2**0
    args.eval_times2 = 2**1
    args.break_step = int(8e6)
    args.if_allow_break = False

    if_single_proc = 0
    if if_single_proc:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 4
        train_and_evaluate(args)

    if_single_env = 1
    if if_single_env:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 1
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.gpu_id = (2, 3) if len(sys.argv) == 1 else eval(sys.argv[-1])  # python main.py -GPU 0,1
        args.repeat_times = 2**4
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mg(args)

    if_batch_env = 0
    if if_batch_env:
        from envs.FinRL.StockTrading import StockVecEnvNAS89
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += args.gpu_id
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mp(args)
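# A minimal sketch of an entry point for picking one demo to run. Note that
# several demos read the GPU id from `sys.argv[-1][-4]`, i.e. the fourth-from-last
# character of the last command-line token (often the script name itself, so a
# file named `demo_0.py` selects GPU 0).
if __name__ == '__main__':
    demo_continuous_action_off_policy()
    # demo_discrete_action_on_policy()
    # demo_continuous_action_on_policy()
    # demo_discrete_action_off_policy()
    # demo_discrete_action_off_policy_d3qn()
    # demo_custom_env_finance_rl_nas89()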