def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 1e5, TargetReturn: 18, UsedTime: 3ks, ReacherBulletEnv-v0, PPO"
    "TotalStep: 1e6, TargetReturn: 18, UsedTime: 30ks, ReacherBulletEnv-v0, PPO"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep: 3e6, TargetReturn: 1500, UsedTime: 2ks, AntBulletEnv-v0, PPO"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime: 6ks, AntBulletEnv-v0, PPO"
    "TotalStep: 46e6, TargetReturn: 3017, UsedTime: 25ks, AntBulletEnv-v0, PPO"
    "TotalStep: 5e6, TargetReturn: 1500, UsedTime: 3ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 15e6, TargetReturn: 2500, UsedTime: 10ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 60e6, TargetReturn: 2949, UsedTime: 34ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 2e6, TargetReturn: 1500, UsedTime: 2ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime: 7ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 53e6, TargetReturn: 2834, UsedTime: 35ks, AntBulletEnv-v0, PPO if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy
    args.random_seed = 104367

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 6e6, TargetReturn: 1500, UsedTime: 5ks, HumanoidBulletEnv-v0, PPO"
    "TotalStep: 12e6, TargetReturn: 2500, UsedTime: 10ks, HumanoidBulletEnv-v0, PPO"
    "TotalStep: 51e6, TargetReturn: 3077, UsedTime: 40ks, HumanoidBulletEnv-v0, PPO"
    args.env = PreprocessEnv(env=gym.make('HumanoidBulletEnv-v0'))
    args.env.target_return = 2500

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -3  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 9  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep: 5e4, TargetReward: 1.25, UsedTime: 20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReward: 1.50, UsedTime: 80s, FinanceStock-v2"
    from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2 ** 8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.repeat_times = 2 ** 4
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 4
    args.if_allow_break = True

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 8
    train_and_evaluate_mp(args)
def demo2_continuous_action_space_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    '''choose environment'''
    "PPO TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    "PPO TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
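# A minimal entry point for the demos above (a sketch; it assumes the module-level imports
# shown at the end of this file, i.e. gym, PreprocessEnv, Arguments, train_and_evaluate and
# train_and_evaluate_mp, and that the demo functions are collected in one module):
if __name__ == '__main__':
    demo2_continuous_action_space_on_policy()
    # demo3_custom_env_fin_rl()
    # demo4_bullet_mujoco_on_policy()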
def run__demo():
    import gym
    import neo_finrl
    gym.logger.set_level(40)  # Block warning: 'WARN: Box bound precision lowered by casting to float32'

    """DEMO 3: Custom Continuous action env: FinanceStock-v1"""
    args = Arguments(if_on_policy=True)

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()

    # from Env import FinanceMultiStockEnv
    # args.env = FinanceMultiStockEnv(if_train=True, train_beg=0, train_len=1024)
    # args.env_eval = FinanceMultiStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.env = gym.make('tradingEnv-v0')
    args.env_eval = gym.make('tradingEnv-v0')
    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 <
    args.break_step = int(5e6)
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.if_allow_break = False

    "TotalStep: 2e5, TargetReward: 1.25, UsedTime: 200s"
    "TotalStep: 4e5, TargetReward: 1.50, UsedTime: 400s"
    "TotalStep: 10e5, TargetReward: 1.62, UsedTime: 1000s"

    '''train and evaluate'''
    train_and_evaluate(args)
    # args.rollout_num = 8
    # train_and_evaluate__multiprocessing(args)  # try multiprocessing in the complete version
    exit()
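# run__demo() plugs a raw gym.make('tradingEnv-v0') env into ElegantRL without PreprocessEnv,
# so the env itself must expose the attributes the trainer reads elsewhere in this file
# (max_step here; state_dim, action_dim and target_return in the other demos). A hypothetical
# wrapper sketch for an env that lacks them, assuming Box observation/action spaces and a
# module-level gym import; the numeric values are placeholders only:
class TradingEnvWrapper(gym.Wrapper):  # hypothetical helper, not part of ElegantRL
    def __init__(self, env):
        super().__init__(env)
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.max_step = 1024       # placeholder episode length
        self.target_return = 1.5   # placeholder; compare the TargetReward notes above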
def check_stock_trading_env():
    if_eval = True  # False
    env = StockTradingEnv(if_eval=if_eval)
    action_dim = env.action_dim

    state = env.reset()
    print('state_dim', len(state))

    done = False
    step = 1
    reward = None
    from time import time
    timer = time()
    while not done:
        action = rd.rand(action_dim) * 2 - 1  # uniform random action in [-1, 1)
        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        step += 1

    print(f"step: {step}, UsedTime: {time() - timer:.3f}")
    print(f"terminal reward {reward:.3f}")
    print(f"episode return {env.episode_return:.3f}")

    '''draw_cumulative_return'''
    from elegantrl.agent import AgentPPO
    from elegantrl.run import Arguments
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.init_before_training()

    args.agent.save_load_model(cwd='./AgentPPO/StockTradingEnv-v1_0', if_save=False)
    env.draw_cumulative_return(args, torch)
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    "TotalStep: 10e4, TargetReturn: 3.0, UsedTime: 200s, FinanceStock-v1"
    "TotalStep: 20e4, TargetReturn: 4.0, UsedTime: 400s, FinanceStock-v1"
    "TotalStep: 30e4, TargetReturn: 4.2, UsedTime: 600s, FinanceStock-v1"
    from envs.FinRL.StockTrading import StockTradingEnv
    gamma = 0.995
    args.env = StockTradingEnv(if_eval=False, gamma=gamma)
    args.env_eval = StockTradingEnv(if_eval=True, gamma=gamma)

    args.gamma = gamma
    args.break_step = int(3e5)
    args.net_dim = 2 ** 9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2 ** 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 4
    args.eval_times1 = 2 ** 3
    args.eval_times2 = 2 ** 5
    args.if_allow_break = False

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
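# After train_and_evaluate_mp() finishes, the saved PPO actor can be re-loaded and its
# cumulative return plotted on the evaluation split, following the same pattern as
# check_stock_trading_env() above. A sketch; the cwd path and gamma mirror the demos in
# this file and may differ on your run:
def draw_cumulative_return_of_trained_ppo():
    import torch
    from elegantrl.agent import AgentPPO
    from elegantrl.run import Arguments
    from envs.FinRL.StockTrading import StockTradingEnv

    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.init_before_training()
    args.agent.save_load_model(cwd='./AgentPPO/StockTradingEnv-v1_0', if_save=False)

    env_eval = StockTradingEnv(if_eval=True, gamma=0.995)
    env_eval.draw_cumulative_return(args, torch)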
def get_video_to_watch_gym_render():
    import os
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  # env.render() bug in gym==0.18, pyglet==1.6
    import torch

    '''choose env'''
    # from elegantrl.env import PreprocessEnv
    env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))

    '''choose algorithm'''
    from elegantrl.agent import AgentPPO
    agent = AgentPPO()
    net_dim = 2 ** 8
    cwd = 'AgentPPO/BipedalWalker-v3_2/'

    # from elegantrl.agent import AgentModSAC
    # agent = AgentModSAC()
    # net_dim = 2 ** 7
    # cwd = 'AgentModSAC/BipedalWalker-v3_2/'

    '''initialize agent'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    agent.init(net_dim, state_dim, action_dim)
    agent.save_load_model(cwd=cwd, if_save=False)

    '''initialize evaluate and env.render()'''
    device = agent.device
    save_frame_dir = 'frames'
    save_video = 'gym_render.mp4'

    os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    for i in range(1024):
        frame = env.render('rgb_array')
        cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
        # cv2.imshow('', frame)
        # cv2.waitKey(1)

        s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
        a_tensor = agent.act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]  # if used inside 'with torch.no_grad()', '.detach()' is not needed
        # action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        if done:
            state = env.reset()
        else:
            state = next_state
    env.close()

    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
    os.system(f'ffmpeg -r 60 -f image2 -s 600x400 -i {save_frame_dir}/%06d.png '
              f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
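# An alternative to the frames-plus-ffmpeg pipeline above: write the video directly with
# OpenCV. A sketch, assuming a list of fixed-size RGB frames as returned by
# env.render('rgb_array'); the function name, fps and file name are illustrative:
def write_frames_to_video(frames, save_video='gym_render.mp4', fps=60):
    import cv2
    height, width = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(save_video, fourcc, fps, (width, height))
    for frame in frames:
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR channel order
    writer.release()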
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep: 5e4, TargetReturn: 1.25, UsedTime: 20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReturn: 1.50, UsedTime: 80s, FinanceStock-v2"
    # from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    # args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    # args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len

    from finrl.config import config
    from beta3 import StockTradingEnv, load_stock_trading_data

    train_df, eval_df = load_stock_trading_data()
    # train = data_split(processed_df, config.START_DATE, config.START_TRADE_DATE)
    # trade = data_split(processed_df, config.START_TRADE_DATE, config.END_DATE)

    # calculate state and action space
    stock_dimension = len(train_df.tic.unique())
    state_space = 1 + (2 + len(config.TECHNICAL_INDICATORS_LIST)) * stock_dimension

    env_kwargs = {
        "max_stock": 100,
        "initial_amount": 1000000,
        "buy_cost_pct": 0.001,
        "sell_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 2 ** -14,
    }
    args.env = StockTradingEnv(df=train_df, **env_kwargs)
    args.env_eval = StockTradingEnv(df=eval_df, **env_kwargs)

    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2 ** 8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.repeat_times = 2 ** 4
    args.eval_times1 = 2 ** 1
    args.eval_times2 = 2 ** 3
    args.if_allow_break = True

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
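# A worked example of the state_space formula above, with hypothetical numbers (30 tickers
# and 8 technical indicators): presumably one cash-balance entry plus, per ticker, a price,
# a holding and the indicator values.
_example_stock_dim = 30      # assumed for illustration
_example_n_indicators = 8    # assumed for illustration
_example_state_space = 1 + (2 + _example_n_indicators) * _example_stock_dim
assert _example_state_space == 301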
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy are different from off-policy

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 1e5, TargetReward: 18, UsedTime: 3ks, ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep: 2e6, TargetReward: 1500, UsedTime: 3ks, AntBulletEnv-v0"
    "TotalStep: 13e6, TargetReward: 2400, UsedTime: 21ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.env.max_step = 2 ** 10

    args.break_step = int(2e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = 2 ** 11
    args.repeat_times = 2 ** 3
    args.batch_size = 2 ** 10
    args.net_dim = 2 ** 9
    args.show_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
def get_video_to_watch_gym_render():
    import os
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  # env.render() bug in gym==0.18, pyglet==1.6
    import torch

    '''choose env'''
    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)
    # from elegantrl.env import PreprocessEnv
    env_name = ['BipedalWalker-v3', 'AntBulletEnv-v0', 'KukaBulletEnv-v0',
                'ReacherBulletEnv-v0', 'PusherBulletEnv-v0',
                "ThrowerBulletEnv-v0", "StrikerBulletEnv-v0"][1]
    env = PreprocessEnv(env=gym.make(env_name))

    '''initialize agent'''
    agent = None

    from elegantrl.agent import AgentPPO
    agent = AgentPPO()
    agent.if_use_dn = True
    net_dim = 2 ** 8
    cwd = f'./{env_name}_4/'

    # from elegantrl.agent import AgentModSAC
    # agent = AgentModSAC()
    # agent.if_use_dn = True
    # net_dim = 2 ** 8
    # cwd = f'./{env_name}_2/'

    device = None
    if agent is not None:
        state_dim = env.state_dim
        action_dim = env.action_dim
        agent.init(net_dim, state_dim, action_dim)
        agent.save_load_model(cwd=cwd, if_save=False)
        device = agent.device

    rd.seed(194686)
    torch.manual_seed(1942876)

    '''initialize evaluate and env.render()'''
    save_frame_dir = 'frames'
    if save_frame_dir:
        os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    episode_return = 0
    step = 0
    for i in range(2 ** 9):
        if i % 128 == 0:
            print(i)
        for j in range(1):
            if agent is not None:
                s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
                a_tensor = agent.act(s_tensor)
                action = a_tensor.detach().cpu().numpy()[0]  # if used inside 'with torch.no_grad()', '.detach()' is not needed
            else:
                action = env.action_space.sample()

            next_state, reward, done, _ = env.step(action)
            episode_return += reward
            step += 1

            if done:
                print(f'{i:>6}, {step:6.0f}, {episode_return:8.3f}, {reward:8.3f}')
                state = env.reset()
                episode_return = 0
                step = 0
            else:
                state = next_state

        frame = env.render('rgb_array')
        frame = frame[50:210, 50:270]  # (240, 320) AntPyBulletEnv-v0
        # frame = cv2.resize(frame[:, :500], (500 // 2, 720 // 2))
        cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
        cv2.imshow('', frame)
        cv2.waitKey(1)

    env.close()
    # exit()

    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    if save_frame_dir:
        frame_shape = cv2.imread(f'{save_frame_dir}/{3:06}.png').shape
        print(f"frame_shape: {frame_shape}")

        save_video = 'gym_render.mp4'
        print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
        os.system(f'ffmpeg -r 60 -f image2 -s {frame_shape[1]}x{frame_shape[0]} '  # ffmpeg -s expects WIDTHxHEIGHT; cv2 shape is (height, width, channels)
                  f'-i ./{save_frame_dir}/%06d.png '
                  f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
from elegantrl.agent import AgentPPO
import gym
from elegantrl.env import PreprocessEnv
from elegantrl.run import Arguments, train_and_evaluate, train_and_evaluate_mp

if __name__ == '__main__':
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    from kuka_cam_reach_env import KukaCamReachEnv, CustomSkipFrame

    env_config = {
        "is_render": False,
        "is_good_view": False,
        "max_steps_one_episode": 1000,
    }
    args.env = CustomSkipFrame(KukaCamReachEnv(config=env_config))

    args.gamma = 0.995
    args.break_step = int(3e5)
    args.net_dim = 2 ** 9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2 ** 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 4
    args.eval_times1 = 2 ** 3
    args.eval_times2 = 2 ** 5
    args.if_allow_break = False
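    # The snippet above stops after configuring `args`. A minimal completion, mirroring the
    # training calls in the other demos of this file (the rollout_num value is assumed):
    args.rollout_num = 4
    train_and_evaluate_mp(args)  # or train_and_evaluate(args) for a single process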