Example 1
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)
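    # the import above registers the Bullet envs with gym; dir() merely keeps the import from looking unused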

    "TotalStep: 1e5, TargetReturn: 18, UsedTime:  3ks, ReacherBulletEnv-v0, PPO"
    "TotalStep: 1e6, TargetReturn: 18, UsedTime: 30ks, ReacherBulletEnv-v0, PPO"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True  # use Generalized Advantage Estimation (GAE)

    args.break_step = int(2e5 * 8)
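    # reward_scale multiplies the raw env reward before it is stored, keeping returns in a numerically friendly range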
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep:  3e6, TargetReturn: 1500, UsedTime:  2ks, AntBulletEnv-v0, PPO"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime:  6ks, AntBulletEnv-v0, PPO"
    "TotalStep: 46e6, TargetReturn: 3017, UsedTime: 25ks, AntBulletEnv-v0, PPO"
    "TotalStep:  5e6, TargetReturn: 1500, UsedTime:  3ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 15e6, TargetReturn: 2500, UsedTime: 10ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 60e6, TargetReturn: 2949, UsedTime: 34ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep:  2e6, TargetReturn: 1500, UsedTime:  2ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime:  7ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 53e6, TargetReturn: 2834, UsedTime: 35ks, AntBulletEnv-v0, PPO if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
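    # max_memo: capacity of the rollout buffer; here roughly four full episodes of transitions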
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example 2
def demo4_bullet_mujoco_on_policy():
    args = Arguments(
        if_on_policy=True
    )  # on-policy hyper-parameters differ from off-policy ones
    args.random_seed = 104367

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep:  6e6, TargetReturn: 1500, UsedTime:  5ks, HumanoidBulletEnv-v0, PPO"
    "TotalStep: 12e6, TargetReturn: 2500, UsedTime: 10ks, HumanoidBulletEnv-v0, PPO"
    "TotalStep: 51e6, TargetReturn: 3077, UsedTime: 40ks, HumanoidBulletEnv-v0, PPO"
    args.env = PreprocessEnv(env=gym.make('HumanoidBulletEnv-v0'))
    args.env.target_return = 2500

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2**-3  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2**11  # 10
    args.repeat_times = 2**3
    args.eval_gap = 2**9  # for Recorder
    args.eva_size1 = 2**1  # for Recorder
    args.eva_size2 = 2**3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example 3
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO
    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReward: 1.25, UsedTime:  20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReward: 1.50, UsedTime:  80s, FinanceStock-v2"
    from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(
        if_train=False, train_beg=0,
        train_len=1024)  # eva_len = 1699 - train_len
    args.reward_scale = 2**0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2**8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2**11
    args.repeat_times = 2**4
    args.eval_times1 = 2**2
    args.eval_times2 = 2**4
    args.if_allow_break = True
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 8
    train_and_evaluate_mp(args)
Example 4
def demo2_continuous_action_space_on_policy():
    args = Arguments(
        if_on_policy=True
    )  # on-policy hyper-parameters differ from off-policy ones
    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    '''choose environment'''
    "PPO    TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0

    "PPO    TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO    TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example 5
def run__demo():
    import gym
    import neo_finrl
    gym.logger.set_level(
        40
    )  # Block warning: 'WARN: Box bound precision lowered by casting to float32'
    """DEMO 3: Custom Continuous action env: FinanceStock-v1"""
    args = Arguments(if_on_policy=True)
    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()

    # from Env import FinanceMultiStockEnv

    # args.env = FinanceMultiStockEnv(if_train=True, train_beg=0, train_len=1024)
    # args.env_eval = FinanceMultiStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.env = gym.make('tradingEnv-v0')
    args.env_eval = gym.make('tradingEnv-v0')
    args.reward_scale = 2**0  # RewardRange: 0 < 1.0 < 1.25 <
    args.break_step = int(5e6)
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2**11
    args.if_allow_break = False
    "TotalStep:  2e5, TargetReward: 1.25, UsedTime:  200s"
    "TotalStep:  4e5, TargetReward: 1.50, UsedTime:  400s"
    "TotalStep: 10e5, TargetReward: 1.62, UsedTime: 1000s"
    '''train and evaluate'''
    train_and_evaluate(args)
    # args.rollout_num = 8
    # train_and_evaluate__multiprocessing(args)  # try multiprocessing in complete version
    exit()
Example 6
def check_stock_trading_env():
    if_eval = True  # False

    env = StockTradingEnv(if_eval=if_eval)
    action_dim = env.action_dim

    state = env.reset()
    print('state_dim', len(state))

    done = False
    step = 1
    reward = None
    from time import time
    import numpy.random as rd  # NumPy random; used for the uniform random actions below
    timer = time()
    while not done:
        action = rd.rand(action_dim) * 2 - 1  # uniform random action in [-1, 1)
        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        step += 1

    print(f"step: {step}, UsedTime: {time() - timer:.3f}")
    print(f"terminal reward {reward:.3f}")
    print(f"episode return {env.episode_return:.3f}")
    '''draw_cumulative_return'''
    import torch
    from elegantrl.agent import AgentPPO
    from elegantrl.run import Arguments
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()

    args.init_before_training()
    args.agent.save_load_model(cwd='./AgentPPO/StockTradingEnv-v1_0',
                               if_save=False)

    env.draw_cumulative_return(args, torch)
Example 7
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO
    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    "TotalStep: 10e4, TargetReturn: 3.0, UsedTime:  200s, FinanceStock-v1"
    "TotalStep: 20e4, TargetReturn: 4.0, UsedTime:  400s, FinanceStock-v1"
    "TotalStep: 30e4, TargetReturn: 4.2, UsedTime:  600s, FinanceStock-v1"
    from envs.FinRL.StockTrading import StockTradingEnv
    gamma = 0.995
    args.env = StockTradingEnv(if_eval=False, gamma=gamma)
    args.env_eval = StockTradingEnv(if_eval=True, gamma=gamma)

    args.gamma = gamma
    args.break_step = int(3e5)
    args.net_dim = 2**9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2**10
    args.repeat_times = 2**3
    args.eval_gap = 2**4
    args.eval_times1 = 2**3
    args.eval_times2 = 2**5
    args.if_allow_break = False
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example 8
def get_video_to_watch_gym_render():
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  # env.render() bug in gym==0.18, pyglet==1.6
    import os
    import torch

    '''choose env'''
    # from elegantrl.env import PreprocessEnv
    env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))

    '''choose algorithm'''
    from elegantrl.agent import AgentPPO
    agent = AgentPPO()
    net_dim = 2 ** 8
    cwd = 'AgentPPO/BipedalWalker-v3_2/'
    # from elegantrl.agent import AgentModSAC
    # agent = AgentModSAC()
    # net_dim = 2 ** 7
    # cwd = 'AgentModSAC/BipedalWalker-v3_2/'

    '''initialize agent'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    agent.init(net_dim, state_dim, action_dim)
    agent.save_load_model(cwd=cwd, if_save=False)
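    # (if_save=False loads the pre-trained weights from cwd instead of saving them)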

    '''initialize evaluation and env.render()'''
    device = agent.device
    save_frame_dir = 'frames'
    save_video = 'gym_render.mp4'

    os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    for i in range(1024):
        frame = env.render('rgb_array')
        cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
        # cv2.imshow('', frame)
        # cv2.waitKey(1)

        s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
        a_tensor = agent.act(s_tensor)
        action = a_tensor.detach().cpu().numpy()[0]  # with 'torch.no_grad()', '.detach()' is not needed
        # action = gym_env.action_space.sample()

        next_state, reward, done, _ = env.step(action)

        if done:
            state = env.reset()
        else:
            state = next_state
    env.close()

    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    os.system(f"| Convert frames to video using ffmpeg. Save in {save_video}")
    os.system(f'ffmpeg -r 60 -f image2 -s 600x400 -i {save_frame_dir}/%06d.png '
              f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
Example 9
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO
    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReturn: 1.25, UsedTime:  20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReturn: 1.50, UsedTime:  80s, FinanceStock-v2"
    # from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    # args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    # args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    from finrl.config import config
    from beta3 import StockTradingEnv, load_stock_trading_data
    train_df, eval_df = load_stock_trading_data()
    # train = data_split(processed_df, config.START_DATE, config.START_TRADE_DATE)
    # trade = data_split(processed_df, config.START_TRADE_DATE, config.END_DATE)

    # calculate state action space
    stock_dimension = len(train_df.tic.unique())
    state_space = 1 + (2 +
                       len(config.TECHNICAL_INDICATORS_LIST)) * stock_dimension
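    # i.e. state = [cash] + (price, holding) per stock + each technical indicator per stock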

    env_kwargs = {
        "max_stock": 100,
        "initial_amount": 1000000,
        "buy_cost_pct": 0.001,
        "sell_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 2**-14
    }
    args.env = StockTradingEnv(df=train_df, **env_kwargs)
    args.env_eval = StockTradingEnv(df=eval_df, **env_kwargs)

    args.reward_scale = 2**0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2**8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2**11
    args.repeat_times = 2**4
    args.eval_times1 = 2**1
    args.eval_times2 = 2**3
    args.if_allow_break = True
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example 10
def demo4_bullet_mujoco_on_policy():
    args = Arguments(
        if_on_policy=True
    )  # on-policy hyper-parameters differ from off-policy ones

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 1e5, TargetReward: 18, UsedTime:  3ks, ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.break_step = int(2e5 * 8)
    args.reward_scale = 2**0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2**2
    args.eval_times2 = 2**5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep:  2e6, TargetReward: 1500, UsedTime:  3ks, AntBulletEnv-v0"
    "TotalStep: 13e6, TargetReward: 2400, UsedTime: 21ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.env.max_step = 2**10

    args.break_step = int(2e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2**-2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = 2**11
    args.repeat_times = 2**3
    args.batch_size = 2**10
    args.net_dim = 2**9
    args.show_gap = 2**8  # for Recorder
    args.eva_size1 = 2**1  # for Recorder
    args.eva_size2 = 2**3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example 11
def get_video_to_watch_gym_render():
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  # env.render() bug in gym==0.18, pyglet==1.6
    import numpy.random as rd
    import os
    import torch
    '''choose env'''
    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    # from elegantrl.env import PreprocessEnv
    env_name = [
        'BipedalWalker-v3', 'AntBulletEnv-v0', 'KukaBulletEnv-v0',
        'ReacherBulletEnv-v0', 'PusherBulletEnv-v0', "ThrowerBulletEnv-v0",
        "StrikerBulletEnv-v0"
    ][1]
    env = PreprocessEnv(env=gym.make(env_name))
    '''initialize agent'''
    agent = None

    from elegantrl.agent import AgentPPO
    agent = AgentPPO()
    agent.if_use_dn = True
    net_dim = 2**8
    cwd = f'./{env_name}_4/'

    # from elegantrl.agent import AgentModSAC
    # agent = AgentModSAC()
    # agent.if_use_dn = True
    # net_dim = 2 ** 8
    # cwd = f'./{env_name}_2/'

    device = None
    if agent is not None:
        state_dim = env.state_dim
        action_dim = env.action_dim
        agent.init(net_dim, state_dim, action_dim)
        agent.save_load_model(cwd=cwd, if_save=False)
        device = agent.device
        rd.seed(194686)
        torch.manual_seed(1942876)
    '''initialize evaluation and env.render()'''
    save_frame_dir = 'frames'

    if save_frame_dir:
        os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    episode_return = 0
    step = 0
    for i in range(2**9):
        if i % 128 == 0:
            print(i)
        for j in range(1):
            if agent is not None:
                s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
                a_tensor = agent.act(s_tensor)
                action = a_tensor.detach().cpu().numpy()[0]  # with 'torch.no_grad()', '.detach()' is not needed
            else:
                action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)

            episode_return += reward
            step += 1

            if done:
                print(
                    f'{i:>6}, {step:6.0f}, {episode_return:8.3f}, {reward:8.3f}'
                )
                state = env.reset()
                episode_return = 0
                step = 0
            else:
                state = next_state

        frame = env.render('rgb_array')
        frame = frame[50:210, 50:270]  # (240, 320) AntPyBulletEnv-v0
        # frame = cv2.resize(frame[:, :500], (500//2, 720//2))
        cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
        cv2.imshow('', frame)
        cv2.waitKey(1)
    env.close()
    # exit()
    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    if save_frame_dir:
        frame_shape = cv2.imread(f'{save_frame_dir}/{3:06}.png').shape
        print(f"frame_shape: {frame_shape}")

        save_video = 'gym_render.mp4'
        print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
        os.system(
            f'ffmpeg -r 60 -f image2 -s {frame_shape[1]}x{frame_shape[0]} '  # ffmpeg -s expects WIDTHxHEIGHT; cv2 shape is (height, width, channels)
            f'-i ./{save_frame_dir}/%06d.png '
            f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
from elegantrl.agent import AgentPPO
import gym
from elegantrl.env import PreprocessEnv
from elegantrl.run import Arguments, train_and_evaluate, train_and_evaluate_mp

if __name__ == '__main__':
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    from kuka_cam_reach_env import KukaCamReachEnv, CustomSkipFrame

    env_config = {
        "is_render": False,
        "is_good_view": False,
        "max_steps_one_episode": 1000,
    }

    args.env = CustomSkipFrame(KukaCamReachEnv(config=env_config))
    args.gamma = 0.995
    args.break_step = int(3e5)
    args.net_dim = 2**9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2**10
    args.repeat_times = 2**3
    args.eval_gap = 2**4
    args.eval_times1 = 2**3
    args.eval_times2 = 2**5
    args.if_allow_break = False
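    # The original snippet ends here; as in the demos above, the configured Arguments would
    # presumably be handed to the trainer next (assumed continuation, not part of the source):
    # args.rollout_num = 4
    # train_and_evaluate_mp(args)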