Example #1
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()

    '''choose environment'''
    "TotalStep: 3e4, TargetReturn: -200, UsedTime: 300s, Pendulum-v0, TD3"
    "TotalStep: 2e4, TargetReturn: -200, UsedTime: 200s, Pendulum-v0, ModSAC "
    env = gym.make('Pendulum-v0')
    env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    "TD3    TotalStep:  9e4, TargetReturn: 100, UsedTime: 3ks, LunarLanderContinuous-v2"
    "TD3    TotalStep: 20e4, TargetReturn: 200, UsedTime: 5ks, LunarLanderContinuous-v2"
    "SAC    TotalStep:  9e4, TargetReturn: 200, UsedTime: 3ks, LunarLanderContinuous-v2"
    "ModSAC TotalStep:  5e4, TargetReturn: 200, UsedTime: 1ks, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # args.eval_times2 = 2 ** 4  # set a large eval_times to get a precise learning curve

    "ModSAC TotalStep: 2e5, TargetReturn: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = True  # allow training to break early once the target return is reached
    # args.break_step = int(2e5 * 4)  # break training after 'total_step > break_step'

    '''train and evaluate'''
    train_and_evaluate(args)
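
Note: the demo functions in this listing are excerpted without their module-level imports. Judging from the imports shown with Example #5 further down, each of them presumably relies on something like the following (an assumption about the listing, not part of the original snippet):

import gym
from elegantrl.env import PreprocessEnv
from elegantrl.run import Arguments, train_and_evaluate, train_and_evaluate_mp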
Example #2
def check_stock_trading_env():
    import torch  # passed to env.draw_cumulative_return() below
    import numpy.random as rd  # random actions are sampled with rd.rand() below

    # StockTradingEnv is assumed to be importable in this module
    # (e.g. from envs.FinRL.StockTrading import StockTradingEnv, as in Example #10)
    if_eval = True  # False

    env = StockTradingEnv(if_eval=if_eval)
    action_dim = env.action_dim

    state = env.reset()
    print('state_dim', len(state))

    done = False
    step = 1
    reward = None
    from time import time
    timer = time()
    while not done:
        action = rd.rand(action_dim) * 2 - 1
        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        step += 1

    print(f"step: {step}, UsedTime: {time() - timer:.3f}")
    print(f"terminal reward {reward:.3f}")
    print(f"episode return {env.episode_return:.3f}")
    '''draw_cumulative_return'''
    from elegantrl.agent import AgentPPO
    from elegantrl.run import Arguments
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()

    args.init_before_training()
    args.agent.save_load_model(cwd='./AgentPPO/StockTradingEnv-v1_0',
                               if_save=False)

    env.draw_cumulative_return(args, torch)
Example #3
def demo2_continuous_action_space_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones
    args.random_seed = 1943

    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    '''choose environment'''
    "TotalStep: 2e5, TargetReturn: -200, UsedTime: 300s, Pendulum-v0, PPO"
    env = gym.make('Pendulum-v0')
    env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    args.repeat_times = 2 ** 3
    args.target_step = 200 * 8
    args.eval_gap = 2 ** 6

    "PPO    TotalStep: 8e5, TargetReturn: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO    TotalStep: 8e5, TargetReturn: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 2
    train_and_evaluate_mp(args)
Example #4
def demo1_discrete_action_space():
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for the hyper-parameters

    '''choose a DRL algorithm'''
    # from elegantrl.agent import AgentD3QN  # AgentDQN, AgentDuelDQN, AgentDoubleDQN
    # args.agent = AgentD3QN()
    from elegantrl.agent import AgentDuelingDQN  # AgentDQN, AgentDuelDQN, AgentDoubleDQN
    args.agent = AgentDuelingDQN()

    '''choose environment'''
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 20s, CartPole-v0"
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 30s, CartPole-v0 rollout_num = 2"
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change a default hyper-parameters
    # args.batch_size = 2 ** 7
    # args.target_step = 2 ** 8
    # args.eval_gap = 2 ** 0

    "TotalStep: 6e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, D3DQN"
    "TotalStep: 4e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, DuelDQN"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8

    '''train and evaluate'''
    train_and_evaluate(args)
Example #5
from elegantrl.agent import AgentPPO
import gym
from elegantrl.env import PreprocessEnv
from elegantrl.run import Arguments, train_and_evaluate, train_and_evaluate_mp

if __name__ == '__main__':
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    from kuka_cam_reach_env import KukaCamReachEnv, CustomSkipFrame

    env_config = {
        "is_render": False,
        "is_good_view": False,
        "max_steps_one_episode": 1000,
    }

    args.env = CustomSkipFrame(KukaCamReachEnv(config=env_config))
    args.gamma = 0.995
    args.break_step = int(3e5)
    args.net_dim = 2**9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2**10
    args.repeat_times = 2**3
    args.eval_gap = 2**4
    args.eval_times1 = 2**3
    args.eval_times2 = 2**5
    args.if_allow_break = False
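
The excerpt above ends after the hyper-parameters are set. Every other demo in this listing follows the configuration with a training launch, so presumably a call such as the one below comes next (an assumption; it is not part of the original excerpt):

    train_and_evaluate(args)  # or train_and_evaluate_mp(args) with args.rollout_num set, as in the other demos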
Example #6
    def __init__(self,
                 ticker_list,
                 time_interval,
                 drl_lib,
                 agent,
                 cwd,
                 net_dim,
                 state_dim,
                 action_dim,
                 API_KEY,
                 API_SECRET,
                 APCA_API_BASE_URL,
                 tech_indicator_list,
                 turbulence_thresh=30,
                 max_stock=1e2,
                 latency=None):
        #load agent
        self.drl_lib = drl_lib
        if agent == 'ppo':
            if drl_lib == 'elegantrl':
                from elegantrl.agent import AgentPPO
                from elegantrl.run import Arguments, init_agent
                #load agent
                config = {
                    'state_dim': state_dim,
                    'action_dim': action_dim,
                }
                args = Arguments(agent=AgentPPO, env=StockEnvEmpty(config))
                args.cwd = cwd
                args.net_dim = net_dim
                # load agent
                try:
                    agent = init_agent(args, gpu_id=0)
                    self.act = agent.act
                    self.device = agent.device
                except BaseException:
                    raise ValueError("Fail to load agent!")

            elif drl_lib == 'rllib':
                from ray.rllib.agents import ppo
                from ray.rllib.agents.ppo.ppo import PPOTrainer

                config = ppo.DEFAULT_CONFIG.copy()
                config['env'] = StockEnvEmpty
                config["log_level"] = "WARN"
                config['env_config'] = {
                    'state_dim': state_dim,
                    'action_dim': action_dim,
                }
                trainer = PPOTrainer(env=StockEnvEmpty, config=config)
                try:
                    trainer.restore(cwd)
                    self.agent = trainer
                    print("Restoring from checkpoint path", cwd)
                except BaseException:
                    raise ValueError('Failed to load agent!')

            elif drl_lib == 'stable_baselines3':
                from stable_baselines3 import PPO

                try:
                    #load agent
                    self.model = PPO.load(cwd)
                    print("Successfully load model", cwd)
                except BaseException:
                    raise ValueError('Failed to load agent!')

            else:
                raise ValueError(
                    'The DRL library input is NOT supported yet. Please check your input.'
                )

        else:
            raise ValueError('Agent input is NOT supported yet.')

        #connect to Alpaca trading API
        try:
            self.alpaca = tradeapi.REST(API_KEY, API_SECRET, APCA_API_BASE_URL,
                                        'v2')
        except BaseException:
            raise ValueError(
                'Failed to connect to Alpaca. Please check account info and internet connection.'
            )

        #read trading time interval
        if time_interval == '1s':
            self.time_interval = 1
        elif time_interval == '5s':
            self.time_interval = 5
        elif time_interval == '1Min':
            self.time_interval = 60
        elif time_interval == '5Min':
            self.time_interval = 60 * 5
        elif time_interval == '15Min':
            self.time_interval = 60 * 15
        else:
            raise ValueError('Time interval input is NOT supported yet.')

        #read trading settings
        self.tech_indicator_list = tech_indicator_list
        self.turbulence_thresh = turbulence_thresh
        self.max_stock = max_stock

        #initialize account
        self.stocks = np.asarray([0] * len(ticker_list))  #stocks holding
        self.stocks_cd = np.zeros_like(self.stocks)
        self.cash = None  #cash record
        self.stocks_df = pd.DataFrame(self.stocks,
                                      columns=['stocks'],
                                      index=ticker_list)
        self.asset_list = []
        self.price = np.asarray([0] * len(ticker_list))
        self.stockUniverse = ticker_list
        self.turbulence_bool = 0
        self.equities = []
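
For orientation, here is a minimal sketch of how the actor loaded in the 'elegantrl' branch above is typically queried for an action at trading time. The helper below is hypothetical (it is not part of the original class) and assumes that torch is imported at module level and that self.act and self.device were set as in the 'elegantrl' branch:

    def _sample_action(self, state):
        # wrap the state into a batch of one and run it through the actor network
        s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
        a_tensor = self.act(s_tensor)
        # detach from the graph and return a plain NumPy action vector
        return a_tensor.detach().cpu().numpy()[0]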
Example #7
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones

    import pybullet_envs  # for python-bullet-gym; importing registers the Bullet envs with gym
    dir(pybullet_envs)  # keep a reference so the import is not flagged or stripped as unused

    "TotalStep: 1e5, TargetReturn: 18, UsedTime:  3ks, ReacherBulletEnv-v0, PPO"
    "TotalStep: 1e6, TargetReturn: 18, UsedTime: 30ks, ReacherBulletEnv-v0, PPO"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep:  3e6, TargetReturn: 1500, UsedTime:  2ks, AntBulletEnv-v0, PPO"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime:  6ks, AntBulletEnv-v0, PPO"
    "TotalStep: 46e6, TargetReturn: 3017, UsedTime: 25ks, AntBulletEnv-v0, PPO"
    "TotalStep:  5e6, TargetReturn: 1500, UsedTime:  3ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 15e6, TargetReturn: 2500, UsedTime: 10ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 60e6, TargetReturn: 2949, UsedTime: 34ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep:  2e6, TargetReturn: 1500, UsedTime:  2ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime:  7ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 53e6, TargetReturn: 2834, UsedTime: 35ks, AntBulletEnv-v0, PPO if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 8  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example #8
def demo4_bullet_mujoco_off_policy():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym; importing registers the Bullet envs with gym
    dir(pybullet_envs)  # keep a reference so the import is not flagged or stripped as unused

    "TotalStep:  5e4, TargetReturn: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReturn: 25, UsedTime:     s, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.env.max_step = 2 ** 10  # important, default env.max_step=150
    args.reward_scale = 2 ** 0  # -80 < -30 < 18 < 28
    args.gamma = 0.96
    args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5
    args.if_per = True

    train_and_evaluate(args)

    "TotalStep:  3e5, TargetReward: 1500, UsedTime:  4ks, AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep:  4e5, TargetReward: 2500, UsedTime:  6ks, AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep: 10e5, TargetReward: 2879, UsedTime:   ks, AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep:  3e5, TargetReward: 1500, UsedTime:  8ks, AntBulletEnv-v0 ModSAC if_use_cn"
    "TotalStep:  7e5, TargetReward: 2500, UsedTime: 18ks, AntBulletEnv-v0 ModSAC if_use_cn"
    "TotalStep: 16e5, TargetReward: 2923, UsedTime:   ks, AntBulletEnv-v0 ModSAC if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2 ** -2  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2 ** 21
    args.batch_size = 2 ** 8
    args.repeat_times = 2 ** 1
    args.eval_gap = 2 ** 9  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example #9
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReturn: 1.25, UsedTime:  20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReturn: 1.50, UsedTime:  80s, FinanceStock-v2"
    from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL, no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2 ** 8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.repeat_times = 2 ** 4
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 4
    args.if_allow_break = True

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 8
    train_and_evaluate_mp(args)
Example #10
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO
    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    "TotalStep: 10e4, TargetReturn: 3.0, UsedTime:  200s, FinanceStock-v1"
    "TotalStep: 20e4, TargetReturn: 4.0, UsedTime:  400s, FinanceStock-v1"
    "TotalStep: 30e4, TargetReturn: 4.2, UsedTime:  600s, FinanceStock-v1"
    from envs.FinRL.StockTrading import StockTradingEnv
    gamma = 0.995
    args.env = StockTradingEnv(if_eval=False, gamma=gamma)
    args.env_eval = StockTradingEnv(if_eval=True, gamma=gamma)

    args.gamma = gamma
    args.break_step = int(3e5)
    args.net_dim = 2**9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2**10
    args.repeat_times = 2**3
    args.eval_gap = 2**4
    args.eval_times1 = 2**3
    args.eval_times2 = 2**5
    args.if_allow_break = False
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example #11
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO
    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReturn: 1.25, UsedTime:  20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReturn: 1.50, UsedTime:  80s, FinanceStock-v2"
    # from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL, no need for PreprocessEnv()
    # args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    # args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    from finrl.config import config
    from beta3 import StockTradingEnv, load_stock_trading_data
    train_df, eval_df = load_stock_trading_data()
    # train = data_split(processed_df, config.START_DATE, config.START_TRADE_DATE)
    # trade = data_split(processed_df, config.START_TRADE_DATE, config.END_DATE)

    # calculate state action space
    stock_dimension = len(train_df.tic.unique())
    state_space = 1 + (2 + len(config.TECHNICAL_INDICATORS_LIST)) * stock_dimension
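    # Worked illustration (assumed numbers): with 30 tickers and 8 technical indicators,
    # state_space = 1 + (2 + 8) * 30 = 301, presumably the account balance plus, per stock,
    # its price, its holding, and the technical indicators.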

    env_kwargs = {
        "max_stock": 100,
        "initial_amount": 1000000,
        "buy_cost_pct": 0.001,
        "sell_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 2**-14
    }
    args.env = StockTradingEnv(df=train_df, **env_kwargs)
    args.env_eval = StockTradingEnv(df=eval_df, **env_kwargs)

    args.reward_scale = 2**0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2**8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2**11
    args.repeat_times = 2**4
    args.eval_times1 = 2**1
    args.eval_times2 = 2**3
    args.if_allow_break = True
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)