import gym  # module-level imports shared by the demo functions below
from elegantrl.env import PreprocessEnv
from elegantrl.run import Arguments, train_and_evaluate, train_and_evaluate_mp


def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 1e5, TargetReward: 18, UsedTime:  3ks, ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    # reuse the same args for a second, harder environment
    "TotalStep: 30e5, TargetReward: 1500, UsedTime:  6ks, AntBulletEnv-v0"
    "TotalStep: 75e5, TargetReward: 2500, UsedTime: 14ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05  # 0.02
    args.agent.lambda_gae_adv = 0.97  # 0.98
    args.agent.if_use_dn = True

    args.net_dim = 2 ** 8
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # 10
    args.repeat_times = 2 ** 3
    args.show_gap = 2 ** 8  # for Recorder
    args.eva_size1 = 2 ** 1  # for Recorder
    args.eva_size2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
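# Optional sanity check (our helper, not part of ElegantRL): importing pybullet_envs is what
# registers the Bullet Gym IDs used above, so a quick gym.make() confirms the install works
# before launching a long training run.
def check_bullet_env(env_name='ReacherBulletEnv-v0'):
    import gym
    import pybullet_envs  # noqa: F401  (side effect: registers the Bullet envs with Gym)
    env = gym.make(env_name)
    print(env_name, '| observation:', env.observation_space.shape, '| action:', env.action_space.shape)
    env.close()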
def demo1_discrete_action_space():
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for the hyper-parameters

    '''choose a DRL algorithm'''
    # from elegantrl.agent import AgentD3QN  # or AgentDQN, AgentDuelingDQN, AgentDoubleDQN
    # args.agent = AgentD3QN()
    from elegantrl.agent import AgentDuelingDQN  # or AgentDQN, AgentDoubleDQN, AgentD3QN
    args.agent = AgentDuelingDQN()

    '''choose environment'''
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 20s, CartPole-v0"
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 30s, CartPole-v0, rollout_num = 2"
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7
    # args.target_step = 2 ** 8
    # args.eval_gap = 2 ** 0

    "TotalStep: 6e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, D3QN"
    "TotalStep: 4e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, DuelingDQN"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8

    '''train and evaluate'''
    train_and_evaluate(args)
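# Minimal entry point for the demos above (a sketch: it assumes the demo functions are defined
# in one runnable script). Uncomment the demo you want to train.
if __name__ == '__main__':
    demo1_discrete_action_space()
    # demo4_bullet_mujoco_on_policy()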
def demo3_custom_env_fin_rl():
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReturn: 1.25, UsedTime: 20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReturn: 1.50, UsedTime: 80s, FinanceStock-v2"
    from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eval_len = 1699 - train_len

    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2 ** 8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.repeat_times = 2 ** 4
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 4
    args.if_allow_break = True

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 8
    train_and_evaluate_mp(args)
def demo3_custom_env_fin_rl_stock_trading():  # renamed from demo3_custom_env_fin_rl to avoid redefining the function above
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    "TotalStep: 10e4, TargetReturn: 3.0, UsedTime: 200s, FinanceStock-v1"
    "TotalStep: 20e4, TargetReturn: 4.0, UsedTime: 400s, FinanceStock-v1"
    "TotalStep: 30e4, TargetReturn: 4.2, UsedTime: 600s, FinanceStock-v1"
    from envs.FinRL.StockTrading import StockTradingEnv
    gamma = 0.995
    args.env = StockTradingEnv(if_eval=False, gamma=gamma)
    args.env_eval = StockTradingEnv(if_eval=True, gamma=gamma)
    args.gamma = gamma

    args.break_step = int(3e5)
    args.net_dim = 2 ** 9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2 ** 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 4
    args.eval_times1 = 2 ** 3
    args.eval_times2 = 2 ** 5
    args.if_allow_break = False

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
def demo3_custom_env_fin_rl_finrl_dataframe():  # renamed: a third variant of demo3 that builds the env from FinRL dataframes
    from elegantrl.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReturn: 1.25, UsedTime: 20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReturn: 1.50, UsedTime: 80s, FinanceStock-v2"
    # from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    # args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    # args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eval_len = 1699 - train_len

    from finrl.config import config
    from beta3 import StockTradingEnv, load_stock_trading_data

    train_df, eval_df = load_stock_trading_data()
    # train = data_split(processed_df, config.START_DATE, config.START_TRADE_DATE)
    # trade = data_split(processed_df, config.START_TRADE_DATE, config.END_DATE)

    # calculate the state and action space sizes
    stock_dimension = len(train_df.tic.unique())
    state_space = 1 + (2 + len(config.TECHNICAL_INDICATORS_LIST)) * stock_dimension

    env_kwargs = {
        "max_stock": 100,
        "initial_amount": 1000000,
        "buy_cost_pct": 0.001,
        "sell_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 2 ** -14,
    }
    args.env = StockTradingEnv(df=train_df, **env_kwargs)
    args.env_eval = StockTradingEnv(df=eval_df, **env_kwargs)

    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2 ** 8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.repeat_times = 2 ** 4
    args.eval_times1 = 2 ** 1
    args.eval_times2 = 2 ** 3
    args.if_allow_break = True

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
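# Concrete check of the state_space formula above, with illustrative numbers (not taken from
# the actual data): with 30 tickers and 8 entries in config.TECHNICAL_INDICATORS_LIST,
#   state_space = 1 + (2 + 8) * 30 = 301
# i.e. one cash balance, plus price and holding per stock, plus one value per indicator per stock.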
if __name__ == '__main__':
    from elegantrl.agent import AgentPPO
    from kuka_cam_reach_env import KukaCamReachEnv, CustomSkipFrame

    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.04

    env_config = {
        "is_render": False,
        "is_good_view": False,
        "max_steps_one_episode": 1000,
    }
    args.env = CustomSkipFrame(KukaCamReachEnv(config=env_config))

    args.gamma = 0.995
    args.break_step = int(3e5)
    args.net_dim = 2 ** 9
    args.max_step = args.env.max_step
    args.max_memo = args.max_step * 4
    args.batch_size = 2 ** 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 4
    args.eval_times1 = 2 ** 3
    args.eval_times2 = 2 ** 5
    args.if_allow_break = False

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 1
    train_and_evaluate(args)
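# Optional: the same Kuka config with rendering switched on for visual debugging. This assumes
# KukaCamReachEnv honours the "is_render"/"is_good_view" flags exactly as shown above; training
# is usually run headless, so keep rendering off for the actual run.
debug_env_config = {
    "is_render": True,
    "is_good_view": True,
    "max_steps_one_episode": 1000,
}
# debug_env = CustomSkipFrame(KukaCamReachEnv(config=debug_env_config))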
# constructor of the paper-trading class
# (module-level imports assumed: numpy as np, pandas as pd, alpaca_trade_api as tradeapi)
def __init__(self, ticker_list, time_interval, drl_lib, agent, cwd, net_dim,
             state_dim, action_dim, API_KEY, API_SECRET, APCA_API_BASE_URL,
             tech_indicator_list, turbulence_thresh=30, max_stock=1e2,
             latency=None):
    # load agent
    self.drl_lib = drl_lib
    if agent == 'ppo':
        if drl_lib == 'elegantrl':
            from elegantrl.agent import AgentPPO
            from elegantrl.run import Arguments, init_agent

            config = {'state_dim': state_dim,
                      'action_dim': action_dim}
            args = Arguments(agent=AgentPPO, env=StockEnvEmpty(config))
            args.cwd = cwd
            args.net_dim = net_dim

            # load agent
            try:
                agent = init_agent(args, gpu_id=0)
                self.act = agent.act
                self.device = agent.device
            except BaseException:
                raise ValueError("Fail to load agent!")

        elif drl_lib == 'rllib':
            from ray.rllib.agents import ppo
            from ray.rllib.agents.ppo.ppo import PPOTrainer

            config = ppo.DEFAULT_CONFIG.copy()
            config['env'] = StockEnvEmpty
            config["log_level"] = "WARN"
            config['env_config'] = {'state_dim': state_dim,
                                    'action_dim': action_dim}
            trainer = PPOTrainer(env=StockEnvEmpty, config=config)
            try:
                trainer.restore(cwd)
                self.agent = trainer
                print("Restoring from checkpoint path", cwd)
            except BaseException:
                raise ValueError('Fail to load agent!')

        elif drl_lib == 'stable_baselines3':
            from stable_baselines3 import PPO

            try:
                # load agent
                self.model = PPO.load(cwd)
                print("Successfully load model", cwd)
            except BaseException:
                raise ValueError('Fail to load agent!')

        else:
            raise ValueError('The DRL library input is NOT supported yet. Please check your input.')

    else:
        raise ValueError('Agent input is NOT supported yet.')

    # connect to Alpaca trading API
    try:
        self.alpaca = tradeapi.REST(API_KEY, API_SECRET, APCA_API_BASE_URL, 'v2')
    except BaseException:
        raise ValueError('Fail to connect Alpaca. Please check account info and internet connection.')

    # read trading time interval
    if time_interval == '1s':
        self.time_interval = 1
    elif time_interval == '5s':
        self.time_interval = 5
    elif time_interval == '1Min':
        self.time_interval = 60
    elif time_interval == '5Min':
        self.time_interval = 60 * 5
    elif time_interval == '15Min':
        self.time_interval = 60 * 15
    else:
        raise ValueError('Time interval input is NOT supported yet.')

    # read trading settings
    self.tech_indicator_list = tech_indicator_list
    self.turbulence_thresh = turbulence_thresh
    self.max_stock = max_stock

    # initialize account
    self.stocks = np.asarray([0] * len(ticker_list))  # stock holdings
    self.stocks_cd = np.zeros_like(self.stocks)
    self.cash = None  # cash record
    self.stocks_df = pd.DataFrame(self.stocks, columns=['stocks'], index=ticker_list)
    self.asset_list = []
    self.price = np.asarray([0] * len(ticker_list))
    self.stockUniverse = ticker_list
    self.turbulence_bool = 0
    self.equities = []
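# The elegantrl branch above builds its Arguments around a StockEnvEmpty placeholder that is
# defined elsewhere in the file. Below is a minimal sketch of such a placeholder, assuming the
# agent loader only reads the environment's dimension/metadata attributes; the attribute values
# (max_step, target_return, space bounds) are illustrative assumptions, not the original class.
import gym
import numpy as np


class StockEnvEmpty(gym.Env):
    """Empty env: exposes only the metadata needed to build the actor network (sketch)."""

    def __init__(self, config):
        self.env_name = 'StockEnvEmpty'
        self.env_num = 1
        self.max_step = 10000
        self.state_dim = config['state_dim']
        self.action_dim = config['action_dim']
        self.if_discrete = False
        self.target_return = 9999.0
        self.observation_space = gym.spaces.Box(low=-3000, high=3000, shape=(self.state_dim,), dtype=np.float32)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(self.action_dim,), dtype=np.float32)

    def reset(self):
        # never actually stepped during paper trading; present only to satisfy gym.Env
        return np.zeros(self.state_dim, dtype=np.float32)

    def step(self, action):
        return np.zeros(self.state_dim, dtype=np.float32), 0.0, True, {}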