import gym

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy


def basic_usage_example():
    # Basic Usage: Training, Saving, Loading.

    # Create environment.
    env = gym.make("LunarLander-v2")

    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass 'print_system_info=True'
    # to compare the system on which the model was trained vs the current one.
    # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: If you use wrappers with your environment that modify rewards,
    # this will be reflected here. To evaluate with original rewards,
    # wrap the environment in a "Monitor" wrapper before other wrappers.
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

    # Enjoy the trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
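# The Monitor note above, in practice: a minimal sketch (not part of the original
# snippet) of wrapping the raw environment in Monitor *before* any reward-modifying
# wrappers so that evaluate_policy reports the original episode rewards. The env id,
# training length, and episode count here are illustrative assumptions.
def monitored_evaluation_sketch():
    import gym
    from stable_baselines3 import DQN
    from stable_baselines3.common.monitor import Monitor
    from stable_baselines3.common.evaluation import evaluate_policy

    env = Monitor(gym.make("LunarLander-v2"))  # Monitor first, other wrappers afterwards
    model = DQN("MlpPolicy", env, verbose=0)
    model.learn(total_timesteps=1_000)  # short run, just to have a policy to evaluate
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")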
def test_dqn_epsilon_greedy():
    env = IdentityEnv(2)
    model = DQN("MlpPolicy", env)
    model.exploration_rate = 1.0
    obs = env.reset()
    # is vectorized should not crash with discrete obs
    action, _ = model.predict(obs, deterministic=False)
    assert env.action_space.contains(action)
def run_dqn_baseline():
    env = make_atari_env('BreakoutNoFrameskip-v4', n_envs=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    tensorboard_log = os.path.join(os.path.dirname(__file__), 'runs_baseline')

    buffer_size = 100000
    num_training_steps = 1000000

    model = DQN('CnnPolicy', env, verbose=0,
                buffer_size=buffer_size,
                learning_starts=50000,
                optimize_memory_usage=False,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=num_training_steps)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
model.learn(total_timesteps=5_000_000, callback=eval_callback)  # Typically not enough
model.save("DQN")

# model = DQN.load("DQN", env=env)
model = DQN.load("logs/best_model", env=env)
# model = PPO.load("PPO_discrete", env=env)

logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                num_drones=ARGS.num_drones)
obs = env.reset()
start = time.time()
n_trial = 0
for i in range(ARGS.duration_sec * env.SIM_FREQ):
    if ARGS.duration_sec * env.SIM_FREQ % AGGR_PHY_STEPS == 0:
        action, _states = model.predict(obs, deterministic=True)
    # else:
    #     action = np.array([1, 0, 0])  # No turn
    # print(f"action {action}")
    # print(f"obs : {obs}")
    obs, reward, done, info = env.step(action)
    for j in range(ARGS.num_drones):
        logger.log(drone=j,
                   timestamp=i / env.SIM_FREQ,
                   state=env._getDroneStateVector(int(j)),
                   control=env._getTargetVelocityControls(int(j)))
    # if i % env.SIM_FREQ == 0:
    #     env.render()
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.

    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char
    if e_char == 'l':
        # RL Agent Learning
        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1,
                        learning_rate=LEARN_RATE,
                        exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL,
                        gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing
        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)
        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)
            rl_action_step(rand_action)
    else:
        pass
eval_eps = 100
pbar = tqdm(total=eval_eps)
env = gym.make('Trading-v0')

rewards = []
baseline_diff = []
for ep in range(eval_eps):
    done = False
    ep_reward = 0
    s = env.reset()
    while not done:
        if args.rand:
            action = env.get_random_action()
        else:
            action = model.predict(s, deterministic=args.d)[0]
        ns, r, done, info = env.step(action)
        s = ns
        if args.r:
            env.render()
        ep_reward += r
    baseline_diff.append(env.get_baseline_diff())
    rewards.append(ep_reward)
    pbar.update(1)
pbar.close()

agent_name = 'random' if args.rand else 'agent_{}'.format('deterministic' if args.d else 'stochastic')
rewards = pd.DataFrame(rewards, columns=[agent_name + '_' + args.name])
rewards.to_csv('reward_{}_{}.csv'.format(agent_name, args.name), index=False)
class TradingAgent:
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # Wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features)  # hard-coded to match dimensions of features
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

        self.shares = 0
        self.cash = 0
        self.obs = []  # holds up to 2 past observations, helps in keeping X, y aligned

        # for plotting
        self.pred_return = 0
        self.pred_lower = 0
        self.pred_upper = 0

        # for debugging
        self.goal_num_shares = 0

    def learn(self, n_steps):
        # when using gp, load a pretrained rl agent - no need to train
        if self.use_gp:
            # train GP using a fixed number of steps
            self.__train_gp(100)
        else:
            # train RL agent
            self.rl.learn(n_steps)

    def predict(self, obs, deterministic):
        action, state = self.rl.predict(obs, deterministic=deterministic)

        if self.use_gp:
            # slightly retrain
            self.__train_gp(self.retraining_iter, retrain=True)

            # predict next step returns and CI using GP
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                output = self.gp(torch.Tensor(obs[2:])[None])
                obs_pred = self.likelihood(output)
                f_mean = output.mean.detach().numpy()[0]
                self.pred_return = f_mean.item()
                f_samples = output.sample(sample_shape=torch.Size((10000,))).detach().numpy()
                lower, upper = obs_pred.confidence_region()
                self.pred_lower = lower.item()
                self.pred_upper = upper.item()

            rl_action = action
            action -= ACTION_OFFSET  # adjust from env action index to the actual trade

            # adjust trade size given prediction
            # if self.shares > 0:  # long position
            if f_mean > self.gp_limit:  # predict positive return over certain threshold
                tail_samples = f_samples[f_samples < lower.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else lower.item()  # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.cvar_limit // ps_cvar
                else:
                    goal_num_shares = self.shares + action  # positive return for long - no adjustment needed
                action = min(10, max(0, goal_num_shares - self.shares))
            elif f_mean < -self.gp_limit:
                tail_samples = f_samples[f_samples > upper.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else upper.item()  # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.shares + action  # negative return for short - no adjustment needed
                else:
                    goal_num_shares = self.cvar_limit // ps_cvar
                action = max(-10, min(0, goal_num_shares - self.shares))
            else:
                goal_num_shares = self.shares + action

            # print(ps_cvar, lower.item(), upper.item())
            # if not np.isnan(goal_num_shares):
            self.goal_num_shares = goal_num_shares

            # if action > 0:  # buy order
            #     action = min(10, max(0, goal_num_shares - self.shares))  # restrict same size trades as original, maintain same direction
            #     # print(goal_num_shares - self.shares, action)
            # elif action < 0:  # sell order
            #     action = max(-10, min(0, goal_num_shares - self.shares))  # restrict same size trades as original, maintain same direction

            action += ACTION_OFFSET  # adjust for env actions being 1 to N rather than -N/2 to N/2
            # print(f_mean, ps_cvar, self.shares, goal_num_shares, rl_action - ACTION_OFFSET, action - ACTION_OFFSET)

        return action, state

    def update(self, obs, reward=None):
        self.obs.append(obs)
        self.shares, self.cash = obs[:2]
        if reward is not None:
            self.X_train = torch.cat((self.X_train, torch.Tensor(self.obs.pop(0)[2:])[None]))[1:]  # self.X_train[1:]
            self.y_train = torch.cat((self.y_train, torch.Tensor([reward])))[1:]
            # print(self.X_train, self.y_train)
            self.gp.set_train_data(self.X_train, self.y_train)

    def save(self, rl_path, gp_path=None):
        self.rl.save(rl_path)
        if gp_path is not None:
            torch.save(self.gp.state_dict(), gp_path)

    def load(self, rl_path, gp_path=None):
        self.rl = A2C.load(rl_path)
        if gp_path is not None:
            state_dict = torch.load(gp_path)
            self.gp.load_state_dict(state_dict)

    def __train_gp(self, n_iter, retrain=False):
        # train GP using a fixed number of steps
        self.gp.train()
        self.likelihood.train()
        for i in range(n_iter):
            output = self.gp(self.X_train)
            loss = -self.mll(output, self.y_train)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
        self.gp.eval()
        self.likelihood.eval()
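# A minimal usage sketch for the TradingAgent above (not from the original source).
# It assumes a gym trading environment registered as 'Trading-v0' that exposes a
# `num_features` attribute, and it fills gp_params with the keys the constructor
# reads ('n_train', 'training_iter', 'cvar_limit', 'gp_limit'); the numbers are
# placeholders, not recommended settings.
def trading_agent_usage_sketch():
    import gym

    env = gym.make('Trading-v0')
    gp_params = {
        'n_train': 100,        # size of the rolling GP training window
        'training_iter': 5,    # GP re-training iterations per predict() call
        'cvar_limit': -100.0,  # per-position CVaR budget used to cap the share count
        'gp_limit': 0.001,     # predicted-return threshold before trades are resized
    }
    agent = TradingAgent(model='dqn', use_gp=True, gp_params=gp_params,
                         policy='MlpPolicy', env=env)
    agent.learn(n_steps=10_000)  # with use_gp=True this trains the GP, not the RL agent

    obs = env.reset()
    done = False
    while not done:
        action, _ = agent.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        agent.update(obs, reward)  # keeps shares/cash and the GP training data in sync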
}, "policy_frequency": 2, "duration": 40, }) env.reset() model = DQN('CnnPolicy', env, gamma=0.8, learning_rate=5e-4, buffer_size=40*1000, learning_starts=200, exploration_fraction=0.6, target_update_interval=256, batch_size=32, verbose=1, tensorboard_log="logs/") model.learn(total_timesteps=int(2e5)) model.save("dqn_highway") # Record video model = DQN.load("dqn_highway") env.configure({"policy_frequency": 15, "duration": 20 * 15}) video_length = 2 * env.config["duration"] env = VecVideoRecorder(env, "videos/", record_video_trigger=lambda x: x == 0, video_length=video_length, name_prefix="dqn-agent") obs = env.reset() for _ in range(video_length + 1): action, _ = model.predict(obs) obs, _, _, _ = env.step(action) env.close()
env = make_vec_env(lambda: env, n_envs=1)
model = DQN(MlpPolicy, env, verbose=1, learning_rate=1e-3)
model.learn(total_timesteps=20000, log_interval=200, n_eval_episodes=1000)
# model.save("deepq_breakout")
#
# del model  # remove to demonstrate saving and loading
#
# model = DQN.load("deepq_breakout")

print("Test")
obs = env.reset()
# while True:

# Test 1
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Test 2
print("Evaluating...")
start = time.time()
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)
end = time.time()
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
print("evaluation duration in s: ", end - start)

# Test 3
print("Evaluating v2...")
start = time.time()
mean_reward = evaluate(model, env)
# model = DQN("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1) model = DQN("CnnPolicy", env, target_update_interval=1000, batch_size=512, exploration_final_eps=0.2, policy_kwargs=policy_kwargs, verbose=1) model.save(agentPath) # Record gif of trained agent imagesGrid = [] obs = env.reset() imagesGrid.append(env.render("human")) for step in range(200): action, _ = model.predict(obs, deterministic=False) obs, reward, done, info = env.step(action) print("reward : ", reward) env.render(mode='console') imagesGrid.append(env.render("human")) if done: print("Goal reached!", "reward=", reward) break imagesGrid[0].save(f'_data/visu.gif', save_all=True, append_images=imagesGrid[1:], optimize=True, duration=100, loop=0) for _ in range(50):
from gym_anytrading.datasets import FOREX_EURUSD_1H_ASK, STOCKS_GOOGL

# env = gym.make('forex-v0', frame_bound=(50, 100), window_size=10)
env = gym.make('stocks-v0', frame_bound=(50, 100), window_size=10)

#############################AGENT############################################
from stable_baselines3 import DQN

# model = DQN.load("MyTradingAgent")  # use an existing model, if available
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=50000, log_interval=500)
# model.save("MyTradingAgent")
# del model  # remove the model

##############################OBSERVATION######################################
import matplotlib.pyplot as plt

observation = env.reset()
done = False
while not done:
    action, _states = model.predict(observation)
    observation, reward, done, info = env.step(action)

print("Info:", info)

plt.cla()
env.render_all()
plt.show()
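# A hedged sketch (not in the original snippet) showing how the imported-but-unused
# STOCKS_GOOGL dataset could be fed to gym-anytrading: gym.make forwards extra keyword
# arguments to the environment constructor, which accepts a `df`. The frame_bound and
# window_size values simply mirror the ones used above, not a recommendation.
def custom_dataframe_env_sketch():
    import gym
    import gym_anytrading  # noqa: F401  (registers 'stocks-v0')
    from gym_anytrading.datasets import STOCKS_GOOGL

    env = gym.make('stocks-v0', df=STOCKS_GOOGL, frame_bound=(50, 100), window_size=10)
    observation = env.reset()
    print(observation.shape)  # typically (window_size, number of signal features)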
if TEST:
    model = DDQN.load(log_dir + '/best_model/' + MODEL_NAME)
    env = gym.make('rl_stocks-v0')
    env._reset(actions=N_DISCRETE_ACTIONS,
               observation_space=OBSERVATION_SPACE_TEST,
               data=test_df,
               trade_amount=TRADE_AMOUNT,
               key=KEY,
               wallet=WALLET,
               window=WINDOW,
               interest_rate=INTEREST_RATE,
               log_dir=test_log_dir)

    for step in range(testing_timesteps):
        obs = env.reset()
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        # print('action=', action, 'reward=', reward, 'done=', done)

    npz = np.load(test_log_dir + '/log.npz')
    df = pd.DataFrame.from_dict({item: npz[item] for item in npz.files})
    df['State_prime'] = df['State'].shift()
    print('Test Profit Factor:',
          df['Index'].loc[(df['State_prime'] != 'Flat') & (df['Profit'] > 0)].count()
          / df['Index'].loc[(df['State_prime'] != 'Flat') & (df['Profit'] < 0)].count())
    print('Test Profit', df['Profit'].sum())
    df.to_csv(test_log_dir + '/Test_log.csv')