Example no. 1
import gym

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy


def basic_usage_example():
    # Basic Usage: Training, Saving, Loading.

    # Create environment.
    env = gym.make("LunarLander-v2")

    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass 'print_system_info=True'
    # to compare the system the model was trained on with the current one.
    #model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: If you use reward-modifying wrappers with your environment,
    # this will be reflected here. To evaluate with the original rewards,
    # wrap the environment in a Monitor wrapper before any other wrappers.
    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    # Enjoy trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
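
    # Aside (a sketch, not in the original example), illustrating the NOTE above:
    # wrap the raw environment in Monitor *before* any other wrappers so that
    # evaluate_policy reports the original, unmodified episode rewards.
    from stable_baselines3.common.monitor import Monitor

    eval_env = Monitor(gym.make("LunarLander-v2"))
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)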
Example no. 2
import os

from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.envs import IdentityEnv
from stable_baselines3.common.vec_env import VecFrameStack


def test_dqn_epsilon_greedy():
    env = IdentityEnv(2)
    model = DQN("MlpPolicy", env)
    model.exploration_rate = 1.0
    obs = env.reset()
    # The vectorized-observation check in predict() should not crash on a discrete obs.
    action, _ = model.predict(obs, deterministic=False)
    assert env.action_space.contains(action)
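    # Aside (a sketch, not in the original test): instead of overriding
    # model.exploration_rate by hand, DQN's linear epsilon schedule can be configured
    # at construction time; the values below are illustrative only.
    scheduled_model = DQN(
        "MlpPolicy",
        env,
        exploration_initial_eps=1.0,   # start fully random
        exploration_final_eps=0.05,    # epsilon floor after annealing
        exploration_fraction=0.1,      # anneal over the first 10% of total timesteps
    )
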
def run_dqn_baseline():
    env = make_atari_env('BreakoutNoFrameskip-v4', n_envs=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    tensorboard_log = os.path.join(os.path.dirname(__file__), 'runs_baseline')
    buffer_size = 100000
    num_training_steps = 1000000

    model = DQN('CnnPolicy',
                env,
                verbose=0,
                buffer_size=buffer_size,
                learning_starts=50000,
                optimize_memory_usage=False,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=num_training_steps)
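
    # Aside (a sketch, not in the original): save and evaluate the agent before the
    # endless render loop below; the file name is illustrative.
    from stable_baselines3.common.evaluation import evaluate_policy
    model.save(os.path.join(tensorboard_log, "dqn_breakout"))
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
    print(f"mean reward over 10 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")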

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example no. 4
    model.learn(total_timesteps=5_000_000,
                callback=eval_callback)  # Typically not enough
    model.save("DQN")
    #model = DQN.load("DQN", env=env)
    model = DQN.load("logs/best_model", env=env)
    #model = PPO.load("PPO_discrete", env=env)

    logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                    num_drones=ARGS.num_drones)
    obs = env.reset()
    start = time.time()
    n_trial = 0
    for i in range(ARGS.duration_sec * env.SIM_FREQ):
        if i % AGGR_PHY_STEPS == 0:  # Query the policy only on aggregated physics steps.
            action, _states = model.predict(
                obs,
                deterministic=True,
            )
            #else:
            #    action = np.array([1,0,0]) #No Turn

            #print(f"action {action}")
            #print(f"obs : {obs}")
            obs, reward, done, info = env.step(action)

        for j in range(ARGS.num_drones):
            logger.log(drone=j,
                       timestamp=i / env.SIM_FREQ,
                       state=env._getDroneStateVector(int(j)),
                       control=env._getTargetVelocityControls(int(j)))
        #if i%env.SIM_FREQ == 0:
        #env.render()
Example no. 5
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char

    if e_char == 'l':
        # RL Agent Learning pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1, learning_rate=LEARN_RATE, exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL, gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)

        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)  # NOTE: randrange(1) always returns 0; widen the range to sample other actions.
            rl_action_step(rand_action)
    else:
        pass
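
# Aside (a sketch, not part of the original handler): the commented-out check_env(...)
# calls above refer to the SB3 environment checker, typically used as below. Note that
# it steps the environment, which is why _step_count is reset right after those calls.
from stable_baselines3.common.env_checker import check_env

def validate_rl_env(env):
    # Validates the Gym API contract and the observation/action spaces.
    check_env(env, warn=True)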
Example no. 6

    eval_eps = 100
    pbar = tqdm(total=eval_eps)
    env = gym.make('Trading-v0')
    rewards = []
    baseline_diff = []
    for ep in range(eval_eps):
        done = False
        ep_reward = 0
        s = env.reset()
        while not done:
            if args.rand:
                action = env.get_random_action()
            else:
                action = model.predict(s, deterministic=args.d)[0]
            ns, r, done, info = env.step(action)
            s = ns
            if args.r:
                env.render()
            ep_reward += r
        baseline_diff.append(env.get_baseline_diff())
        rewards.append(ep_reward)
        pbar.update(1)
    pbar.close()

    agent_name = 'random' if args.rand else 'agent_{}'.format('deterministic' if args.d else 'stochastic')
    rewards = pd.DataFrame(rewards, columns=[agent_name + '_' + args.name])
    rewards.to_csv('reward_{}_{}.csv'.format(agent_name, args.name), index=False)
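
    # Aside (a sketch, not in the original): baseline_diff is collected above but never
    # persisted; the lines below mirror the rewards CSV with a hypothetical file name.
    baseline = pd.DataFrame(baseline_diff, columns=[agent_name + '_' + args.name])
    baseline.to_csv('baseline_diff_{}_{}.csv'.format(agent_name, args.name), index=False)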
Example no. 7
class TradingAgent:
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features) # hard coded to match dimensions of features
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

            self.shares = 0
            self.cash = 0
            self.obs = [] # holds up to 2 past observations, helps in keeping X, y aligned

            # for plotting
            self.pred_return = 0
            self.pred_lower = 0
            self.pred_upper = 0

            # for debugging
            self.goal_num_shares = 0

    def learn(self, n_steps):
        # when using gp, load pretrained rl agent - no need to train
        if self.use_gp:
            # train GP using fixed number of steps
            self.__train_gp(100)
        else:
            # train RL agent
            self.rl.learn(n_steps)

    def predict(self, obs, deterministic):
        action, state = self.rl.predict(obs, deterministic=deterministic)

        if self.use_gp:
            # slightly retrain
            self.__train_gp(self.retraining_iter, retrain=True)

            # predict next step returns and CI using GP
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                output = self.gp(torch.Tensor(obs[2:])[None])
                obs_pred = self.likelihood(output)
                f_mean = output.mean.detach().numpy()[0]
                self.pred_return = f_mean.item()
                f_samples = output.sample(sample_shape=torch.Size((10000,))).detach().numpy()
                lower, upper = obs_pred.confidence_region()
                self.pred_lower = lower.item()
                self.pred_upper = upper.item()

            rl_action = action
            action -= ACTION_OFFSET # convert the env's action index into the actual (signed) trade size

            # adjust trade size given prediction
            # if self.shares > 0: # long position
            if f_mean > self.gp_limit: # predict positive return over certain threshold
                tail_samples = f_samples[f_samples < lower.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else lower.item() # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.cvar_limit // ps_cvar
                else:
                    goal_num_shares = self.shares + action # positive return for long - no adjustment needed
                action = min(10, max(0, goal_num_shares - self.shares))
            elif f_mean < -self.gp_limit:
                tail_samples = f_samples[f_samples > upper.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else upper.item() # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.shares + action # negative return for short - no adjustment needed
                else:
                    goal_num_shares = self.cvar_limit // ps_cvar
                action = max(-10, min(0, goal_num_shares - self.shares))
            else:
                goal_num_shares = self.shares + action
            # print(ps_cvar, lower.item(), upper.item())

            # if not np.isnan(goal_num_shares):
            self.goal_num_shares = goal_num_shares
            # if action > 0: # buy order
            #     action = min(10, max(0, goal_num_shares - self.shares)) # restrict same size trades as original, maintain same direction
            #     # print(goal_num_shares - self.shares, action)
            # elif action < 0: # sell order
            #     action = max(-10, min(0, goal_num_shares - self.shares)) # restrict same size trades as original, maintain same direction

            action += ACTION_OFFSET # adjust for env actions being 1 to N rather than -N/2 to N/2

            # print(f_mean, ps_cvar, self.shares, goal_num_shares, rl_action-ACTION_OFFSET, action-ACTION_OFFSET)

        return action, state

    def update(self, obs, reward=None):
        self.obs.append(obs)
        self.shares, self.cash = obs[:2]
        if reward is not None:
            self.X_train = torch.cat((self.X_train, torch.Tensor(self.obs.pop(0)[2:])[None]))[1:] # self.X_train[1:]
            self.y_train = torch.cat((self.y_train, torch.Tensor([reward])))[1:]

        # print(self.X_train, self.y_train)

        self.gp.set_train_data(self.X_train, self.y_train)

    def save(self, rl_path, gp_path=None):
        self.rl.save(rl_path)
        if gp_path is not None:
            torch.save(self.gp.state_dict(), gp_path)

    def load(self, rl_path, gp_path=None):
        # NOTE: always reloads the policy as A2C, regardless of which model type was requested in __init__.
        self.rl = A2C.load(rl_path)
        if gp_path is not None:
            state_dict = torch.load(gp_path)
            self.gp.load_state_dict(state_dict)

    def __train_gp(self, n_iter, retrain=False):
        # train GP using fixed number of steps
        self.gp.train()
        self.likelihood.train()

        for i in range(n_iter):
            output = self.gp(self.X_train)
            loss = -self.mll(output, self.y_train)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()

        self.gp.eval()
        self.likelihood.eval()
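
# ExactGPModel is referenced above but not defined in this excerpt. A minimal sketch
# of the standard gpytorch exact-GP pattern it presumably follows (the mean and kernel
# choices are assumptions, not taken from the original code):
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)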
        },
        "policy_frequency": 2,
        "duration": 40,
    })
    env.reset()
    model = DQN('CnnPolicy', env,
                gamma=0.8,
                learning_rate=5e-4,
                buffer_size=40*1000,
                learning_starts=200,
                exploration_fraction=0.6,
                target_update_interval=256,
                batch_size=32,
                verbose=1,
                tensorboard_log="logs/")
    model.learn(total_timesteps=int(2e5))
    model.save("dqn_highway")

    # Record video
    model = DQN.load("dqn_highway")
    env.configure({"policy_frequency": 15, "duration": 20 * 15})
    video_length = 2 * env.config["duration"]
    # NOTE: VecVideoRecorder expects a VecEnv; if `env` is a plain Gym env at this
    # point, it needs to be wrapped (e.g. with DummyVecEnv) before this call.
    env = VecVideoRecorder(env, "videos/",
                           record_video_trigger=lambda x: x == 0, video_length=video_length,
                           name_prefix="dqn-agent")
    obs = env.reset()
    for _ in range(video_length + 1):
        action, _ = model.predict(obs)
        obs, _, _, _ = env.step(action)
    env.close()
Example no. 9
env = make_vec_env(lambda: env, n_envs=1)

model = DQN(MlpPolicy, env, verbose=1, learning_rate=1e-3)
model.learn(total_timesteps=20000, log_interval=200, n_eval_episodes=1000)
# model.save("deepq_breakout")
#
# del model # remove to demonstrate saving and loading
#
# model = DQN.load("deepq_breakout")
print("Teste")
obs = env.reset()
#while True:

#Test1
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

#Test2
print("Evaluating...")
start = time.time()
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)
end = time.time()
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
print("evaluation duration in s: ", end - start)

#Test3
print("Evaluating v2...")
start = time.time()
mean_reward = evaluate(model, env)
    # model = DQN("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    model = DQN("CnnPolicy",
                env,
                target_update_interval=1000,
                batch_size=512,
                exploration_final_eps=0.2,
                policy_kwargs=policy_kwargs,
                verbose=1)
    model.save(agentPath)

# Record gif of trained agent
imagesGrid = []
obs = env.reset()
imagesGrid.append(env.render("human"))
for step in range(200):
    action, _ = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    print("reward : ", reward)
    env.render(mode='console')
    imagesGrid.append(env.render("human"))
    if done:
        print("Goal reached!", "reward=", reward)
        break
imagesGrid[0].save('_data/visu.gif',
                   save_all=True,
                   append_images=imagesGrid[1:],
                   optimize=True,
                   duration=100,
                   loop=0)
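
# Aside (a sketch, not from the original script): env.render("human") returning PIL
# images is specific to this custom environment; a more common way to collect GIF
# frames is to request "rgb_array" output and convert it with PIL (the file name
# below is illustrative).
from PIL import Image

frames = []
obs = env.reset()
for _ in range(200):
    action, _ = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    frames.append(Image.fromarray(env.render(mode="rgb_array")))
    if done:
        break
frames[0].save('_data/visu_rgb.gif', save_all=True, append_images=frames[1:],
               duration=100, loop=0)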

from gym_anytrading.datasets import FOREX_EURUSD_1H_ASK, STOCKS_GOOGL

# env = gym.make('forex-v0', frame_bound=(50, 100), window_size=10)
env = gym.make('stocks-v0', frame_bound=(50, 100), window_size=10)

#############################AGENT############################################

from stable_baselines3 import DQN

# model = DQN.load("MyTradingAgent") # use an existing model, if available

model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=50000, log_interval=500)

# model.save("MyTradingAgent")
# del model # remove the model

##############################OBSERVATION######################################

import matplotlib.pyplot as plt

observation = env.reset()
done = False
while not done:
    action, _states = model.predict(observation)
    observation, reward, done, info = env.step(action)

print("Info:", info)
plt.cla()
env.render_all()
plt.show()
Example no. 12
    if TEST:
        model = DDQN.load(log_dir + '/best_model/' + MODEL_NAME)
        env = gym.make('rl_stocks-v0')
        env._reset(actions=N_DISCRETE_ACTIONS,
                   observation_space=OBSERVATION_SPACE_TEST,
                   data=test_df,
                   trade_amount=TRADE_AMOUNT,
                   key=KEY,
                   wallet=WALLET,
                   window=WINDOW,
                   interest_rate=INTEREST_RATE,
                   log_dir=test_log_dir)

        for step in range(testing_timesteps):
            obs = env.reset()  # NOTE: resetting every iteration means each step starts from a fresh episode
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            # print('action=', action, 'reward=', reward, 'done=', done)

        npz = np.load(test_log_dir + '/log.npz')
        df = pd.DataFrame.from_dict({item: npz[item] for item in npz.files})
        df['State_prime'] = df['State'].shift()
        print(
            'Test Profit Factor:',
            df['Index'].loc[(df['State_prime'] != 'Flat')
                            & (df['Profit'] > 0)].count() /
            df['Index'].loc[(df['State_prime'] != 'Flat')
                            & (df['Profit'] < 0)].count())
        print('Test Profit', df['Profit'].sum())
        df.to_csv(test_log_dir + '/Test_log.csv')
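        # Aside (a sketch, not in the original): the ratio printed above is a count of
        # winning vs. losing trades; the conventional profit factor is gross profit
        # divided by gross loss, which under the same column assumptions would be:
        gross_profit = df['Profit'].loc[(df['State_prime'] != 'Flat') & (df['Profit'] > 0)].sum()
        gross_loss = df['Profit'].loc[(df['State_prime'] != 'Flat') & (df['Profit'] < 0)].sum()
        print('Conventional Profit Factor:', gross_profit / abs(gross_loss))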