Example No. 1
def DQNTrain(scenario_args, observation_space_args,
             action_space_args, reward_args, data_args, almgren_chriss_args):

    EPISODES = 10000

    env = gym.make('hwenv-v0',
                   scenario_args=scenario_args,
                   observation_space_args=observation_space_args,
                   action_space_args=action_space_args,
                   reward_args=reward_args,
                   data_args=data_args,
                   almgren_chriss_args=almgren_chriss_args)

    # get size of state and action from trading_environment
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n

    agent = DQNAgent(ob_dim, ac_dim, batch_size=64, initial_exploration_steps=10000)

    scores = []
    avg_step = 10
    for eps in range(EPISODES):
        eps_rew = agent.sample_trajectory(env)
        scores.append(eps_rew)
        if eps % avg_step == 0 and eps != 0:
            avg = sum(scores[-avg_step-1:-1]) / avg_step
            print('{} episode: {}/{}, average reward: {}'.
                  format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps, EPISODES, avg))
        agent.train_model()
        if eps % 10 == 0:
            agent.update_target_model()

    plot_with_avg_std(scores, 1, xlabel=f'Number of Episodes in {1}')
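
For context, a hypothetical invocation of DQNTrain (not taken from the source): the configuration dictionaries are empty placeholders, since their keys are defined by the 'hwenv-v0' environment and not shown here.

# Hypothetical call to DQNTrain; the empty dictionaries are placeholders, not
# configurations defined by the project.
if __name__ == '__main__':
    DQNTrain(scenario_args={},
             observation_space_args={},
             action_space_args={},
             reward_args={},
             data_args={},
             almgren_chriss_args={})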
Example No. 2
def DRQNTrain(scenario_args, observation_space_args, action_space_args,
              reward_args, data_args, almgren_chriss_args, double):

    EPISODES = 30000

    env = gym.make('hwenv-v0',
                   scenario_args=scenario_args,
                   observation_space_args=observation_space_args,
                   action_space_args=action_space_args,
                   reward_args=reward_args,
                   data_args=data_args,
                   almgren_chriss_args=almgren_chriss_args)

    # get size of state and action from trading_environment
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n

    agent = DRQNAgent(ob_dim,
                      ac_dim,
                      lookback=30,
                      batch_size=256,
                      initial_exploration_steps=1000,
                      double=double)

    scores = []
    avgs = []
    avg_step = 100
    for eps in range(EPISODES):
        eps_rew = agent.sample_trajectory(env)
        scores.append(eps_rew)
        if eps % avg_step == 0 and eps != 0:
            avg = sum(scores[-avg_step - 1:-1]) / avg_step
            avgs.append(avg)
            print('{} episode: {}/{}, average reward: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps,
                EPISODES, avg))
            # env.render()
        agent.train_model()
        if eps % 5 == 0:
            agent.update_target_model()

    agent.target_model.save('model.h5')
    print('Saved model to disk.')

    plot_with_avg_std(avgs, 10)
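
Because DRQNTrain writes the target network to 'model.h5', the trained model can later be reloaded for evaluation; a minimal sketch (if the network uses custom layers, pass them to load_model via custom_objects):

# Reload the target network saved by DRQNTrain above.
import keras
model = keras.models.load_model('model.h5')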
Example No. 3
def A2CTrain(scenario_args, observation_space_args, action_space_args,
             reward_args, data_args, almgren_chriss_args):
    """
    Train the A2CAgent by sampling trajectories from the trading_environment.
    """
    N_ITERATION = 200

    # Initialize the gym trading_environment.
    env = gym.make('hwenv-v0',
                   scenario_args=scenario_args,
                   observation_space_args=observation_space_args,
                   action_space_args=action_space_args,
                   reward_args=reward_args,
                   data_args=data_args,
                   almgren_chriss_args=almgren_chriss_args)

    # Initialize the A2CAgent.
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n
    agent = A2CAgent(ob_dim, ac_dim)

    # Run the iterations by repeatedly sampling trajectories and updating the neural network parameters.
    avg_rews = []
    for itr in range(N_ITERATION):
        # Sample trajectories
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch, avg_rew, avg_info = agent.sample_trajectories(
            itr, env, info_name='shortfall')
        avg_rews.append(avg_rew)
        print("Total rewards per trajectory in this iteration: ", avg_rew)
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        # Update the critic model and the actor model
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

    # Visualize the training results.
    plot_with_avg_std(avg_rews, 1, xlabel=f'Number of Episodes in {1}')
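
For reference, a sketch of the one-step TD advantage that estimate_advantage is assumed to compute from the batch arrays above; the project's actual implementation may differ (e.g. it may normalize the advantages), and gamma=0.99 is a typical default rather than a value taken from the source.

def estimate_advantage_sketch(v_ob, v_next_ob, re_n, terminal_n, gamma=0.99):
    # One-step TD advantage: A(s, a) = r + gamma * V(s') * (1 - done) - V(s),
    # where v_ob and v_next_ob are the critic's value predictions for the
    # ob_no and next_ob_no arrays built in the loop above.
    return re_n + gamma * v_next_ob * (1.0 - terminal_n) - v_ob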
Example No. 4
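    # Fragment from an evaluation script: EPISODES and the imports it relies on
    # (gym, keras, datetime, DRQN_Cartpole_Agent, LinearSchedule, plot_with_avg_std)
    # are defined in the enclosing code, which this excerpt omits.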
    env = gym.make('CartPole-v1')
    # get size of state and action from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    model = keras.models.load_model(
        '/Users/mmw/Documents/GitHub/rl_for_optimal_exec/drqn_cartpole/drqn_cartpole_v0_10000_eps.h5'
    )
    agent = DRQN_Cartpole_Agent(state_size,
                                action_size,
                                lookback=5,
                                initial_exploration_eps=0,
                                exploration=LinearSchedule(1, 0, initial_p=0),
                                model=model)

    scores, episodes = [], []
    avg_step = 1
    for eps in range(EPISODES):
        eps_rew = agent.sample_transition_pairs(env,
                                                render=(eps % avg_step == 0),
                                                max_step=500)
        scores.append(eps_rew)
        if eps % avg_step == 0:
            avg = sum(scores[-avg_step - 1:-1]) / avg_step
            print('{} episode: {}/{}, average reward: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps,
                EPISODES, avg))
        env.reset()

    plot_with_avg_std(scores, 1, xlabel=f'Number of Episodes in {1}')
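
In the snippet above, exploration=LinearSchedule(1, 0, initial_p=0) pins the exploration rate at zero so the pretrained network is evaluated greedily. A minimal sketch of a baselines-style linear schedule consistent with that call signature (the project's own LinearSchedule may differ):

class LinearScheduleSketch:
    # Interpolates linearly from initial_p to final_p over schedule_timesteps
    # steps, then stays at final_p.
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)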
Example No. 5
import gym
import keras
from cartpole_agents.a2c_rnn_cartpole import ACRnnAgent
from tools.plot_tool import plot_with_avg_std

env = gym.make('CartPole-v1')

discrete = isinstance(env.action_space, gym.spaces.Discrete)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

agent = ACRnnAgent(ob_dim, ac_dim, batch_size=100)
agent.actor_model = keras.models.load_model('dra2c_cartpole_actor_70_itr.h5')
agent.critic_model = keras.models.load_model('dra2c_cartpole_critic_70_itr.h5')

avg_rews = []
n_iter = 100
total_timesteps = 0
for itr in range(n_iter):
    print("********** Iteration %i ************" % itr)
    ob_seq, next_ob_seq, ac_na, re_n, terminal_n, avg_rew = agent.sample_trajectories(
        env, render=True, animate_eps_frequency=1)
    avg_rews.append(avg_rew)
    print(avg_rew)

plot_with_avg_std(avg_rews, 1, xlabel='Number of Iterations')
Example No. 6
import gym
import keras
from cartpole_agents.a2c_cartpole.ac_agent import ACAgent
from tools.plot_tool import plot_with_avg_std
env = gym.make('CartPole-v1')

discrete = isinstance(env.action_space, gym.spaces.Discrete)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

agent = ACAgent(ob_dim, ac_dim)
agent.actor_model = keras.models.load_model('a2c_cartpole_actor_50_eps.h5')
agent.critic_model = keras.models.load_model('a2c_cartpole_critic_50_eps.h5')

# # build computation graph
# agent.build_computation_graph()
#
# # tensorflow: config, session, variable initialization
# agent.init_tf_sess()

avg_rews = []
n_iter = 100
total_timesteps = 0
for itr in range(n_iter):
    print("********** Iteration %i ************" % itr)
    paths, timesteps_this_batch, avg_rew = agent.sample_trajectories(
        env, render=True, animate_eps_frequency=1)
    avg_rews.append(avg_rew)
    print(avg_rew)
    total_timesteps += timesteps_this_batch

plot_with_avg_std(avg_rews, 1, xlabel=f'Number of Episodes in {1}')
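
    # Separate fragment: a DRQN CartPole training loop. EPISODES, env, state_size
    # and action_size are defined in the enclosing code, which this excerpt omits.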
    agent = DRQN_Cartpole_Agent(state_size,
                                action_size,
                                lookback=5,
                                batch_size=64,
                                initial_exploration_eps=1000,
                                buffer_size=int(2e5))

    scores, episodes = [], []
    avg_step = 100
    for eps in range(EPISODES):
        eps_rew = agent.sample_transition_pairs(env,
                                                render=(eps % avg_step == 0),
                                                max_step=500)
        scores.append(eps_rew)
        if eps % avg_step == 0 and eps != 0:
            avg = sum(scores[-avg_step - 1:-1]) / avg_step
            print('{} episode: {}/{}, average reward: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps,
                EPISODES, avg))
        agent.train_model()
        if eps % 1 == 0:
            agent.update_target_model()
        if eps % 100 == 0:
            env.render()
        env.reset()
        if eps % 10000 == 0:
            agent.model.save(f'drqn_cartpole_tanh_v1_{int(eps)}_eps.h5')

    plot_with_avg_std(scores, 500, xlabel=f'Number of Episodes in {500}')