Example #1
    def __call__(self, epoch_log):
        agent = epoch_log['agent']

        if agent.step % self.loss_freq == 0:
            loss = epoch_log['loss']
            self.td_loss_history.append(loss)

        if agent.step % self.eval_freq == 0:
            buffer_length = epoch_log['buffer_length']
            n_lives = epoch_log['n_lives']
            self.mean_rw_history.append(
                evaluate(make_env(seed=agent.step, clip_rewards=False),
                         agent,
                         n_games=n_lives,
                         greedy=True) * 5)

            clear_output(True)
            print("buffer size = %i, epsilon = %.5f" %
                  (buffer_length, agent.epsilon))

            plt.figure(figsize=[16, 5])

            plt.subplot(1, 2, 1)
            plt.title("Mean reward")
            plt.plot(self.mean_rw_history)
            plt.grid()

            plt.subplot(1, 2, 2)
            plt.title("TD loss history (smoothened)")
            plt.plot(smoothen(self.td_loss_history))
            plt.grid()

            # render the updated figures inside the training loop
            plt.show()
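Examples #1, #3 and #5 all call an evaluate(env, agent, ...) helper that is not part of these listings. As a reference, here is a minimal sketch of what such a helper typically does, returning the mean total reward over the played games; the get_qvalues / sample_actions method names are assumptions, and the rendering variant used in Example #3 would additionally collect frames:

import numpy as np

def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):
    # Play n_games full games and return the mean total reward per game.
    rewards = []
    for _ in range(n_games):
        s = env.reset()
        total_reward = 0
        for _ in range(t_max):
            qvalues = agent.get_qvalues([s])                # assumed agent API
            if greedy:
                action = qvalues.argmax(axis=-1)[0]
            else:
                action = agent.sample_actions(qvalues)[0]   # assumed agent API
            s, r, done, _ = env.step(action)
            total_reward += r
            if done:
                break
        rewards.append(total_reward)
    return np.mean(rewards)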
Example #2
def makeAgent():
    env = atari_wrappers.make_env(ENV_NAME)

    net = DCQN(env.observation_space.shape, env.action_space.n)
    net.buildLearningTensors(LEARNING_RATE)

    replayBuffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)

    return Agent(env, net, replayBuffer)
Example #3
    def __call__(self, epoch_log):
        agent = epoch_log['agent']
        if agent.step % self.freq == 0:
            n_lives = epoch_log['n_lives']
            rewards, frames = evaluate(make_env(seed=agent.step,
                                                clip_rewards=False),
                                       agent,
                                       n_games=n_lives,
                                       greedy=True,
                                       render=True)
            total_reward = rewards * n_lives
            frames[0].save(
                self.dir +
                '/step={},reward={:.0f}.gif'.format(agent.step, total_reward),
                save_all=True,
                append_images=frames[1:],
                duration=30)
Example #4
def main(cfg: omegaconf.DictConfig):

	# create the environment
	env = atari_wrappers.make_env(cfg.exp.env)
	env = gym.wrappers.Monitor(env, "recording/", force=True)
	obs = env.reset()

	# TensorBoard
	writer = SummaryWriter()
	writer.add_hparams(flatten_dict(cfg), {})
	logger.info(f'Hyperparams: {cfg}')

	# create the agent
	agent = DQNAgent(env, device=cfg.train.device, summary_writer=writer, cfg=cfg)

	n_games = 0
	max_mean_40_reward = -sys.maxsize

	# Play up to cfg.train.max_episodes games
	while n_games < cfg.train.max_episodes:
		# select an action with the eps-greedy policy
		action = agent.act_eps_greedy(obs)

		# one step on the environment
		new_obs, reward, done, _ = env.step(action)

		# add the environment feedback to the agent
		agent.add_env_feedback(obs, action, new_obs, reward, done)

		# sample a batch and optimize; NB: the agent may wait until it has enough stored transitions
		agent.sample_and_optimize(cfg.train.batch_size)

		obs = new_obs
		if done:
			n_games += 1
			agent.print_info()
			agent.reset_stats()
			obs = env.reset()
			if agent.rewards:
				current_mean_40_reward = np.mean(agent.rewards[-40:])
				# checkpoint whenever the running 40-episode mean improves
				if current_mean_40_reward > max_mean_40_reward:
					max_mean_40_reward = current_mean_40_reward
					agent.save_model(cfg.train.best_checkpoint)
	writer.close()
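The call writer.add_hparams(flatten_dict(cfg), {}) relies on a flatten_dict helper that is not shown here; TensorBoard's add_hparams only accepts a flat dictionary of scalar values. A minimal sketch of such a helper, assuming the config is an omegaconf DictConfig (the project's actual implementation may differ, and non-scalar leaves may still need to be cast to strings):

import omegaconf

def flatten_dict(cfg, parent_key='', sep='.'):
    # Flatten a nested config into a flat dict such as {'train.device': 'cuda'}.
    if isinstance(cfg, omegaconf.DictConfig):
        cfg = omegaconf.OmegaConf.to_container(cfg, resolve=True)
    items = {}
    for key, value in cfg.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
        if isinstance(value, dict):
            items.update(flatten_dict(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items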
Example #5
def train(env, agent, target_network, exp_replay, loss_func, device, lr=1e-4, 
          total_steps=3 * 10**6, verbose_steps=3 * 10 ** 5, batch_size=32,
          decay_steps=1 * 10**6, init_epsilon=1.0, final_epsilon=0.1, timesteps_per_epoch=1,
          max_grad_norm=50, loss_freq=50, refresh_target_network_freq=5000, eval_freq=5000):
    stop_evaluation = False
    
    mean_rw_history = []
    td_loss_history = []
    grad_norm_history = []
    initial_state_v_history = []

    opt = torch.optim.Adam(agent.parameters(), lr=lr)
    
    state = env.reset()
    for step in trange(total_steps + 1):
        agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)

        # play
        _, state = utils.play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

        # train
        states, actions, rewards, next_states, is_done = exp_replay.sample(batch_size)
        loss = loss_func(states, actions, rewards, next_states, is_done,
                         agent, target_network, device)

        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        opt.step()
        opt.zero_grad()

        if step % loss_freq == 0:
            td_loss_history.append(loss.data.cpu().item())
            # clip_grad_norm_ may return a tensor; store a plain float for plotting
            grad_norm_history.append(float(grad_norm))

        if step % refresh_target_network_freq == 0:
            # Load agent weights into target_network
            #target_network.parameters() = agent.parameters()
            target_network.load_state_dict(agent.state_dict())

        if step == verbose_steps:
            print("Stopping plotting to reduce training time.")
            stop_evaluation = True
        if (step % eval_freq == 0):
            # eval the agent
            mean_rw_history.append(utils.evaluate(
                make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000)
            )
            initial_state_q_values = agent.get_qvalues(
                [make_env(seed=step).reset()]
            )
            initial_state_v_history.append(np.max(initial_state_q_values))
            if not stop_evaluation:
                clear_output(True)
                print("buffer size = %i, epsilon = %.5f" %
                    (len(exp_replay), agent.epsilon))

                plt.figure(figsize=[16, 9])
                plt.subplot(2, 2, 1)
                plt.title("Mean reward per episode")
                plt.plot(mean_rw_history)
                plt.grid()

                assert not np.isnan(td_loss_history[-1])
                plt.subplot(2, 2, 2)
                plt.title("TD loss history (smoothened)")
                plt.plot(utils.smoothen(td_loss_history))
                plt.grid()

                plt.subplot(2, 2, 3)
                plt.title("Initial state V")
                plt.plot(initial_state_v_history)
                plt.grid()

                plt.subplot(2, 2, 4)
                plt.title("Grad norm history (smoothened)")
                plt.plot(utils.smoothen(grad_norm_history))
                plt.grid()

                plt.show()

    return {'reward_history': mean_rw_history, 
            'td_loss_history': td_loss_history, 
            'grad_norm_history': grad_norm_history,
            'initial_state_v_history': initial_state_v_history}
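The training loop above uses utils.linear_decay and utils.smoothen, which are not part of the listing. As a rough reference, helpers of this kind typically look like the following sketch (the actual implementations in the course utils may differ):

import numpy as np

def linear_decay(init_val, final_val, cur_step, total_steps):
    # Linearly anneal from init_val to final_val over total_steps, then stay at final_val.
    if cur_step >= total_steps:
        return final_val
    return (init_val * (total_steps - cur_step) + final_val * cur_step) / total_steps

def smoothen(values, window=100):
    # Moving average used only to make the loss / grad-norm curves readable.
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')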
Example #6
def DQN_with_variations(env_name,
                        extensions_hyp,
                        hidden_sizes=[32],
                        lr=1e-2,
                        num_epochs=2000,
                        buffer_size=100000,
                        discount=0.99,
                        render_cycle=100,
                        update_target_net=1000,
                        batch_size=64,
                        update_freq=4,
                        frames_num=2,
                        min_buffer_size=5000,
                        tost_frequency=20,
                        start_explor=1,
                        end_explor=0.1,
                        explor_steps=100000):

    # Create the environments, one for training and one for evaluation (tost)
    env = make_env(env_name,
                   frames_num=frames_num,
                   skip_frames=True,
                   noop_num=20)
    env_tost = make_env(env_name,
                        frames_num=frames_num,
                        skip_frames=True,
                        noop_num=20)
    # Add a monitor to the tost env to store the videos
    env_tost = gym.wrappers.Monitor(env_tost,
                                    "VIDEOS/tost_VIDEOS" + env_name +
                                    str(current_milli_time()),
                                    force=True,
                                    video_callable=lambda x: x % 20 == 0)

    tf.compat.v1.reset_default_graph()

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n

    # Create all the placeholders
    obs_ph = tf.compat.v1.placeholder(shape=(None, obs_dim[0], obs_dim[1],
                                             obs_dim[2]),
                                      dtype=tf.float32,
                                      name='obs')
    act_ph = tf.compat.v1.placeholder(shape=(None, ),
                                      dtype=tf.int32,
                                      name='act')
    y_ph = tf.compat.v1.placeholder(shape=(None, ), dtype=tf.float32, name='y')

    # Create the target network
    with tf.compat.v1.variable_scope('target_network'):
        if extensions_hyp['dueling']:
            target_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim)
        else:
            target_qv = qnet(obs_ph, hidden_sizes, act_dim)
    target_vars = tf.compat.v1.trainable_variables()

    # Create the online network (i.e. the behavior policy)
    with tf.compat.v1.variable_scope('online_network'):
        if extensions_hyp['dueling']:
            online_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim)
        else:
            online_qv = qnet(obs_ph, hidden_sizes, act_dim)
    train_vars = tf.compat.v1.trainable_variables()

    # Update the target network by assigning to it the variables of the online network
    # Note that the target network and the online network have the same exact architecture
    update_target = [
        train_vars[i].assign(train_vars[i + len(target_vars)])
        for i in range(len(train_vars) - len(target_vars))
    ]
    update_target_op = tf.group(*update_target)

    # One hot encoding of the action
    act_onehot = tf.one_hot(act_ph, depth=act_dim)
    # Keep only the Q-values of the actions that were actually taken
    q_values = tf.reduce_sum(input_tensor=act_onehot * online_qv, axis=1)

    # MSE loss function
    v_loss = tf.reduce_mean(input_tensor=(y_ph - q_values)**2)
    # Adam optimizer that minimizes the loss v_loss
    v_opt = tf.compat.v1.train.AdamOptimizer(lr).minimize(v_loss)

    def agent_op(o):
        '''
        Forward pass through the online network to obtain the Q-values for a single observation
        '''
        # Scale the frames
        o = scale_frames(o)
        return sess.run(online_qv, feed_dict={obs_ph: [o]})

    # Time
    now = datetime.now()
    clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute,
                                      int(now.second))
    print('Time:', clock_time)

    mr_v = tf.Variable(0.0)
    ml_v = tf.Variable(0.0)

    # TensorBoard summaries
    tf.compat.v1.summary.scalar('v_loss', v_loss)
    tf.compat.v1.summary.scalar('Q-value',
                                tf.reduce_mean(input_tensor=q_values))
    tf.compat.v1.summary.histogram('Q-values', q_values)

    scalar_summary = tf.compat.v1.summary.merge_all()
    reward_summary = tf.compat.v1.summary.scalar('tost_rew', mr_v)
    mean_loss_summary = tf.compat.v1.summary.scalar('mean_loss', ml_v)

    LOG_DIR = 'log_dir/' + env_name
    hyp_str = "-lr_{}-upTN_{}-upF_{}-frms_{}-ddqn_{}-duel_{}-nstep_{}" \
                .format(lr, update_target_net, update_freq, frames_num, extensions_hyp['DDQN'], extensions_hyp['dueling'], extensions_hyp['multi_step'])

    # initialize the File Writer for writing TensorBoard summaries
    file_writer = tf.compat.v1.summary.FileWriter(
        LOG_DIR + '/DQN_' + clock_time + '_' + hyp_str,
        tf.compat.v1.get_default_graph())

    # open a session
    sess = tf.compat.v1.Session()
    # and initialize all the variables
    sess.run(tf.compat.v1.global_variables_initializer())

    render_the_game = False
    step_count = 0
    last_update_loss = []
    ep_time = current_milli_time()
    batch_rew = []
    old_step_count = 0

    obs = env.reset()

    # Initialize the experience buffer
    #buffer = ExperienceBuffer(buffer_size)
    buffer = MultiStepExperienceBuffer(buffer_size,
                                       extensions_hyp['multi_step'], discount)

    # Copy the online network in the target network
    sess.run(update_target_op)

    ########## EXPLORATION INITIALIZATION ######
    eps = start_explor
    eps_decay = (start_explor - end_explor) / explor_steps

    for ep in range(num_epochs):
        g_rew = 0
        done = False

        # Run until the episode ends..
        while not done:

            # Epsilon decay
            if eps > end_explor:
                eps -= eps_decay

            # Choose an eps-greedy action
            act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps)

            # execute the action in the environment
            obs2, rew, done, _ = env.step(act)

            # Render the game if you want to
            if render_the_game:
                env.render()

            # Add the transition to the replay buffer
            buffer.add(obs, rew, act, obs2, done)

            obs = obs2
            g_rew += rew
            step_count += 1

            ################ TRAINING ###############
            # If it's time to train the network:
            if len(buffer) > min_buffer_size and (step_count % update_freq
                                                  == 0):

                # sample a minibatch from the buffer
                mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(
                    batch_size)

                if extensions_hyp['DDQN']:
                    mb_onl_qv, mb_trg_qv = sess.run(
                        [online_qv, target_qv], feed_dict={obs_ph: mb_obs2})
                    y_r = double_q_target_values(mb_rew, mb_done, mb_trg_qv,
                                                 mb_onl_qv, discount)
                else:
                    mb_trg_qv = sess.run(target_qv,
                                         feed_dict={obs_ph: mb_obs2})
                    y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount)

                # optimize, compute the loss and return the TB summary
                train_summary, train_loss, _ = sess.run(
                    [scalar_summary, v_loss, v_opt],
                    feed_dict={
                        obs_ph: mb_obs,
                        y_ph: y_r,
                        act_ph: mb_act
                    })

                # Add the train summary to the file_writer
                file_writer.add_summary(train_summary, step_count)
                last_update_loss.append(train_loss)

            # Every update_target_net steps, update the target network
            if (len(buffer) > min_buffer_size) and (step_count %
                                                    update_target_net == 0):

                # run the session to update the target network and get the mean loss summary
                _, train_summary = sess.run(
                    [update_target_op, mean_loss_summary],
                    feed_dict={ml_v: np.mean(last_update_loss)})
                file_writer.add_summary(train_summary, step_count)
                last_update_loss = []

            # If the episode has ended, reset the environment and the episode statistics
            if done:
                obs = env.reset()
                batch_rew.append(g_rew)
                g_rew, render_the_game = 0, False

        # Every tost_frequency episodes, evaluate (tost) the agent and write some stats to TensorBoard
        if ep % tost_frequency == 0:
            # tost the agent on 10 games
            tost_rw = tost_agent(env_tost, agent_op, num_games=10)

            # Run the tost stats and add them to the file_writer
            tost_summary = sess.run(reward_summary,
                                    feed_dict={mr_v: np.mean(tost_rw)})
            file_writer.add_summary(tost_summary, step_count)

            # Print some useful stats
            ep_sec_time = int((current_milli_time() - ep_time) / 1000)
            print(
                'Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- tost:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d'
                % (ep, np.mean(batch_rew), eps, step_count, np.mean(tost_rw),
                   np.std(tost_rw), ep_sec_time,
                   (step_count - old_step_count) / tost_frequency))

            ep_time = current_milli_time()
            batch_rew = []
            old_step_count = step_count

        if ep % render_cycle == 0:
            render_the_game = True

    file_writer.close()
    env.close()
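The training step above calls q_target_values and double_q_target_values, which are defined elsewhere in the repository. As a reference for what these targets compute, here is a minimal NumPy sketch of the standard DQN and Double DQN targets (the actual helpers may differ; e.g. the multi-step variant raises the discount to the n-th power):

import numpy as np

def q_target_values(rews, dones, target_qv, discount):
    # Standard DQN target: r + gamma * max_a Q_target(s', a), zeroed on terminal steps.
    rews = np.asarray(rews, dtype=np.float32)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    return rews + not_done * discount * np.max(target_qv, axis=1)

def double_q_target_values(rews, dones, target_qv, online_qv, discount):
    # Double DQN target: the online network picks the action, the target network evaluates it.
    rews = np.asarray(rews, dtype=np.float32)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    best_actions = np.argmax(online_qv, axis=1)
    chosen_q = np.asarray(target_qv)[np.arange(len(best_actions)), best_actions]
    return rews + not_done * discount * chosen_q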
Example #7
ENV_NAME = "PongNoFrameskip-v4"
RECORD = True
MAX_GAMES = 500
DEVICE = 'cuda'
BATCH_SIZE = 32

# For TensorBoard
SUMMARY_WRITER = True
LOG_DIR = 'content/runs'
name = 'DQN Multi-step=%d,Double=%r,Dueling=%r' % (DQN_HYPERPARAMS['multi_step'], DQN_HYPERPARAMS['double_dqn'], DQN_HYPERPARAMS['dueling'])
# For Telegram
TG_BOT = True

# ------------------------Create environment and agent-------------------------
env = atari_wrappers.make_env("PongNoFrameskip-v4")  # gym.make("PongNoFrameskip-v4")
# For recording a few selected episodes; 'force' means overwriting earlier recordings
if RECORD:
    env = gym.wrappers.Monitor(env, "main-" + ENV_NAME, force=True)
obs = env.reset()
# Create TensorBoard writer that will create graphs
writer = SummaryWriter(log_dir=LOG_DIR + '/' + name + str(time.time())) if SUMMARY_WRITER else None
# Create agent that will learn
agent = Agent(env, hyperparameters=DQN_HYPERPARAMS, device=DEVICE, writer=writer, max_games=MAX_GAMES, tg_bot=TG_BOT)
# --------------------------------Learning-------------------------------------
num_games = 0
while num_games < MAX_GAMES:
    # Select an action with the eps-greedy policy
    action = agent.select_eps_greedy_action(obs)
    # Take that action and observe s', r and done
    new_obs, reward, done, _ = env.step(action)
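    # NOTE: the snippet ends with the env.step() call above. A sketch of how such a
    # loop typically continues, following the very similar loops in Examples #4 and
    # #9 (the exact method names on this Agent class are assumptions):
    agent.add_env_feedback(obs, action, new_obs, reward, done)  # assumed: store the transition
    agent.sample_and_optimize(BATCH_SIZE)                       # assumed: one optimization step

    obs = new_obs
    if done:
        num_games += 1
        agent.print_info()    # assumed: log the per-episode stats
        agent.reset_stats()
        obs = env.reset()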
Example #8
MAX_N_GAMES = int(hyperparameters['MAX_N_GAMES'])
TEST_FREQUENCY = int(hyperparameters['TEST_FREQUENCY'])

ENV_NAME = "PongNoFrameskip-v4"
SAVE_VIDEO = True
DEVICE = hyperparameters['DEVICE']  # 'cuda' or 'cpu'
SUMMARY_WRITER = True

LOG_DIR = hyperparameters['path']  # 'content/runs' '/opt/ml/model/'
name = '_'.join([str(k) + '.' + str(v) for k, v in DQN_HYPERPARAMS.items()])
name = 'prv'

if __name__ == '__main__':

    # create the environment
    env = atari_wrappers.make_env(ENV_NAME)
    if SAVE_VIDEO:
        # save the video of the games
        env = gym.wrappers.Monitor(env,
                                   LOG_DIR + "/main-" + ENV_NAME,
                                   force=True)

    # starting environment state
    obs = env.reset()

    # TensorBoard
    writer = SummaryWriter(log_dir=LOG_DIR + '/' + name +
                           str(time.time())) if SUMMARY_WRITER else None

    # create the agent
    agent = DQNAgent(env,
                     DQN_HYPERPARAMS,
                     DEVICE,
                     # remaining arguments assumed to match the DQNAgent call in Example #9
                     summary_writer=writer)
Example #9
def main():

    args = parse_args()

    # Overwrite default values
    DQN_HYPERPARAMS['epsilon_final'] = args.eps
    DQN_HYPERPARAMS['double_DQN'] = args.ddqn

    # create the environment
    # env = atari_wrappers.make_env(ENV_NAME)
    env = atari_wrappers.make_env(args.env_name)

    # Create run name with environment name and timestamp of launch
    # (and optional tag)
    run_name = args.env_name
    if args.tag != "":
        run_name += f"_{args.tag}"
    run_name += "_run_" + datetime.now().strftime("%Y%m%d_%H%M")

    if SAVE_VIDEO:
        # save the video of the games
        # env = gym.wrappers.Monitor(env, "main-"+args.env_name, force=True)
        # Save every 50th episode
        env = gym.wrappers.Monitor(
            env,
            "videos/" + args.env_name + "/run_" +
            datetime.now().strftime("%Y%m%d_%H%M"),  # noqa
            video_callable=lambda episode_id: episode_id % 50 == 0)

    # TensorBoard
    writer = SummaryWriter(log_dir=LOG_DIR+'/'+run_name) \
        if SUMMARY_WRITER else None

    print('Hyperparams:', DQN_HYPERPARAMS)

    # create the agent
    agent = DQNAgent(env, DQN_HYPERPARAMS, DEVICE, summary_writer=writer)

    n_games = 0
    # n_iter = 0

    # Play MAX_N_GAMES games
    while n_games < MAX_N_GAMES:

        obs = env.reset()
        done = False

        while not done:

            # select an action with the eps-greedy policy
            action = agent.act_eps_greedy(obs)

            # one step on the environment
            new_obs, reward, done, _ = env.step(action)

            # add the environment feedback to the agent
            agent.add_env_feedback(obs, action, new_obs, reward, done)

            # sample a batch and optimize; NB: the agent may wait until it has
            # enough stored transitions
            agent.sample_and_optimize(BATCH_SIZE)

            obs = new_obs

        n_games += 1

        # print info about the agent and reset the stats
        agent.print_info()
        agent.reset_stats()

        # if n_games % TEST_FREQUENCY == 0:
        # print('Test mean:', utils.test_game(env, agent, 1))

    writer.close()
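parse_args() is not shown in this example. Given the attributes used above (args.eps, args.ddqn, args.env_name, args.tag), a minimal argparse sketch could look like the following; the flag names and defaults are assumptions:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='DQN training options')
    parser.add_argument('--env_name', type=str, default='PongNoFrameskip-v4',
                        help='Gym Atari environment id')
    parser.add_argument('--eps', type=float, default=0.02,
                        help='final epsilon of the eps-greedy exploration schedule')
    parser.add_argument('--ddqn', action='store_true',
                        help='enable Double DQN targets')
    parser.add_argument('--tag', type=str, default='',
                        help='optional tag appended to the run name')
    return parser.parse_args()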