def __call__(self, epoch_log):
    agent = epoch_log['agent']
    if agent.step % self.loss_freq == 0:
        loss = epoch_log['loss']
        self.td_loss_history.append(loss)
    if agent.step % self.eval_freq == 0:
        buffer_length = epoch_log['buffer_length']
        n_lives = epoch_log['n_lives']
        self.mean_rw_history.append(
            evaluate(make_env(seed=agent.step, clip_rewards=False),
                     agent, n_games=n_lives, greedy=True) * 5)

        clear_output(True)
        print("buffer size = %i, epsilon = %.5f" %
              (buffer_length, agent.epsilon))

        plt.figure(figsize=[16, 5])

        plt.subplot(1, 2, 1)
        plt.title("Mean reward")
        plt.plot(self.mean_rw_history)
        plt.grid()

        plt.subplot(1, 2, 2)
        plt.title("TD loss history (smoothened)")
        plt.plot(smoothen(self.td_loss_history))
        plt.grid()
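# NOTE: `smoothen` is used by the plotting code above but is not defined in this
# snippet. A minimal sketch, assuming a simple running-mean filter over the
# recorded losses (the helper in the original codebase may differ):
import numpy as np

def smoothen(values, window=100):
    """Return a running mean of `values` over the given window size."""
    values = np.asarray(values, dtype=np.float64)
    if values.size < 2:
        return values
    window = min(window, values.size)
    kernel = np.ones(window) / window
    # 'valid' keeps only positions fully covered by the averaging window
    return np.convolve(values, kernel, mode='valid')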
def makeAgent():
    env = atari_wrappers.make_env(ENV_NAME)
    net = DCQN(env.observation_space.shape, env.action_space.n)
    net.buildLearningTensors(LEARNING_RATE)
    replayBuffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)
    return Agent(env, net, replayBuffer)
def __call__(self, epoch_log):
    agent = epoch_log['agent']
    if agent.step % self.freq == 0:
        n_lives = epoch_log['n_lives']
        rewards, frames = evaluate(
            make_env(seed=agent.step, clip_rewards=False),
            agent, n_games=n_lives, greedy=True, render=True)
        total_reward = rewards * n_lives
        frames[0].save(
            self.dir + '/step={},reward={:.0f}.gif'.format(agent.step, total_reward),
            save_all=True, append_images=frames[1:], duration=30)
def main(cfg: omegaconf.DictConfig):
    # create the environment
    env = atari_wrappers.make_env(cfg.exp.env)
    env = gym.wrappers.Monitor(env, "recording/", force=True)
    obs = env.reset()

    # TensorBoard
    writer = SummaryWriter()
    writer.add_hparams(flatten_dict(cfg), {})
    logger.info('Hyperparams: %s', cfg)

    # create the agent
    agent = DQNAgent(env, device=cfg.train.device, summary_writer=writer, cfg=cfg)

    n_games = 0
    max_mean_40_reward = -sys.maxsize

    # Play cfg.train.max_episodes games
    while n_games < cfg.train.max_episodes:
        # act epsilon-greedily
        action = agent.act_eps_greedy(obs)

        # one step on the environment
        new_obs, reward, done, _ = env.step(action)

        # add the environment feedback to the agent
        agent.add_env_feedback(obs, action, new_obs, reward, done)

        # sample and optimize (NB: the agent may wait until it has enough memories)
        agent.sample_and_optimize(cfg.train.batch_size)

        obs = new_obs

        if done:
            n_games += 1
            agent.print_info()
            agent.reset_stats()
            obs = env.reset()

            # keep the checkpoint with the best mean reward over the last 40 episodes
            if agent.rewards:
                current_mean_40_reward = np.mean(agent.rewards[-40:])
                if current_mean_40_reward > max_mean_40_reward:
                    max_mean_40_reward = current_mean_40_reward
                    agent.save_model(cfg.train.best_checkpoint)

    writer.close()
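# NOTE: `flatten_dict` is called above but not shown in this snippet.
# `SummaryWriter.add_hparams` expects a flat dict of scalar values, so a minimal
# sketch that flattens a nested OmegaConf config into 'train.lr'-style keys
# might look like this (an assumption, not the original helper):
import omegaconf

def flatten_dict(cfg, parent_key='', sep='.'):
    """Flatten a (possibly nested) config into {'section.key': value} pairs."""
    if isinstance(cfg, omegaconf.DictConfig):
        cfg = omegaconf.OmegaConf.to_container(cfg, resolve=True)
    items = {}
    for key, value in cfg.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
        if isinstance(value, dict):
            items.update(flatten_dict(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items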
def train(env, agent, target_network, exp_replay, loss_func, device,
          lr=1e-4, total_steps=3 * 10**6, verbose_steps=3 * 10**5,
          batch_size=32, decay_steps=1 * 10**6,
          init_epsilon=1.0, final_epsilon=0.1,
          timesteps_per_epoch=1, max_grad_norm=50,
          loss_freq=50, refresh_target_network_freq=5000, eval_freq=5000):
    stop_evaluation = False

    mean_rw_history = []
    td_loss_history = []
    grad_norm_history = []
    initial_state_v_history = []

    opt = torch.optim.Adam(agent.parameters(), lr=lr)
    state = env.reset()

    for step in trange(total_steps + 1):
        agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)

        # play
        _, state = utils.play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

        # train
        states, actions, rewards, next_states, is_done = exp_replay.sample(batch_size)
        loss = loss_func(states, actions, rewards, next_states, is_done,
                         agent, target_network, device)

        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        opt.step()
        opt.zero_grad()

        if step % loss_freq == 0:
            td_loss_history.append(loss.data.cpu().item())
            grad_norm_history.append(grad_norm)

        if step % refresh_target_network_freq == 0:
            # Load agent weights into target_network
            target_network.load_state_dict(agent.state_dict())

        if step == verbose_steps:
            print("Stopping plotting to reduce training time.")
            stop_evaluation = True

        if step % eval_freq == 0:
            # eval the agent
            mean_rw_history.append(utils.evaluate(
                make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000))

            initial_state_q_values = agent.get_qvalues(
                [make_env(seed=step).reset()])
            initial_state_v_history.append(np.max(initial_state_q_values))

            if not stop_evaluation:
                clear_output(True)
                print("buffer size = %i, epsilon = %.5f" %
                      (len(exp_replay), agent.epsilon))

                plt.figure(figsize=[16, 9])

                plt.subplot(2, 2, 1)
                plt.title("Mean reward per episode")
                plt.plot(mean_rw_history)
                plt.grid()

                assert not np.isnan(td_loss_history[-1])
                plt.subplot(2, 2, 2)
                plt.title("TD loss history (smoothened)")
                plt.plot(utils.smoothen(td_loss_history))
                plt.grid()

                plt.subplot(2, 2, 3)
                plt.title("Initial state V")
                plt.plot(initial_state_v_history)
                plt.grid()

                plt.subplot(2, 2, 4)
                plt.title("Grad norm history (smoothened)")
                plt.plot(utils.smoothen(grad_norm_history))
                plt.grid()

                plt.show()

    return {'reward_history': mean_rw_history,
            'td_loss_history': td_loss_history,
            'grad_norm_history': grad_norm_history,
            'initial_state_v_history': initial_state_v_history}
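# NOTE: `utils.linear_decay` is assumed here to interpolate epsilon linearly
# from `init_val` down to `final_val` over `total_steps` and then hold it
# constant. A minimal sketch of such a schedule (the helper in the original
# utils module may differ):
def linear_decay(init_val, final_val, cur_step, total_steps):
    """Linearly anneal from init_val to final_val over total_steps, then hold."""
    if cur_step >= total_steps:
        return final_val
    return (init_val * (total_steps - cur_step) + final_val * cur_step) / total_steps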
def DQN_with_variations(env_name, extensions_hyp, hidden_sizes=[32], lr=1e-2,
                        num_epochs=2000, buffer_size=100000, discount=0.99,
                        render_cycle=100, update_target_net=1000, batch_size=64,
                        update_freq=4, frames_num=2, min_buffer_size=5000,
                        test_frequency=20, start_explor=1, end_explor=0.1,
                        explor_steps=100000):
    # Create the environments for both training and testing
    env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
    env_test = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
    # Add a monitor to the test env to store the videos
    env_test = gym.wrappers.Monitor(
        env_test,
        "VIDEOS/TEST_VIDEOS" + env_name + str(current_milli_time()),
        force=True,
        video_callable=lambda x: x % 20 == 0)

    tf.compat.v1.reset_default_graph()

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n

    # Create all the placeholders
    obs_ph = tf.compat.v1.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]),
                                      dtype=tf.float32, name='obs')
    act_ph = tf.compat.v1.placeholder(shape=(None,), dtype=tf.int32, name='act')
    y_ph = tf.compat.v1.placeholder(shape=(None,), dtype=tf.float32, name='y')

    # Create the target network
    with tf.compat.v1.variable_scope('target_network'):
        if extensions_hyp['dueling']:
            target_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim)
        else:
            target_qv = qnet(obs_ph, hidden_sizes, act_dim)
    target_vars = tf.compat.v1.trainable_variables()

    # Create the online network (i.e. the behavior policy)
    with tf.compat.v1.variable_scope('online_network'):
        if extensions_hyp['dueling']:
            online_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim)
        else:
            online_qv = qnet(obs_ph, hidden_sizes, act_dim)
    train_vars = tf.compat.v1.trainable_variables()

    # Update the target network by assigning the online network's variables to it.
    # Note that the target network and the online network have the exact same architecture.
    update_target = [
        train_vars[i].assign(train_vars[i + len(target_vars)])
        for i in range(len(train_vars) - len(target_vars))
    ]
    update_target_op = tf.group(*update_target)

    # One-hot encoding of the action
    act_onehot = tf.one_hot(act_ph, depth=act_dim)
    # We are interested only in the Q-values of the chosen actions
    q_values = tf.reduce_sum(input_tensor=act_onehot * online_qv, axis=1)

    # MSE loss function
    v_loss = tf.reduce_mean(input_tensor=(y_ph - q_values)**2)
    # Adam optimizer that minimizes the loss v_loss
    v_opt = tf.compat.v1.train.AdamOptimizer(lr).minimize(v_loss)

    def agent_op(o):
        '''
        Forward pass to obtain the Q-values of a single observation from the online network
        '''
        # Scale the frames
        o = scale_frames(o)
        return sess.run(online_qv, feed_dict={obs_ph: [o]})

    # Time
    now = datetime.now()
    clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
    print('Time:', clock_time)

    mr_v = tf.Variable(0.0)
    ml_v = tf.Variable(0.0)

    # TensorBoard summaries
    tf.compat.v1.summary.scalar('v_loss', v_loss)
    tf.compat.v1.summary.scalar('Q-value', tf.reduce_mean(input_tensor=q_values))
    tf.compat.v1.summary.histogram('Q-values', q_values)
    scalar_summary = tf.compat.v1.summary.merge_all()

    reward_summary = tf.compat.v1.summary.scalar('test_rew', mr_v)
    mean_loss_summary = tf.compat.v1.summary.scalar('mean_loss', ml_v)

    LOG_DIR = 'log_dir/' + env_name
    hyp_str = "-lr_{}-upTN_{}-upF_{}-frms_{}-ddqn_{}-duel_{}-nstep_{}" \
        .format(lr, update_target_net, update_freq, frames_num,
                extensions_hyp['DDQN'], extensions_hyp['dueling'],
                extensions_hyp['multi_step'])

    # initialize the File Writer for writing TensorBoard summaries
    file_writer = tf.compat.v1.summary.FileWriter(
        LOG_DIR + '/DQN_' + clock_time + '_' + hyp_str,
        tf.compat.v1.get_default_graph())

    # open a session
    sess = tf.compat.v1.Session()
    # and initialize all the variables
    sess.run(tf.compat.v1.global_variables_initializer())

    render_the_game = False
    step_count = 0
    last_update_loss = []
    ep_time = current_milli_time()
    batch_rew = []
    old_step_count = 0

    obs = env.reset()

    # Initialize the experience buffer
    # buffer = ExperienceBuffer(buffer_size)
    buffer = MultiStepExperienceBuffer(buffer_size, extensions_hyp['multi_step'], discount)

    # Copy the online network into the target network
    sess.run(update_target_op)

    ########## EXPLORATION INITIALIZATION ######
    eps = start_explor
    eps_decay = (start_explor - end_explor) / explor_steps

    for ep in range(num_epochs):
        g_rew = 0
        done = False

        # Loop until the episode ends
        while not done:
            # Epsilon decay
            if eps > end_explor:
                eps -= eps_decay

            # Choose an eps-greedy action
            act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps)

            # execute the action in the environment
            obs2, rew, done, _ = env.step(act)

            # Render the game if requested
            if render_the_game:
                env.render()

            # Add the transition to the replay buffer
            buffer.add(obs, rew, act, obs2, done)

            obs = obs2
            g_rew += rew
            step_count += 1

            ################ TRAINING ###############
            # If it's time to train the network:
            if len(buffer) > min_buffer_size and (step_count % update_freq == 0):

                # sample a minibatch from the buffer
                mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)

                if extensions_hyp['DDQN']:
                    mb_onl_qv, mb_trg_qv = sess.run(
                        [online_qv, target_qv], feed_dict={obs_ph: mb_obs2})
                    y_r = double_q_target_values(mb_rew, mb_done, mb_trg_qv, mb_onl_qv, discount)
                else:
                    mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph: mb_obs2})
                    y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount)

                # optimize, compute the loss and return the TB summary
                train_summary, train_loss, _ = sess.run(
                    [scalar_summary, v_loss, v_opt],
                    feed_dict={obs_ph: mb_obs, y_ph: y_r, act_ph: mb_act})

                # Add the train summary to the file_writer
                file_writer.add_summary(train_summary, step_count)
                last_update_loss.append(train_loss)

            # Every update_target_net steps, update the target network
            if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0):
                # run the session to update the target network and get the mean loss summary
                _, train_summary = sess.run(
                    [update_target_op, mean_loss_summary],
                    feed_dict={ml_v: np.mean(last_update_loss)})
                file_writer.add_summary(train_summary, step_count)
                last_update_loss = []

            # If the episode has ended, reset the environment and re-initialize the variables
            if done:
                obs = env.reset()
                batch_rew.append(g_rew)
                g_rew, render_the_game = 0, False

        # every test_frequency episodes, test the agent and write some stats to TensorBoard
        if ep % test_frequency == 0:
            # test the agent on 10 games
            test_rw = test_agent(env_test, agent_op, num_games=10)

            # Run the test stats and add them to the file_writer
            test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)})
            file_writer.add_summary(test_summary, step_count)

            # Print some useful stats
            ep_sec_time = int((current_milli_time() - ep_time) / 1000)
            print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d'
                  % (ep, np.mean(batch_rew), eps, step_count, np.mean(test_rw),
                     np.std(test_rw), ep_sec_time,
                     (step_count - old_step_count) / test_frequency))

            ep_time = current_milli_time()
            batch_rew = []
            old_step_count = step_count

        if ep % render_cycle == 0:
            render_the_game = True

    file_writer.close()
    env.close()
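# NOTE: `eps_greedy` is called in the training loop above but not defined in this
# snippet. A minimal sketch of an epsilon-greedy action selector over a vector of
# Q-values (the original helper may differ in signature or tie-breaking):
import numpy as np

def eps_greedy(action_values, eps=0.1):
    """With probability eps pick a random action, otherwise the greedy one."""
    if np.random.uniform(0, 1) < eps:
        return np.random.randint(len(action_values))
    return int(np.argmax(action_values))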
ENV_NAME = "PongNoFrameskip-v4" RECORD = True MAX_GAMES = 500 DEVICE = 'cuda' BATCH_SIZE = 32 # For TensorBoard SUMMARY_WRITER = True LOG_DIR = 'content/runs' name = 'DQN Multi-step=%d,Double=%r,Dueling=%r' % (DQN_HYPERPARAMS['multi_step'], DQN_HYPERPARAMS['double_dqn'], DQN_HYPERPARAMS['dueling']) # For Telegram TG_BOT = True # ------------------------Create enviroment and agent-------------------------- env = atari_wrappers.make_env("PongNoFrameskip-v4") # gym.make("PongNoFrameskip-v4") # For recording few seelcted episodes. 'force' means overwriting earlier recordings if RECORD: env = gym.wrappers.Monitor(env, "main-" + ENV_NAME, force=True) obs = env.reset() # Create TensorBoard writer that will create graphs writer = SummaryWriter(log_dir=LOG_DIR + '/' + name + str(time.time())) if SUMMARY_WRITER else None # Create agent that will learn agent = Agent(env, hyperparameters=DQN_HYPERPARAMS, device=DEVICE, writer=writer, max_games=MAX_GAMES, tg_bot=TG_BOT) # --------------------------------Learning------------------------------------- num_games = 0 while num_games < MAX_GAMES: # Select one action with e-greedy policy and observe s,a,s',r and done action = agent.select_eps_greedy_action(obs) # Take that action and observe s, a, s', r and done new_obs, reward, done, _ = env.step(action)
MAX_N_GAMES = int(hyperparameters['MAX_N_GAMES'])
TEST_FREQUENCY = int(hyperparameters['TEST_FREQUENCY'])

ENV_NAME = "PongNoFrameskip-v4"
SAVE_VIDEO = True
DEVICE = hyperparameters['DEVICE']  # 'cuda' or 'cpu'

SUMMARY_WRITER = True
LOG_DIR = hyperparameters['path']  # 'content/runs' or '/opt/ml/model/'

name = '_'.join([str(k) + '.' + str(v) for k, v in DQN_HYPERPARAMS.items()])
name = 'prv'

if __name__ == '__main__':
    # create the environment
    env = atari_wrappers.make_env(ENV_NAME)
    if SAVE_VIDEO:
        # save the video of the games
        env = gym.wrappers.Monitor(env, LOG_DIR + "/main-" + ENV_NAME, force=True)

    # starting environment state
    obs = env.reset()

    # TensorBoard
    writer = SummaryWriter(log_dir=LOG_DIR + '/' + name + str(time.time())) if SUMMARY_WRITER else None

    # create the agent
    agent = DQNAgent(env,
def main():
    args = parse_args()

    # Overwrite default values
    DQN_HYPERPARAMS['epsilon_final'] = args.eps
    DQN_HYPERPARAMS['double_DQN'] = args.ddqn

    # create the environment
    # env = atari_wrappers.make_env(ENV_NAME)
    env = atari_wrappers.make_env(args.env_name)

    # Create run name with environment name and timestamp of launch
    # (and optional tag)
    run_name = args.env_name
    if args.tag != "":
        run_name += f"_{args.tag}"
    run_name += "_run_" + datetime.now().strftime("%Y%m%d_%H%M")

    if SAVE_VIDEO:
        # save the video of the games
        # env = gym.wrappers.Monitor(env, "main-"+args.env_name, force=True)
        # Save every 50th episode
        env = gym.wrappers.Monitor(
            env,
            "videos/" + args.env_name + "/run_" + datetime.now().strftime("%Y%m%d_%H%M"),  # noqa
            video_callable=lambda episode_id: episode_id % 50 == 0)

    # TensorBoard
    writer = SummaryWriter(log_dir=LOG_DIR + '/' + run_name) \
        if SUMMARY_WRITER else None

    print('Hyperparams:', DQN_HYPERPARAMS)

    # create the agent
    agent = DQNAgent(env, DQN_HYPERPARAMS, DEVICE, summary_writer=writer)

    n_games = 0
    # n_iter = 0

    # Play MAX_N_GAMES games
    while n_games < MAX_N_GAMES:
        obs = env.reset()
        done = False
        while not done:
            # act eps-greedily
            action = agent.act_eps_greedy(obs)

            # one step on the environment
            new_obs, reward, done, _ = env.step(action)

            # add the environment feedback to the agent
            agent.add_env_feedback(obs, action, new_obs, reward, done)

            # sample and optimize (NB: the agent may wait until it has enough memories)
            agent.sample_and_optimize(BATCH_SIZE)

            obs = new_obs

        n_games += 1

        # print info about the agent and reset the stats
        agent.print_info()
        agent.reset_stats()

        # if n_games % TEST_FREQUENCY == 0:
        #     print('Test mean:', utils.test_game(env, agent, 1))

    if writer is not None:
        writer.close()