def train(self, config: TrainConfig):
    reward_history = []
    reward_averaged = []

    step = 0
    alpha = config.alpha
    eps = config.epsilon

    warmup_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False
        reward = 0.

        while not done:
            a = self.act(ob, eps)
            new_ob, r, done, info = self.env.step(a)
            if done and config.done_reward is not None:
                r += config.done_reward

            self._update_q_value(Transition(ob, a, r, new_ob, done), alpha)

            step += 1
            reward += r
            ob = new_ob

        reward_history.append(reward)
        reward_averaged.append(np.average(reward_history[-50:]))

        # Anneal the learning rate and the exploration rate after each episode.
        alpha *= config.alpha_decay
        if eps > config.epsilon_final:
            eps = max(config.epsilon_final, eps - eps_drop)

        if config.log_every_episode is not None and n_episode % config.log_every_episode == 0:
            # Report the performance every `log_every_episode` episodes.
            print("[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), alpha, eps, len(self.Q)))

    print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged}
    plot_learning_curve(self.name, data_dict, xlabel='episode')
    self.env.render()
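# `_update_q_value` above is assumed to apply the standard tabular Q-learning rule;
# its body is not shown here. A minimal, self-contained sketch, assuming the
# Transition namedtuple exposes fields `s`, `a`, `r`, `s_next`, `done` (consistent
# with the batch keys used in the other train loops), `q_table` is a
# defaultdict(float) keyed by (state, action), and `actions` lists the discrete
# actions. The names `q_table`, `actions`, and `gamma` are illustrative, not taken
# from the actual class.
def q_learning_update_sketch(q_table, tr, alpha, gamma, actions):
    """One-step TD update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    # Terminal transitions bootstrap with zero future value.
    max_q_next = 0.0 if tr.done else max(q_table[(tr.s_next, a)] for a in actions)
    target = tr.r + gamma * max_q_next
    # Move Q(s, a) toward the one-step TD target at learning rate `alpha`.
    q_table[(tr.s, tr.a)] += alpha * (target - q_table[(tr.s, tr.a)])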
def train(self, config: TrainConfig):
    # Construct the replay memory buffer.
    buffer = ReplayMemory(tuple_class=Transition)

    step = 0
    n_episode = 0
    episode_reward = 0.
    episode_step = 0
    reward_history = []
    reward_averaged = []

    eps = config.epsilon
    eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps
    print("decrease `epsilon` per step:", eps_drop_per_step)

    env = self.env
    ob = env.reset()
    done = False

    while step < config.n_steps:
        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, _ = env.step(a)
            step += 1
            episode_step += 1
            episode_reward += r

            buffer.add(Transition(ob, a, r, ob_next, float(done)))
            ob = ob_next

            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop_per_step)

            if reward_history and config.log_every_step and step % config.log_every_step == 0:
                # Report the performance every `log_every_step` steps.
                print("[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}".format(
                    n_episode, step, np.max(reward_history),
                    np.mean(reward_history[-10:]), eps))
                # self.save_checkpoint(step=step)

            if buffer.size >= config.batch_size:
                batch = buffer.pop(config.batch_size)
                _, q_loss, mu_loss, summ_str = self.sess.run(
                    [self.train_ops, self.Q_loss, self.mu_loss, self.merged_summary],
                    feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.done: batch['done'],
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.r: batch['r'],
                        self.s_next: batch['s_next'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.update_target_net(tau=config.tau)
                self.writer.add_summary(summ_str, step)

        # One trajectory is complete.
        n_episode += 1
        ob = env.reset()
        done = False

        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_step = 0
        episode_reward = 0.

    self.save_checkpoint(step=step)
    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
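# `update_target_net(tau=...)` above is assumed to perform the usual DDPG soft
# (Polyak) update: target_w <- tau * online_w + (1 - tau) * target_w. A minimal
# sketch operating on plain numpy arrays, independent of the TF graph used above;
# `online_vars` and `target_vars` are illustrative names for matching lists of
# parameter arrays, not attributes of the actual class.
def soft_update_sketch(online_vars, target_vars, tau):
    """Move each target parameter a small step `tau` toward its online counterpart."""
    for w_online, w_target in zip(online_vars, target_vars):
        # In-place update so existing references to the target arrays stay valid.
        w_target[...] = tau * w_online + (1.0 - tau) * w_target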
def train(self, config: TrainConfig):
    buffer = ReplayMemory(tuple_class=Transition)

    step = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []

    lr_c = config.lr_c
    lr_a = config.lr_a

    eps = config.epsilon
    warmup_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (eps - config.epsilon_final) / warmup_episodes
    print("Decrease epsilon per step:", eps_drop)

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False

        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, info = self.env.step(a)
            step += 1
            episode_reward += r

            record = Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(ob_next), done)
            buffer.add(record)
            ob = ob_next

            while buffer.size >= config.batch_size:
                batch = buffer.pop(config.batch_size)
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary],
                    feed_dict={
                        self.lr_c: lr_c,
                        self.lr_a: lr_a,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.r: batch['r'],
                        self.s_next: batch['s_next'],
                        self.done: batch['done'],
                        self.episode_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.writer.add_summary(summ_str, step)

        # One trajectory is complete!
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.

        # Anneal the learning rates and the exploration rate after each episode.
        lr_c *= config.lr_c_decay
        lr_a *= config.lr_a_decay
        if eps > config.epsilon_final:
            eps = max(config.epsilon_final, eps - eps_drop)

        if (reward_history and config.log_every_episode and
                n_episode % config.log_every_episode == 0):
            # Report the performance every `log_every_episode` episodes.
            print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), reward_history[-5:],
                lr_c, lr_a, eps))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)
    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
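# `ReplayMemory.pop(n)` above is assumed to sample `n` transitions, remove them
# from the buffer, and return them as a dict of arrays keyed by the Transition
# field names ('s', 'a', 'r', 's_next', 'done'), which is what the feed_dict usage
# and the `while buffer.size >= batch_size` loop suggest. A minimal sketch of that
# behavior (capacity handling omitted); the class name is illustrative, not the
# actual implementation.
import random
import numpy as np

class ReplayMemorySketch:
    def __init__(self, tuple_class):
        self.tuple_class = tuple_class
        self.buffer = []

    @property
    def size(self):
        return len(self.buffer)

    def add(self, record):
        # `record` is a single `tuple_class` instance (one transition).
        self.buffer.append(record)

    def pop(self, batch_size):
        # Sample without replacement, then drop the sampled items from the buffer.
        chosen = set(random.sample(range(len(self.buffer)), batch_size))
        batch = [self.buffer[i] for i in chosen]
        self.buffer = [rec for i, rec in enumerate(self.buffer) if i not in chosen]
        # Transpose the list of namedtuples into a dict of arrays keyed by field name.
        return {field: np.array([getattr(rec, field) for rec in batch])
                for field in self.tuple_class._fields}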
def train(self, config: TrainConfig):
    if self.model_type == 'lstm':
        buffer = ReplayTrajMemory(capacity=config.memory_capacity, step_size=self.step_size)
    else:
        buffer = ReplayMemory(capacity=config.memory_capacity)

    reward = 0.
    reward_history = [0.0]
    reward_averaged = []

    lr = config.lr
    eps = config.epsilon
    annealing_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes
    print("eps_drop:", eps_drop)

    step = 0
    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False
        traj = []

        while not done:
            a = self.act(self.obs_to_inputs(ob), eps)
            new_ob, r, done, info = self.env.step(a)
            step += 1
            reward += r

            traj.append(Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(new_ob), done))
            ob = new_ob

            # Not enough samples in the buffer yet.
            if buffer.size < self.batch_size:
                continue

            # Train with a mini-batch of samples.
            batch_data = buffer.sample(self.batch_size)
            feed_dict = {
                self.learning_rate: lr,
                self.states: batch_data['s'],
                self.actions: batch_data['a'],
                self.rewards: batch_data['r'],
                self.states_next: batch_data['s_next'],
                self.done_flags: batch_data['done'],
                self.ep_reward: reward_history[-1],
            }

            if self.double_q:
                # Double DQN: select the next actions with the online network and
                # let the graph evaluate them with the target network.
                actions_next = self.sess.run(
                    self.actions_selected_by_q, {self.states: batch_data['s_next']})
                feed_dict.update({self.actions_next: actions_next})

            _, q_val, q_target_val, loss, summ_str = self.sess.run(
                [self.optimizer, self.q, self.q_target, self.loss, self.merged_summary],
                feed_dict)
            self.writer.add_summary(summ_str, step)

            if step % config.target_update_every_step == 0:
                self.update_target_q_net()

        # Add all the transitions of one trajectory into the replay memory.
        buffer.add(traj)

        # One episode is complete.
        reward_history.append(reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        reward = 0.

        # Anneal the learning rate and the exploration rate after every episode.
        lr *= config.lr_decay
        if eps > config.epsilon_final:
            eps = max(eps - eps_drop, config.epsilon_final)

        if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
            # Report the performance every `log_every_episode` episodes.
            print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}, eps:{:.4f}, buffer:{}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), reward_history[-5:],
                lr, eps, buffer.size))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)
    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
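# The double-Q branch above feeds `actions_next` (argmax over the online network's
# Q-values at s') into the graph so that the target network only *evaluates* those
# actions. A minimal numpy sketch of both target variants, assuming `q_next_online`
# and `q_next_target` are [batch, n_actions] arrays of Q-values at s' from the
# online and target networks, and `r`, `done` are 1-D numpy arrays; the function
# name and arguments are illustrative, not part of the graph built above.
import numpy as np

def td_targets_sketch(r, done, q_next_online, q_next_target, gamma, double_q=True):
    if double_q:
        # Double DQN: pick actions with the online net, score them with the target net.
        a_next = np.argmax(q_next_online, axis=1)
        q_next = q_next_target[np.arange(len(a_next)), a_next]
    else:
        # Vanilla DQN: the target net both picks and scores the action.
        q_next = np.max(q_next_target, axis=1)
    # Terminal transitions contribute no bootstrapped future value.
    return r + gamma * (1.0 - done.astype(np.float32)) * q_next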