def train(self, config: TrainConfig):
    reward_history = []
    reward_averaged = []
    step = 0
    alpha = config.alpha
    eps = config.epsilon

    warmup_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False
        reward = 0.

        while not done:
            a = self.act(ob, eps)
            new_ob, r, done, info = self.env.step(a)
            if done and config.done_reward is not None:
                r += config.done_reward

            self._update_q_value(Transition(ob, a, r, new_ob, done), alpha)

            step += 1
            reward += r
            ob = new_ob

        reward_history.append(reward)
        reward_averaged.append(np.average(reward_history[-50:]))

        # Anneal the learning and exploration rates after every episode.
        alpha *= config.alpha_decay
        if eps > config.epsilon_final:
            eps = max(config.epsilon_final, eps - eps_drop)

        if config.log_every_episode is not None and n_episode % config.log_every_episode == 0:
            # Report the performance every `log_every_episode` episodes.
            print("[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), alpha, eps, len(self.Q)))

    print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged}
    plot_learning_curve(self.name, data_dict, xlabel='episode')
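# A minimal sketch of the `_update_q_value` helper called above, assuming the standard
# tabular Q-learning backup, that `self.Q` maps (state, action) pairs to values (e.g. a
# defaultdict(float)), that `self.gamma` is the discount factor, that `self.actions`
# enumerates the discrete action space, and that `Transition` fields are named
# (s, a, r, s_next, done). The actual helper may differ in detail.
def _update_q_value(self, tr, alpha):
    # Terminal transitions carry no bootstrapped future value.
    max_q_next = max(self.Q[tr.s_next, a] for a in self.actions)
    target = tr.r + (0.0 if tr.done else self.gamma * max_q_next)
    self.Q[tr.s, tr.a] += alpha * (target - self.Q[tr.s, tr.a])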
def train(self, config: TrainConfig):
    step = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []

    lr = config.lr

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False

        obs = []
        actions = []
        rewards = []
        returns = []

        while not done:
            a = self.act(ob)
            new_ob, r, done, info = self.env.step(a)
            step += 1
            episode_reward += r

            obs.append(self.obs_to_inputs(ob))
            actions.append(a)
            rewards.append(r)
            ob = new_ob

        # One trajectory is complete!
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.
        lr *= config.lr_decay

        # Estimate returns backwards.
        return_so_far = 0.0
        for r in rewards[::-1]:
            return_so_far = self.gamma * return_so_far + r
            returns.append(return_so_far)
        returns = returns[::-1]

        _, summ_str = self.sess.run(
            [self.train_ops, self.merged_summary],
            feed_dict={
                self.lr: lr,
                self.s: np.array(obs),
                self.a: np.array(actions),
                self.returns: np.array(returns),
                self.ep_reward: reward_history[-1],
            })
        self.writer.add_summary(summ_str, step)

        if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
            # Report the performance every `log_every_episode` episodes.
            print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), reward_history[-5:], lr,
            ))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
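# A back-of-the-envelope NumPy sketch of the objective that `self.train_ops` above is
# presumed to minimize: the REINFORCE loss -E[log pi(a|s) * G], where G are the discounted
# returns computed backwards in the loop above. `policy_logits` is a hypothetical
# (batch, n_actions) array; the actual graph may also normalize the returns or add an
# entropy bonus. `np` is the module-level numpy import.
def reinforce_loss(policy_logits, actions, returns):
    z = policy_logits - policy_logits.max(axis=1, keepdims=True)
    logp_all = z - np.log(np.exp(z).sum(axis=1, keepdims=True))  # log-softmax
    logp_a = logp_all[np.arange(len(actions)), actions]          # log pi(a_t | s_t)
    return -np.mean(logp_a * returns)                            # policy-gradient loss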
def train(self, config: TrainConfig):
    if self.model_type == 'lstm':
        buffer = ReplayTrajMemory(capacity=config.memory_capacity, step_size=self.step_size)
    else:
        buffer = ReplayMemory(capacity=config.memory_capacity)

    reward = 0.
    reward_history = [0.0]
    reward_averaged = []

    lr = config.lr
    eps = config.epsilon
    annealing_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes
    print("eps_drop:", eps_drop)
    step = 0

    # obtain state transition table from somewhere
    # decomposeStates(self, st_table)

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False
        traj = []

        while not done:
            a = self.act(self.obs_to_inputs(ob), eps)
            new_ob, r, done, info = self.env.step(a)
            step += 1
            reward += r

            traj.append(Transition(self.obs_to_inputs(ob), a, r,
                                   self.obs_to_inputs(new_ob), done))
            ob = new_ob

            # Not enough samples in the buffer yet.
            if buffer.size < self.batch_size:
                continue

            # Training with a mini-batch of samples!
            batch_data = buffer.sample(self.batch_size)
            feed_dict = {
                self.learning_rate: lr,
                self.states: batch_data['s'],
                self.actions: batch_data['a'],
                self.rewards: batch_data['r'],
                self.states_next: batch_data['s_next'],
                self.done_flags: batch_data['done'],
                self.ep_reward: reward_history[-1],
            }

            if self.double_q:
                actions_next = self.sess.run(self.actions_selected_by_q, {
                    self.states: batch_data['s_next']
                })
                feed_dict.update({self.actions_next: actions_next})

            _, q_val, q_target_val, loss, summ_str = self.sess.run(
                [self.optimizer, self.q, self.q_target, self.loss, self.merged_summary],
                feed_dict
            )
            self.writer.add_summary(summ_str, step)

            if step % config.target_update_every_step == 0:
                self.update_target_q_net()

        # Add all the transitions of one trajectory into the replay memory.
        buffer.add(traj)

        # One episode is complete.
        reward_history.append(reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        reward = 0.

        # Anneal the learning and exploration rates after every episode.
        lr *= config.lr_decay
        if eps > config.epsilon_final:
            eps = max(eps - eps_drop, config.epsilon_final)

        if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
            # Report the performance every `log_every_episode` episodes.
            print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}, eps:{:.4f}, buffer:{}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), reward_history[-5:],
                lr, eps, buffer.size
            ))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
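# A NumPy sketch of the TD targets that the graph above is presumed to regress `self.q`
# towards, assuming `gamma` is the discount factor. With `double_q`, the next action is
# selected by the online network (cf. `actions_selected_by_q`) but evaluated by the target
# network; otherwise the target network both selects and evaluates. `np` is the
# module-level numpy import; the inputs are assumed to be batched arrays.
def td_targets(r, done, q_next_target, q_next_online=None, gamma=0.99, double_q=False):
    if double_q and q_next_online is not None:
        a_next = np.argmax(q_next_online, axis=1)                 # select with online net
        q_next = q_next_target[np.arange(len(a_next)), a_next]    # evaluate with target net
    else:
        q_next = np.max(q_next_target, axis=1)                    # vanilla DQN target
    return r + gamma * (1.0 - done.astype(np.float32)) * q_next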
def train(self, config: TrainConfig):
    buffer = ReplayMemory(tuple_class=Transition)

    step = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []

    lr_c = config.lr_c
    lr_a = config.lr_a

    eps = config.epsilon
    warmup_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (eps - config.epsilon_final) / warmup_episodes
    print("Decrease epsilon per episode:", eps_drop)

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False

        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, info = self.env.step(a)
            step += 1
            episode_reward += r

            record = Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(ob_next), done)
            buffer.add(record)
            ob = ob_next

            while buffer.size >= config.batch_size:
                batch = buffer.pop(config.batch_size)
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary],
                    feed_dict={
                        self.lr_c: lr_c,
                        self.lr_a: lr_a,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.r: batch['r'],
                        self.s_next: batch['s_next'],
                        self.done: batch['done'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.writer.add_summary(summ_str, step)

        # One trajectory is complete!
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.

        # Anneal the learning and exploration rates after every episode.
        lr_c *= config.lr_c_decay
        lr_a *= config.lr_a_decay
        if eps > config.epsilon_final:
            eps -= eps_drop

        if (reward_history and config.log_every_episode and
                n_episode % config.log_every_episode == 0):
            # Report the performance every `log_every_episode` episodes.
            print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}".format(
                n_episode, step, np.max(reward_history),
                np.mean(reward_history[-10:]), reward_history[-5:],
                lr_c, lr_a, eps,
            ))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
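# A NumPy sketch of the quantities an actor-critic update such as `self.train_ops` above
# typically derives from a popped batch: a TD(0) target for the critic and an advantage
# estimate for the actor. `v_s` and `v_s_next` are hypothetical critic values for
# batch['s'] and batch['s_next']; `gamma` is an assumed discount factor. The actual graph
# may use a different (e.g. multi-step) estimator.
def critic_targets_and_advantages(r, done, v_s, v_s_next, gamma=0.99):
    td_target = r + gamma * (1.0 - done) * v_s_next   # bootstrap unless terminal
    advantage = td_target - v_s                       # weights the actor's log-prob term
    return td_target, advantage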
def train(self, config: TrainConfig):
    # Construct the replay memory buffer.
    buffer = ReplayMemory(tuple_class=Transition)

    step = 0
    n_episode = 0
    episode_reward = 0.
    episode_step = 0
    reward_history = []
    reward_averaged = []

    eps = config.epsilon
    eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps
    print("decrease `epsilon` per step:", eps_drop_per_step)

    env = self.env
    ob = env.reset()
    done = False

    while step < config.n_steps:
        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, _ = env.step(a)
            step += 1
            episode_step += 1
            episode_reward += r

            buffer.add(Transition(ob, a, r, ob_next, float(done)))
            ob = ob_next

            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop_per_step)

            if reward_history and config.log_every_step and step % config.log_every_step == 0:
                # Report the performance every `log_every_step` steps.
                print("[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}".format(
                    n_episode, step, np.max(reward_history),
                    np.mean(reward_history[-10:]), eps))
                # self.save_checkpoint(step=step)

            if buffer.size >= config.batch_size:
                batch = buffer.pop(config.batch_size)
                _, q_loss, mu_loss, summ_str = self.sess.run(
                    [self.train_ops, self.Q_loss, self.mu_loss, self.merged_summary],
                    feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.done: batch['done'],
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.r: batch['r'],
                        self.s_next: batch['s_next'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.update_target_net(tau=config.tau)
                self.writer.add_summary(summ_str, step)

        # One trajectory is complete.
        n_episode += 1
        ob = env.reset()
        done = False
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_step = 0
        episode_reward = 0.

    self.save_checkpoint(step=step)

    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
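# A minimal sketch of the Polyak soft update behind `self.update_target_net(tau=...)`
# above, shown on plain lists of NumPy parameter arrays rather than TF variables: each
# target parameter moves a fraction `tau` towards its online counterpart at every
# training step.
def soft_update(target_params, online_params, tau):
    return [tau * w + (1.0 - tau) * w_target
            for w, w_target in zip(online_params, target_params)]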
def run(self):
    n_episodes = self.config.n_episodes
    loss = None
    total_rewards = np.empty(n_episodes)
    solved_consecutively = 0

    for i in range(n_episodes):
        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action, q_value = self.get_action(state, self.config.epsilon_schedule.current_p)
            next_state, reward, done, info = self.env.step(action)
            self.memory.add(Transition(state, action, reward, next_state, done))
            score += reward
            state = next_state

            if self.memory.size > self.batch_size:
                loss = self.train()

            if i % self.config.target_update_every_step == 0:
                self.update_target()

        total_rewards[i] = score
        avg_rewards = total_rewards[max(0, i - 100):(i + 1)].mean()

        self.config.epsilon_schedule.anneal()
        self.config.beta_schedule.anneal()
        self.global_lr.assign(self.config.learning_rate_schedule.anneal())

        with self.writer.as_default():
            with tf.name_scope('Performance'):
                tf.summary.scalar('episode reward', score, step=i)
                tf.summary.scalar('running avg reward(100)', avg_rewards, step=i)

            if self.config.prioritized_memory_replay:
                with tf.name_scope('Schedules'):
                    tf.summary.scalar('Beta', self.config.beta_schedule.current_p, step=i)
                    tf.summary.scalar('Epsilon', self.config.epsilon_schedule.current_p, step=i)
                    tf.summary.scalar('Learning rate',
                                      self.optimizer._decayed_lr(tf.float32).numpy(), step=i)

        # Specific for mountain car.
        if done and score == 500:
            solved_consecutively += 1
        else:
            solved_consecutively = 0

        if solved_consecutively >= 50:
            print(f'Successfully SOLVED {solved_consecutively} times!')
            break

        if i % self.config.log_every_episode == 0:
            print("episode:", i, "/", self.config.n_episodes,
                  "episode reward:", score,
                  "avg reward (last 100):", avg_rewards,
                  "eps:", self.config.epsilon_schedule.current_p,
                  "Learning rate (1e-3):", (self.optimizer._decayed_lr(tf.float32).numpy() * 1000),
                  "Consecutively solved:", solved_consecutively)

    plot_learning_curve(self.name + '.png', {'rewards': total_rewards})
    self.save()
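# A NumPy sketch of how the annealed beta schedule above is typically used when
# `prioritized_memory_replay` is enabled: importance-sampling weights that correct for
# the non-uniform sampling probabilities `probs` of the drawn transitions. The exact
# weight handling inside `self.train()` may differ; this only illustrates the role of
# beta.
def importance_weights(probs, buffer_size, beta):
    weights = (buffer_size * probs) ** (-beta)   # larger beta -> stronger correction
    return weights / weights.max()               # normalize for stability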
def train(self, config: TrainConfig):
    BufferRecord = namedtuple('BufferRecord', ['s', 'a', 's_next', 'r', 'done',
                                               'old_logp_actor', 'v_target', 'adv'])
    buffer = ReplayMemory(tuple_class=BufferRecord)

    reward_history = []
    reward_averaged = []
    step = 0
    total_rec = 0

    clip = config.ratio_clip_range
    if config.ratio_clip_decay:
        clip_delta = clip / config.n_iterations
    else:
        clip_delta = 0.0

    for n_iteration in range(config.n_iterations):
        # Ideally we would have multiple rollout workers running in parallel.
        for _ in range(config.n_rollout_workers):
            episode_reward, n_rec = self._generate_rollout(buffer)

            # One trajectory is complete.
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            total_rec += n_rec

        # Now let's train the model for some steps.
        for batch in buffer.loop(config.batch_size, epoch=config.train_epoches):
            _, summ_str = self.sess.run(
                [self.train_ops, self.merged_summary],
                feed_dict={
                    self.lr_a: config.lr_a,
                    self.lr_c: config.lr_c,
                    self.clip_range: clip,
                    self.s: batch['s'],
                    self.a: batch['a'],
                    self.s_next: batch['s_next'],
                    self.r: batch['r'],
                    self.done: batch['done'],
                    self.old_logp_a: batch['old_logp_actor'],
                    self.v_target: batch['v_target'],
                    self.adv: batch['adv'],
                    self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                })
            self.writer.add_summary(summ_str, step)
            step += 1

        clip = max(0.0, clip - clip_delta)

        if (reward_history and config.log_every_iteration and
                n_iteration % config.log_every_iteration == 0):
            # Report the performance every `log_every_iteration` iterations.
            print("[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions.".format(
                n_iteration, step, np.max(reward_history),
                np.mean(reward_history[-10:]),
                list(map(lambda x: round(x, 2), reward_history[-5:])),
                clip, total_rec
            ))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
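# A NumPy sketch of the clipped surrogate objective that `self.clip_range`,
# `batch['old_logp_actor']`, and `batch['adv']` above suggest the actor loss implements,
# following the standard PPO formulation; the actual graph may add value-function and
# entropy terms on top of this.
def ppo_clip_loss(logp_a, old_logp_a, adv, clip_range):
    ratio = np.exp(logp_a - old_logp_a)                       # pi_new(a|s) / pi_old(a|s)
    clipped = np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
    return -np.mean(np.minimum(ratio * adv, clipped * adv))   # pessimistic (clipped) bound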