def run_train_episode(self):
    state = self.env.reset()
    total_reward = 0.0
    frame_index = self.steps
    start_time = time.time()
    while True:
        self.steps += 1
        # Linearly anneal epsilon (exploration) down and beta (importance-sampling
        # correction for prioritized replay) up over their respective schedules.
        epsilon = max(
            self.epsilon_end,
            self.epsilon_start - self.steps *
            (self.epsilon_start - self.epsilon_end) / self.epsilon_period)
        self.beta = min(
            1.0,
            self.beta_start + self.steps *
            (1 - self.beta_start) / self.beta_period)
        action = self.get_action(state, epsilon)
        next_state, reward, done, _ = self.env.step(action)
        total_reward += reward
        experience = replay_buffer.Experience(state, action, next_state,
                                              reward, done)
        self.replay_buffer.append(experience)
        # Periodically sync the target network with the online network.
        if self.steps % self.update_target_interval == 0:
            self.update_target_network()
        # Only start optimizing once the buffer holds enough transitions.
        if len(self.replay_buffer) >= self.replay_start_size:
            self.optimize()
        if done:
            self.total_rewards.append(total_reward)
            speed = (self.steps - frame_index) / (time.time() - start_time)
            mean_reward = np.mean(self.total_rewards[-100:])
            print(
                "%d: Done %d games, mean reward %.3f, epsilon %.2f, "
                "beta %.2f, speed %.2f f/s" %
                (self.steps, len(self.total_rewards), mean_reward,
                 epsilon, self.beta, speed))
            self.writer.add_scalar("epsilon", epsilon, self.steps)
            self.writer.add_scalar("speed", speed, self.steps)
            self.writer.add_scalar("mean_reward", mean_reward, self.steps)
            self.writer.add_scalar("reward", total_reward, self.steps)
            self.writer.add_scalar("beta", self.beta, self.steps)
            # Save the model whenever the 100-episode mean reward improves.
            if self.best_mean_reward is None or self.best_mean_reward < mean_reward:
                torch.save(self.network.state_dict(),
                           self.env_name + "-mydqnprioritybest.pth")
                if self.best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved" %
                          (self.best_mean_reward, mean_reward))
                self.best_mean_reward = mean_reward
            break
        state = next_state
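The beta value annealed above is only meaningful inside the prioritized replay buffer, whose sampling step (hidden in optimize(), not shown here) weights each transition by its TD error and corrects the resulting bias with importance-sampling weights. The class below is a minimal proportional-prioritization sketch of that idea under assumed names (PrioritizedReplayBuffer, alpha, update_priorities); it illustrates how beta would be consumed, not the actual buffer used by this agent.

import collections
import numpy as np

class PrioritizedReplayBuffer:
    """Sketch of a proportional prioritized replay buffer (assumed, for illustration)."""

    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha          # how strongly priorities skew sampling
        self.buffer = []
        self.priorities = np.zeros(capacity, dtype=np.float32)
        self.pos = 0

    def append(self, experience):
        # New transitions get the current maximum priority so they are sampled at least once.
        max_prio = self.priorities.max() if self.buffer else 1.0
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[self.pos] = experience
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta):
        prios = self.priorities[:len(self.buffer)]
        probs = prios ** self.alpha
        probs /= probs.sum()
        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[i] for i in indices]
        # Importance-sampling weights correct the bias of non-uniform sampling;
        # as beta is annealed toward 1.0 the correction becomes exact.
        weights = (len(self.buffer) * probs[indices]) ** (-beta)
        weights /= weights.max()
        return samples, indices, weights.astype(np.float32)

    def update_priorities(self, indices, new_priorities):
        # Called after the loss step with the fresh per-sample TD errors.
        for idx, prio in zip(indices, new_priorities):
            self.priorities[idx] = prio

    def __len__(self):
        return len(self.buffer)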
def run_train_episode(self):
    state = self.env.reset()
    total_reward = 0.0
    frame_index = self.steps
    start_time = time.time()
    while True:
        self.steps += 1
        action = self.get_action(state)
        next_state, reward, done, _ = self.env.step(action)
        total_reward += reward
        experience = replay_buffer.Experience(state, action, next_state,
                                              reward, done)
        self.replay_buffer.append(experience)
        # Periodically sync the target network with the online network.
        if self.steps % self.update_interval == 0:
            self.update_network()
        # Only start optimizing once the buffer holds enough transitions.
        if len(self.replay_buffer) >= self.replay_start_size:
            self.optimize()
        if done:
            self.total_rewards.append(total_reward)
            end_time = time.time()
            speed = (self.steps - frame_index) / (end_time - start_time)
            mean_reward = np.mean(self.total_rewards[-100:])
            self.writer.add_scalar("speed", speed, self.steps)
            self.writer.add_scalar("mean_reward", mean_reward, self.steps)
            self.writer.add_scalar("reward", total_reward, self.steps)
            print("%d: Done %d games, mean reward %.3f, speed %.2f f/s" %
                  (self.steps, len(self.total_rewards), mean_reward, speed))
            # Save the model whenever the 100-episode mean reward improves.
            if self.best_mean_reward is None or self.best_mean_reward < mean_reward:
                if self.best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved" %
                          (self.best_mean_reward, mean_reward))
                self.best_mean_reward = mean_reward
                self.save_network()
            break
        state = next_state
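Both training loops rely on a replay_buffer module that provides an Experience record and an append/len interface. The module itself is not listed here; the sketch below shows one plausible minimal implementation consistent with how it is called above (the ReplayBuffer class and its sample() method are assumptions).

# replay_buffer.py (assumed sketch; the real module may differ)
import collections
import numpy as np

# Field order matches the call sites above: Experience(state, action, next_state, reward, done)
Experience = collections.namedtuple(
    "Experience", ["state", "action", "next_state", "reward", "done"])

class ReplayBuffer:
    """Fixed-size uniform replay buffer backed by a deque."""

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniformly sample a batch and unpack it into per-field arrays.
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, next_states, rewards, dones = zip(
            *[self.buffer[i] for i in indices])
        return (np.array(states), np.array(actions), np.array(next_states),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8))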
def play_step(self, epsilon, state, total_reward):
    # Epsilon-greedy action selection: explore with probability epsilon,
    # otherwise act greedily with respect to the current Q-network.
    if np.random.random() < epsilon:
        action = self.env.action_space.sample()
    else:
        state_t = np.array([state], copy=False)
        state_t = torch.tensor(state_t).to(self.device)
        q_vals = self.network(state_t)
        _, action = torch.max(q_vals, dim=1)
        action = int(action.item())
    next_state, reward, done, _ = self.env.step(action)
    total_reward += reward
    experience = replay_buffer.Experience(state, action, next_state,
                                          reward, done)
    self.replay_buffer.append(experience)
    state = next_state
    return state, total_reward, done
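Because play_step returns the updated state, accumulated reward, and done flag, a caller is expected to thread them through a loop. The snippet below is a hypothetical caller (the warm_up name and agent variable are not part of the listings above) showing how the buffer could be pre-filled with fully random actions before training starts.

# Hypothetical usage: run one episode with epsilon=1.0 to populate the replay buffer.
def warm_up(agent):
    state = agent.env.reset()
    total_reward = 0.0
    while True:
        state, total_reward, done = agent.play_step(1.0, state, total_reward)
        if done:
            return total_reward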