Example #1
    def run_train_episode(self):
        state = self.env.reset()
        total_reward = 0.0
        frame_index = self.steps
        start_time = time.time()

        while True:
            self.steps += 1
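            # Linearly anneal the exploration rate epsilon toward epsilon_end and
            # beta toward 1.0 (beta presumably being the importance-sampling exponent
            # of the prioritized replay buffer, given the checkpoint name below).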
            epsilon = max(
                self.epsilon_end, self.epsilon_start - self.steps *
                (self.epsilon_start - self.epsilon_end) / self.epsilon_period)
            self.beta = min(
                1.0, self.beta_start + self.steps *
                (1 - self.beta_start) / self.beta_period)

            action = self.get_action(state, epsilon)
            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward

            experience = replay_buffer.Experience(state, action, next_state,
                                                  reward, done)
            self.replay_buffer.append(experience)

            if self.steps % self.update_target_interval == 0:
                self.update_target_network()

            if len(self.replay_buffer) >= self.replay_start_size:
                self.optimize()

            if done:
                self.total_rewards.append(total_reward)
                speed = (self.steps - frame_index) / (time.time() - start_time)
                mean_reward = np.mean(self.total_rewards[-100:])

                print(
                    "%d: Done %d games, mean reward %.3f, epsilon %.2f, beta %.2f, speed %.2f f/s"
                    % (self.steps, len(self.total_rewards), mean_reward,
                       epsilon, self.beta, speed))

                self.writer.add_scalar("epsilon", epsilon, self.steps)
                self.writer.add_scalar("speed", speed, self.steps)
                self.writer.add_scalar("mean_reward", mean_reward, self.steps)
                self.writer.add_scalar("reward", total_reward, self.steps)
                self.writer.add_scalar("beta", self.beta, self.steps)

                if self.best_mean_reward is None or self.best_mean_reward < mean_reward:
                    torch.save(self.network.state_dict(),
                               self.env_name + "-mydqnprioritybest.pth")
                    if self.best_mean_reward is not None:
                        print(
                            "Best mean reward updated %.3f -> %.3f, model saved"
                            % (self.best_mean_reward, mean_reward))
                    self.best_mean_reward = mean_reward

                break
            state = next_state
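The epsilon and beta updates at the top of the loop are plain linear schedules: epsilon decays from epsilon_start to epsilon_end over epsilon_period steps, and beta rises from beta_start to 1.0 over beta_period steps. A minimal sketch of that schedule in isolation, with illustrative parameter values (the original class defines its own in a constructor that is not shown here):

    def linear_schedule(step, start, end, period):
        # Interpolate from start toward end over `period` steps, then clamp at end.
        value = start - step * (start - end) / period
        return max(end, value) if end < start else min(end, value)

    # Illustrative values only; they do not come from the original snippet.
    for step in (0, 50_000, 100_000, 200_000):
        print(step, linear_schedule(step, start=1.0, end=0.02, period=100_000))
    # prints roughly 1.0, 0.51, 0.02, 0.02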
Example #2
    def run_train_episode(self):
        state = self.env.reset()
        total_reward = 0.0
        frame_index = self.steps
        start_time = time.time()

        while True:
            self.steps += 1

            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward

            experience = replay_buffer.Experience(state, action, next_state,
                                                  reward, done)
            self.replay_buffer.append(experience)

            if self.steps % self.update_interval == 0:
                self.update_network()

            if len(self.replay_buffer) >= self.replay_start_size:
                self.optimize()

            if done:
                self.total_rewards.append(total_reward)
                end_time = time.time()
                speed = (self.steps - frame_index) / (end_time - start_time)
                mean_rewards = np.mean(self.total_rewards[-100:])

                self.writer.add_scalar("speed", speed, self.steps)
                self.writer.add_scalar("mean_reward", mean_rewards, self.steps)
                self.writer.add_scalar("reward", total_reward, self.steps)

                print(
                    "%d: Done %d games, mean reward %.3f, speed %.2f f/s" %
                    (self.steps, len(self.total_rewards), mean_rewards, speed))

                if self.best_mean_reward is None or self.best_mean_reward < mean_rewards:
                    if self.best_mean_reward is not None:
                        print(
                            "Best mean reward updated %.3f -> %.3f, model saved"
                            % (self.best_mean_reward, mean_rewards))
                    self.best_mean_reward = mean_rewards
                    self.save_network()
                break
            state = next_state
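Example #2 delegates checkpointing to self.save_network(), whose body is not part of the snippet. A minimal sketch of what such a helper could look like, assuming it mirrors the torch.save call used in Example #1; the attribute names network and env_name come from that example, while the file suffix here is purely illustrative:

    def save_network(self):
        # Hypothetical helper: persist the online network's weights to a path
        # derived from the environment name (the suffix is an assumed placeholder).
        torch.save(self.network.state_dict(), self.env_name + "-mydqnbest.pth")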
Example #3
    def play_step(self, epsilon, state, total_reward):

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_t = np.array([state], copy=False)
            state_t = torch.tensor(state_t).to(self.device)
            q_vals = self.network(state_t)
            _, action = torch.max(q_vals, dim=1)
            action = int(action.item())

        next_state, reward, done, _ = self.env.step(action)
        total_reward += reward

        experience = replay_buffer.Experience(state, action, next_state,
                                              reward, done)
        self.replay_buffer.append(experience)
        state = next_state

        return state, total_reward, done
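Example #3 advances the environment by a single transition per call, so it is intended to be driven from an outer loop. A minimal usage sketch, assuming an agent object that exposes this play_step along with the env, network, replay_buffer, and device attributes it relies on (the agent variable and the fixed epsilon value are illustrative, not part of the original code):

    state = agent.env.reset()
    total_reward = 0.0
    done = False
    while not done:
        # One environment step: play_step picks an epsilon-greedy action, stores the
        # transition in the replay buffer, and returns the updated state and reward.
        state, total_reward, done = agent.play_step(
            epsilon=0.1, state=state, total_reward=total_reward)
    print("episode reward:", total_reward)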