Example 1
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with the same procedure as during training
        """
        # default (in-training) call: log our activity and fall back to the
        # config defaults; otherwise record the paths taken during evaluation
        if num_episodes is None:
            self.logger.info("Evaluating...")
            num_episodes = self.config.num_episodes_test
            save_paths = False
        else:
            save_paths = True

        if env is None:
            env = self.env
            bfs_len = self.bfs_len
        else:
            bfs_len = env.get_bfs_length()

        # replay buffer used while playing (builds the stacked-frame network inputs)
        if self.config.use_memory:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size,
                self.config.state_history,
                memory_size=self.config.memory_unit_size)
        else:
            replay_buffer = ReplayBuffer(self.config.buffer_size,
                                         self.config.state_history)
        rewards = []
        steps = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            count = 0
            while True:
                if self.config.render_test: env.render()

                # store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                if self.config.use_memory:
                    prev_memory = replay_buffer.encode_recent_memory()
                    action, bottom_q, top_q, next_memory = self.get_action_with_memory(
                        q_input, prev_memory)
                    next_memory = np.squeeze(next_memory)
                else:
                    action = self.get_action(q_input)

                if i == 0 and self.config.use_memory:
                    with open(self.config.output_path + 'eval_example_log.txt',
                              'a') as of:
                        of.write('State = {}\n'.format(env.cur_state))
                        of.write('Taking action = {}\n'.format(action))
                        of.write('prev_memory = {}\n'.format(
                            prev_memory[0, :6]))
                        of.write('next_memory = {}\n'.format(next_memory[:6]))
                        of.write('bottom_q_values = {}\n'.format(bottom_q))
                        of.write('top_q_values = {}\n'.format(top_q))
                        of.write('\n')

                if save_paths:
                    with open(self.config.output_path + 'path_log.txt',
                              'a') as of:
                        of.write("(s, a) = ({}, {})\n".format(
                            env.cur_state, action))
                        of.write('\n')

                # perform action in env
                new_state, reward, done, info = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)

                if self.config.use_memory:
                    replay_buffer.store_memory(idx, next_memory)

                state = new_state

                count += 1

                # count reward
                total_reward += reward
                if done:
                    if save_paths:
                        with open(self.config.output_path + 'path_log.txt',
                                  'a') as of:
                            of.write('\n')
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)
            if total_reward <= 0:
                steps.append(np.nan)
            else:
                steps.append(count)

        steps = np.array(steps) - bfs_len  # adjust for shortest possible path
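        # summary statistics: NaN entries mark episodes with non-positive reward
        # and are excluded from the length statistics (nanmean / nanvar)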
        avg_reward = np.mean(rewards)

        avg_length = np.nanmean(steps)
        sigma_length = np.sqrt(np.nanvar(steps) / len(steps))
        percent_completed = np.count_nonzero(~np.isnan(steps)) / float(
            len(steps))
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}, Percent completed: {:04.2f}, Average length: {:04.2f} +/- {:04.2f}, n = {}".format(
                avg_reward, sigma_reward, percent_completed, avg_length,
                sigma_length, len(rewards))
            self.logger.info(msg)

        return avg_reward, percent_completed, avg_length
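Both examples lean on a ReplayBuffer that stacks recent frames (and, when use_memory is set, an external memory vector) into the network input. The snippet below is only a minimal sketch of that assumed interface, reconstructed from the calls made in evaluate() and train() (store_frame, encode_recent_observation, encode_recent_memory, store_effect, store_memory); the list-based storage and the stacking axis are illustrative assumptions, not the project's actual implementation.

import numpy as np


class SketchReplayBuffer(object):
    """Minimal stand-in exposing the interface used by evaluate()/train()."""

    def __init__(self, size, state_history, memory_size=0):
        self.size = size                    # capacity (ignored in this sketch)
        self.state_history = state_history  # number of frames stacked per input
        self.memory_size = memory_size      # length of the external memory vector
        self.frames, self.memories = [], []
        self.actions, self.rewards, self.dones = [], [], []

    def store_frame(self, frame):
        # append the raw observation and return its index
        self.frames.append(np.asarray(frame))
        return len(self.frames) - 1

    def encode_recent_observation(self):
        # stack the last `state_history` frames (zero-padded at episode start)
        recent = self.frames[-self.state_history:]
        pad = [np.zeros_like(recent[0])] * (self.state_history - len(recent))
        return np.stack(pad + recent, axis=-1)

    def encode_recent_memory(self):
        # most recent memory vector, or zeros before any has been stored
        if self.memories:
            return self.memories[-1][None, :]
        return np.zeros((1, self.memory_size))

    def store_effect(self, idx, action, reward, done):
        # record the transition associated with frame `idx`
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)

    def store_memory(self, idx, memory):
        # record the memory vector produced after acting on frame `idx`
        self.memories.append(np.asarray(memory))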
Example 2
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: exploration schedule such that
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: schedule for the learning rate
        """

        # initialize replay buffer and variables

        if self.config.use_memory:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size,
                self.config.state_history,
                memory_size=self.config.memory_unit_size)
        else:
            replay_buffer = ReplayBuffer(self.config.buffer_size,
                                         self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # step counters for evaluation/recording timing
        scores_eval = []  # evaluation scores collected over training
        scores_eval += [self.evaluate()[0]]

        prog = Progbar(target=self.config.nsteps_train)

        evaluation_result_list = []
        oos_evaluation_result_list = []

        # interact with environment
        prev_time = time.time()
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # store the frame and build the stacked network input
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                if self.config.use_memory:
                    prev_memory = replay_buffer.encode_recent_memory()
                    best_action, q_vals, _, next_memory = self.get_best_action_with_memory(
                        q_input, prev_memory)
                    next_memory = np.squeeze(next_memory)
                else:
                    best_action, q_vals = self.get_best_action(q_input)
                # choose action according to current Q and exploration
                action = exp_schedule.get_action(best_action)

                # store q values (q_vals is a separate name so it does not
                # shadow the q_values deque defined above)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                if self.config.use_memory:
                    replay_buffer.store_memory(idx, next_memory)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # wall-clock time logging every time_log_freq steps
                time_log_freq = 1000
                if t % time_log_freq == 0:
                    with open(self.config.output_path + 'time_log.txt',
                              'a') as of:
                        of.write('{}\n'.format(time.time() - prev_time))
                        of.write('\n')
                    prev_time = time.time()

                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg_R", self.avg_reward),
                                           ("Max_R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max_Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                score, complete, length = self.evaluate()
                if complete > 0:
                    evaluation_result_list += [(score, complete, length)]
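                # beyond the threshold, run a larger in-sample evaluation and
                # repeated out-of-sample evaluations on freshly generated mazes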
                if score > self.config.extended_eval_threshold:
                    self.logger.info('Extended in-sample evaluation...')
                    self.evaluate(num_episodes=1000)
                    for _ in range(10):
                        self.logger.info(
                            'Extended out-of-sample evaluation...')
                        oos_result = self.evaluate(
                            EnvMaze(n=self.config.maze_size), num_episodes=100)
                        oos_evaluation_result_list += [oos_result]
                scores_eval += [score]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()[0]]
        export_plot(scores_eval, "Scores", self.config.plot_output)

        return evaluation_result_list, oos_evaluation_result_list
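train() only relies on a small schedule contract: exp_schedule.get_action(best_action), exp_schedule.update(t) and exp_schedule.epsilon for exploration, plus lr_schedule.epsilon and lr_schedule.update(t) for the learning rate. A linearly annealed epsilon-greedy schedule is one plausible way to satisfy it; the classes below are an illustrative sketch of that contract, not the project's actual code, and the names LinearSchedule/LinearExploration as well as the gym-style env.action_space.sample() call are assumptions.

import numpy as np


class LinearSchedule(object):
    """Linearly anneals a scalar; usable for both epsilon and the learning rate."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps

    def update(self, t):
        # linear interpolation between eps_begin and eps_end, clamped after nsteps
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExploration(LinearSchedule):
    """Epsilon-greedy wrapper matching exp_schedule.get_action(best_action)."""

    def __init__(self, env, eps_begin, eps_end, nsteps):
        self.env = env
        super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action):
        # with probability epsilon explore randomly, otherwise act greedily
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action

Under those assumptions, training would be launched roughly as model.train(LinearExploration(env, 1.0, 0.1, nsteps), LinearSchedule(lr_begin, lr_end, nsteps)).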