Example #1
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluate the policy using the same procedure as in training.
        """
        # default call: log and fall back to the configured number of test episodes
        if num_episodes is None:
            self.logger.info("Evaluating...")
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env
        env.state.is_render_image = self.config.render_test

        # replay buffer used to stack recent frames into Q-network inputs
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            goal_state = env.teacher.goal_obs_onehot_state  # h x w x c
            # initial recurrent state (presumably the (h, c) pair of an LSTM)
            h_state = (np.zeros([1, self.config.h_size]),
                       np.zeros([1, self.config.h_size]))
            slen = np.ones(1, dtype='int32')  # sequence length of 1 per step
            action = 0  # dummy previous action fed to the network
            for j in range(50):  # cap evaluation episodes at 50 steps
                if self.config.render_test: env.render()

                # store the current state and build the stacked Q-network input
                idx = replay_buffer.store_frame(state, goal_state)
                q_input = replay_buffer.encode_recent_observation()

                action, action_q, h_state = self.get_action(
                    [q_input], goal_state[None][None], h_state, slen, [action])

                # perform action in env
                new_state, reward, done = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # count reward
                total_reward += reward
                if done:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
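        # standard error of the mean across evaluation episodes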
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
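
Both examples rely on a small ReplayBuffer interface: store_frame returns an index, encode_recent_observation stacks the last few frames into the Q-network input, and store_effect records the transition outcome. The project's actual class is not shown here; a minimal sketch of that interface, assuming plain NumPy frame stacking and the immediate store_frame/store_effect pairing seen in the loops above, might look like:

import numpy as np

class ReplayBuffer:
    """Minimal sketch of the buffer interface used above (hypothetical)."""

    def __init__(self, size, state_history):
        self.size = size                      # max stored transitions
        self.state_history = state_history    # frames stacked per observation
        self.frames, self.goals = [], []
        self.actions, self.rewards, self.dones = [], [], []

    def store_frame(self, frame, goal):
        """Store one observation; return its index for store_effect."""
        if len(self.frames) == self.size:     # evict the oldest transition
            for buf in (self.frames, self.goals, self.actions,
                        self.rewards, self.dones):
                buf.pop(0)
        self.frames.append(frame)
        self.goals.append(goal)
        self.actions.append(None)
        self.rewards.append(None)
        self.dones.append(None)
        return len(self.frames) - 1

    def encode_recent_observation(self):
        """Stack the last state_history frames along the channel axis."""
        recent = self.frames[-self.state_history:]
        while len(recent) < self.state_history:   # pad at episode start
            recent = [recent[0]] + recent
        return np.concatenate(recent, axis=-1)    # h x w x (c * history)

    def store_effect(self, idx, action, reward, done):
        """Record action/reward/done for the frame stored at idx."""
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.dones[idx] = done

This sketch assumes store_effect is called immediately after store_frame, as both loops above do, so the returned index is still valid when the oldest entry is evicted.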
Example #2
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of the Q-network

        Args:
            exp_schedule: Exploration instance such that
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # step counters for training / eval / recording
        scores_eval = []  # evaluation scores collected over training

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()  # h x w x c
            goal_state = self.env.teacher.goal_obs_onehot_state  # h x w x c
            # initial recurrent state (presumably the (h, c) pair of an LSTM)
            h_state = (np.zeros([1, self.config.h_size]),
                       np.zeros([1, self.config.h_size]))
            slen = np.ones(1, dtype='int32')  # sequence length of 1 per step
            action = 0  # dummy previous action fed to the network
            for i in range(200):  # cap training episodes at 200 steps
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                # store the current state and build the stacked Q-network input
                idx = replay_buffer.store_frame(state, goal_state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q-values and exploration
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], goal_state[None][None], h_state, slen, [action])
                action = exp_schedule.get_action(best_action)

                # record q values (q_vals must not shadow the q_values deque)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                # perform action in env
                new_state, reward, done = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
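                # the schedule exposes its current value as .epsilon,
                # which is reused here as the learning rate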
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start
                    and last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
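
The train loop only assumes that exp_schedule exposes get_action(best_action), epsilon, and update(t), and that lr_schedule exposes epsilon and update(t), with its epsilon attribute doubling as the current learning rate when passed to train_step. A minimal linear epsilon-greedy pair matching that interface, with illustrative names rather than the project's actual classes, could be:

import numpy as np

class LinearSchedule:
    """Linearly anneal a value from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps
        self.epsilon = eps_begin  # current value, read by the training loop

    def update(self, t):
        # linear interpolation, clamped at eps_end once t reaches nsteps
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExploration(LinearSchedule):
    """Epsilon-greedy wrapper: with probability epsilon, act randomly."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        super().__init__(eps_begin, eps_end, nsteps)
        self.num_actions = num_actions

    def get_action(self, best_action):
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return best_action

Under those assumptions, training would start with something like agent.train(LinearExploration(num_actions, 1.0, 0.1, nsteps), LinearSchedule(1e-3, 1e-5, nsteps)).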