Example #1
    def train(self, config: TrainConfig):
        reward_history = []
        reward_averaged = []
        step = 0
        alpha = config.alpha
        eps = config.epsilon

        warmup_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False
            reward = 0.

            while not done:
                a = self.act(ob, eps)
                new_ob, r, done, info = self.env.step(a)
                if done and config.done_reward is not None:
                    r += config.done_reward

                self._update_q_value(Transition(ob, a, r, new_ob, done), alpha)

                step += 1
                reward += r
                ob = new_ob

            reward_history.append(reward)
            reward_averaged.append(np.average(reward_history[-50:]))

            alpha *= config.alpha_decay
            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop)

            if config.log_every_episode is not None and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes.
                print(
                    "[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}"
                    .format(n_episode, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), alpha, eps,
                            len(self.Q)))

        print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".
              format(len(reward_history), np.max(reward_history),
                     np.mean(reward_history)))

        data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged}
        plot_learning_curve(self.name, data_dict, xlabel='episode')
        self.env.render()
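
Example #1 assumes an agent with an `act` method, an `_update_q_value` method, and a tabular `Q`, none of which appear in the snippet. A minimal sketch of those pieces is shown below, assuming a discrete Gym action space; the class name, the `gamma` discount, and the `Transition` field names are illustrative assumptions, not part of the snippet above.

import numpy as np
from collections import defaultdict, namedtuple

# Field names are an assumption; the loop above only relies on positional order.
Transition = namedtuple('Transition', ['s', 'a', 'r', 's_next', 'done'])


class TabularQAgentSketch:
    """Hypothetical agent internals behind the training loop in Example #1."""

    def __init__(self, env, gamma=0.99, name='q_learning'):
        self.env = env
        self.gamma = gamma
        self.name = name
        self.actions = list(range(env.action_space.n))
        self.Q = defaultdict(float)  # maps (state, action) -> estimated return

    def act(self, state, eps=0.0):
        # Epsilon-greedy: explore with probability `eps`, otherwise take the
        # action with the highest Q-value in the current state.
        if np.random.random() < eps:
            return self.env.action_space.sample()
        return max(self.actions, key=lambda a: self.Q[state, a])

    def _update_q_value(self, tr, alpha):
        # Standard Q-learning update:
        #   Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next = max(self.Q[tr.s_next, a] for a in self.actions)
        target = tr.r + self.gamma * best_next * (1.0 - float(tr.done))
        self.Q[tr.s, tr.a] += alpha * (target - self.Q[tr.s, tr.a])

Note that the observation is used directly as a dictionary key here, so it must be hashable (for example, a discretized observation).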
Example #2
    def train(self, config: TrainConfig):
        # Construct the replay memory buffer.
        buffer = ReplayMemory(tuple_class=Transition)

        step = 0
        n_episode = 0

        episode_reward = 0.
        episode_step = 0
        reward_history = []
        reward_averaged = []

        eps = config.epsilon
        eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps
        print("decrease `epsilon` per step:", eps_drop_per_step)

        env = self.env
        ob = env.reset()
        done = False

        while step < config.n_steps:
            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, _ = env.step(a)
                step += 1
                episode_step += 1
                episode_reward += r

                buffer.add(Transition(ob, a, r, ob_next, float(done)))
                ob = ob_next

                if eps > config.epsilon_final:
                    eps = max(config.epsilon_final, eps - eps_drop_per_step)

                if reward_history and config.log_every_step and step % config.log_every_step == 0:
                    # Report the performance every `log_every_step` steps
                    print(
                        "[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}"
                        .format(n_episode, step, np.max(reward_history),
                                np.mean(reward_history[-10:]), eps))
                    # self.save_checkpoint(step=step)

                if buffer.size >= config.batch_size:
                    batch = buffer.pop(config.batch_size)
                    _, q_loss, mu_loss, summ_str = self.sess.run(
                        [self.train_ops, self.Q_loss, self.mu_loss,
                         self.merged_summary],
                        feed_dict={
                            self.lr_a: config.lr_a,
                            self.lr_c: config.lr_c,
                            self.done: batch['done'],
                            self.s: batch['s'],
                            self.a: batch['a'],
                            self.r: batch['r'],
                            self.s_next: batch['s_next'],
                            self.ep_reward: (np.mean(reward_history[-10:])
                                             if reward_history else 0.0),
                        })
                    self.update_target_net(tau=config.tau)
                    self.writer.add_summary(summ_str, step)

            # one trajectory is complete.
            n_episode += 1
            ob = env.reset()
            done = False
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_step = 0
            episode_reward = 0.

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
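
Examples #2 and #3 both rely on a `ReplayMemory` exposing `add`, `pop`, and `size`, where `pop(batch_size)` returns a batch as a dict of arrays keyed 's', 'a', 'r', 's_next', and 'done'. The sketch below matches that usage; the FIFO `pop` behavior and the `capacity` default are assumptions, and the real implementation may sample or evict differently.

import numpy as np
from collections import deque, namedtuple

Transition = namedtuple('Transition', ['s', 'a', 'r', 's_next', 'done'])


class ReplayMemorySketch:
    """Minimal buffer matching the add/pop/size interface used above."""

    def __init__(self, capacity=100000, tuple_class=Transition):
        self.buffer = deque(maxlen=capacity)
        self.fields = tuple_class._fields

    @property
    def size(self):
        return len(self.buffer)

    def add(self, record):
        # `record` is a single Transition namedtuple.
        self.buffer.append(record)

    def pop(self, batch_size):
        # Remove the oldest `batch_size` records and return them as a dict of
        # arrays keyed by field name, matching the feed_dict lookups above.
        batch = [self.buffer.popleft() for _ in range(batch_size)]
        return {f: np.array([getattr(t, f) for t in batch])
                for f in self.fields}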
Example #3
    def train(self, config: TrainConfig):
        buffer = ReplayMemory(tuple_class=Transition)

        step = 0
        episode_reward = 0.
        reward_history = []
        reward_averaged = []

        lr_c = config.lr_c
        lr_a = config.lr_a

        eps = config.epsilon
        warmup_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (eps - config.epsilon_final) / warmup_episodes
        print("Decrease epsilon per step:", eps_drop)

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False

            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, info = self.env.step(a)
                step += 1
                episode_reward += r

                record = Transition(self.obs_to_inputs(ob), a, r,
                                    self.obs_to_inputs(ob_next), done)
                buffer.add(record)

                ob = ob_next

                while buffer.size >= config.batch_size:
                    batch = buffer.pop(config.batch_size)
                    _, summ_str = self.sess.run(
                        [self.train_ops, self.merged_summary],
                        feed_dict={
                            self.lr_c: lr_c,
                            self.lr_a: lr_a,
                            self.s: batch['s'],
                            self.a: batch['a'],
                            self.r: batch['r'],
                            self.s_next: batch['s_next'],
                            self.done: batch['done'],
                            self.episode_reward: (np.mean(reward_history[-10:])
                                                  if reward_history else 0.0),
                        })
                    self.writer.add_summary(summ_str, step)

            # One trajectory is complete!
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_reward = 0.

            lr_c *= config.lr_c_decay
            lr_a *= config.lr_a_decay
            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop)

            if (reward_history and config.log_every_episode
                    and n_episode % config.log_every_episode == 0):
                # Report the performance every `log_every_episode` episodes.
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}, recent:{}, "
                    "lr:{:.4f}|{:.4f}, eps:{:.4f}".format(
                        n_episode,
                        step,
                        np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        reward_history[-5:],
                        lr_c,
                        lr_a,
                        eps,
                    ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
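
The `TrainConfig` object passed into these methods is never shown. The dataclass below collects the fields that Example #3 actually reads; the default values are placeholders, not the original settings.

from dataclasses import dataclass
from typing import Optional


@dataclass
class TrainConfigSketch:
    """Fields read by the loop in Example #3; defaults are illustrative only."""
    n_episodes: int = 500
    warmup_episodes: Optional[int] = None   # None falls back to n_episodes
    batch_size: int = 64
    epsilon: float = 1.0                    # initial exploration rate
    epsilon_final: float = 0.02
    lr_a: float = 1e-4                      # actor learning rate
    lr_c: float = 1e-3                      # critic learning rate
    lr_a_decay: float = 1.0                 # multiplicative decay per episode
    lr_c_decay: float = 1.0
    log_every_episode: Optional[int] = 10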
Example #4
    def train(self, config: TrainConfig):

        if self.model_type == 'lstm':
            buffer = ReplayTrajMemory(capacity=config.memory_capacity,
                                      step_size=self.step_size)
        else:
            buffer = ReplayMemory(capacity=config.memory_capacity)

        reward = 0.
        reward_history = [0.0]
        reward_averaged = []

        lr = config.lr
        eps = config.epsilon
        annealing_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes
        print("eps_drop:", eps_drop)
        step = 0

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False
            traj = []

            while not done:
                a = self.act(self.obs_to_inputs(ob), eps)
                new_ob, r, done, info = self.env.step(a)
                step += 1
                reward += r

                traj.append(
                    Transition(self.obs_to_inputs(ob), a, r,
                               self.obs_to_inputs(new_ob), done))
                ob = new_ob

                # Not enough samples in the buffer yet.
                if buffer.size < self.batch_size:
                    continue

                # Training with a mini batch of samples!
                batch_data = buffer.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: lr,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.states_next: batch_data['s_next'],
                    self.done_flags: batch_data['done'],
                    self.ep_reward: reward_history[-1],
                }

                if self.double_q:
                    actions_next = self.sess.run(
                        self.actions_selected_by_q,
                        {self.states: batch_data['s_next']})
                    feed_dict.update({self.actions_next: actions_next})

                _, q_val, q_target_val, loss, summ_str = self.sess.run([
                    self.optimizer, self.q, self.q_target, self.loss,
                    self.merged_summary
                ], feed_dict)
                self.writer.add_summary(summ_str, step)
                if step % config.target_update_every_step == 0:
                    self.update_target_q_net()

            # Add all the transitions of one trajectory into the replay memory.
            buffer.add(traj)

            # One episode is complete.
            reward_history.append(reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            reward = 0.

            # Anneal the learning rate and exploration rate after every episode.
            lr *= config.lr_decay
            if eps > config.epsilon_final:
                eps = max(eps - eps_drop, config.epsilon_final)

            if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes.
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}, recent:{}, "
                    "lr:{:.4f}, eps:{:.4f}, memory:{}".format(
                        n_episode, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        lr, eps, buffer.size))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
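
Example #4 periodically calls `update_target_q_net()`, which is assumed here to be a hard copy of the online Q-network's weights into the target network. Since the examples run a TensorFlow 1.x session, a graph-mode sketch follows; the scope names and the function itself are hypothetical.

import tensorflow as tf


def build_hard_target_update_op(main_scope='q_primary', target_scope='q_target'):
    # Look up both networks' variables by scope (assumed scope names) and sort
    # by name so corresponding variables line up.
    main_vars = sorted(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=main_scope),
        key=lambda v: v.name)
    target_vars = sorted(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope),
        key=lambda v: v.name)
    # Copy every online-network variable into its target counterpart.
    return tf.group(*[
        t_var.assign(m_var) for m_var, t_var in zip(main_vars, target_vars)
    ])

The resulting op would be run in the session once every `target_update_every_step` steps, matching the modulo check in the loop above.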