Example #1
    def train(self,
              n_episodes=100,
              annealing_episodes=None,
              done_reward=None,
              every_episode=None):
        reward_history = []
        reward_averaged = []
        step = 0
        alpha = self.alpha
        eps = self.epsilon

        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (self.epsilon - self.epsilon_final) / annealing_episodes

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            done = False
            reward = 0.

            while not done:
                a = self.act(ob, eps)
                new_ob, r, done, info = self.env.step(a)
                if done and done_reward is not None:
                    r = done_reward

                self._update_q_value(Transition(ob, a, r, new_ob, done))

                step += 1
                reward += r
                ob = new_ob

            reward_history.append(reward)
            reward_averaged.append(np.average(reward_history[-50:]))

            alpha *= self.alpha_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if every_episode is not None and n_episode % every_episode == 0:
                # Report the performance every `every_episode` episodes
                print(
                    "[episode:{}|step:{}] best:{} avg:{:.4f}|{} alpha:{:.4f} eps:{:.4f} Qsize:{}"
                    .format(n_episode, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), reward_history[-5:],
                            alpha, eps, len(self.Q)))

        print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".
              format(len(reward_history), np.max(reward_history),
                     np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_avg50': reward_averaged,
        }
        plot_learning_curve(self.name, data_dict, xlabel='episode')
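All of the snippets on this page call a `Transition` tuple and a `plot_learning_curve` helper that are defined elsewhere in the repository. Below is a minimal sketch consistent with how they are called above; the field names and the plotting details are assumptions, not the repository's actual implementation.

from collections import namedtuple

import matplotlib.pyplot as plt

# Field order matches the positional call Transition(ob, a, r, new_ob, done);
# the field names themselves are assumptions.
Transition = namedtuple('Transition', ['s', 'a', 'r', 's_next', 'done'])


def plot_learning_curve(name, value_dict, xlabel='step'):
    # Plot each series in `value_dict` against its index and save the figure.
    plt.figure(figsize=(12, 4))
    for label, values in value_dict.items():
        plt.plot(range(len(values)), values, label=label)
    plt.xlabel(xlabel)
    plt.legend(loc='best')
    plt.grid(alpha=0.3)
    plt.savefig(name + '.png')
    plt.close()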
Example #2
    def train(self, config: TrainConfig):
        reward_history = []
        reward_averaged = []
        step = 0
        alpha = config.alpha
        eps = config.epsilon

        warmup_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False
            reward = 0.

            while not done:
                a = self.act(ob, eps)
                new_ob, r, done, info = self.env.step(a)
                if done and config.done_reward is not None:
                    r += config.done_reward

                self._update_q_value(Transition(ob, a, r, new_ob, done), alpha)

                step += 1
                reward += r
                ob = new_ob

            reward_history.append(reward)
            reward_averaged.append(np.average(reward_history[-50:]))

            alpha *= config.alpha_decay
            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop)

            if config.log_every_episode is not None and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}"
                    .format(n_episode, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), alpha, eps,
                            len(self.Q)))

        print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".
              format(len(reward_history), np.max(reward_history),
                     np.mean(reward_history)))

        data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged}
        plot_learning_curve(self.name, data_dict, xlabel='episode')
        self.env.render()
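From Example #2 onward, `train` receives a `TrainConfig` object instead of keyword arguments. A minimal sketch covering the fields read in Example #2 is given below; the default values are assumptions, and the later examples read further fields (batch_size, lr_a, lr_c, tau, n_steps, warmup_steps, ...) in the same style.

from dataclasses import dataclass
from typing import Optional


@dataclass
class TrainConfig:
    n_episodes: int = 500
    warmup_episodes: Optional[int] = None    # episodes over which epsilon is annealed
    epsilon: float = 1.0                     # initial exploration rate
    epsilon_final: float = 0.05              # exploration rate after annealing
    alpha: float = 0.5                       # Q-learning step size
    alpha_decay: float = 0.998               # multiplicative decay applied per episode
    done_reward: Optional[float] = None      # extra reward added on the terminal step
    log_every_episode: Optional[int] = 10    # print progress every this many episodes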
Example #3
    def train(self, config: TrainConfig):
        # Construct the replay memory buffer.
        buffer = ReplayMemory(tuple_class=Transition)

        step = 0
        n_episode = 0

        episode_reward = 0.
        episode_step = 0
        reward_history = []
        reward_averaged = []

        eps = config.epsilon
        eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps
        print("decrease `epsilon` per step:", eps_drop_per_step)

        env = self.env
        ob = env.reset()
        done = False

        while step < config.n_steps:
            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, _ = env.step(a)
                step += 1
                episode_step += 1
                episode_reward += r

                buffer.add(Transition(ob, a, r, ob_next, float(done)))
                ob = ob_next

                if eps > config.epsilon_final:
                    eps = max(config.epsilon_final, eps - eps_drop_per_step)

                if reward_history and config.log_every_step and step % config.log_every_step == 0:
                    # Report the performance every `log_every_step` steps
                    print(
                        "[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}"
                        .format(n_episode, step, np.max(reward_history),
                                np.mean(reward_history[-10:]), eps))
                    # self.save_checkpoint(step=step)

                if buffer.size >= config.batch_size:
                    batch = buffer.pop(config.batch_size)
                    _, q_loss, mu_loss, summ_str = self.sess.run(
                        [self.train_ops, self.Q_loss, self.mu_loss,
                         self.merged_summary],
                        feed_dict={
                            self.lr_a: config.lr_a,
                            self.lr_c: config.lr_c,
                            self.done: batch['done'],
                            self.s: batch['s'],
                            self.a: batch['a'],
                            self.r: batch['r'],
                            self.s_next: batch['s_next'],
                            self.ep_reward: (np.mean(reward_history[-10:])
                                             if reward_history else 0.0),
                        })
                    self.update_target_net(tau=config.tau)
                    self.writer.add_summary(summ_str, step)

            # one trajectory is complete.
            n_episode += 1
            ob = env.reset()
            done = False
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_step = 0
            episode_reward = 0.

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
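The `ReplayMemory` used in Examples #3 through #7 is only visible through its interface: it is constructed with a `tuple_class` (and, in Example #6, a `capacity`), grows via `add`, and returns mini-batches as dicts keyed by the tuple's field names through `pop`, `sample`, or `loop`. A rough sketch consistent with that usage follows; the internals are assumptions.

import random

import numpy as np


class ReplayMemory:
    # FIFO transition buffer that returns column-wise mini-batches.

    def __init__(self, capacity=100000, tuple_class=None):
        self.capacity = capacity
        self.fields = tuple_class._fields
        self.buffer = []

    @property
    def size(self):
        return len(self.buffer)

    def add(self, record):
        # Accept a single record or a list of records (a whole trajectory).
        records = record if isinstance(record, list) else [record]
        self.buffer.extend(records)
        if len(self.buffer) > self.capacity:
            self.buffer = self.buffer[-self.capacity:]

    def _batch_to_dict(self, records):
        return {f: np.array([getattr(rec, f) for rec in records])
                for f in self.fields}

    def sample(self, batch_size):
        return self._batch_to_dict(random.sample(self.buffer, batch_size))

    def pop(self, batch_size):
        # Remove and return the oldest `batch_size` records.
        records, self.buffer = self.buffer[:batch_size], self.buffer[batch_size:]
        return self._batch_to_dict(records)

    def loop(self, batch_size, epoch=1):
        # Yield shuffled mini-batches, making `epoch` passes over the buffer.
        for _ in range(epoch):
            random.shuffle(self.buffer)
            for i in range(0, len(self.buffer) - batch_size + 1, batch_size):
                yield self._batch_to_dict(self.buffer[i:i + batch_size])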
Example #4
    def train(self, config: TrainConfig):
        buffer = ReplayMemory(tuple_class=Transition)

        step = 0
        episode_reward = 0.
        reward_history = []
        reward_averaged = []

        lr_c = config.lr_c
        lr_a = config.lr_a

        eps = config.epsilon
        warmup_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (eps - config.epsilon_final) / warmup_episodes
        print("Decrease epsilon per step:", eps_drop)

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            self.act(ob, eps)
            done = False

            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, info = self.env.step(a)
                step += 1
                episode_reward += r

                record = Transition(self.obs_to_inputs(ob), a, r,
                                    self.obs_to_inputs(ob_next), done)
                buffer.add(record)

                ob = ob_next

                while buffer.size >= config.batch_size:
                    batch = buffer.pop(config.batch_size)
                    _, summ_str = self.sess.run(
                        [self.train_ops, self.merged_summary],
                        feed_dict={
                            self.lr_c: lr_c,
                            self.lr_a: lr_a,
                            self.s: batch['s'],
                            self.a: batch['a'],
                            self.r: batch['r'],
                            self.s_next: batch['s_next'],
                            self.done: batch['done'],
                            self.episode_reward: (np.mean(reward_history[-10:])
                                                  if reward_history else 0.0),
                        })
                    self.writer.add_summary(summ_str, step)

            # One trajectory is complete!
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_reward = 0.

            lr_c *= config.lr_c_decay
            lr_a *= config.lr_a_decay
            if eps > config.epsilon_final:
                eps -= eps_drop

            if (reward_history and config.log_every_episode
                    and n_episode % config.log_every_episode == 0):
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}"
                    .format(
                        n_episode,
                        step,
                        np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        reward_history[-5:],
                        lr_c,
                        lr_a,
                        eps,
                    ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
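All of the episode-based variants anneal epsilon linearly: the gap between the initial and final value is split evenly over the warm-up episodes and subtracted once per episode, then clamped at `epsilon_final`. A self-contained illustration of that schedule (the numbers are arbitrary):

epsilon, epsilon_final, warmup_episodes = 1.0, 0.02, 450
eps_drop = (epsilon - epsilon_final) / warmup_episodes  # ~0.00218 subtracted per episode

eps = epsilon
schedule = []
for _ in range(500):
    schedule.append(eps)
    if eps > epsilon_final:
        eps = max(epsilon_final, eps - eps_drop)

# After `warmup_episodes` episodes, eps has reached epsilon_final and stays there.
assert abs(schedule[warmup_episodes] - epsilon_final) < 1e-6
assert schedule[-1] == epsilon_final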
Example #5
    def train(self, config: TrainConfig):
        BufferRecord = namedtuple('BufferRecord', [
            's', 'a', 's_next', 'r', 'done', 'old_logp_actor', 'v_target',
            'adv'
        ])
        buffer = ReplayMemory(tuple_class=BufferRecord)

        reward_history = []
        reward_averaged = []
        step = 0
        total_rec = 0

        clip = config.ratio_clip_range
        if config.ratio_clip_decay:
            clip_delta = clip / config.n_iterations
        else:
            clip_delta = 0.0

        for n_iteration in range(config.n_iterations):

            # Ideally, multiple rollout workers would run in parallel; here rollouts are collected sequentially.
            for _ in range(config.n_rollout_workers):
                episode_reward, n_rec = self._generate_rollout(buffer)
                # One trajectory is complete.
                reward_history.append(episode_reward)
                reward_averaged.append(np.mean(reward_history[-10:]))
                total_rec += n_rec

            # Train on the collected rollouts for `train_epoches` epochs of mini-batches.
            for batch in buffer.loop(config.batch_size,
                                     epoch=config.train_epoches):
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary],
                    feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.clip_range: clip,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.s_next: batch['s_next'],
                        self.r: batch['r'],
                        self.done: batch['done'],
                        self.old_logp_a: batch['old_logp_actor'],
                        self.v_target: batch['v_target'],
                        self.adv: batch['adv'],
                        self.ep_reward: (np.mean(reward_history[-10:])
                                         if reward_history else 0.0),
                    })

                self.writer.add_summary(summ_str, step)
                step += 1

            clip = max(0.0, clip - clip_delta)

            if (reward_history and config.log_every_iteration
                    and n_iteration % config.log_every_iteration == 0):
                # Report the performance every `log_every_iteration` iterations
                print(
                    "[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions."
                    .format(
                        n_iteration, step, np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        list(map(lambda x: round(x, 2), reward_history[-5:])),
                        clip, total_rec))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
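`_generate_rollout` is not shown here, but the `BufferRecord` fields imply that it runs one episode, records the policy's log-probability of each action at collection time, and computes value targets and advantages before pushing the records into the buffer. A rough standalone sketch under those assumptions, using plain discounted returns as `v_target` and `v_target - V(s)` as the advantage (the original may well use GAE instead); `policy_step` is a hypothetical callable returning (action, log_prob, value_estimate).

import numpy as np


def generate_rollout(env, policy_step, buffer, BufferRecord, gamma=0.99):
    # Collect one full episode.
    obs, actions, rewards, dones, logps, values = [], [], [], [], [], []
    ob = env.reset()
    done = False
    while not done:
        a, logp, v = policy_step(ob)
        ob_next, r, done, _ = env.step(a)
        obs.append(ob)
        actions.append(a)
        rewards.append(r)
        dones.append(float(done))
        logps.append(logp)
        values.append(v)
        ob = ob_next
    obs.append(ob)  # final observation, serves as s_next of the last step

    # Discounted returns as value targets; advantage = return - value estimate.
    v_targets, ret = [], 0.0
    for r in reversed(rewards):
        ret = r + gamma * ret
        v_targets.append(ret)
    v_targets = v_targets[::-1]
    advs = np.array(v_targets) - np.array(values)

    for i in range(len(actions)):
        buffer.add(BufferRecord(obs[i], actions[i], obs[i + 1], rewards[i],
                                dones[i], logps[i], v_targets[i], advs[i]))
    return sum(rewards), len(actions)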
Example #6
    def train(self, config: TrainConfig):

        if self.model_type == 'lstm':
            buffer = ReplayTrajMemory(capacity=config.memory_capacity,
                                      step_size=self.step_size)
        else:
            buffer = ReplayMemory(capacity=config.memory_capacity)

        reward = 0.
        reward_history = [0.0]
        reward_averaged = []

        lr = config.lr
        eps = config.epsilon
        annealing_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes
        print("eps_drop:", eps_drop)
        step = 0

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False
            traj = []

            while not done:
                a = self.act(self.obs_to_inputs(ob), eps)
                new_ob, r, done, info = self.env.step(a)
                step += 1
                reward += r

                traj.append(
                    Transition(self.obs_to_inputs(ob), a, r,
                               self.obs_to_inputs(new_ob), done))
                ob = new_ob

                # Not enough samples in the buffer yet.
                if buffer.size < self.batch_size:
                    continue

                # Training with a mini batch of samples!
                batch_data = buffer.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: lr,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.states_next: batch_data['s_next'],
                    self.done_flags: batch_data['done'],
                    self.ep_reward: reward_history[-1],
                }

                if self.double_q:
                    actions_next = self.sess.run(
                        self.actions_selected_by_q,
                        {self.states: batch_data['s_next']})
                    feed_dict.update({self.actions_next: actions_next})

                _, q_val, q_target_val, loss, summ_str = self.sess.run([
                    self.optimizer, self.q, self.q_target, self.loss,
                    self.merged_summary
                ], feed_dict)
                self.writer.add_summary(summ_str, step)
                if step % config.target_update_every_step == 0:
                    self.update_target_q_net()

            # Add all the transitions of one trajectory into the replay memory.
            buffer.add(traj)

            # One episode is complete.
            reward_history.append(reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            reward = 0.

            # Anneal the learning rate and the exploration rate after every episode.
            lr *= config.lr_decay
            if eps > config.epsilon_final:
                eps = max(eps - eps_drop, config.epsilon_final)

            if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}, eps:{:.4f}, memory:{}"
                    .format(n_episode, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), reward_history[-5:],
                            lr, eps, buffer.size))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
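`update_target_q_net` in Example #6 is called periodically but not shown; in a standard DQN it copies the online network's weights into the target network. A sketch in the same TensorFlow 1.x style as the examples, assuming the two networks live in variable scopes named 'q_primary' and 'q_target' (the scope names are assumptions):

import tensorflow as tf


def build_target_update_op(primary_scope='q_primary', target_scope='q_target'):
    # Pair up variables by name order and copy primary -> target in one op.
    primary_vars = sorted(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=primary_scope),
        key=lambda v: v.name)
    target_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope),
        key=lambda v: v.name)
    return tf.group(*[t.assign(p) for p, t in zip(primary_vars, target_vars)])


# Usage: build the op once after constructing both networks, then run
# sess.run(update_op) every `target_update_every_step` steps.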
Example #7
    def train(self,
              n_episodes,
              annealing_episodes=None,
              every_episode=None,
              done_rewards=None):
        step = 0
        episode_reward = 0.
        reward_history = []
        reward_averaged = []

        lr_c = self.lr_c
        lr_a = self.lr_a

        eps = self.epsilon
        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (eps - self.epsilon_final) / annealing_episodes
        print("eps_drop:", eps_drop)

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            self.act(ob, eps)
            done = False

            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, info = self.env.step(a)
                step += 1
                episode_reward += r

                if done:
                    next_state_value = done_rewards or 0.0
                else:
                    with self.sess.as_default():
                        next_state_value = self.critic.eval(
                            {self.states: [self.obs_to_inputs(ob_next)]})[0][0]

                td_target = r + self.gamma * next_state_value
                self.memory.add(Record(self.obs_to_inputs(ob), a, r,
                                       td_target))
                ob = ob_next

                while self.memory.size >= self.batch_size:
                    batch = self.memory.pop(self.batch_size)
                    _, summ_str = self.sess.run(
                        [self.train_ops, self.merged_summary],
                        feed_dict={
                            self.learning_rate_c: lr_c,
                            self.learning_rate_a: lr_a,
                            self.states: batch['s'],
                            self.actions: batch['a'],
                            self.rewards: batch['r'],
                            self.td_targets: batch['td_target'],
                            self.ep_reward: (reward_history[-1]
                                             if reward_history else 0.0),
                        })
                    self.writer.add_summary(summ_str, step)

            # One trajectory is complete!
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_reward = 0.

            lr_c *= self.lr_c_decay
            lr_a *= self.lr_a_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if reward_history and every_episode and n_episode % every_episode == 0:
                # Report the performance every `every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}"
                    .format(
                        n_episode,
                        step,
                        np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        reward_history[-5:],
                        lr_c,
                        lr_a,
                        eps,
                    ))
                # self.save_model(step=step)

        self.save_model(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
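The `Record` tuple stored in Example #7 is not defined in the snippet, but its layout follows from the positional call and from the batch keys 's', 'a', 'r', 'td_target' read back in the feed_dict; a sketch:

from collections import namedtuple

# Inferred from Record(self.obs_to_inputs(ob), a, r, td_target) and the batch keys above.
Record = namedtuple('Record', ['s', 'a', 'r', 'td_target'])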
Example #8
    def train(self, n_episodes, every_episode):
        step = 0
        episode_reward = 0.
        reward_history = []
        reward_averaged = []

        lr = self.lr

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            done = False

            obs = []
            actions = []
            rewards = []
            returns = []

            while not done:
                a = self.act(ob)
                new_ob, r, done, info = self.env.step(a)
                step += 1
                episode_reward += r

                obs.append(self.obs_to_inputs(ob))
                actions.append(a)
                rewards.append(r)
                ob = new_ob

            # One trajectory is complete!
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_reward = 0.
            lr *= self.lr_decay

            # Estimate returns backwards.
            return_so_far = 0.0
            for r in rewards[::-1]:
                return_so_far = self.gamma * return_so_far + r
                returns.append(return_so_far)

            returns = returns[::-1]

            _, summ_str = self.sess.run(
                [self.train_ops, self.merged_summary],
                feed_dict={
                    self.learning_rate: lr,
                    self.states: np.array(obs),
                    self.actions: np.array(actions),
                    self.returns: np.array(returns),
                    self.ep_reward: reward_history[-1],
                })
            self.writer.add_summary(summ_str, step)

            if reward_history and every_episode and n_episode % every_episode == 0:
                # Report the performance every `every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}".
                    format(
                        n_episode,
                        step,
                        np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        reward_history[-5:],
                        lr,
                    ))
                # self.save_model(step=step)

        self.save_model(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
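Example #8 estimates returns with the backwards recursion G_t = r_t + gamma * G_{t+1}. A small self-contained version of that recursion with a worked numeric check (the numbers are arbitrary):

import numpy as np


def discounted_returns(rewards, gamma):
    # Same backwards recursion as in Example #8, as a standalone helper.
    returns, ret = [], 0.0
    for r in reversed(rewards):
        ret = gamma * ret + r
        returns.append(ret)
    return returns[::-1]


# With gamma=0.5 and rewards [1, 2, 3]:
#   G_2 = 3,  G_1 = 2 + 0.5 * 3 = 3.5,  G_0 = 1 + 0.5 * 3.5 = 2.75
assert np.allclose(discounted_returns([1.0, 2.0, 3.0], 0.5), [2.75, 3.5, 3.0])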