Example 1
    def train(self, config: TrainConfig):
        reward_history = []
        reward_averaged = []
        step = 0
        alpha = config.alpha
        eps = config.epsilon

        warmup_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False
            reward = 0.

            while not done:
                a = self.act(ob, eps)
                new_ob, r, done, info = self.env.step(a)
                if done and config.done_reward is not None:
                    r += config.done_reward

                self._update_q_value(Transition(ob, a, r, new_ob, done), alpha)

                step += 1
                reward += r
                ob = new_ob

            reward_history.append(reward)
            reward_averaged.append(np.average(reward_history[-50:]))

            alpha *= config.alpha_decay
            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop)

            if config.log_every_episode is not None and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}"
                    .format(n_episode, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), alpha, eps,
                            len(self.Q)))

        print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".
              format(len(reward_history), np.max(reward_history),
                     np.mean(reward_history)))

        data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged}
        plot_learning_curve(self.name, data_dict, xlabel='episode')
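Example 1 relies on a `_update_q_value` helper that is not shown above. A minimal sketch of the one-step tabular Q-learning update it presumably performs, assuming `self.Q` is a defaultdict(float) keyed by `(state, action)` tuples, `self.actions` lists the discrete actions, `self.gamma` is the discount factor, and `Transition` exposes `s, a, r, s_next, done` fields (all of these names are assumptions, not part of the original listing):

    def _update_q_value(self, tr: Transition, alpha):
        # Bootstrapped target r + gamma * max_a' Q(s', a'); no bootstrapping on terminal states.
        max_q_next = max(self.Q[tr.s_next, a] for a in self.actions)
        target = tr.r + (1.0 - float(tr.done)) * self.gamma * max_q_next
        # Move Q(s, a) toward the target at learning rate alpha.
        self.Q[tr.s, tr.a] += alpha * (target - self.Q[tr.s, tr.a])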
Example 2
    def train(self, config: TrainConfig):
        step = 0
        episode_reward = 0.
        reward_history = []
        reward_averaged = []

        lr = config.lr

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False

            obs = []
            actions = []
            rewards = []
            returns = []

            while not done:
                a = self.act(ob)
                new_ob, r, done, info = self.env.step(a)
                step += 1
                episode_reward += r

                obs.append(self.obs_to_inputs(ob))
                actions.append(a)
                rewards.append(r)
                ob = new_ob

            # One trajectory is complete!
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_reward = 0.
            lr *= config.lr_decay

            # Estimate returns backwards.
            return_so_far = 0.0
            for r in rewards[::-1]:
                return_so_far = self.gamma * return_so_far + r
                returns.append(return_so_far)

            returns = returns[::-1]

            _, summ_str = self.sess.run(
                [self.train_ops, self.merged_summary],
                feed_dict={
                    self.lr: lr,
                    self.s: np.array(obs),
                    self.a: np.array(actions),
                    self.returns: np.array(returns),
                    self.ep_reward: reward_history[-1],
                })
            self.writer.add_summary(summ_str, step)

            if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}".
                    format(
                        n_episode,
                        step,
                        np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        reward_history[-5:],
                        lr,
                    ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
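The backwards loop in Example 2 accumulates discounted returns with the recursion G_t = r_t + gamma * G_{t+1}. The same computation as a standalone helper with a small numeric check (the function name is illustrative, not part of the original code):

import numpy as np

def discount_returns(rewards, gamma):
    # Compute G_t = r_t + gamma * G_{t+1}, right to left.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# With gamma = 0.5, rewards [1, 1, 1] yield returns [1.75, 1.5, 1.0].
assert np.allclose(discount_returns([1.0, 1.0, 1.0], 0.5), [1.75, 1.5, 1.0])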
Example 3
    def train(self, config: TrainConfig):

        if self.model_type == 'lstm':
            buffer = ReplayTrajMemory(capacity=config.memory_capacity, step_size=self.step_size)
        else:
            buffer = ReplayMemory(capacity=config.memory_capacity)

        reward = 0.
        reward_history = [0.0]
        reward_averaged = []

        lr = config.lr
        eps = config.epsilon
        annealing_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes
        print("eps_drop:", eps_drop)
        step = 0

        # Obtain a state transition table from somewhere:
        # decomposeStates(self, st_table)

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False
            traj = []

            while not done:
                a = self.act(self.obs_to_inputs(ob), eps)
                new_ob, r, done, info = self.env.step(a)
                step += 1
                reward += r

                traj.append(
                    Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(new_ob), done))
                ob = new_ob

                # Not enough samples in the buffer yet.
                if buffer.size < self.batch_size:
                    continue

                # Training with a mini batch of samples!
                batch_data = buffer.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: lr,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.states_next: batch_data['s_next'],
                    self.done_flags: batch_data['done'],
                    self.ep_reward: reward_history[-1],
                }

                if self.double_q:
                    actions_next = self.sess.run(self.actions_selected_by_q, {
                        self.states: batch_data['s_next']
                    })
                    feed_dict.update({self.actions_next: actions_next})

                _, q_val, q_target_val, loss, summ_str = self.sess.run(
                    [self.optimizer, self.q, self.q_target, self.loss, self.merged_summary],
                    feed_dict
                )
                self.writer.add_summary(summ_str, step)
                if step % config.target_update_every_step == 0:
                    self.update_target_q_net()

            # Add all the transitions of one trajectory into the replay memory.
            buffer.add(traj)

            # One episode is complete.
            reward_history.append(reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            reward = 0.

            # Annealing the learning and exploration rate after every episode.
            lr *= config.lr_decay
            if eps > config.epsilon_final:
                eps = max(eps - eps_drop, config.epsilon_final)

            if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}, eps:{:.4f}, buffer:{}".format(
                        n_episode, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        lr, eps, buffer.size
                    ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
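Example 3 assumes a `ReplayMemory` (and a trajectory variant, `ReplayTrajMemory`) exposing `add`, `sample`, and `size`. A minimal sketch of the uniform-sampling version, assuming `Transition` has fields `s, a, r, s_next, done` and that `add` receives a whole trajectory (a list of transitions):

import random
import numpy as np

class ReplayMemory:
    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.buffer = []

    def add(self, transitions):
        # Store every transition of the trajectory; drop the oldest records once over capacity.
        self.buffer.extend(transitions)
        if len(self.buffer) > self.capacity:
            self.buffer = self.buffer[-self.capacity:]

    @property
    def size(self):
        return len(self.buffer)

    def sample(self, batch_size):
        # Uniformly sample a mini batch and stack each field, keyed the way the training loop expects.
        batch = random.sample(self.buffer, batch_size)
        return {k: np.array([getattr(t, k) for t in batch])
                for k in ('s', 'a', 'r', 's_next', 'done')}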
Example 4
    def train(self, config: TrainConfig):
        buffer = ReplayMemory(tuple_class=Transition)

        step = 0
        episode_reward = 0.
        reward_history = []
        reward_averaged = []

        lr_c = config.lr_c
        lr_a = config.lr_a

        eps = config.epsilon
        warmup_episodes = config.warmup_episodes or config.n_episodes
        eps_drop = (eps - config.epsilon_final) / warmup_episodes
        print("Decrease epsilon per step:", eps_drop)

        for n_episode in range(config.n_episodes):
            ob = self.env.reset()
            done = False

            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, info = self.env.step(a)
                step += 1
                episode_reward += r

                record = Transition(self.obs_to_inputs(ob), a, r,
                                    self.obs_to_inputs(ob_next), done)
                buffer.add(record)

                ob = ob_next

                while buffer.size >= config.batch_size:
                    batch = buffer.pop(config.batch_size)
                    _, summ_str = self.sess.run(
                        [self.train_ops, self.merged_summary],
                        feed_dict={
                            self.lr_c: lr_c,
                            self.lr_a: lr_a,
                            self.s: batch['s'],
                            self.a: batch['a'],
                            self.r: batch['r'],
                            self.s_next: batch['s_next'],
                            self.done: batch['done'],
                            self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                        })
                    self.writer.add_summary(summ_str, step)

            # One trajectory is complete!
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_reward = 0.

            lr_c *= config.lr_c_decay
            lr_a *= config.lr_a_decay
            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop)

            if (reward_history and config.log_every_episode
                    and n_episode % config.log_every_episode == 0):
                # Report the performance every `log_every_episode` episodes
                print(
                    "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}"
                    .format(
                        n_episode,
                        step,
                        np.max(reward_history),
                        np.mean(reward_history[-10:]),
                        reward_history[-5:],
                        lr_c,
                        lr_a,
                        eps,
                    ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
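All of these train() methods take a TrainConfig object whose definition is not shown. A plausible shape for the actor-critic example above, inferred only from the fields it accesses (default values are illustrative guesses):

from dataclasses import dataclass
from typing import Optional

@dataclass
class TrainConfig:
    n_episodes: int = 500
    batch_size: int = 32
    lr_c: float = 0.001            # critic learning rate
    lr_a: float = 0.0001           # actor learning rate
    lr_c_decay: float = 1.0
    lr_a_decay: float = 1.0
    epsilon: float = 1.0
    epsilon_final: float = 0.01
    warmup_episodes: Optional[int] = None  # falls back to n_episodes when None
    log_every_episode: int = 10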
Example 5
    def train(self, config: TrainConfig):
        # Construct the replay memory buffer.
        buffer = ReplayMemory(tuple_class=Transition)

        step = 0
        n_episode = 0

        episode_reward = 0.
        episode_step = 0
        reward_history = []
        reward_averaged = []

        eps = config.epsilon
        eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps
        print("decrease `epsilon` per step:", eps_drop_per_step)

        env = self.env
        ob = env.reset()
        done = False

        while step < config.n_steps:
            while not done:
                a = self.act(ob, eps)
                ob_next, r, done, _ = env.step(a)
                step += 1
                episode_step += 1
                episode_reward += r

                buffer.add(Transition(ob, a, r, ob_next, float(done)))
                ob = ob_next

                if eps > config.epsilon_final:
                    eps = max(config.epsilon_final, eps - eps_drop_per_step)

                if reward_history and config.log_every_step and step % config.log_every_step == 0:
                    # Report the performance every `log_every_step` steps
                    print(
                        "[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}"
                        .format(n_episode, step, np.max(reward_history),
                                np.mean(reward_history[-10:]), eps))
                    # self.save_checkpoint(step=step)

                if buffer.size >= config.batch_size:
                    batch = buffer.pop(config.batch_size)
                    _, q_loss, mu_loss, summ_str = self.sess.run(
                        [self.train_ops, self.Q_loss, self.mu_loss, self.merged_summary],
                        feed_dict={
                            self.lr_a: config.lr_a,
                            self.lr_c: config.lr_c,
                            self.done: batch['done'],
                            self.s: batch['s'],
                            self.a: batch['a'],
                            self.r: batch['r'],
                            self.s_next: batch['s_next'],
                            self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                        })
                    self.update_target_net(tau=config.tau)
                    self.writer.add_summary(summ_str, step)

            # one trajectory is complete.
            n_episode += 1
            ob = env.reset()
            done = False
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            episode_step = 0
            episode_reward = 0.

        self.save_checkpoint(step=step)

        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
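Example 5 calls self.update_target_net(tau=config.tau), which is not shown. A minimal sketch of the Polyak (soft) target update it presumably performs, written in the same TF 1.x style and assuming the online and target network variables live under hypothetical variable scopes 'main' and 'target':

import tensorflow as tf  # TF 1.x / tf.compat.v1 graph-mode style, matching the examples

def build_soft_update_op(tau, main_scope='main', target_scope='target'):
    # theta_target <- tau * theta_main + (1 - tau) * theta_target
    main_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=main_scope),
                       key=lambda v: v.name)
    target_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope),
                         key=lambda v: v.name)
    assert len(main_vars) == len(target_vars)
    return tf.group(*[tf.assign(t, tau * m + (1.0 - tau) * t)
                      for m, t in zip(main_vars, target_vars)])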
Example 6
    def run(self):

        n_episodes = self.config.n_episodes
        loss = None

        total_rewards = np.empty(n_episodes)

        solved_consecutively = 0

        for i in range(n_episodes):
            state = self.env.reset()
            done = False

            score = 0
            while not done:
                action, q_value = self.get_action(
                    state, self.config.epsilon_schedule.current_p)
                next_state, reward, done, info = self.env.step(action)
                self.memory.add(
                    Transition(state, action, reward, next_state, done))
                score += reward

                state = next_state

                if self.memory.size > self.batch_size:
                    loss = self.train()
                    if i % self.config.target_update_every_step == 0:
                        self.update_target()

            total_rewards[i] = score
            avg_rewards = total_rewards[max(0, i - 100):(i + 1)].mean()

            self.config.epsilon_schedule.anneal()
            self.config.beta_schedule.anneal()
            self.global_lr.assign(self.config.learning_rate_schedule.anneal())

            with self.writer.as_default():
                with tf.name_scope('Performance'):
                    tf.summary.scalar('episode reward', score, step=i)
                    tf.summary.scalar('running avg reward(100)',
                                      avg_rewards,
                                      step=i)

                if self.config.prioritized_memory_replay:
                    with tf.name_scope('Schedules'):
                        tf.summary.scalar('Beta',
                                          self.config.beta_schedule.current_p,
                                          step=i)
                        tf.summary.scalar(
                            'Epsilon',
                            self.config.epsilon_schedule.current_p,
                            step=i)
                        tf.summary.scalar('Learning rate',
                                          self.optimizer._decayed_lr(
                                              tf.float32).numpy(),
                                          step=i)

            # Specific to Mountain Car
            if done and score == 500:
                solved_consecutively += 1
            else:
                solved_consecutively = 0

            if solved_consecutively >= 50:
                print(f'Successfully SOLVED {solved_consecutively} times!')
                break

            if i % self.config.log_every_episode == 0:
                print("episode:", i, "/", self.config.n_episodes,
                      "episode reward:", score, "avg reward (last 100):",
                      avg_rewards, "eps:",
                      self.config.epsilon_schedule.current_p,
                      "Learning rate (10e-3):",
                      (self.optimizer._decayed_lr(tf.float32).numpy() * 1000),
                      "Consecutively solved:", solved_consecutively)

        plot_learning_curve(self.name + '.png', {'rewards': total_rewards})
        self.save()
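Example 6 drives exploration, beta, and the learning rate through schedule objects exposing current_p and anneal(). None of those classes are shown; a minimal linear-annealing sketch consistent with that interface (class name and constructor arguments are assumptions):

class LinearSchedule:
    def __init__(self, initial_p, final_p, n_steps):
        # Anneal linearly from initial_p down to final_p over n_steps calls to anneal().
        self.current_p = initial_p
        self.final_p = final_p
        self.decay = (initial_p - final_p) / float(n_steps)

    def anneal(self):
        if self.current_p > self.final_p:
            self.current_p = max(self.final_p, self.current_p - self.decay)
        return self.current_p

For example, epsilon_schedule = LinearSchedule(1.0, 0.05, 500) would then be annealed once per episode, as in the loop above.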
Example 7
    def train(self, config: TrainConfig):
        BufferRecord = namedtuple('BufferRecord', ['s', 'a', 's_next', 'r', 'done',
                                                   'old_logp_actor', 'v_target', 'adv'])
        buffer = ReplayMemory(tuple_class=BufferRecord)

        reward_history = []
        reward_averaged = []
        step = 0
        total_rec = 0

        clip = config.ratio_clip_range
        if config.ratio_clip_decay:
            clip_delta = clip / config.n_iterations
        else:
            clip_delta = 0.0

        for n_iteration in range(config.n_iterations):

            # Ideally multiple rollout workers would run in parallel; here they run sequentially.
            for _ in range(config.n_rollout_workers):
                episode_reward, n_rec = self._generate_rollout(buffer)
                # One trajectory is complete.
                reward_history.append(episode_reward)
                reward_averaged.append(np.mean(reward_history[-10:]))
                total_rec += n_rec

            # now let's train the model for some steps.
            for batch in buffer.loop(config.batch_size, epoch=config.train_epoches):
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary], feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.clip_range: clip,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.s_next: batch['s_next'],
                        self.r: batch['r'],
                        self.done: batch['done'],
                        self.old_logp_a: batch['old_logp_actor'],
                        self.v_target: batch['v_target'],
                        self.adv: batch['adv'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })

                self.writer.add_summary(summ_str, step)
                step += 1

            clip = max(0.0, clip - clip_delta)

            if (reward_history and config.log_every_iteration and
                    n_iteration % config.log_every_iteration == 0):
                # Report the performance every `log_every_iteration` iterations
                print("[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions.".format(
                    n_iteration, step, np.max(reward_history), np.mean(reward_history[-10:]),
                    list(map(lambda x: round(x, 2), reward_history[-5:])), clip, total_rec
                ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
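Example 7 feeds old_logp_actor, adv, and a clip_range placeholder into train_ops, which points at the standard PPO clipped surrogate objective. A minimal sketch of how that loss could be built in the same TF 1.x style (logp_a, the log-probability of the taken actions under the current policy, is an assumed tensor, not part of the original listing):

import tensorflow as tf  # TF 1.x / tf.compat.v1 graph-mode style

def clipped_surrogate_loss(logp_a, old_logp_a, adv, clip_range):
    # Probability ratio pi_new(a|s) / pi_old(a|s), recovered from log-probabilities.
    ratio = tf.exp(logp_a - old_logp_a)
    # Unclipped vs. clipped surrogate; PPO maximizes the element-wise minimum.
    surr = ratio * adv
    surr_clipped = tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range) * adv
    # Negated because the optimizer minimizes.
    return -tf.reduce_mean(tf.minimum(surr, surr_clipped))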