Example #1
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, QNetwork and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = ExperienceBuffer(prioritized=Settings.PRIORITIZED_ER)
        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS -
                                                            1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.epsilon_ph = tf.placeholder(tf.float32)
        epsilon_summary = tf.summary.scalar("Settings/Epsilon",
                                            self.epsilon_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, epsilon_summary, steps_summary])

        self.lr_ph = tf.placeholder(tf.float32)
        self.lr_summary = tf.summary.scalar("Settings/Learning rate",
                                            self.lr_ph)

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()
        self.gui_thread.start()

        self.nb_ep = 1
        learning_steps = 0

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by NoisyNets or epsilon-greedy policy
                if not Settings.NOISY and random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    if Settings.DISTRIBUTIONAL:
                        Qdistrib = self.QNetwork.act(s)
                        Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    else:
                        Qvalue = self.QNetwork.act(s)

                    a = np.argmax(Qvalue, axis=0)

                    # Qdistrib is only defined in the distributional setting
                    if plot_distrib and Settings.DISTRIBUTIONAL:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                # Keep the experience in memory until 'N_STEP_RETURN' steps
                # have passed, to build the n-step discounted return
                # r_t + gamma*r_{t+1} + ... + gamma^(n-1)*r_{t+n-1}
                while len(memory) >= Settings.N_STEP_RETURN or (memory and
                                                                memory[-1][4]):
                    s_mem, a_mem, discount_R, si_, done_ = memory.popleft()
                    if not done_ and memory:
                        for i in range(Settings.N_STEP_RETURN - 1):
                            si, ai, ri, si_, done_ = memory[i]
                            discount_R += ri * Settings.DISCOUNT**(i + 1)
                            if done_:
                                break
                    self.buffer.add(
                        (s_mem, a_mem, discount_R, si_, 1 if not done_ else 0))

                if episode_step % Settings.TRAINING_FREQ == 0:
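                    # With prioritized replay, sampling also returns the
                    # importance-sampling weights (computed from the beta
                    # exponent) and the indices used below to update the
                    # priorities from the training loss.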
                    if Settings.PRIORITIZED_ER:
                        batch, idx, weights = self.buffer.sample(self.beta)
                    else:
                        batch = self.buffer.sample(self.beta)
                        idx = weights = None
                    loss = self.QNetwork.train(np.asarray(batch), weights)
                    self.buffer.update(idx, loss)
                    self.QNetwork.update_target()

                    feed_dict = {self.lr_ph: self.QNetwork.learning_rate}
                    summary = self.sess.run(self.lr_summary,
                                            feed_dict=feed_dict)
                    self.writer.add_summary(summary, learning_steps)
                    learning_steps += 1

                s = s_
                episode_step += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f'
                      ', Max steps: %i, Learning rate: %fe-4' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_step, self.QNetwork.learning_rate * 1e4))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.epsilon_ph: self.epsilon,
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                if Settings.DISTRIBUTIONAL:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                else:
                    Qvalue = self.QNetwork.act(s)
                a = np.argmax(Qvalue, axis=0)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif: self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
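
The greedy action in the distributional branch above is taken on the expected Q-value, recovered as a probability-weighted sum over the fixed support atoms z. Below is a minimal NumPy sketch of that step; the constants and the fake network output are hypothetical stand-ins for the Settings module and for QNetwork.act(s).

# Minimal sketch of distributional action selection, as used in the training
# loop above. MIN_Q, MAX_Q, NB_ATOMS and the network output are hypothetical.
import numpy as np

MIN_Q, MAX_Q, NB_ATOMS = -10.0, 10.0, 51
z = np.linspace(MIN_Q, MAX_Q, NB_ATOMS)        # fixed support of the distribution

# Stand-in for QNetwork.act(s): one categorical distribution per action,
# shape (num_actions, NB_ATOMS), each row summing to 1.
num_actions = 4
logits = np.random.rand(num_actions, NB_ATOMS)
Qdistrib = logits / logits.sum(axis=1, keepdims=True)

# Expected Q-value of each action = probability-weighted sum over the atoms.
Qvalue = np.sum(z * Qdistrib, axis=1)          # shape (num_actions,)
a = np.argmax(Qvalue)                          # greedy action
print(a, Qvalue)
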
Example #2
class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
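            # The overall noise magnitude shrinks geometrically with the
            # episode number (assuming NOISE_DECAY < 1) and is scaled to the
            # action range, so exploration fades as training progresses.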
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        self.env.set_render(Settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    a = self.network.act(s)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def stop(self):
        self.env.close()
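
The "temporally-correlated exploration noise" referenced in the loop above is the Ornstein-Uhlenbeck-style noise used with DDPG: the noise state drifts back towards EXPLO_MU at rate EXPLO_THETA while being perturbed by Gaussian noise of scale EXPLO_SIGMA. A standalone sketch of the standard discretised update follows, with illustrative constants in place of the Settings module (note the loop above assigns the new value rather than accumulating it, a simplified variant).

# Standalone sketch of a discretised Ornstein-Uhlenbeck noise process, the
# kind of temporally correlated exploration noise referenced above.
# Constants are illustrative; the real values come from Settings.
import numpy as np

ACTION_SIZE = 2
EXPLO_MU = np.zeros(ACTION_SIZE)   # long-run mean of the process
EXPLO_THETA = 0.15                 # mean-reversion rate
EXPLO_SIGMA = 0.2                  # scale of the Gaussian perturbation

noise_process = np.zeros(ACTION_SIZE)
samples = []
for step in range(1000):
    # Mean-reverting update: drift back towards EXPLO_MU, plus white noise.
    # Successive samples are correlated, giving smoother exploration than
    # independent Gaussian noise.
    noise_process = noise_process + \
        EXPLO_THETA * (EXPLO_MU - noise_process) + \
        EXPLO_SIGMA * np.random.randn(ACTION_SIZE)
    samples.append(noise_process.copy())

print("noise standard deviation over the run:", np.std(samples, axis=0))
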
Example #3
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """

    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, QNetwork and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.QNetwork = QNetwork(self.sess)
        self.buffer = ExperienceBuffer()
        self.epsilon = Settings.EPSILON_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward", self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.epsilon_ph = tf.placeholder(tf.float32)
        epsilon_summary = tf.summary.scalar("Settings/Epsilon", self.epsilon_ph)

        self.ep_summary = tf.summary.merge([ep_reward_summary,
                                            epsilon_summary,
                                            steps_summary])

        self.lr_ph = tf.placeholder(tf.float32)
        self.lr_summary = tf.summary.scalar("Settings/Learning rate", self.lr_ph)

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()

        self.nb_ep = 1
        learning_steps = 0

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by epsilon-greedy policy
                if random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if episode_step % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.QNetwork.train(np.asarray(batch))
                    self.QNetwork.update_target()

                    feed_dict = {self.lr_ph: self.QNetwork.learning_rate}
                    summary = self.sess.run(self.lr_summary, feed_dict=feed_dict)
                    self.writer.add_summary(summary, learning_steps)
                    learning_steps += 1

                s = s_
                episode_step += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)
            
            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f, Max steps: %i, LR: %fe-4' % (
                    self.nb_ep, episode_reward, episode_step, self.epsilon, max_step, self.QNetwork.learning_rate * 1e4))

            # Write the summary
            feed_dict = {self.ep_reward_ph: episode_reward,
                         self.epsilon_ph: self.epsilon,
                         self.steps_ph: episode_step}
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        self.env.set_render(Settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def stop(self):
        self.env.close()
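
Exploration in this example is plain epsilon-greedy with a linear decay applied once per episode: epsilon starts at EPSILON_START and is reduced by EPSILON_DECAY until it reaches EPSILON_STOP. A small sketch of that schedule, with illustrative values in place of the Settings module:

# Sketch of the linear epsilon-decay schedule applied at the end of each
# episode above. Values are illustrative, not taken from Settings.
EPSILON_START = 1.0
EPSILON_STOP = 0.01
EPSILON_DECAY = 0.001   # subtracted once per finished episode

epsilon = EPSILON_START
episodes_to_floor = 0
while epsilon > EPSILON_STOP:
    epsilon -= EPSILON_DECAY
    episodes_to_floor += 1

# Roughly (EPSILON_START - EPSILON_STOP) / EPSILON_DECAY episodes of
# decreasing exploration, after which the agent keeps exploring with
# probability about EPSILON_STOP.
print(episodes_to_floor, "episodes before epsilon reaches its floor")
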
Example #4
class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.noise_ph = tf.placeholder(tf.float32)
        noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, noise_summary, steps_summary])

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()
        self.gui_thread.start()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.noise_ph: noise_scale[0],
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                a = self.network.act(s)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif: self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
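
The create_summaries method above follows the TensorFlow 1.x summary pattern: one placeholder per logged quantity, a tf.summary.scalar op, a merged summary evaluated with a feed_dict once per episode, and a FileWriter that appends events under ./logs for TensorBoard. A minimal self-contained sketch of that pattern, assuming a TensorFlow 1.x installation (the scalar and log directory here are made up):

# Minimal sketch of the TF 1.x summary pattern used by create_summaries above.
# Requires TensorFlow 1.x (tf.placeholder / tf.summary.FileWriter).
import tensorflow as tf

reward_ph = tf.placeholder(tf.float32, name="episode_reward")
reward_summary = tf.summary.scalar("Episode/Episode reward", reward_ph)
merged = tf.summary.merge([reward_summary])

with tf.Session() as sess:
    writer = tf.summary.FileWriter("./logs_demo", sess.graph)
    for episode, reward in enumerate([1.0, 3.5, 2.0]):
        # Evaluate the summary op with the scalar fed in and append it to the
        # event file; TensorBoard then plots it against the episode index.
        summary = sess.run(merged, feed_dict={reward_ph: reward})
        writer.add_summary(summary, episode)
    writer.close()
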
Example #5
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, QNetwork and PrioritizedReplayBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = PrioritizedReplayBuffer(Settings.BUFFER_SIZE,
                                              Settings.ALPHA)
        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS -
                                                            1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add(s, a, r, s_, 1 if not done else 0)

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by epsilon-greedy policy
                if random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r))

                # Keep the experience in memory until 'N_STEP_RETURN' steps
                # have passed, to build the discounted multi-step return
                # r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
                if len(memory) > Settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_R = memory.popleft()
                    for i, (si, ai, ri) in enumerate(memory):
                        discount_R += ri * Settings.DISCOUNT**(i + 1)
                    self.buffer.add(s_mem, a_mem, discount_R, s_,
                                    1 if not done else 0)

                if episode_step % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample(Settings.BATCH_SIZE, self.beta)
                    loss = self.QNetwork.train(batch)
                    self.buffer.update_priorities(batch[6], loss)
                    self.QNetwork.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.QNetwork.decrease_lr()

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f'
                      ', Max steps: %i' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_step))

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")
            self.env.close()

    def stop(self):
        self.env.close()
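
The 'memory' deque in the training loop of this example delays each transition until enough subsequent rewards are known, then folds them into a single discounted multi-step return before the transition is pushed to the replay buffer. A simplified, self-contained sketch of that accumulation (episode termination handling omitted), with illustrative constants in place of the Settings module:

# Simplified sketch of the multi-step return accumulation performed with the
# 'memory' deque above (episode termination handling is omitted here).
from collections import deque

N_STEP_RETURN = 3       # illustrative value, normally Settings.N_STEP_RETURN
DISCOUNT = 0.99         # illustrative value, normally Settings.DISCOUNT

memory = deque()
rewards = [1.0, 0.0, 2.0, 0.5, 1.5]   # toy rewards r_t, r_{t+1}, ...

for t, r in enumerate(rewards):
    memory.append(("s%d" % t, "a%d" % t, r))
    if len(memory) > N_STEP_RETURN:
        # Pop the oldest transition and fold in the discounted rewards of
        # every transition that followed it:
        # R = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
        s_mem, a_mem, discount_R = memory.popleft()
        for i, (si, ai, ri) in enumerate(memory):
            discount_R += ri * DISCOUNT ** (i + 1)
        print(s_mem, a_mem, round(discount_R, 3))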