Example #1
# Assumed imports: numpy is used below; Environment, Network, ExperienceBuffer
# and Settings are project-local classes from the surrounding repository.
import numpy as np


class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build new instances of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the TensorFlow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent has performed, the longer each episode may last
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        self.env.set_render(Settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    a = self.network.act(s)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt:
            pass

        except Exception as e:
            print("Exception:", e)

        finally:
            print("End of the demo")

    def stop(self):
        self.env.close()
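
For reference, here is a minimal sketch of how this Agent might be driven from a main script, assuming a TensorFlow 1.x session and that GUI, Displayer and Saver are the project-local helpers expected by the constructor; their constructor signatures below are assumptions, not taken from the source.

# Hypothetical driver script; GUI(), Displayer() and Saver(sess) are assumed
# constructors for the project-local helpers, shown only for illustration.
import tensorflow as tf

with tf.Session() as sess:
    gui = GUI()
    displayer = Displayer()
    saver = Saver(sess)

    agent = Agent(sess, gui, displayer, saver)
    # Initialize the network weights if the surrounding project does not
    # already do so elsewhere.
    sess.run(tf.global_variables_initializer())
    agent.run()    # pre-training followed by the main training loop
    agent.play(3)  # evaluate the learned policy without exploration noise
    agent.stop()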
Example #2
# Assumed imports: this variant additionally uses threading, signal and the
# TensorFlow 1.x summary API; Environment, Network, ExperienceBuffer and
# Settings remain project-local classes from the surrounding repository.
import signal
import threading

import numpy as np
import tensorflow as tf


class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build new instances of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the TensorFlow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.noise_ph = tf.placeholder(tf.float32)
        noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, noise_summary, steps_summary])

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()
        self.gui_thread.start()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent has performed, the longer each episode may last
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.noise_ph: noise_scale[0],
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                a = self.network.act(s)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif:
                self.env.save_gif()
            print("Episode reward:", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
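
Both examples build their exploration noise with the same temporally-correlated update, noise_process = EXPLO_THETA * (EXPLO_MU - noise_process) + EXPLO_SIGMA * randn, scaled by a factor that decays with the episode number (the additional HIGH_BOUND - LOW_BOUND factor is omitted here). Below is a self-contained sketch of that rule, with illustrative stand-ins for the Settings values.

# Standalone sketch of the exploration-noise update used in Agent.run().
# All constants below are assumed values, not the project's actual Settings.
import numpy as np

EXPLO_THETA, EXPLO_MU, EXPLO_SIGMA = 0.15, 0.0, 0.2
NOISE_SCALE_INIT, NOISE_DECAY = 0.1, 0.99
ACTION_SIZE = 2

noise_process = np.zeros(ACTION_SIZE)
for episode in range(1, 4):
    noise_scale = NOISE_SCALE_INIT * NOISE_DECAY ** episode
    for step in range(5):
        # Each step the noise is pulled back toward EXPLO_MU and perturbed by
        # Gaussian samples, giving temporally-correlated exploration.
        noise_process = EXPLO_THETA * (EXPLO_MU - noise_process) + \
            EXPLO_SIGMA * np.random.randn(ACTION_SIZE)
        print(episode, step, noise_scale * noise_process)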