Esempio n. 1
0
    def save(self, episode_step):
        """Checkpoint the model and log the current training statistics.

        The model is saved through ``SAVER`` under the label ``episode_step``.
        A set of scalar summaries (mean reward, mean value, losses, gradient
        norm) is then written to ``self.summary_writer`` at step
        ``self.nb_ep`` and flushed right away.
        """
        # Save model
        SAVER.save(episode_step)

        # Save summary statistics
        summary = tf.Summary()
        scalars = (
            ('Perf/Reward', np.mean(self.rewards_plus)),
            ('Perf/Value', np.mean(self.next_values)),
            ('Losses/Value', self.value_loss),
            ('Losses/Policy', self.policy_loss),
            ('Losses/Entropy', self.entropy),
            ('Losses/Grad Norm', self.grad_norm),
        )
        for tag, value in scalars:
            summary.value.add(tag=tag, simple_value=value)

        writer = self.summary_writer
        writer.add_summary(summary, self.nb_ep)
        writer.flush()
Esempio n. 2
0
    def run(self):
        """Train the agent for ``parameters.TRAINING_STEPS`` episodes.

        Each episode interacts with ``self.env`` using an epsilon-greedy
        policy.  Transitions are turned into n-step returns and pushed into
        ``self.buffer``; every ``parameters.TRAINING_FREQ`` steps a minibatch
        is sampled, the critic and actor are trained on it, the sampled
        priorities are refreshed from the TD errors and the slow target
        networks are updated.  Episode rewards are reported to ``DISPLAYER``
        and the model is checkpointed through ``SAVER``.
        """
        self.nb_ep = 1
        self.total_steps = 0

        for self.nb_ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False
            # Window of the most recent transitions, used to build n-step
            # returns
            memory = deque()

            # Initial state
            s = self.env.reset()
            # The step budget grows with the episode number
            max_steps = parameters.MAX_EPISODE_STEPS + \
                self.nb_ep // parameters.EP_ELONGATION

            while episode_step < max_steps and not done:

                if random.random() < self.epsilon:
                    a = self.env.random()
                else:
                    # choose action based on deterministic policy
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: [s]})

                # Decay epsilon
                if self.epsilon > parameters.EPSILON_STOP:
                    self.epsilon -= parameters.EPSILON_DECAY

                s_, r, done, _ = self.env.act(a)
                # 1.0 while the episode continues, 0.0 on the terminal step;
                # this is the value fed to `is_not_terminal_ph` at training
                # time, so it must be stored in this form (not as raw `done`)
                not_terminal = 0.0 if done else 1.0
                memory.append((s, a, r, s_, not_terminal))

                if len(memory) > parameters.N_STEP_RETURN:
                    s_mem, a_mem, r_mem, _, _ = memory.popleft()
                    # The n-step return starts with the reward of the
                    # transition being stored, then adds the discounted
                    # rewards of the transitions still in the window
                    discount_R = r_mem
                    for i, (_, _, ri, _, _) in enumerate(memory, start=1):
                        discount_R += ri * parameters.DISCOUNT**i
                    self.buffer.add(s_mem, a_mem, discount_R, s_,
                                    not_terminal)

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample(parameters.BATCH_SIZE,
                                                   self.beta)

                    # Anneal beta towards BETA_STOP
                    if self.beta <= parameters.BETA_STOP:
                        self.beta += parameters.BETA_INCR

                    td_errors, _, _ = self.sess.run(
                        [
                            self.network.td_errors,
                            self.network.critic_train_op,
                            self.network.actor_train_op
                        ],
                        feed_dict={
                            self.network.state_ph: minibatch[0],
                            self.network.action_ph: minibatch[1],
                            self.network.reward_ph: minibatch[2],
                            self.network.next_state_ph: minibatch[3],
                            self.network.is_not_terminal_ph: minibatch[4]
                        })

                    # minibatch[6] is passed back as the identifiers of the
                    # sampled entries; the small constant keeps every
                    # priority strictly positive
                    self.buffer.update_priorities(minibatch[6],
                                                  td_errors + 1e-6)
                    # update target networks
                    self.sess.run(self.network.update_slow_targets_op)

                episode_reward += r
                s = s_
                episode_step += 1
                self.total_steps += 1

            # NOTE(review): the `for` statement rebinds self.nb_ep on the next
            # iteration, so this increment only shifts the episode number used
            # by the reporting/saving code below by one. Kept as-is because
            # checkpoint names may depend on it - confirm and remove.
            self.nb_ep += 1

            if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Epsilon : %7.3f, Max steps : %i'
                    % (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_steps))

            DISPLAYER.add_reward(episode_reward)

            # Only start tracking the best run after the first 100 episodes
            if episode_reward > self.best_run and self.nb_ep > 100:
                self.best_run = episode_reward
                print("Best agent ! ", episode_reward)
                SAVER.save('best')

            if self.nb_ep % parameters.SAVE_FREQ == 0:
                SAVER.save(self.nb_ep)
Esempio n. 3
0
    def run(self):
        """Train until ``settings.TRAINING_EPS`` episodes ran or the GUI stops.

        The target network is initialised first, then each episode acts with
        Gaussian noise added to the predicted action, stores n-step
        transitions in ``BUFFER`` and trains the network every
        ``settings.TRAINING_FREQ`` steps.  After every episode the learning
        rates are decreased, the reward is sent to ``DISPLAYER`` and the
        GUI-controlled printing and saving are performed.
        """
        self.total_steps = 1
        self.sess.run(self.network.target_init)
        self.z = self.sess.run(self.network.z)
        self.delta_z = self.network.delta_z

        ep = 1
        while ep < settings.TRAINING_EPS + 1 and not GUI.STOP:

            state = self.env.reset()
            total_reward = 0
            step_count = 0
            done = False
            # Most recent transitions, used to build the n-step returns
            window = deque()

            # Exploration noise amplitude for this episode
            noise_scale = settings.NOISE_SCALE * settings.NOISE_DECAY**ep

            # Per-episode display options read from the GUI
            self.env.set_render(GUI.render.get(ep))
            self.env.set_gif(GUI.gif.get(ep))
            plot_distrib = GUI.plot_distrib.get(ep)

            # One extra step is allowed every 50 episodes
            step_limit = settings.MAX_EPISODE_STEPS + (ep // 50)

            while step_count < step_limit and not done:

                scaled_noise = noise_scale * np.random.normal(
                    size=self.action_size)
                noisy_action = self.predict_action(state,
                                                   plot_distrib) + scaled_noise
                action = np.clip(noisy_action, *self.bounds)

                next_state, reward, done, _ = self.env.act(action)
                total_reward += reward

                not_terminal = 0 if done else 1
                window.append(
                    (state, action, reward, next_state, not_terminal))

                if len(window) >= settings.N_STEP_RETURN:
                    first_state, first_action, n_step_return, _, _ = \
                        window.popleft()
                    # Add the discounted rewards of the later transitions
                    for k, later in enumerate(window, start=1):
                        n_step_return += later[2] * settings.DISCOUNT**k
                    BUFFER.add(first_state, first_action, n_step_return,
                               next_state, not_terminal)

                if len(BUFFER) > 0 and \
                        self.total_steps % settings.TRAINING_FREQ == 0:
                    batch = BUFFER.sample()
                    self.network.train(batch, self.critic_lr, self.actor_lr)

                state = next_state
                step_count += 1
                self.total_steps += 1

            # Learning rates decrease by a fixed amount after each episode
            self.critic_lr -= self.delta_critic_lr
            self.actor_lr -= self.delta_actor_lr

            # Plot reward
            DISPLAYER.add_reward(total_reward, GUI.plot.get(ep))

            # Print episode reward
            if GUI.ep_reward.get(ep):
                report = (ep, total_reward, step_count, noise_scale,
                          self.critic_lr, self.actor_lr)
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f, Critic LR: %f, Actor LR: %f'
                    % report)

            # Save the model
            if GUI.save.get(ep):
                SAVER.save(ep)

            ep += 1
Esempio n. 4
0
    def run(self):
        """Train the Q-network until ``self.nb_ep`` reaches TRAINING_STEPS.

        After ``self.pre_train()``, each episode acts epsilon-greedily,
        stores n-step transitions in ``self.buffer`` and, every
        ``parameters.TRAINING_FREQ`` episode steps, performs one double-Q
        update on a sampled batch: the main network selects the next action,
        the target network evaluates it.  Priorities of the sampled
        transitions are refreshed from the squared TD errors and the target
        network is updated.  The learning rate decays linearly with the
        episode number; epsilon decays once per episode.
        """
        print("Beginning of the run...")

        self.pre_train()

        self.total_steps = 0
        self.nb_ep = 1

        # A stored return covers N_STEP_RETURN + 1 rewards (the popped one
        # plus the N_STEP_RETURN left in the window), so the value bootstrapped
        # from the stored next state must be discounted by that many steps.
        bootstrap_discount = parameters.DISCOUNT ** \
            (parameters.N_STEP_RETURN + 1)

        while self.nb_ep < parameters.TRAINING_STEPS:

            # Linear decay of the learning rate with the episode number
            self.learning_rate = self.initial_learning_rate * \
                (parameters.TRAINING_STEPS - self.nb_ep) / \
                parameters.TRAINING_STEPS

            s = self.env.reset()
            episode_reward = 0
            done = False

            # Window of the most recent transitions, used for n-step returns
            memory = deque()

            episode_step = 0
            max_step = parameters.MAX_EPISODE_STEPS + \
                self.nb_ep // parameters.EP_ELONGATION

            # Render parameters
            self.env.set_render(self.nb_ep % parameters.RENDER_FREQ == 0)

            while episode_step < max_step and not done:

                # Epsilon-greedy action selection
                if random.random() < self.epsilon:
                    a = random.randint(0, self.action_size - 1)
                else:
                    a = self.sess.run(self.mainQNetwork.predict,
                                      feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]

                s_, r, done, _ = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                if len(memory) > parameters.N_STEP_RETURN:
                    s_mem, a_mem, r_mem, _, _ = memory.popleft()
                    # n-step return: own reward plus the discounted rewards
                    # of the transitions still in the window
                    discount_R = r_mem
                    for i, (_, _, ri, _, _) in enumerate(memory, start=1):
                        discount_R += ri * parameters.DISCOUNT ** i
                    self.buffer.add(s_mem, a_mem, discount_R, s_, done)

                if episode_step % parameters.TRAINING_FREQ == 0:

                    train_batch = self.buffer.sample(parameters.BATCH_SIZE,
                                                     self.beta)
                    # Incr beta
                    if self.beta <= parameters.BETA_STOP:
                        self.beta += parameters.BETA_INCR

                    # Q-value the main network currently gives to the action
                    # that was actually taken in each sampled transition
                    feed_dict = {self.mainQNetwork.inputs: train_batch[0]}
                    allQvalues = self.sess.run(self.mainQNetwork.Qvalues,
                                               feed_dict=feed_dict)
                    oldQvalues = np.array(
                        [q[act] for q, act in zip(allQvalues, train_batch[1])])

                    # Double Q-learning: the main network picks the next
                    # action, the target network evaluates it
                    feed_dict = {self.mainQNetwork.inputs: train_batch[3]}
                    mainQaction = self.sess.run(self.mainQNetwork.predict,
                                                feed_dict=feed_dict)

                    feed_dict = {self.targetQNetwork.inputs: train_batch[3]}
                    targetQvalues = self.sess.run(self.targetQNetwork.Qvalues,
                                                  feed_dict=feed_dict)

                    # Done multiplier :
                    # equals 0 if the episode was done
                    # equals 1 else
                    done_multiplier = (1 - train_batch[4])
                    doubleQ = targetQvalues[range(parameters.BATCH_SIZE),
                                            mainQaction]
                    targetQvalues = train_batch[2] + \
                        bootstrap_discount * doubleQ * done_multiplier

                    # Squared TD error as priority; the small constant keeps
                    # every priority strictly positive
                    errors = np.square(targetQvalues - oldQvalues) + 1e-6
                    self.buffer.update_priorities(train_batch[6], errors)

                    feed_dict = {self.mainQNetwork.inputs: train_batch[0],
                                 self.mainQNetwork.Qtarget: targetQvalues,
                                 self.mainQNetwork.actions: train_batch[1],
                                 self.mainQNetwork.learning_rate: self.learning_rate}
                    self.sess.run(self.mainQNetwork.train,
                                  feed_dict=feed_dict)

                    update_target(self.update_target_ops, self.sess)

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= parameters.EPSILON_DECAY

            DISPLAYER.add_reward(episode_reward)

            if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %.3f'
                      ', Max steps: %i, Learning rate: %g' % (
                          self.nb_ep, episode_reward, episode_step,
                          self.epsilon, max_step, self.learning_rate))

            # Save the model
            if self.nb_ep % parameters.SAVE_FREQ == 0:
                SAVER.save(self.nb_ep)

            self.nb_ep += 1
Esempio n. 5
0
from Displayer import DISPLAYER
from Saver import SAVER

import parameters

if __name__ == '__main__':
    # Script entry point: build the agent inside a fresh TensorFlow session,
    # load any saved state through SAVER, train until completion or Ctrl+C,
    # then save and display the results.

    tf.reset_default_graph()

    agent = None
    try:
        with tf.Session() as sess:

            agent = Agent(sess)
            SAVER.set_sess(sess)

            SAVER.load(agent)

            print("Beginning of the run")
            try:
                agent.run()
            except KeyboardInterrupt:
                # Ctrl+C is the intended way to stop training early; fall
                # through so the model is still saved below
                pass
            print("End of the run")
            SAVER.save(agent.total_steps)
            DISPLAYER.disp()

            # agent.play(10)

            # agent.play(3, "results/gif/".format(parameters.ENV))
    finally:
        # Release the agent's resources even when an unexpected error escapes
        # the block above; as before, this runs after the session has closed
        if agent is not None:
            agent.close()
Esempio n. 6
0
    def run(self):
        """Run the training loop for ``parameters.TRAINING_STEPS`` episodes.

        Every episode acts epsilon-greedily in ``self.env``, stores each
        transition in ``self.buffer`` and, every ``parameters.TRAINING_FREQ``
        total steps once the buffer holds a full batch, trains the critic and
        actor and updates the slow target networks.  Epsilon decays once per
        episode; the best-scoring episode triggers a 'best' checkpoint.
        """
        self.total_steps = 0

        for episode in range(1, parameters.TRAINING_STEPS + 1):

            total_reward = 0
            steps_done = 0
            done = False

            # Initial state
            state = self.env.reset()
            self.env.set_render(episode % 1000 == 0)
            record_gif = (episode % 1500 == 0)
            # One extra step is allowed every 1000 episodes
            extra_steps = episode // 1000
            step_limit = parameters.MAX_EPISODE_STEPS + extra_steps

            while steps_done < step_limit and not done:

                explore = random.random() < self.epsilon
                if explore:
                    action = self.env.random()
                else:
                    # choose action based on deterministic policy
                    action, = self.sess.run(
                        self.network.actions,
                        feed_dict={self.network.state_ph: [state]})

                next_state, reward, done, _ = self.env.act(action, record_gif)
                total_reward += reward

                self.buffer.add(
                    (state, action, reward, next_state,
                     0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                time_to_train = \
                    self.total_steps % parameters.TRAINING_FREQ == 0
                if time_to_train and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    batch = self.buffer.sample()

                    def column(k):
                        # k-th field of every sampled transition, as an array
                        return np.asarray([elem[k] for elem in batch])

                    feed = {
                        self.network.state_ph: column(0),
                        self.network.action_ph: column(1),
                        self.network.reward_ph: column(2),
                        self.network.next_state_ph: column(3),
                        self.network.is_not_terminal_ph: column(4),
                    }
                    train_ops = [self.network.critic_train_op,
                                 self.network.actor_train_op]
                    self.sess.run(train_ops, feed_dict=feed)

                    # update target networks
                    self.sess.run(self.network.update_slow_targets_op)

                state = next_state
                steps_done += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= self.epsilon_decay

            if record_gif:
                self.env.save_gif('results/gif/', self.n_gif)
                # The gif index cycles through 0..4
                self.n_gif = (self.n_gif + 1) % 5

            if total_reward > self.best_run:
                self.best_run = total_reward
                print("Save best", total_reward)
                SAVER.save('best')

            DISPLAYER.add_reward(total_reward)
            if episode % 50 == 0:
                report = (episode, total_reward, steps_done, self.epsilon,
                          step_limit)
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f'
                      ' (max step: %i)' % report)
            if episode % 500 == 0:
                DISPLAYER.disp()
Esempio n. 7
0
if __name__ == '__main__':
    # Script entry point: build the agent inside a fresh TensorFlow session,
    # optionally start the GUI in a background thread, train until completion
    # or Ctrl+C, then save and display the results.

    tf.reset_default_graph()

    agent = None
    try:
        with tf.Session() as sess:

            agent = Agent(sess)
            SAVER.set_sess(sess)

            SAVER.load(agent)

            # The GUI thread is tracked explicitly so the join below does not
            # depend on settings.GUI still having the same value later
            gui = None
            if settings.GUI:
                gui = threading.Thread(target=GUI.main)
                gui.start()

            try:
                agent.run()
            except KeyboardInterrupt:
                # Ctrl+C is the intended way to stop training early; fall
                # through so the model is still saved below
                pass
            print("End of the run")
            SAVER.save('last')
            DISPLAYER.disp()

            if gui is not None:
                gui.join()
            else:
                agent.play(5)
    finally:
        # Release the agent's resources even when an unexpected error escapes
        # the block above; as before, this runs after the session has closed
        if agent is not None:
            agent.close()
Esempio n. 8
0
    # Create one thread per worker index; each thread runs `work(i)`
    for i in range(settings.NB_THREADS):
        train_threads.append(threading.Thread(target=work, args=(i, )))

    # Intercept CTRL+C signal
    signal.signal(signal.SIGINT, signal_handler)

    # Set start time
    global_start_time = time.time()

    # Start the workers' thread
    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')
    # Put the main thread to sleep until a signal is received
    # (signal.pause is only available on Unix)
    signal.pause()

    # Wait for every worker thread to finish
    # NOTE(review): presumably signal_handler tells the workers to stop -
    # confirm against its definition
    for t in train_threads:
        t.join()

    # Add the duration of this run to the accumulated wall time
    wall_time += time.time() - global_start_time

    print("Wall time of simulation : %f\nGlobal CPU time : %f\n"
          "Total number of episodes : %i\nTotal number of steps : %i" %
          (wall_time, global_total_time, total_eps, total_steps))

    # Persist the run statistics, show the plots and close the summary writer
    print('Now saving data. Please wait')
    SAVER.save(global_total_time, wall_time, total_eps, total_steps)
    DISPLAYER.disp()

    summary_writer.close()
Esempio n. 9
0
from Displayer import DISPLAYER
from Saver import SAVER

import parameters

if __name__ == '__main__':
    # Script entry point: build the agent inside a fresh TensorFlow session,
    # load any saved state through SAVER, train until completion or Ctrl+C,
    # then save, display the results and let the agent play.

    tf.reset_default_graph()

    agent = None
    try:
        with tf.Session() as sess:

            # NOTE(review): this runs before Agent(sess) is constructed, so it
            # can only initialise variables that already exist at this point -
            # confirm that Agent / SAVER.load initialise the network variables
            sess.run(tf.global_variables_initializer())

            agent = Agent(sess)
            SAVER.set_sess(sess)

            SAVER.load(agent)

            try:
                agent.run()
            except KeyboardInterrupt:
                # Ctrl+C is the intended way to stop training early; fall
                # through so the model is still saved below
                pass
            print("End of the run")
            SAVER.save(agent.nb_ep)
            DISPLAYER.disp()

            agent.play(10)
    finally:
        # Stop the agent even when an unexpected error escapes the block
        # above; as before, this runs after the session has closed
        if agent is not None:
            agent.stop()