Example #1
 def play(self):
     environment = Environment(True, 4)
     while not environment.isTerminal():
         state = environment.get_state()
         qval = self.net.predict(state.reshape(1, 4, 84, 84), batch_size=1)
         action = (np.argmax(qval))
         reward = environment.act(action)
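
The method above acts greedily: the network predicts one Q-value per action for a stack of four 84x84 frames, and np.argmax picks the best one. A minimal NumPy-only sketch of that selection step (the array below stands in for the output of net.predict(), which is assumed here):

import numpy as np

# Hypothetical network output: one Q-value per action, shape (1, n_actions).
qval = np.array([[0.1, 2.3, -0.4, 1.7]])

# Greedy policy: take the action with the highest predicted Q-value.
action = int(np.argmax(qval))
print(action)  # -> 1
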
Example #3
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.epsilon = parameters.EPSILON_START
        self.epsilon_decay = (parameters.EPSILON_START -
                              parameters.EPSILON_STOP) \
            / parameters.EPSILON_STEPS

        self.best_run = -1e10
        self.n_gif = 0

        self.sess.run(tf.global_variables_initializer())

    def run(self):

        self.total_steps = 0

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False

            # Initial state
            s = self.env.reset()
            self.env.set_render(ep % 1000 == 0)
            gif = (ep % 1500 == 0)
            step_allonge = ep // 1000

            while episode_step < parameters.MAX_EPISODE_STEPS + step_allonge \
                    and not done:

                if random.random() < self.epsilon:
                    a = self.env.random()
                else:
                    # choose action based on deterministic policy
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: [s]})

                s_, r, done, info = self.env.act(a, gif)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _ = self.sess.run(
                        [self.network.critic_train_op, self.network.actor_train_op],
                        feed_dict={
                            self.network.state_ph: np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph: np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= self.epsilon_decay

            if gif:
                self.env.save_gif('results/gif/', self.n_gif)
                self.n_gif = (self.n_gif + 1) % 5

            if episode_reward > self.best_run:
                self.best_run = episode_reward
                print("Save best", episode_reward)
                SAVER.save('best')

            DISPLAYER.add_reward(episode_reward)
            if ep % 50 == 0:
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f'
                      ' (max step: %i)' % (ep, episode_reward, episode_step,
                                           self.epsilon,
                                           parameters.MAX_EPISODE_STEPS +
                                           step_allonge))
            if ep % 500 == 0:
                DISPLAYER.disp()

    def play(self, number_run, path=''):
        print("Playing for", number_run, "runs")

        self.env.set_render(True)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:

                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: s[None]})

                    s, r, done, info = self.env.act(a, path != '')
                    episode_reward += r

                print("Episode reward :", episode_reward)

                if path != '':
                    self.env.save_gif(path, i)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
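
Example #3 explores with an epsilon-greedy policy whose epsilon decreases linearly from EPSILON_START to EPSILON_STOP over EPSILON_STEPS episodes, one decrement per episode. A stand-alone sketch of that schedule (the numeric values are placeholders, not the real parameters module):

import random

EPSILON_START, EPSILON_STOP, EPSILON_STEPS = 1.0, 0.05, 1000  # assumed values
epsilon = EPSILON_START
epsilon_decay = (EPSILON_START - EPSILON_STOP) / EPSILON_STEPS

def epsilon_greedy(q_values):
    """Random action with probability epsilon, greedy action otherwise."""
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])

for episode in range(EPSILON_STEPS):
    a = epsilon_greedy([0.0, 1.0, 0.5])   # stand-in for one decision step
    if epsilon > EPSILON_STOP:            # decay once per episode, as in Agent.run()
        epsilon -= epsilon_decay
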
Example #4
    global_network = Network(0, device)
    env = Environment(True)

    # prepare session
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))

    sess.run(tf.global_variables_initializer())

    SAVER.set_sess(sess)
    settings.LOAD = True
    global_total_time, wall_time, total_eps, total_steps = SAVER.load()

    for i in range(NB_PLAY):
        done = False
        reward, step = 0, 0

        state = env.reset()
        global_network.reset_state()

        while not done:

            pi, value = global_network.run_policy_and_value(sess, state)
            a = np.random.choice(settings.ACTION_SIZE, p=pi)
            state, r, done, _ = env.act(a)

            reward += r
            step += 1

        print("Episode reward {} (in {} steps)".format(reward, step))
Example #5
class Q:
    def __init__(self):
        self.net = None
        self.env = Environment(False, 4)
        self.mem = Memory(32, 1000000)
        self.epsilon = 0.5
        self.gamma = 0.7
        self.number_of_actions = 4
        try:
            self.load_network()
        except IOError:
            print('No network found')
            self.create_model()

    def create_model(self):
        print('Creating model...')
        model = Sequential()
        model.add(
            Convolution2D(32, 8, 8, subsample=(4, 4), activation='relu', input_shape=(4, 84, 84)))
        model.add(Convolution2D(64, 4, 4, activation='relu', subsample=(2, 2)))
        model.add(Convolution2D(64, 3, 3, activation='relu', subsample=(1, 1)))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.number_of_actions, activation='linear'))
        model.compile(loss='mse', optimizer='rmsprop')
        self.net = model
        print('Done!')

    def save_network(self):
        json_string = self.net.to_json()
        with open('deep_q_network.json', 'w') as json_file:
            json_file.write(json_string)
        self.net.save_weights('network_weights.h5', overwrite=True)

    def load_network(self):
        print('Loading network...')
        with open('deep_q_network.json') as json_file:
            model = model_from_json(json_file.read())
        model.load_weights('network_weights.h5')
        model.compile(loss='mse', optimizer='rmsprop')
        print('Network loaded!')
        self.net = model

    def train(self, epochs):
        for i in range(epochs):
            state = self.env.get_state()
            while not self.env.isTerminal():
                qval = self.net.predict(state.reshape(1, 4, 84, 84), batch_size=1)
                if random.random() < self.epsilon:  # choose random action
                    action = np.random.randint(0, self.number_of_actions)
                else:  # choose best action from Q(s,a) values
                    action = np.argmax(qval)
                # Take action, observe new state S'
                reward = self.env.act(action)
                new_state = self.env.get_state()
                # Experience replay storage
                is_terminal = self.env.isTerminal()

                self.mem.store(state, action, reward, new_state, is_terminal)

                print('Game : {}'.format(i))
                if self.mem.isFull():
                    minibatch = self.mem.sample()
                    self.train_on_minibatch(minibatch)
                state = new_state

            if self.epsilon > 0.1:  # decrement epsilon over time
                self.epsilon -= 1.0 / 100000
            self.env.restart()
            if i % 10 == 0:
                self.save_network()

    def train_on_minibatch(self, minibatch):
        x_train, y_train = [], []
        for sample in minibatch:
            # Get max_Q(S',a)
            old_state, action, reward, new_state, terminal = sample
            old_qval = self.net.predict(old_state.reshape(1, 4, 84, 84), batch_size=1)
            newQ = self.net.predict(new_state.reshape(1, 4, 84, 84), batch_size=1)
            maxQ = np.max(newQ)
            y = np.zeros((1, self.number_of_actions))
            y[:] = old_qval[:]
            if not terminal:  # non-terminal state
                update = (reward + (self.gamma * maxQ))
            else:  # terminal state
                update = reward
            y[0][action] = update
            x_train.append(old_state.reshape(4, 84, 84))
            y_train.append(y.reshape(self.number_of_actions, ))

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        self.net.fit(x_train, y_train, batch_size=self.mem.batch_size, nb_epoch=1)

    def play(self):
        environment = Environment(True, 4)
        while not environment.isTerminal():
            state = environment.get_state()
            qval = self.net.predict(state.reshape(1, 4, 84, 84), batch_size=1)
            action = (np.argmax(qval))
            reward = environment.act(action)
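
The targets built in train_on_minibatch follow the standard Q-learning rule: copy the current predictions and overwrite only the entry of the action that was taken with r + gamma * max_a' Q(s', a'), or with r alone at a terminal state. A NumPy-only sketch of that target construction with invented numbers:

import numpy as np

gamma = 0.7
old_qval = np.array([[0.5, 1.2, -0.3, 0.8]])   # hypothetical Q(s, .) prediction
new_qval = np.array([[0.1, 0.9, 2.0, 0.4]])    # hypothetical Q(s', .) prediction
action, reward, terminal = 1, 1.0, False

y = old_qval.copy()
y[0, action] = reward if terminal else reward + gamma * np.max(new_qval)
print(y)   # only the entry for the chosen action was updated
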
Example #6
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size() * 2  # 60 velocities and 60 incidences
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.sess.run(tf.global_variables_initializer())
        DISPLAYER.reset()

    def run(self):
        #self.load("NetworkParam_best_ThirdSemester/FinalParam") #get the best parameters to start the training
        self.total_steps = 0

        '''
        WIND CONDITIONS
        '''
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples = wind_samples)
        WH = w.generateWind()

        for ep in range(1, parameters.TRAINING_STEPS+1):

            episode_reward = 0
            episode_step = 0
            nearlyDone = 0
            done = False

            # Initialize exploration noise process
            noise_process = np.zeros(self.action_size)
            noise_scale = (parameters.NOISE_SCALE_INIT *
                           parameters.NOISE_DECAY**ep) * \
                (self.high_bound - self.low_bound)

            # Initial state
            w = wind(mean=mean, std=std, samples = wind_samples)
            WH = w.generateWind()
            hdg0_rand = random.uniform(6, 13)
            hdg0 = hdg0_rand * TORAD * np.ones(10)
            s = self.env.reset(hdg0, WH)
            
            while episode_step < parameters.MAX_EPISODE_STEPS: #and not done:

                WH = np.random.uniform(mean - std, mean + std, size=wind_samples)

                # choose action based on deterministic policy
                s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise_process = parameters.EXPLO_THETA * \
                    (parameters.EXPLO_MU - noise_process) + \
                    parameters.EXPLO_SIGMA * np.random.randn(self.action_size)
                a += noise_scale * noise_process
                # to respect the bounds:
                a = np.clip(a, self.low_bound, self.high_bound)

                s_, v = self.env.act(a, WH)
                
                # reward assignation algorithm
                if episode_step == 1:
                    r = 0
                # elif s[int(self.state_size/2-2)] > (13*TORAD) and s[int(self.state_size/2-2)] < (15*TORAD) and v > 0.63 and v < 0.67 and a < 0:
                #     r = 0.1
                else:
                    if v <= 0.69:
                        r = 0
                        nearlyDone = 0
                    elif v > 0.69 and v <= 0.75:
                        r = 0.00001
                        nearlyDone = 0
                    elif v > 0.75 and v <= 0.8:
                        r = 0.01
                        nearlyDone = 0
                    elif v > 0.80:
                        r = 0.1
                        if nearlyDone >= 3:
                            r = 1
                            done = True
                        elif nearlyDone == 2:
                            r = 0.8
                        elif nearlyDone == 1:
                            r = 0.25
                        nearlyDone = nearlyDone + 1
                    else:
                        r = 0
                        nearlyDone = 0

                episode_reward += r

                self.buffer.add(
                    (s, np.reshape(a, [1, 1]), r,
                     np.reshape(s_, [self.state_size, 1]),
                     0.0 if episode_step < parameters.MAX_EPISODE_STEPS - 1 else 1.0))  # 0.0 if done else 1.0

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _, critic_loss = self.sess.run(
                        [self.network.critic_train_op,
                         self.network.actor_train_op,
                         self.network.critic_loss],
                        feed_dict={
                            self.network.state_ph: np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph: np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1
            if ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, initial heading: %7.3f, Reward: %7.3f, '
                      'Final noise scale: %7.3f, critic loss: %7.3f' %
                      (ep, hdg0[0] * (1 / TORAD), episode_reward, noise_scale,
                       critic_loss))
            DISPLAYER.add_reward(episode_reward)
            # We save CNN weights every 500 epochs
            if ep % 500 == 0 and ep != 0:
                self.save("NetworkParam/" + str(ep) + "_epochs")
        self.save("NetworkParam/" + "FinalParam")


    def playActor(self):
        self.load("NetworkParam/FinalParam")

        hdg0_rand_vec = [0, 7, 12]
        '''
        WIND CONDITIONS
        '''
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples = wind_samples)

        try:
            for i in range(len(hdg0_rand_vec)):
                # Initial state
                WH = w.generateWind()
                hdg0_rand = hdg0_rand_vec[i]
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0,WH)
                episode_reward = 0
                episode_step = 0
                v_episode = []
                i_episode = []
                while episode_step < 40:  # not done:
                    if episode_step == 0:
                        i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                    else:
                        i_episode.append(s[0][-1] / TORAD)
                    s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])

                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: s[None]})
                    a = np.clip(a, self.low_bound, self.high_bound)
                    s_, r = self.env.act(a, WH)
                    episode_reward += r
                    v_episode.append(r)
                    episode_step += 1
                    s = s_
                DISPLAYER.displayVI(v_episode, i_episode, i)
                print("Episode reward :", episode_reward, " for incidence: ", hdg0_rand)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def playCritic(self):
        self.load("NetworkParam/FinalParam")

        hdg0_rand_vec = [0, 7, 12]
        '''
        WIND CONDITIONS
        '''
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples = wind_samples)

        try:
            for i in range(len(hdg0_rand_vec)):
                # Initial state
                WH = w.generateWind()
                hdg0_rand = hdg0_rand_vec[i]
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0,WH)
                
                episode_reward = 0
                episode_step = 0
                v_episode = []
                i_episode = []
                while episode_step < 30:  # not done:
                    if episode_step == 0:
                        i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                    else:
                        i_episode.append(s[0][-1] / TORAD)
                    
                    # Critic policy: evaluate Q(s, a) on the discrete grid of
                    # candidate actions and keep the best one
                    candidate_actions = np.linspace(-1.5, 1.5, 13)
                    critic = [self.evaluate(s, act) for act in candidate_actions]
                    a = candidate_actions[np.argmax(critic)]

                    s_, r = self.env.act(a, WH)
                    episode_reward += r
                    v_episode.append(r)
                    episode_step += 1
                    s = s_
                DISPLAYER.displayVI(v_episode, i_episode, i + 3)
                print("Episode reward :", episode_reward, " for incidence: ", hdg0_rand)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def save(self, name):
        """
        Save the weights of both of the networks into a .ckpt tensorflow session file
        :param name: Name of the file where the weights are saved
        """
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, name+".ckpt")
        print("Model saved in path: %s" % save_path)

    def load(self, name):
        """
        Load the weights of the 2 networks saved in the file into :ivar network
        :param name: name of the file containing the weights to load
        """
        saver = tf.train.Saver()
        saver.restore(self.sess, name+".ckpt")

    def evaluate(self, state, action):
        """
        Evaluate the Q-value of a state-action pair using the critic neural network.

        :param np.array state: state that we want to evaluate.
        :param float action: action that we want to evaluate (has to be within the permitted bounds)
        :return: The Q-value of the given state-action pair.
        """
        s = np.reshape([state[0, :], state[1, :]], (1, self.state_size, 1))
        a = np.reshape(action, (1, self.action_size, 1))
        q = self.sess.run(
            self.network.q_values_of_given_actions,
            feed_dict={
                self.network.state_ph: s,
                self.network.action_ph: a})
        return q
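
The exploration noise in Example #6 follows the Ornstein-Uhlenbeck-style update written in run(): noise = EXPLO_THETA * (EXPLO_MU - noise) + EXPLO_SIGMA * N(0, 1), scaled by a factor that decays with the episode index. A minimal sketch of that process in isolation (theta, mu and sigma are made-up values, not the ones from parameters):

import numpy as np

EXPLO_THETA, EXPLO_MU, EXPLO_SIGMA = 0.15, 0.0, 0.2   # assumed constants
action_size = 1

noise_process = np.zeros(action_size)
samples = []
for _ in range(100):
    # Same update as in Agent.run(): pull toward the mean, add Gaussian jitter.
    noise_process = EXPLO_THETA * (EXPLO_MU - noise_process) + \
        EXPLO_SIGMA * np.random.randn(action_size)
    samples.append(noise_process[0])
print(np.mean(samples), np.std(samples))
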
Example #7
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment and QNetwork.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to save periodically the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = ExperienceBuffer(prioritized=Settings.PRIORITIZED_ER)
        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS -
                                                            1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.epsilon_ph = tf.placeholder(tf.float32)
        epsilon_summary = tf.summary.scalar("Settings/Epsilon",
                                            self.epsilon_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, epsilon_summary, steps_summary])

        self.lr_ph = tf.placeholder(tf.float32)
        self.lr_summary = tf.summary.scalar("Settings/Learning rate",
                                            self.lr_ph)

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()
        self.gui_thread.start()

        self.nb_ep = 1
        learning_steps = 0

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by NoisyNets or epsilon-greedy policy
                if not Settings.NOISY and random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    if Settings.DISTRIBUTIONAL:
                        Qdistrib = self.QNetwork.act(s)
                        Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    else:
                        Qvalue = self.QNetwork.act(s)

                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                # Keep the experience in memory until 'N_STEP_RETURN' steps have
                # passed, to get the delayed return r_1 + ... + gamma^n r_n
                while len(memory) >= Settings.N_STEP_RETURN or (memory and
                                                                memory[-1][4]):
                    s_mem, a_mem, discount_R, si_, done_ = memory.popleft()
                    if not done_ and memory:
                        for i in range(Settings.N_STEP_RETURN - 1):
                            si, ai, ri, si_, done_ = memory[i]
                            discount_R += ri * Settings.DISCOUNT**(i + 1)
                            if done_:
                                break
                    self.buffer.add(
                        (s_mem, a_mem, discount_R, si_, 1 if not done_ else 0))

                if episode_step % Settings.TRAINING_FREQ == 0:
                    if Settings.PRIORITIZED_ER:
                        batch, idx, weights = self.buffer.sample(self.beta)
                    else:
                        batch = self.buffer.sample(self.beta)
                        idx = weights = None
                    loss = self.QNetwork.train(np.asarray(batch), weights)
                    self.buffer.update(idx, loss)
                    self.QNetwork.update_target()

                    feed_dict = {self.lr_ph: self.QNetwork.learning_rate}
                    summary = self.sess.run(self.lr_summary,
                                            feed_dict=feed_dict)
                    self.writer.add_summary(summary, learning_steps)
                    learning_steps += 1

                s = s_
                episode_step += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f'
                      ', Max steps: %i, Learning rate: %fe-4' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_step, self.QNetwork.learning_rate * 1e4))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.epsilon_ph: self.epsilon,
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                if Settings.DISTRIBUTIONAL:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                else:
                    Qvalue = self.QNetwork.act(s)
                a = np.argmax(Qvalue, axis=0)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif: self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
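
When Settings.DISTRIBUTIONAL is on, Example #7 collapses each per-action return distribution into a scalar by taking its expectation over the fixed support z before applying argmax. A small stand-alone version of that reduction (two actions, five atoms, arbitrary numbers):

import numpy as np

MIN_Q, MAX_Q, NB_ATOMS = -10.0, 10.0, 5   # assumed settings
z = np.linspace(MIN_Q, MAX_Q, NB_ATOMS)   # support of the return distribution

# Hypothetical per-action probabilities over the support, shape (n_actions, NB_ATOMS).
Qdistrib = np.array([[0.1, 0.2, 0.4, 0.2, 0.1],
                     [0.0, 0.1, 0.2, 0.3, 0.4]])

Qvalue = np.sum(z * Qdistrib, axis=1)     # expected return of each action
a = np.argmax(Qvalue, axis=0)
print(Qvalue, a)
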
Example #8
class Agent:
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.bounds = self.env.get_bounds()

        print("Creation of the actor-critic network")
        self.network = Network(self.sess, self.state_size, self.action_size,
                               self.bounds)

        self.critic_lr = settings.CRITIC_LEARNING_RATE
        self.actor_lr = settings.ACTOR_LEARNING_RATE

        self.delta_critic_lr = self.critic_lr / settings.TRAINING_EPS
        self.delta_actor_lr = self.actor_lr / settings.TRAINING_EPS

        self.sess.run(tf.global_variables_initializer())

    def predict_action(self, s, plot_distrib=False):
        if plot_distrib:
            action, distrib, value = self.sess.run(
                [self.network.actions,
                 self.network.Q_distrib_suggested_actions,
                 self.network.Q_values_suggested_actions],
                feed_dict={self.network.state_ph: s[None]})
            action, distrib, value = action[0], distrib[0], value[0]
            fig = plt.figure(2)
            fig.clf()
            plt.bar(self.z, distrib, self.delta_z)
            plt.axvline(value, color='red', linewidth=0.7)
            plt.show(block=False)
            plt.pause(0.001)
            return action

        return self.sess.run(self.network.actions,
                             feed_dict={self.network.state_ph: s[None]})[0]

    def run(self):

        self.total_steps = 1
        self.sess.run(self.network.target_init)
        self.z = self.sess.run(self.network.z)
        self.delta_z = self.network.delta_z

        ep = 1
        while ep < settings.TRAINING_EPS + 1 and not GUI.STOP:

            s = self.env.reset()
            episode_reward = 0
            episode_step = 0
            done = False
            memory = deque()

            # Initialize exploration noise process
            noise_scale = settings.NOISE_SCALE * settings.NOISE_DECAY**ep

            # Initial state
            self.env.set_render(GUI.render.get(ep))
            self.env.set_gif(GUI.gif.get(ep))
            plot_distrib = GUI.plot_distrib.get(ep)

            max_eps = settings.MAX_EPISODE_STEPS + (ep // 50)

            while episode_step < max_eps and not done:

                noise = np.random.normal(size=self.action_size)
                scaled_noise = noise_scale * noise

                a = np.clip(
                    self.predict_action(s, plot_distrib) + scaled_noise,
                    *self.bounds)

                s_, r, done, info = self.env.act(a)

                episode_reward += r

                memory.append((s, a, r, s_, 0 if done else 1))

                if len(memory) >= settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_r, ss_mem, done_mem = memory.popleft()
                    for i, (si, ai, ri, s_i, di) in enumerate(memory):
                        discount_r += ri * settings.DISCOUNT**(i + 1)
                    BUFFER.add(s_mem, a_mem, discount_r, s_, 0 if done else 1)

                if len(BUFFER) > 0 and \
                        self.total_steps % settings.TRAINING_FREQ == 0:
                    self.network.train(BUFFER.sample(), self.critic_lr,
                                       self.actor_lr)

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.critic_lr -= self.delta_critic_lr
            self.actor_lr -= self.delta_actor_lr

            # Plot reward
            plot = GUI.plot.get(ep)
            DISPLAYER.add_reward(episode_reward, plot)

            # Print episode reward
            if GUI.ep_reward.get(ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f, Critic LR: %f, Actor LR: %f'
                    % (ep, episode_reward, episode_step, noise_scale,
                       self.critic_lr, self.actor_lr))

            # Save the model
            if GUI.save.get(ep):
                SAVER.save(ep)

            ep += 1

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        self.env.set_render(settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:

                    a = self.predict_action(s)

                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
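
Exploration in Example #8 adds Gaussian noise, scaled by a factor that decays geometrically with the episode index, to the deterministic action and then clips the result to the environment bounds. A minimal sketch of that step with placeholder constants and bounds:

import numpy as np

NOISE_SCALE, NOISE_DECAY = 0.1, 0.99   # assumed settings
bounds = (-1.0, 1.0)                   # assumed (low_bound, high_bound)
action_size = 2

def noisy_action(deterministic_action, episode):
    # The noise amplitude shrinks as training progresses.
    noise_scale = NOISE_SCALE * NOISE_DECAY**episode
    noise = np.random.normal(size=action_size)
    # Clip so the perturbed action stays inside the allowed range.
    return np.clip(deterministic_action + noise_scale * noise, *bounds)

print(noisy_action(np.array([0.95, -0.2]), episode=10))
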
Example #9
class Agent:
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()
        self.action_size = self.env.get_action_size()

        print("Creation of the main QNetwork...")
        self.mainQNetwork = QNetwork(self.state_size, self.action_size, 'main')
        print("Main QNetwork created !\n")

        print("Creation of the target QNetwork...")
        self.targetQNetwork = QNetwork(self.state_size, self.action_size,
                                       'target')
        print("Target QNetwork created !\n")

        self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE,
                                              parameters.ALPHA)

        self.epsilon = parameters.EPSILON_START
        self.beta = parameters.BETA_START

        trainables = tf.trainable_variables()
        self.update_target_ops = updateTargetGraph(trainables)

        self.nb_ep = 1

    def pre_train(self):
        print("Beginning of the pre-training...")

        for i in range(parameters.PRE_TRAIN_STEPS):

            s = self.env.reset()
            done = False
            episode_step = 0
            episode_reward = 0

            while episode_step < parameters.MAX_EPISODE_STEPS and not done:

                a = random.randint(0, self.action_size - 1)
                s_, r, done, info = self.env.act(a)
                self.buffer.add(s, a, r, s_, done)

                s = s_
                episode_reward += r
                episode_step += 1

            if i % 100 == 0:
                print("Pre-train step n", i)

        print("End of the pre training !")

    def run(self):
        print("Beginning of the run...")

        self.pre_train()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < parameters.TRAINING_STEPS:

            s = self.env.reset()
            episode_reward = 0
            done = False

            memory = deque()
            discount_R = 0

            episode_step = 0

            # Render parameters
            self.env.set_render(self.nb_ep % parameters.RENDER_FREQ == 0)

            while episode_step < parameters.MAX_EPISODE_STEPS and not done:

                if random.random() < self.epsilon:
                    a = random.randint(0, self.action_size - 1)
                else:
                    a = self.sess.run(
                        self.mainQNetwork.predict,
                        feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                if len(memory) > parameters.N_STEP_RETURN:
                    s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft()
                    discount_R = r_mem
                    for i, (si, ai, ri, s_i, di) in enumerate(memory):
                        discount_R += ri * parameters.DISCOUNT**(i + 1)
                    self.buffer.add(s_mem, a_mem, discount_R, s_, done)

                if episode_step % parameters.TRAINING_FREQ == 0:

                    train_batch = self.buffer.sample(parameters.BATCH_SIZE,
                                                     self.beta)
                    # Incr beta
                    if self.beta <= parameters.BETA_STOP:
                        self.beta += parameters.BETA_INCR

                    feed_dict = {self.mainQNetwork.inputs: train_batch[3]}
                    mainQaction = self.sess.run(self.mainQNetwork.predict,
                                                feed_dict=feed_dict)

                    feed_dict = {self.targetQNetwork.inputs: train_batch[3]}
                    targetQvalues = self.sess.run(self.targetQNetwork.Qvalues,
                                                  feed_dict=feed_dict)

                    # Done multiplier :
                    # equals 0 if the episode was done
                    # equals 1 else
                    done_multiplier = (1 - train_batch[4])
                    doubleQ = targetQvalues[range(parameters.BATCH_SIZE),
                                            mainQaction]
                    targetQvalues = train_batch[2] + \
                        parameters.DISCOUNT * doubleQ * done_multiplier

                    feed_dict = {
                        self.mainQNetwork.inputs: train_batch[0],
                        self.mainQNetwork.Qtarget: targetQvalues,
                        self.mainQNetwork.actions: train_batch[1]
                    }
                    td_error, _ = self.sess.run(
                        [self.mainQNetwork.td_error, self.mainQNetwork.train],
                        feed_dict=feed_dict)

                    self.buffer.update_priorities(train_batch[6],
                                                  td_error + 1e-6)

                    update_target(self.update_target_ops, self.sess)

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= parameters.EPSILON_DECAY

            DISPLAYER.add_reward(episode_reward)

            self.total_steps += 1

            if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon))
            self.nb_ep += 1

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:
                    a = self.sess.run(
                        self.mainQNetwork.predict,
                        feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def stop(self):
        self.env.close()
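
The training step in Example #9 builds a Double-DQN target: the main network selects the next action, the target network evaluates it, and a done multiplier of 0 cuts the bootstrap term on terminal transitions. A NumPy-only sketch of that target for a batch of two transitions (all numbers invented):

import numpy as np

DISCOUNT = 0.99
rewards = np.array([1.0, -0.5])
dones = np.array([0, 1])                    # 1 if the episode ended on that step
mainQ_next = np.array([[0.2, 1.5, 0.3],     # hypothetical Q_main(s', .)
                       [0.7, 0.1, 0.4]])
targetQ_next = np.array([[0.1, 1.2, 0.6],   # hypothetical Q_target(s', .)
                         [0.9, 0.2, 0.3]])

main_action = np.argmax(mainQ_next, axis=1)                # action chosen by the main network
doubleQ = targetQ_next[range(len(rewards)), main_action]   # evaluated by the target network
targets = rewards + DISCOUNT * doubleQ * (1 - dones)
print(targets)
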
Example #10
class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, QNetwork and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to save periodically the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.noise_ph = tf.placeholder(tf.float32)
        noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, noise_summary, steps_summary])

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()
        self.gui_thread.start()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.noise_ph: noise_scale[0],
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                a = self.network.act(s)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif: self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
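
Examples #7 and #10 both start with a pre_train phase that fills the replay buffer by acting randomly before any gradient step is taken. A stand-alone buffer that mimics that pattern (the real Environment is replaced by a dummy step function):

import random
from collections import deque

buffer = deque(maxlen=10000)   # oldest experiences are dropped once the buffer is full

def dummy_env_step(action):
    """Stand-in for Environment.act(): returns (next_state, reward, done)."""
    return random.random(), random.random(), random.random() < 0.1

s = random.random()
for _ in range(1000):
    a = random.randint(0, 3)                      # random policy, as in pre_train()
    s_, r, done = dummy_env_step(a)
    buffer.append((s, a, r, s_, 1 if not done else 0))
    s = random.random() if done else s_           # "reset" on episode end

batch = random.sample(buffer, 32)                 # minibatch later used for training
print(len(buffer), len(batch))
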
Example #11
class Agent:
    def __init__(self, worker_index, sess, render=False, master=False):
        print("Initialization of the agent", str(worker_index))

        self.worker_index = worker_index
        if master:
            self.name = 'global'
        else:
            self.name = 'Worker_' + str(worker_index)

        self.env = Environment()
        self.env.set_render(render)
        self.state_size = self.env.get_state_size()
        self.action_size = self.env.get_action_size()

        self.network = Network(self.state_size, self.action_size, self.name)
        self.update_local_vars = update_target_graph('global', self.name)

        self.starting_time = 0
        self.epsilon = parameters.EPSILON_START

        if self.name != 'global':
            self.summary_writer = tf.summary.FileWriter(
                "results/" + self.name, sess.graph)

    def save(self, episode_step):
        # Save model
        SAVER.save(episode_step)

        # Save summary statistics
        summary = tf.Summary()
        summary.value.add(tag='Perf/Reward',
                          simple_value=np.mean(self.rewards_plus))
        summary.value.add(tag='Perf/Value',
                          simple_value=np.mean(self.next_values))
        summary.value.add(tag='Losses/Value', simple_value=self.value_loss)
        summary.value.add(tag='Losses/Policy', simple_value=self.policy_loss)
        summary.value.add(tag='Losses/Entropy', simple_value=self.entropy)
        summary.value.add(tag='Losses/Grad Norm', simple_value=self.grad_norm)
        self.summary_writer.add_summary(summary, self.nb_ep)
        self.summary_writer.flush()

    def train(self, sess, bootstrap_value):

        # Add the bootstrap value to our experience
        self.rewards_plus = np.asarray(self.rewards_buffer + [bootstrap_value])
        discounted_reward = discount(self.rewards_plus,
                                     parameters.DISCOUNT)[:-1]

        self.next_values = np.asarray(self.values_buffer[1:] +
                                      [bootstrap_value])
        advantages = self.rewards_buffer + \
            parameters.DISCOUNT * self.next_values - \
            self.values_buffer
        advantages = discount(
            advantages, parameters.GENERALIZED_LAMBDA * parameters.DISCOUNT)

        # Update the global network
        feed_dict = {
            self.network.discounted_reward: discounted_reward,
            self.network.inputs: self.states_buffer,
            self.network.actions: self.actions_buffer,
            self.network.advantages: advantages,
            self.network.state_in: self.initial_lstm_state
        }
        losses = sess.run([
            self.network.value_loss, self.network.policy_loss,
            self.network.entropy, self.network.grad_norm,
            self.network.state_out, self.network.apply_grads
        ],
                          feed_dict=feed_dict)

        # Get the losses for tensorboard
        self.value_loss, self.policy_loss, self.entropy = losses[:3]
        self.grad_norm, self.lstm_state, _ = losses[3:]

        # Reinitialize buffers and variables
        self.states_buffer = []
        self.actions_buffer = []
        self.rewards_buffer = []
        self.values_buffer = []
        self.lstm_buffer = []

    def work(self, sess, coord):
        print("Running", self.name, end='\n\n')
        self.starting_time = time()
        self.nb_ep = 1

        with sess.as_default(), sess.graph.as_default():

            with coord.stop_on_exception():
                while not coord.should_stop():

                    self.states_buffer = []
                    self.actions_buffer = []
                    self.rewards_buffer = []
                    self.values_buffer = []
                    self.mean_values_buffer = []
                    self.lstm_buffer = []

                    self.total_steps = 0
                    episode_reward = 0
                    episode_step = 0

                    # Reset the local network to the global
                    sess.run(self.update_local_vars)

                    s = self.env.reset()
                    done = False
                    render = (self.nb_ep % parameters.RENDER_FREQ == 0)
                    if render and parameters.DISPLAY:
                        self.env.set_render(True)

                    self.lstm_state = self.network.lstm_state_init
                    self.initial_lstm_state = self.lstm_state

                    while not coord.should_stop() and not done and \
                            episode_step < parameters.MAX_EPISODE_STEP:

                        self.lstm_buffer.append(self.lstm_state)

                        # Prediction of the policy and the value
                        feed_dict = {
                            self.network.inputs: [s],
                            self.network.state_in: self.lstm_state
                        }
                        policy, value, self.lstm_state = sess.run(
                            [
                                self.network.policy, self.network.value,
                                self.network.state_out
                            ],
                            feed_dict=feed_dict)

                        policy, value = policy[0], value[0][0]

                        if random.random() < self.epsilon:
                            action = random.randint(0, self.action_size - 1)

                        else:
                            # Choose an action according to the policy
                            action = np.random.choice(self.action_size,
                                                      p=policy)

                        s_, r, done, _ = self.env.act(action)

                        # Store the experience
                        self.states_buffer.append(s)
                        self.actions_buffer.append(action)
                        self.rewards_buffer.append(r)
                        self.values_buffer.append(value)
                        self.mean_values_buffer.append(value)
                        episode_reward += r
                        s = s_

                        episode_step += 1
                        self.total_steps += 1

                        # Once the buffer holds MAX_LEN_BUFFER experiences and
                        # the episode is not over, train the global network on
                        # this partial rollout and reset the episode buffers
                        if len(self.states_buffer) == parameters.MAX_LEN_BUFFER \
                                and not done:

                            feed_dict = {
                                self.network.inputs: [s],
                                self.network.state_in: self.lstm_state
                            }
                            bootstrap_value = sess.run(self.network.value,
                                                       feed_dict=feed_dict)

                            self.train(sess, bootstrap_value)
                            sess.run(self.update_local_vars)
                            self.initial_lstm_state = self.lstm_state

                    if len(self.states_buffer) != 0:
                        if done:
                            bootstrap_value = 0
                        else:
                            feed_dict = {
                                self.network.inputs: [s],
                                self.network.state_in: self.lstm_state
                            }
                            bootstrap_value = sess.run(self.network.value,
                                                       feed_dict=feed_dict)
                        self.train(sess, bootstrap_value)

                    if self.epsilon > parameters.EPSILON_STOP:
                        self.epsilon -= parameters.EPSILON_DECAY

                    self.nb_ep += 1

                    if not coord.should_stop():
                        DISPLAYER.add_reward(episode_reward, self.worker_index)

                    if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                        print('Agent: %i, Episode %2i, Reward: %i, Steps: %i, '
                              'Epsilon: %7.3f' %
                              (self.worker_index, self.nb_ep, episode_reward,
                               episode_step, self.epsilon))

                    if (self.worker_index == 1
                            and self.nb_ep % parameters.SAVE_FREQ == 0):
                        self.save(self.total_steps)

                    if time() - self.starting_time > parameters.LIMIT_RUN_TIME:
                        coord.request_stop()

                    self.env.set_render(False)

            self.summary_writer.close()
            self.env.close()

    def play(self, sess, number_run, path=''):
        print("Playing", self.name, "for", number_run, "runs")

        with sess.as_default(), sess.graph.as_default():

            try:
                for i in range(number_run):

                    # Reset the local network to the global
                    if self.name != 'global':
                        sess.run(self.update_local_vars)

                    s = self.env.reset()
                    episode_reward = 0

                    done = False
                    self.lstm_state = self.network.lstm_state_init

                    while not done:
                        # Prediction of the policy
                        feed_dict = {
                            self.network.inputs: [s],
                            self.network.state_in: self.lstm_state
                        }
                        policy, self.lstm_state = sess.run(
                            [self.network.policy, self.network.state_out],
                            feed_dict=feed_dict)

                        policy = policy[0]

                        # Choose an action according to the policy
                        action = np.random.choice(self.action_size, p=policy)
                        s, r, done, info = self.env.act(action, path != '')
                        episode_reward += r

                    print("Episode reward :", episode_reward)

                    if path != '':
                        self.env.save_gif(path, i)

            except KeyboardInterrupt as e:
                pass

            finally:
                print("End of the demo")
                self.env.close()

    def close(self):
        self.env.close()
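
The worker above trains on truncated rollouts: when the episode is cut after
MAX_LEN_BUFFER steps, the critic's estimate of the last state is passed to
train() as a bootstrap value, and 0 is used when the episode really ended.
A minimal numpy sketch of the resulting discounted returns (the names and the
gamma value are illustrative, not taken from the code above):

import numpy as np

def bootstrapped_returns(rewards, bootstrap_value, gamma=0.99):
    # R_t = r_t + gamma * R_{t+1}, seeded with the bootstrap value of the
    # state that follows the last reward of the (possibly truncated) rollout
    returns = np.zeros(len(rewards))
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: rollout truncated after 3 steps, critic estimates V(s_3) = 1.0
print(bootstrapped_returns([0.0, 0.0, 1.0], bootstrap_value=1.0))
# -> approximately [1.95, 1.97, 1.99]
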
Exemple #12
0
class Agent:
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.sess.run(tf.global_variables_initializer())
        DISPLAYER.reset()

    def run(self):

        self.total_steps = 0

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False

            # Initialize exploration noise process
            noise_process = np.zeros(self.action_size)
            noise_scale = (parameters.NOISE_SCALE_INIT *
                           parameters.NOISE_DECAY**ep) * \
                (self.high_bound - self.low_bound)

            # Initial state
            s = self.env.reset()
            render = (ep % parameters.RENDER_FREQ == 0 and parameters.DISPLAY)
            self.env.set_render(render)
            while episode_step < parameters.MAX_EPISODE_STEPS:  # "and not done" disabled here

                # choose action based on deterministic policy
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise_process = parameters.EXPLO_THETA * \
                    (parameters.EXPLO_MU - noise_process) + \
                    parameters.EXPLO_SIGMA * np.random.randn(self.action_size)
                #print("a before noise: ", a)
                a += noise_scale * noise_process
                #print("a after noise: ",a)
                a = np.clip(a, self.low_bound, self.high_bound)
                #print("a after clip is: ",a)
                s_, r, done, info = self.env.act(a)
                if done:
                    print("done at step: ", episode_step)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _ = self.sess.run(
                        [
                            self.network.critic_train_op,
                            self.network.actor_train_op
                        ],
                        feed_dict={
                            self.network.state_ph:
                            np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph:
                            np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph:
                            np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph:
                            np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph:
                            np.asarray([elem[4] for elem in minibatch])
                        })

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1
            if ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print(
                    'Episode %i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (ep, episode_reward, episode_step, noise_scale))
            DISPLAYER.add_reward(episode_reward)
            # Save the network weights every 1000 episodes
            if ep % 1000 == 0 and ep != 0:
                self.save("NetworkParam/" + str(ep) + "_epochs")

    def play(self, number_run):
        self.load("NetworkParam/FinalParam")
        print("Playing for", number_run, "runs")

        self.env.set_render(True)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                counter = 0
                while counter < 100:  # fixed 100-step demo instead of "not done"

                    a, = self.sess.run(
                        self.network.actions,
                        feed_dict={self.network.state_ph: s[None]})
                    #print("The action taken is: ",a)
                    s, r, done, info = self.env.act(a)
                    episode_reward += r
                    counter += 1
                    time.sleep(0.07)

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()

    def save(self, name):
        """
        Save the weights of both of the networks into a .ckpt tensorflow session file
        :param name: Name of the file where the weights are saved
        """
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, name + ".ckpt")
        print("Model saved in path: %s" % save_path)

    def load(self, name):
        """
        Load the weights of the 2 networks saved in the file into :ivar network
        :param name: name of the file containing the weights to load
        """
        saver = tf.train.Saver()
        saver.restore(self.sess, name + ".ckpt")
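
The exploration noise added in run() above follows an Ornstein-Uhlenbeck
process, which yields temporally correlated perturbations that drift back
toward EXPLO_MU. A self-contained sketch with illustrative parameter values
(theta, mu and sigma here are assumptions, not the repository's settings):

import numpy as np

class OUNoise:
    # x <- x + theta * (mu - x) + sigma * N(0, 1), applied element-wise
    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_size)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        self.state = self.state + self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        return self.state

# Usage: perturb a deterministic action, then clip it to the action bounds
noise = OUNoise(action_size=2)
a = np.array([0.3, -0.1]) + 0.1 * noise.sample()
a = np.clip(a, -1.0, 1.0)
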
Exemple #13
0
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """

    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, QNetwork and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.QNetwork = QNetwork(self.sess)
        self.buffer = ExperienceBuffer()
        self.epsilon = Settings.EPSILON_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward", self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.epsilon_ph = tf.placeholder(tf.float32)
        epsilon_summary = tf.summary.scalar("Settings/Epsilon", self.epsilon_ph)

        self.ep_summary = tf.summary.merge([ep_reward_summary,
                                            epsilon_summary,
                                            steps_summary])

        self.lr_ph = tf.placeholder(tf.float32)
        self.lr_summary = tf.summary.scalar("Settings/Learning rate", self.lr_ph)

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()

        self.nb_ep = 1
        learning_steps = 0

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by epsilon-greedy policy
                if random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if episode_step % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.QNetwork.train(np.asarray(batch))
                    self.QNetwork.update_target()

                    feed_dict = {self.lr_ph: self.QNetwork.learning_rate}
                    summary = self.sess.run(self.lr_summary, feed_dict=feed_dict)
                    self.writer.add_summary(summary, learning_steps)
                    learning_steps += 1

                s = s_
                episode_step += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)
            
            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f, Max steps: %i, LR: %fe-4' % (
                    self.nb_ep, episode_reward, episode_step, self.epsilon, max_step, self.QNetwork.learning_rate))

            # Write the summary
            feed_dict = {self.ep_reward_ph: episode_reward,
                         self.epsilon_ph: self.epsilon,
                         self.steps_ph: episode_step}
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        self.env.set_render(Settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def stop(self):
        self.env.close()
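
The QNetwork above is distributional: for each action it predicts a
probability distribution over NB_ATOMS fixed support points z spread between
MIN_Q and MAX_Q, and the greedy action maximizes the expected value
sum(z * p). A small numpy sketch of that readout (shapes and values are
illustrative):

import numpy as np

NB_ATOMS, MIN_Q, MAX_Q = 51, -10.0, 10.0
z = np.linspace(MIN_Q, MAX_Q, NB_ATOMS)      # support of the return distribution
delta_z = (MAX_Q - MIN_Q) / (NB_ATOMS - 1)   # spacing between atoms

def greedy_action(q_distrib):
    # q_distrib: shape (action_size, NB_ATOMS), each row sums to 1
    q_values = np.sum(z * q_distrib, axis=1)  # expected return per action
    return np.argmax(q_values), q_values

# Two actions: a uniform distribution vs. all the mass on the highest atom
distrib = np.stack([np.full(NB_ATOMS, 1.0 / NB_ATOMS),
                    np.eye(NB_ATOMS)[-1]])
action, q = greedy_action(distrib)            # action == 1, q ~ [0.0, 10.0]
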
Exemple #14
0
class Q:
    def __init__(self):
        self.net = None
        self.env = Environment(False, 4)
        self.mem = Memory(32, 1000000)
        self.epsilon = 0.5
        self.gamma = 0.7
        self.number_of_actions = 4
        try:
            self.load_network()
        except IOError:
            print('No network found')
            self.create_model()

    def create_model(self):
        print('Creating model...')
        model = Sequential()
        model.add(
            Convolution2D(32,
                          8,
                          8,
                          subsample=(4, 4),
                          activation='relu',
                          input_shape=(4, 84, 84)))
        model.add(Convolution2D(64, 4, 4, activation='relu', subsample=(2, 2)))
        model.add(Convolution2D(64, 3, 3, activation='relu', subsample=(1, 1)))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.number_of_actions, activation='linear'))
        model.compile(loss='mse', optimizer='rmsprop')
        self.net = model
        print('Done!')

    def save_network(self):
        json_string = self.net.to_json()
        with open('deep_q_network.json', 'w') as f:
            f.write(json_string)
        self.net.save_weights('network_weights.h5', overwrite=True)

    def load_network(self):
        print('Loading network...')
        model = model_from_json(open('deep_q_network.json').read())
        model.load_weights('network_weights.h5')
        model.compile(loss='mse', optimizer='rmsprop')
        print('Network loaded!')
        self.net = model

    def train(self, epochs):
        for i in range(epochs):
            state = self.env.get_state()
            while not self.env.isTerminal():
                qval = self.net.predict(state.reshape(1, 4, 84, 84),
                                        batch_size=1)
                if random.random() < self.epsilon:  # choose random action
                    action = np.random.randint(0, self.number_of_actions)
                else:  # choose best action from Q(s,a) values
                    action = np.argmax(qval)
                # Take action, observe new state S'
                reward = self.env.act(action)
                new_state = self.env.get_state()
                # Experience replay storage
                is_terminal = self.env.isTerminal()

                self.mem.store(state, action, reward, new_state, is_terminal)

                print('Game : {}'.format(i))
                if self.mem.isFull():
                    minibatch = self.mem.sample()
                    self.train_on_minibatch(minibatch)
                state = new_state

            if self.epsilon > 0.1:  # decrement epsilon over time
                self.epsilon -= 1.0 / 100000  # float division so epsilon actually decays
            self.env.restart()
            if i % 10 == 0:
                self.save_network()

    def train_on_minibatch(self, minibatch):
        x_train, y_train = [], []
        for sample in minibatch:
            # Get max_Q(S',a)
            old_state, action, reward, new_state, terminal = sample
            old_qval = self.net.predict(old_state.reshape(1, 4, 84, 84),
                                        batch_size=1)
            newQ = self.net.predict(new_state.reshape(1, 4, 84, 84),
                                    batch_size=1)
            maxQ = np.max(newQ)
            y = np.zeros((1, self.number_of_actions))
            y[:] = old_qval[:]
            if not terminal:  # non-terminal state
                update = (reward + (self.gamma * maxQ))
            else:  # terminal state
                update = reward
            y[0][action] = update
            x_train.append(old_state.reshape(4, 84, 84))
            y_train.append(y.reshape(self.number_of_actions, ))

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        self.net.fit(x_train,
                     y_train,
                     batch_size=self.mem.batch_size,
                     nb_epoch=1)

    def play(self):
        environment = Environment(True, 4)
        while not environment.isTerminal():
            state = environment.get_state()
            qval = self.net.predict(state.reshape(1, 4, 84, 84), batch_size=1)
            action = (np.argmax(qval))
            reward = environment.act(action)
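
train_on_minibatch above builds its targets one sample at a time. A vectorized
numpy sketch of the same Q-learning target, y = r + gamma * max_a' Q(s', a')
for non-terminal transitions and y = r otherwise, independent of any network
API (array names are illustrative):

import numpy as np

def q_targets(q_old, q_next, actions, rewards, terminal, gamma=0.7):
    # q_old, q_next: (batch, n_actions) predictions for s and s'.
    # Only the entry of the action actually taken is replaced by the target.
    targets = q_old.copy()
    max_next = np.max(q_next, axis=1)
    updates = np.where(terminal, rewards, rewards + gamma * max_next)
    targets[np.arange(len(actions)), actions] = updates
    return targets

# Tiny example: one terminal and one non-terminal transition
q_old = np.zeros((2, 4))
q_next = np.array([[0.0, 1.0, 0.0, 0.0], [2.0, 0.0, 0.0, 0.0]])
y = q_targets(q_old, q_next, actions=np.array([1, 3]),
              rewards=np.array([1.0, 0.5]), terminal=np.array([True, False]))
# y[0, 1] == 1.0 and y[1, 3] == 0.5 + 0.7 * 2.0 == 1.9
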
Exemple #15
0
class Agent:
    """
    This class builds an agent that interacts with an environment to gather
    experiences and put them into a buffer.
    """

    def __init__(self, sess, n_agent, gui, displayer, buffer):
        print("Initializing agent %i..." % n_agent)

        self.n_agent = n_agent
        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.buffer = buffer

        self.env = Environment()

        self.build_actor()
        self.build_update()

        self.create_summaries()

        print("Agent initialized !\n")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward", self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.noise_ph = tf.placeholder(tf.float32)
        noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph)

        self.ep_summary = tf.summary.merge([ep_reward_summary,
                                            noise_summary,
                                            steps_summary])

        self.writer = tf.summary.FileWriter(f"./logs/Agent_{self.n_agent}",
                                            self.sess.graph)

    def build_actor(self):
        """
        Build a copy of the learner's actor network to allow the agent to
        interact with the environment on its own.
        """
        scope = 'worker_agent_' + str(self.n_agent)
        self.state_ph = tf.placeholder(dtype=tf.float32,
                                       shape=[None, *Settings.STATE_SIZE],
                                       name='state_ph')

        # Get the policy prediction network
        self.policy = build_actor(self.state_ph, trainable=False, scope=scope)
        self.vars = get_vars(scope, trainable=False)

    def build_update(self):
        """
        Build the operation to copy the weights of the learner's actor network
        into the agent's network.
        """
        with self.sess.as_default(), self.sess.graph.as_default():

            self.network_vars = get_vars('learner_actor', trainable=True)
            self.update = copy_vars(self.network_vars, self.vars,
                                    1, 'update_agent_'+str(self.n_agent))

    def predict_action(self, s):
        """
        Wrapper method to get the action outputted by the actor network.
        """
        return self.sess.run(self.policy, feed_dict={self.state_ph: s[None]})[0]

    def run(self):
        """
        Method to run the agent in the environment to collect experiences.
        """
        print("Beginning of the run agent {}...".format(self.n_agent))

        self.sess.run(self.update)

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            memory = deque()
            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            noise_scale = Settings.NOISE_SCALE * Settings.NOISE_DECAY**(self.nb_ep//20)

            # Render Settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step < max_step and not done and not self.gui.STOP:


                a = np.clip(self.predict_action(s),
                            Settings.LOW_BOUND, Settings.HIGH_BOUND)

                # Add gaussian noise
                noise = np.random.normal(size=Settings.ACTION_SIZE)

                a += noise_scale * noise
                s_, r, done, _ = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r))

                # Keep the experience in memory until 'N_STEP_RETURN' steps have
                # passed, to build the n-step return
                # r_1 + gamma * r_2 + ... + gamma^(n-1) * r_n
                if len(memory) >= Settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_r = memory.popleft()
                    for i, (si, ai, ri) in enumerate(memory):
                        discount_r += ri * Settings.DISCOUNT ** (i + 1)
                    self.buffer.add((s_mem, a_mem, discount_r, s_, 1 if not done else 0))

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Periodically copy the learner's weights into this agent's actor
            if self.nb_ep % Settings.UPDATE_ACTORS_FREQ == 0:
                self.sess.run(self.update)

            if not self.gui.STOP:
                if self.n_agent == 1 and self.gui.ep_reward.get(self.nb_ep):
                    print("Episode %i : reward %i, steps %i, noise scale %f" % (self.nb_ep, episode_reward, episode_step, noise_scale))

                plot = (self.n_agent == 1 and self.gui.plot.get(self.nb_ep))
                self.displayer.add_reward(episode_reward, self.n_agent, plot=plot)

                # Write the summary
                feed_dict = {self.ep_reward_ph: episode_reward,
                             self.noise_ph: noise_scale,
                             self.steps_ph: episode_step}
                summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
                self.writer.add_summary(summary, self.nb_ep)

                self.nb_ep += 1

        self.env.close()
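
The N-step bookkeeping inside run() can be isolated into a small generator; a
sketch of the same deque logic (the function name and default values are
illustrative), which emits a transition only once n rewards are available:

from collections import deque

def n_step_transitions(transitions, n=5, gamma=0.99):
    # transitions: iterable of (s, a, r, s_next, done) tuples in episode order.
    # Yields (s, a, n_step_return, latest_next_state, not_done_flag).
    memory = deque()
    for s, a, r, s_, done in transitions:
        memory.append((s, a, r))
        if len(memory) >= n:
            s_mem, a_mem, ret = memory.popleft()
            for i, (_, _, ri) in enumerate(memory):
                ret += ri * gamma ** (i + 1)
            yield (s_mem, a_mem, ret, s_, 0 if done else 1)
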
Exemple #16
0
class Agent:
    """
    This class builds an agent with its own Network, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, Network and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.network = Network(sess)
        self.buffer = ExperienceBuffer()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !")

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.network.init_target()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Initialize exploration noise process
            noise_process = np.zeros(Settings.ACTION_SIZE)
            noise_scale = (Settings.NOISE_SCALE_INIT *
                           Settings.NOISE_DECAY**self.nb_ep) * \
                (Settings.HIGH_BOUND - Settings.LOW_BOUND)

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step <= max_step and not done:

                # Choose action based on deterministic policy
                a = self.network.act(s)

                # Add temporally-correlated exploration noise to action
                noise_process = Settings.EXPLO_THETA * \
                    (Settings.EXPLO_MU - noise_process) + \
                    Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE)

                a += noise_scale * noise_process
                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                if self.total_steps % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample()
                    self.network.train(np.asarray(batch))
                    self.network.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (self.nb_ep, episode_reward, episode_step, noise_scale))

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        self.env.set_render(Settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    a = self.network.act(s)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")

    def stop(self):
        self.env.close()
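
Several examples end their training step with network.update_target() or
update_slow_targets_op, but the op itself is not shown in this listing. A
common way to build it in the TF1 graph API used here is a soft ("Polyak")
update; a sketch under that assumption, with an illustrative tau:

import tensorflow as tf  # TF1-style graph API, as in the examples above

def build_soft_update(main_vars, target_vars, tau=0.001):
    # theta_target <- tau * theta_main + (1 - tau) * theta_target
    return tf.group(*[tf.assign(t, tau * m + (1.0 - tau) * t)
                      for m, t in zip(main_vars, target_vars)])

# Usage sketch (the variable scopes are assumptions):
# update_op = build_soft_update(tf.trainable_variables('main'),
#                               tf.trainable_variables('target'))
# sess.run(update_op)  # after each training step
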
Exemple #17
0
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()
        self.action_size = self.env.get_action_size()

        print("Creation of the main QNetwork...")
        self.mainQNetwork = QNetwork(self.state_size, self.action_size, 'main')
        print("Main QNetwork created !\n")

        print("Creation of the target QNetwork...")
        self.targetQNetwork = QNetwork(self.state_size, self.action_size,
                                       'target')
        print("Target QNetwork created !\n")

        self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE,
                                              parameters.ALPHA)

        self.epsilon = parameters.EPSILON_START
        self.beta = parameters.BETA_START

        self.initial_learning_rate = parameters.LEARNING_RATE

        trainables = tf.trainable_variables()
        self.update_target_ops = updateTargetGraph(trainables)

        self.nb_ep = 1
        self.best_run = -1e10

    def pre_train(self):
        print("Beginning of the pre-training...")

        for i in range(parameters.PRE_TRAIN_STEPS):

            s = self.env.reset()
            done = False
            episode_step = 0
            episode_reward = 0

            while episode_step < parameters.MAX_EPISODE_STEPS and not done:

                a = random.randint(0, self.action_size - 1)
                s_, r, done, info = self.env.act(a)
                self.buffer.add(s, a, r, s_, done)

                s = s_
                episode_reward += r
                episode_step += 1

            if i % 100 == 0:
                print("\tPre-train step n", i)

            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def run(self):
        print("Beginning of the run...")

        self.pre_train()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < parameters.TRAINING_STEPS:

            self.learning_rate = self.initial_learning_rate * \
                (parameters.TRAINING_STEPS - self.nb_ep) / \
                parameters.TRAINING_STEPS

            s = self.env.reset()
            episode_reward = 0
            done = False

            memory = deque()
            discount_R = 0

            episode_step = 0
            max_step = parameters.MAX_EPISODE_STEPS + \
                self.nb_ep // parameters.EP_ELONGATION

            # Render parameters
            self.env.set_render(self.nb_ep % parameters.RENDER_FREQ == 0)

            while episode_step < max_step and not done:

                if random.random() < self.epsilon:
                    a = random.randint(0, self.action_size - 1)
                else:
                    a = self.sess.run(self.mainQNetwork.predict,
                                      feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                if len(memory) > parameters.N_STEP_RETURN:
                    s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft()
                    discount_R = r_mem
                    for i, (si, ai, ri, s_i, di) in enumerate(memory):
                        discount_R += ri * parameters.DISCOUNT ** (i + 1)
                    self.buffer.add(s_mem, a_mem, discount_R, s_, done)

                if episode_step % parameters.TRAINING_FREQ == 0:

                    train_batch = self.buffer.sample(parameters.BATCH_SIZE,
                                                     self.beta)
                    # Increase beta (importance-sampling exponent) toward BETA_STOP
                    if self.beta <= parameters.BETA_STOP:
                        self.beta += parameters.BETA_INCR

                    feed_dict = {self.mainQNetwork.inputs: train_batch[0]}
                    oldQvalues = self.sess.run(self.mainQNetwork.Qvalues,
                                               feed_dict=feed_dict)
                    tmp = [0] * len(oldQvalues)
                    for i, oldQvalue in enumerate(oldQvalues):
                        tmp[i] = oldQvalue[train_batch[1][i]]
                    oldQvalues = tmp

                    feed_dict = {self.mainQNetwork.inputs: train_batch[3]}
                    mainQaction = self.sess.run(self.mainQNetwork.predict,
                                                feed_dict=feed_dict)

                    feed_dict = {self.targetQNetwork.inputs: train_batch[3]}
                    targetQvalues = self.sess.run(self.targetQNetwork.Qvalues,
                                                  feed_dict=feed_dict)

                    # Done multiplier: 0 if the episode ended on this
                    # transition, 1 otherwise
                    done_multiplier = (1 - train_batch[4])
                    doubleQ = targetQvalues[range(parameters.BATCH_SIZE),
                                            mainQaction]
                    targetQvalues = train_batch[2] + \
                        parameters.DISCOUNT * doubleQ * done_multiplier

                    errors = np.square(targetQvalues - oldQvalues) + 1e-6
                    self.buffer.update_priorities(train_batch[6], errors)

                    feed_dict = {self.mainQNetwork.inputs: train_batch[0],
                                 self.mainQNetwork.Qtarget: targetQvalues,
                                 self.mainQNetwork.actions: train_batch[1],
                                 self.mainQNetwork.learning_rate: self.learning_rate}
                    _ = self.sess.run(self.mainQNetwork.train,
                                      feed_dict=feed_dict)

                    update_target(self.update_target_ops, self.sess)

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= parameters.EPSILON_DECAY

            DISPLAYER.add_reward(episode_reward)
            # if episode_reward > self.best_run and \
            #         self.nb_ep > 50:
            #     self.best_run = episode_reward
            #     print("Save best", episode_reward)
            #     SAVER.save('best')
            #     self.play(1)

            self.total_steps += 1

            if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %.3f'
                      ', Max steps: %i, Learning rate: %g' % (
                          self.nb_ep, episode_reward, episode_step,
                          self.epsilon, max_step, self.learning_rate))

            # Save the model
            if self.nb_ep % parameters.SAVE_FREQ == 0:
                SAVER.save(self.nb_ep)

            self.nb_ep += 1

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        try:
            for i in range(number_run):

                self.env.set_render(True)

                s = self.env.reset()
                episode_reward = 0
                done = False

                episode_step = 0
                max_step = parameters.MAX_EPISODE_STEPS + \
                    self.nb_ep // parameters.EP_ELONGATION

                while episode_step < max_step and not done:
                    a = self.sess.run(self.mainQNetwork.predict,
                                      feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]
                    s, r, done, info = self.env.act(a)

                    episode_reward += r
                    episode_step += 1

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def stop(self):
        self.env.close()
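
The training step above mixes Double DQN (the main network selects argmax a',
the target network evaluates it) with prioritized replay (new priorities from
the TD error). A compact numpy sketch of just those two computations (array
names are illustrative):

import numpy as np

def double_dqn_targets(rewards, dones, q_next_main, q_next_target, gamma=0.99):
    # q_next_main / q_next_target: (batch, n_actions) Q-values for s'
    best_actions = np.argmax(q_next_main, axis=1)        # selection: main net
    double_q = q_next_target[np.arange(len(rewards)), best_actions]
    done_multiplier = 1.0 - dones                        # 0 if the episode ended
    return rewards + gamma * double_q * done_multiplier

def new_priorities(targets, old_q_taken, eps=1e-6):
    # Squared TD error plus a small constant, as in run() above
    return np.square(targets - old_q_taken) + eps
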
Exemple #18
0
class Agent:
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.sess.run(tf.global_variables_initializer())
        DISPLAYER.reset()

    def run(self):

        self.total_steps = 0

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False

            # Initialize exploration noise process
            noise_process = np.zeros(self.action_size)
            noise_scale = (parameters.NOISE_SCALE_INIT *
                           parameters.NOISE_DECAY**ep) * \
                (self.high_bound - self.low_bound)

            # Initial state
            s = self.env.reset()
            render = (ep % parameters.RENDER_FREQ == 0 and parameters.DISPLAY)
            self.env.set_render(render)

            while episode_step < parameters.MAX_EPISODE_STEPS and not done:

                # choose action based on deterministic policy
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise_process = parameters.EXPLO_THETA * \
                    (parameters.EXPLO_MU - noise_process) + \
                    parameters.EXPLO_SIGMA * np.random.randn(self.action_size)

                a += noise_scale * noise_process

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _ = self.sess.run(
                        [
                            self.network.critic_train_op,
                            self.network.actor_train_op
                        ],
                        feed_dict={
                            self.network.state_ph:
                            np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph:
                            np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph:
                            np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph:
                            np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph:
                            np.asarray([elem[4] for elem in minibatch])
                        })

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1

            if ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                    % (ep, episode_reward, episode_step, noise_scale))
            DISPLAYER.add_reward(episode_reward)

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        self.env.set_render(True)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:

                    a, = self.sess.run(
                        self.network.actions,
                        feed_dict={self.network.state_ph: s[None]})

                    s, r, done, info = self.env.act(a)
                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
Exemple #19
0
class Agent:

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input

        self.local_network = Network(thread_index, device)
        self.local_network.build_loss()

        with tf.device(device):
            local_var_refs = [v._ref() for v in self.local_network.get_vars()]

            self.gradients = tf.gradients(self.local_network.total_loss,
                                          local_var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)

        self.update_network = self.local_network.copy_network(global_network)

        self.env = Environment(thread_index == 1)
        self.state = self.env.reset()

        self.worker_total_steps = 0
        self.worker_total_eps = 0
        self.start_time = time.time()

        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (MAX_TIME_STEP - global_time_step) / MAX_TIME_STEP
        return max(learning_rate, 0)

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, total_steps):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, total_steps)
        summary_writer.flush()

    def process(self, sess, total_steps, summary_writer, summary_op, score_input):

        start_time = time.time()
        buffer = []
        done = False
        episode_step = 0

        # copy weights from global to local
        sess.run(self.update_network)

        start_lstm_state = self.local_network.lstm_state_out

        for i in range(UPDATE_FREQ):

            pi, value = self.local_network.run_policy_and_value(sess,
                                                                self.state)

            a = np.random.choice(ACTION_SIZE, p=pi)
            s_, r, terminal, _ = self.env.act(a)

            self.episode_reward += r

            # clip reward
            r = np.clip(r, -1, 1)
            buffer.append((self.state, a, r, value))

            episode_step += 1
            self.worker_total_steps += 1
            self.state = s_

            if terminal:
                done = True
                self.worker_total_eps += 1

                DISPLAYER.add_reward(self.episode_reward, self.thread_index)

                if (self.thread_index == 1 and
                        self.worker_total_eps % DISP_REWARD_FREQ == 0):
                    cur_learning_rate = self._anneal_learning_rate(total_steps)
                    print('Episode %i, Reward %i, Steps %i, LR %g' %
                          (self.worker_total_eps, self.episode_reward,
                           episode_step, cur_learning_rate))

                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, total_steps)

                self.episode_reward = 0
                self.env.reset()
                self.local_network.reset_state()

                render = (DISPLAY and self.thread_index == 1 and
                          (self.worker_total_eps - 1) % RENDER_FREQ == 0)
                self.env.set_render(render)

                break

        batch_s = deque()
        batch_a = deque()
        batch_td = deque()
        batch_R = deque()

        # Bootstrapping
        R = 0.0
        if not done:
            R = self.local_network.run_value(sess, self.state)

        # work backwards through the buffer to compute the discounted returns
        # and TD errors used as training targets
        for i in range(len(buffer) - 1, -1, -1):
            si, ai, ri, Vi = buffer[i]
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_s.appendleft(si)
            batch_a.appendleft(a)
            batch_td.appendleft(td)
            batch_R.appendleft(R)

        cur_learning_rate = self._anneal_learning_rate(total_steps)

        feed_dict = {self.local_network.state: batch_s,
                     self.local_network.action: batch_a,
                     self.local_network.td_error: batch_td,
                     self.local_network.reward: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                     self.learning_rate_input: cur_learning_rate}
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        if done and (self.thread_index == 1) and \
                (self.worker_total_eps % PERF_FREQ == 0 or
                 self.worker_total_eps == 15):
            global_time = time.time() - self.start_time
            steps_per_sec = total_steps / global_time
            print("### Performance : {} STEPS in {:.0f} sec."
                  "{:.0f} STEPS/sec. {:.2f}M STEPS/hour ###".format(
                      total_steps,  global_time, steps_per_sec,
                      steps_per_sec * 3600 / 1000000.))

        elapsed_time = time.time() - start_time
        return elapsed_time, done, episode_step

    def close(self):
        self.env.close()
Exemple #20
0
class Agent:
    def __init__(self, worker_index, sess, render=False, master=False):

        self.worker_index = worker_index
        if master:
            self.name = 'global'
        else:
            print("Initialization of the agent", str(worker_index))
            self.name = 'Worker_' + str(worker_index)

        self.env = Environment()
        self.state_size = self.env.get_state_size()
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.network = Network(self.state_size, self.action_size, self.name)
        self.update_local_vars = update_target_graph('global', self.name)

        self.starting_time = 0
        self.epsilon = settings.EPSILON_START

        if self.name != 'global':
            self.summary_writer = tf.summary.FileWriter(
                "results/" + self.name, sess.graph)

    def save(self, episode_step):
        # Save model
        SAVER.save(episode_step)

        # Save summary statistics
        summary = tf.Summary()
        summary.value.add(tag='Perf/Reward',
                          simple_value=np.mean(self.rewards_plus))
        summary.value.add(tag='Perf/Value',
                          simple_value=np.mean(self.next_values))
        summary.value.add(tag='Losses/Value', simple_value=self.value_loss)
        summary.value.add(tag='Losses/Policy', simple_value=self.policy_loss)
        summary.value.add(tag='Losses/Entropy', simple_value=self.entropy)
        summary.value.add(tag='Losses/Grad Norm', simple_value=self.grad_norm)
        self.summary_writer.add_summary(summary, self.nb_ep)
        self.summary_writer.flush()

    def train(self, sess, bootstrap_value):

        # Add the bootstrap value to our experience
        self.rewards_plus = np.asarray(self.rewards_buffer + [bootstrap_value])
        discounted_reward = discount(self.rewards_plus, settings.DISCOUNT)[:-1]

        self.next_values = np.asarray(self.values_buffer[1:] +
                                      [bootstrap_value])
        advantages = self.rewards_buffer + \
            settings.DISCOUNT * self.next_values - \
            self.values_buffer
        advantages = discount(advantages,
                              settings.GENERALIZED_LAMBDA * settings.DISCOUNT)
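        # This is Generalized Advantage Estimation: each advantage element is
        # the TD residual delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), and the
        # discounted cumulative sum with factor gamma * lambda gives
        # A_t = sum_k (gamma * lambda)^k * delta_{t+k}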

        # Update the global network
        feed_dict = {
            self.network.discounted_reward: discounted_reward,
            self.network.inputs: self.states_buffer,
            self.network.actions: self.actions_buffer,
            self.network.advantages: advantages
        }
        losses = sess.run([
            self.network.value_loss, self.network.policy_loss,
            self.network.entropy, self.network.grad_norm,
            self.network.apply_grads
        ],
                          feed_dict=feed_dict)

        # Get the losses for tensorboard
        self.value_loss, self.policy_loss, self.entropy = losses[:3]
        self.grad_norm, _ = losses[3:]

        # Reinitialize buffers and variables
        self.states_buffer = []
        self.actions_buffer = []
        self.rewards_buffer = []
        self.values_buffer = []

    def work(self, sess, coord):
        print("Running", self.name, end='\n\n')
        self.starting_time = time()
        self.nb_ep = 1
        nearlyDone = 0
        with sess.as_default(), sess.graph.as_default():

            with coord.stop_on_exception():
                while not coord.should_stop():

                    self.states_buffer = []
                    self.actions_buffer = []
                    self.rewards_buffer = []
                    self.values_buffer = []
                    self.mean_values_buffer = []

                    self.total_steps = 0
                    episode_reward = 0
                    episode_step = 0

                    # Reset the local network to the global
                    sess.run(self.update_local_vars)

                    mean = 45 * TORAD
                    std = 0 * TORAD
                    wind_samples = 10
                    w = wind(mean=mean, std=std, samples=wind_samples)
                    WH = w.generateWind()
                    hdg0_rand = random.uniform(5, 12)
                    hdg0 = hdg0_rand * TORAD * np.ones(10)
                    s = self.env.reset(hdg0, WH)

                    done = False
                    #if self.worker_index == 1 and render and settings.DISPLAY:
                    #    self.env.set_render(True)

                    #self.lstm_state = self.network.lstm_state_init
                    #self.initial_lstm_state = self.lstm_state

                    while not coord.should_stop() and not done and \
                            episode_step < settings.MAX_EPISODE_STEP:

                        WH = np.random.uniform(mean - std,
                                               mean + std,
                                               size=wind_samples)
                        s = np.reshape([s[0, :], s[1, :]],
                                       [2 * self.state_size, 1])

                        # Prediction of the policy and the value
                        feed_dict = {self.network.inputs: [s]}
                        policy, value = sess.run(
                            [self.network.policy, self.network.value],
                            feed_dict=feed_dict)

                        policy, value = policy[0], value[0][0]

                        if random.random() < self.epsilon:
                            action = random.choice([1.5, 0, -1.5])

                        else:
                            # Choose an action according to the policy
                            action = np.random.choice([1.5, 0, -1.5], p=policy)

                        s_, v = self.env.act(action, WH)

                        # Reward assignment algorithm
                        if episode_step == 1:
                            r = 0
                        elif s[int(self.state_size / 2 - 2)] > 13 * TORAD \
                                and s[int(self.state_size / 2 - 2)] < 15 * TORAD \
                                and v > 0.63 and v < 0.67 and action < 0:
                            r = 0.5
                        else:
                            if v <= 0.69:
                                r = 0
                                nearlyDone = 0
                            elif v > 0.69 and v <= 0.75:
                                r = 0.00001
                                nearlyDone = 0
                            elif v > 0.75 and v <= 0.8:
                                r = 0.01
                                nearlyDone = 0
                            elif v > 0.80:
                                r = 0.1
                                if nearlyDone >= 3:
                                    r = 1
                                    done = True
                                elif nearlyDone == 2:
                                    r = 0.8
                                elif nearlyDone == 1:
                                    r = 0.25
                                nearlyDone = nearlyDone + 1
                            else:
                                r = 0
                                nearlyDone = 0

                        #s_ = np.reshape(s_, [2*self.state_size,1])

                        # Store the experience
                        self.states_buffer.append(s)
                        self.actions_buffer.append(action)
                        self.rewards_buffer.append(r)
                        self.values_buffer.append(value)
                        self.mean_values_buffer.append(value)
                        episode_reward += r
                        s = s_

                        episode_step += 1
                        self.total_steps += 1

                        # Once the episode buffer holds MAX_LEN_BUFFER
                        # experiences, apply the gradients to the global
                        # network, then empty the episode buffers
                        if len(self.states_buffer) == settings.MAX_LEN_BUFFER \
                                and not done:

                            feed_dict = {
                                self.network.inputs: [
                                    np.reshape([s[0, :], s[1, :]],
                                               [2 * self.state_size, 1])
                                ]
                            }
                            bootstrap_value = sess.run(self.network.value,
                                                       feed_dict=feed_dict)

                            # Update the global network with this partial episode
                            self.train(sess, bootstrap_value)
                            sess.run(self.update_local_vars)
                            #self.initial_lstm_state = self.lstm_state

                    if len(self.states_buffer) != 0:
                        if done:
                            bootstrap_value = 0
                        else:
                            feed_dict = {
                                self.network.inputs: [
                                    np.reshape([s[0, :], s[1, :]],
                                               [2 * self.state_size, 1])
                                ]
                            }
                            bootstrap_value = sess.run(self.network.value,
                                                       feed_dict=feed_dict)
                        self.train(sess, bootstrap_value)

                    if self.epsilon > settings.EPSILON_STOP:
                        self.epsilon -= settings.EPSILON_DECAY

                    self.nb_ep += 1

                    if not coord.should_stop():
                        DISPLAYER.add_reward(episode_reward, self.worker_index)

                    if (self.worker_index == 1 and
                            self.nb_ep % settings.DISP_EP_REWARD_FREQ == 0):
                        print(
                            'Episode %2i, Initial hdg: %2i, Reward: %7.3f, Steps: %i, '
                            'Epsilon: %7.3f' %
                            (self.nb_ep, hdg0_rand, episode_reward,
                             episode_step, self.epsilon))
                        print("Policy: ", policy)
                    if (self.worker_index == 1
                            and self.nb_ep % settings.SAVE_FREQ == 0):
                        self.save(self.total_steps)

                    if time() - self.starting_time > settings.LIMIT_RUN_TIME:
                        coord.request_stop()

            self.summary_writer.close()

    def play(self, sess, number_run, path=''):
        print("Playing", self.name, "for", number_run, "runs")

        with sess.as_default(), sess.graph.as_default():
            hdg0_rand_vec = [0, 7, 13]
            # Wind conditions
            mean = 45 * TORAD
            std = 0 * TORAD
            wind_samples = 10
            w = wind(mean=mean, std=std, samples=wind_samples)

            try:
                for i in range(number_run):

                    # Reset the local network to the global
                    if self.name != 'global':
                        sess.run(self.update_local_vars)

                    WH = w.generateWind()
                    hdg0_rand = hdg0_rand_vec[i]
                    hdg0 = hdg0_rand * TORAD * np.ones(10)
                    s = self.env.reset(hdg0, WH)
                    episode_reward = 0
                    episode_step = 0
                    v_episode = []
                    i_episode = []
                    done = False

                    #self.lstm_state = self.network.lstm_state_init

                    while (not done and episode_step < 70):
                        i_episode.append(round(s[0][-1] / TORAD))
                        s = np.reshape([s[0, :], s[1, :]],
                                       [2 * self.state_size, 1])
                        # Prediction of the policy
                        feed_dict = {self.network.inputs: [s]}
                        policy, value = sess.run(
                            [self.network.policy, self.network.value],
                            feed_dict=feed_dict)

                        policy = policy[0]

                        # Choose an action according to the policy
                        action = np.random.choice([1.5, 0, -1.5], p=policy)
                        # env.act returns the next state and the velocity,
                        # which is accumulated as the episode score and
                        # recorded for the display
                        s_, v = self.env.act(action, WH)
                        if episode_step > 12 and np.mean(v_episode[-4:]) > 0.8:
                            # done = True
                            print("Done!")
                        episode_reward += v
                        v_episode.append(v)
                        episode_step += 1
                        s = s_
                    DISPLAYER.displayVI(v_episode, i_episode, i)

                    print("Episode reward :", episode_reward)

            except KeyboardInterrupt as e:
                pass

            finally:
                print("End of the demo")
Exemple #21
0
class Agent:
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE,
                                              parameters.ALPHA)

        print("Creation of the actor-critic network...")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)
        print("Network created !\n")

        self.epsilon = parameters.EPSILON_START
        self.beta = parameters.BETA_START

        self.best_run = -1e10

        self.sess.run(tf.global_variables_initializer())

    def run(self):

        self.nb_ep = 1
        self.total_steps = 0

        for self.nb_ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False
            memory = deque()

            # Initial state
            s = self.env.reset()
            # The more episodes the agent performs, the longer they are
            max_steps = (parameters.MAX_EPISODE_STEPS +
                         self.nb_ep // parameters.EP_ELONGATION)

            while episode_step < max_steps and not done:

                if random.random() < self.epsilon:
                    a = self.env.random()
                else:
                    # choose action based on deterministic policy
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: [s]})

                # Decay epsilon
                if self.epsilon > parameters.EPSILON_STOP:
                    self.epsilon -= parameters.EPSILON_DECAY

                s_, r, done, info = self.env.act(a)
                memory.append((s, a, r, s_, 0.0 if done else 1.0))

                if len(memory) > parameters.N_STEP_RETURN:
                    s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft()
                    # N-step return: r_t + gamma r_{t+1} + ... + gamma^n r_{t+n}
                    discount_R = r_mem
                    for i, (si, ai, ri, s_i, di) in enumerate(memory):
                        discount_R += ri * parameters.DISCOUNT**(i + 1)
                    self.buffer.add(s_mem, a_mem, discount_R, s_,
                                    0.0 if done else 1.0)

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample(parameters.BATCH_SIZE,
                                                   self.beta)

                    if self.beta <= parameters.BETA_STOP:
                        self.beta += parameters.BETA_INCR

                    td_errors, _, _ = self.sess.run(
                        [
                            self.network.td_errors,
                            self.network.critic_train_op,
                            self.network.actor_train_op
                        ],
                        feed_dict={
                            self.network.state_ph: minibatch[0],
                            self.network.action_ph: minibatch[1],
                            self.network.reward_ph: minibatch[2],
                            self.network.next_state_ph: minibatch[3],
                            self.network.is_not_terminal_ph: minibatch[4]
                        })

                    # New priorities proportional to the absolute TD errors
                    self.buffer.update_priorities(minibatch[6],
                                                  np.abs(td_errors) + 1e-6)
                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                episode_reward += r
                s = s_
                episode_step += 1
                self.total_steps += 1

            if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print(
                    'Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f, Max steps: %i'
                    % (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_steps))

            DISPLAYER.add_reward(episode_reward)

            if episode_reward > self.best_run and self.nb_ep > 100:
                self.best_run = episode_reward
                print("Best agent ! ", episode_reward)
                SAVER.save('best')

            if self.nb_ep % parameters.SAVE_FREQ == 0:
                SAVER.save(self.nb_ep)

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:

                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: [s]})

                    s_, r, done, info = self.env.act(a)
                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
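
A note on the listing above: the N-step transition assembled from the deque can be checked in isolation. Below is a minimal, self-contained sketch of the same bookkeeping; the constants are placeholders for parameters.N_STEP_RETURN and parameters.DISCOUNT, not the module's actual values.

from collections import deque

N_STEP_RETURN = 3   # placeholder
DISCOUNT = 0.99     # placeholder


def n_step_transition(memory, next_state, done):
    # Pop the oldest transition and fold the rewards of the newer ones into a
    # single n-step return: r_t + gamma * r_{t+1} + ... + gamma^n * r_{t+n}
    s_mem, a_mem, r_mem, _, _ = memory.popleft()
    discount_R = r_mem
    for i, (_, _, ri, _, _) in enumerate(memory):
        discount_R += ri * DISCOUNT ** (i + 1)
    return s_mem, a_mem, discount_R, next_state, 0.0 if done else 1.0


# Usage: memory holds the latest transitions as (s, a, r, s_, not_terminal)
memory = deque()
for t, r in enumerate([1.0, 0.0, 0.0, 2.0]):
    memory.append((t, 0, r, t + 1, 1.0))
    if len(memory) > N_STEP_RETURN:
        print(n_step_transition(memory, t + 1, False))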
Exemple #22
0
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment and QNetwork.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to periodically save the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = PrioritizedReplayBuffer(Settings.BUFFER_SIZE,
                                              Settings.ALPHA)
        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        # Support of the return distribution: NB_ATOMS evenly-spaced atoms
        # between MIN_Q and MAX_Q (distributional, C51-style network)
        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add(s, a, r, s_, 1 if not done else 0)

                s = s_
                episode_reward += r
                episode_step += 1

            # Print progress roughly every fifth of the pre-training
            if Settings.PRE_TRAIN_EPS > 5 and \
                    i % (Settings.PRE_TRAIN_EPS // 5) == 0:
                print("Pre-train episode", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by epsilon-greedy policy
                if random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r))

                # Keep the experience in memory until 'N_STEP_RETURN' steps
                # have passed, to get the delayed return r_1 + ... + gamma^n r_n
                if len(memory) > Settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_R = memory.popleft()
                    for i, (si, ai, ri) in enumerate(memory):
                        discount_R += ri * Settings.DISCOUNT**(i + 1)
                    self.buffer.add(s_mem, a_mem, discount_R, s_,
                                    1 if not done else 0)

                if episode_step % Settings.TRAINING_FREQ == 0:
                    batch = self.buffer.sample(Settings.BATCH_SIZE, self.beta)
                    loss = self.QNetwork.train(batch)
                    self.buffer.update_priorities(batch[6], loss)
                    self.QNetwork.update_target()

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.QNetwork.decrease_lr()

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f'
                      ', Max steps: %i' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_step))

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        self.env.close()

    def play(self, number_run, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            name      : the name of the gif that will be saved
        """
        print("Playing for", number_run, "runs")

        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                self.env.set_gif(True, name)

                while not done:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    a = np.argmax(Qvalue, axis=0)
                    s, r, done, info = self.env.act(a)

                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")
            self.env.close()

    def stop(self):
        self.env.close()
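
A note on the listing above: the greedy action is taken on top of a learned return distribution (a C51-style categorical output). Qdistrib is assumed to have shape (action_size, NB_ATOMS), one probability vector over the atoms z per action, and the expected Q-value is the probability-weighted sum of the atoms. A small self-contained sketch, with placeholder constants instead of the module's Settings:

import numpy as np

MIN_Q, MAX_Q, NB_ATOMS = -10.0, 10.0, 51   # placeholders for Settings.*
ACTION_SIZE = 4                            # placeholder action count

z = np.linspace(MIN_Q, MAX_Q, NB_ATOMS)     # support (atoms) of the distribution
delta_z = (MAX_Q - MIN_Q) / (NB_ATOMS - 1)  # spacing between consecutive atoms

# Fake network output: one categorical distribution per action (rows sum to 1)
Qdistrib = np.random.rand(ACTION_SIZE, NB_ATOMS)
Qdistrib /= Qdistrib.sum(axis=1, keepdims=True)

# Expected Q-value per action and greedy action, as in run() and play() above
Qvalue = np.sum(z * Qdistrib, axis=1)
a = np.argmax(Qvalue, axis=0)
print(Qvalue.shape, a)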