class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.epsilon = parameters.EPSILON_START
        self.epsilon_decay = (parameters.EPSILON_START - parameters.EPSILON_STOP) \
            / parameters.EPSILON_STEPS

        self.best_run = -1e10
        self.n_gif = 0

        self.sess.run(tf.global_variables_initializer())

    def run(self):
        self.total_steps = 0

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False

            # Initial state
            s = self.env.reset()
            self.env.set_render(ep % 1000 == 0)
            gif = (ep % 1500 == 0)
            step_allonge = ep // 1000

            while episode_step < parameters.MAX_EPISODE_STEPS + step_allonge \
                    and not done:

                if random.random() < self.epsilon:
                    a = self.env.random()
                else:
                    # choose action based on deterministic policy
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: [s]})

                s_, r, done, info = self.env.act(a, gif)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _ = self.sess.run(
                        [self.network.critic_train_op,
                         self.network.actor_train_op],
                        feed_dict={
                            self.network.state_ph: np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph: np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= self.epsilon_decay

            if gif:
                self.env.save_gif('results/gif/', self.n_gif)
                self.n_gif = (self.n_gif + 1) % 5

            if episode_reward > self.best_run:
                self.best_run = episode_reward
                print("Save best", episode_reward)
                SAVER.save('best')

            DISPLAYER.add_reward(episode_reward)
            if ep % 50 == 0:
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f'
                      ' (max step: %i)' % (ep, episode_reward, episode_step,
                                           self.epsilon,
                                           parameters.MAX_EPISODE_STEPS + step_allonge))
            if ep % 500 == 0:
                DISPLAYER.disp()

    def play(self, number_run, path=''):
        print("Playing for", number_run, "runs")

        self.env.set_render(True)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: s[None]})
                    s, r, done, info = self.env.act(a, path != '')
                    episode_reward += r

                print("Episode reward :", episode_reward)

                if path != '':
                    self.env.save_gif(path, i)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
global_network = Network(0, device)
env = Environment(True)

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=True))
sess.run(tf.global_variables_initializer())

SAVER.set_sess(sess)
settings.LOAD = True
global_total_time, wall_time, total_eps, total_steps = SAVER.load()

for i in range(NB_PLAY):

    done = False
    reward, step = 0, 0
    state = env.reset()
    global_network.reset_state()

    while not done:
        pi, value = global_network.run_policy_and_value(sess, state)
        a = np.random.choice(settings.ACTION_SIZE, p=pi)
        state, r, done, _ = env.act(a)
        reward += r
        step += 1

    print("Episode reward {} (in {} steps)".format(reward, step))
class Q:

    def __init__(self):
        self.net = None
        self.env = Environment(False, 4)
        self.mem = Memory(32, 1000000)
        self.epsilon = 0.5
        self.gamma = 0.7
        self.number_of_actions = 4

        try:
            self.load_network()
        except IOError:
            print 'No network found'
            self.create_model()

    def create_model(self):
        print 'Creating model...'
        model = Sequential()
        model.add(Convolution2D(32, 8, 8, subsample=(4, 4), activation='relu',
                                input_shape=(4, 84, 84)))
        model.add(Convolution2D(64, 4, 4, activation='relu', subsample=(2, 2)))
        model.add(Convolution2D(64, 3, 3, activation='relu', subsample=(1, 1)))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.number_of_actions, activation='linear'))
        model.compile(loss='mse', optimizer='rmsprop')
        self.net = model
        print 'Done!'

    def save_network(self):
        json_string = self.net.to_json()
        open('deep_q_network.json', 'w').write(json_string)
        self.net.save_weights('network_weights.h5', overwrite=True)

    def load_network(self):
        print 'Loading network...'
        model = model_from_json(open('deep_q_network.json').read())
        model.load_weights('network_weights.h5')
        model.compile(loss='mse', optimizer='rmsprop')
        print 'Network loaded!'
        self.net = model

    def train(self, epochs):
        for i in xrange(epochs):
            state = self.env.get_state()

            while not self.env.isTerminal():
                qval = self.net.predict(state.reshape(1, 4, 84, 84),
                                        batch_size=1)
                if random.random() < self.epsilon:
                    # choose random action
                    action = np.random.randint(0, self.number_of_actions)
                else:
                    # choose best action from Q(s,a) values
                    action = np.argmax(qval)

                # Take action, observe new state S'
                reward = self.env.act(action)
                new_state = self.env.get_state()

                # Experience replay storage
                is_terminal = self.env.isTerminal()
                self.mem.store(state, action, reward, new_state, is_terminal)
                print 'Game : {}'.format(i)

                if self.mem.isFull():
                    minibatch = self.mem.sample()
                    self.train_on_minibatch(minibatch)

                state = new_state

            if self.epsilon > 0.1:
                # decrement epsilon over time
                # (float literal needed: 1 / 100000 is 0 in Python 2)
                self.epsilon -= 1.0 / 100000

            self.env.restart()
            if i % 10 == 0:
                self.save_network()

    def train_on_minibatch(self, minibatch):
        x_train, y_train = [], []
        for sample in minibatch:
            # Get max_Q(S',a)
            old_state, action, reward, new_state, terminal = sample
            old_qval = self.net.predict(old_state.reshape(1, 4, 84, 84),
                                        batch_size=1)
            newQ = self.net.predict(new_state.reshape(1, 4, 84, 84),
                                    batch_size=1)
            maxQ = np.max(newQ)

            y = np.zeros((1, self.number_of_actions))
            y[:] = old_qval[:]
            if not terminal:
                # non-terminal state
                update = reward + (self.gamma * maxQ)
            else:
                # terminal state
                update = reward
            y[0][action] = update

            x_train.append(old_state.reshape(4, 84, 84))
            y_train.append(y.reshape(self.number_of_actions, ))

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        self.net.fit(x_train, y_train, batch_size=self.mem.batch_size,
                     nb_epoch=1)

    def play(self):
        environment = Environment(True, 4)
        while not environment.isTerminal():
            state = environment.get_state()
            qval = self.net.predict(state.reshape(1, 4, 84, 84), batch_size=1)
            action = np.argmax(qval)
            reward = environment.act(action)
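# A minimal usage sketch for the Q agent above. This is an assumed entry
# point, not part of the original script: it relies on the repository's own
# Environment and Memory modules being importable, and the number of training
# games (1000) is an arbitrary illustration.
if __name__ == '__main__':
    agent = Q()            # loads 'deep_q_network.json' if present, else builds a new model
    agent.train(1000)      # epsilon-greedy training with experience replay
    agent.save_network()   # persist architecture and weights
    agent.play()           # greedy rollout with the learned Q-network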
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size() * 2  # 60 velocities and 60 incidences
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.sess.run(tf.global_variables_initializer())
        DISPLAYER.reset()

    def run(self):
        # self.load("NetworkParam_best_ThirdSemester/FinalParam")  # get the best parameters to start the training
        self.total_steps = 0

        # Wind conditions
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples=wind_samples)
        WH = w.generateWind()

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            nearlyDone = 0
            done = False

            # Initialize exploration noise process
            noise_process = np.zeros(self.action_size)
            noise_scale = (parameters.NOISE_SCALE_INIT *
                           parameters.NOISE_DECAY**ep) * \
                (self.high_bound - self.low_bound)

            # Initial state
            w = wind(mean=mean, std=std, samples=wind_samples)
            WH = w.generateWind()
            hdg0_rand = random.uniform(6, 13)
            hdg0 = hdg0_rand * TORAD * np.ones(10)
            s = self.env.reset(hdg0, WH)

            while episode_step < parameters.MAX_EPISODE_STEPS:  # and not done:

                WH = np.random.uniform(mean - std, mean + std,
                                       size=wind_samples)

                # choose action based on deterministic policy
                s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise_process = parameters.EXPLO_THETA * \
                    (parameters.EXPLO_MU - noise_process) + \
                    parameters.EXPLO_SIGMA * np.random.randn(self.action_size)
                a += noise_scale * noise_process

                # to respect the bounds:
                a = np.clip(a, self.low_bound, self.high_bound)

                s_, v = self.env.act(a, WH)

                # reward assignment algorithm
                if episode_step == 1:
                    r = 0
                # elif s[int(self.state_size/2-2)] > (13*TORAD) and s[int(self.state_size/2-2)] < (15*TORAD) and v > 0.63 and v < 0.67 and a < 0:
                #     r = 0.1
                else:
                    if v <= 0.69:
                        r = 0
                        nearlyDone = 0
                    elif v > 0.69 and v <= 0.75:
                        r = 0.00001
                        nearlyDone = 0
                    elif v > 0.75 and v <= 0.8:
                        r = 0.01
                        nearlyDone = 0
                    elif v > 0.80:
                        r = 0.1
                        if nearlyDone >= 3:
                            r = 1
                            done = True
                        elif nearlyDone == 2:
                            r = 0.8
                        elif nearlyDone == 1:
                            r = 0.25
                        nearlyDone = nearlyDone + 1
                    else:
                        r = 0
                        nearlyDone = 0

                episode_reward += r

                self.buffer.add((s, np.reshape(a, [1, 1]), r,
                                 np.reshape(s_, [self.state_size, 1]),
                                 0.0 if episode_step < parameters.MAX_EPISODE_STEPS - 1 else 1.0))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _, critic_loss = self.sess.run(
                        [self.network.critic_train_op,
                         self.network.actor_train_op,
                         self.network.critic_loss],
                        feed_dict={
                            self.network.state_ph: np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph: np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1

            if ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, initial heading: %7.3f, Reward: %7.3f, '
                      'Final noise scale: %7.3f, critic loss: %7.3f' %
                      (ep, hdg0[0] * (1 / TORAD), episode_reward,
                       noise_scale, critic_loss))
            DISPLAYER.add_reward(episode_reward)

            # We save the network weights every 500 episodes
            if ep % 500 == 0 and ep != 0:
                self.save("NetworkParam/" + str(ep) + "_epochs")

        self.save("NetworkParam/" + "FinalParam")

    def playActor(self):
        self.load("NetworkParam/FinalParam")
        hdg0_rand_vec = [0, 7, 12]

        # Wind conditions
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples=wind_samples)

        try:
            for i in range(len(hdg0_rand_vec)):

                # Initial state
                WH = w.generateWind()
                hdg0_rand = hdg0_rand_vec[i]
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0, WH)

                episode_reward = 0
                episode_step = 0
                v_episode = []
                i_episode = []

                while episode_step < 40:  # not done:
                    if episode_step == 0:
                        i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                    else:
                        i_episode.append(s[0][-1] / TORAD)

                    s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: s[None]})
                    a = np.clip(a, self.low_bound, self.high_bound)

                    s_, r = self.env.act(a, WH)
                    episode_reward += r
                    v_episode.append(r)
                    episode_step += 1
                    s = s_

                DISPLAYER.displayVI(v_episode, i_episode, i)
                print("Episode reward :", episode_reward,
                      " for incidence: ", hdg0_rand)

        except KeyboardInterrupt as e:
            pass
        except Exception as e:
            print("Exception :", e)
        finally:
            print("End of the demo")

    def playCritic(self):
        self.load("NetworkParam/FinalParam")
        hdg0_rand_vec = [0, 7, 12]

        # Wind conditions
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples=wind_samples)

        try:
            for i in range(len(hdg0_rand_vec)):

                # Initial state
                WH = w.generateWind()
                hdg0_rand = hdg0_rand_vec[i]
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0, WH)

                episode_reward = 0
                episode_step = 0
                v_episode = []
                i_episode = []

                while episode_step < 30:  # not done:
                    if episode_step == 0:
                        i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                    else:
                        i_episode.append(s[0][-1] / TORAD)

                    # Critic policy: evaluate each candidate action with the
                    # critic and act greedily with respect to its Q-value
                    candidate_actions = [-1.5, -1.25, -1, -0.75, -0.5, -0.25,
                                         0, 0.25, 0.5, 0.75, 1, 1.25, 1.5]
                    critic = [self.evaluate(s, a) for a in candidate_actions]
                    a = candidate_actions[np.argmax(critic)]

                    s_, r = self.env.act(a, WH)
                    episode_reward += r
                    v_episode.append(r)
                    episode_step += 1
                    s = s_

                DISPLAYER.displayVI(v_episode, i_episode, i + 3)
                print("Episode reward :", episode_reward,
                      " for incidence: ", hdg0_rand)

        except KeyboardInterrupt as e:
            pass
        except Exception as e:
            print("Exception :", e)
        finally:
            print("End of the demo")

    def save(self, name):
        """
        Save the weights of both networks into a .ckpt tensorflow session file
        :param name: Name of the file where the weights are saved
        """
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, name + ".ckpt")
        print("Model saved in path: %s" % save_path)

    def load(self, name):
        """
        Load the weights of the two networks saved in the file into :ivar network
        :param name: name of the file containing the weights to load
        """
        saver = tf.train.Saver()
        saver.restore(self.sess, name + ".ckpt")

    def evaluate(self, state, action):
        """
        Evaluate the Q-value of a state-action pair using the critic neural
        network.
        :param np.array state: state that we want to evaluate.
        :param float action: action that we want to evaluate (has to be between permitted bounds)
        :return: The Q-value of the given state-action pair.
        """
        s = np.reshape([state[0, :], state[1, :]], (1, self.state_size, 1))
        a = np.reshape(action, (1, self.action_size, 1))
        q = self.sess.run(self.network.q_values_of_given_actions,
                          feed_dict={self.network.state_ph: s,
                                     self.network.action_ph: a})
        return q
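# Standalone sketch of the Ornstein-Uhlenbeck exploration noise used in the
# run() loops above: the noise drifts back towards MU while being perturbed by
# Gaussian increments, which yields temporally-correlated exploration. The
# THETA/MU/SIGMA values below are illustrative stand-ins for the
# parameters.EXPLO_* constants, not the values used in the original code.
import numpy as np

THETA, MU, SIGMA = 0.15, 0.0, 0.2
action_size = 1

noise_process = np.zeros(action_size)
for step in range(5):
    # Same update rule as in the agents above (the new value replaces the
    # old one rather than being added to it).
    noise_process = THETA * (MU - noise_process) + \
        SIGMA * np.random.randn(action_size)
    print(step, noise_process)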
class Agent: """ This class builds an agent with its own QNetwork, memory buffer and environment to learn a policy. """ def __init__(self, sess, gui, displayer, saver): """ Build a new instance of Environment and QNetwork. Args: sess : the tensorflow session in which to build the network gui : a GUI instance to manage the control of the agent displayer: a Displayer instance to keep track of the episode rewards saver : a Saver instance to save periodically the network """ print("Initializing the agent...") self.sess = sess self.gui = gui self.gui_thread = threading.Thread(target=lambda: self.gui.run(self)) self.displayer = displayer self.saver = saver signal.signal(signal.SIGINT, self.interrupt) self.env = Environment() self.QNetwork = QNetwork(sess) self.buffer = ExperienceBuffer(prioritized=Settings.PRIORITIZED_ER) self.epsilon = Settings.EPSILON_START self.beta = Settings.BETA_START self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1) self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS) self.create_summaries() self.best_run = -1e10 self.n_gif = 0 print("Agent initialized !\n") def create_summaries(self): self.ep_reward_ph = tf.placeholder(tf.float32) ep_reward_summary = tf.summary.scalar("Episode/Episode reward", self.ep_reward_ph) self.steps_ph = tf.placeholder(tf.float32) steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph) self.epsilon_ph = tf.placeholder(tf.float32) epsilon_summary = tf.summary.scalar("Settings/Epsilon", self.epsilon_ph) self.ep_summary = tf.summary.merge( [ep_reward_summary, epsilon_summary, steps_summary]) self.lr_ph = tf.placeholder(tf.float32) self.lr_summary = tf.summary.scalar("Settings/Learning rate", self.lr_ph) self.writer = tf.summary.FileWriter("./logs", self.sess.graph) def pre_train(self): """ Method to run a random agent in the environment to fill the memory buffer. """ print("Beginning of the pre-training...") for i in range(Settings.PRE_TRAIN_EPS): s = self.env.reset() done = False episode_reward = 0 episode_step = 0 while episode_step < Settings.MAX_EPISODE_STEPS and not done: a = self.env.act_random() s_, r, done, info = self.env.act(a) self.buffer.add((s, a, r, s_, 1 if not done else 0)) s = s_ episode_reward += r episode_step += 1 if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0: print("Pre-train step n", i) # Set the best score to at least the max score the random agent got self.best_run = max(self.best_run, episode_reward) print("End of the pre training !") def save_best(self, episode_reward): self.best_run = episode_reward print("Save best", episode_reward) self.saver.save('best') # self.play(1, 'best') def run(self): """ Method to run the agent in the environment to collect experiences and learn on these experiences by gradient descent. 
""" print("Beginning of the run...") self.pre_train() self.QNetwork.init_target() self.gui_thread.start() self.nb_ep = 1 learning_steps = 0 while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP: s = self.env.reset() episode_reward = 0 done = False memory = deque() episode_step = 1 # The more episodes the agent performs, the longer they are max_step = Settings.MAX_EPISODE_STEPS if Settings.EP_ELONGATION > 0: max_step += self.nb_ep // Settings.EP_ELONGATION # Render settings self.env.set_render(self.gui.render.get(self.nb_ep)) self.env.set_gif(self.gui.gif.get(self.nb_ep)) plot_distrib = self.gui.plot_distrib.get(self.nb_ep) while episode_step <= max_step and not done: # Exploration by NoisyNets or epsilon-greedy policy if not Settings.NOISY and random.random() < self.epsilon: a = self.env.act_random() else: if Settings.DISTRIBUTIONAL: Qdistrib = self.QNetwork.act(s) Qvalue = np.sum(self.z * Qdistrib, axis=1) else: Qvalue = self.QNetwork.act(s) a = np.argmax(Qvalue, axis=0) if plot_distrib: self.displayer.disp_distrib(self.z, self.delta_z, Qdistrib, Qvalue) s_, r, done, info = self.env.act(a) episode_reward += r memory.append((s, a, r, s_, done)) # Keep the experience in memory until 'N_STEP_RETURN' steps has # passed to get the delayed return r_1 + ... + gamma^n r_n while len(memory) >= Settings.N_STEP_RETURN or (memory and memory[-1][4]): s_mem, a_mem, discount_R, si_, done_ = memory.popleft() if not done_ and memory: for i in range(Settings.N_STEP_RETURN - 1): si, ai, ri, si_, done_ = memory[i] discount_R += ri * Settings.DISCOUNT**(i + 1) if done_: break self.buffer.add( (s_mem, a_mem, discount_R, si_, 1 if not done_ else 0)) if episode_step % Settings.TRAINING_FREQ == 0: if Settings.PRIORITIZED_ER: batch, idx, weights = self.buffer.sample(self.beta) else: batch = self.buffer.sample(self.beta) idx = weights = None loss = self.QNetwork.train(np.asarray(batch), weights) self.buffer.update(idx, loss) self.QNetwork.update_target() feed_dict = {self.lr_ph: self.QNetwork.learning_rate} summary = self.sess.run(self.lr_summary, feed_dict=feed_dict) self.writer.add_summary(summary, learning_steps) learning_steps += 1 s = s_ episode_step += 1 # Decay epsilon if self.epsilon > Settings.EPSILON_STOP: self.epsilon -= Settings.EPSILON_DECAY self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep)) # if episode_reward > self.best_run: # self.save_best(episode_reward) # Episode display if self.gui.ep_reward.get(self.nb_ep): print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f' ', Max steps: %i, Learning rate: %fe-4' % (self.nb_ep, episode_reward, episode_step, self.epsilon, max_step, self.QNetwork.learning_rate * 1e4)) # Write the summary feed_dict = { self.ep_reward_ph: episode_reward, self.epsilon_ph: self.epsilon, self.steps_ph: episode_step } summary = self.sess.run(self.ep_summary, feed_dict=feed_dict) self.writer.add_summary(summary, self.nb_ep) # Save the model if self.gui.save.get(self.nb_ep): self.saver.save(self.nb_ep) self.nb_ep += 1 print("Training completed !") self.env.close() self.display() self.gui.end_training() self.gui_thread.join() def play(self, number_run=1, gif=False, name=None): """ Method to evaluate the policy without exploration. 
Args: number_run: the number of episodes to perform gif : whether to save a gif or not name : the name of the gif that will be saved """ self.env.set_render(Settings.DISPLAY) for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False self.env.set_gif(gif, name) while not done: if Settings.DISTRIBUTIONAL: Qdistrib = self.QNetwork.act(s) Qvalue = np.sum(self.z * Qdistrib, axis=1) else: Qvalue = self.QNetwork.act(s) a = np.argmax(Qvalue, axis=0) s, r, done, info = self.env.act(a) episode_reward += r if gif: self.env.save_gif() print("Episode reward :", episode_reward) def display(self): self.displayer.disp() def stop(self): self.env.close() def interrupt(self, sig, frame): self.gui.stop_run()
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.bounds = self.env.get_bounds()

        print("Creation of the actor-critic network")
        self.network = Network(self.sess, self.state_size, self.action_size,
                               self.bounds)

        self.critic_lr = settings.CRITIC_LEARNING_RATE
        self.actor_lr = settings.ACTOR_LEARNING_RATE
        self.delta_critic_lr = self.critic_lr / settings.TRAINING_EPS
        self.delta_actor_lr = self.actor_lr / settings.TRAINING_EPS

        self.sess.run(tf.global_variables_initializer())

    def predict_action(self, s, plot_distrib=False):
        if plot_distrib:
            action, distrib, value = self.sess.run(
                [self.network.actions,
                 self.network.Q_distrib_suggested_actions,
                 self.network.Q_values_suggested_actions],
                feed_dict={self.network.state_ph: s[None]})
            action, distrib, value = action[0], distrib[0], value[0]

            fig = plt.figure(2)
            fig.clf()
            plt.bar(self.z, distrib, self.delta_z)
            plt.axvline(value, color='red', linewidth=0.7)
            plt.show(block=False)
            plt.pause(0.001)

            return action

        return self.sess.run(self.network.actions,
                             feed_dict={self.network.state_ph: s[None]})[0]

    def run(self):
        self.total_steps = 1
        self.sess.run(self.network.target_init)

        self.z = self.sess.run(self.network.z)
        self.delta_z = self.network.delta_z

        ep = 1
        while ep < settings.TRAINING_EPS + 1 and not GUI.STOP:

            s = self.env.reset()
            episode_reward = 0
            episode_step = 0
            done = False
            memory = deque()

            # Initialize exploration noise process
            noise_scale = settings.NOISE_SCALE * settings.NOISE_DECAY**ep

            # Initial state
            self.env.set_render(GUI.render.get(ep))
            self.env.set_gif(GUI.gif.get(ep))
            plot_distrib = GUI.plot_distrib.get(ep)

            max_eps = settings.MAX_EPISODE_STEPS + (ep // 50)

            while episode_step < max_eps and not done:

                noise = np.random.normal(size=self.action_size)
                scaled_noise = noise_scale * noise
                a = np.clip(self.predict_action(s, plot_distrib) + scaled_noise,
                            *self.bounds)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, 0 if done else 1))

                if len(memory) >= settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_r, ss_mem, done_mem = memory.popleft()
                    for i, (si, ai, ri, s_i, di) in enumerate(memory):
                        discount_r += ri * settings.DISCOUNT**(i + 1)
                    BUFFER.add(s_mem, a_mem, discount_r, s_,
                               0 if done else 1)

                if len(BUFFER) > 0 and \
                        self.total_steps % settings.TRAINING_FREQ == 0:
                    self.network.train(BUFFER.sample(),
                                       self.critic_lr, self.actor_lr)

                s = s_
                episode_step += 1
                self.total_steps += 1

            self.critic_lr -= self.delta_critic_lr
            self.actor_lr -= self.delta_actor_lr

            # Plot reward
            plot = GUI.plot.get(ep)
            DISPLAYER.add_reward(episode_reward, plot)

            # Print episode reward
            if GUI.ep_reward.get(ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, '
                      'Final noise scale: %7.3f, Critic LR: %f, Actor LR: %f' %
                      (ep, episode_reward, episode_step, noise_scale,
                       self.critic_lr, self.actor_lr))

            # Save the model
            if GUI.save.get(ep):
                SAVER.save(ep)

            ep += 1

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        self.env.set_render(settings.DISPLAY)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:
                    a = self.predict_action(s)
                    s, r, done, info = self.env.act(a)
                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()
        self.action_size = self.env.get_action_size()

        print("Creation of the main QNetwork...")
        self.mainQNetwork = QNetwork(self.state_size, self.action_size, 'main')
        print("Main QNetwork created !\n")

        print("Creation of the target QNetwork...")
        self.targetQNetwork = QNetwork(self.state_size, self.action_size,
                                       'target')
        print("Target QNetwork created !\n")

        self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE,
                                              parameters.ALPHA)

        self.epsilon = parameters.EPSILON_START
        self.beta = parameters.BETA_START

        trainables = tf.trainable_variables()
        self.update_target_ops = updateTargetGraph(trainables)

        self.nb_ep = 1

    def pre_train(self):
        print("Beginning of the pre-training...")

        for i in range(parameters.PRE_TRAIN_STEPS):

            s = self.env.reset()
            done = False
            episode_step = 0
            episode_reward = 0

            while episode_step < parameters.MAX_EPISODE_STEPS and not done:

                a = random.randint(0, self.action_size - 1)
                s_, r, done, info = self.env.act(a)
                self.buffer.add(s, a, r, s_, done)

                s = s_
                episode_reward += r
                episode_step += 1

            if i % 100 == 0:
                print("Pre-train step n", i)

        print("End of the pre training !")

    def run(self):
        print("Beginning of the run...")

        self.pre_train()

        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < parameters.TRAINING_STEPS:

            s = self.env.reset()
            episode_reward = 0
            done = False

            memory = deque()
            discount_R = 0

            episode_step = 0

            # Render parameters
            self.env.set_render(self.nb_ep % parameters.RENDER_FREQ == 0)

            while episode_step < parameters.MAX_EPISODE_STEPS and not done:

                if random.random() < self.epsilon:
                    a = random.randint(0, self.action_size - 1)
                else:
                    a = self.sess.run(self.mainQNetwork.predict,
                                      feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                if len(memory) > parameters.N_STEP_RETURN:
                    s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft()
                    discount_R = r_mem
                    for i, (si, ai, ri, s_i, di) in enumerate(memory):
                        discount_R += ri * parameters.DISCOUNT**(i + 1)
                    self.buffer.add(s_mem, a_mem, discount_R, s_, done)

                if episode_step % parameters.TRAINING_FREQ == 0:

                    train_batch = self.buffer.sample(parameters.BATCH_SIZE,
                                                     self.beta)

                    # Incr beta
                    if self.beta <= parameters.BETA_STOP:
                        self.beta += parameters.BETA_INCR

                    feed_dict = {self.mainQNetwork.inputs: train_batch[3]}
                    mainQaction = self.sess.run(self.mainQNetwork.predict,
                                                feed_dict=feed_dict)

                    feed_dict = {self.targetQNetwork.inputs: train_batch[3]}
                    targetQvalues = self.sess.run(self.targetQNetwork.Qvalues,
                                                  feed_dict=feed_dict)

                    # Done multiplier :
                    # equals 0 if the episode was done
                    # equals 1 else
                    done_multiplier = (1 - train_batch[4])
                    doubleQ = targetQvalues[range(parameters.BATCH_SIZE),
                                            mainQaction]
                    targetQvalues = train_batch[2] + \
                        parameters.DISCOUNT * doubleQ * done_multiplier

                    feed_dict = {self.mainQNetwork.inputs: train_batch[0],
                                 self.mainQNetwork.Qtarget: targetQvalues,
                                 self.mainQNetwork.actions: train_batch[1]}
                    td_error, _ = self.sess.run(
                        [self.mainQNetwork.td_error, self.mainQNetwork.train],
                        feed_dict=feed_dict)

                    self.buffer.update_priorities(train_batch[6],
                                                  td_error + 1e-6)

                    update_target(self.update_target_ops, self.sess)

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= parameters.EPSILON_DECAY

            DISPLAYER.add_reward(episode_reward)
            self.total_steps += 1
            if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon))
            self.nb_ep += 1

    def play(self, number_run):
        print("Playing for", number_run, "runs")

        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False

                while not done:
                    a = self.sess.run(self.mainQNetwork.predict,
                                      feed_dict={self.mainQNetwork.inputs: [s]})
                    a = a[0]
                    s, r, done, info = self.env.act(a)
                    episode_reward += r

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def stop(self):
        self.env.close()
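# Standalone numpy sketch of the Double-DQN target used in run() above: the
# main network selects the argmax action on the next states, and the target
# network evaluates that action. Array shapes and values below are
# illustrative assumptions, not data from the original batches.
import numpy as np

BATCH_SIZE, N_ACTIONS, DISCOUNT = 4, 3, 0.99

rewards = np.array([1.0, 0.0, 0.5, 1.0])          # n-step returns (train_batch[2])
dones = np.array([0, 0, 1, 0])                    # 1 if the episode ended (train_batch[4])
target_q = np.random.rand(BATCH_SIZE, N_ACTIONS)  # targetQNetwork.Qvalues on s'
main_actions = np.random.randint(N_ACTIONS, size=BATCH_SIZE)  # mainQNetwork.predict on s'

done_multiplier = 1 - dones
double_q = target_q[range(BATCH_SIZE), main_actions]
targets = rewards + DISCOUNT * double_q * done_multiplier
print(targets)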
class Agent: """ This class builds an agent with its own Network, memory buffer and environment to learn a policy. """ def __init__(self, sess, gui, displayer, saver): """ Build a new instance of Environment, QNetwork and ExperienceBuffer. Args: sess : the tensorflow session in which to build the network gui : a GUI instance to manage the control of the agent displayer: a Displayer instance to keep track of the episode rewards saver : a Saver instance to save periodically the network """ print("Initializing the agent...") self.sess = sess self.gui = gui self.gui_thread = threading.Thread(target=lambda: self.gui.run(self)) self.displayer = displayer self.saver = saver signal.signal(signal.SIGINT, self.interrupt) self.env = Environment() self.network = Network(sess) self.buffer = ExperienceBuffer() self.create_summaries() self.best_run = -1e10 self.n_gif = 0 print("Agent initialized !") def create_summaries(self): self.ep_reward_ph = tf.placeholder(tf.float32) ep_reward_summary = tf.summary.scalar("Episode/Episode reward", self.ep_reward_ph) self.steps_ph = tf.placeholder(tf.float32) steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph) self.noise_ph = tf.placeholder(tf.float32) noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph) self.ep_summary = tf.summary.merge( [ep_reward_summary, noise_summary, steps_summary]) self.writer = tf.summary.FileWriter("./logs", self.sess.graph) def pre_train(self): """ Method to run a random agent in the environment to fill the memory buffer. """ print("Beginning of the pre-training...") for i in range(Settings.PRE_TRAIN_EPS): s = self.env.reset() done = False episode_reward = 0 episode_step = 0 while episode_step < Settings.MAX_EPISODE_STEPS and not done: a = self.env.act_random() s_, r, done, info = self.env.act(a) self.buffer.add((s, a, r, s_, 1 if not done else 0)) s = s_ episode_reward += r episode_step += 1 if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0: print("Pre-train step n", i) # Set the best score to at least the max score the random agent got self.best_run = max(self.best_run, episode_reward) print("End of the pre training !") def save_best(self, episode_reward): self.best_run = episode_reward print("Save best", episode_reward) self.saver.save('best') # self.play(1, 'best') def run(self): """ Method to run the agent in the environment to collect experiences and learn on these experiences by gradient descent. 
""" print("Beginning of the run...") self.pre_train() self.network.init_target() self.gui_thread.start() self.total_steps = 0 self.nb_ep = 1 while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP: s = self.env.reset() episode_reward = 0 done = False episode_step = 1 # The more episodes the agent performs, the longer they are max_step = Settings.MAX_EPISODE_STEPS if Settings.EP_ELONGATION > 0: max_step += self.nb_ep // Settings.EP_ELONGATION # Initialize exploration noise process noise_process = np.zeros(Settings.ACTION_SIZE) noise_scale = (Settings.NOISE_SCALE_INIT * Settings.NOISE_DECAY**self.nb_ep) * \ (Settings.HIGH_BOUND - Settings.LOW_BOUND) # Render settings self.env.set_render(self.gui.render.get(self.nb_ep)) self.env.set_gif(self.gui.gif.get(self.nb_ep)) while episode_step <= max_step and not done: # Choose action based on deterministic policy a = self.network.act(s) # Add temporally-correlated exploration noise to action noise_process = Settings.EXPLO_THETA * \ (Settings.EXPLO_MU - noise_process) + \ Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE) a += noise_scale * noise_process s_, r, done, info = self.env.act(a) episode_reward += r self.buffer.add((s, a, r, s_, 1 if not done else 0)) if self.total_steps % Settings.TRAINING_FREQ == 0: batch = self.buffer.sample() self.network.train(np.asarray(batch)) self.network.update_target() s = s_ episode_step += 1 self.total_steps += 1 self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep)) # if episode_reward > self.best_run: # self.save_best(episode_reward) # Episode display if self.gui.ep_reward.get(self.nb_ep): print( 'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f' % (self.nb_ep, episode_reward, episode_step, noise_scale)) # Write the summary feed_dict = { self.ep_reward_ph: episode_reward, self.noise_ph: noise_scale[0], self.steps_ph: episode_step } summary = self.sess.run(self.ep_summary, feed_dict=feed_dict) self.writer.add_summary(summary, self.nb_ep) # Save the model if self.gui.save.get(self.nb_ep): self.saver.save(self.nb_ep) self.nb_ep += 1 print("Training completed !") self.env.close() self.display() self.gui.end_training() self.gui_thread.join() def play(self, number_run=1, gif=False, name=None): """ Method to evaluate the policy without exploration. Args: number_run: the number of episodes to perform gif : whether to save a gif or not name : the name of the gif that will be saved """ self.env.set_render(Settings.DISPLAY) for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False self.env.set_gif(gif, name) while not done: a = self.network.act(s) s, r, done, info = self.env.act(a) episode_reward += r if gif: self.env.save_gif() print("Episode reward :", episode_reward) def display(self): self.displayer.disp() def stop(self): self.env.close() def interrupt(self, sig, frame): self.gui.stop_run()
class Agent: def __init__(self, worker_index, sess, render=False, master=False): print("Initialization of the agent", str(worker_index)) self.worker_index = worker_index if master: self.name = 'global' else: self.name = 'Worker_' + str(worker_index) self.env = Environment() self.env.set_render(render) self.state_size = self.env.get_state_size() self.action_size = self.env.get_action_size() self.network = Network(self.state_size, self.action_size, self.name) self.update_local_vars = update_target_graph('global', self.name) self.starting_time = 0 self.epsilon = parameters.EPSILON_START if self.name != 'global': self.summary_writer = tf.summary.FileWriter( "results/" + self.name, sess.graph) def save(self, episode_step): # Save model SAVER.save(episode_step) # Save summary statistics summary = tf.Summary() summary.value.add(tag='Perf/Reward', simple_value=np.mean(self.rewards_plus)) summary.value.add(tag='Perf/Value', simple_value=np.mean(self.next_values)) summary.value.add(tag='Losses/Value', simple_value=self.value_loss) summary.value.add(tag='Losses/Policy', simple_value=self.policy_loss) summary.value.add(tag='Losses/Entropy', simple_value=self.entropy) summary.value.add(tag='Losses/Grad Norm', simple_value=self.grad_norm) self.summary_writer.add_summary(summary, self.nb_ep) self.summary_writer.flush() def train(self, sess, bootstrap_value): # Add the bootstrap value to our experience self.rewards_plus = np.asarray(self.rewards_buffer + [bootstrap_value]) discounted_reward = discount(self.rewards_plus, parameters.DISCOUNT)[:-1] self.next_values = np.asarray(self.values_buffer[1:] + [bootstrap_value]) advantages = self.rewards_buffer + \ parameters.DISCOUNT * self.next_values - \ self.values_buffer advantages = discount( advantages, parameters.GENERALIZED_LAMBDA * parameters.DISCOUNT) # Update the global network feed_dict = { self.network.discounted_reward: discounted_reward, self.network.inputs: self.states_buffer, self.network.actions: self.actions_buffer, self.network.advantages: advantages, self.network.state_in: self.initial_lstm_state } losses = sess.run([ self.network.value_loss, self.network.policy_loss, self.network.entropy, self.network.grad_norm, self.network.state_out, self.network.apply_grads ], feed_dict=feed_dict) # Get the losses for tensorboard self.value_loss, self.policy_loss, self.entropy = losses[:3] self.grad_norm, self.lstm_state, _ = losses[3:] # Reinitialize buffers and variables self.states_buffer = [] self.actions_buffer = [] self.rewards_buffer = [] self.values_buffer = [] self.lstm_buffer = [] def work(self, sess, coord): print("Running", self.name, end='\n\n') self.starting_time = time() self.nb_ep = 1 with sess.as_default(), sess.graph.as_default(): with coord.stop_on_exception(): while not coord.should_stop(): self.states_buffer = [] self.actions_buffer = [] self.rewards_buffer = [] self.values_buffer = [] self.mean_values_buffer = [] self.lstm_buffer = [] self.total_steps = 0 episode_reward = 0 episode_step = 0 # Reset the local network to the global sess.run(self.update_local_vars) s = self.env.reset() done = False render = (self.nb_ep % parameters.RENDER_FREQ == 0) if render and parameters.DISPLAY: self.env.set_render(True) self.lstm_state = self.network.lstm_state_init self.initial_lstm_state = self.lstm_state while not coord.should_stop() and not done and \ episode_step < parameters.MAX_EPISODE_STEP: self.lstm_buffer.append(self.lstm_state) # Prediction of the policy and the value feed_dict = { self.network.inputs: [s], self.network.state_in: 
self.lstm_state } policy, value, self.lstm_state = sess.run( [ self.network.policy, self.network.value, self.network.state_out ], feed_dict=feed_dict) policy, value = policy[0], value[0][0] if random.random() < self.epsilon: action = random.randint(0, self.action_size - 1) else: # Choose an action according to the policy action = np.random.choice(self.action_size, p=policy) s_, r, done, _ = self.env.act(action) # Store the experience self.states_buffer.append(s) self.actions_buffer.append(action) self.rewards_buffer.append(r) self.values_buffer.append(value) self.mean_values_buffer.append(value) episode_reward += r s = s_ episode_step += 1 self.total_steps += 1 # If we have more than MAX_LEN_BUFFER experiences, we # apply the gradients and update the global network, # then we empty the episode buffers if len(self.states_buffer) == parameters.MAX_LEN_BUFFER \ and not done: feed_dict = { self.network.inputs: [s], self.network.state_in: self.lstm_state } bootstrap_value = sess.run(self.network.value, feed_dict=feed_dict) self.train(sess, bootstrap_value) sess.run(self.update_local_vars) self.initial_lstm_state = self.lstm_state if len(self.states_buffer) != 0: if done: bootstrap_value = 0 else: feed_dict = { self.network.inputs: [s], self.network.state_in: self.lstm_state } bootstrap_value = sess.run(self.network.value, feed_dict=feed_dict) self.train(sess, bootstrap_value) if self.epsilon > parameters.EPSILON_STOP: self.epsilon -= parameters.EPSILON_DECAY self.nb_ep += 1 if not coord.should_stop(): DISPLAYER.add_reward(episode_reward, self.worker_index) if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0: print('Agent: %i, Episode %2i, Reward: %i, Steps: %i, ' 'Epsilon: %7.3f' % (self.worker_index, self.nb_ep, episode_reward, episode_step, self.epsilon)) if (self.worker_index == 1 and self.nb_ep % parameters.SAVE_FREQ == 0): self.save(self.total_steps) if time() - self.starting_time > parameters.LIMIT_RUN_TIME: coord.request_stop() self.env.set_render(False) self.summary_writer.close() self.env.close() def play(self, sess, number_run, path=''): print("Playing", self.name, "for", number_run, "runs") with sess.as_default(), sess.graph.as_default(): try: for i in range(number_run): # Reset the local network to the global if self.name != 'global': sess.run(self.update_local_vars) s = self.env.reset() episode_reward = 0 done = False self.lstm_state = self.network.lstm_state_init while not done: # Prediction of the policy feed_dict = { self.network.inputs: [s], self.network.state_in: self.lstm_state } policy, self.lstm_state = sess.run( [self.network.policy, self.network.state_out], feed_dict=feed_dict) policy = policy[0] # Choose an action according to the policy action = np.random.choice(self.action_size, p=policy) s, r, done, info = self.env.act(action, path != '') episode_reward += r print("Episode reward :", episode_reward) if path != '': self.env.save_gif(path, i) except KeyboardInterrupt as e: pass finally: print("End of the demo") self.env.close() def close(self): self.env.close()
class Agent:

    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()[0]
        self.action_size = self.env.get_action_size()
        self.low_bound, self.high_bound = self.env.get_bounds()

        self.buffer = ExperienceBuffer()

        print("Creation of the actor-critic network")
        self.network = Network(self.state_size, self.action_size,
                               self.low_bound, self.high_bound)

        self.sess.run(tf.global_variables_initializer())
        DISPLAYER.reset()

    def run(self):
        self.total_steps = 0

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            done = False

            # Initialize exploration noise process
            noise_process = np.zeros(self.action_size)
            noise_scale = (parameters.NOISE_SCALE_INIT *
                           parameters.NOISE_DECAY**ep) * \
                (self.high_bound - self.low_bound)

            # Initial state
            s = self.env.reset()
            render = (ep % parameters.RENDER_FREQ == 0 and parameters.DISPLAY)
            self.env.set_render(render)

            while episode_step < parameters.MAX_EPISODE_STEPS:  # and not done:

                # choose action based on deterministic policy
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise_process = parameters.EXPLO_THETA * \
                    (parameters.EXPLO_MU - noise_process) + \
                    parameters.EXPLO_SIGMA * np.random.randn(self.action_size)
                # print("a before noise: ", a)
                a += noise_scale * noise_process
                # print("a after noise: ", a)
                a = np.clip(a, self.low_bound, self.high_bound)
                # print("a after clip is: ", a)

                s_, r, done, info = self.env.act(a)
                if done:
                    print("done at step: ", episode_step)
                episode_reward += r

                self.buffer.add((s, a, r, s_, 0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    _, _ = self.sess.run(
                        [self.network.critic_train_op,
                         self.network.actor_train_op],
                        feed_dict={
                            self.network.state_ph: np.asarray([elem[0] for elem in minibatch]),
                            self.network.action_ph: np.asarray([elem[1] for elem in minibatch]),
                            self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                            self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                            self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

                    # update target networks
                    _ = self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1

            if ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %i, Reward: %7.3f, Steps: %i, '
                      'Final noise scale: %7.3f' %
                      (ep, episode_reward, episode_step, noise_scale))
            DISPLAYER.add_reward(episode_reward)

            # We save CNN weights every 1000 epochs
            if ep % 1000 == 0 and ep != 0:
                self.save("NetworkParam/" + str(ep) + "_epochs")

    def play(self, number_run):
        self.load("NetworkParam/FinalParam")
        print("Playing for", number_run, "runs")

        self.env.set_render(True)
        try:
            for i in range(number_run):

                s = self.env.reset()
                episode_reward = 0
                done = False
                counter = 0

                while counter < 100:  # not done:
                    a, = self.sess.run(self.network.actions,
                                       feed_dict={self.network.state_ph: s[None]})
                    # print("The action taken is: ", a)
                    s, r, done, info = self.env.act(a)
                    episode_reward += r
                    counter += 1
                    time.sleep(0.07)

                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            self.env.set_render(False)
            print("End of the demo")
            self.env.close()

    def close(self):
        self.env.close()

    def save(self, name):
        """
        Save the weights of both of the networks into a .ckpt tensorflow
        session file
        :param name: Name of the file where the weights are saved
        """
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, name + ".ckpt")
        print("Model saved in path: %s" % save_path)

    def load(self, name):
        """
        Load the weights of the 2 networks saved in the file into :ivar network
        :param name: name of the file containing the weights to load
        """
        saver = tf.train.Saver()
        saver.restore(self.sess, name + ".ckpt")
class Agent: """ This class builds an agent with its own QNetwork, memory buffer and environment to learn a policy. """ def __init__(self, sess, gui, displayer, saver): """ Build a new instance of Environment, QNetwork and ExperienceBuffer. Args: sess : the tensorflow session in which to build the network gui : a GUI instance to manage the control of the agent displayer: a Displayer instance to keep track of the episode rewards saver : a Saver instance to save periodically the network """ print("Initializing the agent...") self.sess = sess self.gui = gui self.displayer = displayer self.saver = saver self.env = Environment() self.QNetwork = QNetwork(self.sess) self.buffer = ExperienceBuffer() self.epsilon = Settings.EPSILON_START self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1) self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS) self.create_summaries() self.best_run = -1e10 self.n_gif = 0 print("Agent initialized !\n") def create_summaries(self): self.ep_reward_ph = tf.placeholder(tf.float32) ep_reward_summary = tf.summary.scalar("Episode/Episode reward", self.ep_reward_ph) self.steps_ph = tf.placeholder(tf.float32) steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph) self.epsilon_ph = tf.placeholder(tf.float32) epsilon_summary = tf.summary.scalar("Settings/Epsilon", self.epsilon_ph) self.ep_summary = tf.summary.merge([ep_reward_summary, epsilon_summary, steps_summary]) self.lr_ph = tf.placeholder(tf.float32) self.lr_summary = tf.summary.scalar("Settings/Learning rate", self.lr_ph) self.writer = tf.summary.FileWriter("./logs", self.sess.graph) def pre_train(self): """ Method to run a random agent in the environment to fill the memory buffer. """ print("Beginning of the pre-training...") for i in range(Settings.PRE_TRAIN_EPS): s = self.env.reset() done = False episode_reward = 0 episode_step = 0 while episode_step < Settings.MAX_EPISODE_STEPS and not done: a = self.env.act_random() s_, r, done, info = self.env.act(a) self.buffer.add((s, a, r, s_, 1 if not done else 0)) s = s_ episode_reward += r episode_step += 1 if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0: print("Pre-train step n", i) # Set the best score to at least the max score the random agent got self.best_run = max(self.best_run, episode_reward) print("End of the pre training !") def save_best(self, episode_reward): self.best_run = episode_reward print("Save best", episode_reward) self.saver.save('best') # self.play(1, 'best') def run(self): """ Method to run the agent in the environment to collect experiences and learn on these experiences by gradient descent. 
""" print("Beginning of the run...") self.pre_train() self.QNetwork.init_target() self.nb_ep = 1 learning_steps = 0 while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP: s = self.env.reset() episode_reward = 0 done = False episode_step = 1 # The more episodes the agent performs, the longer they are max_step = Settings.MAX_EPISODE_STEPS if Settings.EP_ELONGATION > 0: max_step += self.nb_ep // Settings.EP_ELONGATION # Render settings self.env.set_render(self.gui.render.get(self.nb_ep)) self.env.set_gif(self.gui.gif.get(self.nb_ep)) plot_distrib = self.gui.plot_distrib.get(self.nb_ep) while episode_step <= max_step and not done: # Exploration by epsilon-greedy policy if random.random() < self.epsilon: a = self.env.act_random() else: Qdistrib = self.QNetwork.act(s) Qvalue = np.sum(self.z * Qdistrib, axis=1) a = np.argmax(Qvalue, axis=0) if plot_distrib: self.displayer.disp_distrib(self.z, self.delta_z, Qdistrib, Qvalue) s_, r, done, info = self.env.act(a) episode_reward += r self.buffer.add((s, a, r, s_, 1 if not done else 0)) if episode_step % Settings.TRAINING_FREQ == 0: batch = self.buffer.sample() self.QNetwork.train(np.asarray(batch)) self.QNetwork.update_target() feed_dict = {self.lr_ph: self.QNetwork.learning_rate} summary = self.sess.run(self.lr_summary, feed_dict=feed_dict) self.writer.add_summary(summary, learning_steps) learning_steps += 1 s = s_ episode_step += 1 # Decay epsilon if self.epsilon > Settings.EPSILON_STOP: self.epsilon -= Settings.EPSILON_DECAY self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep)) # if episode_reward > self.best_run: # self.save_best(episode_reward) # Episode display if self.gui.ep_reward.get(self.nb_ep): print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f, Max steps: %i, LR: %fe-4' % ( self.nb_ep, episode_reward, episode_step, self.epsilon, max_step, self.QNetwork.learning_rate)) # Write the summary feed_dict = {self.ep_reward_ph: episode_reward, self.epsilon_ph: self.epsilon, self.steps_ph: episode_step} summary = self.sess.run(self.ep_summary, feed_dict=feed_dict) self.writer.add_summary(summary, self.nb_ep) # Save the model if self.gui.save.get(self.nb_ep): self.saver.save(self.nb_ep) self.nb_ep += 1 self.env.close() def play(self, number_run, name=None): """ Method to evaluate the policy without exploration. Args: number_run: the number of episodes to perform name : the name of the gif that will be saved """ print("Playing for", number_run, "runs") self.env.set_render(Settings.DISPLAY) try: for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False self.env.set_gif(True, name) while not done: Qdistrib = self.QNetwork.act(s) Qvalue = np.sum(self.z * Qdistrib, axis=1) a = np.argmax(Qvalue, axis=0) s, r, done, info = self.env.act(a) episode_reward += r print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass except Exception as e: print("Exception :", e) finally: print("End of the demo") def stop(self): self.env.close()
class Agent:
    """
    This class builds an agent that interacts with an environment to gather
    experiences and put them into a buffer.
    """

    def __init__(self, sess, n_agent, gui, displayer, buffer):
        print("Initializing agent %i..." % n_agent)

        self.n_agent = n_agent
        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.buffer = buffer

        self.env = Environment()

        self.build_actor()
        self.build_update()
        self.create_summaries()

        print("Agent initialized !\n")

    def create_summaries(self):
        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.noise_ph = tf.placeholder(tf.float32)
        noise_summary = tf.summary.scalar("Settings/Noise", self.noise_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, noise_summary, steps_summary])

        self.writer = tf.summary.FileWriter(f"./logs/Agent_{self.n_agent}",
                                            self.sess.graph)

    def build_actor(self):
        """
        Build a copy of the learner's actor network to allow the agent to
        interact with the environment on its own.
        """
        scope = 'worker_agent_' + str(self.n_agent)
        self.state_ph = tf.placeholder(dtype=tf.float32,
                                       shape=[None, *Settings.STATE_SIZE],
                                       name='state_ph')

        # Get the policy prediction network
        self.policy = build_actor(self.state_ph, trainable=False, scope=scope)
        self.vars = get_vars(scope, trainable=False)

    def build_update(self):
        """
        Build the operation to copy the weights of the learner's actor network
        into the agent's network.
        """
        with self.sess.as_default(), self.sess.graph.as_default():
            self.network_vars = get_vars('learner_actor', trainable=True)
            self.update = copy_vars(self.network_vars, self.vars, 1,
                                    'update_agent_' + str(self.n_agent))

    def predict_action(self, s):
        """
        Wrapper method to get the action outputted by the actor network.
        """
        return self.sess.run(self.policy,
                             feed_dict={self.state_ph: s[None]})[0]

    def run(self):
        """
        Method to run the agent in the environment to collect experiences.
        """
        print("Beginning of the run agent {}...".format(self.n_agent))

        self.sess.run(self.update)
        self.total_steps = 0
        self.nb_ep = 1

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()
            episode_step = 1

            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            noise_scale = Settings.NOISE_SCALE * \
                Settings.NOISE_DECAY**(self.nb_ep // 20)

            # Render Settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))

            while episode_step < max_step and not done and not self.gui.STOP:

                a = np.clip(self.predict_action(s),
                            Settings.LOW_BOUND, Settings.HIGH_BOUND)

                # Add gaussian noise
                noise = np.random.normal(size=Settings.ACTION_SIZE)
                a += noise_scale * noise

                s_, r, done, _ = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r))

                # Keep the experience in memory until 'N_STEP_RETURN' steps
                # have passed to get the delayed return r_1 + ... + gamma^n r_n
                if len(memory) >= Settings.N_STEP_RETURN:
                    s_mem, a_mem, discount_r = memory.popleft()
                    for i, (si, ai, ri) in enumerate(memory):
                        discount_r += ri * Settings.DISCOUNT ** (i + 1)
                    self.buffer.add((s_mem, a_mem, discount_r, s_,
                                     1 if not done else 0))

                s = s_
                episode_step += 1
                self.total_steps += 1

            # Periodically update agents on the network
            if self.nb_ep % Settings.UPDATE_ACTORS_FREQ == 0:
                self.sess.run(self.update)

            if not self.gui.STOP:
                if self.n_agent == 1 and self.gui.ep_reward.get(self.nb_ep):
                    print("Episode %i : reward %i, steps %i, noise scale %f" %
                          (self.nb_ep, episode_reward, episode_step,
                           noise_scale))

                plot = (self.n_agent == 1 and self.gui.plot.get(self.nb_ep))
                self.displayer.add_reward(episode_reward, self.n_agent,
                                          plot=plot)

                # Write the summary
                feed_dict = {self.ep_reward_ph: episode_reward,
                             self.noise_ph: noise_scale,
                             self.steps_ph: episode_step}
                summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
                self.writer.add_summary(summary, self.nb_ep)

            self.nb_ep += 1

        self.env.close()
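# Standalone sketch of the N-step return accumulated from the 'memory' deque
# in run() above: once N_STEP_RETURN transitions are stored, the oldest one is
# popped and its reward is completed with the discounted rewards that follow.
# The N_STEP_RETURN/DISCOUNT values and the toy transitions are illustrative
# assumptions.
from collections import deque

N_STEP_RETURN, DISCOUNT = 3, 0.99
memory = deque([('s0', 0.1, 1.0),   # (state, action, reward)
                ('s1', -0.3, 0.0),
                ('s2', 0.2, 2.0)])

if len(memory) >= N_STEP_RETURN:
    s_mem, a_mem, discount_r = memory.popleft()
    for i, (si, ai, ri) in enumerate(memory):
        discount_r += ri * DISCOUNT ** (i + 1)
    print(s_mem, a_mem, discount_r)   # r_1 + gamma*r_2 + gamma^2*r_3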
class Agent: """ This class builds an agent with its own Network, memory buffer and environment to learn a policy. """ def __init__(self, sess, gui, displayer, saver): """ Build a new instance of Environment, QNetwork and ExperienceBuffer. Args: sess : the tensorflow session in which to build the network gui : a GUI instance to manage the control of the agent displayer: a Displayer instance to keep track of the episode rewards saver : a Saver instance to save periodically the network """ print("Initializing the agent...") self.sess = sess self.gui = gui self.displayer = displayer self.saver = saver self.env = Environment() self.network = Network(sess) self.buffer = ExperienceBuffer() self.best_run = -1e10 self.n_gif = 0 print("Agent initialized !") def pre_train(self): """ Method to run a random agent in the environment to fill the memory buffer. """ print("Beginning of the pre-training...") for i in range(Settings.PRE_TRAIN_EPS): s = self.env.reset() done = False episode_reward = 0 episode_step = 0 while episode_step < Settings.MAX_EPISODE_STEPS and not done: a = self.env.act_random() s_, r, done, info = self.env.act(a) self.buffer.add((s, a, r, s_, 1 if not done else 0)) s = s_ episode_reward += r episode_step += 1 if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0: print("Pre-train step n", i) # Set the best score to at least the max score the random agent got self.best_run = max(self.best_run, episode_reward) print("End of the pre training !") def save_best(self, episode_reward): self.best_run = episode_reward print("Save best", episode_reward) self.saver.save('best') # self.play(1, 'best') def run(self): """ Method to run the agent in the environment to collect experiences and learn on these experiences by gradient descent. 
""" print("Beginning of the run...") self.pre_train() self.network.init_target() self.total_steps = 0 self.nb_ep = 1 while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP: s = self.env.reset() episode_reward = 0 done = False episode_step = 1 # The more episodes the agent performs, the longer they are max_step = Settings.MAX_EPISODE_STEPS if Settings.EP_ELONGATION > 0: max_step += self.nb_ep // Settings.EP_ELONGATION # Initialize exploration noise process noise_process = np.zeros(Settings.ACTION_SIZE) noise_scale = (Settings.NOISE_SCALE_INIT * Settings.NOISE_DECAY**self.nb_ep) * \ (Settings.HIGH_BOUND - Settings.LOW_BOUND) # Render settings self.env.set_render(self.gui.render.get(self.nb_ep)) self.env.set_gif(self.gui.gif.get(self.nb_ep)) while episode_step <= max_step and not done: # Choose action based on deterministic policy a = self.network.act(s) # Add temporally-correlated exploration noise to action noise_process = Settings.EXPLO_THETA * \ (Settings.EXPLO_MU - noise_process) + \ Settings.EXPLO_SIGMA * np.random.randn(Settings.ACTION_SIZE) a += noise_scale * noise_process s_, r, done, info = self.env.act(a) episode_reward += r self.buffer.add((s, a, r, s_, 1 if not done else 0)) if self.total_steps % Settings.TRAINING_FREQ == 0: batch = self.buffer.sample() self.network.train(np.asarray(batch)) self.network.update_target() s = s_ episode_step += 1 self.total_steps += 1 self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep)) # if episode_reward > self.best_run: # self.save_best(episode_reward) # Episode display if self.gui.ep_reward.get(self.nb_ep): print( 'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f' % (self.nb_ep, episode_reward, episode_step, noise_scale)) # Save the model if self.gui.save.get(self.nb_ep): self.saver.save(self.nb_ep) self.nb_ep += 1 self.env.close() def play(self, number_run, name=None): """ Method to evaluate the policy without exploration. Args: number_run: the number of episodes to perform name : the name of the gif that will be saved """ print("Playing for", number_run, "runs") self.env.set_render(Settings.DISPLAY) try: for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False self.env.set_gif(True, name) while not done: a = self.network.act(s) s, r, done, info = self.env.act(a) episode_reward += r print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass except Exception as e: print("Exception :", e) finally: print("End of the demo") def stop(self): self.env.close()
class Agent: def __init__(self, sess): print("Initializing the agent...") self.sess = sess self.env = Environment() self.state_size = self.env.get_state_size() self.action_size = self.env.get_action_size() print("Creation of the main QNetwork...") self.mainQNetwork = QNetwork(self.state_size, self.action_size, 'main') print("Main QNetwork created !\n") print("Creation of the target QNetwork...") self.targetQNetwork = QNetwork(self.state_size, self.action_size, 'target') print("Target QNetwork created !\n") self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE, parameters.ALPHA) self.epsilon = parameters.EPSILON_START self.beta = parameters.BETA_START self.initial_learning_rate = parameters.LEARNING_RATE trainables = tf.trainable_variables() self.update_target_ops = updateTargetGraph(trainables) self.nb_ep = 1 self.best_run = -1e10 def pre_train(self): print("Beginning of the pre-training...") for i in range(parameters.PRE_TRAIN_STEPS): s = self.env.reset() done = False episode_step = 0 episode_reward = 0 while episode_step < parameters.MAX_EPISODE_STEPS and not done: a = random.randint(0, self.action_size - 1) s_, r, done, info = self.env.act(a) self.buffer.add(s, a, r, s_, done) s = s_ episode_reward += r episode_step += 1 if i % 100 == 0: print("\tPre-train step n", i) self.best_run = max(self.best_run, episode_reward) print("End of the pre training !") def run(self): print("Beginning of the run...") self.pre_train() self.total_steps = 0 self.nb_ep = 1 while self.nb_ep < parameters.TRAINING_STEPS: self.learning_rate = self.initial_learning_rate * \ (parameters.TRAINING_STEPS - self.nb_ep) / \ parameters.TRAINING_STEPS s = self.env.reset() episode_reward = 0 done = False memory = deque() discount_R = 0 episode_step = 0 max_step = parameters.MAX_EPISODE_STEPS + \ self.nb_ep // parameters.EP_ELONGATION # Render parameters self.env.set_render(self.nb_ep % parameters.RENDER_FREQ == 0) while episode_step < max_step and not done: if random.random() < self.epsilon: a = random.randint(0, self.action_size - 1) else: a = self.sess.run(self.mainQNetwork.predict, feed_dict={self.mainQNetwork.inputs: [s]}) a = a[0] s_, r, done, info = self.env.act(a) episode_reward += r memory.append((s, a, r, s_, done)) if len(memory) > parameters.N_STEP_RETURN: s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft() discount_R = r_mem for i, (si, ai, ri, s_i, di) in enumerate(memory): discount_R += ri * parameters.DISCOUNT ** (i + 1) self.buffer.add(s_mem, a_mem, discount_R, s_, done) if episode_step % parameters.TRAINING_FREQ == 0: train_batch = self.buffer.sample(parameters.BATCH_SIZE, self.beta) # Incr beta if self.beta <= parameters.BETA_STOP: self.beta += parameters.BETA_INCR feed_dict = {self.mainQNetwork.inputs: train_batch[0]} oldQvalues = self.sess.run(self.mainQNetwork.Qvalues, feed_dict=feed_dict) tmp = [0] * len(oldQvalues) for i, oldQvalue in enumerate(oldQvalues): tmp[i] = oldQvalue[train_batch[1][i]] oldQvalues = tmp feed_dict = {self.mainQNetwork.inputs: train_batch[3]} mainQaction = self.sess.run(self.mainQNetwork.predict, feed_dict=feed_dict) feed_dict = {self.targetQNetwork.inputs: train_batch[3]} targetQvalues = self.sess.run(self.targetQNetwork.Qvalues, feed_dict=feed_dict) # Done multiplier : # equals 0 if the episode was done # equals 1 else done_multiplier = (1 - train_batch[4]) doubleQ = targetQvalues[range(parameters.BATCH_SIZE), mainQaction] targetQvalues = train_batch[2] + \ parameters.DISCOUNT * doubleQ * done_multiplier errors = np.square(targetQvalues - oldQvalues) + 1e-6 
self.buffer.update_priorities(train_batch[6], errors) feed_dict = {self.mainQNetwork.inputs: train_batch[0], self.mainQNetwork.Qtarget: targetQvalues, self.mainQNetwork.actions: train_batch[1], self.mainQNetwork.learning_rate: self.learning_rate} _ = self.sess.run(self.mainQNetwork.train, feed_dict=feed_dict) update_target(self.update_target_ops, self.sess) s = s_ episode_step += 1 self.total_steps += 1 # Decay epsilon if self.epsilon > parameters.EPSILON_STOP: self.epsilon -= parameters.EPSILON_DECAY DISPLAYER.add_reward(episode_reward) # if episode_reward > self.best_run and \ # self.nb_ep > 50: # self.best_run = episode_reward # print("Save best", episode_reward) # SAVER.save('best') # self.play(1) self.total_steps += 1 if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0: print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %.3f' ', Max steps: %i, Learning rate: %g' % ( self.nb_ep, episode_reward, episode_step, self.epsilon, max_step, self.learning_rate)) # Save the model if self.nb_ep % parameters.SAVE_FREQ == 0: SAVER.save(self.nb_ep) self.nb_ep += 1 def play(self, number_run): print("Playing for", number_run, "runs") try: for i in range(number_run): self.env.set_render(True) s = self.env.reset() episode_reward = 0 done = False episode_step = 0 max_step = parameters.MAX_EPISODE_STEPS + \ self.nb_ep // parameters.EP_ELONGATION while episode_step < max_step and not done: a = self.sess.run(self.mainQNetwork.predict, feed_dict={self.mainQNetwork.inputs: [s]}) a = a[0] s, r, done, info = self.env.act(a) episode_reward += r episode_step += 1 print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass except Exception as e: print("Exception :", e) finally: self.env.set_render(False) print("End of the demo") self.env.close() def stop(self): self.env.close()
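The training block above implements the Double-DQN target: the online network selects the greedy next action while the target network evaluates it. The same computation is shown below as a standalone NumPy function; the function name and signature are illustrative, not the repository's API.

import numpy as np

def double_dqn_targets(rewards, next_q_main, next_q_target, done, gamma=0.99):
    """Compute Double-DQN targets for a batch.
    next_q_main / next_q_target: (batch, n_actions) Q-values of the online
    and target networks evaluated on the next states."""
    rewards = np.asarray(rewards)
    done = np.asarray(done, dtype=np.float32)
    batch = np.arange(len(rewards))
    # Select the action with the online network...
    best_actions = np.argmax(next_q_main, axis=1)
    # ...but evaluate it with the target network
    double_q = next_q_target[batch, best_actions]
    # Terminal transitions get no bootstrap term
    return rewards + gamma * double_q * (1.0 - done)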
class Agent: def __init__(self, sess): print("Initializing the agent...") self.sess = sess self.env = Environment() self.state_size = self.env.get_state_size()[0] self.action_size = self.env.get_action_size() self.low_bound, self.high_bound = self.env.get_bounds() self.buffer = ExperienceBuffer() print("Creation of the actor-critic network") self.network = Network(self.state_size, self.action_size, self.low_bound, self.high_bound) self.sess.run(tf.global_variables_initializer()) DISPLAYER.reset() def run(self): self.total_steps = 0 for ep in range(1, parameters.TRAINING_STEPS + 1): episode_reward = 0 episode_step = 0 done = False # Initialize exploration noise process noise_process = np.zeros(self.action_size) noise_scale = (parameters.NOISE_SCALE_INIT * parameters.NOISE_DECAY**ep) * \ (self.high_bound - self.low_bound) # Initial state s = self.env.reset() render = (ep % parameters.RENDER_FREQ == 0 and parameters.DISPLAY) self.env.set_render(render) while episode_step < parameters.MAX_EPISODE_STEPS and not done: # choose action based on deterministic policy a, = self.sess.run(self.network.actions, feed_dict={self.network.state_ph: s[None]}) # add temporally-correlated exploration noise to action # (using an Ornstein-Uhlenbeck process) noise_process = parameters.EXPLO_THETA * \ (parameters.EXPLO_MU - noise_process) + \ parameters.EXPLO_SIGMA * np.random.randn(self.action_size) a += noise_scale * noise_process s_, r, done, info = self.env.act(a) episode_reward += r self.buffer.add((s, a, r, s_, 0.0 if done else 1.0)) # update network weights to fit a minibatch of experience if self.total_steps % parameters.TRAINING_FREQ == 0 and \ len(self.buffer) >= parameters.BATCH_SIZE: minibatch = self.buffer.sample() _, _ = self.sess.run( [ self.network.critic_train_op, self.network.actor_train_op ], feed_dict={ self.network.state_ph: np.asarray([elem[0] for elem in minibatch]), self.network.action_ph: np.asarray([elem[1] for elem in minibatch]), self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]), self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]), self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]) }) # update target networks _ = self.sess.run(self.network.update_slow_targets_op) s = s_ episode_step += 1 self.total_steps += 1 if ep % parameters.DISP_EP_REWARD_FREQ == 0: print( 'Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f' % (ep, episode_reward, episode_step, noise_scale)) DISPLAYER.add_reward(episode_reward) def play(self, number_run): print("Playing for", number_run, "runs") self.env.set_render(True) try: for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False while not done: a, = self.sess.run( self.network.actions, feed_dict={self.network.state_ph: s[None]}) s, r, done, info = self.env.act(a) episode_reward += r print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass except Exception as e: print("Exception :", e) finally: self.env.set_render(False) print("End of the demo") self.env.close() def close(self): self.env.close()
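Each training step above also runs network.update_slow_targets_op. Its internals are not shown in this file; one common way such an op is built for DDPG is a Polyak (soft) update of the target weights, sketched below under that assumption. The tau value and the variable lists are hypothetical.

import tensorflow as tf

def build_soft_update_op(online_vars, target_vars, tau=1e-3):
    """TF1-style sketch of a slow target update:
    target <- tau * online + (1 - tau) * target for each weight tensor."""
    updates = []
    for src, tgt in zip(online_vars, target_vars):
        updates.append(tgt.assign(tau * src + (1.0 - tau) * tgt))
    # Group all assignments into a single op that can be run each train step
    return tf.group(*updates)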
class Agent:

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input

        self.local_network = Network(thread_index, device)
        self.local_network.build_loss()

        with tf.device(device):
            local_var_refs = [v._ref() for v in self.local_network.get_vars()]
            self.gradients = tf.gradients(self.local_network.total_loss,
                                          local_var_refs,
                                          gate_gradients=False,
                                          aggregation_method=None,
                                          colocate_gradients_with_ops=False)

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)

        self.update_network = self.local_network.copy_network(global_network)

        self.env = Environment(thread_index == 1)
        self.state = self.env.reset()

        self.worker_total_steps = 0
        self.worker_total_eps = 0
        self.start_time = time.time()

        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (MAX_TIME_STEP - global_time_step) / MAX_TIME_STEP
        return max(learning_rate, 0)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, total_steps):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, total_steps)
        summary_writer.flush()

    def process(self, sess, total_steps, summary_writer, summary_op,
                score_input):

        start_time = time.time()
        buffer = []
        done = False
        episode_step = 0

        # Copy weights from the global network to the local one
        sess.run(self.update_network)

        start_lstm_state = self.local_network.lstm_state_out

        for i in range(UPDATE_FREQ):
            pi, value = self.local_network.run_policy_and_value(sess,
                                                                self.state)
            a = np.random.choice(ACTION_SIZE, p=pi)

            s_, r, terminal, _ = self.env.act(a)
            self.episode_reward += r

            # Clip reward
            r = np.clip(r, -1, 1)
            buffer.append((self.state, a, r, value))

            episode_step += 1
            self.worker_total_steps += 1
            self.state = s_

            if terminal:
                done = True
                self.worker_total_eps += 1
                DISPLAYER.add_reward(self.episode_reward, self.thread_index)

                if (self.thread_index == 1 and
                        self.worker_total_eps % DISP_REWARD_FREQ == 0):
                    cur_learning_rate = self._anneal_learning_rate(total_steps)
                    print('Episode %i, Reward %i, Steps %i, LR %g' %
                          (self.worker_total_eps, self.episode_reward,
                           episode_step, cur_learning_rate))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward,
                                   total_steps)

                self.episode_reward = 0
                self.env.reset()
                self.local_network.reset_state()

                render = (DISPLAY and self.thread_index == 1 and
                          (self.worker_total_eps - 1) % RENDER_FREQ == 0)
                self.env.set_render(render)
                break

        batch_s = deque()
        batch_a = deque()
        batch_td = deque()
        batch_R = deque()

        # Bootstrapping
        R = 0.0
        if not done:
            R = self.local_network.run_value(sess, self.state)

        # Compute and accumulate gradients, walking the rollout backwards
        for i in range(len(buffer) - 1, -1, -1):
            si, ai, ri, Vi = buffer[i]

            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_s.appendleft(si)
            batch_a.appendleft(a)
            batch_td.appendleft(td)
            batch_R.appendleft(R)

        cur_learning_rate = self._anneal_learning_rate(total_steps)

        feed_dict = {self.local_network.state: batch_s,
                     self.local_network.action: batch_a,
                     self.local_network.td_error: batch_td,
                     self.local_network.reward: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                     self.learning_rate_input: cur_learning_rate}

        sess.run(self.apply_gradients, feed_dict=feed_dict)

        if done and self.thread_index == 1 and \
                (self.worker_total_eps % PERF_FREQ == 0 or
                 self.worker_total_eps == 15):
            global_time = time.time() - self.start_time
            steps_per_sec = total_steps / global_time
            print("### Performance : {} STEPS in {:.0f} sec. "
                  "{:.0f} STEPS/sec. {:.2f}M STEPS/hour ###".format(
                      total_steps, global_time, steps_per_sec,
                      steps_per_sec * 3600 / 1000000.))

        elapsed_time = time.time() - start_time
        return elapsed_time, done, episode_step

    def close(self):
        self.env.close()
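The gradient-accumulation pass above walks the rollout backwards, bootstrapping R from the value of the last state when the episode is not finished. The same return and TD-error computation is shown below as a pure-Python helper; the function name and signature are illustrative only.

def discounted_returns_and_td(rollout, bootstrap_value, gamma=0.99):
    """Replay the backward pass of process() on a finished rollout.
    rollout is a list of (state, action, reward, value) tuples."""
    R = bootstrap_value  # 0 for terminal states, V(s_T) otherwise
    returns, td_errors = [], []
    for (_, _, r, v) in reversed(rollout):
        R = r + gamma * R
        returns.append(R)
        td_errors.append(R - v)
    # Restore chronological order
    returns.reverse()
    td_errors.reverse()
    return returns, td_errors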
class Agent: def __init__(self, worker_index, sess, render=False, master=False): self.worker_index = worker_index if master: self.name = 'global' else: print("Initialization of the agent", str(worker_index)) self.name = 'Worker_' + str(worker_index) self.env = Environment() self.state_size = self.env.get_state_size() self.action_size = self.env.get_action_size() self.low_bound, self.high_bound = self.env.get_bounds() self.network = Network(self.state_size, self.action_size, self.name) self.update_local_vars = update_target_graph('global', self.name) self.starting_time = 0 self.epsilon = settings.EPSILON_START if self.name != 'global': self.summary_writer = tf.summary.FileWriter( "results/" + self.name, sess.graph) def save(self, episode_step): # Save model SAVER.save(episode_step) # Save summary statistics summary = tf.Summary() summary.value.add(tag='Perf/Reward', simple_value=np.mean(self.rewards_plus)) summary.value.add(tag='Perf/Value', simple_value=np.mean(self.next_values)) summary.value.add(tag='Losses/Value', simple_value=self.value_loss) summary.value.add(tag='Losses/Policy', simple_value=self.policy_loss) summary.value.add(tag='Losses/Entropy', simple_value=self.entropy) summary.value.add(tag='Losses/Grad Norm', simple_value=self.grad_norm) self.summary_writer.add_summary(summary, self.nb_ep) self.summary_writer.flush() def train(self, sess, bootstrap_value): # Add the bootstrap value to our experience self.rewards_plus = np.asarray(self.rewards_buffer + [bootstrap_value]) discounted_reward = discount(self.rewards_plus, settings.DISCOUNT)[:-1] self.next_values = np.asarray(self.values_buffer[1:] + [bootstrap_value]) advantages = self.rewards_buffer + \ settings.DISCOUNT * self.next_values - \ self.values_buffer advantages = discount(advantages, settings.GENERALIZED_LAMBDA * settings.DISCOUNT) # Update the global network feed_dict = { self.network.discounted_reward: discounted_reward, self.network.inputs: self.states_buffer, self.network.actions: self.actions_buffer, self.network.advantages: advantages } losses = sess.run([ self.network.value_loss, self.network.policy_loss, self.network.entropy, self.network.grad_norm, self.network.apply_grads ], feed_dict=feed_dict) # Get the losses for tensorboard self.value_loss, self.policy_loss, self.entropy = losses[:3] self.grad_norm, _ = losses[3:] # Reinitialize buffers and variables self.states_buffer = [] self.actions_buffer = [] self.rewards_buffer = [] self.values_buffer = [] def work(self, sess, coord): print("Running", self.name, end='\n\n') self.starting_time = time() self.nb_ep = 1 nearlyDone = 0 with sess.as_default(), sess.graph.as_default(): with coord.stop_on_exception(): while not coord.should_stop(): self.states_buffer = [] self.actions_buffer = [] self.rewards_buffer = [] self.values_buffer = [] self.mean_values_buffer = [] self.total_steps = 0 episode_reward = 0 episode_step = 0 # Reset the local network to the global sess.run(self.update_local_vars) mean = 45 * TORAD std = 0 * TORAD wind_samples = 10 w = wind(mean=mean, std=std, samples=wind_samples) WH = w.generateWind() hdg0_rand = random.uniform(5, 12) hdg0 = hdg0_rand * TORAD * np.ones(10) s = self.env.reset(hdg0, WH) done = False #if self.worker_index == 1 and render and settings.DISPLAY: # self.env.set_render(True) #self.lstm_state = self.network.lstm_state_init #self.initial_lstm_state = self.lstm_state while not coord.should_stop() and not done and \ episode_step < settings.MAX_EPISODE_STEP: WH = np.random.uniform(mean - std, mean + std, size=wind_samples) s = 
np.reshape([s[0, :], s[1, :]], [2 * self.state_size, 1]) # Prediction of the policy and the value feed_dict = {self.network.inputs: [s]} policy, value = sess.run( [self.network.policy, self.network.value], feed_dict=feed_dict) policy, value = policy[0], value[0][0] if random.random() < self.epsilon: action = random.choice([1.5, 0, -1.5]) else: # Choose an action according to the policy action = np.random.choice([1.5, 0, -1.5], p=policy) s_, v = self.env.act(action, WH) #reward assignation algorithm if episode_step == 1: r = 0 elif s[int(self.state_size / 2 - 2)] > ( 13 * TORAD) and s[int(self.state_size / 2 - 2)] < ( 15 * TORAD ) and v > 0.63 and v < 0.67 and action < 0: r = 0.5 else: if v <= 0.69: r = 0 nearlyDone = 0 elif v > 0.69 and v <= 0.75: r = 0.00001 nearlyDone = 0 elif v > 0.75 and v <= 0.8: r = 0.01 nearlyDone = 0 elif v > 0.80: r = 0.1 if nearlyDone >= 3: r = 1 done = True elif nearlyDone == 2: r = 0.8 elif nearlyDone == 1: r = 0.25 nearlyDone = nearlyDone + 1 else: r = 0 nearlyDone = False #s_ = np.reshape(s_, [2*self.state_size,1]) # Store the experience self.states_buffer.append(s) self.actions_buffer.append(action) self.rewards_buffer.append(r) self.values_buffer.append(value) self.mean_values_buffer.append(value) episode_reward += r s = s_ episode_step += 1 self.total_steps += 1 # If we have more than MAX_LEN_BUFFER experiences, we # apply the gradients and update the global network, # then we empty the episode buffers if len(self.states_buffer) == settings.MAX_LEN_BUFFER \ and not done: feed_dict = { self.network.inputs: [ np.reshape([s[0, :], s[1, :]], [2 * self.state_size, 1]) ] } bootstrap_value = sess.run(self.network.value, feed_dict=feed_dict) self.train(sess, bootstrap_value ) #with this we change global network sess.run(self.update_local_vars) #self.initial_lstm_state = self.lstm_state if len(self.states_buffer) != 0: if done: bootstrap_value = 0 else: feed_dict = { self.network.inputs: [ np.reshape([s[0, :], s[1, :]], [2 * self.state_size, 1]) ] } bootstrap_value = sess.run(self.network.value, feed_dict=feed_dict) self.train(sess, bootstrap_value) if self.epsilon > settings.EPSILON_STOP: self.epsilon -= settings.EPSILON_DECAY self.nb_ep += 1 if not coord.should_stop(): DISPLAYER.add_reward(episode_reward, self.worker_index) if (self.worker_index == 1 and self.nb_ep % settings.DISP_EP_REWARD_FREQ == 0): print( 'Episode %2i, Initial hdg: %2i, Reward: %7.3f, Steps: %i, ' 'Epsilon: %7.3f' % (self.nb_ep, hdg0_rand, episode_reward, episode_step, self.epsilon)) print("Policy: ", policy) if (self.worker_index == 1 and self.nb_ep % settings.SAVE_FREQ == 0): self.save(self.total_steps) if time() - self.starting_time > settings.LIMIT_RUN_TIME: coord.request_stop() self.summary_writer.close() def play(self, sess, number_run, path=''): print("Playing", self.name, "for", number_run, "runs") with sess.as_default(), sess.graph.as_default(): hdg0_rand_vec = [0, 7, 13] ''' WIND CONDITIONS ''' mean = 45 * TORAD std = 0 * TORAD wind_samples = 10 w = wind(mean=mean, std=std, samples=wind_samples) try: for i in range(number_run): # Reset the local network to the global if self.name != 'global': sess.run(self.update_local_vars) WH = w.generateWind() hdg0_rand = hdg0_rand_vec[i] hdg0 = hdg0_rand * TORAD * np.ones(10) s = self.env.reset(hdg0, WH) episode_reward = 0 episode_step = 0 v_episode = [] i_episode = [] done = False #self.lstm_state = self.network.lstm_state_init while (not done and episode_step < 70): i_episode.append(round(s[0][-1] / TORAD)) s = np.reshape([s[0, :], s[1, :]], 
[2 * self.state_size, 1]) # Prediction of the policy feed_dict = {self.network.inputs: [s]} policy, value = sess.run( [self.network.policy, self.network.value], feed_dict=feed_dict) policy = policy[0] # Choose an action according to the policy action = np.random.choice([1.5, 0, -1.5], p=policy) s_, r = self.env.act(action, WH) if episode_step > 12: if np.mean(v_episode[-4:]) > 0.8: #done=True print("Done!") else: done = False episode_reward += r v_episode.append(r) episode_step += 1 s = s_ DISPLAYER.displayVI(v_episode, i_episode, i) print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass finally: print("End of the demo")
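The train() method of the worker above relies on a discount() helper to compute discounted rewards and the GAE advantages. A common implementation of such a helper uses scipy.signal.lfilter, sketched below; whether the repository uses exactly this trick is an assumption.

import numpy as np
import scipy.signal

def discount(x, gamma):
    """Discounted cumulative sum along the time axis:
    y[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ..."""
    x = np.asarray(x, dtype=np.float64)
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

# In train() above, the advantages fed to the network are then
# discount(r + DISCOUNT * V_next - V, GENERALIZED_LAMBDA * DISCOUNT),
# i.e. the generalized advantage estimate over the TD residuals.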
class Agent: def __init__(self, sess): print("Initializing the agent...") self.sess = sess self.env = Environment() self.state_size = self.env.get_state_size()[0] self.action_size = self.env.get_action_size() self.low_bound, self.high_bound = self.env.get_bounds() self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE, parameters.ALPHA) print("Creation of the actor-critic network...") self.network = Network(self.state_size, self.action_size, self.low_bound, self.high_bound) print("Network created !\n") self.epsilon = parameters.EPSILON_START self.beta = parameters.BETA_START self.best_run = -1e10 self.sess.run(tf.global_variables_initializer()) def run(self): self.nb_ep = 1 self.total_steps = 0 for self.nb_ep in range(1, parameters.TRAINING_STEPS + 1): episode_reward = 0 episode_step = 0 done = False memory = deque() # Initial state s = self.env.reset() max_steps = parameters.MAX_EPISODE_STEPS + self.nb_ep // parameters.EP_ELONGATION while episode_step < max_steps and not done: if random.random() < self.epsilon: a = self.env.random() else: # choose action based on deterministic policy a, = self.sess.run(self.network.actions, feed_dict={self.network.state_ph: [s]}) # Decay epsilon if self.epsilon > parameters.EPSILON_STOP: self.epsilon -= parameters.EPSILON_DECAY s_, r, done, info = self.env.act(a) memory.append((s, a, r, s_, 0.0 if done else 1.0)) if len(memory) > parameters.N_STEP_RETURN: s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft() discount_R = 0 for i, (si, ai, ri, s_i, di) in enumerate(memory): discount_R += ri * parameters.DISCOUNT**(i + 1) self.buffer.add(s_mem, a_mem, discount_R, s_, done) # update network weights to fit a minibatch of experience if self.total_steps % parameters.TRAINING_FREQ == 0 and \ len(self.buffer) >= parameters.BATCH_SIZE: minibatch = self.buffer.sample(parameters.BATCH_SIZE, self.beta) if self.beta <= parameters.BETA_STOP: self.beta += parameters.BETA_INCR td_errors, _, _ = self.sess.run( [ self.network.td_errors, self.network.critic_train_op, self.network.actor_train_op ], feed_dict={ self.network.state_ph: minibatch[0], self.network.action_ph: minibatch[1], self.network.reward_ph: minibatch[2], self.network.next_state_ph: minibatch[3], self.network.is_not_terminal_ph: minibatch[4] }) self.buffer.update_priorities(minibatch[6], td_errors + 1e-6) # update target networks _ = self.sess.run(self.network.update_slow_targets_op) episode_reward += r s = s_ episode_step += 1 self.total_steps += 1 self.nb_ep += 1 if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0: print( 'Episode %2i, Reward: %7.3f, Steps: %i, Epsilon : %7.3f, Max steps : %i' % (self.nb_ep, episode_reward, episode_step, self.epsilon, max_steps)) DISPLAYER.add_reward(episode_reward) if episode_reward > self.best_run and self.nb_ep > 100: self.best_run = episode_reward print("Best agent ! ", episode_reward) SAVER.save('best') if self.nb_ep % parameters.SAVE_FREQ == 0: SAVER.save(self.nb_ep) def play(self, number_run): print("Playing for", number_run, "runs") try: for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False while not done: a, = self.sess.run(self.network.actions, feed_dict={self.network.state_ph: [s]}) s_, r, done, info = self.env.act(a) episode_reward += r print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass except Exception as e: print("Exception :", e) finally: print("End of the demo") self.env.close() def close(self): self.env.close()
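The agent above samples from a PrioritizedReplayBuffer with a beta that is annealed toward BETA_STOP during training. The buffer's API is not shown in this file, so the sketch below only illustrates the standard prioritized-replay importance-sampling weights that such a beta controls; the names are hypothetical.

import numpy as np

def importance_weights(sample_probs, buffer_size, beta):
    """Standard PER importance-sampling weights:
    w_i = (N * P(i))^(-beta), normalized by the largest weight."""
    weights = (buffer_size * np.asarray(sample_probs)) ** (-beta)
    return weights / weights.max()

# beta is typically annealed toward 1, e.g.
# beta = min(BETA_STOP, beta + BETA_INCR)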
class Agent: """ This class builds an agent with its own QNetwork, memory buffer and environment to learn a policy. """ def __init__(self, sess, gui, displayer, saver): """ Build a new instance of Environment and QNetwork. Args: sess : the tensorflow session in which to build the network gui : a GUI instance to manage the control of the agent displayer: a Displayer instance to keep track of the episode rewards saver : a Saver instance to save periodically the network """ print("Initializing the agent...") self.sess = sess self.gui = gui self.displayer = displayer self.saver = saver self.env = Environment() self.QNetwork = QNetwork(sess) self.buffer = PrioritizedReplayBuffer(Settings.BUFFER_SIZE, Settings.ALPHA) self.epsilon = Settings.EPSILON_START self.beta = Settings.BETA_START self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1) self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS) self.best_run = -1e10 self.n_gif = 0 print("Agent initialized !\n") def pre_train(self): """ Method to run a random agent in the environment to fill the memory buffer. """ print("Beginning of the pre-training...") for i in range(Settings.PRE_TRAIN_EPS): s = self.env.reset() done = False episode_reward = 0 episode_step = 0 while episode_step < Settings.MAX_EPISODE_STEPS and not done: a = self.env.act_random() s_, r, done, info = self.env.act(a) self.buffer.add(s, a, r, s_, 1 if not done else 0) s = s_ episode_reward += r episode_step += 1 if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0: print("Pre-train step n", i) # Set the best score to at least the max score the random agent got self.best_run = max(self.best_run, episode_reward) print("End of the pre training !") def save_best(self, episode_reward): self.best_run = episode_reward print("Save best", episode_reward) self.saver.save('best') # self.play(1, 'best') def run(self): """ Method to run the agent in the environment to collect experiences and learn on these experiences by gradient descent. """ print("Beginning of the run...") self.pre_train() self.QNetwork.init_target() self.total_steps = 0 self.nb_ep = 1 while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP: s = self.env.reset() episode_reward = 0 done = False memory = deque() episode_step = 1 # The more episodes the agent performs, the longer they are max_step = Settings.MAX_EPISODE_STEPS if Settings.EP_ELONGATION > 0: max_step += self.nb_ep // Settings.EP_ELONGATION # Render settings self.env.set_render(self.gui.render.get(self.nb_ep)) self.env.set_gif(self.gui.gif.get(self.nb_ep)) plot_distrib = self.gui.plot_distrib.get(self.nb_ep) while episode_step <= max_step and not done: # Exploration by epsilon-greedy policy if random.random() < self.epsilon: a = self.env.act_random() else: Qdistrib = self.QNetwork.act(s) Qvalue = np.sum(self.z * Qdistrib, axis=1) a = np.argmax(Qvalue, axis=0) if plot_distrib: self.displayer.disp_distrib(self.z, self.delta_z, Qdistrib, Qvalue) s_, r, done, info = self.env.act(a) episode_reward += r memory.append((s, a, r)) # Keep the experience in memory until 'N_STEP_RETURN' steps has # passed to get the delayed return r_1 + ... 
+ gamma^n r_n if len(memory) > Settings.N_STEP_RETURN: s_mem, a_mem, discount_R = memory.popleft() for i, (si, ai, ri) in enumerate(memory): discount_R += ri * Settings.DISCOUNT**(i + 1) self.buffer.add(s_mem, a_mem, discount_R, s_, 1 if not done else 0) if episode_step % Settings.TRAINING_FREQ == 0: batch = self.buffer.sample(Settings.BATCH_SIZE, self.beta) loss = self.QNetwork.train(batch) self.buffer.update_priorities(batch[6], loss) self.QNetwork.update_target() s = s_ episode_step += 1 self.total_steps += 1 # Decay epsilon if self.epsilon > Settings.EPSILON_STOP: self.epsilon -= Settings.EPSILON_DECAY self.QNetwork.decrease_lr() self.displayer.add_reward(episode_reward, plot=self.gui.plot.get(self.nb_ep)) # if episode_reward > self.best_run: # self.save_best(episode_reward) # Episode display if self.gui.ep_reward.get(self.nb_ep): print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %i' ', Max steps: %i' % (self.nb_ep, episode_reward, episode_step, self.epsilon, max_step)) # Save the model if self.gui.save.get(self.nb_ep): self.saver.save(self.nb_ep) self.nb_ep += 1 self.env.close() def play(self, number_run, name=None): """ Method to evaluate the policy without exploration. Args: number_run: the number of episodes to perform name : the name of the gif that will be saved """ print("Playing for", number_run, "runs") try: for i in range(number_run): s = self.env.reset() episode_reward = 0 done = False self.env.set_gif(True, name) while not done: Qdistrib = self.QNetwork.act(s) Qvalue = np.sum(self.z * Qdistrib, axis=1) a = np.argmax(Qvalue, axis=0) s, r, done, info = self.env.act(a) episode_reward += r print("Episode reward :", episode_reward) except KeyboardInterrupt as e: pass except Exception as e: print("Exception :", e) finally: print("End of the demo") self.env.close() def stop(self): self.env.close()
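In the exploitation branch above, the categorical value distribution returned by the QNetwork is collapsed into expected Q-values over the support z before the argmax is taken. A minimal NumPy version of that step follows; the support bounds mirror Settings.MIN_Q, MAX_Q and NB_ATOMS but are assumed values.

import numpy as np

# Hypothetical support mirroring Settings.MIN_Q / MAX_Q / NB_ATOMS
MIN_Q, MAX_Q, NB_ATOMS = -10.0, 10.0, 51
z = np.linspace(MIN_Q, MAX_Q, NB_ATOMS)

def greedy_action(q_distrib):
    """q_distrib has shape (n_actions, NB_ATOMS) and each row sums to 1.
    The expected Q-value of an action is the mean of its distribution."""
    q_values = np.sum(z * q_distrib, axis=1)
    return np.argmax(q_values, axis=0)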