def save(self, episode_step):
    # Save model
    SAVER.save(episode_step)

    # Save summary statistics
    summary = tf.Summary()
    summary.value.add(tag='Perf/Reward',
                      simple_value=np.mean(self.rewards_plus))
    summary.value.add(tag='Perf/Value',
                      simple_value=np.mean(self.next_values))
    summary.value.add(tag='Losses/Value', simple_value=self.value_loss)
    summary.value.add(tag='Losses/Policy', simple_value=self.policy_loss)
    summary.value.add(tag='Losses/Entropy', simple_value=self.entropy)
    summary.value.add(tag='Losses/Grad Norm', simple_value=self.grad_norm)
    self.summary_writer.add_summary(summary, self.nb_ep)
    self.summary_writer.flush()
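# The writer used by save() above is not defined in this snippet; in TF1 it
# is typically a tf.summary.FileWriter. A minimal sketch, assuming a
# hypothetical 'results/tensorboard' log directory:
import tensorflow as tf

summary_writer = tf.summary.FileWriter('results/tensorboard',
                                       graph=tf.get_default_graph())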
def run(self):
    self.nb_ep = 1
    self.total_steps = 0

    for self.nb_ep in range(1, parameters.TRAINING_STEPS + 1):

        episode_reward = 0
        episode_step = 0
        done = False
        memory = deque()

        # Initial state
        s = self.env.reset()
        max_steps = parameters.MAX_EPISODE_STEPS + \
            self.nb_ep // parameters.EP_ELONGATION

        while episode_step < max_steps and not done:

            if random.random() < self.epsilon:
                a = self.env.random()
            else:
                # Choose action based on the deterministic policy
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: [s]})

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= parameters.EPSILON_DECAY

            s_, r, done, info = self.env.act(a)
            memory.append((s, a, r, s_, 0.0 if done else 1.0))

            if len(memory) > parameters.N_STEP_RETURN:
                s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft()
                # N-step discounted return, starting from the popped reward
                discount_R = r_mem
                for i, (si, ai, ri, s_i, di) in enumerate(memory):
                    discount_R += ri * parameters.DISCOUNT**(i + 1)
                self.buffer.add(s_mem, a_mem, discount_R, s_,
                                0.0 if done else 1.0)

            # Update network weights to fit a minibatch of experience
            if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                    len(self.buffer) >= parameters.BATCH_SIZE:

                minibatch = self.buffer.sample(parameters.BATCH_SIZE,
                                               self.beta)
                if self.beta <= parameters.BETA_STOP:
                    self.beta += parameters.BETA_INCR

                td_errors, _, _ = self.sess.run(
                    [self.network.td_errors,
                     self.network.critic_train_op,
                     self.network.actor_train_op],
                    feed_dict={
                        self.network.state_ph: minibatch[0],
                        self.network.action_ph: minibatch[1],
                        self.network.reward_ph: minibatch[2],
                        self.network.next_state_ph: minibatch[3],
                        self.network.is_not_terminal_ph: minibatch[4]})

                self.buffer.update_priorities(minibatch[6],
                                              td_errors + 1e-6)

                # Update target networks
                _ = self.sess.run(self.network.update_slow_targets_op)

            episode_reward += r
            s = s_
            episode_step += 1
            self.total_steps += 1

        if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
            print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f,'
                  ' Max steps: %i' %
                  (self.nb_ep, episode_reward, episode_step,
                   self.epsilon, max_steps))

        DISPLAYER.add_reward(episode_reward)
        if episode_reward > self.best_run and self.nb_ep > 100:
            self.best_run = episode_reward
            print("Best agent! ", episode_reward)
            SAVER.save('best')

        if self.nb_ep % parameters.SAVE_FREQ == 0:
            SAVER.save(self.nb_ep)
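# The prioritized buffer used by run() above is not shown. The sketch below
# is only an assumption about the interface it exposes (add, __len__,
# sample(batch_size, beta) returning a batch whose slot 6 holds the sampled
# indices, and update_priorities); the class name and internals are
# hypothetical, not the repository's actual implementation.
import numpy as np


class PrioritizedReplayBuffer:

    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha          # how strongly priorities bias sampling
        self.data = []
        self.priorities = []

    def add(self, s, a, r, s_, is_not_terminal):
        # New transitions get the current maximum priority so that they are
        # sampled at least once before their TD error is known
        max_prio = max(self.priorities) if self.priorities else 1.0
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append((s, a, r, s_, is_not_terminal))
        self.priorities.append(max_prio)

    def __len__(self):
        return len(self.data)

    def sample(self, batch_size, beta):
        prios = np.asarray(self.priorities) ** self.alpha
        probs = prios / prios.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        # Importance-sampling weights, normalized to at most 1
        weights = (len(self.data) * probs[idx]) ** (-beta)
        weights /= weights.max()
        batch = [self.data[i] for i in idx]
        s, a, r, s_, is_not_terminal = map(np.asarray, zip(*batch))
        return [s, a, r, s_, is_not_terminal, weights, idx]

    def update_priorities(self, idx, new_priorities):
        for i, p in zip(idx, new_priorities):
            self.priorities[i] = abs(float(p))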
def run(self):
    self.total_steps = 1
    self.sess.run(self.network.target_init)
    self.z = self.sess.run(self.network.z)
    self.delta_z = self.network.delta_z

    ep = 1
    while ep < settings.TRAINING_EPS + 1 and not GUI.STOP:

        # Initial state
        s = self.env.reset()
        episode_reward = 0
        episode_step = 0
        done = False
        memory = deque()

        # Initialize exploration noise process
        noise_scale = settings.NOISE_SCALE * settings.NOISE_DECAY**ep

        # Render, gif and plot settings for this episode
        self.env.set_render(GUI.render.get(ep))
        self.env.set_gif(GUI.gif.get(ep))
        plot_distrib = GUI.plot_distrib.get(ep)

        max_eps = settings.MAX_EPISODE_STEPS + (ep // 50)
        while episode_step < max_eps and not done:

            noise = np.random.normal(size=self.action_size)
            scaled_noise = noise_scale * noise
            a = np.clip(self.predict_action(s, plot_distrib) + scaled_noise,
                        *self.bounds)

            s_, r, done, info = self.env.act(a)
            episode_reward += r

            memory.append((s, a, r, s_, 0 if done else 1))

            if len(memory) >= settings.N_STEP_RETURN:
                s_mem, a_mem, discount_r, ss_mem, done_mem = memory.popleft()
                # Accumulate the N-step discounted return on top of the
                # popped transition's reward
                for i, (si, ai, ri, s_i, di) in enumerate(memory):
                    discount_r += ri * settings.DISCOUNT**(i + 1)
                BUFFER.add(s_mem, a_mem, discount_r, s_, 0 if done else 1)

            if len(BUFFER) > 0 and \
                    self.total_steps % settings.TRAINING_FREQ == 0:
                self.network.train(BUFFER.sample(),
                                   self.critic_lr, self.actor_lr)

            s = s_
            episode_step += 1
            self.total_steps += 1

            self.critic_lr -= self.delta_critic_lr
            self.actor_lr -= self.delta_actor_lr

        # Plot reward
        plot = GUI.plot.get(ep)
        DISPLAYER.add_reward(episode_reward, plot)

        # Print episode reward
        if GUI.ep_reward.get(ep):
            print('Episode %2i, Reward: %7.3f, Steps: %i,'
                  ' Final noise scale: %7.3f, Critic LR: %f, Actor LR: %f' %
                  (ep, episode_reward, episode_step, noise_scale,
                   self.critic_lr, self.actor_lr))

        # Save the model
        if GUI.save.get(ep):
            SAVER.save(ep)

        ep += 1
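# self.z and self.delta_z above suggest a categorical (C51 / D4PG-style)
# distributional critic. A minimal sketch of how such a support is usually
# built; MIN_Q, MAX_Q and NB_ATOMS are assumed values, not settings taken
# from this repository.
import tensorflow as tf

MIN_Q, MAX_Q, NB_ATOMS = -10.0, 10.0, 51
delta_z = (MAX_Q - MIN_Q) / (NB_ATOMS - 1)   # spacing between two atoms
z = tf.lin_space(MIN_Q, MAX_Q, NB_ATOMS)     # support z_i = MIN_Q + i * delta_z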
def run(self):
    print("Beginning of the run...")

    self.pre_train()

    self.total_steps = 0
    self.nb_ep = 1

    while self.nb_ep < parameters.TRAINING_STEPS:

        self.learning_rate = self.initial_learning_rate * \
            (parameters.TRAINING_STEPS - self.nb_ep) / \
            parameters.TRAINING_STEPS

        s = self.env.reset()
        episode_reward = 0
        done = False
        memory = deque()
        discount_R = 0

        episode_step = 0
        max_step = parameters.MAX_EPISODE_STEPS + \
            self.nb_ep // parameters.EP_ELONGATION

        # Render parameters
        self.env.set_render(self.nb_ep % parameters.RENDER_FREQ == 0)

        while episode_step < max_step and not done:

            if random.random() < self.epsilon:
                a = random.randint(0, self.action_size - 1)
            else:
                a = self.sess.run(self.mainQNetwork.predict,
                                  feed_dict={self.mainQNetwork.inputs: [s]})
                a = a[0]

            s_, r, done, info = self.env.act(a)
            episode_reward += r

            memory.append((s, a, r, s_, done))

            if len(memory) > parameters.N_STEP_RETURN:
                s_mem, a_mem, r_mem, ss_mem, done_mem = memory.popleft()
                # N-step discounted return, starting from the popped reward
                discount_R = r_mem
                for i, (si, ai, ri, s_i, di) in enumerate(memory):
                    discount_R += ri * parameters.DISCOUNT ** (i + 1)
                self.buffer.add(s_mem, a_mem, discount_R, s_, done)

            if episode_step % parameters.TRAINING_FREQ == 0:

                train_batch = self.buffer.sample(parameters.BATCH_SIZE,
                                                 self.beta)
                # Increase beta
                if self.beta <= parameters.BETA_STOP:
                    self.beta += parameters.BETA_INCR

                # Q-values of the taken actions under the main network
                feed_dict = {self.mainQNetwork.inputs: train_batch[0]}
                oldQvalues = self.sess.run(self.mainQNetwork.Qvalues,
                                           feed_dict=feed_dict)
                oldQvalues = [oldQvalue[train_batch[1][i]]
                              for i, oldQvalue in enumerate(oldQvalues)]

                # Double DQN: the main network chooses the next action,
                # the target network evaluates it
                feed_dict = {self.mainQNetwork.inputs: train_batch[3]}
                mainQaction = self.sess.run(self.mainQNetwork.predict,
                                            feed_dict=feed_dict)

                feed_dict = {self.targetQNetwork.inputs: train_batch[3]}
                targetQvalues = self.sess.run(self.targetQNetwork.Qvalues,
                                              feed_dict=feed_dict)

                # Done multiplier:
                # equals 0 if the episode was done, 1 otherwise
                done_multiplier = (1 - train_batch[4])
                doubleQ = targetQvalues[range(parameters.BATCH_SIZE),
                                        mainQaction]
                targetQvalues = train_batch[2] + \
                    parameters.DISCOUNT * doubleQ * done_multiplier

                errors = np.square(targetQvalues - oldQvalues) + 1e-6
                self.buffer.update_priorities(train_batch[6], errors)

                feed_dict = {self.mainQNetwork.inputs: train_batch[0],
                             self.mainQNetwork.Qtarget: targetQvalues,
                             self.mainQNetwork.actions: train_batch[1],
                             self.mainQNetwork.learning_rate:
                                 self.learning_rate}
                _ = self.sess.run(self.mainQNetwork.train,
                                  feed_dict=feed_dict)

                update_target(self.update_target_ops, self.sess)

            s = s_
            episode_step += 1
            self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= parameters.EPSILON_DECAY

        DISPLAYER.add_reward(episode_reward)
        # if episode_reward > self.best_run and self.nb_ep > 50:
        #     self.best_run = episode_reward
        #     print("Save best", episode_reward)
        #     SAVER.save('best')
        #     self.play(1)

        if self.nb_ep % parameters.DISP_EP_REWARD_FREQ == 0:
            print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %.3f'
                  ', Max steps: %i, Learning rate: %g' % (
                      self.nb_ep, episode_reward, episode_step,
                      self.epsilon, max_step, self.learning_rate))

        # Save the model
        if self.nb_ep % parameters.SAVE_FREQ == 0:
            SAVER.save(self.nb_ep)

        self.nb_ep += 1
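# update_target() and self.update_target_ops are defined elsewhere. A common
# TF1 pattern is a Polyak (soft) update of the target network toward the main
# network; the sketch below assumes that pattern, and the builder name and
# tau value are hypothetical.
import tensorflow as tf


def build_update_target_ops(main_vars, target_vars, tau=0.001):
    # One assign op per target variable, moving it a fraction tau toward the
    # corresponding main-network variable
    return [t_var.assign(tau * m_var + (1.0 - tau) * t_var)
            for m_var, t_var in zip(main_vars, target_vars)]


def update_target(update_target_ops, sess):
    # Apply every assign op in a single session call
    sess.run(update_target_ops)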
import tensorflow as tf

from Agent import Agent  # module name assumed
from Displayer import DISPLAYER
from Saver import SAVER

import parameters


if __name__ == '__main__':

    tf.reset_default_graph()

    with tf.Session() as sess:

        agent = Agent(sess)
        SAVER.set_sess(sess)
        SAVER.load(agent)

        print("Beginning of the run")
        try:
            agent.run()
        except KeyboardInterrupt:
            pass
        print("End of the run")

        SAVER.save(agent.total_steps)
        DISPLAYER.disp()

        # agent.play(10)
        # agent.play(3, "results/gif/".format(parameters.ENV))

        agent.close()
def run(self):
    self.total_steps = 0

    for ep in range(1, parameters.TRAINING_STEPS + 1):

        episode_reward = 0
        episode_step = 0
        done = False

        # Initial state
        s = self.env.reset()
        self.env.set_render(ep % 1000 == 0)
        gif = (ep % 1500 == 0)
        step_allonge = ep // 1000

        while episode_step < parameters.MAX_EPISODE_STEPS + step_allonge \
                and not done:

            if random.random() < self.epsilon:
                a = self.env.random()
            else:
                # Choose action based on the deterministic policy
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: [s]})

            s_, r, done, info = self.env.act(a, gif)
            episode_reward += r

            self.buffer.add((s, a, r, s_, 0.0 if done else 1.0))

            # Update network weights to fit a minibatch of experience
            if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                    len(self.buffer) >= parameters.BATCH_SIZE:

                minibatch = self.buffer.sample()

                _, _ = self.sess.run(
                    [self.network.critic_train_op,
                     self.network.actor_train_op],
                    feed_dict={
                        self.network.state_ph:
                            np.asarray([elem[0] for elem in minibatch]),
                        self.network.action_ph:
                            np.asarray([elem[1] for elem in minibatch]),
                        self.network.reward_ph:
                            np.asarray([elem[2] for elem in minibatch]),
                        self.network.next_state_ph:
                            np.asarray([elem[3] for elem in minibatch]),
                        self.network.is_not_terminal_ph:
                            np.asarray([elem[4] for elem in minibatch])})

                # Update target networks
                _ = self.sess.run(self.network.update_slow_targets_op)

            s = s_
            episode_step += 1
            self.total_steps += 1

            # Decay epsilon
            if self.epsilon > parameters.EPSILON_STOP:
                self.epsilon -= self.epsilon_decay

        if gif:
            self.env.save_gif('results/gif/', self.n_gif)
            self.n_gif = (self.n_gif + 1) % 5

        if episode_reward > self.best_run:
            self.best_run = episode_reward
            print("Save best", episode_reward)
            SAVER.save('best')

        DISPLAYER.add_reward(episode_reward)
        if ep % 50 == 0:
            print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %7.3f'
                  ' (max step: %i)' %
                  (ep, episode_reward, episode_step, self.epsilon,
                   parameters.MAX_EPISODE_STEPS + step_allonge))
        if ep % 500 == 0:
            DISPLAYER.disp()
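# Unlike the prioritized buffer sketched earlier, the buffer used by this
# agent stores whole transition tuples and sample() takes no argument. A
# minimal sketch of the interface assumed above; the class name and the
# constructor arguments are hypothetical.
import random
from collections import deque


class ReplayBuffer:

    def __init__(self, capacity, batch_size):
        self.memory = deque(maxlen=capacity)
        self.batch_size = batch_size

    def add(self, transition):
        # transition is (s, a, r, s_, is_not_terminal)
        self.memory.append(transition)

    def sample(self):
        # Uniform sampling without replacement
        return random.sample(self.memory, self.batch_size)

    def __len__(self):
        return len(self.memory)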
if __name__ == '__main__':

    tf.reset_default_graph()

    with tf.Session() as sess:

        agent = Agent(sess)
        SAVER.set_sess(sess)
        SAVER.load(agent)

        if settings.GUI:
            gui = threading.Thread(target=GUI.main)
            gui.start()

        try:
            agent.run()
        except KeyboardInterrupt:
            pass
        print("End of the run")

        SAVER.save('last')
        DISPLAYER.disp()

        if settings.GUI:
            gui.join()
        else:
            agent.play(5)

        agent.close()
for i in range(settings.NB_THREADS):
    train_threads.append(threading.Thread(target=work, args=(i, )))

# Intercept CTRL+C signal
signal.signal(signal.SIGINT, signal_handler)

# Set start time
global_start_time = time.time()

# Start the worker threads
for t in train_threads:
    t.start()

print('Press Ctrl+C to stop')
signal.pause()

for t in train_threads:
    t.join()

wall_time += time.time() - global_start_time

print("Wall time of simulation: %f\nGlobal CPU time: %f\n"
      "Total number of episodes: %i\nTotal number of steps: %i" %
      (wall_time, global_total_time, total_eps, total_steps))

print('Now saving data. Please wait')
SAVER.save(global_total_time, wall_time, total_eps, total_steps)
DISPLAYER.disp()

summary_writer.close()
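# signal_handler and the way the workers are told to stop are not shown. A
# common pattern is to flip a shared flag that each work() loop polls; the
# sketch below assumes that pattern, and the flag name is hypothetical.
import signal

requested_stop = False


def signal_handler(sig, frame):
    # Ask the workers to finish their current episode; once the handler
    # returns, signal.pause() in the main thread also returns, so the joins
    # and the final bookkeeping above can run.
    global requested_stop
    requested_stop = True
    print('Stop requested, waiting for the workers to finish...')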
import tensorflow as tf

from Agent import Agent  # module name assumed
from Displayer import DISPLAYER
from Saver import SAVER

import parameters


if __name__ == '__main__':

    tf.reset_default_graph()

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())

        agent = Agent(sess)
        SAVER.set_sess(sess)
        SAVER.load(agent)

        try:
            agent.run()
        except KeyboardInterrupt:
            pass
        print("End of the run")

        SAVER.save(agent.nb_ep)
        DISPLAYER.disp()

        agent.play(10)

        agent.stop()