import os

import gym
import numpy as np
import tensorflow as tf

# Trainer and FileWriter are project-local helpers defined elsewhere in this repo.


class Runner:
    def __init__(self, num_episodes, num_steps, out='./data', tau=0.01):
        self.dirname = out
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.scores = []
        # Resume from a previously saved actor if one exists on disk.
        actor = None
        if os.path.isdir(self.actor_loc):
            actor = tf.keras.models.load_model(self.actor_loc)
        self.trainer = Trainer(actor=actor)
        self.file_writer = FileWriter(self.dirname, 'scores')
        self.file_writer.init_file(['score'])

    @property
    def actor_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_evo', 'actor')

    def start(self):
        for i in range(self.num_episodes):
            score = self.trainer.run_episode()
            self.scores.append(score)
            # Log the running mean over the last 25 episodes.
            score = np.array(self.scores[-25:]).mean()
            self.file_writer.write_val(score)
            if i % 20 == 0:
                self.trainer.actor.model.save(self.actor_loc)
            self.print(i, score)
            if i % 5 == 0:
                score = self.test_run()

    def print(self, i, score):
        print('----------------------------------')
        print('episode:', i)
        print(' length', self.trainer.episode_length)
        print(' score:', score)

    def test_run(self):
        # Roll out the current actor in a fresh environment, without training.
        done = False
        env = gym.make('LunarLanderContinuous-v2')
        state = env.reset()
        episode_length = 0
        episode_r = 0
        while not done:
            episode_length += 1
            action = self.trainer.actor.model(state[None, :])
            next_state, reward, done, _ = env.step(action[0])
            episode_r += reward
            state = next_state
        print('-----------------------------------------------')
        print('Test run:')
        print(' accumulated reward:', episode_r)
        env.close()
        return episode_r
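# Usage sketch (assumed, not part of the original listing): launching the
# evolutionary Runner from its own module. The episode and step counts are
# illustrative only.
if __name__ == '__main__':
    runner = Runner(num_episodes=1000, num_steps=500)
    runner.start()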
import os

import numpy as np
import tensorflow as tf

# Trainer and FileWriter are project-local helpers defined elsewhere in this repo.


class Runner:
    def __init__(self, num_episodes, num_steps, out='./data'):
        self.dirname = out
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.scores = []
        # Resume from a previously saved actor if one exists on disk.
        actor = None
        if os.path.isdir(self.model_loc):
            actor = tf.keras.models.load_model(self.model_loc)
        self.trainer = Trainer(actor=actor)
        self.file_writer = FileWriter(self.dirname, 'scores')
        self.file_writer.init_file(['score'])

    @property
    def model_loc(self):
        return os.path.join(self.dirname, 'moon_lander')

    def start(self):
        for i in range(self.num_episodes):
            # Collect one episode, train on it, then clear the episode buffer.
            self.trainer.record_episode(self.num_steps)
            score = self.trainer.train()
            self.scores.append(score)
            print(score)
            # Log the running mean over the last 25 episodes.
            score = np.array(self.scores[-25:]).mean()
            self.file_writer.write_val(score)
            self.print(i, score)
            self.trainer.reset()
            if i % 20 == 0:
                self.trainer.actor.model.save(self.model_loc)

    def print(self, i, score):
        print('----------------------------------')
        print('episode:', i)
        print(' length', len(self.trainer.states))
        print(' score:', score)
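# Usage sketch (assumed): starting this policy-gradient Runner; num_steps caps
# the length of each recorded episode. Values are illustrative.
if __name__ == '__main__':
    Runner(num_episodes=2000, num_steps=500, out='./data').start()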
import os

import numpy as np
import tensorflow as tf

# Trainer and FileWriter are project-local helpers defined elsewhere in this repo.


class Runner:
    def __init__(self, num_episodes, num_steps, out='./data'):
        self.dirname = out
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.scores = []
        # Resume from a previously saved Q-network if one exists on disk.
        critic = None
        if os.path.isdir(self.model_loc):
            critic = tf.keras.models.load_model(self.model_loc)
        self.trainer = Trainer(critic=critic)
        self.file_writer = FileWriter(self.dirname, 'scores')
        self.file_writer.init_file(['score'])

    @property
    def model_loc(self):
        return os.path.join(self.dirname, 'cart_pole_dqn')

    def start(self):
        for i in range(self.num_episodes):
            score = self.trainer.record_episode(self.num_steps)
            self.scores.append(score)
            # Log the running mean over the last 25 episodes.
            score = np.array(self.scores[-25:]).mean()
            self.file_writer.write_val(score)
            if i % 20 == 0:
                self.trainer.Q.model.save(self.model_loc)
            self.print(i, score)

    def print(self, i, score):
        print('----------------------------------')
        print('episode:', i)
        print(' length', self.trainer.episode_length)
        print(' score:', score)
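# Usage sketch (assumed): running the DQN cart-pole Runner. A model previously
# saved under ./data/cart_pole_dqn is picked up automatically on the next start.
if __name__ == '__main__':
    Runner(num_episodes=500, num_steps=200).start()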
import math
import os

import gym
import numpy as np
import tensorflow as tf

# Trainer and FileWriter are project-local helpers defined elsewhere in this repo.


class Runner:
    def __init__(self, num_episodes, num_steps, out='./data', tau=0.01):
        self.dirname = out
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.scores = []
        # Resume actor and critic from disk if saved models exist.
        critic = None
        if os.path.isdir(self.critic_loc):
            critic = tf.keras.models.load_model(self.critic_loc)
        actor = None
        if os.path.isdir(self.actor_loc):
            actor = tf.keras.models.load_model(self.actor_loc)
        self.trainer = Trainer(tau, actor=actor, critic=critic)
        self.file_writer = FileWriter(self.dirname, 'scores')
        self.file_writer.init_file(['score'])

    @property
    def actor_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_ddpg', 'actor')

    @property
    def critic_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_ddpg', 'critic')

    @property
    def best_actor_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_ddpg', 'best_actor')

    @property
    def best_critic_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_ddpg', 'best_critic')

    def start(self):
        for i in range(self.num_episodes):
            score, learning_success_rate, episode_length = \
                self.trainer.run_episode()
            if i % 20 == 0:
                self.trainer.actor.model.save(self.actor_loc)
                self.trainer.critic.model.save(self.critic_loc)
            self.print(i, score, learning_success_rate)
            if i % 5 == 0:
                # Evaluate periodically and keep the best-scoring models.
                score = self.test_run()
                self.scores.append(score)
                score = np.array(self.scores[-25:]).mean()
                self.file_writer.write_val(score)
                if score >= max(self.scores):
                    self.trainer.actor.model.save(self.best_actor_loc)
                    self.trainer.critic.model.save(self.best_critic_loc)

    def print(self, i, score, learning_success_rate):
        print('----------------------------------')
        print('episode:', i)
        print(' length', self.trainer.episode_length)
        print(' score:', score)
        print(' learning success:', learning_success_rate)

    def test_run(self):
        done = False
        env = gym.make('LunarLanderContinuous-v2')
        state = env.reset()
        episode_length = 0
        episode_r = 0
        acc_Q_loss = 0
        while not done:
            episode_length += 1
            action = self.trainer.actor.model(state[np.newaxis, :])
            next_state, reward, done, _ = env.step(action[0])
            next_state = next_state[np.newaxis, :]
            # One-step TD target from the critic, used here only as a diagnostic.
            next_action = self.trainer.actor.model(next_state)
            Q_input = tf.concat([next_state, next_action], axis=1)
            y = reward + self.trainer.discount_factor * self.trainer.critic.model(Q_input)
            Q_input = tf.concat([state[np.newaxis, :], action], axis=1)
            pred_reward = (y - self.trainer.critic.model(Q_input)).numpy()[0][0]
            acc_Q_loss += (reward - pred_reward)**2
            episode_r += reward
            state = next_state[0]
        print('-----------------------------------------------')
        print('Test run:')
        print(' ep len:', episode_length)
        print(' accumulated reward:', episode_r)
        print(' accumulated Q loss:', math.sqrt(acc_Q_loss))
        env.close()
        return episode_r
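# Usage sketch (assumed): the DDPG Runner also takes tau, presumably the
# target-network soft-update rate, which is forwarded to the Trainer. Values
# are illustrative.
if __name__ == '__main__':
    Runner(num_episodes=1000, num_steps=1000, out='./data', tau=0.01).start()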
import math
import os

import gym
import numpy as np
import tensorflow as tf

# Trainer and FileWriter are project-local helpers defined elsewhere in this repo.


class Runner:
    def __init__(self, num_episodes, num_steps, out='./data', tau=0.01):
        self.dirname = out
        self.num_episodes = num_episodes
        self.num_steps = num_steps
        self.scores = []
        # Resume actor and both critics from disk if saved models exist.
        critic_1 = None
        if os.path.isdir(self.critic_1_loc):
            critic_1 = tf.keras.models.load_model(self.critic_1_loc)
        critic_2 = None
        if os.path.isdir(self.critic_2_loc):
            critic_2 = tf.keras.models.load_model(self.critic_2_loc)
        actor = None
        if os.path.isdir(self.actor_loc):
            actor = tf.keras.models.load_model(self.actor_loc)
        self.trainer = Trainer(tau, actor=actor, critics=[critic_1, critic_2])
        self.file_writer = FileWriter(self.dirname, 'scores')
        self.file_writer.init_file(['score'])

    @property
    def actor_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_td3', 'actor')

    @property
    def critic_1_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_td3', 'critic_1')

    @property
    def critic_2_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_td3', 'critic_2')

    @property
    def best_actor_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_td3', 'best_actor')

    @property
    def best_critic_1_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_td3', 'best_critic_1')

    @property
    def best_critic_2_loc(self):
        return os.path.join(self.dirname, 'lunar_lander_td3', 'best_critic_2')

    def start(self):
        for i in range(self.num_episodes):
            reward_sum, episode_length = self.trainer.run_episode()
            if i % 10 == 0:
                self.trainer.actor.model.save(self.actor_loc)
                self.trainer.critic_1.model.save(self.critic_1_loc)
                self.trainer.critic_2.model.save(self.critic_2_loc)
            self.print(i, reward_sum, episode_length)
            if i % 3 == 0:
                # Evaluate periodically and keep the best-scoring models.
                score = self.test_run()
                self.scores.append(score)
                score = np.array(self.scores[-25:]).mean()
                self.file_writer.write_val(score)
                if score >= max(self.scores):
                    self.trainer.actor.model.save(self.best_actor_loc)
                    self.trainer.critic_1.model.save(self.best_critic_1_loc)
                    self.trainer.critic_2.model.save(self.best_critic_2_loc)

    def print(self, i, reward_sum, episode_length):
        print('----------------------------------')
        print('episode:', i)
        print(' length', episode_length)
        print(' reward_sum:', reward_sum)

    def test_run(self, max_ep_steps=300):
        done = False
        env = gym.make('LunarLanderContinuous-v2')
        state = env.reset()
        episode_length = 0
        episode_r = 0
        acc_Q_loss_1 = 0
        acc_Q_loss_2 = 0
        while not done and episode_length < max_ep_steps:
            episode_length += 1
            action = self.trainer.actor.model(state[np.newaxis, :])
            next_state, reward, done, _ = env.step(action[0])
            next_state = next_state[np.newaxis, :]
            # Clipped double-Q target: take the minimum of both target critics.
            next_action = self.trainer.actor.model(next_state)
            Q_input = tf.concat([next_state, next_action], axis=1)
            Q_1_val = self.trainer.target_critic_1.model(Q_input)
            Q_2_val = self.trainer.target_critic_2.model(Q_input)
            Q_val = tf.math.minimum(Q_1_val, Q_2_val)
            y = reward + self.trainer.discount_factor * Q_val
            Q_input = tf.concat([state[np.newaxis, :], action], axis=1)
            Q_1 = self.trainer.critic_1.model(Q_input)
            Q_2 = self.trainer.critic_2.model(Q_input)
            pred_reward_1 = (y - Q_1).numpy()[0][0]
            pred_reward_2 = (y - Q_2).numpy()[0][0]
            acc_Q_loss_1 += (reward - pred_reward_1)**2
            acc_Q_loss_2 += (reward - pred_reward_2)**2
            episode_r += reward
            state = next_state[0]
        print('*****************Evaluation run:**********************')
        print(' ep len:', episode_length)
        print(' accumulated reward:', episode_r)
        print(' accumulated Q_1 loss:', math.sqrt(acc_Q_loss_1))
        print(' accumulated Q_2 loss:', math.sqrt(acc_Q_loss_2))
        print('******************************************************')
        env.close()
        return episode_r
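# Usage sketch (assumed): the TD3 Runner is started the same way; evaluation
# runs are capped at 300 steps by test_run's max_ep_steps default. Values are
# illustrative.
if __name__ == '__main__':
    Runner(num_episodes=1000, num_steps=1000, tau=0.01).start()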