def generate():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1

        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([
            {"state": s["state"], "action": s["action"]}
            for s in run_episode(ppo, env)[0]
        ])

    archive = Archive(
        path=os.path.join(ROOT, "generated", f"{generated_name}_" + get_time_string())
    )
    archive.add_item("expert_trajectories", trajectories)
    archive.save()
    logger.info(f"Trajectories saved as {archive.path}")
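Both generate() above and generate_expert_episodes() below rely on a run_episode(ppo, env) helper that is not shown in this excerpt. The following is only a minimal sketch of what such a helper might look like, assuming a Gym-style env.step() that returns the classic 4-tuple, a convert() function that turns raw observations into [1, observe_dim] float tensors, and an Actor whose forward() takes a "state" keyword; rename the dictionary keys to match your own model.

import torch as t  # the tutorial code imports torch as t elsewhere

def run_episode(ppo, env):
    # Roll out one episode with the current policy and collect transitions
    # in the dict format expected by store_episode(). Illustrative only.
    state = convert(env.reset())
    terminal = False
    total_reward = 0
    tmp_observations = []
    while not terminal:
        with t.no_grad():
            old_state = state
            # sample an action from the current policy
            action = ppo.act({"state": old_state})[0]
            state, reward, terminal, _ = env.step(action.item())
            state = convert(state)
            total_reward += reward
            tmp_observations.append({
                "state": {"state": old_state},
                "action": {"action": action},
                "next_state": {"state": state},
                "reward": reward,
                "terminal": terminal,
            })
    return tmp_observations, total_reward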
def generate_expert_episodes():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    logger.info("Training expert PPO")
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1

        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([
            {"state": s["state"], "action": s["action"]}
            for s in run_episode(ppo, env)[0]
        ])

    return trajectories
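Unlike generate(), this variant simply returns the trajectories. If you prefer not to go through the Archive helper, the returned value is an ordinary nested list of tensor dictionaries and can also be persisted with plain torch.save; the file name below is only an example.

import torch as t

# Hypothetical persistence of the returned expert trajectories; any path works.
trajectories = generate_expert_episodes()
t.save(trajectories, "expert_trajectories.pt")

# Reload later, e.g. before imitation or discriminator training.
trajectories = t.load("expert_trajectories.pt")
print(f"{len(trajectories)} episodes, {len(trajectories[0])} steps in the first one")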
            state, reward, terminal, _ = env.step(action.item())
            state = convert(state)
            total_reward += reward

            tmp_observations.append({
                "state": {"mem": old_state, "hidden": old_hidden},
                "action": {"action": action},
                "next_state": {"mem": state, "hidden": hidden},
                "reward": reward,
                "terminal": terminal,
            })

    # update
    rppo.store_episode(tmp_observations)
    rppo.update()

    # show reward
    smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
    logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")
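The "mem"/"hidden" fields stored in each transition mirror the forward signature of a recurrent actor. The sketch below is a GRU-based illustration only; the layer sizes, tensor shapes, and the exact return tuple are assumptions that must be aligned with what your RPPO implementation expects from its actor.

import torch as t
import torch.nn as nn
from torch.distributions import Categorical

class RecurrentActor(nn.Module):
    # Illustrative recurrent actor matching the "mem"/"hidden" keys stored above.
    def __init__(self, observe_dim, action_num, hidden_dim=256):
        super().__init__()
        self.gru = nn.GRU(observe_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, action_num)

    def forward(self, mem, hidden, action=None):
        # mem is assumed to be [batch, seq_len, observe_dim],
        # hidden is assumed to be [1, batch, hidden_dim]
        out, new_hidden = self.gru(mem, hidden)
        probs = t.softmax(self.head(out[:, -1]), dim=1)
        dist = Categorical(probs=probs)
        act = action if action is not None else dist.sample()
        # action, log probability, entropy, and the updated hidden state
        return act, dist.log_prob(act.flatten()), dist.entropy(), new_hidden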
            action = ppo.act({"mem": history.get()})[0]
            state, reward, terminal, _ = env.step(action.item())
            state = convert(state)
            total_reward += reward

            old_history = history.get()
            new_history = history.append(state).get()
            tmp_observations.append({
                "state": {"mem": old_history},
                "action": {"action": action},
                "next_state": {"mem": new_history},
                "reward": reward,
                "terminal": terminal,
            })

    # update
    ppo.store_episode(tmp_observations)
    ppo.update()

    # show reward
    smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
    logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")
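The history object used above is a small sliding-window buffer over past observations. A minimal sketch of such a helper is given below, assuming a fixed number of stacked frames, zero-padding before the window is full, and concatenation along the feature dimension; the layout returned by get() must match what your actor's "mem" input expects.

import torch as t

class History:
    # Minimal sliding-window buffer matching the history.append(...).get() usage above.
    def __init__(self, history_depth, state_shape):
        self.queue = [t.zeros(state_shape) for _ in range(history_depth)]

    def append(self, state):
        # drop the oldest observation, keep the newest, and return self so that
        # history.append(state).get() chains as in the episode loop above
        self.queue.pop(0)
        self.queue.append(state)
        return self

    def get(self):
        # concatenate the stored observations into one [1, depth * observe_dim]
        # tensor, assuming each state is a [1, observe_dim] tensor from convert()
        return t.cat(self.queue, dim=1)

With such a helper, a fresh History(history_depth, (1, observe_dim)) would be created at the start of each episode and seeded with the converted env.reset() observation before the loop above runs.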