Example no. 1
def generate():
    # Train an expert PPO agent first, then roll it out to record trajectories.
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)

    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    # Roll out the trained expert and keep only the (state, action) pairs.
    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([{
            "state": s["state"],
            "action": s["action"]
        } for s in run_episode(ppo, env)[0]])
    archive = Archive(
        path=os.path.join(ROOT, "generated", f"{generated_name}_" +
                          get_time_string()))
    archive.add_item("expert_trajectories", trajectories)
    archive.save()
    logger.info(f"Trajectories saved as {archive.path}")
Example no. 2
File: gail.py Project: iffiX/machin
def generate_expert_episodes():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)

    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))
    logger.info("Training expert PPO")

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([{
            "state": s["state"],
            "action": s["action"]
        } for s in run_episode(ppo, env)[0]])
    return trajectories
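generate_expert_episodes() returns a list of episodes, each a list of {"state", "action"} dicts. A minimal, hypothetical way to consume it is to pickle the result for a later imitation-learning run; in the full gail.py these pairs would serve as the expert demonstrations for the discriminator. The file name and entry point below are arbitrary choices, not part of the original script.

if __name__ == "__main__":
    import pickle

    expert_trajectories = generate_expert_episodes()
    # Torch tensors are picklable, so the whole structure can be dumped as-is.
    with open("expert_trajectories.pkl", "wb") as f:
        pickle.dump(expert_trajectories, f)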
Example no. 3
                # Step the environment and store the transition, including the
                # recurrent hidden state carried under the "hidden" key.
                state, reward, terminal, _ = env.step(action.item())
                state = convert(state)
                total_reward += reward

                tmp_observations.append({
                    "state": {
                        "mem": old_state,
                        "hidden": old_hidden
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "mem": state,
                        "hidden": hidden
                    },
                    "reward": reward,
                    "terminal": terminal
                })

        # update
        rppo.store_episode(tmp_observations)
        rppo.update()

        # show reward
        smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                 total_reward * 0.1)

        logger.info("Episode {} total reward={:.2f}".format(
            episode, smoothed_total_reward))
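Both this excerpt and the next pass raw Gym observations through a convert() helper that is not shown. A plausible minimal version, assuming it only reshapes the observation into the batched float tensor the networks expect (an assumption, since the real helper is outside the excerpt):

def convert(raw_state):
    # Turn a raw Gym observation into a [1, observe_dim] float tensor.
    return t.tensor(raw_state, dtype=t.float32).view(1, observe_dim)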
Example no. 4
                action = ppo.act({"mem": history.get()})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = convert(state)
                total_reward += reward

                # Capture the observation window before and after this step so
                # the stored transition has matching "mem" sequences.
                old_history = history.get()
                new_history = history.append(state).get()
                tmp_observations.append({
                    "state": {
                        "mem": old_history
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "mem": new_history
                    },
                    "reward": reward,
                    "terminal": terminal,
                })

        # update
        ppo.store_episode(tmp_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1

        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")