Example 1
def run(mdp_domain):
    """Solve an MDP, solve a state-aggregated version of it, and
    compare the returns of the original, aggregate, and adapted
    (lifted) policies."""
    # Solve the original domain with value iteration.
    domain = mdp_domain()
    solver = ValueIterationSolver(domain,
                                  discount=GAMMA,
                                  threshold=TAU,
                                  verbose=True)
    agent = BaseAgent(domain, solver, epochs=STEPS)
    state_values = agent.train()
    # Collect trajectories under a randomized policy for aggregation.
    rewards, samples = agent.run(external_policy='randomized')

    states = extract_states(samples)

    # Choose the number of bins, then build the aggregated MDP along
    # with the mapping from original states to aggregate states.
    bucket_count = select_bin_counts(samples=states)
    mdp_aggregate, aggregation_mapping = aggregate_mdp(values=state_values,
                                                       bin_count=bucket_count,
                                                       domain=domain)

    # Solve the aggregated MDP with the same solver settings.
    domain_aggregate = mdp_domain(mdp_aggregate)
    solver_aggregate = ValueIterationSolver(domain=domain_aggregate,
                                            discount=GAMMA,
                                            threshold=TAU,
                                            verbose=True)
    agent_aggregate = BaseAgent(domain=domain_aggregate,
                                solver=solver_aggregate,
                                epochs=STEPS)
    state_values_aggregate = agent_aggregate.train()
    rewards_aggregate, samples_aggregate = agent_aggregate.run()
    policy_aggregate = solver_aggregate.policy

    # Lift the aggregate policy back onto the original state space and
    # evaluate it on the reset original domain.
    adapted_policy_aggregate = map_aggregate_policy(
        aggregate_policy=policy_aggregate,
        state_mapping=aggregation_mapping,
        original_domain=domain)
    domain.reset()
    rewards_aggregate_adapted, samples_aggregate_adapted = agent.run(
        external_policy=adapted_policy_aggregate)

    print('original return:', rewards.sum())
    print('aggregate return:', rewards_aggregate.sum())
    print('adapted return:', rewards_aggregate_adapted.sum())
    print('bin count:', bucket_count)

    return rewards, rewards_aggregate, rewards_aggregate_adapted
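
For illustration, the adaptation step can be sketched in a few lines: the aggregate policy is "lifted" back to the original state space by having every original state inherit the action chosen for its bin. The snippet below is a self-contained toy reimplementation under assumed data shapes (a dict mapping states to bins and a dict mapping bins to actions); lift_policy and all names in it are illustrative stand-ins, not the actual map_aggregate_policy API.

# Minimal sketch of the policy-lifting step that map_aggregate_policy
# is assumed to perform. All names here are illustrative.
def lift_policy(aggregate_policy, state_mapping):
    """Map each original state to the action chosen for the
    aggregate state (bin) that contains it."""
    return {state: aggregate_policy[bin_idx]
            for state, bin_idx in state_mapping.items()}

# Toy example: four states collapse into two bins.
mapping = {0: 0, 1: 0, 2: 1, 3: 1}
agg_policy = {0: 'left', 1: 'right'}
print(lift_policy(agg_policy, mapping))
# {0: 'left', 1: 'left', 2: 'right', 3: 'right'}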
Example 2
import gym
import numpy as np
import torch

# BaseAgent is assumed to be defined elsewhere in the project.

def evaluate_agent(agent: BaseAgent, env: gym.Env, n_episodes: int = 100) -> dict:
    """Run the agent in evaluation mode for n_episodes and return the
    mean episode reward. Assumes the pre-0.26 gym API, where
    env.reset() returns an observation and env.step() returns a
    4-tuple (obs, reward, done, info)."""
    agent.eval()  # switch the agent to evaluation mode
    episode_rewards = []
    for _ in range(n_episodes):
        state = torch.from_numpy(env.reset()).float()
        done = False
        rewards = []
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            state = torch.from_numpy(next_state).float()

        episode_rewards.append(np.sum(rewards))
        _ = agent.end_episode()  # let the agent clear per-episode state

    agent.train()  # restore training mode before returning
    return {
        'eval_mean_reward': np.mean(episode_rewards)
    }
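
As a quick smoke test, the function can be exercised with a minimal stand-in agent. RandomAgent below is hypothetical: it only implements the interface evaluate_agent expects from BaseAgent (eval, train, get_action, end_episode), and the snippet assumes the same pre-0.26 gym API as the function itself.

import gym
import torch

class RandomAgent:
    """Hypothetical stand-in exposing the interface evaluate_agent
    expects; it ignores the state and samples a random action."""
    def __init__(self, action_space):
        self.action_space = action_space
    def eval(self):
        pass
    def train(self):
        pass
    def get_action(self, state: torch.Tensor):
        return self.action_space.sample()
    def end_episode(self):
        return None

env = gym.make('CartPole-v1')
stats = evaluate_agent(RandomAgent(env.action_space), env, n_episodes=10)
print(stats['eval_mean_reward'])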