import gym
import numpy as np
import torch

# GAMMA, TAU, STEPS, ValueIterationSolver, BaseAgent, extract_states,
# select_bin_counts, aggregate_mdp, and map_aggregate_policy are assumed to be
# defined or imported elsewhere in this module.


def run(mdp_domain):
    # Solve the original MDP with value iteration and collect rollouts.
    domain = mdp_domain()
    solver = ValueIterationSolver(domain, discount=GAMMA, threshold=TAU, verbose=True)
    agent = BaseAgent(domain, solver, epochs=STEPS)
    state_values = agent.train()
    rewards, samples = agent.run(external_policy='randomized')

    # Pick a bin count from the sampled states and build the aggregated MDP.
    states = extract_states(samples)
    bucket_count = select_bin_counts(samples=states)
    mdp_aggregate, aggregation_mapping = aggregate_mdp(
        values=state_values, bin_count=bucket_count, domain=domain)

    # Solve the aggregated MDP and roll out its policy.
    domain_aggregate = mdp_domain(mdp_aggregate)
    solver_aggregate = ValueIterationSolver(
        domain=domain_aggregate, discount=GAMMA, threshold=TAU, verbose=True)
    agent_aggregate = BaseAgent(domain=domain_aggregate, solver=solver_aggregate, epochs=STEPS)
    state_values_aggregate = agent_aggregate.train()
    rewards_aggregate, samples_aggregate = agent_aggregate.run()

    # Lift the aggregate policy back onto the original state space and evaluate it there.
    policy_aggregate = solver_aggregate.policy
    adapted_policy_aggregate = map_aggregate_policy(
        aggregate_policy=policy_aggregate,
        state_mapping=aggregation_mapping,
        original_domain=domain)
    domain.reset()
    rewards_aggregate_adapted, samples_aggregate_adapted = agent.run(
        external_policy=adapted_policy_aggregate)

    print('original return:', rewards.sum())
    print('aggregate return:', rewards_aggregate.sum())
    print('adapted return:', rewards_aggregate_adapted.sum())
    print('bin count:', bucket_count)

    return rewards, rewards_aggregate, rewards_aggregate_adapted
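
# The helper below is not part of the original module; it is a minimal sketch of
# what `map_aggregate_policy` could look like, assuming `state_mapping` maps each
# original state to its aggregate cluster, `aggregate_policy` is indexable by
# cluster, and the domain exposes an iterable `states` attribute. The project's
# actual implementation may differ.
def map_aggregate_policy_sketch(aggregate_policy, state_mapping, original_domain):
    """Lift a policy defined over aggregate clusters back onto the original states."""
    return {
        state: aggregate_policy[state_mapping[state]]
        for state in original_domain.states
    }
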
def evaluate_agent(agent: BaseAgent, env: gym.Env, n_episodes: int = 100) -> dict:
    """Run the agent in evaluation mode for n_episodes and return the mean episode reward."""
    agent.eval()
    episode_rewards = []
    for i_episode in range(n_episodes):
        state = torch.from_numpy(env.reset()).float()
        t = 0
        done = False
        rewards = []
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = torch.from_numpy(next_state).float()
            rewards.append(reward)
            state = next_state
            t += 1
        episode_reward = np.sum(rewards)
        episode_rewards.append(episode_reward)
        _ = agent.end_episode()
    agent.train()  # restore training mode before returning
    return {'eval_mean_reward': np.mean(episode_rewards)}
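
# Usage sketch. Assumptions: the classic Gym step API that returns a 4-tuple
# (which is what the loop above expects), an already-trained `agent`, and an
# illustrative environment name -- none of these are dictated by the original code.
#
#     env = gym.make('CartPole-v1')
#     metrics = evaluate_agent(agent, env, n_episodes=50)
#     print(metrics['eval_mean_reward'])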