def generate_grids(cols):
    """Build one decoded, flattened grid per problem id for a cols x cols map."""
    grids = []
    for i in range(cols):
        map_name_base = '{}x{}-base'.format(cols, cols)
        env = LochLomondEnv(problem_id=i, is_stochastic=True,
                            reward_hole=-0.02, map_name_base=map_name_base)
        env.render()
        grid = EnvMDP.to_decoded(env).reshape(env.nrow * env.ncol)
        grids.append(np.hstack(([i], grid)))
    return grids
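# Hedged usage sketch (not in the original source): assumes generate_grids()
# is called for the 8x8 maps (problem ids 0..7) and that numpy is imported as
# np; the output file name is illustrative only.
if __name__ == '__main__':
    grids = generate_grids(8)
    np.savetxt('grids_8x8.csv', np.array(grids), delimiter=',', fmt='%s')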
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        reward_hole=reward_hole)
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        env.reset()
        print('-' * 50)
        print_headers()

        for iteration in range(max_iters_per):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(",".join([
                str(episode), str(iteration), str(reward),
                str(done), str(info), str(action)
            ]))

            if done and reward == reward_hole:
                env.render()
                print("Hole found in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': False})
                break

            if done and reward == 1.0:
                env.render()
                print("Frisbee acquired in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': True})
                break

    return results
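# Hedged usage sketch (an assumption, not part of the original file): it
# summarises the list of {'iters', 'success'} dicts returned by run() above.
if __name__ == '__main__':
    results = run(problem_id=0, max_episodes=100, max_iters_per=2000, reward_hole=0.0)
    successes = sum(1 for r in results if r['success'])
    print('Goal reached in {} of {} terminated episodes'.format(successes, len(results)))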
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state

    for iter in range(max_iter_per_episode):
        # env.render()  # for debugging/development you may want to visualize the individual steps by uncommenting this line
        action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
        observation, reward, done, info = env.step(action)  # observe what happens when you take the action

        # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner
        print("e,iter,reward,done = " + str(e) + " " + str(iter) + " " + str(reward) + " " + str(done))

        # Check if we are done and monitor rewards etc...
        if done and reward == reward_hole:
            env.render()
            print("We have reached a hole :-( [we can't move, so stop trying; just give up]")
            break

        if done and reward == +1.0:
            env.render()
            print("We have reached the goal :-) [stop trying to move; we can't]. That's ok, we have achieved the goal.")
            break
observation_list = list()
reward_list = list()

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)

# Let's visualize the problem/env
print('env', env.desc)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state

    for iter in range(max_iter_per_episode):
        env.render()  # for debugging/development you may want to visualize the individual steps
        # action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
        random_agent = runRandom()
        action = random_agent.action()
        observation, reward, done, info = env.step(action)  # observe what happens when you take the action

        print("================================================")
        print("info", info)

        # Collect the observations and rewards for plotting/reporting
        observation_list.append(observation)
        reward_list.append(reward)

        print("e,iter,reward,done = " + str(e) + " " + str(iter) + " " + str(reward) + " " + str(done))
        # Check if we are done and monitor rewards etc...
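# Hedged sketch for the reward reporting above (an assumption, not the original
# author's code): once the episode loops have finished, the per-step rewards in
# reward_list can be reduced to a running mean for plotting. Assumes matplotlib
# is available; the output file name is illustrative only.
import matplotlib.pyplot as plt

running_mean = np.cumsum(reward_list) / np.arange(1, len(reward_list) + 1)
plt.plot(running_mean)
plt.xlabel('step')
plt.ylabel('mean reward so far')
plt.savefig('random_agent_rewards.png')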
def run_senseless_agent(problem_id, map):
    reward_hole = 0.0
    max_episodes = 10000
    max_iter_per_episode = 1000

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        map_name_base=map, reward_hole=reward_hole)
    env.render()
    env.action_space.sample()
    np.random.seed(12)

    # variables for performance evaluation
    goal_episodes = []    # episodes in which the goal is reached (out of max_episodes)
    hole_episodes = []    # episodes in which the agent falls in a hole
    goal_iterations = []  # iterations taken to reach the goal, per successful episode
    rewards = []          # total reward collected per episode
    first_goal = 0        # number of episodes before the goal is first reached

    for e in range(max_episodes):
        rewards_current_episode = 0
        state = env.reset()

        for iter in range(max_iter_per_episode):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            rewards_current_episode += reward

            if done and reward == reward_hole:
                hole_episodes.append(e)
                break

            if done and reward == +1.0:
                # env.render()
                goal_episodes.append(e)
                goal_iterations.append(iter + 2)

                # record the episode in which the goal is first reached
                if first_goal == 0:
                    first_goal = e
                break

        rewards.append(rewards_current_episode)

    # statistics on the number of steps taken to reach the goal
    goal_iteration_average = mean(goal_iterations)
    goal_iteration_bestcase = min(goal_iterations)
    goal_iteration_worstcase = max(goal_iterations)

    # split collected rewards into chunks of 100 episodes and average each chunk
    rewards_per_100_eps = np.split(np.array(rewards), max_episodes // 100)
    rewards_per_100_eps = [str(sum(r / 100)) for r in rewards_per_100_eps]

    return len(goal_episodes), len(hole_episodes), goal_iteration_average, goal_iteration_bestcase, \
        goal_iteration_worstcase, first_goal, rewards_per_100_eps
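# Hedged usage sketch (assumed, not from the original file): runs the senseless
# (random) agent on every problem id of the 8x8 map and prints the headline
# statistics returned above.
if __name__ == '__main__':
    for pid in range(8):
        goals, holes, avg_iters, best, worst, first, _ = run_senseless_agent(pid, '8x8-base')
        print('problem {}: goals={} holes={} avg_iters={:.1f} best={} worst={} first_goal={}'.format(
            pid, goals, holes, avg_iters, best, worst, first))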
class MyAbstractAIAgent():
    """
    Abstract agent that works as a base for all our agents.
    """

    def __init__(self, problem_id, map_name_base="8x8-base"):
        # map_name_base="4x4-base"
        if not (0 <= problem_id <= 7):
            raise ValueError("Problem ID must be 0 <= problem_id <= 7")

        self.map_name_base = map_name_base
        self.env = LochLomondEnv(problem_id=problem_id,
                                 is_stochastic=self.is_stochastic(),
                                 reward_hole=self.reward_hole(),
                                 map_name_base=map_name_base)
        self.problem_id = problem_id
        self.reset()
        self.out = 'out/'
        self.policy = {}
        self._train = []
        self.graphs = {}

    def is_stochastic(self):
        raise NotImplementedError

    def reward_hole(self):
        raise NotImplementedError

    def reset(self):
        self.rewards = 0
        self.failures = 0
        self.eval = []
        self.timeouts = 0

    def solve(self, episodes=10000, iterations=1000, seed=None, gamma=0.95):
        print('Solving with {} Agent'.format(self.name().capitalize()))
        print('Problem: ', self.problem_id)
        print('Grid: ', self.map_name_base)
        print('Episodes that will run...: ', episodes)

        self.train(episodes=episodes, iterations=iterations)

        rewards = self.rewards
        timeouts = self.timeouts
        failures = self.failures

        for e in range(1, episodes + 1):  # iterate over episodes
            state = self.env.reset()
            self.set_episode_seed(e, seed)

            if e % 1000 == 0:
                print("Eval Episode", e)

            for i in range(1, iterations + 1):
                action = self.action(state)
                state, reward, done, info = self.env.step(action)

                if done:
                    if reward == 1.0:
                        rewards += int(reward)
                    else:
                        failures += 1
                    # break the cycle
                    break

            if not done:
                timeouts += 1

            self.eval.append([
                self.problem_id, e, i, to_human(action), int(reward),
                rewards, rewards / e, failures, timeouts
            ])

    def action(self, i):
        raise NotImplementedError

    def train(self, episodes, iterations):
        raise NotImplementedError

    def env(self):
        return self.env

    def set_episode_seed(self, episode, seed=None):
        # by default no seed for abstract agent
        return None

    def alias(self):
        return '{}out_{}_{}_{}'.format(self.out, self.name(),
                                       self.problem_id, self.env.ncol)

    def evaluate(self, episodes):
        self.env.reset()
        print("This is the environment: ")
        print(self.env.render())

        if len(self.policy) > 0:
            print("This is the final policy: ")
            print_table(policy_to_arrows(self.policy, self.env.ncol, self.env.ncol))

        print('Saving Evaluation Files...')
        self.write_eval_files()

        # Plotting mean rewards
        print('Saving Plots...')
        labels = ['Episodes', 'Mean Reward']
        title = 'Problem {}. Plot for {} Agent'.format(
            self.problem_id, self.name().capitalize())

        if len(self._train) > 0:
            subtitle = 'Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes), labels, title, subtitle, 'mr')

            subtitle = 'First 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(999), labels, title, subtitle, 'mr_first_1000')

            subtitle = 'Last 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes - 1000, episodes - 1), labels, title,
                            subtitle, 'mr_last_1000')

        if len(self.eval) > 0:
            subtitle = 'Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes), labels, title, subtitle, 'mr')

            subtitle = 'First 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(999), labels, title, subtitle, 'mr_first_1000')

            subtitle = 'Last 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes - 1000, episodes - 1), labels,
                                 title, subtitle, 'mr_last_1000')

        if len(self.graphs) > 0:
            subtitle = 'Utilities plot'
            self.plot_utilities(['Episodes', 'U'], title, subtitle)

    def write_eval_files(self):
        def data_for_file(name):
            if name == 'policy':
                return policy_to_list(self.policy)
            if name == 'u':
                return u_to_list(self.U)
            if name == 'eval':
                return self.eval
            if name == 'q':
                return self.Q
            if name == 'train':
                return self._train
            if name == 'graphs':
                return self.graphs
            return []

        for file in self.files():
            if file == 'graphs':
                filename = '{}_{}.json'.format(self.alias(), file)
                with open(filename, 'w') as outfile:
                    json.dump(data_for_file(file), outfile)
            else:
                filename = '{}_{}.csv'.format(self.alias(), file)
                data = [self.header(file)] + data_for_file(file)
                np.savetxt(filename, data, delimiter=",", fmt='%s')

            print('\tFile saved: {}'.format(filename))

    def header(self, key):
        headers = {
            'eval': [
                'id', 'episode', 'iteration', 'action', 'reward', 'rewards',
                'mean_rewards', 'failures', 'timeouts'
            ],
            'policy': ['x', 'y', 'action'],
            'u': ['x', 'y', 'u'],
            'train': [
                'id', 'episode', 'iteration', 'reward', 'rewards',
                'mean_rewards', 'failures', 'timeouts'
            ],
            'graphs': ['x', 'y', 'value'],
            'q': ['position', 'x', 'y', 'action', 'action_friendly', 'value']
        }

        if key in headers:
            return headers[key]

    def plot_train(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from training phase """
        train = np.array(self._train)
        x = pd.to_numeric(train[:, 1])
        y = pd.to_numeric(train[:, 5])
        filename = '{}_train_{}.png'.format(self.alias(), suffix)
        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_evaluation(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from evaluation phase """
        evaluation = np.array(self.eval)
        x = pd.to_numeric(evaluation[:, 1])
        y = pd.to_numeric(evaluation[:, 6])
        filename = '{}_eval_{}.png'.format(self.alias(), suffix)
        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_utilities(self, labels, title, subtitle):
        for state, value in self.graphs.items():
            x, y = zip(*value)
            plt.plot(x, y, label=str(state))

        plt.ylim([-0.1, 1.05])
        plt.legend(loc='lower right')
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        filename = '{}_utilities.png'.format(self.alias())
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()
        print('\tPlot saved: {}'.format(filename))

    def plot(self, x, y, rows, labels, filename, title, subtitle):
        plt.plot(x[rows], y[rows])
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()
        print('\tPlot saved: {}'.format(filename))
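# Hedged sketch of a concrete subclass (an assumption: it only illustrates how
# the abstract hooks above (name(), files(), is_stochastic(), reward_hole(),
# action() and train()) fit together; it is not the original random agent).
class RandomAgentSketch(MyAbstractAIAgent):

    def name(self):
        return 'random'

    def files(self):
        return ['eval', 'train']

    def is_stochastic(self):
        return True

    def reward_hole(self):
        return 0.0

    def action(self, i):
        # ignore the state and sample uniformly from the action space
        return self.env.action_space.sample()

    def train(self, episodes, iterations):
        # a purely random agent has nothing to learn
        return None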