# Author: Marlos C. Machado
import utils
import random
import numpy as np
import environment as env
from ssr_bellman_q import SSRBellmanQ

if __name__ == "__main__":
    # Parse command-line arguments and seed both RNG sources so runs
    # are reproducible across python's `random` and numpy.
    args = utils.ArgsParser.read_input_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Build the environment once; a fresh agent is created per episode.
    environment = env.MDP(args.input)

    # Run the learning algorithm: each episode steps the agent until the
    # environment reaches a terminal state, then reports the episode's
    # average undiscounted return.
    for episode in range(args.num_episodes):
        agent = SSRBellmanQ(environment, args.gamma, args.epsilon, beta=args.beta)
        steps_taken = 1
        while not environment.is_terminal():
            agent.step()
            steps_taken += 1
        environment.reset()
        print(episode, ",", agent.get_avg_undisc_return())
# Generate one map (`outer`) with all of its reward/terminal configurations,
# solve each configuration with value iteration, and pickle the results.
info = gen.new()
configurations = len(info['rewards'])
# NOTE(review): was a Python-2 `print` statement (a syntax error on Python 3)
# with a typo'd message; now uses the print() function like the rest of the code.
print('Generating map', outer, '(', configurations, 'configurations )')
sys.stdout.flush()

world = info['map']
rewards = info['rewards']
terminal = info['terminal']
values = []

# Render the map sprite once per world; it is shared by all configurations.
sprite = environment.SpriteFigure(environment.figure_library.objects,
                                  environment.figure_library.background,
                                  dim=args.sprite_dim)
sprite.makeGrid(world, args.vis_path + str(outer) + '_sprites')

# Solve every reward/terminal configuration of this world with value
# iteration and collect the resulting value maps.
for inner in tqdm(range(configurations)):
    reward_map = rewards[inner]
    terminal_map = terminal[inner]
    mdp = environment.MDP(world, reward_map, terminal_map)
    vi = environment.ValueIteration(mdp)
    values_list, policy = vi.iterate()
    value_map = mdp.representValues(values_list)
    values.append(value_map)

info['values'] = values
filename = os.path.join(args.save_path, str(outer) + '.p')
# Use a context manager so the file handle is closed even if dump raises
# (the original leaked the handle from open() passed inline).
with open(filename, 'wb') as handle:
    pickle.dump(info, handle)
# Score each predicted value map against its target and a uniform baseline.
# NOTE(review): `cumulative_score` and `man` are unused within this span —
# presumably consumed further down the file; left in place.
cumulative_score = 0
for ind in range(num_worlds):
    pred = predictions[ind]
    targ = targets[ind]

    # Manhattan distance between the argmax cells of prediction and target.
    pred_max = np.unravel_index(np.argmax(pred), pred.shape)
    targ_max = np.unravel_index(np.argmax(targ), targ.shape)
    man = abs(pred_max[0] - targ_max[0]) + abs(pred_max[1] - targ_max[1])

    unif = np.ones(pred.shape)
    rew = rewards[ind]
    term = terminal[ind]

    # Score the prediction, the target, and a uniform map on the same MDP.
    # A fresh MDP is built per run in case ScoreIteration mutates it —
    # TODO confirm whether reuse would be safe.
    mdp = environment.MDP(None, rew, term)
    si = pipeline.ScoreIteration(mdp, pred)
    avg_pred, scores_pred = si.iterate()

    mdp = environment.MDP(None, rew, term)
    si = pipeline.ScoreIteration(mdp, targ)
    avg_targ, scores_targ = si.iterate()

    mdp = environment.MDP(None, rew, term)
    si = pipeline.ScoreIteration(mdp, unif)
    avg_unif, scores_unif = si.iterate()

    # Normalized score: how far the prediction moves from the uniform
    # baseline toward the target (1.0 == matches target improvement).
    avg_per_score = np.divide(scores_pred - scores_unif,
                              scores_targ - scores_unif)
    # 0/0 divisions (target == uniform) yield NaN; treat those states as a
    # perfect score. Replaces the opaque `x != x` NaN trick with np.isnan.
    avg_per_score[np.isnan(avg_per_score)] = 1
    avg_per_score = np.mean(avg_per_score)