Ejemplo n.º 1
0
# Author: Marlos C. Machado

import utils
import random
import numpy as np
import environment as env
from ssr_bellman_q import SSRBellmanQ

if __name__ == "__main__":
    # Parse the command-line configuration.
    args = utils.ArgsParser.read_input_args()

    # Seed both RNGs so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Build the MDP from the problem-specification file.
    mdp = env.MDP(args.input)

    # Run the learning algorithm, one fresh learner per episode.
    for episode in range(args.num_episodes):
        learner = SSRBellmanQ(mdp, args.gamma, args.epsilon, beta=args.beta)
        steps_taken = 1
        while not mdp.is_terminal():
            learner.step()
            steps_taken += 1
        mdp.reset()
        print(episode, ",", learner.get_avg_undisc_return())
Ejemplo n.º 2
0
    # Generate a fresh map together with its reward configurations.
    info = gen.new()
    configurations = len(info['rewards'])

    # NOTE(review): original used a Python 2 print statement (a syntax error
    # under Python 3); converted to the print() function with the same
    # space-separated output, and fixed the "configuations" typo.
    print('Generating map', outer, '(', configurations, 'configurations )')
    sys.stdout.flush()

    world = info['map']
    rewards = info['rewards']
    terminal = info['terminal']
    values = []

    # Render the world as a sprite grid image for visualization.
    sprite = environment.SpriteFigure(environment.figure_library.objects,
                                      environment.figure_library.background,
                                      dim=args.sprite_dim)
    sprite.makeGrid(world, args.vis_path + str(outer) + '_sprites')

    # Solve every reward configuration with value iteration and collect
    # the resulting value maps.
    for inner in tqdm(range(configurations)):
        reward_map = rewards[inner]
        terminal_map = terminal[inner]

        mdp = environment.MDP(world, reward_map, terminal_map)
        vi = environment.ValueIteration(mdp)

        values_list, policy = vi.iterate()
        value_map = mdp.representValues(values_list)
        values.append(value_map)

    # Persist the augmented info dict (map + rewards + values) to disk.
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked the handle from an inline open()).
    info['values'] = values
    filename = os.path.join(args.save_path, str(outer) + '.p')
    with open(filename, 'wb') as f:
        pickle.dump(info, f)
Ejemplo n.º 3
0
# Evaluate predicted score maps against targets across a set of worlds.
# NOTE(review): this chunk appears to be part of a longer script --
# `predictions`, `targets`, `rewards`, `terminal`, `num_worlds`,
# `environment`, and `pipeline` are defined outside the visible region,
# and `cumulative_score`/`man` are not consumed within it (the loop body
# presumably continues past this excerpt).
cumulative_score = 0
for ind in range(num_worlds):
    pred = predictions[ind]
    targ = targets[ind]

    # Manhattan distance between the argmax locations of the predicted
    # and target maps (assumes both maps are 2-D -- TODO confirm).
    pred_max = np.unravel_index(np.argmax(pred), pred.shape)
    targ_max = np.unravel_index(np.argmax(targ), targ.shape)
    man = abs(pred_max[0] - targ_max[0]) + abs(pred_max[1] - targ_max[1])

    # pdb.set_trace()

    # Uniform map used as a baseline for score normalization.
    unif = np.ones(pred.shape)
    rew = rewards[ind]
    term = terminal[ind]

    # Score the prediction, the target, and the uniform baseline, each on
    # a freshly constructed MDP with the same rewards/terminals.
    mdp = environment.MDP(None, rew, term)
    si = pipeline.ScoreIteration(mdp, pred)
    avg_pred, scores_pred = si.iterate()

    mdp = environment.MDP(None, rew, term)
    si = pipeline.ScoreIteration(mdp, targ)
    avg_targ, scores_targ = si.iterate()

    mdp = environment.MDP(None, rew, term)
    si = pipeline.ScoreIteration(mdp, unif)
    avg_unif, scores_unif = si.iterate()

    # Element-wise fraction of the target-over-uniform improvement that
    # the prediction achieves.
    avg_per_score = np.divide(scores_pred - scores_unif,
                              scores_targ - scores_unif)
    # NaN != NaN, so this replaces NaN entries (0/0 divisions) with 1.
    avg_per_score[avg_per_score != avg_per_score] = 1
    avg_per_score = np.mean(avg_per_score)