Esempio n. 1
0
    taskToPerform = args.task
    epsilon = args.epsilon
    verbose = args.verbose
    inputMDP = args.input
    outputPath = args.output
    optionsToLoad = args.load
    bothDirections = args.both
    num_seeds = args.num_seeds
    max_length_episode = args.max_length_ep
    num_episodes = args.num_episodes

    if not verbose:
        warnings.filterwarnings('ignore')

    # Create environment
    env = GridWorld(path=inputMDP, useNegativeRewards=False)
    numStates = env.getNumStates()
    numRows, numCols = env.getGridDimensions()

    # I may load options if I'm told so:
    loadedOptions = None
    if optionsToLoad != None:
        loadedOptions = []
        for i in xrange(len(optionsToLoad)):
            loadedOptions.append(Utils.loadOption(optionsToLoad[i]))
            plot = Plotter(outputPath, env)
            plot.plotPolicy(loadedOptions[i], str(i + 1) + '_')

    if taskToPerform == 1:  #Discover options
        optionDiscoveryThroughPVFs(env=env,
                                   epsilon=epsilon,
#         [' ','#',' ',' ',' '],
#         [' ','#', 1,'#', 10],
#         ['S',' ',' ',' ',' '],
#         [-10,-10, -10, -10, -10]]

# grid = [[' ',' ',' ',+1],
#         ['#','#',' ','#'],
#         [' ','#',' ',' '],
#         [' ','#','#',' '],
#         ['S',' ',' ',' ']]

# Three-row layout handed to GridWorld. The outer rows hold -100 entries
# between '#' corners; the middle row runs from 1 on the left, through 'S'
# and four blank cells, to 10 on the right.
# NOTE(review): presumably '#' is a wall, 'S' the start cell and the numbers
# are cell rewards -- confirm against GridWorld, which is not visible here.
grid = [
    ['#', -100, -100, -100, -100, -100, '#'],
    [1, 'S', ' ', ' ', ' ', ' ', 10],
    ['#', -100, -100, -100, -100, -100, '#'],
]

# NOTE(review): the meaning of 0.01 and -1.0 is defined by GridWorld's
# constructor (looks like a noise level and a per-step reward) -- confirm.
Grid = GridWorld(grid, 0.01, -1.0)

# --- Value iteration -------------------------------------------------------
# 0.4 equals the `discount` passed to QLearning below, so it is presumably
# the discount factor, and 100 presumably an iteration count -- TODO confirm.
# The class name is spelled exactly as it is defined elsewhere.
M = MarkovDecisonProcess(0.4, Grid, 100)
M.ValueIteration()
values, actions, qvalues = M.getValues(), M.getActions(), M.getQValues()

# One output line per key of `values`: the key, its value and its action.
for i in values.keys():
    print(f"{i} : {values[i]}, {actions[i]}")

# --- Q-learning ------------------------------------------------------------
# 50000 is presumably the number of training episodes/steps -- TODO confirm.
RL = QLearning(Grid, 50000, alpha=0.6, epsilon=0.2, discount=0.4)
RL.train()

# Rebinds values/actions/qvalues, replacing the value-iteration results.
values, actions, qvalues = RL.getValues(), RL.getActions(), RL.getQValues()

# Same report format as above, now for the learned results.
for i in values.keys():
    print(f"{i} : {values[i]}, {actions[i]}")