def __init__(self, env, gamma=.99):
    grid = EnvMDP.to_grid_matrix(env)
    reward = {}
    states = set()
    self.rows = len(grid)
    self.cols = len(grid[0])
    self.grid = grid

    # Every non-None cell becomes a state keyed by (x, y); its grid value is the reward.
    for x in range(self.cols):
        for y in range(self.rows):
            if grid[y][x] is not None:
                states.add((x, y))
                reward[(x, y)] = grid[y][x]

    self.states = states

    terminals = EnvMDP.to_position(env, letter=b'GH')
    actlist = list(range(env.action_space.n))
    transitions = EnvMDP.to_transitions(env)
    init = EnvMDP.to_position(env, letter=b'S')[0]

    MDP.__init__(self, init, actlist=actlist, terminals=terminals,
                 transitions=transitions, reward=reward, states=states,
                 gamma=gamma)
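# Illustration (standalone sketch, not from the source): the nested loop above turns a
# reward grid into a state set and a reward dict keyed by (x, y), skipping None cells.
# The toy `grid` below is a made-up example mirroring what EnvMDP.to_grid_matrix is
# assumed to return.
grid = [[0.0, None],
        [0.0, 1.0]]
states, reward = set(), {}
for x in range(len(grid[0])):
    for y in range(len(grid)):
        if grid[y][x] is not None:
            states.add((x, y))
            reward[(x, y)] = grid[y][x]
print(sorted(states))   # [(0, 0), (0, 1), (1, 1)]
print(reward)           # {(0, 0): 0.0, (0, 1): 0.0, (1, 1): 1.0}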
def __init__(self, grid, goalVals, discount=.99, tau=.01, epsilon=.001):
    MDP.__init__(self, discount=discount, tau=tau, epsilon=epsilon)

    self.goalVals = goalVals
    self.grid = grid

    self.setGridWorld()
    self.valueIteration()
    self.extractPolicy()
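# Sketch (standalone and hypothetical, not the source's valueIteration): the constructor
# above eagerly solves the MDP in __init__; a minimal value-iteration loop of that kind,
# with made-up P/R containers, looks like this.
def value_iteration(P, R, gamma=0.99, epsilon=1e-3):
    # P[s][a] is a list of (probability, next_state); R[s] is the reward for entering s.
    V = {s: 0.0 for s in P}
    while True:
        delta = 0.0
        for s in P:
            best = max(sum(p * (R[sp] + gamma * V[sp]) for p, sp in P[s][a])
                       for a in P[s])
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < epsilon:
            return V

P = {0: {"stay": [(1.0, 0)], "go": [(1.0, 1)]},
     1: {"stay": [(1.0, 1)]}}
R = {0: 0.0, 1: 1.0}
print(value_iteration(P, R))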
def __init__(self, desc=None, map_name="4x4", slip_chance=0.2): if desc is None and map_name is None: raise ValueError('Must provide either desc or map_name') elif desc is None: desc = self.MAPS[map_name] assert ''.join(desc).count('S') == 1, "this implementation supports having exactly one initial state" assert all(c in "SFHG" for c in ''.join(desc)), "all cells must be either of S, F, H or G" self.desc = desc = np.asarray(list(map(list,desc)),dtype='str') self.lastaction = None nrow, ncol = desc.shape states = [(i, j) for i in range(nrow) for j in range(ncol)] actions = ["left","down","right","up"] initial_state = states[np.array(desc == b'S').ravel().argmax()] def move(row, col, movement): if movement== 'left': col = max(col-1,0) elif movement== 'down': row = min(row+1,nrow-1) elif movement== 'right': col = min(col+1,ncol-1) elif movement== 'up': row = max(row-1,0) else: raise("invalid action") return (row, col) transition_probs = {s : {} for s in states} rewards = {s : {} for s in states} for (row,col) in states: if desc[row, col] in "GH": continue for action_i in range(len(actions)): action = actions[action_i] transition_probs[(row, col)][action] = {} rewards[(row, col)][action] = {} for movement_i in [(action_i - 1) % len(actions), action_i, (action_i + 1) % len(actions)]: movement = actions[movement_i] newrow, newcol = move(row, col, movement) prob = (1. - slip_chance) if movement == action else (slip_chance / 2.) if prob == 0: continue if (newrow, newcol) not in transition_probs[row,col][action]: transition_probs[row,col][action][newrow, newcol] = prob else: transition_probs[row, col][action][newrow, newcol] += prob if desc[newrow, newcol] == 'G': rewards[row,col][action][newrow, newcol] = 1.0 MDP.__init__(self, transition_probs, rewards, initial_state)
def __init__(self, rows, cols, definitiveness, initstate, terminals, obstacles, gamma=.9):
    self.rows = rows
    self.cols = cols
    self.definitiveness = definitiveness
    self.initstate = initstate
    self.terminals = terminals
    self.obstacles = obstacles

    stateset = set()
    for y in range(1, self.cols + 1):
        for x in range(1, self.rows + 1):
            stateset.add((x, y))

    actionset = {'up', 'down', 'right', 'left'}

    MDP.__init__(self, stateset, actionset, gamma)
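# Illustration (standalone, not from the source): the nested loops above enumerate every
# 1-indexed (x, y) cell of the grid; for rows=2, cols=3 that yields 6 states.
rows, cols = 2, 3
stateset = {(x, y) for y in range(1, cols + 1) for x in range(1, rows + 1)}
print(sorted(stateset))
# prints: [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]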