Example 1
def solve(world, goalInWorld):
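    # Flood the grid outward from the goal: every cell ends up with its cost-to-goal and the
    # direction to move, reopening cells whenever a cheaper path is found.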
    solution = GridWorld(world.width, world.height)
    # print(solution.cells)
    solution.cells = [Cell(cell) for cell in world.cells]
    # print(solution.cells)
    goal = solution.get(goalInWorld.col, goalInWorld.line)
    goal.cost = 0
    closed = []
    opened = [goal]

    # reopen = 0

    while len(opened):
        # print(enigmaAsStr(solution, goal))
        # print("opened:", [(c.col, c.line) for c in opened])
        cell = opened.pop()
        closed.append(cell)

        # for adj in solution.getAdjacentCells(cell):
        for adj in solution.getAccessibleCells(cell):
            # print("cell", cell, "has got a adj", adj)
            if adj.reachable:  # we ignore obstacles
                direction = Direction.fromTo(adj, cell)
                cost = cell.cost + direction.cost()
                if adj.cost == -1:  # or adj.cost > cost:  # if not used yet
                    adj.direction = direction
                    adj.cost = cost
                    opened.append(adj)
                if adj.cost > cost:  # a cheaper path was found: update and reopen
                    # reopen += 1
                    # print("reopen", reopen)
                    adj.direction = direction
                    adj.cost = cost
                    opened.append(adj)
    return solution
Example 3
 def setUp(self):
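     # Build a 10x10 world with roughly 20% random obstacles and take the first reachable cell as the goal.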
     self.world = GridWorld(10, 10)
     self.obstaclesProb = 0.2
     self.world.addRandomObstacles(
         math.floor(self.world.getLength() * self.obstaclesProb))
     for cell in self.world.cells:
         if cell.reachable:
             self.goal = cell
             break
 def createSmallMaze(self):
     #should be GridWorldSmall()
     self.GridWorldGame = GridWorld((5, 5))
     cols = self.GridWorldGame.size[0]
     rows = self.GridWorldGame.size[1]
     self.MAZE_X = cols * 32
     self.MAZE_Y = rows * 32
     FRAME = 8
     self.START_X = (self.MAX_X - cols *
                     32) / 2 + FRAME  # what happens if it's not a multiple of 32?
     self.START_Y = (self.MAX_Y - rows * 32) / 2 + FRAME
     self.smileyPos = (self.START_X, self.START_Y)
Example 5
def main():
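    # Compare independent and shared-state Q-learning on the same GridWorld, plot cumulative
    # timesteps per episode, then replay the learned trajectory.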
    env = GridWorld()
    _, es1, ts1 = independentQLearning(env, lambda x: x < 100, 0)
    qList, es2, ts2 = shareStateQLearning(env, lambda x: x < 100, 0)
    iQL = plt.scatter(es1, ts1, c='red')
    ssQL = plt.scatter(es2, ts2, c='blue')
    iQL.set_label("Independent")
    ssQL.set_label("5 Predators, 2 Prey, Share State")
    plt.xlabel("Episodes")
    plt.ylabel("Cumulative TimeSteps")
    plt.legend()
    plt.show()
    env.simulateTrajectory(qList)
Example 6
class Tests(unittest.TestCase):
    def setUp(self):
        self.world = GridWorld(10, 10)
        self.obstaclesProb = 0.2
        self.world.addRandomObstacles(math.floor(self.world.getLength() * self.obstaclesProb))
        for cell in self.world.cells:
            if cell.reachable:
                self.goal = cell
                break

    def test_runs(self):
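        # Smoke test: solve() should run on the random world; print the resulting grid.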
        solution = solve(self.world, self.goal)
        print(enigmaAsStr(solution, self.goal))
Example 8
def evaluate(goals, EQ):
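    # Roll out the policy derived from EQ (via EQ_P) for at most 100 steps and return the accumulated reward G.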
    env = GridWorld(goals=goals, T_states=T_states)
    policy = EQ_P(EQ)
    state = env.reset()
    done = False
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G
Example 9
    def openMDPGUI(self):
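        # Read the settings from the dialog, build the classic 3x4 grid world (exit, wall, pit)
        # and hand it to the MDP GUI.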
        global w, g
        if self.checkSettingValues():
            self.master.destroy()

            df = float(self.discFactor.get())
            rews = list(map(lambda x: float(x.get()), self.rewValue))
            probs = list(map(lambda x: float(x.get()), self.probValue))

            w = GridWorld([[
                GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID,
                GridWorld.CELL_EXIT
            ],
                           [
                               GridWorld.CELL_VOID, GridWorld.CELL_WALL,
                               GridWorld.CELL_VOID, GridWorld.CELL_PIT
                           ],
                           [
                               GridWorld.CELL_VOID, GridWorld.CELL_VOID,
                               GridWorld.CELL_VOID, GridWorld.CELL_VOID
                           ]])
            w.setDiscountFactor(df)
            w.setRewards(rews[0], rews[1], rews[2])
            w.setProbabilities(probs[0], probs[1], probs[2], probs[3])

            g = MDPGUI(w)
Example 10
def objectiveFunction(args):
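    # Hyperparameter-search objective: train a Q-agent with the given settings and return the
    # negative mean test reward (lower is better).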

    learning_rate, min_epsilon, max_epsilon, epsilon_decay, discount_factor = args

    num_of_episodes = 500
    max_steps = 1000

    environment = GridWorld()

    agentQ = Q_Agent(environment,
                     epsilon=max_epsilon,
                     learning_rate=learning_rate,
                     discount_factor=discount_factor)

    train(environment,
          agentQ,
          episodes=num_of_episodes,
          max_steps_per_episode=max_steps,
          min_epsilon=min_epsilon,
          max_epsilon=max_epsilon,
          epsilon_decay=epsilon_decay)
    mean_reward = test(environment, agentQ, episodes=1000)

    value_map = np.zeros((environment.height, environment.width))
    for x in range(environment.height):
        for y in range(environment.width):
            q_values_of_state = agentQ.q_table[(x, y)]
            maxValue = max(q_values_of_state.values())
            value_map[x, y] = maxValue

    if save:
        utils.plotValueFunction(value_map,
                                os.path.join(save_path, 'heatmap.jpg'))

    return -mean_reward
Example 12
def gridworld():
	''' Create complete discrete environment for MDP modelling (InSpace Tiled), including Rewards and Transition probabilities'''
	w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID], 
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]], discountFactor = 1 )
	
	w.setRewards(-0.04, -1, 1)
	w.setProbabilities(0.8, 0.1, 0.1, 0)
#	w.setDiscountFactor(0.6)
	return w
Example 13
def baseTester():
    ''' runs a somewhat comprehensive test'''
    try:
        import QLearner as ql
    except:
        pass

    #it is worth noting here that num_states can be 100 for any grid < 10x10 using the tuckerHash
    #we need a new hash algo if we are to use a grid outside those parameters
    baseKwargs = {'num_states':100, 'alpha':1.0, 'gamma':0.9, 'rar':0.5, 'radr':0.99, 'dyna':0, 'verbose':False}
    '''
    if you want to add your own test, add it here. I use a tuple to describe one test; it is:
    (csv file, expected convergence iterations, expected policy length, kwarg modifier, test name)
    '''
    myTestList = [('testEasyWorld.csv', 800, 13,{}, 'easy test'),
                  ('world01.csv', 7000, 16, {}, 'Tucker Test 1'),
                  ('world02.csv', 7000, 17, {}, 'Tucker Test 2'),
                  ('testGridWorld.csv', 5000, 20, {}, 'Leo Base Test'),
                  ('testGridWorld.csv', 18000, 20, {'alpha':.2}, 'Test Learning Rate'),
                  ('testEasyWorld.csv', 700, 13, {'rar': 0.05}, 'Test Exploration'),
                  ('testEasyWorld.csv', 700, 13, {'radr': 0.8}, 'Test Exploration Decay'),
                  ('testGridWorld.csv', 3000, 20, {'gamma':0.8}, 'Test Discount Rate'),
                  ('testGridWorld.csv', 1100, 20, {'dyna':100}, 'Test Dyna'),
                  ]
    
    fdtest=myTestList[7:9]              
                  
    #for test in myTestList:
    for test in fdtest:             
        print '-------------------------------'
        print test[4]
        world = GridWorld(test[0])
        testKwargs = copy(baseKwargs)
        for k in test[3].keys():
            testKwargs[k] = test[3][k]
        print 'parameters %s' % str(testKwargs)
        learner = ql.QLearner(**testKwargs)
        print world.grid
        myTester = QTester(world, learner)
        nIter = test[1]
        totalIter = nIter
        lastPolicyLength = 0
        #someone let me know if there's a better way to check for convergence time
        while (totalIter < (test[1] * 1.4)):
           myTester.nIter(nIter)
           nIter = int(.05*test[1])
           myPolicy = myTester.getPolicy()
           policyLength = len(myPolicy)
           totalIter += nIter
           if (lastPolicyLength == policyLength) and (policyLength < 100):
              print 'converged in approx %i iterations' % totalIter
              print policyLength, myPolicy, test[2]
              break
           lastPolicyLength = policyLength
        if (test[1]*1.2 >= totalIter) and (policyLength == test[2]):
           print '*** TEST PASSED ***'
        else:
           print 'xxx TEST FAILED xxx'
Example 14
 def setUp(self):
     self.n = 5
     self.p = 1
     self.gridworld = GridWorld(self.n, self.p)
     self.go_right_policy = np.ones(self.n * self.n, dtype=int)
     self.discount = 0.9
     self.large_discount = 0.2
     self.policy = np.array(
             [['TERMINAL', 'RIGHT', 'RIGHT', 'RIGHT', 'TERMINAL'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP']])
     self.policy_large_discount = np.array(
             [['TERMINAL', 'LEFT', 'RIGHT', 'RIGHT', 'TERMINAL'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP']])
Example 15
 def __init__(self,
              epsilon=0.01,
              greedy=False,
              alpha=0.1,
              gamma=0.95,
              visual=True,
              goal=(10, 8),
              agentPose=(1, 1, 'up'),
              showTrial=True,
              randomReset=False,
              epsilonStrat=1,
              epsilonFactor=500):
     """
     gridWorld: GridWorld object
     epsilon: value used for epsilon greedy search
     alpha: step size
     gamma: discount factor
     """
     self.actionValues = Counter()
     self.epsilonFactor = epsilonFactor
     self.randomReset = randomReset
     self.epsilon = epsilon
     self.greedy = greedy
     self.epsilonStrat = epsilonStrat
     self.goal = goal
     self.Q = dict()
     self.gridWorld = GridWorld(goal,
                                agentPose,
                                visual=visual,
                                showTrial=showTrial,
                                randomReset=randomReset)
     self.actions = self.gridWorld.getActions()
     self.Model = dict()
     self.alpha = alpha
     self.PriorityQueue = PriorityQueue()
     self.gamma = gamma
     self.exp = []
     self.rewards = dict()
     self.rewardNums = dict()
     self.predecessors = defaultdict(set)
     self.initQValues()
Example 16
def start_grid_mdp():
    """
    starts the program, restarts if the user wants to
    """
    grid = load_grid(get_file_path())
    world = GridWorld(grid)
    move_costs = get_move_cost()
    gamma = get_gamma()
    eval_steps = get_evaluation_steps()
    MDP(world, eval_steps, gamma, move_costs)
    if start_again():
        start_grid_mdp()
Example 17
    def buildBiasEngine(self):
        """ 
			Simulates MDPs with varying bias to build a bias inference engine.
		"""

        print "Loading MDPs...\n"

        # Unnecessary progress bar for terminal
        bar = pyprind.ProgBar(len(self.test))
        for i in self.test:
            self.sims.append(
                GridWorld(self.grid, i, self.discount, self.tau, self.epsilon))
            bar.update()

        print "\nDone loading MDPs..."
Example 19
import numpy as np
from matplotlib import pyplot as plt
import deepdish as dd
from GridWorld import GridWorld
from library import *

env = GridWorld()
T_states = [(3, 3), (3, 9), (9, 3), (9, 9), (1, 1), (1, 2), (1, 3), (1, 4),
            (1, 5), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (11, 1), (11, 2),
            (11, 3), (11, 4), (11, 5), (11, 7), (11, 8), (11, 9), (11, 10),
            (2, 1), (3, 1), (4, 1), (5, 1), (7, 1), (8, 1), (9, 1), (10, 1),
            (2, 11), (3, 11), (4, 11), (5, 11), (6, 11), (8, 11), (9, 11),
            (10, 11), (11, 11)]

###################################### Qs
BTasksQ = [[t] for t in T_states]
###################################### EQs
Bases = []
n = int(np.ceil(np.log2(len(T_states))))
m = (2**n) // 2  # integer block length
for i in range(n):
    Bases.append([])
    b = False
    for j in range(0, 2**n):
        if j >= len(T_states):
            break
        if b:
            Bases[i].append(1)  #1=True=rmax
        else:
            Bases[i].append(0)  #0=False=rmin
        if (j + 1) % m == 0:
Example 20
from GridWorld import GridWorld
from GridWorld import GridWorldAdditive
from ValueIteration import ValueIteration

# Run Value Iteration in different Grid World environments
if __name__ == "__main__":
    gamma = 0.9
    print("Grid world Value Iteration with discounted rewards gamma = %.2f\n" % gamma)
    terminals = {(0, 3): +1, (1, 3): -1}
    gw = GridWorld((3, 4), 0.8, [(1, 1)], terminals)
    vi = ValueIteration()
    values = vi.valueIteration(gw, gamma)
    gw.printValues(values)
    qvalues = vi.getQValues(gw, values, gamma)
    gw.printQValues(qvalues)
    policy = vi.getPolicy(gw, values, gamma)
    gw.printPolicy(policy)

    reward = -0.01
    print("Grid world Value Iteration with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
    values = vi.valueIteration(gwa, 1, 100)
    gwa.printValues(values)
    qvalues = vi.getQValues(gwa, values, 1)
    gwa.printQValues(qvalues)
    policy = vi.getPolicy(gwa, values, 1)
    gwa.printPolicy(policy)
 
    reward = -0.04
    print("Grid World with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
Example 21
		hlpStr = ("Markov Decision Process Examples\n"
				  "	Examples:\n"
				  "		gridworld 1: std grid world as the book (step cost -0.04, discount factor 1)\n"
				  "		gridworld 2: low discount factor 0.6 (step cost -0.04)\n"
				  "		gridworld 3: low step cost -0.01\n"
				  "		gridworld 4: suicide mode (step cost -2)\n"
				  )
		print(hlpStr)
		exit()
	
	if len(sys.argv) == 1: showhelp()
	
	if sys.argv[1] == "gridworld":
		
		w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT], 
				   [GridWorld.CELL_VOID, GridWorld.CELL_WALL, GridWorld.CELL_VOID, GridWorld.CELL_PIT],
				   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]], discountFactor = 1 )
		
		if len(sys.argv) < 3:
			mdpc = MDPChooser()
		elif sys.argv[2] == "1":	
			w.setRewards(-0.04, -1, 1)
			w.setProbabilities(0.8, 0.1, 0.1, 0)
			w.setDiscountFactor(1)
			g = MDPGUI(w)
		elif sys.argv[2] == "2":
			w.setRewards(-0.04, -1, 1)
			w.setProbabilities(0.8, 0.1, 0.1, 0)
			w.setDiscountFactor(0.9)
			g = MDPGUI(w)
		elif sys.argv[2] == "3":
Example 23
from GridWorld import GridWorld


g = GridWorld(3,4)
policy={
    (0, 0):'R',
    (0, 1):'R',
    (0, 2):'R',
    (1, 0):'U',
    (1, 1):'U',
    (1, 2):'U',
    (1, 3):'U',
    (2, 0):'R',
    (2, 1):'R',
    (2, 2):'U',
    (2, 3):'L'
}

def print_policy(p,g):
    for r in range(g.row):
        print('------------------')
        for c in range(g.col):
            a = p.get((r,c),' ')
            print(' %s |'%a, end="")
        print("")

def print_value(V,g):
    for r in range(g.row):
        print('------------------')
        for c in range(g.col):
            v = V.get((r,c), 0)
Example 24
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G


for t in range(len(types)):
    print("type: ", t)

    # Learning universal bounds (min and max tasks)
    env = GridWorld(goals=T_states, dense_rewards=not types[t][0])
    EQ_max, _ = Goal_Oriented_Q_learning(env, maxiter=maxiter)

    env = GridWorld(goals=T_states,
                    goal_reward=-0.1,
                    dense_rewards=not types[t][0])
    EQ_min, _ = Goal_Oriented_Q_learning(env, maxiter=maxiter)

    # Learning base tasks and doing composed tasks
    goals = Bases[0]
    goals = [[pos, pos] for pos in goals]
    env = GridWorld(goals=goals,
                    dense_rewards=not types[t][0],
                    T_states=T_states if types[t][1] else goals)
    A, stats1 = Goal_Oriented_Q_learning(
        env, maxiter=maxiter, T_states=None if types[t][1] else T_states)
Example 25
class TDL_solution:
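    # Monte Carlo, Sarsa and Q-learning style value updates with greedy policy improvement on a 5x5 GridWorld.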
    def __init__(self):
        self.game = GridWorld( (5,5))
        self.squareCountGrid = self.game.createSquareCount()
        self.alpha = 0.1
        self.gamma = 0.9
    
    def playTDLGame(self,startSquare, randomMove):
        self.game.currentSquare = startSquare
        
        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare,0)]
     
        while keepPlaying:
            
            #policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]
      
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i,j))
               
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0,len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append( (self.game.currentSquare,theReturn) )
            keepPlaying = not self.game.gameOver()
        
        G = 0
        self.squares_and_values = []
        for square , theReturn in reversed(squares_and_returns):
            self.squares_and_values.append( (square,G) )
            G = theReturn + self.game.gamma*G
        #self.squares_and_values.reverse()
    
    def playSarsa(self,startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        
        while keepPlaying:
            
            #policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
      
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1,j1))
                print( str(i1) + " " + str(j1) + " " + str(moves) + " " + str(move) )
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0,len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha*(theReturn + self.gamma*self.game.valueGrid[i2][j2]- self.game.valueGrid[i1][j1] )
            keepPlaying = not self.game.gameOver()
            
    def playQLearning(self,startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        
        while keepPlaying:
            
            #policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
            
            # we use the best move even if random runs over it
            i3 = self.game.currentSquare[0]
            j3 = self.game.currentSquare[1]
      
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1,j1))
                print( str(i1) + " " + str(j1) + " " + str(moves) + " " + str(move) )
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0,len(moves))
                    move = moves[idx]
            #move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha*(theReturn + self.gamma*self.game.valueGrid[i3][j3]- self.game.valueGrid[i1][j1] )
            keepPlaying = not self.game.gameOver()
    
        
        
    def updateValueGrid(self):
        for t in range(len(self.squares_and_values) -1):
            
            square , _ = self.squares_and_values[t]
            nextSquare, value = self.squares_and_values[t+1]
            i1 = square[0]
            j1 = square[1]
            i2 = nextSquare[0]
            j2 = nextSquare[1]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha*(value + self.gamma*self.game.valueGrid[i2][j2]- self.game.valueGrid[i1][j1] )  
    
    def updatePolicyGrid(self):
        
        # check whether the policy changed: set change to True if any cell gets a new best move
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0,1,2,3]:
                    self.game.currentSquare = (i,j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change
        
        
    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
Example 26
from Evaluation import Evaluation
from GridWorld import GridWorld
from Learning import Learning

# Specify the size of the grid world
row = 5
column = 5

LearningAgentSpan = 10  # lifespan of the learning agent
LearningTimes = 100  # number of learning iterations
P = 5  # reward
T = 10  # number of steps to trace back

EvaluationAgentSpan = 10  # lifespan of the evaluation agent
EvaluationTimes = 100  # number of evaluation trials

grid_world = GridWorld(row, column)
grid_world.make_grid_world()

learning = Learning(grid_world.get_grid_world(), row, column)
learning.do_learning(LearningAgentSpan, LearningTimes, P, T)

evaluation = Evaluation(learning.get_grid_world(), row, column)
evaluation.evaluation(EvaluationAgentSpan, EvaluationTimes)
Example 27
    vehState = start
    env_file = open("Environment.txt", "w")
    gridWorld = CreateEnvironment()
    gridWorld.create(env_file,
                     size_row='10',
                     size_col='10',
                     agent_row=str(vehState[0]),
                     agent_col=str(vehState[1]),
                     goal_row=str(goal[0]),
                     goal_col=str(goal[1]),
                     static_number='2',
                     static_list=[0, 3, 2, 4])
    env_file = open("Environment.txt", "r")
    text_in_file = env_file.readline()
    print(text_in_file)
    grid = GridWorld(text_in_file)
    gw = grid.gridDefine()
    #-------------------------------------------------------

    # initialize agent class and uav class
    Agent = agent(vehState)
    # define a model dictionary, which maps user inputs of learning model names to learning model function
    modelType = {
        "random": Agent.predict_Random,
        "standard": Agent.predict_Standard,
        "NN": Agent.predict_NN
    }
    UAV = uav(vehState)

    # initialize decision model (options = "random", "standard", or "NN")
    model = "random"  # will be a user input
Example 28
import hashlib
import json
from GridWorld import GridWorld
import numpy as np
import copy

from matplotlib import pyplot as plt

import torch
from torch.nn.modules.loss import SmoothL1Loss
import torch.nn as nn
from torch.optim import Adam
import random

grid_world = GridWorld()

rewards_to_plot = []
stats_ax = None
rewards_ax = None
model = None


def init_gridworld(random_player=False, random_mines=False, maze=False):
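    # Reset the module-level grid_world with the requested player/mine/maze randomisation.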
    global grid_world
    grid_world = GridWorld(random_player, random_mines, maze)


class NeuralNetwork(nn.Module):
    def __init__(self, iterations=500):
        super(NeuralNetwork, self).__init__()
Example 29
from GridWorld import GridWorld
from Robot import Robot

env = GridWorld("grid-small.txt")
env.print_map()
gamma = 0.9

start = [0, 0]
agent = Robot(env, gamma)
epochs = 500
decay = 0.99
rvm_max_iter = 500
max_step = 1000
epsilon = 1
epsilon_threshold = 0.001
verbose = True
verbose_iteration = 1
steps, rewards = agent.learn(epochs, decay, rvm_max_iter, max_step, epsilon,  start, verbose, verbose_iteration)
path = agent.get_path(start)
print(path)
Example 30
 def __createEmptyPolicy(self):
     """we create a partial function that is undefined in all points"""
     c, r = self.world.size
     return [[(None if self.world.cellAt(x, y) == GridWorld.CELL_VOID else
               GridWorld.randomAction()) for x in range(c)]
             for y in range(r)]
Example 31
        self.drawUtilities(canvas)
        self.drawQValues(canvas)
        self.drawPolicy(canvas)


# ===========================================================================
# TEST
# ===========================================================================
if __name__ == '__main__':
    w = GridWorld([[
        GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID,
        GridWorld.CELL_EXIT
    ],
                   [
                       GridWorld.CELL_VOID, GridWorld.CELL_WALL,
                       GridWorld.CELL_VOID, GridWorld.CELL_PIT
                   ],
                   [
                       GridWorld.CELL_VOID, GridWorld.CELL_VOID,
                       GridWorld.CELL_VOID, GridWorld.CELL_VOID
                   ]],
                  discountFactor=1)
    w.setRewards(-0.04, -1, 1)
    w.setProbabilities(0.8, 0.1, 0.1, 0)
    print("GridWorld-----------")
    print(w)
    print("----------------")

    print("\nPolicy----------")
    p = Policy(w)
Example 33
            # RL take action and get next state and reward
            _, next_state_index, reward, done = env.step(action)

            # RL choose action based on next state
            next_action = RL.choose_action(str(next_state_index))

            # RL learn from this transition (s, a, r, s, a) ==> Sarsa
            RL.learn(str(state), action, reward, str(next_state_index), next_action)

            # swap state and action
            state = next_state_index
            action = next_action

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = GridWorld()
    RL = Sarsa(actions=list(range(env.n_actions)))

    env.after(10000, update)
    env.mainloop()
    print(RL.q_table)
Example 34
from GridWorld import GridWorld

g = GridWorld(3, 4)
policy = {
    (0, 0): {
        'R': 1
    },
    (0, 1): {
        'R': 1
    },
    (0, 2): {
        'R': 1
    },
    (1, 0): {
        'U': 1
    },
    (1, 1): {
        'U': 1
    },
    (1, 2): {
        'U': 1
    },
    (1, 3): {
        'U': 1
    },
    (2, 0): {
        'R': 0.5,
        'U': 0.5
    },
    (2, 1): {
        'R': 1
Example 35
import numpy as np
import tensorflow as tf

from GridWorld import GridWorld

np.random.seed(20)
tf.set_random_seed(20)

MAX_EPISODE = 1000
MAX_EP_STEPS = 1000  # maximum time step in one episode
GAMMA = 0.9  # reward discount in TD error
lr_actor = 0.001
lr_critic = 0.01

grid_world_h = 5
grid_world_w = 5
env = GridWorld(grid_world_h, grid_world_w)

n_features = 2
n_actions = 4


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess
        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
        self.action = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            state_layer = tf.layers.dense(
                inputs=self.state,
Example 36
		self.world.draw(canvas)
		self.drawUtilities(canvas)
		self.drawQValues(canvas)
		self.drawPolicy(canvas)
	
		
#===========================================================================
# TEST
#===========================================================================
if __name__ == '__main__':

	w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID], 
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
			   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]], discountFactor = 1 )
	
	
	
	w.setRewards(-0.04, -1, 1)
	w.setProbabilities(0.8, 0.1, 0.1, 0)
#	w.setDiscountFactor(0.9)
#	print("-GridWorld-")
#	print(w)
#	print("-----------")
	
	print("\n---Policy---")
Example 38
        return -1
    grid_world.is_visited[x][y] = 1
    grid_world.dfs_route.append((x, y))
    random.shuffle(adjacent_nodes)
    for l in adjacent_nodes:
        if grid_world.is_visited[l[0]][l[1]] == 0:
            ret_val = random_dfs(grid_world, str(l[0]) + "," + str(l[1]))
            if ret_val == -1:
                grid_world.dfs_best_route.append((l[0], l[1]))
                return -1


def run_dfs(grid_world):
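    # Explore the grid with a randomised DFS from the start cell, then reverse the recorded route
    # so it runs start -> goal.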
    # dfs(grid_world, grid_world.start_key)
    random_dfs(grid_world, grid_world.start_key)
    grid_world.dfs_best_route.append((grid_world.start_x, grid_world.start_y))
    grid_world.dfs_best_route = grid_world.dfs_best_route[::-1]


grid_world = GridWorld()
Functions.create_obstacles_from_hex(grid_world)
# Functions.create_random_obstacles(grid_world, 0.205)
# Functions.create_fixed_obstacles(grid_world, 6)
grid_world.scan_grid_and_generate_graph()
grid_world.print_graph()
grid_world.create_grid_ui(grid_world.m, grid_world.n, (grid_world.start_x, grid_world.start_y),
                          (grid_world.end_x, grid_world.end_y), grid_world.obstacles)
run_dfs(grid_world)
grid_world.move_on_given_route()
tk.mainloop()