def solve(world, goalInWorld):
    solution = GridWorld(world.width, world.height)
    solution.cells = [Cell(cell) for cell in world.cells]
    goal = solution.get(goalInWorld.col, goalInWorld.line)
    goal.cost = 0
    closed = []
    opened = [goal]
    while opened:
        cell = opened.pop()
        closed.append(cell)
        for adj in solution.getAccessibleCells(cell):
            if adj.reachable:  # we ignore obstacles
                direction = Direction.fromTo(adj, cell)
                cost = cell.cost + direction.cost()
                # relax the neighbour if it has not been visited yet (cost == -1)
                # or if we found a cheaper path to it
                if adj.cost == -1 or adj.cost > cost:
                    adj.direction = direction
                    adj.cost = cost
                    opened.append(adj)
    return solution
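# Minimal usage sketch (an illustration, not from the original sources; it mirrors
# the unit tests further below): solve() floods costs outward from the goal, so
# afterwards every reachable cell stores its distance to the goal and the first
# step to take towards it. The queried cell (0, 0) is a hypothetical start.
world = GridWorld(10, 10)
goal = next(cell for cell in world.cells if cell.reachable)
solution = solve(world, goal)
start = solution.get(0, 0)
print(start.cost, start.direction)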
def setUp(self):
    self.world = GridWorld(10, 10)
    self.obstaclesProb = 0.2
    self.world.addRandomObstacles(
        math.floor(self.world.getLength() * self.obstaclesProb))
    for cell in self.world.cells:
        if cell.reachable:
            self.goal = cell
            break
def createSmallMaze(self):
    # should be GridWorldSmall()
    self.GridWorldGame = GridWorld((5, 5))
    cols = self.GridWorldGame.size[0]
    rows = self.GridWorldGame.size[1]
    self.MAZE_X = cols * 32
    self.MAZE_Y = rows * 32
    FRAME = 8
    # what happens if it's not 0 modulo 32?
    self.START_X = (self.MAX_X - cols * 32) / 2 + FRAME
    self.START_Y = (self.MAX_Y - rows * 32) / 2 + FRAME
    self.smileyPos = (self.START_X, self.START_Y)
def main():
    env = GridWorld()
    _, es1, ts1 = independentQLearning(env, lambda x: x < 100, 0)
    qList, es2, ts2 = shareStateQLearning(env, lambda x: x < 100, 0)
    iQL = plt.scatter(es1, ts1, c='red')
    ssQL = plt.scatter(es2, ts2, c='blue')
    iQL.set_label("Independent")
    ssQL.set_label("5 Predators, 2 Prey, Share State")
    plt.xlabel("Episodes")
    plt.ylabel("Cumulative TimeSteps")
    plt.legend()
    plt.show()
    env.simulateTrajectory(qList)
class Tests(unittest.TestCase):

    def setUp(self):
        self.world = GridWorld(10, 10)
        self.obstaclesProb = 0.2
        self.world.addRandomObstacles(
            math.floor(self.world.getLength() * self.obstaclesProb))
        for cell in self.world.cells:
            if cell.reachable:
                self.goal = cell
                break

    def test_runs(self):
        solution = solve(self.world, self.goal)
        print(enigmaAsStr(solution, self.goal))
def evaluate(goals, EQ):
    env = GridWorld(goals=goals, T_states=T_states)
    policy = EQ_P(EQ)
    state = env.reset()
    done = False
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G
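# Hedged usage sketch (not from the original sources): evaluate() rolls out the
# policy derived from an extended Q-function EQ for at most 100 steps and returns
# the undiscounted return G. Averaging several rollouts smooths out the
# stochasticity of env.reset(); `goals` and `EQ` are assumed to exist already.
returns = [evaluate(goals, EQ) for _ in range(20)]
print("mean return:", sum(returns) / len(returns))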
def openMDPGUI(self):
    global w, g
    if self.checkSettingValues():
        self.master.destroy()
        df = float(self.discFactor.get())
        rews = list(map(lambda x: float(x.get()), self.rewValue))
        probs = list(map(lambda x: float(x.get()), self.probValue))
        w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID,
                        GridWorld.CELL_VOID, GridWorld.CELL_EXIT],
                       [GridWorld.CELL_VOID, GridWorld.CELL_WALL,
                        GridWorld.CELL_VOID, GridWorld.CELL_PIT],
                       [GridWorld.CELL_VOID, GridWorld.CELL_VOID,
                        GridWorld.CELL_VOID, GridWorld.CELL_VOID]])
        w.setDiscountFactor(df)
        w.setRewards(rews[0], rews[1], rews[2])
        w.setProbabilities(probs[0], probs[1], probs[2], probs[3])
        g = MDPGUI(w)
def objectiveFunction(args):
    learning_rate, min_epsilon, max_epsilon, epsilon_decay, discount_factor = args
    num_of_episodes = 500
    max_steps = 1000
    environment = GridWorld()
    agentQ = Q_Agent(environment,
                     epsilon=max_epsilon,
                     learning_rate=learning_rate,
                     discount_factor=discount_factor)
    train(environment, agentQ,
          episodes=num_of_episodes,
          max_steps_per_episode=max_steps,
          min_epsilon=min_epsilon,
          max_epsilon=max_epsilon,
          epsilon_decay=epsilon_decay)
    mean_reward = test(environment, agentQ, episodes=1000)
    value_map = np.zeros((environment.height, environment.width))
    for x in range(environment.height):
        for y in range(environment.width):
            q_values_of_state = agentQ.q_table[(x, y)]
            maxValue = max(q_values_of_state.values())
            value_map[x, y] = maxValue
    if save:
        utils.plotValueFunction(value_map, os.path.join(save_path, 'heatmap.jpg'))
    return -mean_reward
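# Hedged sketch (an assumption, not shown in the original code): an objective of
# this shape, taking a flat list of hyper-parameters and returning the negated
# mean reward, is typically minimised with a black-box optimiser such as
# scikit-optimize's gp_minimize. The search ranges below are illustrative only.
from skopt import gp_minimize
from skopt.space import Real

search_space = [
    Real(0.01, 1.0, name='learning_rate'),
    Real(0.001, 0.1, name='min_epsilon'),
    Real(0.5, 1.0, name='max_epsilon'),
    Real(0.001, 0.1, name='epsilon_decay'),
    Real(0.8, 0.999, name='discount_factor'),
]
result = gp_minimize(objectiveFunction, search_space, n_calls=25)
print("best hyper-parameters:", result.x, "best mean reward:", -result.fun)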
def gridworld():
    '''Create the complete discrete environment for MDP modelling (InSpace tiled),
    including rewards and transition probabilities.'''
    w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]],
                  discountFactor=1)
    w.setRewards(-0.04, -1, 1)
    w.setProbabilities(0.8, 0.1, 0.1, 0)
    # w.setDiscountFactor(0.6)
    return w
def baseTester():
    '''Runs a somewhat comprehensive test.'''
    try:
        import QLearner as ql
    except ImportError:
        pass

    # It is worth noting here that num_states can be 100 for any grid < 10x10
    # using the tuckerHash; we need a new hash algorithm to use a grid outside
    # those parameters.
    baseKwargs = {'num_states': 100, 'alpha': 1.0, 'gamma': 0.9, 'rar': 0.5,
                  'radr': 0.99, 'dyna': 0, 'verbose': False}

    # If you want to add your own test, add it here. Each tuple is one test:
    # (csv file, expected convergence iterations, expected policy length,
    #  kwarg modifier, test name)
    myTestList = [
        ('testEasyWorld.csv', 800, 13, {}, 'easy test'),
        ('world01.csv', 7000, 16, {}, 'Tucker Test 1'),
        ('world02.csv', 7000, 17, {}, 'Tucker Test 2'),
        ('testGridWorld.csv', 5000, 20, {}, 'Leo Base Test'),
        ('testGridWorld.csv', 18000, 20, {'alpha': .2}, 'Test Learning Rate'),
        ('testEasyWorld.csv', 700, 13, {'rar': 0.05}, 'Test Exploration'),
        ('testEasyWorld.csv', 700, 13, {'radr': 0.8}, 'Test Exploration Decay'),
        ('testGridWorld.csv', 3000, 20, {'gamma': 0.8}, 'Test Discount Rate'),
        ('testGridWorld.csv', 1100, 20, {'dyna': 100}, 'Test Dyna'),
    ]

    fdtest = myTestList[7:9]
    # for test in myTestList:
    for test in fdtest:
        print('-------------------------------')
        print(test[4])
        world = GridWorld(test[0])
        testKwargs = copy(baseKwargs)
        for k in test[3].keys():
            testKwargs[k] = test[3][k]
        print('parameters %s' % str(testKwargs))
        learner = ql.QLearner(**testKwargs)
        print(world.grid)
        myTester = QTester(world, learner)
        nIter = test[1]
        totalIter = nIter
        lastPolicyLength = 0
        # someone let me know if there's a better way to check for convergence time
        while totalIter < (test[1] * 1.4):
            myTester.nIter(nIter)
            nIter = int(.05 * test[1])
            myPolicy = myTester.getPolicy()
            policyLength = len(myPolicy)
            totalIter += nIter
            if (lastPolicyLength == policyLength) and (policyLength < 100):
                print('converged in approx %i iterations' % totalIter)
                print(policyLength, myPolicy, test[2])
                break
            lastPolicyLength = policyLength
        if (test[1] * 1.2 >= totalIter) and (policyLength == test[2]):
            print('*** TEST PASSED ***')
        else:
            print('xxx TEST FAILED xxx')
def setUp(self):
    self.n = 5
    self.p = 1
    self.gridworld = GridWorld(self.n, self.p)
    self.go_right_policy = np.ones(self.n * self.n, dtype=int)
    self.discount = 0.9
    self.large_discount = 0.2
    self.policy = np.array(
        [['TERMINAL', 'RIGHT', 'RIGHT', 'RIGHT', 'TERMINAL'],
         ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
         ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
         ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
         ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP']])
    self.policy_large_discount = np.array(
        [['TERMINAL', 'LEFT', 'RIGHT', 'RIGHT', 'TERMINAL'],
         ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
         ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
         ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
         ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP']])
def __init__(self, epsilon=0.01, greedy=False, alpha=0.1, gamma=0.95,
             visual=True, goal=(10, 8), agentPose=(1, 1, 'up'),
             showTrial=True, randomReset=False, epsilonStrat=1,
             epsilonFactor=500):
    """
    gridWorld: GridWorld object
    epsilon: value used for epsilon-greedy search
    alpha: step size
    gamma: discount factor
    """
    self.actionValues = Counter()
    self.epsilonFactor = epsilonFactor
    self.randomReset = randomReset
    self.epsilon = epsilon
    self.greedy = greedy
    self.epsilonStrat = epsilonStrat
    self.goal = goal
    self.Q = dict()
    self.gridWorld = GridWorld(goal, agentPose, visual=visual,
                               showTrial=showTrial, randomReset=randomReset)
    self.actions = self.gridWorld.getActions()
    self.Model = dict()
    self.alpha = alpha
    self.PriorityQueue = PriorityQueue()
    self.gamma = gamma
    self.exp = []
    self.rewards = dict()
    self.rewardNums = dict()
    self.predecessors = defaultdict(set)
    self.initQValues()
def start_grid_mdp():
    """Starts the program; restarts if the user wants to."""
    grid = load_grid(get_file_path())
    world = GridWorld(grid)
    move_costs = get_move_cost()
    gamma = get_gamma()
    eval_steps = get_evaluation_steps()
    MDP(world, eval_steps, gamma, move_costs)
    if start_again():
        start_grid_mdp()
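# Hedged entry-point sketch (not part of the original sources): the helpers above
# (load_grid, get_move_cost, get_gamma, ...) are assumed to prompt the user
# interactively, so the whole program starts with a single call when the module
# is executed as a script.
if __name__ == "__main__":
    start_grid_mdp()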
def buildBiasEngine(self):
    """Simulates MDPs with varying bias to build a bias inference engine."""
    print("Loading MDPs...\n")

    # Unnecessary progress bar for terminal
    bar = pyprind.ProgBar(len(self.test))
    for i in self.test:
        self.sims.append(
            GridWorld(self.grid, i, self.discount, self.tau, self.epsilon))
        bar.update()

    print("\nDone loading MDPs...")
import numpy as np
from matplotlib import pyplot as plt
import deepdish as dd

from GridWorld import GridWorld
from library import *

env = GridWorld()
T_states = [(3, 3), (3, 9), (9, 3), (9, 9),
            (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11),
            (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 7), (11, 8), (11, 9), (11, 10),
            (2, 1), (3, 1), (4, 1), (5, 1), (7, 1), (8, 1), (9, 1), (10, 1),
            (2, 11), (3, 11), (4, 11), (5, 11), (6, 11), (8, 11), (9, 11), (10, 11), (11, 11)]

###################################### Qs
BTasksQ = [[t] for t in T_states]

###################################### EQs
Bases = []
n = int(np.ceil(np.log2(len(T_states))))
m = (2**n) / 2
for i in range(n):
    Bases.append([])
    b = False
    for j in range(0, 2**n):
        if j >= len(T_states):
            break
        if b:
            Bases[i].append(1)  # 1 = True = rmax
        else:
            Bases[i].append(0)  # 0 = False = rmin
        if (j + 1) % m == 0:
from GridWorld import GridWorld
from GridWorld import GridWorldAdditive
from ValueIteration import ValueIteration

# Run Value Iteration in different Grid World environments
if __name__ == "__main__":
    gamma = 0.9
    print("Grid world Value Iteration with discounted rewards gamma = %.2f\n" % gamma)
    terminals = {(0, 3): +1, (1, 3): -1}
    gw = GridWorld((3, 4), 0.8, [(1, 1)], terminals)
    vi = ValueIteration()
    values = vi.valueIteration(gw, gamma)
    gw.printValues(values)
    qvalues = vi.getQValues(gw, values, gamma)
    gw.printQValues(qvalues)
    policy = vi.getPolicy(gw, values, gamma)
    gw.printPolicy(policy)

    reward = -0.01
    print("Grid world Value Iteration with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
    values = vi.valueIteration(gwa, 1, 100)
    gwa.printValues(values)
    qvalues = vi.getQValues(gwa, values, 1)
    gwa.printQValues(qvalues)
    policy = vi.getPolicy(gwa, values, 1)
    gwa.printPolicy(policy)

    reward = -0.04
    print("Grid World with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
def showhelp():
    hlpStr = ("Markov Decision Process Examples\n"
              " Examples:\n"
              " gridworld 1: std grid world as the book (step cost -0.04, discount factor 1)\n"
              " gridworld 2: low discount factor 0.6 (step cost -0.04)\n"
              " gridworld 3: low step cost -0.01\n"
              " gridworld 4: suicide mode (step cost -2)\n")
    print(hlpStr)
    exit()

if len(sys.argv) == 1:
    showhelp()

if sys.argv[1] == "gridworld":
    w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT],
                   [GridWorld.CELL_VOID, GridWorld.CELL_WALL, GridWorld.CELL_VOID, GridWorld.CELL_PIT],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]],
                  discountFactor=1)
    if len(sys.argv) < 3:
        mdpc = MDPChooser()
    elif sys.argv[2] == "1":
        w.setRewards(-0.04, -1, 1)
        w.setProbabilities(0.8, 0.1, 0.1, 0)
        w.setDiscountFactor(1)
        g = MDPGUI(w)
    elif sys.argv[2] == "2":
        w.setRewards(-0.04, -1, 1)
        w.setProbabilities(0.8, 0.1, 0.1, 0)
        w.setDiscountFactor(0.9)
        g = MDPGUI(w)
    elif sys.argv[2] == "3":
def __init__(self):
    self.game = GridWorld((5, 5))
    self.squareCountGrid = self.game.createSquareCount()
    self.alpha = 0.1
    self.gamma = 0.9
from GridWorld import GridWorld

g = GridWorld(3, 4)

policy = {
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',
    (1, 0): 'U', (1, 1): 'U', (1, 2): 'U', (1, 3): 'U',
    (2, 0): 'R', (2, 1): 'R', (2, 2): 'U', (2, 3): 'L',
}

def print_policy(p, g):
    for r in range(g.row):
        print('------------------')
        for c in range(g.col):
            a = p.get((r, c), ' ')
            print(' %s |' % a, end="")
        print("")

def print_value(V, g):
    for r in range(g.row):
        print('------------------')
        for c in range(g.col):
            v = V.get((r, c), 0)
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G

for t in range(len(types)):
    print("type: ", t)

    # Learning universal bounds (min and max tasks)
    env = GridWorld(goals=T_states, dense_rewards=not types[t][0])
    EQ_max, _ = Goal_Oriented_Q_learning(env, maxiter=maxiter)
    env = GridWorld(goals=T_states, goal_reward=-0.1, dense_rewards=not types[t][0])
    EQ_min, _ = Goal_Oriented_Q_learning(env, maxiter=maxiter)

    # Learning base tasks and doing composed tasks
    goals = Bases[0]
    goals = [[pos, pos] for pos in goals]
    env = GridWorld(goals=goals, dense_rewards=not types[t][0],
                    T_states=T_states if types[t][1] else goals)
    A, stats1 = Goal_Oriented_Q_learning(
        env, maxiter=maxiter, T_states=None if types[t][1] else T_states)
class TDL_solution:

    def __init__(self):
        self.game = GridWorld((5, 5))
        self.squareCountGrid = self.game.createSquareCount()
        self.alpha = 0.1
        self.gamma = 0.9

    def playTDLGame(self, startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        squares_and_returns = [(self.game.currentSquare, 0)]
        while keepPlaying:
            # policy
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            move = self.game.policyGrid[i][j]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i, j))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i = self.game.currentSquare[0]
            j = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i][j]
            squares_and_returns.append((self.game.currentSquare, theReturn))
            keepPlaying = not self.game.gameOver()
        G = 0
        self.squares_and_values = []
        for square, theReturn in reversed(squares_and_returns):
            self.squares_and_values.append((square, G))
            G = theReturn + self.game.gamma * G
        # self.squares_and_values.reverse()

    def playSarsa(self, startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        while keepPlaying:
            # policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1, j1))
                print(str(i1) + " " + str(j1) + " " + str(moves) + " " + str(move))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha * (
                theReturn + self.gamma * self.game.valueGrid[i2][j2] - self.game.valueGrid[i1][j1])
            keepPlaying = not self.game.gameOver()

    def playQLearning(self, startSquare, randomMove):
        self.game.currentSquare = startSquare
        keepPlaying = not self.game.gameOver()
        while keepPlaying:
            # policy
            i1 = self.game.currentSquare[0]
            j1 = self.game.currentSquare[1]
            move = self.game.policyGrid[i1][j1]
            # we use the best move even if random runs over it
            i3 = self.game.currentSquare[0]
            j3 = self.game.currentSquare[1]
            if randomMove < np.random.rand():
                moves = self.game.possibleMoves((i1, j1))
                print(str(i1) + " " + str(j1) + " " + str(moves) + " " + str(move))
                moves.remove(move)
                if len(moves) > 0:
                    idx = np.random.randint(0, len(moves))
                    move = moves[idx]
            # move
            self.game.move(move)
            i2 = self.game.currentSquare[0]
            j2 = self.game.currentSquare[1]
            theReturn = self.game.returnGrid[i2][j2]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha * (
                theReturn + self.gamma * self.game.valueGrid[i3][j3] - self.game.valueGrid[i1][j1])
            keepPlaying = not self.game.gameOver()

    def updateValueGrid(self):
        for t in range(len(self.squares_and_values) - 1):
            square, _ = self.squares_and_values[t]
            nextSquare, value = self.squares_and_values[t + 1]
            i1 = square[0]
            j1 = square[1]
            i2 = nextSquare[0]
            j2 = nextSquare[1]
            self.game.valueGrid[i1][j1] = self.game.valueGrid[i1][j1] + self.alpha * (
                value + self.gamma * self.game.valueGrid[i2][j2] - self.game.valueGrid[i1][j1])

    def updatePolicyGrid(self):
        # check if policy changed
        # hasChanged = False
        # if bestMove is new, set to True
        rows = self.game.size[0]
        cols = self.game.size[1]
        change = False
        for i in range(rows):
            for j in range(cols):
                if self.game.policyGrid[i][j] in [0, 1, 2, 3]:
                    self.game.currentSquare = (i, j)
                    oldMove = self.game.policyGrid[i][j]
                    self.game.policyGrid[i][j] = self.game.bestMove()
                    if oldMove != self.game.policyGrid[i][j]:
                        change = True
        return change

    def printGrids(self):
        self.game.printPolicyGrid()
        self.game.printReturnGrid()
        self.game.printValueGrid()
from Evaluation import Evaluation
from GridWorld import GridWorld
from Learning import Learning

# Specify the size of the grid world
row = 5
column = 5

LearningAgentSpan = 10    # lifespan of the learning agent
LearningTimes = 100       # number of learning runs
P = 5                     # reward
T = 10                    # number of steps to trace back
EvaluationAgentSpan = 10  # lifespan of the evaluation agent
EvaluationTimes = 100     # number of evaluation trials

grid_world = GridWorld(row, column)
grid_world.make_grid_world()

learning = Learning(grid_world.get_grid_world(), row, column)
learning.do_learning(LearningAgentSpan, LearningTimes, P, T)

evaluation = Evaluation(learning.get_grid_world(), row, column)
evaluation.evaluation(EvaluationAgentSpan, EvaluationTimes)
vehState = start

env_file = open("Environment.txt", "w")
gridWorld = CreateEnvironment()
gridWorld.create(env_file, size_row='10', size_col='10',
                 agent_row=str(vehState[0]), agent_col=str(vehState[1]),
                 goal_row=str(goal[0]), goal_col=str(goal[1]),
                 static_number='2', static_list=[0, 3, 2, 4])

env_file = open("Environment.txt", "r")
text_in_file = env_file.readline()
print(text_in_file)

grid = GridWorld(text_in_file)
gw = grid.gridDefine()

# -------------------------------------------------------
# initialize agent class and uav class
Agent = agent(vehState)

# define a model dictionary, which maps user inputs of learning model names
# to learning model functions
modelType = {
    "random": Agent.predict_Random,
    "standard": Agent.predict_Standard,
    "NN": Agent.predict_NN,
}

UAV = uav(vehState)

# initialize decision model (options = "random", "standard", or "NN")
model = "random"  # will be a user input
import copy
import hashlib
import json
import random

import numpy as np
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from torch.nn.modules.loss import SmoothL1Loss
from torch.optim import Adam

from GridWorld import GridWorld

grid_world = GridWorld()
rewards_to_plot = []
stats_ax = None
rewards_ax = None
model = None


def init_gridworld(random_player=False, random_mines=False, maze=False):
    global grid_world
    grid_world = GridWorld(random_player, random_mines, maze)


class NeuralNetwork(nn.Module):
    def __init__(self, iterations=500):
        super(NeuralNetwork, self).__init__()
from GridWorld import GridWorld
from Robot import Robot

env = GridWorld("grid-small.txt")
env.print_map()

gamma = 0.9
start = [0, 0]
agent = Robot(env, gamma)

epochs = 500
decay = 0.99
rvm_max_iter = 500
max_step = 1000
epsilon = 1
epsilon_threshold = 0.001
verbose = True
verbose_iteration = 1

steps, rewards = agent.learn(epochs, decay, rvm_max_iter, max_step,
                             epsilon, start, verbose, verbose_iteration)
path = agent.get_path(start)
print(path)
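# Hedged follow-up sketch (not in the original script): agent.learn() returns
# per-epoch step counts and rewards, so a learning curve can be plotted directly.
# The axis labels are assumptions about what those sequences contain.
from matplotlib import pyplot as plt

plt.plot(rewards)
plt.xlabel("epoch")
plt.ylabel("total reward")
plt.show()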
def __createEmptyPolicy(self):
    """we create a partial function that is undefined in all points"""
    c, r = self.world.size
    return [[(None if self.world.cellAt(x, y) == GridWorld.CELL_VOID
              else GridWorld.randomAction())
             for x in range(c)]
            for y in range(r)]
        self.drawUtilities(canvas)
        self.drawQValues(canvas)
        self.drawPolicy(canvas)


# ===========================================================================
# TEST
# ===========================================================================
if __name__ == '__main__':
    w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT],
                   [GridWorld.CELL_VOID, GridWorld.CELL_WALL, GridWorld.CELL_VOID, GridWorld.CELL_PIT],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]],
                  discountFactor=1)
    w.setRewards(-0.04, -1, 1)
    w.setProbabilities(0.8, 0.1, 0.1, 0)
    print("GridWorld-----------")
    print(w)
    print("----------------")
    print("\nPolicy----------")
    p = Policy(w)
def init_gridworld(random_player=False, random_mines=False, maze=False):
    global grid_world
    grid_world = GridWorld(random_player, random_mines, maze)
        # RL takes an action and gets the next state and reward
        _, next_state_index, reward, done = env.step(action)

        # RL chooses an action based on the next state
        next_action = RL.choose_action(str(next_state_index))

        # RL learns from this transition (s, a, r, s', a') ==> Sarsa
        RL.learn(str(state), action, reward, str(next_state_index), next_action)

        # swap state and action
        state = next_state_index
        action = next_action

        # break the while loop at the end of this episode
        if done:
            break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = GridWorld()
    RL = Sarsa(actions=list(range(env.n_actions)))

    env.after(10000, update)
    env.mainloop()
    print(RL.q_table)
from GridWorld import GridWorld

g = GridWorld(3, 4)

policy = {
    (0, 0): {'R': 1},
    (0, 1): {'R': 1},
    (0, 2): {'R': 1},
    (1, 0): {'U': 1},
    (1, 1): {'U': 1},
    (1, 2): {'U': 1},
    (1, 3): {'U': 1},
    (2, 0): {'R': 0.5, 'U': 0.5},
    (2, 1): {'R': 1
import numpy as np
import tensorflow as tf

from GridWorld import GridWorld

np.random.seed(20)
tf.set_random_seed(20)

MAX_EPISODE = 1000
MAX_EP_STEPS = 1000   # maximum time steps in one episode
GAMMA = 0.9           # reward discount in TD error
lr_actor = 0.001
lr_critic = 0.01

grid_world_h = 5
grid_world_w = 5
env = GridWorld(grid_world_h, grid_world_w)
n_features = 2
n_actions = 4


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess
        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
        self.action = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            state_layer = tf.layers.dense(
                inputs=self.state,
        self.world.draw(canvas)
        self.drawUtilities(canvas)
        self.drawQValues(canvas)
        self.drawPolicy(canvas)


# ===========================================================================
# TEST
# ===========================================================================
if __name__ == '__main__':
    w = GridWorld([[GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_PIT, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
                   [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID]],
                  discountFactor=1)
    w.setRewards(-0.04, -1, 1)
    w.setProbabilities(0.8, 0.1, 0.1, 0)
    # w.setDiscountFactor(0.9)
    # print("-GridWorld-")
    # print(w)
    # print("-----------")
    print("\n---Policy---")
        return -1

    grid_world.is_visited[x][y] = 1
    grid_world.dfs_route.append((x, y))
    random.shuffle(adjacent_nodes)
    for l in adjacent_nodes:
        if grid_world.is_visited[l[0]][l[1]] == 0:
            ret_val = random_dfs(grid_world, str(l[0]) + "," + str(l[1]))
            if ret_val == -1:
                grid_world.dfs_best_route.append((l[0], l[1]))
                return -1


def run_dfs(grid_world):
    # dfs(grid_world, grid_world.start_key)
    random_dfs(grid_world, grid_world.start_key)
    grid_world.dfs_best_route.append((grid_world.start_x, grid_world.start_y))
    grid_world.dfs_best_route = grid_world.dfs_best_route[::-1]


grid_world = GridWorld()
Functions.create_obstacles_from_hex(grid_world)
# Functions.create_random_obstacles(grid_world, 0.205)
# Functions.create_fixed_obstacles(grid_world, 6)
grid_world.scan_grid_and_generate_graph()
grid_world.print_graph()
grid_world.create_grid_ui(grid_world.m, grid_world.n,
                          (grid_world.start_x, grid_world.start_y),
                          (grid_world.end_x, grid_world.end_y),
                          grid_world.obstacles)
run_dfs(grid_world)
grid_world.move_on_given_route()
tk.mainloop()