def qLearnMain(gWorldArg, gammaArg, alphaArg, epsilonArg, printDebugStatementsFlagArg, screenWidthArg):
    """Store the run configuration in module globals, train, and print a report.

    The arguments are copied into the globals ``gWorld``, ``gamma``,
    ``alpha``, ``epsilon``, ``printDebugStatementsFlag`` and ``screenWidth``
    so that ``qLearn`` can read them.  A centred welcome banner is printed,
    ``qLearn()`` is run, the world's reward matrix, grids and optimum-policy
    Q values are printed through the corresponding helpers, and a closing
    banner is printed.
    """
    global gWorld, screenWidth, printDebugStatementsFlag
    global gamma, alpha, epsilon

    gWorld, screenWidth = gWorldArg, screenWidthArg
    printDebugStatementsFlag = printDebugStatementsFlagArg

    # Q Learning Parameters
    gamma, alpha, epsilon = gammaArg, alphaArg, epsilonArg

    # Banner text sits in a field 10 columns narrower than the screen and
    # that field is then centred across the full width.
    innerWidth = screenWidth - 10
    separator = '{:^{screenWidth}}'.format(
        '{:=^{w}}'.format('', w=innerWidth), screenWidth=screenWidth)
    greeting = '{:^{screenWidth}}'.format(
        '{:^{w}}'.format(
            'Welcome to Gold Explorer Using Reinforcement Learning - Q Learning',
            w=innerWidth),
        screenWidth=screenWidth)
    farewell = '{:^{screenWidth}}'.format(
        '{:^{w}}'.format(
            'Thank you for using Gold Explorer Using Reinforcement Learning - Q Learning',
            w=innerWidth),
        screenWidth=screenWidth)

    # The trailing '' reproduces the blank line that closes each banner.
    for bannerLine in (separator, greeting, separator, ''):
        print(bannerLine)

    qLearn()
    gWorld.printGridWorldRewardMatrix()
    printGrids(gWorld)
    gWorld.printGridWorldOptimumPolicyQValue()

    for bannerLine in (separator, farewell, separator, ''):
        print(bannerLine)
def qLearnMain(gWorldArg, gammaArg, alphaArg, epsilonArg, printDebugStatementsFlagArg, screenWidthArg):
    """Publish the run settings as module globals, then train and report.

    Copies every argument into its module-level counterpart (``gWorld``,
    ``gamma``, ``alpha``, ``epsilon``, ``printDebugStatementsFlag``,
    ``screenWidth``), prints a welcome banner, calls ``qLearn()``, prints the
    reward matrix, the grids and the optimum-policy Q values via the
    respective helpers, and finishes with a thank-you banner.
    """
    global gWorld, screenWidth, printDebugStatementsFlag
    global gamma, alpha, epsilon

    gWorld = gWorldArg
    screenWidth = screenWidthArg
    printDebugStatementsFlag = printDebugStatementsFlagArg

    # Q Learning Parameters
    gamma = gammaArg
    alpha = alphaArg
    epsilon = epsilonArg

    def bannerLine(text, innerTemplate):
        """Pad text to screenWidth - 10 with innerTemplate, then centre it."""
        padded = innerTemplate.format(text, w=screenWidth - 10)
        return '{:^{screenWidth}}'.format(padded, screenWidth=screenWidth)

    def showBanner(message):
        """Print message between two '=' rules, followed by a blank line."""
        print(bannerLine('', '{:=^{w}}'))
        print(bannerLine(message, '{:^{w}}'))
        print(bannerLine('', '{:=^{w}}'))
        print('')

    showBanner('Welcome to Gold Explorer Using Reinforcement Learning - Q Learning')

    qLearn()
    gWorld.printGridWorldRewardMatrix()
    printGrids(gWorld)
    gWorld.printGridWorldOptimumPolicyQValue()

    showBanner('Thank you for using Gold Explorer Using Reinforcement Learning - Q Learning')
def qLearn():
    """Train the module-level grid world with epsilon-greedy Q-learning.

    Reads the globals ``gWorld``, ``gamma``, ``alpha``, ``screenWidth`` and
    ``printDebugStatementsFlag``; reads and rewrites the global ``epsilon``.

    Every episode starts at ``gWorld.getGrids()[0][0]``.  On each step an
    action is obtained from ``explore`` or ``exploit`` and the current grid's
    Q value for the chosen direction is updated with

        Q(s, a) += alpha * (reward(s) + gamma * max(Q(s', .)) - Q(s, a))

    An episode ends at the end of the pass in which the current grid is a
    goal.  Training stops once ``isConverged`` accepts the Q-value snapshots
    taken before and after an episode; the iteration and episode totals are
    then printed.

    Every 100th episode ``epsilon`` is replaced by ``epsilon / (1 + epsilon)``
    and the explore/exploit sampling list is rebuilt from the new value.
    """
    global epsilon

    # Verbose tracing and the compact dot progress display are mutually
    # exclusive views selected by the same flag.
    debugOn = printDebugStatementsFlag == True
    progressOn = printDebugStatementsFlag == False

    def framed(text, innerSpec, margin=10):
        """Pad text to (screenWidth - margin) using innerSpec, then centre it."""
        padded = ('{:' + innerSpec + '{w}}').format(text, w=screenWidth - margin)
        return '{:^{screenWidth}}'.format(padded, screenWidth=screenWidth)

    # Counters
    iterationCount = 0
    episodeCount = 0

    currGrid = gWorld.getGrids()[0][0]
    epsilon_choices = randChoiceList([('explore', epsilon), ('exploit', 1 - epsilon)])

    if progressOn:
        sys.stdout.write('\n\tIterating.')

    while True:
        # Snapshot taken before the episode, to check for convergence afterwards.
        oldGridMatrixValue = getGridWorldQValues(gWorld)
        episodeCount += 1
        goalTraversedFlag = False
        if debugOn:
            sys.stdout.write(
                '\n' + framed(' Episode #' + str(episodeCount) + " ", '#^') + '\n')

        if episodeCount % 100 == 0:
            epsilon = epsilon / (1 + epsilon)
            # The sampling list is a snapshot of the weights it was built
            # from; without rebuilding it the decayed epsilon would never
            # influence the explore/exploit draw below.
            epsilon_choices = randChoiceList(
                [('explore', epsilon), ('exploit', 1 - epsilon)])
            if debugOn:
                sys.stdout.write(
                    '\n\n'
                    + framed('***Updating epsilon to:' + str(epsilon) + " ", '<')
                    + '\n')

        while True:
            if currGrid.isGoal():
                if debugOn:
                    sys.stdout.write(
                        '\n' + framed('***Goal Reached Once, setting the flag', '<') + '\n')
                goalTraversedFlag = True

            if debugOn:
                starRule = framed('', '*^')
                sys.stdout.write('\n' + starRule)
                sys.stdout.write('\n' + framed(
                    " Current Grid: " + str(currGrid.getGridName())
                    + "\tQ Value : " + str(currGrid.value)
                    + "\tReward : " + str(currGrid.gridReward), '<'))
                sys.stdout.write('\n' + starRule)

            if currGrid.isBlocked():
                # NOTE(review): nothing here moves the agent off a blocked,
                # non-goal grid, so this loop relies on the start grid and on
                # explore/exploit never yielding one - confirm.
                if debugOn:
                    sys.stdout.write('\n' + framed('Blocked Grid... Skipping', '<'))
            else:
                iterationCount += 1
                if progressOn:
                    if iterationCount % 200 == 0:
                        sys.stdout.write(".")
                    if iterationCount % 10500 == 0:
                        sys.stdout.write("\n\t")
                if debugOn:
                    sys.stdout.write(
                        '\n' + framed(' Iteration #' + str(iterationCount) + " ", '#^') + '\n')

                exploitOrExplore = random.choice(epsilon_choices)
                if exploitOrExplore == 'explore':
                    if debugOn:
                        sys.stdout.write('\n' + framed('*****Exploring', '<') + '\n')
                    nextGridDirection, nextGrid = explore(currGrid, gWorld)
                elif exploitOrExplore == 'exploit':
                    if debugOn:
                        sys.stdout.write('\n' + framed('*****Exploiting', '<') + '\n')
                    nextGridDirection, nextGrid = exploit(currGrid, gWorld)

                allQValuesOfNextGrid = [
                    nextGrid.getQLeft(),
                    nextGrid.getQRight(),
                    nextGrid.getQUp(),
                    nextGrid.getQDown(),
                ]
                maxQValueNextGrid = max(allQValuesOfNextGrid)
                if debugOn:
                    sys.stdout.write('\n' + framed(
                        "Action Chosen \t: " + nextGridDirection
                        + "\tNextGrid : (" + nextGrid.getGridName() + ")", '<'))
                    sys.stdout.write('\n' + framed(
                        "All QValues Of NextGrid : "
                        + ','.join([str(round(v, 3)) for v in allQValuesOfNextGrid])
                        + "\tMax : " + str(maxQValueNextGrid), '<'))

                # Compute the Q(s,a)
                qValofCurrGrid = getQValueforCurrGrid(currGrid, nextGridDirection)
                newQValofCurrGrid = qValofCurrGrid + alpha * (
                    currGrid.getGridReward()
                    + (gamma * maxQValueNextGrid)
                    - qValofCurrGrid)

                # Update Q Value of the current grid for the corresponding direction
                updateGridQValue(currGrid, nextGridDirection, newQValofCurrGrid)
                if debugOn:
                    sys.stdout.write('\n' + framed(
                        "Using the Q(s,a) equation, updated Grid "
                        + currGrid.getGridName() + "'s " + nextGridDirection
                        + " QValue to : " + str(newQValofCurrGrid), '<'))

                # Update current grid
                currGrid = nextGrid

            if goalTraversedFlag == True:
                if debugOn:
                    sys.stdout.write('\n' + framed("Goal Reached", '<') + '\n')
                break

        if printDebugStatementsFlag:
            printGrids(gWorld)
            print("\n\n")

        # Next episode starts again from the first grid.
        currGrid = gWorld.getGrids()[0][0]
        newGridMatrixValue = getGridWorldQValues(gWorld)
        convergedFlag = isConverged(oldGridMatrixValue, newGridMatrixValue)
        if convergedFlag == True:
            print('\n\n' + framed(
                " Total # of Iterations\t:" + str(iterationCount) + " ", '%^', margin=20))
            print('\n' + framed(
                " Total # of Episodes\t:" + str(episodeCount) + " ", '%^', margin=20) + '\n')
            break
def qLearn():
    """Train the module-level grid world with epsilon-greedy Q-learning.

    Reads the globals ``gWorld``, ``gamma``, ``alpha``, ``screenWidth`` and
    ``printDebugStatementsFlag``; reads and rewrites the global ``epsilon``.

    Every episode starts at ``gWorld.getGrids()[0][0]``.  On each step an
    action is obtained from ``explore`` or ``exploit`` and the current grid's
    Q value for the chosen direction is updated with

        Q(s, a) += alpha * (reward(s) + gamma * max(Q(s', .)) - Q(s, a))

    An episode ends at the end of the pass in which the current grid is a
    goal.  Training stops once ``isConverged`` accepts the Q-value snapshots
    taken before and after an episode; the iteration and episode totals are
    then printed.

    Every 100th episode ``epsilon`` is replaced by ``epsilon / (1 + epsilon)``
    and the explore/exploit sampling list is rebuilt from the new value.
    """
    global epsilon

    # Verbose tracing and the compact dot progress display are mutually
    # exclusive views selected by the same flag.
    debugOn = printDebugStatementsFlag == True
    progressOn = printDebugStatementsFlag == False

    def framed(text, innerSpec, margin=10):
        """Pad text to (screenWidth - margin) using innerSpec, then centre it."""
        padded = ('{:' + innerSpec + '{w}}').format(text, w=screenWidth - margin)
        return '{:^{screenWidth}}'.format(padded, screenWidth=screenWidth)

    # Counters
    iterationCount = 0
    episodeCount = 0

    currGrid = gWorld.getGrids()[0][0]
    epsilon_choices = randChoiceList([('explore', epsilon), ('exploit', 1 - epsilon)])

    if progressOn:
        sys.stdout.write('\n\tIterating.')

    while True:
        # Snapshot taken before the episode, to check for convergence afterwards.
        oldGridMatrixValue = getGridWorldQValues(gWorld)
        episodeCount += 1
        goalTraversedFlag = False
        if debugOn:
            sys.stdout.write(
                '\n' + framed(' Episode #' + str(episodeCount) + " ", '#^') + '\n')

        if episodeCount % 100 == 0:
            epsilon = epsilon / (1 + epsilon)
            # The sampling list is a snapshot of the weights it was built
            # from; without rebuilding it the decayed epsilon would never
            # influence the explore/exploit draw below.
            epsilon_choices = randChoiceList(
                [('explore', epsilon), ('exploit', 1 - epsilon)])
            if debugOn:
                sys.stdout.write(
                    '\n\n'
                    + framed('***Updating epsilon to:' + str(epsilon) + " ", '<')
                    + '\n')

        while True:
            if currGrid.isGoal():
                if debugOn:
                    sys.stdout.write(
                        '\n' + framed('***Goal Reached Once, setting the flag', '<') + '\n')
                goalTraversedFlag = True

            if debugOn:
                starRule = framed('', '*^')
                sys.stdout.write('\n' + starRule)
                sys.stdout.write('\n' + framed(
                    " Current Grid: " + str(currGrid.getGridName())
                    + "\tQ Value : " + str(currGrid.value)
                    + "\tReward : " + str(currGrid.gridReward), '<'))
                sys.stdout.write('\n' + starRule)

            if currGrid.isBlocked():
                # NOTE(review): nothing here moves the agent off a blocked,
                # non-goal grid, so this loop relies on the start grid and on
                # explore/exploit never yielding one - confirm.
                if debugOn:
                    sys.stdout.write('\n' + framed('Blocked Grid... Skipping', '<'))
            else:
                iterationCount += 1
                if progressOn:
                    if iterationCount % 200 == 0:
                        sys.stdout.write(".")
                    if iterationCount % 10500 == 0:
                        sys.stdout.write("\n\t")
                if debugOn:
                    sys.stdout.write(
                        '\n' + framed(' Iteration #' + str(iterationCount) + " ", '#^') + '\n')

                exploitOrExplore = random.choice(epsilon_choices)
                if exploitOrExplore == 'explore':
                    if debugOn:
                        sys.stdout.write('\n' + framed('*****Exploring', '<') + '\n')
                    nextGridDirection, nextGrid = explore(currGrid, gWorld)
                elif exploitOrExplore == 'exploit':
                    if debugOn:
                        sys.stdout.write('\n' + framed('*****Exploiting', '<') + '\n')
                    nextGridDirection, nextGrid = exploit(currGrid, gWorld)

                allQValuesOfNextGrid = [
                    nextGrid.getQLeft(),
                    nextGrid.getQRight(),
                    nextGrid.getQUp(),
                    nextGrid.getQDown(),
                ]
                maxQValueNextGrid = max(allQValuesOfNextGrid)
                if debugOn:
                    sys.stdout.write('\n' + framed(
                        "Action Chosen \t: " + nextGridDirection
                        + "\tNextGrid : (" + nextGrid.getGridName() + ")", '<'))
                    sys.stdout.write('\n' + framed(
                        "All QValues Of NextGrid : "
                        + ','.join([str(round(v, 3)) for v in allQValuesOfNextGrid])
                        + "\tMax : " + str(maxQValueNextGrid), '<'))

                # Compute the Q(s,a)
                qValofCurrGrid = getQValueforCurrGrid(currGrid, nextGridDirection)
                newQValofCurrGrid = qValofCurrGrid + alpha * (
                    currGrid.getGridReward()
                    + (gamma * maxQValueNextGrid)
                    - qValofCurrGrid)

                # Update Q Value of the current grid for the corresponding direction
                updateGridQValue(currGrid, nextGridDirection, newQValofCurrGrid)
                if debugOn:
                    sys.stdout.write('\n' + framed(
                        "Using the Q(s,a) equation, updated Grid "
                        + currGrid.getGridName() + "'s " + nextGridDirection
                        + " QValue to : " + str(newQValofCurrGrid), '<'))

                # Update current grid
                currGrid = nextGrid

            if goalTraversedFlag == True:
                if debugOn:
                    sys.stdout.write('\n' + framed("Goal Reached", '<') + '\n')
                break

        if printDebugStatementsFlag:
            printGrids(gWorld)
            print("\n\n")

        # Next episode starts again from the first grid.
        currGrid = gWorld.getGrids()[0][0]
        newGridMatrixValue = getGridWorldQValues(gWorld)
        convergedFlag = isConverged(oldGridMatrixValue, newGridMatrixValue)
        if convergedFlag == True:
            print('\n\n' + framed(
                " Total # of Iterations\t:" + str(iterationCount) + " ", '%^', margin=20))
            print('\n' + framed(
                " Total # of Episodes\t:" + str(episodeCount) + " ", '%^', margin=20) + '\n')
            break