def qLearning(n, p, p1, encoder, ENClast):
    import qinitial
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]  # a = [[None, None, None], [None, None, None], [None, None, None]], the action matrix
    # size = shape of the Q matrix: [number of rows, number of columns, number of actions per state]
    size = np.shape(Q)
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so the first while check passes
    iteration = 0
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    # keep looping while the error is above the threshold (1.5 here) or while Q still equals
    # the dummy; without the second check a zero reward in the starting phase would make the
    # error read 0 and drop us out of the loop too early
    while qError(Q, Qlast) > 1.5 or Q == Qlast:
        iteration += 1
        Qlast = deepcopy(Q)
        # select a state at random every time, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # recover row and column from the state number generated by the random selector:
        # state / number-of-columns gives the row, e.g. for a 3x4 (row x column) grid
        # states 1..4 lie in row 1 (0 < temp <= 1) and states 5..8 in row 2 (1 < temp <= 2)
        temp = state / (size[1] * 1.0)
        if temp.is_integer():
            raw = int(temp) - 1
        else:
            raw = int(temp)
        # state % number-of-columns gives the column, e.g. states 1, 5, 9 -> 1 (column 0)
        # and states 2, 6, 10 -> 2 (column 1)
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # move to the randomly selected state
        time.sleep(0.3)
        # ipdb.set_trace()
        for i in range(0, 20):
            # action selection according to the selected state
            if raw == 0 and col == 0:
                action = random.choice([1, 3])
            elif raw == 0 and (col == -1 or col == size[1] - 1):
                action = random.choice([1, 2])
            elif raw == 0:
                action = random.choice([1, 2, 3])
            elif raw == size[0] - 1 and col == 0:
                action = random.choice([0, 3])
            elif raw == size[0] - 1 and (col == -1 or col == size[1] - 1):
                action = random.choice([0, 2])
            elif raw == size[0] - 1:
                action = random.choice([0, 2, 3])
            elif col == 0:
                action = random.choice([0, 1, 3])
            elif col == -1 or col == size[1] - 1:
                action = random.choice([0, 1, 2])
            else:
                action = random.randint(0, 3)  # cells where all four actions are possible
            # define nextstate according to the chosen action
            if action == 0:    # up movement
                nextstate = Q[raw - 1][col]
                rawtemp = raw - 1  # row of next step
                coltemp = col      # column of next step
            elif action == 1:  # down movement
                nextstate = Q[raw + 1][col]
                rawtemp = raw + 1
                coltemp = col
            elif action == 2:  # left movement
                nextstate = Q[raw][col - 1]
                rawtemp = raw
                coltemp = col - 1
            else:              # right movement
                # ipdb.set_trace()
                nextstate = Q[raw][col + 1]
                rawtemp = raw
                coltemp = col + 1
            '''
            _____ADD HERE____
            ACTION_PERFORMANCE FUNCTION
            UPDATE_REWARD FUNCTION
            '''
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            reward[raw][col][action] = ENC - ENClast
            # update_reward.update_reward(reward, raw, col, action, diff)
            # try executing the Q-iteration formula, catching datatype mismatches
            try:
                Q[raw][col][action] = reward[raw][col][action] + gama * max(nextstate)
                # print "Q", Q
            except TypeError as e:
                print("TypeError")
            raw = rawtemp
            col = coltemp
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
    # read the best action for each state back out of the learned Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            # ipdb.set_trace()
            a[r][c] = Q[r][c].index(max(Q[r][c]))
    # ipdb.set_trace()
    # return Q matrix, action matrix and number of iterations
    return Q, a, iteration
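# --- Reference sketches (not part of the original module) ---------------------
# The loop above relies on generateDummy() and qError(), which are defined
# elsewhere in this project.  The minimal sketches below only illustrate what
# they are assumed to do: generateDummy() returns a matrix of the same shape as
# Q filled with a placeholder so the first error check passes, and qError() sums
# the absolute element-wise difference between two Q matrices.  The project's
# own implementations may differ.

def generateDummy_sketch(Q):
    # same shape as Q, filled with None so that Q != Qlast on the first pass
    return [[[None for _ in cell] for cell in row] for row in Q]

def qError_sketch(Q, Qlast):
    # sum of absolute differences over all state-action entries; placeholder
    # (None) entries from the dummy matrix are treated as an unbounded error
    total = 0.0
    for r in range(len(Q)):
        for c in range(len(Q[r])):
            for k in range(len(Q[r][c])):
                if Qlast[r][c][k] is None:
                    return float('inf')
                total += abs(Q[r][c][k] - Qlast[r][c][k])
    return total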
def qLearning(n, reward1):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)
    bita = 0.5  # learning-rate decay exponent (kept as a float so alpha = 1/k**bita is not truncated)
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]  # initializing action matrix
    # size = shape of the Q matrix: [number of rows, number of columns, number of actions per state]
    size = np.shape(Q)
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so the first while check passes
    iteration = 0
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    # ipdb.set_trace()
    # keep looping while the error is above the threshold, Q still equals the dummy,
    # or fewer than 12 iterations have run
    while qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 12:
        iteration += 1
        Qlast = deepcopy(Q)
        # select a state at random every time, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # recover row and column from the state number (see the first version for the worked 3x4 example)
        temp = state / (size[1] * 1.0)
        if temp.is_integer():
            raw = int(temp) - 1
        else:
            raw = int(temp)
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        # ipdb.set_trace()
        # number of greedy (exploiting) picks out of the 20 inner steps grows with the iteration count
        NumOfSelAct = 100 * (1 - 1 / (np.exp(0.05 * (iteration - 2))))
        NumOfSelAct = round(NumOfSelAct * 25 / 100)
        print NumOfSelAct
        for i in range(0, 20):
            # action selection according to the selected state:
            # exploit (greedy) for the first NumOfSelAct steps, explore (random) afterwards
            if i < NumOfSelAct:
                possibleActions = action_select(raw, col, n)
                tempList = []
                for j in possibleActions:  # use j so the outer step counter i is not clobbered
                    tempList.append(Q[raw][col][j])
                action = possibleActions[tempList.index(max(tempList))]
            else:
                possibleActions = action_select(raw, col, n)
                action = random.choice(possibleActions)
            # define nextstate according to the chosen action
            if action == 0:    # up movement
                nextstate = Q[raw - 1][col]
                rawtemp = raw - 1  # row of next step
                coltemp = col      # column of next step
            elif action == 1:  # down movement
                nextstate = Q[raw + 1][col]
                rawtemp = raw + 1
                coltemp = col
            elif action == 2:  # left movement
                nextstate = Q[raw][col - 1]
                rawtemp = raw
                coltemp = col - 1
            else:              # right movement
                # ipdb.set_trace()
                nextstate = Q[raw][col + 1]
                rawtemp = raw
                coltemp = col + 1
            '''
            _____ADD HERE____
            ACTION_PERFORMANCE FUNCTION
            UPDATE_REWARD FUNCTION
            '''
            reward[raw][col][action] = reward1[raw][col][action]
            kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
            # try executing the Q-iteration formula, catching datatype mismatches
            try:
                alpha = 1 / ((kMatrix[raw][col][action]) ** bita)
                Q[raw][col][action] = (1 - alpha) * Q[raw][col][action] + \
                    alpha * (reward[raw][col][action] + gama * max(nextstate))
            except TypeError as e:
                print("TypeError")
                ipdb.set_trace()
            raw = rawtemp
            col = coltemp
    # read the best possible action for each state back out of the learned Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            # ipdb.set_trace()
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # ipdb.set_trace()
    # return Q matrix, action matrix and number of iterations
    print kMatrix
    print NumOfSelAct
    return Q, a, iteration
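# --- Reference sketch (not part of the original module) -----------------------
# Both the simulated and the hardware versions call action_select(raw, col, n)
# to get the list of moves that stay inside the n x n grid.  The sketch below
# simply reproduces the boundary logic spelled out explicitly in the first
# qLearning() version (0 = up, 1 = down, 2 = left, 3 = right); the project's own
# implementation may be organised differently.

def action_select_sketch(raw, col, n):
    possible = []
    if raw > 0:
        possible.append(0)  # up is allowed unless we are in the top row
    if raw < n - 1:
        possible.append(1)  # down is allowed unless we are in the bottom row
    if col > 0:
        possible.append(2)  # left is allowed unless we are in the first column
    if col < n - 1:
        possible.append(3)  # right is allowed unless we are in the last column
    return possible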
def qLearning(n, p, p1, encoder, ENClast):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)
    restriCount = qinitial.qinitial(n)
    bita = 0.5  # learning-rate decay exponent (kept as a float so alpha = 1/k**bita is not truncated)
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]  # initializing action matrix
    # size = shape of the Q matrix: [number of rows, number of columns, number of actions per state]
    size = np.shape(Q)
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so the first while check passes
    iteration = 0
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    global val1
    val1 = pinSetup.valueRead_ON()
    # keep looping while the error is above the threshold, Q still equals the dummy,
    # or fewer than 4*n iterations have run, and only while the stop pin reads 0
    while (qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 4 * n) and (val1 == 0):
        iteration += 1
        Qlast = deepcopy(Q)
        # select a state at random every time, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # recover row and column from the state number (see the first version for the worked 3x4 example)
        temp = state / (size[1] * 1.0)
        if temp.is_integer():
            raw = int(temp) - 1
        else:
            raw = int(temp)
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # move to the randomly selected state
        time.sleep(0.3)
        # ipdb.set_trace()
        # number of greedy (exploiting) picks out of the 20 inner steps grows with the iteration count
        NumOfSelAct = 100 * (1 - 1 / (np.exp(0.05 * (iteration - 2))))
        NumOfSelAct = round(NumOfSelAct * 25 / 100)
        for i in range(0, 20):
            # action selection according to the selected state:
            # exploit (greedy) for the first NumOfSelAct steps, explore (random) afterwards
            if i < NumOfSelAct:
                possibleActions = action_select(raw, col, n)
                tempList = []
                for j in possibleActions:
                    tempList.append(Q[raw][col][j])
                action = possibleActions[tempList.index(max(tempList))]
                print "for i", i, "selected action is", action
            else:
                possibleActions = action_select(raw, col, n)
                action = random.choice(possibleActions)
            # define nextstate according to the chosen action
            if action == 0:    # up movement
                nextstate = Q[raw - 1][col]
                rawtemp = raw - 1  # row of next step
                coltemp = col      # column of next step
            elif action == 1:  # down movement
                nextstate = Q[raw + 1][col]
                rawtemp = raw + 1
                coltemp = col
            elif action == 2:  # left movement
                nextstate = Q[raw][col - 1]
                rawtemp = raw
                coltemp = col - 1
            else:              # right movement
                # ipdb.set_trace()
                nextstate = Q[raw][col + 1]
                rawtemp = raw
                coltemp = col + 1
            '''
            _____ADD HERE____
            ACTION_PERFORMANCE FUNCTION
            UPDATE_REWARD FUNCTION
            '''
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            # quantize the encoder difference to a reward step in {-2, -1, 0, 1, 2}
            if ENC - ENClast > 0:
                diff = 1
                if ENC - ENClast > 1:
                    diff = 2
            elif ENC - ENClast < 0:
                diff = -1
                if ENC - ENClast < -1:
                    diff = -2
            else:
                diff = 0
            oldreward = reward[raw][col][action]
            # if the new reading contradicts an earlier non-zero reward, repeat the move once
            if (oldreward != 0 and diff == 0) or (np.sign(oldreward) != np.sign(diff)):
                # restriCount[raw][col][action] += 1
                # if restriCount[raw][col][action] < 3:
                print ("!! restriction applied !!")
                restriCount[raw][col][action] = 0
                gotopos.gotopos(raw, col, p, p1, n)
                time.sleep(0.3)
                ENClast = encoder.getData()
                act.playAction(action, raw, col, size[0], p, p1)
                time.sleep(0.1)
                if action == 0 or action == 1:
                    ENClast = encoder.getData()
                ENC = encoder.getData()
                diff = ENC - ENClast
            direction = pinSetup.valueRead_dir()
            reward[raw][col][action] = ((-1) ** direction) * diff
            # update_reward.update_reward(reward, raw, col, action, diff)
            kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
            # try executing the Q-iteration formula, catching datatype mismatches
            try:
                alpha = 1 / ((kMatrix[raw][col][action]) ** bita)
                Q[raw][col][action] = (1 - alpha) * Q[raw][col][action] + \
                    alpha * (reward[raw][col][action] + gama * max(nextstate))
                # print "Q", Q
            except TypeError as e:
                print("TypeError")
            raw = rawtemp
            col = coltemp
        print "iteration is", iteration
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
        val1 = pinSetup.valueRead_ON()
    if val1 == 1:
        # import os
        print "Stop"
        # os.system("shutdown now")
    # read the best possible action for each state back out of the learned Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            # ipdb.set_trace()
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # ipdb.set_trace()
    # return Q matrix, action matrix and number of iterations
    print kMatrix
    # print NumOfSelAct
    return Q, a, iteration
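# --- Learning-rate schedule illustration (not part of the original module) ----
# The update above keeps a per-state-action visit counter kMatrix and uses a
# decaying step size alpha = 1 / k**bita with bita = 0.5, so
# Q <- (1 - alpha) * Q + alpha * (reward + gama * max(Q[next state])).
# The first visit to a state-action pair therefore overwrites Q completely
# (alpha = 1), while later visits blend the new sample in ever more gently:

for k in (1, 4, 25, 100):
    print "visit", k, "-> alpha =", 1 / (k ** 0.5)
# visit 1 -> alpha = 1.0, visit 4 -> 0.5, visit 25 -> 0.2, visit 100 -> 0.1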
def qLearning(n):
    # simulated rewards for a 3x3 grid: one dict per state, keyed by action
    # (0 = up, 1 = down, 2 = left, 3 = right); None marks moves that leave the grid
    simulatedRewards = [
        [{0: None, 1: 0, 2: None, 3: 0},    # state 1
         {0: None, 1: 0, 2: 0, 3: 0},       # state 2
         {0: None, 1: 0, 2: 0, 3: None}],   # state 3
        [{0: 0, 1: 0, 2: None, 3: 0},       # state 4
         {0: 0, 1: 0, 2: 0, 3: 0},          # state 5
         {0: 0, 1: 0, 2: 0, 3: None}],      # state 6
        [{0: 0, 1: None, 2: None, 3: -1},   # state 7
         {0: 0, 1: None, 2: 1, 3: -1},      # state 8
         {0: 0, 1: None, 2: 1, 3: None}]]   # state 9
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    '''Q = [[[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],   # state 1, state 2, state 3
            [[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],   # state 4, state 5, state 6
            [[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]   # state 7, state 8, state 9'''
    kMatrix = qinitial.qinitial(n)
    Tr = qinitial.qinitial(n)  # eligibility traces
    a = v[1]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so the first while check passes
    iteration = 0
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    state = random.randint(1, size[0] * size[1])
    while qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 100:
        iteration += 1
        Qlast = deepcopy(Q)
        # on the first iteration only, recover row and column from the randomly chosen state
        # number (see the first version for the worked 3x4 example); afterwards the position
        # carries over from the previous step
        if iteration == 1:
            temp = state / (size[1] * 1.0)
            if temp.is_integer():
                raw = int(temp) - 1
            else:
                raw = int(temp)
            temp = state % size[1]
            col = temp - 1
            if col < 0:
                col = size[1] - 1
        # *** gotopos.gotopos(raw, col, p, p1, n)  # move to the selected state (hardware only)
        # *** time.sleep(0.3)
        # ipdb.set_trace()
        # epsilon-greedy action selection over the moves that stay inside the grid
        # (epsilon, bita, gama and lamda are not defined locally and must come from module scope)
        possibleActions = action_select(raw, col, n)
        probablity = epsilon_greedy_policy(Q[raw][col], possibleActions, epsilon)
        actionIndex = np.random.choice(len(probablity), p=probablity)
        action = possibleActions[actionIndex]
        '''
        -------------------------------------------------------------
        ***************REPLACED BY EPSILON GREEDY POLICY*************
        -------------------------------------------------------------
        for i in range(0, 20):  # action selection according to the selected state
            if i < NumOfSelAct:
                possibleActions = action_select(raw, col, n)
                tempList = []
                for i in possibleActions:
                    tempList.append(Q[raw][col][i])
                action = possibleActions[tempList.index(max(tempList))]
            else:
                possibleActions = action_select(raw, col, n)
                action = random.choice(possibleActions)
        -------------------------------------------------------------
        ***************REPLACED BY EPSILON GREEDY POLICY*************
        -------------------------------------------------------------
        '''
        # define nextstate according to the chosen action
        if action == 0:    # up movement
            nextstate = Q[raw - 1][col]
            rawtemp = raw - 1  # row of next step
            coltemp = col      # column of next step
        elif action == 1:  # down movement
            nextstate = Q[raw + 1][col]
            rawtemp = raw + 1
            coltemp = col
        elif action == 2:  # left movement
            nextstate = Q[raw][col - 1]
            rawtemp = raw
            coltemp = col - 1
        else:              # right movement
            # ipdb.set_trace()
            nextstate = Q[raw][col + 1]
            rawtemp = raw
            coltemp = col + 1
        '''
        _____ADD HERE____
        ACTION_PERFORMANCE FUNCTION
        UPDATE_REWARD FUNCTION
        ENClast = encoder.getData()
        act.playAction(action, raw, col, size[0], p, p1)
        time.sleep(0.1)
        # if action == 0 or action == 1:
        #     ENClast = encoder.getData()
        ENC = encoder.getData()
        reward[raw][col][action] = ENC - ENClast
        # update_reward.update_reward(reward, raw, col, action, diff)
        '''
        reward[raw][col][action] = simulatedRewards[raw][col][action]
        kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
        # try executing the Q(lambda) update, catching datatype mismatches
        try:
            alpha = 1 / ((kMatrix[raw][col][action]) ** bita)
            # TD errors: eComplement uses the value of the taken action,
            # e uses the greedy value of the current state
            eComplement = reward[raw][col][action] + gama * max(nextstate) - Q[raw][col][action]
            e = reward[raw][col][action] + gama * max(nextstate) - max(Q[raw][col])
            # decay all eligibility traces and apply the trace-weighted update
            # (the action set of the current cell is used for every cell here)
            for r in range(size[0]):
                for c in range(size[1]):
                    for actn in action_select(raw, col, n):
                        Tr[r][c][actn] = gama * lamda * Tr[r][c][actn]
                        Q[r][c][actn] = Q[r][c][actn] + alpha * Tr[r][c][actn] * e
            Q[raw][col][action] = Q[raw][col][action] + alpha * eComplement
            Tr[raw][col][action] += 1
            # print "Q", Q
        except TypeError as e:
            print("TypeError")
            # ipdb.set_trace()
        print possibleActions
        print probablity
        print "raw= ", raw, "col = ", col, "action = ", action
        raw = rawtemp
        col = coltemp
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
        print "iteration = ", iteration
        # time.sleep(0.1)
        # ipdb.set_trace()
    # read the best possible action for each state back out of the learned Q matrix
    print Tr
    print Q
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # ipdb.set_trace()
    # return Q matrix, action matrix and number of iterations
    return Q, a, iteration
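# --- Reference sketch (not part of the original module) -----------------------
# The epsilon-greedy versions call epsilon_greedy_policy(Q[raw][col],
# possibleActions, epsilon) and expect back one probability per entry of
# possibleActions, which is then fed to np.random.choice().  A minimal sketch of
# such a policy is shown below; the project's own function may break ties or
# handle unvisited actions differently.

import numpy as np

def epsilon_greedy_policy_sketch(qValues, possibleActions, epsilon):
    # spread the exploration probability evenly over the legal actions
    probability = [epsilon / len(possibleActions)] * len(possibleActions)
    # give the remaining (1 - epsilon) mass to the greedy action
    greedy = max(range(len(possibleActions)),
                 key=lambda i: qValues[possibleActions[i]])
    probability[greedy] += 1.0 - epsilon
    return probability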
def qLearning(n, p, p1, encoder, ENClast):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)
    Tr = qinitial.qinitial(n)  # eligibility traces
    a = v[1]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so the first while check passes
    iteration = 0
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    state = random.randint(1, size[0] * size[1])
    global val1
    val1 = pinSetup.valueRead_ON()
    # run until the stop pin reads 1
    while val1 == 0:
        iteration += 1
        # exploration rate decays from about 1.0 towards 0.1 as the iterations progress
        epsilon = (0.9 / (np.exp(0.05 * (iteration - 1)))) + 0.1
        Qlast = deepcopy(Q)
        # on the first iteration only, recover row and column from the randomly chosen state
        # number (see the first version for the worked 3x4 example); afterwards the position
        # carries over from the previous step
        if iteration == 1:
            temp = state / (size[1] * 1.0)
            if temp.is_integer():
                raw = int(temp) - 1
            else:
                raw = int(temp)
            temp = state % size[1]
            col = temp - 1
            if col < 0:
                col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # move to the current state
        time.sleep(0.3)
        # epsilon-greedy action selection over the moves that stay inside the grid
        # (bita, gama and lamda are not defined locally and must come from module scope)
        possibleActions = action_select(raw, col, n)
        probablity = epsilon_greedy_policy(Q[raw][col], possibleActions, epsilon)
        actionIndex = np.random.choice(len(probablity), p=probablity)
        action = possibleActions[actionIndex]
        # define nextstate according to the chosen action
        if action == 0:    # up movement
            nextstate = Q[raw - 1][col]
            rawtemp = raw - 1  # row of next step
            coltemp = col      # column of next step
        elif action == 1:  # down movement
            nextstate = Q[raw + 1][col]
            rawtemp = raw + 1
            coltemp = col
        elif action == 2:  # left movement
            nextstate = Q[raw][col - 1]
            rawtemp = raw
            coltemp = col - 1
        else:              # right movement
            nextstate = Q[raw][col + 1]
            rawtemp = raw
            coltemp = col + 1
        '''
        _____ADD HERE____
        ACTION_PERFORMANCE FUNCTION
        UPDATE_REWARD FUNCTION
        '''
        # perform the action on the hardware and read the encoder difference
        ENClast = encoder.getData()
        act.playAction(action, raw, col, size[0], p, p1)
        time.sleep(0.1)
        if action == 0 or action == 1:
            ENClast = encoder.getData()
        ENC = encoder.getData()
        diff = ENC - ENClast
        oldreward = reward[raw][col][action]
        # if the new reading contradicts an earlier non-zero reward, repeat the move once
        if (oldreward != 0 and diff == 0) or (np.sign(oldreward) != np.sign(diff) and oldreward != 0):
            print("!! restriction applied !!")
            gotopos.gotopos(raw, col, p, p1, n)
            time.sleep(0.3)
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            diff = ENC - ENClast
        direction = pinSetup.valueRead_dir()
        reward[raw][col][action] = ((-1) ** direction) * diff
        kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
        # try executing the Q(lambda) update, catching datatype mismatches
        try:
            alpha = 1 / ((kMatrix[raw][col][action]) ** bita)
            # TD errors: eComplement uses the value of the taken action,
            # e uses the greedy value of the current state
            eComplement = reward[raw][col][action] + gama * max(nextstate) - Q[raw][col][action]
            e = reward[raw][col][action] + gama * max(nextstate) - max(Q[raw][col])
            # decay all eligibility traces and apply the trace-weighted update
            for r in range(size[0]):
                for c in range(size[1]):
                    for actn in action_select(raw, col, n):
                        Tr[r][c][actn] = gama * lamda * Tr[r][c][actn]
                        Q[r][c][actn] = Q[r][c][actn] + alpha * Tr[r][c][actn] * e
            Q[raw][col][action] = Q[raw][col][action] + alpha * eComplement
            Tr[raw][col][action] += 1
        except TypeError as e:
            print("TypeError")
        print possibleActions
        print probablity
        print "raw= ", raw, "col = ", col, "action = ", action
        raw = rawtemp
        col = coltemp
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
        print "iteration = ", iteration
        val1 = pinSetup.valueRead_ON()
    # read the best possible action for each state back out of the learned Q matrix
    print Tr
    print Q
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # return Q matrix, action matrix and number of iterations
    return Q, a, iteration
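# --- Hypothetical usage (not part of the original module) ---------------------
# Every qLearning() variant returns the learned Q matrix, the greedy action
# matrix `a` (one best action index per grid cell) and the number of while-loop
# iterations.  The helper below, printPolicy, is only an illustrative name; it
# dumps the learned policy using the 0 = up, 1 = down, 2 = left, 3 = right
# convention used throughout this file.

def printPolicy(a):
    labels = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
    for r in range(len(a)):
        print " ".join(labels[a[r][c]] for c in range(len(a[r])))

# Q, a, iteration = qLearning(n, p, p1, encoder, ENClast)  # hardware objects required
# printPolicy(a)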