def getQPred(pStates, rl, next_move, action_not_same):
    #print("Q state:", pStates.qs, "\nLast Q:", pStates.last_qs)
    #print("\nNext move: ", next_move)
    # update the Q-table: small positive reward (0.1) when the action is unchanged, otherwise 0
    if action_not_same:
        rl.update(pStates.last_qs, next_move, pStates.qs, 0)
    else:
        rl.update(pStates.last_qs, next_move, pStates.qs, 0.1)
    return indexToAction(rl.action(pStates.qs))
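# Hedged usage sketch (not from the original source): one Q-learning step.
# `rl` is assumed to be a QLearning controller like the one built in Main(),
# and `pongStates` is assumed to expose the discretised `qs`/`last_qs` vectors.
def exampleQStep(pongStates, rl, prev_move, prev_prev_move):
    # one reading of action_not_same: whether the last two executed moves differed
    action_not_same = (prev_move != prev_prev_move)
    return getQPred(pongStates, rl, prev_move, action_not_same)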
def getMPCPred(self, pongStates):
    # if the states are valid, use nxtMPC
    if not self.checkState(pongStates):
        return 0, -1
    predY, criticalT = getTrajectory(pongStates.statek1, self.justLeftOfPaddle,
                                     self.maxIter, self.c)
    #print("\n\nPrediction: ", predY)
    if (self.learnMode):
        addState(pongStates.statek, pongStates.statek1, self.c)
    if predY is not None:
        nextU = self.nxtMPC(predY, pongStates.agent, self.m, self.y, self.u)
        #nextU = 0
        nextInd = int(round(nextU))
        #print("Action: ", nextInd)
        next_move = indexToAction(nextInd)
        return next_move, predY
    # otherwise, return zero
    else:
        return 0, -1
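# Hedged sketch (not from the original source) of the control loop outlined in
# the comments inside Main(): downsample the frame, update the state tracker,
# query the MPC and Q-learning predictors, then step the environment.
# `downsample`, `chooseAction` and `pongStates.updateStates` are assumed
# project helpers that are not defined in this file.
def exampleControlStep(env, observation, pongStates, mpc, rl, last_move, i):
    dsImg = downsample(observation)
    pongStates.updateStates(dsImg, i)
    mpcAction, predY = mpc.getMPCPred(pongStates)
    rlAction = getQPred(pongStates, rl, last_move, action_not_same=False)
    next_move = chooseAction(pongStates, rlAction, mpcAction)
    return env.step(next_move)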
# track the paddle position over a fixed control sequence
paddle = np.zeros([1, len(controlSeq)])
paddle[0][0] = 25
paddle[0][1] = 25
len(paddle)

env = gym.make("PongDeterministic-v4")  # skip 4 frames every time and no random action
observation = env.reset()
gs_agent = 92
downsample_factor = np.array([2, 4])
zfactor = 4
steps = len(controlSeq) - 1
i = 1
while i < steps:
    next_move = indexToAction(controlSeq[i])
    observation, reward, done, info = env.step(next_move)
    i = i + 1
    # allow opencv to interpret the image
    observation_rgb = cv2.cvtColor(observation, cv2.COLOR_BGR2RGB)
    # remove color
    dsNoColour = copy.deepcopy(observation_rgb[:, :, grayscale])
    # downsample
    downsampled = copy.deepcopy(
        dsNoColour[::downsample_factor[1], ::downsample_factor[0]])
    paddle[0][i] = findAgent(downsampled, gs_agent, paddle_aoi_full)
    # turn grayscale back into rgb for data visualisation purposes
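# Hedged visualisation sketch (not from the original source): plots the paddle
# y-positions recorded above against the time step, assuming matplotlib is
# available in the environment.
import matplotlib.pyplot as plt
plt.plot(paddle[0])
plt.xlabel("time step")
plt.ylabel("paddle y position (downsampled pixels)")
plt.title("Tracked paddle position over the control sequence")
plt.show()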
def Main():
    m, y, u = initGekko(alpha, beta, 25, 0)
    env = gym.make("PongDeterministic-v4")  # skip 4 frames every time and no random action
    observation = env.reset()
    found = False

    sqlite3.register_adapter(np.int32, lambda val: int(val))
    init_PT_db(db_file_name)
    d = getcwd() + "\\Database\\" + db_file_name  # get path to db
    c = create_connection(d)

    observation, reward, done, info = env.step(random.choice(actions))

    # TODO remove this ->
    aoi_ds = np.array([[8, 12], [72, 48]])  # in (x, y), (x, y) format
    # with a y downsample factor of 4 the ball is always found and is always 1 pixel;
    # at 3 it is sometimes 2 pixels in the y direction; at 5 it sometimes disappears

    # Debug statements
    print("Area of Interest:\n", aoi_orig)
    print("Downsampled aoi:\n", aoi_ds)
    print("Paddle area:\n", paddle_aoi_full)

    # initialise some variables
    i = 0                 # the time step of the episode we are currently on
    found3 = False        # whether we have found the last three positions of the ball
    next_move = 0         # the next move our agent will take
    episode_rewards = []  # a list of rewards from previous episodes
    reward_sum = 0        # the sum of rewards from the current episode
    episode_number = 1    # the episode number we are on
    last_action = deque(maxlen=3)
    last_action.append(0)
    last_action.append(0)
    last_action.append(0)
    if visualMode:
        grayscale = 0
    else:
        grayscale = 2
    paddle_aoi = copy.deepcopy(paddle_aoi_full)
    foundAgent = False
    foundBall = False

    # qlearning stuff
    index_q = 0
    controller = QLearning(ball_x=82, ball_y=82, ai_pos_y=84, v_x=11, v_y=11,
                           n_action=3)
    qstate = [0, 0, 0, 0, 0]
    game_actions = []
    game_states = []

    # wait until the ball and other player is in the game
    while episode_number <= numOfEps:
        #####################################
        # hopefully the loop will look like this
        # dsImg = downsample(observation)
        # pongStates.updateStates(dsImg, i)
        # mpcAction = mpc.getMPCPred(pongStates)
        # rlAction = rl.getQPred(pongStates)
        # next_move = chooseAction(pongStates, rlAction, mpcAction)
        # observation, reward, done, info = env.step(next_move)
        # show (if visual mode)
        #####################################
        #next_move = random.choice(actions)
        next_move = 0

        if (visualMode):
            # allow opencv to interpret the image
            observation = cv2.cvtColor(observation, cv2.COLOR_BGR2RGB)
        # remove color
        dsNoColour = copy.deepcopy(observation[:, :, grayscale])
        # downsample
        downsampled = copy.deepcopy(
            dsNoColour[::downsample_factor[1], ::downsample_factor[0]])
        # turn grayscale back into rgb for data visualisation purposes
        ds_nc_u = cv2.cvtColor(downsampled, cv2.COLOR_GRAY2RGB)

        # if we found the ball in the last frame, we only check a small area
        # around the ball, rather than the whole frame
        if foundBall:
            foundBall, pos = findBall(downsampled, gs_ball, aoi)
        # otherwise check the whole downsampled frame
        else:
            foundBall, pos = findBall(downsampled, gs_ball, aoi_ds)

        #print(downsampled[paddlePos[0,1]:paddlePos[1,1], paddlePos[0,0]:paddlePos[1,0]])
        if foundAgent:
            paddlePos = findAgent(downsampled, gs_agent, paddle_aoi)
        else:
            paddlePos = findAgent(downsampled, gs_agent, paddle_aoi_full)

        # if we have found the ball in this time step
        if foundBall:
            # append the ball position to the queue
            last_pos.append([pos[0, 0], pos[0, 1], i])
            # check if we have found the last three positions consecutively
            if len(last_pos) > 2:
                if (last_pos[2][2] - last_pos[1][2] == 1) and (last_pos[1][2] - last_pos[0][2] == 1):
                    found3 = True
                else:
                    found3 = False
            else:
                found3 = False
            #print("Position:\n", pos)
            aoi = findaoi(pos)
        else:
            #print("Could not find at timestep:", i)
            #aoi = np.empty([2,2])
            last_pos.append([-1, -1, -1])

        if paddlePos != -1:
            #print("found agent")
            foundAgent = True
            # the area to search for the paddle in the next time step
            paddle_aoi[0, 1] = paddlePos - agentBuffer
            paddle_aoi[1, 1] = paddlePos + agentBuffer
            #print("aoi before rejig:\n", aoi)
            # if this is outside the bounds of the aoi of the whole downsampled
            # version, clamp it, since we do not want to search outside this area
            if (paddle_aoi[0, 1] < paddle_aoi_full[0, 1]):
                paddle_aoi[0, 1] = paddle_aoi_full[0, 1]
            if (paddle_aoi[1, 1] > paddle_aoi_full[1, 1]):
                paddle_aoi[1, 1] = paddle_aoi_full[1, 1]
            #print("Next paddle aoi: ", paddle_aoi, " paddle position: ", paddlePos, " paddle aoi full: ", paddle_aoi_full)
        else:
            foundAgent = False

        if (found3):
            #print("found last three. Last position: ", last_pos)
            # we want to predict the trajectory if 1. the ball is coming towards us,
            # 2. it is past the point specified by startPredictPoint, and
            # 3. we detected the ball position during this timestep
            if (((last_pos[1][0] - last_pos[0][0]) > 0)
                    and last_pos[1][0] > startPredictPoint
                    and i == last_pos[2][2]):
                #print("Ball is coming towards us, and in our half")
                # find the velocity at the last few timesteps
                velxk1 = last_pos[2][0] - last_pos[1][0]
                velyk1 = last_pos[2][1] - last_pos[1][1]
                velxk = last_pos[1][0] - last_pos[0][0]
                velyk = last_pos[1][1] - last_pos[0][1]
                # debug statement
                #if abs(velxk1) > 6:
                #    print("last Position: ", last_pos)

                # the last two states according to our data
                statek = (last_pos[1][0].astype(int), last_pos[1][1].astype(int),
                          velxk.astype(float), velyk.astype(float))
                statek1 = (last_pos[2][0].astype(int), last_pos[2][1].astype(int),
                           velxk1.astype(float), velyk1.astype(float))
                #print("adding states\nstatek: ", statek, "\nstatek1: ", statek1)
                if learningMode:
                    # add this data to the database
                    addState(statek, statek1, c)

                # get the prediction of the ball trajectory
                ball_y, T = getTrajectory(statek1, justLeftOfPaddle, maxIter, c)

                # get all the points the ball will hit on its trajectory
                # for visualisation purposes only
                if (visualMode):
                    prediction = getTrajectoryAll(statek1, justLeftOfPaddle,
                                                  maxIter, c)
                    # plot the trajectory the agent thinks the ball will take
                    for n in prediction:
                        ds_nc_u[n[1], n[0]] = (40, 166, 255)

                #print("Prediction: ", ball_y)
                # if no prediction, do nothing
                if ball_y is None:
                    #print("No Prediction at time: ", i)
                    next_move = 0
                    last_action.append(next_move)
                # if the agent isn't found make a random choice TODO justify this
                elif (paddlePos == -1):
                    next_move = random.choice(actions)
                    # debug stuff
                    #print("Can't see agent at time: ", i)
                    #cv2.imshow("Test", downsampled[paddlePos[0,1]:paddlePos[0,0], paddlePos[1,1]:paddlePos[1,0]])
                    #if cv2.waitKey() & 0xFF == ord('q'):
                    #    break
                # this is where I need to put MPC
                else:
                    nextU = nxtMPC(ball_y, paddlePos, m, y, u)
                    nextInd = int(round(nextU))
                    next_move = indexToAction(nextInd)
                    print(nextU)

                if visualMode:
                    if ball_y is not None:
                        # if we have a trajectory guess, paint the pixel red
                        # visualisation only
                        ds_nc_u[ball_y, justLeftOfPaddle] = (0, 0, 255)

        # get the observation (image), reward, done flag and info (unused)
        observation, reward, done, info = env.step(next_move)
        if (visualMode):
            bre = showImg(ds_nc_u)
            if bre:
                break
        i = i + 1
        reward_sum += reward

        # stop the game from getting stuck in an infinite loop, or reset when an episode is done
        if done or i > 10000:
            observation = env.reset()  # reset env
            episode_rewards.append(reward_sum)  # add episode reward to reward list
            print('episode:', episode_number,
                  ' reward total was %f' % (reward_sum))  #. running mean: %f' % (reward_sum, running_reward))
            reward_sum = 0
            i = 0
            episode_number += 1

        if not visualMode:
            if i % 100 == 0:
                print("Time: ", i, ". Reward sum: ", reward_sum)

    print("committing and closing")
    c.commit()
    c.close()
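# Standard entry point so the script runs Main() when executed directly.
if __name__ == "__main__":
    Main()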