def train_agent(self):
    ''' Train the agent over a certain number of games. '''
    print("Training")
    for i in range(self.num_games):
        game = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - 0.2 / 2)
        game.discretize_state()                    # discretize the initial state
        end_sign = False
        while not end_sign:
            act = self.f_function(game)
            cur_idx = game.find_state()            # index of the current state
            end_sign, reward = game.simulate_one_time_step(act)   # simulate one time step
            game.discretize_state()                # discretize the resulting state
            next_idx = game.find_state()           # index of the next state
            utility_list = self.Q_table[next_idx]  # (q1, q2, q3) of the next state
            max_q_next = max(utility_list)
            # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
            self.Q_table[cur_idx][act] = (
                (1 - self.alpha_value) * self.Q_table[cur_idx][act]
                + self.alpha_value * (self.gamma_val * max_q_next + reward))
def play_game(self):
    ''' Simulate an actual game till the agent loses. '''
    # Evaluation loop; `statistics` must be imported at module level.
    num_game = 1000
    result = []
    paddle_height = 0.2
    for i in range(num_game):
        print("Round: " + str(i + 1))
        end_sign = False
        counter = 0
        game = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - paddle_height / 2)
        game.discretize_state()
        while not end_sign:
            action = self.f_function(game)
            end_sign, reward = game.simulate_one_time_step(action)
            game.discretize_state()
            if reward == 1:
                counter += 1
        print("Rebounds: " + str(counter))
        result.append(counter)
    print("mean rebounds: " + str(statistics.mean(result)))
    print("std dev of rebounds: " + str(statistics.stdev(result)))
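# The updates in the snippets above and below all follow the tabular Q-learning rule
# Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')).
# The helper below is a minimal, self-contained sketch of that rule; the names
# (q_table, alpha, gamma) and the dict-of-lists layout are illustrative assumptions,
# not the structures used by any single implementation in this file.
def q_learning_update(q_table, state, action, reward, next_state, alpha=0.1, gamma=0.9):
    # Lazily create rows of three action values, matching the 3-action Pong setup.
    q_table.setdefault(state, [0.0, 0.0, 0.0])
    q_table.setdefault(next_state, [0.0, 0.0, 0.0])
    max_q_next = max(q_table[next_state])
    q_table[state][action] = ((1 - alpha) * q_table[state][action]
                              + alpha * (reward + gamma * max_q_next))
    return q_table[state][action]

# Example usage with a made-up state encoding:
# q = {}
# q_learning_update(q, state=(3, 7, 1, 2, 5), action=1, reward=1,
#                   next_state=(4, 7, 1, 2, 5), alpha=0.5, gamma=0.9)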
def train_agent(self, should_show_gui):
    ''' Train the agent over a certain number of games. '''
    if should_show_gui:
        win = GraphWin('Pong game', 500, 500)
    ball_count = 0
    for i in range(self.num_games):
        if should_show_gui:
            mdpInstance = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - .2 / 2, win)
        else:
            mdpInstance = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - .2 / 2, None)
        self.play_game(mdpInstance)
        ball_count += MDP.get_ball_count(mdpInstance)
    if should_show_gui:
        win.close()
    print("average: ", float(ball_count) / float(self.num_games))
def f_function(self, mdpInstance):
    '''
    Choose action based on an epsilon greedy approach
    :return action selected
    '''
    action_selected = None  # 0 for no move, 1 for up, 2 for down
    x = np.random.random()
    if x < self.epsilon_value:
        # Explore; np.random.randint's upper bound is exclusive, so high=3 covers actions 0, 1 and 2
        action_selected = np.random.randint(low=0, high=3)
    else:
        # Exploit: take the action with the largest Q-value in the discretized current state
        discrete = MDP.discretize_state(mdpInstance)
        curr_state = self.Q[:, int(discrete[0]), discrete[1], discrete[2],
                            int(discrete[3]), discrete[4]]
        max_val = -1
        for i in range(len(curr_state)):
            if curr_state[i] > max_val:
                max_val = curr_state[i]
                action_selected = i
    return action_selected
# cost.exceptObjective(1, 1)
# Define a cube state space
sC = Cube(intervals=np.asarray([[0, 10], [0, 10], [0, 10], [0, 10]]), isContinuous=False)
# Define action space
aC = Cube(intervals=np.asarray([[0, 5], [0, 5], [0, 5], [0, 5]]), isContinuous=False)
# Define exogenous noise space
nC = Cube(intervals=np.asarray([[0, 8], [0, 8], [0, 8], [0, 8]]), isContinuous=False)
# As an illustration, we define the following MDP transition kernel.
T = Transition(sC, aC)
# The MDP cost/reward function can be defined using the "Objective" class.
O = Objective(sC, aC, False, False)
# Construct a finite horizon MDP
mdp = MDP(initState=np.array([5, 5, 5, 5]),
          sSpace=sC,
          aSpace=aC,
          nSpace=nC,
          transition=T,
          objective=O,
          isFiniteHorizon=10,
          isAveCost=False,
          terminalStates=None)
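# The snippet above only constructs a finite-horizon MDP object. As a library-independent
# sketch of what solving such an MDP involves (not part of the Cube/Transition/Objective API),
# the backward induction below computes optimal values and a stage-dependent policy for a
# toy MDP given explicit transition and reward arrays.
import numpy as np

def backward_induction(P, R, horizon):
    '''P[a, s, s2]: transition probabilities, R[s, a]: immediate rewards.'''
    n_actions, n_states, _ = P.shape
    V = np.zeros(n_states)                        # values at the terminal stage
    policy = np.zeros((horizon, n_states), dtype=int)
    for t in reversed(range(horizon)):
        expected_next = P @ V                     # (n_actions, n_states): E[V(s') | s, a]
        Q = R + expected_next.T                   # (n_states, n_actions)
        policy[t] = Q.argmax(axis=1)
        V = Q.max(axis=1)
    return V, policy

# Toy example: 2 actions, 3 states, horizon 4 (all numbers made up for illustration)
# P = np.full((2, 3, 3), 1.0 / 3)
# R = np.array([[0.0, 1.0], [0.5, 0.0], [1.0, 0.2]])
# V, policy = backward_induction(P, R, horizon=4)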
import random
from math import floor
# (the Pong MDP class is assumed to be importable in this module)


class Simulator:

    def __init__(self, num_games=0, alpha_value=0, gamma_value=0, epsilon_value=0):
        '''
        Setup the Simulator with the provided values.
        :param num_games - number of games to be trained on.
        :param alpha_value - 1/alpha_value is the decay constant.
        :param gamma_value - Discount Factor.
        :param epsilon_value - Probability value for the epsilon-greedy approach.
        '''
        self.num_games = num_games
        self.epsilon_value = epsilon_value
        self.alpha_value = alpha_value
        self.gamma_val = gamma_value
        self.total_rebounds = 0
        self.mdp = MDP(0.5, 0.5, 0.03, 0.01, 0.4)
        # Q[ball_x][ball_y][velocity_x][velocity_y][paddle_y][action]; nested comprehensions
        # are used instead of [[0] * n] * m to avoid aliasing the inner lists.
        self.Q = [[[[[[0 for i in range(3)] for j in range(12)] for k in range(3)]
                    for l in range(2)] for m in range(12)] for n in range(12)]
        # R[ball_x][ball_y][velocity_x][velocity_y][paddle_y]
        self.R = [[[[[0 for i in range(12)] for j in range(3)] for k in range(2)]
                   for l in range(12)] for m in range(12)]
        self.train_agent()

    def f_function(self):
        '''
        Choose action based on an epsilon greedy approach
        :return action selected
        '''
        action_selected = None
        self.mdp.discretize_state()
        m = self.mdp
        q_row = self.Q[m.dis_ball_x][m.dis_ball_y][m.dis_velocity_x][m.dis_velocity_y][m.dis_paddle_y]
        r = random.random()
        if r > self.epsilon_value:
            # Greedy: pick the action with the largest Q-value in the current state
            curr_max = 0
            for a in range(0, 3):
                if q_row[a] >= curr_max:
                    curr_max = q_row[a]
                    action_selected = a
            # If they're all zeros, choose one at random
            if curr_max == 0:
                action_selected = floor(random.random() * 3)
        else:
            action_selected = floor(random.random() * 3)
        # Create a temporary MDP to look one time step ahead for the Q-learning update
        temp_mdp = MDP(m.ball_x, m.ball_y, m.velocity_x, m.velocity_y, m.paddle_y)
        temp_mdp.simulate_one_time_step(action_selected)
        temp_mdp.discretize_state()
        t = temp_mdp
        next_row = self.Q[t.dis_ball_x][t.dis_ball_y][t.dis_velocity_x][t.dis_velocity_y][t.dis_paddle_y]
        max_a_prime = max(next_row[0], next_row[1], next_row[2])
        # Update Q via the learning function:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (R(s) + gamma * max_a' Q(s', a'))
        reward = self.R[m.dis_ball_x][m.dis_ball_y][m.dis_velocity_x][m.dis_velocity_y][m.dis_paddle_y]
        q_row[action_selected] = ((1 - self.alpha_value) * q_row[action_selected]
                                  + self.alpha_value * (reward + self.gamma_val * max_a_prime))
        return action_selected

    def train_agent(self):
        ''' Train the agent over a certain number of games. '''
        for i in range(0, self.num_games):
            self.mdp = MDP(0.5, 0.5, 0.03, 0.01, 0.4)
            self.play_game()
        '''Testing
        print("\nAvg Rebounds: " + str(self.total_rebounds / self.num_games))
        print("\nTotal Rebounds:" + str(self.total_rebounds))
        self.total_rebounds = 0
        for i in range(0, 6):
            self.play_game()
        print("\nRebounds after training: " + str(self.total_rebounds / 5))
        #print(str(self.R))
        #print(str("\n" + str(self.Q)))
        '''

    def play_game(self):
        ''' Simulate an actual game till the agent loses. '''
        while not self.mdp.miss:
            reward = 0
            self.mdp.simulate_one_time_step(self.f_function())
            # Reward +1 when the ball reaches the paddle column and the paddle is there,
            # -1 when it reaches that column and the paddle misses.
            if self.mdp.dis_paddle_y == self.mdp.dis_ball_y and self.mdp.dis_ball_x == 11:
                reward = 1
                self.total_rebounds = self.total_rebounds + 1
            elif self.mdp.dis_paddle_y != self.mdp.dis_ball_y and self.mdp.dis_ball_x == 11:
                reward = -1
            self.R[self.mdp.dis_ball_x][self.mdp.dis_ball_y][self.mdp.dis_velocity_x][self.mdp.dis_velocity_y][self.mdp.dis_paddle_y] = reward
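# A minimal driver for the Simulator class above. The hyperparameter values are illustrative
# assumptions, not values prescribed by the original code; training starts from __init__, so
# constructing the object runs the whole experiment.
if __name__ == '__main__':
    Simulator(num_games=100000, alpha_value=0.5, gamma_value=0.99, epsilon_value=0.05)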
from random import randint, uniform
# (the Pong MDP class is assumed to be importable in this module)


class Simulator:

    def __init__(self, num_games=0, alpha_value=0, gamma_value=0, epsilon_value=0):
        '''
        Setup the Simulator with the provided values.
        :param num_games - number of games to be trained on.
        :param alpha_value - 1/alpha_value is the decay constant.
        :param gamma_value - Discount Factor.
        :param epsilon_value - Probability value for the epsilon-greedy approach.
        '''
        self.num_games = num_games
        self.epsilon_value = epsilon_value
        self.alpha_value = alpha_value
        self.gamma_value = gamma_value
        self.curState = 0
        self.d_curState = 0
        # One row of three action values per discretized state; index 10368 is the terminal "miss" state.
        self.Q = []
        for i in range(10369):
            self.Q.append([0, 0, 0])
        self.bounced = 0
        # Start training
        self.train_agent()
        print("total bounced: " + str(self.bounced))
        print("avg bounced per game: " + str(self.bounced / self.num_games))

    def f_function(self):
        '''
        Choose action based on an epsilon greedy approach
        :return action selected
        '''
        if uniform(0, 1) < self.epsilon_value:
            # Explore: pick one of the three actions at random
            action_selected = randint(0, 2)
        else:
            # Exploit: pick the action with the largest Q-value in the current state
            action_selected = 0
            self.d_curState = self.curState.discretize_state()
            if self.Q[self.d_curState][action_selected] < self.Q[self.d_curState][1]:
                action_selected = 1
            if self.Q[self.d_curState][action_selected] < self.Q[self.d_curState][2]:
                action_selected = 2
        return action_selected

    def train_agent(self):
        ''' Train the agent over a certain number of games. '''
        print("start training")
        for i in range(self.num_games):
            self.play_game()

    def play_game(self):
        ''' Simulate an actual game till the agent loses. '''
        reward = 0
        self.curState = MDP()
        self.d_curState = self.curState.discretize_state()
        while self.d_curState != 10368:
            old_State = self.d_curState
            action_selected = self.f_function()
            reward = self.curState.simulate_one_time_step(action_selected)
            if reward == 1:
                self.bounced = self.bounced + 1
            next_State = self.curState.discretize_state()
            self.updateQ(old_State, reward, action_selected, next_State)
            # Keep the loop condition in sync with the latest discretized state
            self.d_curState = next_State

    def updateQ(self, old_State, reward, action_selected, next_State):
        # Q[S][a] = (1 - alpha) * Q[S][a] + alpha * (R + gamma * max(Q[S']))
        self.Q[old_State][action_selected] = (1 - self.alpha_value) * self.Q[old_State][action_selected] \
            + self.alpha_value * (reward + self.gamma_value * max(self.Q[next_State]))
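# The docstrings above describe 1/alpha_value as a decay constant, yet updateQ applies a
# fixed learning rate. A common tabular variant (shown as an assumption, not as what the
# class above does) decays the per-(state, action) learning rate with the visit count,
# e.g. alpha_t = C / (C + N(s, a)):
from collections import defaultdict

class DecayingAlpha:
    def __init__(self, C=10.0):
        self.C = C
        self.visits = defaultdict(int)       # N(s, a)

    def __call__(self, state, action):
        self.visits[(state, action)] += 1
        return self.C / (self.C + self.visits[(state, action)])

# usage sketch: alpha_schedule = DecayingAlpha(C=20), then inside the Q update
# alpha = alpha_schedule(state, action)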
import random

import numpy
# (the Pong MDP class is assumed to be importable in this module)


class Simulator:

    def __init__(self, num_games=0, alpha_value=0, gamma_value=0, epsilon_value=0):
        '''
        Setup the Simulator with the provided values.
        :param num_games - number of games to be trained on.
        :param alpha_value - 1/alpha_value is the decay constant.
        :param gamma_value - Discount Factor.
        :param epsilon_value - Probability value for the epsilon-greedy approach.
        '''
        self.num_games = num_games
        self.epsilon_value = epsilon_value
        self.alpha_value = alpha_value
        self.gamma_val = gamma_value
        self.q_table = {}       # maps a discretized-state tuple to a list of three Q-values
        self.mdp = MDP()
        self.hits = 0
        self.train_agent()

    def f_function(self):
        '''
        Choose action based on an epsilon greedy approach
        :return action selected
        '''
        rand_num = random.random()
        action_selected = 0
        # Note: the exploration probability is hard-coded to 0.04 here rather than using self.epsilon_value.
        if rand_num < 0.04:
            action_selected = random.randint(0, 2)
        else:
            state = self.mdp.discretize_state()
            action_selected = numpy.argmax(
                numpy.array(self.get_table_val((state[0], state[1], state[2], state[3]))))
        return action_selected

    def train_agent(self):
        ''' Train the agent over a certain number of games. '''
        for i in range(self.num_games):
            self.mdp.reset()
            self.play_game()
            if i % 1000 == 0:
                print("Game ", i, " average ", int(self.hits / 1000), " hits")
                self.hits = 0

    def play_game(self):
        ''' Simulate an actual game till the agent loses. '''
        reward = 0
        while reward != -1:
            action = self.f_function()                 # pick action
            prevState = self.mdp.discretize_state()
            reward = self.mdp.simulate_one_time_step(action)
            if reward != -1:
                if reward == 1:
                    self.hits += 1
                self.update_q(prevState, self.mdp.discretize_state(), reward, action)

    def update_q(self, prev_state, state, reward, action):
        prev_key = (prev_state[0], prev_state[1], prev_state[2], prev_state[3])
        if self.q_table.get(prev_key) is None:
            self.q_table[prev_key] = [0, 0, 0]
        prev_q_val = self.q_table[prev_key][action]
        # Pick an action for the new state (epsilon-greedy) and bootstrap from its Q-value
        best_action = self.f_function()
        next_key = (state[0], state[1], state[2], state[3])
        if self.q_table.get(next_key) is None:
            self.q_table[next_key] = [0, 0, 0]
        curr_q_val = self.q_table[next_key][best_action]
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
        prev_q_val = prev_q_val + self.alpha_value * (reward + self.gamma_val * curr_q_val - prev_q_val)
        self.q_table[prev_key][action] = prev_q_val

    def get_table_val(self, index):
        ''' Returns a list of three Q-values (one per move) for the given state index. '''
        if self.q_table.get(index) is not None:
            return self.q_table[index]
        else:
            self.q_table[index] = [0, 0, 0]
            return self.q_table[index]
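# Unlike the max-based targets used elsewhere in this file, update_q above bootstraps from
# the Q-value of an action chosen by the epsilon-greedy f_function, which makes the target
# SARSA-like rather than pure Q-learning. A side-by-side sketch of the two targets, with
# illustrative names only:
def q_learning_target(reward, next_q_values, gamma):
    # off-policy: always bootstrap from the best next action
    return reward + gamma * max(next_q_values)

def sarsa_target(reward, next_q_values, next_action, gamma):
    # on-policy: bootstrap from the action the behaviour policy actually selected
    return reward + gamma * next_q_values[next_action]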
def play_game(self, mdpInstance):
    ''' Simulate an actual game till the agent loses. '''
    # Loop outline: discretize the current state, pick an action with f_function,
    # simulate one time step, discretize the resulting state, then update Q.
    didLose = False
    while didLose is False:
        prev_tuple = MDP.discretize_state(mdpInstance)
        self.arr_states.append(prev_tuple)
        prev_action = self.f_function(mdpInstance)
        shouldReward = MDP.simulate_one_time_step(mdpInstance, prev_action)
        new_tuple = MDP.discretize_state(mdpInstance)
        prev_index = (prev_action, int(prev_tuple[0]), prev_tuple[1],
                      prev_tuple[2], int(prev_tuple[3]), prev_tuple[4])
        if new_tuple[4] == 1:
            # Terminal state: the paddle missed, so the reward is -1 and there is no future value
            error = -1 + self.gamma_val * 0 - self.Q[prev_index]
            self.Q[prev_index] += self.alpha_value * error
            didLose = True
            break
        # Value of the best action in the new state
        max_state = self.Q[:, int(new_tuple[0]), new_tuple[1], new_tuple[2],
                           int(new_tuple[3]), new_tuple[4]]
        max_val = -1
        for i in range(len(max_state)):
            if max_state[i] > max_val:
                max_val = max_state[i]
        # TD update: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        reward = 1 if shouldReward else 0
        error = reward + self.gamma_val * max_val - self.Q[prev_index]
        self.Q[prev_index] += self.alpha_value * error
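# The __init__ for this numpy-backed variant is not shown in this file. From the indexing
# pattern Q[action, ball_x, ball_y, velocity_x, velocity_y, paddle_y] it is presumably a
# 6-D array with the action axis first; a plausible initialization is sketched below, with
# the grid sizes (12, 12, 2, 3, 12) assumed from the list-based variants above.
import numpy as np

def make_q_array(n_actions=3, grid=(12, 12, 2, 3, 12)):
    # One zero-initialized Q-value per (action, discretized state) pair.
    return np.zeros((n_actions,) + grid)

# e.g. self.Q = make_q_array() and self.arr_states = [] would also need to be set in __init__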