import numpy as np

# Relies on module-level globals defined elsewhere in the original script:
# SS (state-space size), act_set (number of actions), episodes (number of
# training episodes), Car (a MountainCar instance), and the helpers
# weight() and Action_select().
def Q_train(alpha, gamma, epsilon, max_iterations):
    w = np.zeros((SS, act_set))  # weight matrix: one column of features per action
    b = 0                        # bias term
    Rewards = []
    for noe in range(episodes):
        state = Car.reset()
        r = 0                    # cumulative reward for this episode
        done = False
        for m in range(max_iterations):
            if done:
                break
            q_vals = weight(state, w, b)        # q(s, a) for every action
            a = Action_select(q_vals, epsilon)  # epsilon-greedy action choice
            Q = q_vals[a]
            Sprime, reward, done = Car.step(a)

            # Compute q(s', a') for the bootstrap target
            Qprime = weight(Sprime, w, b)
            Q_next = max(Qprime)

            # Gradient update on the active features and the bias
            grad = alpha * (Q - (reward + gamma * Q_next))
            for j in state.keys():
                w[j][a] = w[j][a] - grad * state[j]
            b = b - grad

            state = Sprime
            r += reward

            # Rendering: executed only every 1000 episodes to see improvements,
            # otherwise it slows the overall execution
            if noe % 1000 == 0:
                Car.render()
        Rewards.append(r)
    Car.close()
    return w, b, Rewards
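# Q_train() above assumes two helpers that are not shown: weight(state, w, b),
# which scores every action for a sparse state dictionary, and
# Action_select(q_vals, epsilon), which picks an action epsilon-greedily.
# A minimal sketch of what they could look like, inferred only from how they
# are called above; the bodies are assumptions, not the original implementation.
def weight(state, w, b):
    # Hypothetical helper: q(s, a) = sum_j w[j][a] * state[j] + b for each action,
    # where `state` is a dict mapping feature index -> feature value.
    q_vals = np.full(w.shape[1], b, dtype=float)
    for j, v in state.items():
        q_vals += w[j] * v
    return q_vals

def Action_select(q_vals, epsilon):
    # Hypothetical helper: epsilon-greedy selection over the action values.
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_vals))
    return int(np.argmax(q_vals))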
import sys
import numpy as np
# The MountainCar environment class is assumed to come from the `environment`
# module shipped with this code; the import path is an assumption.
from environment import MountainCar

def main():
    (program, mode, weight_out, returns_out, episodes,
     max_iterations, epsilon, gamma, alpha) = sys.argv
    epsilon, gamma, alpha = float(epsilon), float(gamma), float(alpha)
    episodes, max_iterations = int(episodes), int(max_iterations)

    # Output files
    w_out = open(weight_out, 'w')
    r_out = open(returns_out, 'w')

    # Initialize Mountain Car
    car = MountainCar(mode=mode)
    actions, num_actions = (0, 1, 2), 3

    # Weights: <dim(S)> by <num_actions> matrix
    w = np.zeros((car.state_space, num_actions))
    bias = 0

    # Represent the state dictionary as a dense numpy array
    def state_rep(state_dict, mode):
        if mode == "raw":
            state = np.asarray(list(state_dict.values()))
        elif mode == "tile":
            state = np.zeros(2048)
            for key in state_dict:
                state[key] = 1
        return state

    # Training episodes
    for i in range(episodes):
        num_iters = 0
        total_rewards = 0

        # Raw dictionary state, converted to a numpy array
        state_dict = car.reset()
        state = state_rep(state_dict, mode)

        while num_iters < max_iterations:
            num_iters += 1

            # Epsilon-greedy action
            action = getAction(state, actions, epsilon, w, bias)

            # Observe sample
            (next_state_dict, reward, done) = car.step(action)
            total_rewards += reward

            # Best action (and its Q-value) for the next state
            next_state = state_rep(next_state_dict, mode)
            next_best_action = getBestAction(next_state, actions, w, bias)
            next_state_best_Q = QValue(next_state, next_best_action, w, bias)

            # TD target and error
            sample = reward + (gamma * next_state_best_Q)
            diff = QValue(state, action, w, bias) - sample

            # Update weights and bias
            w[:, action] = w[:, action] - alpha * diff * state
            bias = bias - alpha * diff

            # Continue unless the episode terminated
            if not done:
                state = next_state
            else:
                break

        # Write the return for this episode
        r_out.write(str(total_rewards) + "\n")

    # Write the bias followed by the weights in row-major order
    w_out.write(str(bias) + '\n')
    for row in w:
        for elem in row:
            w_out.write(str(elem) + '\n')

    # Clean up
    car.close()
    w_out.close()
    r_out.close()
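# main() relies on three helpers that are not defined above: QValue, getBestAction,
# and getAction. A minimal sketch of what they might look like, inferred only from
# how they are called in main(); the bodies are assumptions, not the original code.
def QValue(state, action, w, bias):
    # Hypothetical helper: linear action value, q(s, a) = s . w[:, a] + bias.
    return np.dot(state, w[:, action]) + bias

def getBestAction(state, actions, w, bias):
    # Hypothetical helper: greedy action with respect to the current weights.
    return max(actions, key=lambda a: QValue(state, a, w, bias))

def getAction(state, actions, epsilon, w, bias):
    # Hypothetical helper: epsilon-greedy action selection.
    if np.random.rand() < epsilon:
        return int(np.random.choice(actions))
    return getBestAction(state, actions, w, bias)

# Expected invocation, matching the sys.argv unpacking in main() (the script name
# and argument names below are placeholders):
#   python q_learning.py <mode> <weight_out> <returns_out> <episodes> \
#       <max_iterations> <epsilon> <gamma> <alpha>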
import numpy as np
# The MountainCar environment class is assumed to come from the `environment`
# module shipped with this code; the import path is an assumption.
from environment import MountainCar

# Module-level debug flag used in run(); it is not shown in the original, so it is
# defined here with a default to keep the class self-contained.
Debug = False

class qlearning(object):
    def __init__(self, mode, epsilon, gamma, learning_rate):
        self.epsilon = epsilon
        self.gamma = gamma
        self.lr = learning_rate
        self.mode = mode
        self.env = MountainCar(mode)
        self.state_space = self.env.state_space
        self.action_space = 3
        self.W = np.zeros((self.state_space, self.action_space))
        self.b = 0

    # Given the current state, approximate the action values q(s, a) for all actions
    def linear_approx(self, state):
        return state.dot(self.W) + self.b

    # Choose an action with the epsilon-greedy method
    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            # With probability epsilon, select uniformly at random from the 3 actions (0, 1, 2)
            return np.random.randint(0, self.action_space)
        else:
            # With probability 1 - epsilon, select the greedy action;
            # ties are broken by taking the first maximum
            return np.argmax(self.linear_approx(state))

    # Convert the environment's dict state into a dense numpy array
    def transfer_state(self, state):
        if self.mode == "raw":
            return np.fromiter(state.values(), dtype=float)
        elif self.mode == "tile":
            idx = sorted(state.keys())
            trans_state = np.zeros(self.state_space)
            trans_state[idx] = 1
            return trans_state
        else:
            print("Error: unknown mode.")
            return

    def run(self, weight_out, returns_out, episodes, max_iterations):
        with open(returns_out, 'w') as f_returns:
            # Perform training
            for episode in range(episodes):
                rewards = 0
                state = self.transfer_state(self.env.reset())
                if Debug:
                    print("episode " + str(episode) + " init state: ", end="")
                    print(state)
                for i in range(max_iterations):
                    # Step the environment
                    action = self.select_action(state)
                    next_state, reward, done = self.env.step(action)
                    next_state = self.transfer_state(next_state)
                    if Debug and i % 100 == 0:
                        print("episode " + str(episode) + " iter " + str(i) +
                              ", action: " + str(action) + " next state: ", end="")
                        print(next_state)

                    # TD error, then update of the weight column for the chosen action
                    cur_q = self.linear_approx(state)
                    next_q = self.linear_approx(next_state)
                    td_error = cur_q[action] - (reward + self.gamma * np.max(next_q))
                    self.W[:, action] = self.W[:, action] - self.lr * td_error * state
                    # Update the bias
                    self.b = self.b - self.lr * td_error

                    state = next_state
                    rewards += reward
                    if done:
                        break
                f_returns.write(str(rewards) + "\n")
                if Debug:
                    print("[episode ", episode + 1, "] total rewards: ", rewards)

        with open(weight_out, 'w') as f_weight:
            f_weight.write(str(self.b) + "\n")
            # Write the values of the weights in row-major order
            for i in range(self.W.shape[0]):
                for j in range(self.W.shape[1]):
                    f_weight.write(str(self.W[i][j]) + "\n")

        # Visualization
        # self.env.render()

    def close(self):
        self.env.close()
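# A short usage sketch for the class above, using only the methods it defines;
# the mode, hyperparameter values, and output file names are placeholders rather
# than values taken from the original.
if __name__ == "__main__":
    agent = qlearning(mode="tile", epsilon=0.05, gamma=0.99, learning_rate=0.01)
    agent.run(weight_out="weight.out", returns_out="returns.out",
              episodes=400, max_iterations=200)
    agent.close()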