def main():
    dimension = num_tiles_per_tiling * numTilings * num_actions
    weight_vector = np.zeros(dimension)
    env = MountainCar()
    steps = []  # A list of the steps taken in each episode
    for idx in range(max_episodes):
        # Check for overflow in the weight vector (happens when the step size is too large)
        if not is_weight_valid(weight_vector):
            # Replace the performance measure of the last episode, where the overflow occurred,
            # with max_steps_per_episode + 1 as an error signal
            print('Invalid value encountered. Break out!')
            steps[-1] = max_steps_per_episode + 1
            break
        print(str(idx) + 'th episode')
        steps_per_episode = 0  # Steps required in this episode
        state = env.reset()  # Initialize S
        action = epsilon_greedy(state, weight_vector, epsilon)
        eligibility_trace = np.zeros(dimension)
        done = False
        while not done:
            next_state, reward, done = env.step(action)
            steps_per_episode += 1
            # Update eligibility trace (accumulating traces)
            eligibility_trace = gamma * lam * eligibility_trace + feature(state, action)
            next_action = epsilon_greedy(next_state, weight_vector, epsilon)
            # Compute TD error
            if done:
                TD_error = reward - np.dot(weight_vector, feature(state, action))
            else:
                TD_error = (reward
                            + gamma * np.dot(weight_vector, feature(next_state, next_action))
                            - np.dot(weight_vector, feature(state, action)))
            # Update weight vector
            weight_vector += alpha * TD_error * eligibility_trace
            state = next_state
            action = next_action
            # Terminate the current episode if it exceeds max_steps_per_episode
            if steps_per_episode > max_steps_per_episode:
                break
        # Record the steps used in this episode
        print('steps used in this episode: ' + str(steps_per_episode) + '\n')
        steps.append(steps_per_episode)
    np.savetxt(args.filename, steps)
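
# The overflow check above calls an is_weight_valid helper that is not shown here.
# A minimal sketch, assuming it only needs to confirm that every weight is still finite:
def is_weight_valid(weight_vector):
    # False as soon as any component has become nan or +/-inf
    return np.all(np.isfinite(weight_vector))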
def main():
    dimension = num_tiles_per_tiling * numTilings * num_actions
    weight_vector = np.zeros(dimension)
    env = PuddleWorld()
    performances = []  # Record the steps used and the reward collected within each episode
    for idx in range(max_episodes):
        # Check for overflow in the weight vector (happens when the step size is too large)
        if not is_weight_valid(weight_vector):
            # Replace the performance measure of the last episode, where the overflow occurred,
            # with infinity as an error signal
            performances[-1] = [float('inf'), float('inf')]
            break
        state = env.reset()
        action = epsilon_greedy(state, weight_vector, epsilon)
        eligibility_trace = np.zeros(dimension)
        done = False
        print(str(idx) + 'th episode')
        steps_per_episode = 0  # Steps required in this episode
        reward_per_episode = 0.0
        while not done:
            next_state, reward, done = env.step(action)
            # Update eligibility trace (accumulating traces)
            eligibility_trace = gamma * lam * eligibility_trace + feature(state, action)
            next_action = epsilon_greedy(next_state, weight_vector, epsilon)
            # Compute TD error
            if done:
                TD_error = reward - np.dot(weight_vector, feature(state, action))
            else:
                TD_error = (reward
                            + gamma * np.dot(weight_vector, feature(next_state, next_action))
                            - np.dot(weight_vector, feature(state, action)))
            # Update weight vector
            weight_vector += alpha * TD_error * eligibility_trace
            state = next_state
            action = next_action
            steps_per_episode += 1
            reward_per_episode += reward
        print('steps used in this episode: ' + str(steps_per_episode))
        print('rewards in this episode: ' + str(reward_per_episode) + '\n')
        performances.append([steps_per_episode, reward_per_episode])
    np.savetxt(args.filename, performances)
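
# Both control loops above assume a tile-coding feature(state, action) helper that returns a
# binary vector of length num_tiles_per_tiling * numTilings * num_actions, with the active
# tiles placed in the block corresponding to the chosen action. The actual tile coder is not
# shown; the sketch below is one hypothetical implementation over a uniform grid, assuming the
# two state components are already scaled to [0, 1] and that
# num_tiles_per_tiling == tiles_per_dim ** 2 for some integer tiles_per_dim.
def feature(state, action):
    tiles_per_dim = int(round(num_tiles_per_tiling ** 0.5))
    vec = np.zeros(num_tiles_per_tiling * numTilings * num_actions)
    action_index = action % num_actions  # maps -1, 0, 1 onto 2, 0, 1 for MountainCar
    for tiling in range(numTilings):
        # Each tiling is shifted by a fraction of a tile width
        offset = tiling / float(numTilings * tiles_per_dim)
        x = min(int((state[0] + offset) * tiles_per_dim), tiles_per_dim - 1)
        y = min(int((state[1] + offset) * tiles_per_dim), tiles_per_dim - 1)
        tile_index = tiling * num_tiles_per_tiling + x * tiles_per_dim + y
        vec[action_index * num_tiles_per_tiling * numTilings + tile_index] = 1.0
    return vec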
def main():
    # Load the 500 sample states and their true state values
    samples = np.load('sampleOnPolicy.npy')
    feature_samples = create_features(samples)
    dimension = num_tiles_per_tiling * numTilings
    weight_vector = np.zeros(dimension)
    MSEs = np.zeros(max_episodes)
    env = MountainCar()
    for idx in range(max_episodes):
        # Compute MSVE at the beginning of each episode
        MSEs[idx] = MSE(samples, feature_samples, weight_vector)
        print('MSE of ' + str(idx) + 'th episode: ' + str(MSEs[idx]))
        state = env.reset()
        F = interest(state)  # Initialize follow-on trace F_0
        M = lam * interest(state) + (1 - lam) * F  # Initialize emphasis M_0
        eligibility_trace = rho(state) * M * feature(state)  # Initialize eligibility trace e_0
        done = False
        while not done:
            action = simple_policy(state[1])
            next_state, reward, done = env.step(action)
            # Update weight vector
            TD_error = (reward
                        + gamma * np.dot(weight_vector, feature(next_state))
                        - np.dot(weight_vector, feature(state)))
            weight_vector += alpha * TD_error * eligibility_trace
            # Update follow-on trace
            F = rho(state) * gamma * F + interest(next_state)
            # Update emphasis
            M = lam * interest(next_state) + (1 - lam) * F
            # Update eligibility trace
            eligibility_trace = rho(next_state) * (gamma * lam * eligibility_trace
                                                   + M * feature(next_state))
            state = next_state
    # Write out the results (MSEs) to a file
    writeF(args.filename, MSEs)
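
# The emphatic TD(lambda) loop above needs interest(state) and rho(state) helpers that are not
# shown. A minimal sketch, assuming uniform interest and on-policy data (so every
# importance-sampling ratio is 1); these are illustrative defaults, not necessarily the values
# used originally.
def interest(state):
    # Interest i(S_t): how much we care about accurately valuing this state
    return 1.0

def rho(state):
    # Importance-sampling ratio pi(A|S) / b(A|S); equal to 1 when the target and
    # behaviour policies coincide (on-policy evaluation)
    return 1.0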
def create_features(samples):
    assert samples.shape[1] == 3
    result = np.zeros(shape=(samples.shape[0], num_tiles_per_tiling * numTilings))
    # For each sampled state, compute its feature vector
    for idx in range(samples.shape[0]):
        sample = samples[idx]
        feature_sample = feature(sample)
        result[idx] = feature_sample
    return result
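
# create_features() pairs with the MSE(samples, feature_samples, weight_vector) helper used in
# main(), which is not shown here. A minimal sketch, assuming the third column of `samples`
# holds the true state values:
def MSE(samples, feature_samples, weight_vector):
    predictions = feature_samples.dot(weight_vector)  # v_hat(s, w) for each sampled state
    true_values = samples[:, 2]
    return np.mean((predictions - true_values) ** 2)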
def main():
    # Load the 500 sample states and their true state values
    samples = np.load('sampleOnPolicy.npy')
    feature_samples = create_features(samples)
    dimension = num_tiles_per_tiling * numTilings
    weight_vector = np.zeros(dimension)
    MSEs = np.zeros(max_episodes)
    env = MountainCar()
    for idx in range(max_episodes):
        # Compute MSVE at the beginning of each episode
        MSEs[idx] = MSE(samples, feature_samples, weight_vector)
        print('MSE of ' + str(idx) + 'th episode: ' + str(MSEs[idx]))
        state = env.reset()
        eligibility_trace = np.zeros(dimension)
        done = False
        while not done:
            action = simple_policy(state[1])
            next_state, reward, done = env.step(action)
            # Update eligibility trace (accumulating traces)
            eligibility_trace = gamma * lam * eligibility_trace + feature(state)
            # Compute TD error
            TD_error = (reward
                        + gamma * np.dot(weight_vector, feature(next_state))
                        - np.dot(weight_vector, feature(state)))
            # Update weight vector
            weight_vector += alpha * TD_error * eligibility_trace
            state = next_state
    # Write out the results (MSEs) to a file in the target directory
    writeF(args.filename, MSEs)
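
# The two prediction runs above also rely on simple_policy(velocity) and writeF(filename, MSEs),
# neither of which is shown. A minimal sketch, assuming the MountainCar actions are encoded as
# -1 (reverse), 0 (coast), 1 (forward) and that a plain-text dump of the MSVE curve suffices:
def simple_policy(velocity):
    # Always accelerate in the direction of the current velocity
    return 1 if velocity >= 0 else -1

def writeF(filename, MSEs):
    # Persist the per-episode MSVE curve for later plotting
    np.savetxt(filename, MSEs)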
def epsilon_greedy(state, weight_vector, epsilon):
    estimated_action_values = []
    # Get approximate action-values for all legal actions in this state
    for action in range(num_actions):
        action = action - 1  # Remember our legal actions are -1, 0, 1
        ft = feature(state, action)
        estimated_action_value = np.dot(weight_vector, ft)
        estimated_action_values.append(estimated_action_value)
    if np.random.random() > epsilon:
        max_q = max(estimated_action_values)
        greedy_actions = [
            action for action, q in enumerate(estimated_action_values)
            if q == max_q
        ]
        # If the weight vector overflows, max_q is nan and greedy_actions is empty,
        # so we fall back to index 1 (i.e. action 0)
        action = np.random.choice(greedy_actions) if len(greedy_actions) != 0 else 1
        return action - 1  # Note we use -1, 0, 1 as action representations in our main code
    else:
        return np.random.choice([-1, 0, 1])
def epsilon_greedy(state, weight_vector, epsilon):
    estimated_action_values = []
    # Get approximate action-values for all legal actions in this state
    for action in range(num_actions):
        ft = feature(state, action)
        estimated_action_value = np.dot(weight_vector, ft)
        estimated_action_values.append(estimated_action_value)
    if np.random.random() > epsilon:
        max_q = max(estimated_action_values)
        greedy_actions = [
            action for action, q in enumerate(estimated_action_values)
            if q == max_q
        ]
        # If the weight vector overflows, max_q is nan and greedy_actions is empty,
        # so we always return action 0
        action = np.random.choice(greedy_actions) if len(greedy_actions) != 0 else 0
        return action
    else:
        # Randomly select an action from [0, 1, 2, 3]
        return np.random.choice(list(range(num_actions)))
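
# All of the functions above reference module-level globals (np, args, and the hyper-parameters)
# that are defined elsewhere in the repository. A hypothetical header for these scripts, with
# placeholder values chosen only for illustration, not the settings used in the experiments:
import argparse
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument('filename', help='Output file for the recorded performance measures')
args = parser.parse_args()

numTilings = 8                # Number of overlapping tilings in the tile coder
num_tiles_per_tiling = 64     # Tiles per tiling (e.g. an 8 x 8 grid)
num_actions = 3               # 3 for MountainCar, 4 for PuddleWorld
gamma = 1.0                   # Discount factor
lam = 0.9                     # Trace-decay parameter lambda
alpha = 0.1 / numTilings      # Step size
epsilon = 0.0                 # Exploration rate for epsilon-greedy
max_episodes = 200
max_steps_per_episode = 5000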