Esempio n. 1
def main():
    """Train a linear action-value function on MountainCar and save step counts.

    Each episode selects actions with ``epsilon_greedy``, maintains an
    accumulating eligibility trace over ``feature(state, action)`` and moves
    ``weight_vector`` along that trace scaled by ``alpha`` and the TD error.
    The number of steps used by every episode is collected in ``steps`` and
    written to ``args.filename`` with ``np.savetxt`` once training stops.

    Training stops early when ``is_weight_valid`` rejects the weight vector;
    in that case the last recorded step count is overwritten with
    ``max_steps_per_episode + 1`` as an error marker.
    """
    dimension = num_tiles_per_tiling * numTilings * num_actions
    weight_vector = np.zeros(dimension)
    env = MountainCar()
    steps = []  # A list of steps in each episode

    for idx in range(max_episodes):
        # Check for overflow in weight vector (happens when step size is too large)
        if not is_weight_valid(weight_vector):
            # Flag the episode in which the overflow occurred with an
            # out-of-range step count so it is recognisable in the output.
            print('Invalid value encountered. Break out!')
            if steps:  # nothing to overwrite if no episode has finished yet
                steps[-1] = max_steps_per_episode + 1
            break

        print(str(idx) + 'th episode')
        steps_per_episode = 0  # steps required in this episode

        state = env.reset()  # initialize S
        action = epsilon_greedy(state, weight_vector, epsilon)
        eligibility_trace = np.zeros(dimension)
        done = False

        while not done:
            next_state, reward, done = env.step(action)

            steps_per_episode += 1

            # Update eligibility trace
            eligibility_trace = gamma * lam * eligibility_trace + feature(
                state, action)

            next_action = epsilon_greedy(next_state, weight_vector, epsilon)

            # Compute TD error; a terminal transition has no bootstrap term.
            if done:
                TD_error = reward - np.dot(weight_vector,
                                           feature(state, action))
            else:
                TD_error = reward + gamma * np.dot(
                    weight_vector, feature(next_state, next_action)) - np.dot(
                        weight_vector, feature(state, action))

            # Update weight vector
            weight_vector += alpha * TD_error * eligibility_trace

            state = next_state
            action = next_action

            # Terminate current episode if exceeds max_steps_per_episode
            if steps_per_episode > max_steps_per_episode:
                break

        # Record steps used for this episode
        steps.append(steps_per_episode)
        print('steps used in this episode: ' + str(steps_per_episode) + '\n')

    np.savetxt(args.filename, steps)
Esempio n. 2
def main():
    """Train a linear action-value function on PuddleWorld and save results.

    Each episode selects actions with ``epsilon_greedy``, maintains an
    accumulating eligibility trace over ``feature(state, action)`` and moves
    ``weight_vector`` along that trace scaled by ``alpha`` and the TD error.
    For every episode the pair ``[steps, total reward]`` is appended to
    ``performances``, which is written to ``args.filename`` with
    ``np.savetxt`` once training stops.

    Training stops early when ``is_weight_valid`` rejects the weight vector;
    in that case the last recorded pair is replaced by ``[inf, inf]`` as an
    error marker.
    """
    dimension = num_tiles_per_tiling * numTilings * num_actions
    weight_vector = np.zeros(dimension)
    env = PuddleWorld()
    # Record the steps used and reward collected within each episode
    performances = []

    for idx in range(max_episodes):
        # Check for overflow in weight vector (happens when step size is too large)
        if not is_weight_valid(weight_vector):
            # Replace the performance measure of the episode in which the
            # overflow occurred with infinity as an error signal.
            # The builtin float is used because the np.float alias no longer
            # exists in current NumPy releases.
            if performances:  # nothing to overwrite before the first episode
                performances[-1] = [float('inf'), float('inf')]
            # NOTE(review): continuing to train on invalid weights cannot
            # recover, so the loop is left here — confirm against the
            # upstream script.
            break

        state = env.reset()
        action = epsilon_greedy(state, weight_vector, epsilon)
        eligibility_trace = np.zeros(dimension)
        done = False

        print(str(idx) + 'th episode')
        steps_per_episode = 0  # steps required in this episode
        reward_per_episode = 0.0

        while not done:
            next_state, reward, done = env.step(action)

            # Update eligibility trace
            eligibility_trace = gamma * lam * eligibility_trace + feature(
                state, action)

            next_action = epsilon_greedy(next_state, weight_vector, epsilon)

            # Compute TD error; a terminal transition has no bootstrap term.
            if done:
                TD_error = reward - np.dot(weight_vector,
                                           feature(state, action))
            else:
                TD_error = reward + gamma * np.dot(
                    weight_vector, feature(next_state, next_action)) - np.dot(
                        weight_vector, feature(state, action))

            # Update weight vector
            weight_vector += alpha * TD_error * eligibility_trace

            state = next_state
            action = next_action

            steps_per_episode += 1
            reward_per_episode += reward

        print('steps used in this episode: ' + str(steps_per_episode))
        print('rewards in this episode: ' + str(reward_per_episode) + '\n')
        performances.append([steps_per_episode, reward_per_episode])

    np.savetxt(args.filename, performances)
Esempio n. 3
def main():
    """Evaluate ``simple_policy`` on MountainCar with emphasis-weighted TD updates.

    Loads sample states from ``sampleOnPolicy.npy``, precomputes their
    features, and at the start of every episode records the error returned by
    ``MSE(samples, feature_samples, weight_vector)``. Within an episode the
    weight vector is updated from the TD error and an eligibility trace that
    is scaled by the emphasis ``M``, which in turn is derived from the
    follow-on trace ``F`` and ``interest``. The recorded errors are written
    out with ``writeF(args.filename, MSEs)``.
    """
    # Load the 500 sample states and their true state value
    samples = np.load('sampleOnPolicy.npy')
    feature_samples = create_features(samples)

    dimension = num_tiles_per_tiling * numTilings

    weight_vector = np.zeros(dimension)
    MSEs = np.zeros(max_episodes)
    env = MountainCar()

    for idx in range(max_episodes):
        # Compute MSVE at the beginning of each episode
        MSEs[idx] = MSE(samples, feature_samples, weight_vector)
        print('MSE of ' + str(idx) + 'th episodes: ' + str(MSEs[idx]))

        state = env.reset()

        F = interest(state)  # Initialize follow-on trace F_0
        M = lam * interest(state) + (1 - lam) * F  # Initialize Emphasis M_0
        eligibility_trace = rho(state) * M * feature(
            state)  # Initialize eligibility trace e_0
        done = False
        while not done:
            # The policy only looks at the second component of the state.
            action = simple_policy(state[1])
            next_state, reward, done = env.step(action)

            # Update weight vector
            TD_error = reward + gamma * np.dot(
                weight_vector, feature(next_state)) - np.dot(
                    weight_vector, feature(state))
            weight_vector += alpha * TD_error * eligibility_trace

            # Update Follow-on trace
            F = rho(state) * gamma * F + interest(next_state)

            # Update Emphasis
            M = lam * interest(next_state) + (1 - lam) * F

            # Update eligibility trace
            eligibility_trace = rho(next_state) * (
                gamma * lam * eligibility_trace + M * feature(next_state))

            state = next_state

    # Write out result (MSEs) to a file
    writeF(args.filename, MSEs)
Esempio n. 4
def create_features(samples):
    """Return a matrix holding ``feature(row)`` for every row of ``samples``.

    ``samples`` must be a 2-D array with exactly three columns (asserted).
    The result has one row per sample and
    ``num_tiles_per_tiling * numTilings`` columns.
    """
    assert samples.shape[1] == 3

    num_samples = samples.shape[0]
    feature_width = num_tiles_per_tiling * numTilings
    result = np.zeros(shape=(num_samples, feature_width))

    # Fill the preallocated matrix one sample at a time.
    for row in range(num_samples):
        result[row] = feature(samples[row])
    return result
Esempio n. 5
def main():
    """Evaluate ``simple_policy`` on MountainCar with eligibility-trace TD updates.

    Loads sample states from ``sampleOnPolicy.npy``, precomputes their
    features, and at the start of every episode records the error returned by
    ``MSE(samples, feature_samples, weight_vector)``. Within an episode an
    accumulating eligibility trace over ``feature(state)`` is maintained and
    the weight vector is moved along it, scaled by ``alpha`` and the TD
    error. The recorded errors are written out with
    ``writeF(args.filename, MSEs)``.
    """
    # Load the 500 sample states and their true state value
    samples = np.load('sampleOnPolicy.npy')
    feature_samples = create_features(samples)

    dimension = num_tiles_per_tiling * numTilings

    weight_vector = np.zeros(dimension)
    MSEs = np.zeros(max_episodes)
    env = MountainCar()

    for idx in range(max_episodes):
        # Compute MSVE at the beginning of each episode
        MSEs[idx] = MSE(samples, feature_samples, weight_vector)
        print('MSE of ' + str(idx) + 'th episodes: ' + str(MSEs[idx]))

        state = env.reset()
        eligibility_trace = np.zeros(dimension)
        done = False
        while not done:
            # The policy only looks at the second component of the state.
            action = simple_policy(state[1])
            next_state, reward, done = env.step(action)

            # Update eligibility trace
            eligibility_trace = gamma * lam * eligibility_trace + feature(
                state)
            # Compute TD error
            TD_error = reward + gamma * np.dot(
                weight_vector, feature(next_state)) - np.dot(
                    weight_vector, feature(state))
            # Update weight vector
            weight_vector += alpha * TD_error * eligibility_trace

            state = next_state

    # Write out result (MSEs) to a file in target directory
    writeF(args.filename, MSEs)
Esempio n. 6
def epsilon_greedy(state, weight_vector, epsilon):
    """Pick an action from {-1, 0, 1} epsilon-greedily.

    The value of each action is estimated as
    ``np.dot(weight_vector, feature(state, action))``. With probability
    ``1 - epsilon`` one of the highest-valued actions is returned (ties are
    broken uniformly at random); otherwise an action is drawn uniformly from
    ``[-1, 0, 1]``.

    If no estimate compares equal to the maximum (for example when the
    estimates are NaN after a weight overflow), action ``0`` is returned.
    """
    # Approximate action-values for all legal actions in this state.
    # Index i in the list corresponds to action i - 1, since the legal
    # actions are -1, 0, 1.
    estimated_action_values = [
        np.dot(weight_vector, feature(state, index - 1))
        for index in range(num_actions)
    ]

    if np.random.random() > epsilon:
        max_q = max(estimated_action_values)
        greedy_indices = [
            index for index, q in enumerate(estimated_action_values)
            if q == max_q
        ]
        if not greedy_indices:
            # NaN never compares equal to itself, so an overflowed weight
            # vector can leave this list empty; fall back to action 0.
            return 0
        # Shift the chosen index back to the -1, 0, 1 action representation.
        return np.random.choice(greedy_indices) - 1
    else:
        return np.random.choice([-1, 0, 1])
Esempio n. 7
def epsilon_greedy(state, weight_vector, epsilon):
    """Pick an action index in ``range(num_actions)`` epsilon-greedily.

    The value of each action is estimated as
    ``np.dot(weight_vector, feature(state, action))``. With probability
    ``1 - epsilon`` one of the highest-valued actions is returned (ties are
    broken uniformly at random); otherwise an action is drawn uniformly from
    ``range(num_actions)``.

    If no estimate compares equal to the maximum (for example when the
    estimates are NaN after a weight overflow), action ``0`` is returned.
    """
    # Approximate action-values for all legal actions in this state.
    estimated_action_values = [
        np.dot(weight_vector, feature(state, action))
        for action in range(num_actions)
    ]

    if np.random.random() > epsilon:
        max_q = max(estimated_action_values)
        greedy_actions = [
            action for action, q in enumerate(estimated_action_values)
            if q == max_q
        ]
        if not greedy_actions:
            # NaN never compares equal to itself, so an overflowed weight
            # vector can leave this list empty; fall back to action 0.
            return 0
        return np.random.choice(greedy_actions)
    else:
        # Randomly select one of the legal action indices.
        return np.random.choice(list(range(num_actions)))