def test_environment_features():
    # imports follow the convention of the other examples in this listing
    import numpy as np
    from src.env.BoyanChain import BoyanChain
    from src.util import Config

    np.random.seed(0)
    config = Config()
    config.init_noise_var = 0.0
    config.num_obs_features = 4

    env = BoyanChain(config)

    def run_env(e: BoyanChain, num_steps=20):
        for i in range(num_steps):
            print("Step number: {0}".format(i + 1))
            current_state = e.current_state
            next_state, _, observed_features, terminal = e.step()
            print("\tMoved: {0} --> {1}".format(current_state, next_state))
            print("\tObserved Features: {0}".format(observed_features))

            if terminal:
                e.reset()

    run_env(env, 20)

    print("\nAdding 4 features without noise...")
    env.reset()
    env.add_feature(4)
    run_env(env, 20)

    print("\nAdding 4 features with noise...")
    env.reset()
    env.add_feature(4, noise=1)
    run_env(env, 20)
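
These examples all unpack BoyanChain.step() the same way; below is a sketch of the assumed return contract, inferred from the unpacking in this listing rather than taken from the class itself:

next_state, reward, observable_features, terminal = env.step()
# next_state           -- state index after the transition
# reward               -- scalar reward for the transition
# observable_features  -- feature vector of the next state (grows after add_feature)
# terminal             -- True once the chain reaches its absorbing state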

def learning_value_function(sample_size=100000, checkpoint=1000):
    import numpy as np
    from src.env.BoyanChain import BoyanChain
    from src.util import Config

    np.random.seed(0)
    config = Config()
    config.init_noise_var = 0.1
    config.num_obs_features = 4

    env = BoyanChain(config)

    theta = np.zeros(config.num_obs_features, dtype=np.float64)
    theta_star = env.optimal_weights
    alpha = 0.005

    def train(th, th_star, e: BoyanChain, ss, ckpt):
        e.reset()
        current_features = e.get_observable_features()
        mean_square_value_diff = 0.0
        for i in range(ss):
            current_value = np.dot(current_features, th)
            optimal_value = np.dot(e.current_features, th_star)
            _, reward, next_features, terminal = e.step()

            next_value = np.dot(next_features, th)
            # semi-gradient TD(0): target is reward + next value (zero at terminal)
            temporal_diff = (reward + (1 - int(terminal)) * next_value
                             - current_value)
            th += alpha * temporal_diff * current_features

            mean_square_value_diff += np.square(current_value -
                                                optimal_value) / ckpt
            if (i + 1) % ckpt == 0:
                print("Training Step: {0}".format(i + 1))
                print(
                    "\tEstimated MSVE: {0:.4f}".format(mean_square_value_diff))
                print("\tTrue MSVE: {0:.4f}".format(e.compute_msve(th)))
                mean_square_value_diff *= 0

            current_features = next_features

            if terminal:
                e.reset()
                current_features = e.get_observable_features()

    print("First phase of training...")
    train(theta, theta_star, env, sample_size, checkpoint)
    env.add_feature(4, noise=0.0)

    print("\n\nSecond phase of training...")
    # carry over the learned weights; the 4 new features start at zero
    new_theta = np.zeros(8, dtype=np.float64)
    new_theta[:4] = theta
    train(new_theta, theta_star, env, sample_size, checkpoint)
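
The inner loop of train() is semi-gradient TD(0) with linear function approximation. One update in isolation, with made-up numbers (a sketch, not output from the environment):

import numpy as np

phi = np.array([1.0, 0.5, 0.0, 0.25])      # current observable features
phi_next = np.array([0.0, 1.0, 0.5, 0.0])  # next observable features
theta = np.zeros(4)
alpha, reward, terminal = 0.005, -3.0, False

td_error = reward + (1 - int(terminal)) * phi_next.dot(theta) - phi.dot(theta)
theta += alpha * td_error * phi  # move the value estimate toward the TD target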
Example #3
def boyan_chain_test(steps=50000):
    import numpy as np
    import matplotlib.pyplot as plt
    from src.env.BoyanChain import BoyanChain
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.util import Config
    # assumed import path: the original listing uses AutoTIDBD without importing it
    from src.step_size_methods import AutoTIDBD

    config = Config()
    checkpoint = 100
    """ Environment Setup """
    config.init_noise_var = 0.1
    config.num_obs_features = 4
    config.max_num_features = 9
    """ AutoTIDBD Setup """
    config.parameter_size = 4
    config.theta = 0.001
    config.tau = 10000
    config.init_stepsize = 0.001
    # to keep track of learning progress
    run_avg_msve = np.zeros(steps // checkpoint, dtype=np.float64)
    current_checkpoint = 0
    avg_msve = 0

    env = BoyanChain(config)
    approximator = LinearFunctionApproximator(config)
    optimizer = AutoTIDBD(config)
    """ Start of Learning"""
    curr_obs_feats = env.get_observable_features()
    for s in range(steps):
        state_value = approximator.get_prediction(curr_obs_feats)
        optimal_value = env.compute_true_value()
        # step in the environment
        _, r, next_obs_feats, term = env.step()
        next_state_value = approximator.get_prediction(next_obs_feats)
        # compute td error
        td_error = r + (1 - term) * next_state_value - state_value
        # update weights
        _, _, new_weights = optimizer.update_weight_vector(
            td_error,
            features=curr_obs_feats,
            weights=approximator.get_weight_vector(),
            discounted_next_features=next_obs_feats)
        approximator.update_weight_vector(new_weights)
        # update features
        curr_obs_feats = next_obs_feats
        # keep track of progress
        avg_msve += np.square(state_value - optimal_value) / checkpoint
        # check if terminal state
        if term:
            env.reset()
            curr_obs_feats = env.get_observable_features()
        # store learning progress so far
        if (s + 1) % checkpoint == 0:
            run_avg_msve[current_checkpoint] += avg_msve
            avg_msve *= 0
            current_checkpoint += 1

        if (s + 1) == (steps // 2):
            env.add_feature(k=4, noise=0.0, fake_feature=False)
            approximator.increase_num_features(4)
            optimizer.increase_size(4)
            curr_obs_feats = env.get_observable_features()

    print("The average MSVE is: {0:0.4f}".format(np.average(run_avg_msve)))

    xaxis = np.arange(run_avg_msve.size) + 1
    plt.plot(xaxis, run_avg_msve)
    plt.show()
    plt.close()
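
AutoTIDBD keeps one step-size per weight and adapts it online. The IDBD-style core it builds on looks roughly like the sketch below; this shows the general idea only (the function name and the omitted AutoStep-style normalization are assumptions, not the src.step_size_methods implementation):

import numpy as np

def tidbd_sketch(w, beta, h, td_error, phi, meta_stepsize=0.001):
    # adapt per-weight log step-sizes by correlating the current
    # gradient-like term with a decaying trace of past updates
    beta = beta + meta_stepsize * td_error * phi * h
    alphas = np.exp(beta)                  # per-weight step-sizes
    w = w + alphas * td_error * phi        # TD(0) weight update
    # decay the trace where a weight just moved, then add the new update
    h = h * np.maximum(0.0, 1.0 - alphas * phi * phi) + alphas * td_error * phi
    return w, beta, h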
Example #4
def sarsa_zero_test(steps=10000,
                    add_new_centers=False,
                    number_of_irrelevant_features=0):
    import numpy as np
    import matplotlib.pyplot as plt
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.step_size_methods.sgd import SGD
    from src.util import Config
    # assumed import paths: RadialBasisFunction and MountainCar are used below
    # but never imported in the original listing
    from src.env.RandomFeatures_task import RadialBasisFunction
    from src.env.MountainCar import MountainCar

    # epsilon greedy policy
    def choose_action(av_array: np.ndarray, epsilon):
        p = np.random.rand()
        if p > epsilon:
            # greedy action; break ties among maximal values at random
            argmax_av = np.random.choice(
                np.flatnonzero(av_array == av_array.max()))
            return argmax_av
        else:
            # exploratory action, chosen uniformly at random
            return np.random.randint(av_array.size)

    # for computing action values
    def get_action_values(n, features, approximator_list):
        action_values = np.zeros(n, dtype=np.float64)
        for k in range(n):
            action_values[k] += approximator_list[k].get_prediction(features)
        return action_values

    completed_episodes_per_run = []
    for _ in range(1):  # single run; raise the range to average over several runs
        print("==== Results for Sarsa(0) with Epsilon Greedy Policy ====")
        config = Config()

        # setting up feature function
        config.state_dims = 2
        config.state_lims = np.array(((-1, 1), (-1, 1)), dtype=np.float64)
        # config.initial_centers = np.array(((0.0,0.0), (-1.8,0), (1.8,0), (0.0,-1.8), (0.0,1.8)), dtype=np.float64)
        config.initial_centers = np.array(
            ((0.0, 0.0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25),
             (-0.25, 0.25)),
            dtype=np.float64)
        config.sigma = 0.5
        config.init_noise_mean = 0.0
        config.init_noise_var = 0.01
        feature_function = RadialBasisFunction(config)

        # setting up environment
        config.norm_state = True
        env = MountainCar(config)

        # function approximator and optimizer parameters
        num_actions = 3
        random_action_prob = 0.1
        gamma = 0.99
        config.num_obs_features = feature_function.num_features
        config.max_num_features = 200  # as long as this is more than 12
        config.num_actions = num_actions
        config.alpha = 0.005
        config.rescale = False
        config.parameter_size = feature_function.num_features
        function_approximator = []
        optimizer = []
        # one instance for each action
        for i in range(num_actions):
            function_approximator.append(LinearFunctionApproximator(config))
            optimizer.append(SGD(config))

        # setting up summaries
        all_episodes_return = []
        episode_return = 0

        # setting up initial state, action, features, and action values
        curr_s = env.get_current_state()
        curr_features = feature_function.get_observable_features(curr_s)
        curr_avs = get_action_values(num_actions, curr_features,
                                     function_approximator)
        curr_a = choose_action(curr_avs, random_action_prob)
        midpoint_episode = 0
        for i in range(steps):
            # get current action values
            curr_avs = get_action_values(num_actions, curr_features,
                                         function_approximator)
            # execute current action
            next_s, r, terminal = env.step(curr_a)
            next_features = feature_function.get_observable_features(next_s)
            # get next action values and action
            next_action_values = get_action_values(num_actions, next_features,
                                                   function_approximator)
            next_action = choose_action(next_action_values, random_action_prob)
            # compute TD error for Sarsa(0)
            td_error = (r + gamma * (1 - terminal) * next_action_values[next_action]
                        - curr_avs[curr_a])
            # update weight vector
            _, _, new_weights = optimizer[curr_a].update_weight_vector(
                td_error, curr_features,
                function_approximator[curr_a].get_weight_vector())
            function_approximator[curr_a].update_weight_vector(new_weights)
            # set current features and action
            curr_features = next_features
            curr_a = next_action
            # keep track of sum of rewards
            episode_return += r
            # if terminal state
            if terminal:
                env.reset()
                all_episodes_return.append(episode_return)
                episode_return *= 0
                curr_s = env.get_current_state()
                curr_features = feature_function.get_observable_features(
                    curr_s)
                curr_avs = get_action_values(num_actions, curr_features,
                                             function_approximator)
                curr_a = choose_action(curr_avs, random_action_prob)
            # if midpoint of training
            if (i + 1) == (steps // 2):
                if add_new_centers:
                    new_centers = np.array(
                        ((0, 0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25),
                         (-0.25, 0.25)),
                        dtype=np.float64)
                    feature_function.add_centers(new_centers,
                                                 noise_var=0,
                                                 noise_mean=0)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(
                            new_centers.shape[0])
                        optimizer[k].increase_size(new_centers.shape[0],
                                                   init_stepsize=0.25)
                if number_of_irrelevant_features > 0:
                    new_feature_mean = 0.0
                    new_feature_var = 0.05
                    fake_features = True
                    feature_function.add_feature(number_of_irrelevant_features,
                                                 noise_mean=new_feature_mean,
                                                 noise_var=new_feature_var,
                                                 fake_feature=fake_features)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(
                            number_of_irrelevant_features)
                        optimizer[k].increase_size(
                            number_of_irrelevant_features)
                curr_features = feature_function.get_observable_features(
                    curr_s)
                midpoint_episode = len(all_episodes_return)
        completed_episodes_per_run.append(len(all_episodes_return))
        print("Number of episodes completed: {0}".format(
            len(all_episodes_return)))
    print("Average episodes completed: {0:0.4f}".format(
        np.average(completed_episodes_per_run)))

    print("Return per episode:\n", all_episodes_return)
    plt.plot(np.arange(len(all_episodes_return)) + 1, all_episodes_return)
    plt.vlines(x=midpoint_episode, ymin=-800, ymax=0)
    plt.ylim((-800, 0))
    plt.show()
    plt.close()
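
Example invocations of this test (hypothetical driver code, not part of the original listing):

sarsa_zero_test(steps=10000)                                   # fixed feature set
sarsa_zero_test(steps=10000, add_new_centers=True)             # add RBF centers mid-run
sarsa_zero_test(steps=10000, number_of_irrelevant_features=4)  # add noisy distractors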