Code Example #1
def evaluate_policies(policies_memmap, performance_memmap, evaluation_run_num,
                      ace_run_num, config_num, policy_num, random_seed):

    if evaluation_run_num == 0:
        performance_memmap[config_num]['parameters'] = policies_memmap[
            config_num]['parameters']

    # Load the policy to evaluate:
    configuration = policies_memmap[config_num]['parameters']
    weights = policies_memmap[config_num]['policies'][ace_run_num, policy_num]['weights']
    actor = BinaryACE(weights.shape[0], weights.shape[1], 0.)
    actor.theta = weights

    # Handle situations where the learning process diverged:
    if np.any(np.isnan(weights)):
        # If the weights overflowed, assign NaN as return:
        performance_memmap[config_num]['results'][ace_run_num, policy_num,
                                                  evaluation_run_num] = np.nan
    else:
        # Set up the environment:
        import gym_puddle  # Re-import the puddleworld env in each subprocess or it sometimes isn't found during creation.
        env = gym.make(args.environment).unwrapped
        env.seed(random_seed)
        rng = env.np_random
        if args.objective == 'episodic':
            # Use the environment's start state:
            s_t = env.reset()
        else:
            raise NotImplementedError

        # Configure the tile coder:
        num_tiles_per_dim = configuration['num_tiles_per_dim']
        num_tilings = configuration['num_tilings']
        bias_unit = configuration['bias_unit']
        tc = TileCoder(
            np.array([env.observation_space.low,
                      env.observation_space.high]).T, num_tiles_per_dim,
            num_tilings, bias_unit)

        # Write the total rewards received to file:
        performance_memmap[config_num]['results'][
            ace_run_num, policy_num,
            evaluation_run_num] = evaluate_policy(actor, tc, env, rng)
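
The function above is designed to be called once per (configuration, ACE run, policy, evaluation run) combination, reading from and writing to shared memmapped arrays. Below is a minimal sketch, not taken from the original source, of how it might be dispatched in parallel with joblib's `Parallel`/`delayed` (the same pattern used in Code Example #5); the file paths, the number of evaluation runs, the loop bounds, and the seeding scheme are assumptions.

import random

import numpy as np
from joblib import Parallel, delayed

# Assumed paths; the real script presumably takes these from command-line arguments.
policies_memmap = np.lib.format.open_memmap('policies.npy', mode='r')
performance_memmap = np.lib.format.open_memmap('performance.npy', mode='r+')

num_configs = len(policies_memmap)
num_ace_runs, num_policies = policies_memmap[0]['policies'].shape  # Assumed field layout.
num_evaluation_runs = 5  # Assumed.

# Sample seeds without replacement so no two evaluation runs share a seed.
random.seed(0)  # Assumed seed.
random_seeds = random.sample(range(2**32), num_evaluation_runs)

Parallel(n_jobs=-1)(
    delayed(evaluate_policies)(policies_memmap, performance_memmap, evaluation_run_num,
                               ace_run_num, config_num, policy_num,
                               random_seeds[evaluation_run_num])
    for config_num in range(num_configs)
    for ace_run_num in range(num_ace_runs)
    for policy_num in range(num_policies)
    for evaluation_run_num in range(num_evaluation_runs))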
Code Example #2
    experience_memmap_test = np.lib.format.open_memmap(
        args.experience_file_test, mode='r')
    num_runs, num_test_eval = experience_memmap_test.shape

    # Sample a random seed for each run without replacement, so that duplicate seeds (the birthday paradox) are avoided:
    random.seed(args.random_seed)
    random_seeds = random.sample(range(2**32), num_runs)

    # Create the tile coder to be used for all parameter settings:
    if args.environment == 'pw':
        dummy_env = puddleworld()
    else:
        dummy_env = gym.make(args.environment).unwrapped  # Make a dummy env to get shape info.
    tc = TileCoder(
        np.array([
            dummy_env.observation_space.low, dummy_env.observation_space.high
        ]).T, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

    # Create the memmapped array of learned policies that will be populated in parallel:
    parameters_dtype = np.dtype([('alpha_a', float), ('alpha_w', float),
                                 ('alpha_v', float), ('lambda', float),
                                 ('eta', float), ('gamma', float),
                                 ('num_tiles_per_dim', int,
                                  (len(args.num_tiles_per_dim), )),
                                 ('num_tilings', int), ('bias_unit', bool)])
    policy_dtype = np.dtype([('timesteps', int),
                             ('weights', float, (dummy_env.action_space.n,
                                                 tc.total_num_tiles))])
    num_policies = num_timesteps // args.checkpoint_interval + 1
    configuration_dtype = np.dtype([('parameters', parameters_dtype),
                                    ('policies', policy_dtype,
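
The definition of configuration_dtype above is cut off in this snippet. For orientation only, here is a hedged sketch of how the memmapped array of policies might be allocated from the dtypes defined above, consistent with the indexing used in Code Example #1; the output path, num_ace_runs, num_configurations, and the shape of the 'policies' field are assumptions, not the original values.

    # Sketch only; the sizes and field shape below are assumptions.
    num_ace_runs = 5
    num_configurations = 10
    configuration_dtype_sketch = np.dtype([
        ('parameters', parameters_dtype),
        ('policies', policy_dtype, (num_ace_runs, num_policies)),
    ])
    policies_memmap = np.lib.format.open_memmap(
        'policies.npy',  # Assumed output path.
        mode='w+',
        dtype=configuration_dtype_sketch,
        shape=(num_configurations,))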
Code Example #3
    def test_aa_binary_ace_on_policy(self):
        # Get the underlying environment object to bypass the built-in timestep limit:
        env = gym.make('MountainCar-v0').unwrapped
        env.seed(317850564)  # Seed generated by: np.random.randint(2**31 - 1)
        rng = env.np_random

        alpha_a = .1
        alpha_c = .2
        alpha_c2 = .001
        lambda_c = .9

        gamma = 1.
        num_runs = 10
        num_episodes = 10
        rewards = np.full((num_runs, num_episodes), np.nan)
        for run_num in tqdm(range(num_runs)):
            tc = TileCoder(
                np.array(
                    [env.observation_space.low, env.observation_space.high]).T,
                [5, 5], 8, True)
            actor = BinaryACE(env.action_space.n, tc.total_num_tiles,
                              alpha_a / tc.num_active_features)
            critic = BinaryGQ(env.action_space.n, tc.total_num_tiles,
                              alpha_c / tc.num_active_features,
                              alpha_c2 / tc.num_active_features, lambda_c)
            for episode_num in range(num_episodes):
                g = 0.
                f_t = 1.
                critic.z *= 0.  # reset traces
                indices_t = tc.encode(env.reset())
                for t in range(1000):
                    a_t = rng.choice(env.action_space.n, p=actor.pi(indices_t))  # Select action.
                    s_tp1, r_tp1, terminal, _ = env.step(a_t)  # Interact with environment.
                    indices_tp1 = tc.encode(s_tp1)
                    critic.learn(indices_t, a_t, 1, gamma, r_tp1, indices_tp1,
                                 actor.pi(indices_tp1),
                                 0 if terminal else gamma)
                    q_t = critic.estimate(indices_t)
                    actor.all_actions_learn(indices_t, q_t, f_t)
                    indices_t = indices_tp1
                    f_t *= gamma
                    g += r_tp1
                    if terminal:
                        break
                rewards[run_num, episode_num] = g
        mean_rewards = np.mean(rewards, axis=0)
        sem_rewards = st.sem(rewards, axis=0)
        ci = sem_rewards * st.t.ppf((1 + 0.95) / 2, num_runs - 1)  # Half-width of the 95% confidence interval.

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.errorbar(np.arange(num_episodes),
                    mean_rewards,
                    yerr=[ci, ci],
                    label='All Actions AC')
        plt.legend(loc='lower right')
        plt.title('Mountain Car (on-policy)')
        plt.xlabel('Episode Number')
        plt.ylabel('Total Reward')
        plt.ylim(-1000, 0)
        plt.savefig('all_actions_ac_on_policy.png')

        self.assertGreater(mean_rewards[-1], -250)
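
The error bars in the plot above are 95% Student-t confidence intervals: the standard error of the mean across runs times the 0.975 quantile of the t distribution with num_runs - 1 degrees of freedom. A standalone illustration of the same computation on hypothetical per-run returns:

import numpy as np
import scipy.stats as st

returns = np.array([-180., -210., -150., -240., -200.])  # Hypothetical per-run returns.
half_width = st.sem(returns) * st.t.ppf((1 + 0.95) / 2, len(returns) - 1)
interval = (returns.mean() - half_width, returns.mean() + half_width)  # 95% CI for the mean.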
Code Example #4
    def test_tile_coder(self):
        np.random.seed(2734287609)

        # Define a function to approximate:
        def target_function(x1, x2):
            return np.sin(np.sqrt(x1**2 + x2**2)) / np.sqrt(x1**2 + x2**2)

        # A function to add noise:
        def noise(y):
            return y + .1 * np.random.randn()

        # Create a tile coder and weight vector:
        tc = TileCoder(space=[[-10, 10], [-10, 10]],
                       num_tiles_per_dim=[11, 11],
                       num_tilings=8,
                       bias_unit=True)
        weights = np.zeros(tc.total_num_tiles)
        step_size = .2 / tc.num_active_features

        # Use the tile coder to train the weight vector from noisy samples of the target function:
        num_examples = 1000
        for i in range(num_examples):
            x1, x2 = np.random.random_sample(2) * 20 - 10  # Sample the input space uniformly.
            indices = tc.encode((x1, x2))  # Compute indices for the input point.
            y_hat = weights[indices].sum()  # Generate an estimate from the weights.
            y = noise(target_function(x1, x2))  # Get a noisy sample output from the function.
            weights[indices] += step_size * (y - y_hat)  # Update the weight vector.

        # Check the function and the learned approximation:
        resolution = 100
        x1 = np.arange(-10, 10, 20 / resolution)
        x2 = np.arange(-10, 10, 20 / resolution)
        y = np.zeros((resolution, resolution))
        y_hat = np.zeros((resolution, resolution))
        for j in range(len(x1)):
            for k in range(len(x2)):
                y[j, k] = target_function(x1[j], x2[k])  # True value of the function.
                y_hat[j, k] = weights[tc.encode((x1[j], x2[k]))].sum()  # Learned estimate.

        # Visualize the function and the learned approximation:
        x1, x2 = np.meshgrid(x1, x2)
        fig = plt.figure()

        ax = fig.add_subplot(1, 2, 1, projection='3d')
        ax.plot_surface(x1, x2, y_hat, cmap='hot')
        ax.set_zlim(-.25, 1.)
        ax.set_title('Learned estimate after {} examples'.format(num_examples))

        ax = fig.add_subplot(1, 2, 2, projection='3d')
        ax.plot_surface(x1, x2, y, cmap='hot')
        ax.set_zlim(-.25, 1.)
        ax.set_title('True function')
        plt.savefig('tile_coder_test.png')

        tolerance = .01
        self.assertLess(np.mean(y - y_hat), tolerance)
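
A small follow-up check, offered as a suggestion rather than part of the original test: the signed mean error asserted above lets over- and under-estimates cancel, so a mean-absolute-error or root-mean-square-error measure on the same grids gives a stricter view of how closely the tile-coded approximation tracks the target function.

        # Suggested addition (not in the original test): stricter error measures on the same grids.
        mae = np.mean(np.abs(y - y_hat))
        rmse = np.sqrt(np.mean((y - y_hat)**2))
        print(f'MAE: {mae:.4f}, RMSE: {rmse:.4f}')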
Code Example #5
    def test_low_variance_binary_ace_off_policy(self):
        env = gym.make('MountainCar-v0').unwrapped
        env.seed(278422227)
        rng = env.np_random

        # ACE parameters:
        alpha_a = .0005
        alpha_c = .1
        alpha_c2 = .0005
        lambda_c = 0.
        alpha_f = .1
        eta = 1.
        i = lambda g=1: 1.  # Uniform interest.

        # OffPAC parameters:
        # alpha_a = .01
        # alpha_c = .01
        # alpha_c2 = .00005
        # lambda_c = .4
        # alpha_f = .01
        # eta = 0.
        # i = lambda g=1: 1.  # Uniform interest.

        gamma = 1.
        num_runs = 1
        num_timesteps = 20000
        evaluation_interval = 1000
        num_evaluation_runs = 10
        rewards = np.zeros((num_runs, num_timesteps // evaluation_interval + 1,
                            num_evaluation_runs))
        for run_num in range(num_runs):
            tc = TileCoder(
                np.array(
                    [env.observation_space.low, env.observation_space.high]).T,
                [5, 5], 8, True)
            actor = BinaryACE(env.action_space.n, tc.total_num_tiles,
                              alpha_a / tc.num_active_features)
            critic = BinaryTDC(tc.total_num_tiles,
                               alpha_c / tc.num_active_features,
                               alpha_c2 / tc.num_active_features, lambda_c)
            fhat = BinaryFHat(tc.total_num_tiles, alpha_f)

            mu = np.ones(env.action_space.n) / env.action_space.n  # Uniform random policy.
            gamma_t = 0.
            indices_t = tc.encode(env.reset())
            for t in tqdm(range(num_timesteps)):
                if t % evaluation_interval == 0:
                    rewards[run_num, t // evaluation_interval] = Parallel(n_jobs=-1)(
                        delayed(evaluate_policy)(actor, tc, num_timesteps=1000)
                        for _ in range(num_evaluation_runs))
                a_t = rng.choice(env.action_space.n, p=mu)
                s_tp1, r_tp1, terminal, _ = env.step(a_t)
                gamma_tp1 = 0. if terminal else gamma
                indices_tp1 = tc.encode(s_tp1)
                rho_t = actor.pi(indices_t)[a_t] / mu[a_t]
                delta_t = (r_tp1 + gamma_tp1 * critic.estimate(indices_tp1)
                           - critic.estimate(indices_t))  # TD error.
                critic.learn(delta_t, indices_t, gamma_t, indices_tp1,
                             gamma_tp1, rho_t)
                i_t = i(gamma_t)
                f_t = fhat.estimate(indices_t)
                m_t = (1 - eta) * i_t + eta * f_t
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)
                fhat.learn(indices_tp1, gamma_tp1, indices_t, rho_t, i_t)
                gamma_t = gamma_tp1
                indices_t = indices_tp1

                if terminal:
                    gamma_t = 0.
                    indices_t = tc.encode(env.reset())  # Re-encode from the reset state so the next step starts a new episode.

            rewards[run_num, -1] = Parallel(n_jobs=-1)(
                delayed(evaluate_policy)(actor, tc, num_timesteps=1000)
                for _ in range(num_evaluation_runs))

        # Plot results:
        mean_eval_rewards = np.mean(rewards, axis=2)
        var_eval_rewards = np.var(rewards, axis=2)
        mean_rewards = np.mean(mean_eval_rewards, axis=0)
        sem_rewards = np.sqrt(
            np.sum(var_eval_rewards / num_evaluation_runs, axis=0)) / num_runs
        fig = plt.figure()
        ax = fig.add_subplot(111)
        x = np.array([
            evaluation_interval * i
            for i in range(num_timesteps // evaluation_interval + 1)
        ])
        confs = sem_rewards * st.t.ppf(
            (1.0 + 0.95) / 2, num_evaluation_runs - 1)
        label = f'$\\alpha_a$:{alpha_a}, $\\alpha_c$:{alpha_c}, $\\alpha_{{c2}}$:{alpha_c2}, $\\lambda_c$:{lambda_c}, $\\eta$:{eta}{" (OffPAC)" if eta == 0. else ""}'
        ax.errorbar(x, mean_rewards, yerr=[confs, confs], label=label)
        plt.legend(loc='lower right')
        plt.title('Mountain Car (off-policy)')
        plt.xlabel('Timesteps')
        plt.ylabel('Total Reward')
        plt.ylim(-1000, 0)
        plt.savefig('low_variance_binary_ace_off_policy.png')
        self.assertGreater(mean_rewards[-1], -200)
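
For reference, a hypothetical sketch of the evaluate_policy helper used above. Its real implementation is not shown in these snippets, and Code Example #1 calls it with a different signature (actor, tc, env, rng), so this version only mirrors the call in this example; the environment choice and the action-sampling scheme are assumptions.

import gym
import numpy as np

def evaluate_policy(actor, tc, num_timesteps=1000):
    # Hypothetical rollout: run the learned policy in a fresh environment and return the total reward.
    env = gym.make('MountainCar-v0').unwrapped  # Assumed environment.
    rng = np.random.default_rng()
    s_t = env.reset()
    g = 0.
    for _ in range(num_timesteps):
        a_t = rng.choice(env.action_space.n, p=actor.pi(tc.encode(s_t)))  # Sample from the learned policy.
        s_t, r_tp1, terminal, _ = env.step(a_t)
        g += r_tp1
        if terminal:
            break
    return g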