Example #1
def generate_experience_test(experience, run_num, random_seed):
    # Check if this run of experience has already been generated:
    if np.count_nonzero(experience[run_num]) != 0:
        return

    # Initialize the environment:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle  # Re-import the puddleworld env in each subprocess or it sometimes isn't found during creation.
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    # Create the behaviour policy:
    mu = eval(args.behaviour_policy, {
        'np': np,
        'env': env
    })  # Give the eval'd function access to some objects.
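    # Note: args.behaviour_policy is assumed to be a string that eval's to a
    # function mapping a state to a vector of action probabilities, e.g.
    # 'lambda s: np.ones(env.action_space.n) / env.action_space.n'
    # (uniform random; illustrative only).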

    # Generate the required timesteps of experience:
    s_t = env.reset()
    a_t = rng.choice(env.action_space.n, p=mu(s_t))
    t = 0
    step = 0
    while t != args.num_timesteps:
        # Take action a_t, observe next state s_tp1 and reward r_tp1:
        s_tp1, r_tp1, terminal, _ = env.step(a_t)

        # The agent is reset to a starting state after a terminal transition:
        if terminal:
            s_tp1 = env.reset()

        a_tp1 = rng.choice(env.action_space.n, p=mu(s_tp1))

        step += 1
        # Add every 1000th visited state as an evaluation start state:
        if step % 1000 == 0:
            # Add the transition:
            experience[run_num, t] = (s_t, )
            step = 0
            t += 1

        # Update temporary variables:
        s_t = s_tp1
        a_t = a_tp1
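For context, a minimal sketch of how generate_experience_test might be driven, assuming the memmapped array of start states and the list of per-run seeds created in Example #5 below (a real driver would likely dispatch these calls across processes):

# Hypothetical driver; experience_memmap and random_seeds are assumed to be
# created as in Example #5.
for run_num in range(args.num_runs):
    generate_experience_test(experience_memmap, run_num, random_seeds[run_num])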
Example #2
def generate_experience(experience, run_num, random_seed):
    # Check if this run of experience has already been generated:
    if np.count_nonzero(experience[run_num]) != 0:
        return

    # Initialize the environment:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    # Create the behaviour policy:
    mu = eval(args.behaviour_policy, {
        'np': np,
        'env': env
    })  # Give the eval'd function access to some objects.

    # Generate the required timesteps of experience:
    s_t = env.reset()
    a_t = rng.choice(env.action_space.n, p=mu(s_t))
    for t in range(args.num_timesteps):
        # Take action a_t, observe next state s_tp1 and reward r_tp1:
        s_tp1, r_tp1, terminal, _ = env.step(a_t)

        # The agent is reset to a starting state after a terminal transition:
        if terminal:
            s_tp1 = env.reset()

        a_tp1 = rng.choice(env.action_space.n, p=mu(s_tp1))

        # Add the transition:
        experience[run_num, t] = (s_t, a_t, r_tp1, s_tp1, a_tp1, terminal)

        # Update temporary variables:
        s_t = s_tp1
        a_t = a_tp1
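Likewise, a minimal sketch of how the full-transition memmap might be created and filled with generate_experience. The structured dtype below is an assumption inferred from the tuple written on each timestep, and env is assumed to be a dummy environment created only for its observation shape, as in Example #5:

# Hypothetical setup (dtype and driver are illustrative, not from the source):
transition_dtype = np.dtype([('s_t', float, env.observation_space.shape),
                             ('a_t', int), ('r_tp1', float),
                             ('s_tp1', float, env.observation_space.shape),
                             ('a_tp1', int), ('terminal', bool)])
experience_memmap = np.lib.format.open_memmap(
    args.experience_file, mode='w+',
    shape=(args.num_runs, args.num_timesteps), dtype=transition_dtype)
for run_num in range(args.num_runs):
    generate_experience(experience_memmap, run_num, random_seeds[run_num])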
def run_ace(experience_memmap, policies_memmap, performance_memmap, run_num,
            config_num, parameters, random_seed, experience_memmap_test,
            num_test_eval):

    # If this run and configuration have already been done (e.g., because a previous job timed out and is being resumed), exit early:
    if np.count_nonzero(policies_memmap[config_num]['policies'][run_num]) != 0:
        return

    alpha_a, alpha_w, alpha_v, lambda_c, eta = parameters

    # If this is the first run with a set of parameters, save the parameters:
    if run_num == 0:
        policies_memmap[config_num]['parameters'] = (alpha_a, alpha_w, alpha_v,
                                                     lambda_c, eta, args.gamma,
                                                     args.num_tiles_per_dim,
                                                     args.num_tilings,
                                                     args.bias_unit)
        performance_memmap[config_num]['parameters'] = (alpha_a, alpha_w,
                                                        alpha_v, lambda_c, eta,
                                                        args.gamma,
                                                        args.num_tiles_per_dim,
                                                        args.num_tilings,
                                                        args.bias_unit)

    # Create the environment to evaluate the learned policy in:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    actor = BinaryACE(env.action_space.n, tc.total_num_tiles,
                      alpha_a / tc.num_active_features)
    if args.all_actions:
        critic = BinaryGQ(env.action_space.n, tc.total_num_tiles,
                          alpha_w / tc.num_active_features,
                          alpha_v / tc.num_active_features, lambda_c)
    elif args.critic == 'ETD':
        critic = BinaryETD(tc.total_num_tiles,
                           alpha_w / tc.num_active_features, lambda_c)
    else:
        critic = BinaryTDRC(tc.total_num_tiles,
                            alpha_w / tc.num_active_features, lambda_c)

    if args.direct_f:
        # Initialize the function approximator being used to estimate the emphatic weightings:
        fhat = BinaryFHat(tc.total_num_tiles, alpha_v / tc.num_active_features,
                          args.normalize)

    i = eval(args.interest_function)  # Create the interest function to use.
    mu = eval(args.behaviour_policy, {
        'np': np,
        'env': env
    })  # Create the behaviour policy and give it access to numpy and the env.
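    # Note: args.interest_function is assumed to be a string that eval's to a
    # function of (state, discount), e.g. 'lambda s, g=1: 1.' for uniform
    # interest (illustrative only); it is used below as i(s_t, gamma_t).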

    policies = np.zeros(num_policies, dtype=policy_dtype)
    performance = np.zeros((num_policies, args.num_evaluation_runs),
                           dtype=float)
    performance_excursions = np.zeros((num_policies, num_test_eval),
                                      dtype=float)

    np.seterr(divide='raise', over='raise', invalid='raise')
    try:
        transitions = experience_memmap[run_num]
        gamma_t = 0.
        f_t = 0.
        rho_tm1 = 1.
        indices_t = tc.encode(transitions[0][0])
        for t, transition in enumerate(transitions):
            # Save and evaluate the learned policy if it's a checkpoint timestep:
            if t % args.checkpoint_interval == 0:
                performance[t // args.checkpoint_interval] = [
                    evaluate_policy(actor, tc, env, rng, args.max_timesteps)
                    for _ in range(args.num_evaluation_runs)
                ]
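                # Also evaluate "excursions": episodes started from the held-out
                # states stored in the test experience (states visited under the
                # behaviour policy in Example #1):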
                perf_excursions = []
                for sample in range(num_test_eval):
                    env.state = experience_memmap_test[run_num][sample][0]
                    perf_excursions.append(
                        evaluate_policy(
                            actor,
                            tc,
                            env,
                            rng,
                            args.max_timesteps,
                            state=experience_memmap_test[run_num][sample][0]))
                performance_excursions[
                    t // args.checkpoint_interval] = perf_excursions
                policies[t //
                         args.checkpoint_interval] = (t, np.copy(actor.theta))

            # Unpack the stored transition.
            s_t, a_t, r_tp1, s_tp1, a_tp1, terminal = transition
            gamma_tp1 = args.gamma if not terminal else 0  # Transition-dependent discounting.
            indices_tp1 = tc.encode(s_tp1)
            i_t = i(s_t, gamma_t)
            i_tp1 = i(s_tp1, gamma_tp1)
            # Compute importance sampling ratio for the policy:
            pi_t = actor.pi(indices_t)
            mu_t = mu(s_t)
            rho_t = pi_t[a_t] / mu_t[a_t]

            if args.direct_f:
                # Estimate emphatic weightings with the function approximator:
                f_t = fhat.estimate(indices_t)

                # Update the function approximator:
                fhat.learn(indices_tp1, gamma_tp1, indices_t, rho_t, i_tp1)
            else:
                # Estimate emphatic weightings with the follow-on trace:
                if args.normalize:
                    f_t = (1 - gamma_t) * i_t + rho_tm1 * gamma_t * f_t
                else:
                    f_t = i_t + rho_tm1 * gamma_t * f_t
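            # The emphasis m_t interpolates between the interest i_t (eta = 0)
            # and the estimated emphatic weighting f_t (eta = 1).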
            m_t = (1 - eta) * i_t + eta * f_t

            if args.all_actions:
                critic.learn(indices_t, a_t, rho_t, gamma_t, r_tp1,
                             indices_tp1, actor.pi(indices_tp1), gamma_tp1)
                q_t = critic.estimate(indices_t)
                actor.all_actions_learn(indices_t, q_t, m_t)
            elif args.critic == 'ETD':
                delta_t = r_tp1 + gamma_tp1 * critic.estimate(
                    indices_tp1) - critic.estimate(indices_t)
                critic.learn(delta_t, indices_t, gamma_t, i_t, rho_t, f_t)
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)
            else:
                delta_t = r_tp1 + gamma_tp1 * critic.estimate(
                    indices_tp1) - critic.estimate(indices_t)
                critic.learn(delta_t, indices_t, gamma_t, indices_tp1,
                             gamma_tp1, rho_t)
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)

            gamma_t = gamma_tp1
            indices_t = indices_tp1
            rho_tm1 = rho_t

        # Save and evaluate the policy after the final timestep:
        policies[-1] = (t + 1, np.copy(actor.theta))
        performance[-1] = [
            evaluate_policy(actor, tc, env, rng, args.max_timesteps)
            for _ in range(args.num_evaluation_runs)
        ]
        perf_excursions = []
        for sample in range(num_test_eval):
            env.state = experience_memmap_test[run_num][sample][0]
            perf_excursions.append(
                evaluate_policy(
                    actor,
                    tc,
                    env,
                    rng,
                    args.max_timesteps,
                    state=experience_memmap_test[run_num][sample][0]))
        performance_excursions[-1] = perf_excursions

        # Save the learned policies and their performance to the memmap:
        performance_memmap[config_num]['results'][run_num] = performance
        performance_memmap[config_num]['results_excursions'][
            run_num] = performance_excursions
        policies_memmap[config_num]['policies'][run_num] = policies
    except (FloatingPointError, ValueError) as e:
        # Save NaN to indicate the weights overflowed and exit early:
        performance_memmap[config_num]['results'][run_num] = np.full_like(
            performance, np.NaN)
        performance_memmap[config_num]['results_excursions'][
            run_num] = np.full_like(performance_excursions, np.NaN)
        policies_memmap[config_num]['policies'][run_num] = np.full_like(
            policies, np.NaN)
        return
    experience_memmap = np.lib.format.open_memmap(args.experience_file,
                                                  mode='r')
    num_runs, num_timesteps = experience_memmap.shape

    # Load the input data as a memmap to prevent a copy being loaded into memory in each sub-process:
    experience_memmap_test = np.lib.format.open_memmap(
        args.experience_file_test, mode='r')
    num_runs, num_test_eval = experience_memmap_test.shape

    # Sample the random seed for each run without replacement so that no two runs share a seed (avoiding birthday-paradox collisions):
    random.seed(args.random_seed)
    random_seeds = random.sample(range(2**32), num_runs)

    # Create the tile coder to be used for all parameter settings:
    if args.environment == 'pw':
        dummy_env = puddleworld()
    else:
        dummy_env = gym.make(
            args.environment).unwrapped  # Make a dummy env to get shape info.
    tc = TileCoder(
        np.array([
            dummy_env.observation_space.low, dummy_env.observation_space.high
        ]).T, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

    # Create the memmapped array of learned policies that will be populated in parallel:
    parameters_dtype = np.dtype([('alpha_a', float), ('alpha_w', float),
                                 ('alpha_v', float), ('lambda', float),
                                 ('eta', float), ('gamma', float),
                                 ('num_tiles_per_dim', int,
                                  (len(args.num_tiles_per_dim), )),
                                 ('num_tilings', int), ('bias_unit', bool)])
Example #5
    # Sample the random seed for each run without replacement so that no two runs share a seed (avoiding birthday-paradox collisions):
    random.seed(args.random_seed)
    random_seeds = random.sample(range(2**32), args.num_runs)

    # Save the command line arguments in a format interpretable by argparse:
    output_dir = Path(args.output_dir)

    if args.test_data:
        # We consider 50 different start states for evaluation:
        args.num_timesteps = 50

        utils.save_args_to_file(args, output_dir / 'experience_test.args')

        # Create the memmapped structured array of experience to be populated in parallel:
        if args.environment == 'pw':
            env = puddleworld()
        else:
            env = gym.make(
                args.environment
            ).unwrapped  # Make a dummy env to get shape info for observations.
        transition_dtype = np.dtype([('s_t', float,
                                      env.observation_space.shape)])

        experience_memmap_path = str(output_dir / 'experience_test.npy')
        if os.path.isfile(experience_memmap_path):
            experience_memmap = np.lib.format.open_memmap(
                experience_memmap_path, mode='r+')
        else:
            experience_memmap = np.lib.format.open_memmap(
                experience_memmap_path,
                shape=(args.num_runs, args.num_timesteps),