def policy_f(env, scaler, featurizer, print_ep_lens):
    '''
    Main calling function for generating the expert policy.
    ** Read the multi-line comment at the start of this file for background.

    Args:
        env: Gym environment.
        scaler: Mean and variance of the state values.
        featurizer: The container used for generating expert trajectories.
        print_ep_lens: [bool] If True, prints each iteration with the number of
            time steps required for completion.

    Returns:
        a) Plots statistics of Mountain Car learning with the built-in rewards of
           the gym environment, so that results can be compared against learning
           with the learnt reward function.
        b) Returns the "Demonstration By Expert" (DBE) policy and the fitted estimator.
    '''
    estimator = Estimator(env, scaler, featurizer)
    stats = q_learning_best_policy(env,
                                   estimator,
                                   200,
                                   epsilon=0.0,
                                   print_ep_lens=print_ep_lens)
    print("___Plotting Learning Stats of the Agent____")
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
    final_policy = greedy_policy(estimator, env.action_space.n)
    return final_policy, estimator
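
# A minimal usage sketch for policy_f. The StandardScaler + RBFSampler
# preprocessing below is an assumption about how `scaler` and `featurizer`
# are built (the real setup lives elsewhere in this file), not the file's
# actual code.
import gym
import numpy as np
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

def build_preprocessing(env, n_samples=10000):
    # Fit a scaler and an RBF feature map on states sampled from the environment.
    observation_examples = np.array(
        [env.observation_space.sample() for _ in range(n_samples)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=1.0, n_components=100)),
    ])
    featurizer.fit(scaler.transform(observation_examples))
    return scaler, featurizer

env = gym.make("MountainCar-v0")
scaler, featurizer = build_preprocessing(env)
dbe_policy, estimator = policy_f(env, scaler, featurizer, print_ep_lens=False)
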
def main():
    env = gym.make('MountainCar-v0')
    env.seed(SEED)
    env.reset()

    # Tile-code the 2-D (position, velocity) state space:
    # 8 tilings over an 8x8 grid, with one set of weights per action.
    ntilings = 8
    tiles = Tiles(env.low, env.high, (8, 8), ntilings, env.action_space.n)

    # Train with episodic semi-gradient n-step SARSA; the step size is
    # scaled down by the number of tilings.
    Q, _ = episodic_semi_gradient_n_step_sarsa(
        env, tiles, 0.99, 0.5 / ntilings, 0.2, 100)

    plot_cost_to_go_mountain_car(env, Q)

    watch_greedy_policy(env, Q)

    env.close()
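
# The Tiles class used above is not shown in this excerpt. The helper below is
# a minimal, self-contained sketch of the underlying idea (a hypothetical
# illustration, not the repo's implementation): each tiling maps the continuous
# (position, velocity) state to one grid cell, and offsetting the tilings
# against each other yields overlapping binary features.
import numpy as np

def tile_indices(state, low, high, grid=(8, 8), ntilings=8):
    # Returns one active tile index per tiling. Each tiling is the same grid
    # shifted by a fraction of a cell, so nearby states share most of their
    # active tiles (coarse generalization).
    state = np.asarray(state, dtype=float)
    low = np.asarray(low, dtype=float)
    high = np.asarray(high, dtype=float)
    grid = np.asarray(grid)
    scaled = (state - low) / (high - low)       # normalize each dimension to [0, 1]
    indices = []
    for k in range(ntilings):
        offset = k / (ntilings * grid)          # shift tiling k by k/ntilings of a cell
        cell = np.floor((scaled + offset) * grid).astype(int)
        cell = np.minimum(np.maximum(cell, 0), grid - 1)
        indices.append(k * grid[0] * grid[1] + cell[0] * grid[1] + cell[1])
    return indices                              # one active (value 1) feature per tiling

# For MountainCar-v0 the bounds are the position/velocity limits passed to
# Tiles above as env.low and env.high.
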
def main():
    logging.info("define environment and basis function")
    env_id = "MountainCar-v0"
    env = gym.envs.make(env_id)
    logging.info("env_id: {}".format(env_id))
    action_list = range(env.action_space.n)

    # linear basis func
    p_linear = 3
    q_linear = 3
    phi_linear = simple_phi
    psi_linear = phi_linear

    # radial basis (gaussian) fn
    p_rbf = 100
    q_rbf = 100
    phi_rbf = get_basis_function(env_id)
    psi_rbf = phi_rbf

    # this is specific to MountainCar-v0: start at a random position in
    # [-0.6, -0.4] with zero velocity
    init_s_sampler = lambda: [np.random.uniform(-0.6, -0.4), 0.0]

    # 2. define hyperparams
    gamma = 0.95
    n_trial = 2
    n_iteration = 10
    # @note: hard-coded; this must be large enough to keep the variance of the
    # Monte Carlo estimate of the feature expectations low
    sample_size_mc = 10**2
    #p = p_linear
    #q = q_linear
    #phi = phi_linear
    #psi = psi_linear
    p = p_rbf
    q = q_rbf
    phi = phi_rbf
    psi = psi_rbf
    precision = 1e-4
    use_slack = False
    # @note: reward may have to be scaled to work with slack penalty
    slack_penalty = 1e-3
    eps = 0.0001
    #eps = 0
    # this should be large to account for the varying initial state
    mu_sample_size = 50

    logging.info("collect a batch of data (D) from pi_expert (and some noise)")
    pi_exp = NearExpertPolicy()
    pi_random = get_random_policy()

    # preprocessing D in numpy array for k
    logging.info("apprenticeship learning starts")
    logging.info("feature dim:\n{}".format(phi))

    mu_exp = AL.estimate_mu(env=env,
                            pi_eval=pi_exp,
                            mu_sample_size=sample_size_mc,
                            phi=phi,
                            gamma=gamma,
                            return_epi_len=False)
    #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_linear, gamma, sample_size_mc)
    #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_rbf, gamma, sample_size_mc)
    #mu_exp = np.mean(mu_mc_list, axis=0)

    pi_init = pi_random

    mdp_solver = LinearQ3(env=env,
                          phi=phi,
                          action_list=action_list,
                          n_episode=100,
                          epsilon=0.0,
                          gamma=gamma)

    al = AL(env=env,
            pi_init=pi_init,
            action_list=action_list,
            p=p,
            q=q,
            phi=phi,
            psi=psi,
            gamma=gamma,
            eps=eps,
            mu_exp=mu_exp,
            init_s_sampler=init_s_sampler,
            mu_sample_size=mu_sample_size,
            precision=precision,
            mdp_solver=mdp_solver,
            use_slack=use_slack,
            slack_penalty=slack_penalty)

    results = al.run(n_trial=n_trial, n_iteration=n_iteration)

    # 5. post-process results (plotting)
    pi_irl = results["policy_best"][0]
    weight_irl = results["weight_best"][0]
    margin_v = results["margin_v"][0]
    margin_mu = results["margin_mu"][0]
    weight = results["weight"][0]

    state_dim = env.observation_space.shape[0]
    # discrete action
    action_dim = 1
    n_action = env.action_space.n
    sim = Simulator(env, state_dim=state_dim, action_dim=action_dim)

    D_irl, stats = sim.simulate(pi_irl,
                                n_trial=1,
                                n_episode=15,
                                return_stats=True)

    plotting.plot_cost_to_go_mountain_car(env, pi_irl._estimator)
    plotting.plot_episode_stats(stats, smoothing_window=5)

    np.save("data/D_irl.npy".format(time()), D_irl)
    np.save("data/margin_v.npy".format(time()), margin_v)
    np.save("data/margin_mu.npy".format(time()), margin_mu)
    np.save("data/weight.npy".format(time()), weight)
    np.save("data/weight_best.npy".format(time()), weight_irl)
    print("D_irl shape{}".format(D_irl.shape))

    with open("data/res_{}".format(time()), "wb") as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
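
# The internals of AL.estimate_mu and AL.run are not shown in this excerpt.
# The two helpers below are a rough, self-contained sketch of what such code
# typically computes (hypothetical names, not the AL class itself): a Monte
# Carlo estimate of the feature expectations mu(pi) = E[ sum_t gamma^t phi(s_t) ],
# and one step of the projection variant of apprenticeship learning
# (Abbeel & Ng, 2004), which sets the reward weights to the gap between expert
# and learner feature expectations.
import numpy as np

def estimate_mu_sketch(env, pi, phi, gamma, n_rollout=100, max_steps=200):
    # Monte Carlo estimate of mu(pi); AL.estimate_mu above may differ in how it
    # resets the environment and queries the policy.
    mu = np.zeros(np.asarray(phi(env.reset())).shape)
    for _ in range(n_rollout):
        s = env.reset()
        discount = 1.0
        for _ in range(max_steps):
            mu += discount * np.asarray(phi(s))
            s, _, done, _ = env.step(pi(s))     # assumes pi maps state -> action
            discount *= gamma
            if done:
                break
    return mu / n_rollout

def projection_step(mu_exp, mu_pi, mu_bar_prev=None):
    # One iteration of the projection method: maintain a running projection
    # mu_bar and set the reward weights to w = mu_exp - mu_bar; the margin
    # ||w|| shrinks as the learner's feature expectations approach the expert's.
    mu_exp = np.asarray(mu_exp, dtype=float)
    mu_pi = np.asarray(mu_pi, dtype=float)
    if mu_bar_prev is None:
        mu_bar = mu_pi
    else:
        d = mu_pi - mu_bar_prev
        mu_bar = mu_bar_prev + (np.dot(d, mu_exp - mu_bar_prev) / np.dot(d, d)) * d
    w = mu_exp - mu_bar                         # reward weights: R(s) = w . phi(s)
    margin = np.linalg.norm(w)                  # iterate until margin <= eps
    return w, margin, mu_bar
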
def sarsa(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1):
    # NOTE (reconstruction): only the inner update loop appeared in the original
    # excerpt; the header, stats container and epsilon-greedy policy are assumed.
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    def policy(observation):
        # epsilon-greedy action probabilities under the current estimator
        q_values = estimator.predict(observation)
        probs = np.ones(env.action_space.n) * epsilon / env.action_space.n
        probs[np.argmax(q_values)] += 1.0 - epsilon
        return probs

    for i_episode in range(num_episodes):
        state = env.reset()
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        for t in itertools.count():
            next_state, reward, end, _ = env.step(action)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # SARSA (on-policy) TD target: bootstrap with the sampled next action
            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * q_values_next[next_action]

            estimator.update(state, action, td_target)

            if i_episode % 10 == 0:
                print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, reward))

            if end:
                break

            state = next_state
            action = next_action
    return stats


estimator = FunctionApproximator()
stats = sarsa(env, estimator, 200, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
def Q_learning_with_value_approximation(env, estimator, epoch_num, discount_factor=1.0, epsilon=0.1):
    # NOTE (reconstruction): only the inner update loop appeared in the original excerpt; the header and epsilon-greedy setup are assumed.
    def policy(observation):
        # epsilon-greedy action probabilities under the current estimator
        q_values = estimator.predict(observation)
        probs = np.ones(env.action_space.n) * epsilon / env.action_space.n
        probs[np.argmax(q_values)] += 1.0 - epsilon
        return probs

    for i_epoch_num in range(epoch_num):
        state = env.reset()
        for it in itertools.count():
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            next_state, reward, done, _ = env.step(action)
            # Q-learning (off-policy) TD target: bootstrap with the greedy next action
            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * np.max(q_values_next)
            estimator.update(state, action, td_target)

            # stats.episode_rewards[i_epoch_num] += reward
            # stats.episode_lengths[i_epoch_num] = it
            print("\rStep {} @ Episode {}/{}".format(it, i_epoch_num + 1, epoch_num))

            if done:
                print(it)
                break
            state = next_state


estimator = Estimator()
Q_learning_with_value_approximation(env, estimator, 100, epsilon=0.0)
plotting.plot_cost_to_go_mountain_car(env, estimator)


def main():
    estimator = Estimator()
    stats = expected_sarsa(env, estimator, 100, epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
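
# The expected_sarsa implementation is not included in this excerpt. Its
# defining difference from the SARSA and Q-learning loops above is the TD
# target, which averages the next-state action values under the current policy
# rather than using the sampled or greedy next action. The helper below is an
# illustrative sketch, not code taken from expected_sarsa() itself.
import numpy as np

def expected_sarsa_target(reward, q_values_next, next_action_probs, discount_factor):
    # TD targets, side by side:
    #   SARSA:          reward + gamma * Q(s', a')              (sampled next action)
    #   Q-learning:     reward + gamma * max_a Q(s', a)         (greedy next action)
    #   Expected SARSA: reward + gamma * sum_a pi(a|s') Q(s', a)
    return reward + discount_factor * np.dot(next_action_probs, q_values_next)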