def policy_f(env, scaler, featurizer, print_ep_lens):
    '''
    Main calling function for generating the expert policy.
    ** Read the multi-line comment at the start of this file to gain understanding.

    Args:
        env: Gym environment.
        scaler: Mean and variance of the state values.
        featurizer: The container used for generating expert trajectories.
        print_ep_lens: [Bool] Prints each iteration with the no. of time steps
            required for completion.

    Returns:
        Plots statistics of mountain-car learning with the built-in rewards of the
        gym environment, so that results can be compared against mountain-car
        learning with the learnt reward function. Returns the "Demonstration By
        Expert" (DBE) policy together with the fitted estimator.
    '''
    estimator = Estimator(env, scaler, featurizer)
    stats = q_learning_best_policy(env, estimator, 200, epsilon=0.0,
                                   print_ep_lens=print_ep_lens)
    print("___Plotting Learning Stats of the Agent____")
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
    final_policy = greedy_policy(estimator, env.action_space.n)
    return final_policy, estimator
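
# NOTE: greedy_policy is defined elsewhere in this file. A minimal sketch of what it
# might look like, assuming estimator.predict(state) returns one Q-value per action
# (the helper below is illustrative, not the source's exact implementation).
import numpy as np

def greedy_policy(estimator, nA):
    """Deterministic policy that always picks the highest-valued action."""
    def policy_fn(state):
        q_values = estimator.predict(state)
        action_probs = np.zeros(nA)
        action_probs[np.argmax(q_values)] = 1.0  # one-hot on the greedy action
        return action_probs
    return policy_fn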
def main():
    env = gym.make('MountainCar-v0')
    env.seed(SEED)
    env.reset()

    ntilings = 8
    tiles = Tiles(env.low, env.high, (8, 8), ntilings, env.action_space.n)
    Q, _ = episodic_semi_gradient_n_step_sarsa(
        env, tiles, 0.99, 0.5 / ntilings, 0.2, 100)

    plot_cost_to_go_mountain_car(env, Q)
    watch_greedy_policy(env, Q)
    env.close()
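
# NOTE: Tiles and episodic_semi_gradient_n_step_sarsa are defined elsewhere in the
# repo. The sketch below shows how a tile-coded linear Q-function is typically
# evaluated and updated, assuming the tile coder maps (state, action) to the indices
# of its active tiles -- the class and its interface are illustrative, not the source's.
import numpy as np

class LinearTileQ:
    def __init__(self, active_tiles_fn, n_features):
        self.active_tiles = active_tiles_fn  # (state, action) -> list of tile indices
        self.w = np.zeros(n_features)

    def value(self, state, action):
        # Q(s, a) is the sum of the weights of the active (binary) tile features.
        return self.w[self.active_tiles(state, action)].sum()

    def update(self, state, action, target, alpha):
        # Semi-gradient step: each active weight moves toward the TD target.
        idx = self.active_tiles(state, action)
        self.w[idx] += alpha * (target - self.w[idx].sum())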
def main():
    logging.info("define environment and basis function")
    env_id = "MountainCar-v0"
    env = gym.envs.make(env_id)
    logging.info("env_id: {}".format(env_id))
    action_list = range(env.action_space.n)

    # linear basis function
    p_linear = 3
    q_linear = 3
    phi_linear = simple_phi
    psi_linear = phi_linear

    # radial basis (Gaussian) function
    p_rbf = 100
    q_rbf = 100
    phi_rbf = get_basis_function(env_id)
    psi_rbf = phi_rbf

    # this initial-state sampler is specific to MountainCar-v0
    init_s_sampler = lambda: [np.random.uniform(-0.6, -0.4), 0.0]

    # 2. define hyperparams
    gamma = 0.95
    n_trial = 2
    n_iteration = 10  # @note: hard-coded
    # this has to be sufficiently large to avoid Monte Carlo variance issues
    sample_size_mc = 10**2

    #p = p_linear
    #q = q_linear
    #phi = phi_linear
    #psi = psi_linear
    p = p_rbf
    q = q_rbf
    phi = phi_rbf
    psi = psi_rbf

    precision = 1e-4
    use_slack = False
    # @note: reward may have to be scaled to work with the slack penalty
    slack_penalty = 1e-3
    eps = 0.0001
    #eps = 0
    # this should be large to account for the varying initial state
    mu_sample_size = 50

    logging.info("collect a batch of data (D) from pi_expert (and some noise)")
    pi_exp = NearExpertPolicy()
    pi_random = get_random_policy()

    # preprocessing D in numpy array for k
    logging.info("apprenticeship learning starts")
    logging.info("feature dim:\n{}".format(phi))
    mu_exp = AL.estimate_mu(env=env,
                            pi_eval=pi_exp,
                            mu_sample_size=sample_size_mc,
                            phi=phi,
                            gamma=gamma,
                            return_epi_len=False)
    #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_linear, gamma, sample_size_mc)
    #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_rbf, gamma, sample_size_mc)
    #mu_exp = np.mean(mu_mc_list, axis=0)

    pi_init = pi_random
    mdp_solver = LinearQ3(env=env,
                          phi=phi,
                          action_list=action_list,
                          n_episode=100,
                          epsilon=0.0,
                          gamma=gamma)
    al = AL(env=env,
            pi_init=pi_init,
            action_list=action_list,
            p=p,
            q=q,
            phi=phi,
            psi=psi,
            gamma=gamma,
            eps=eps,
            mu_exp=mu_exp,
            init_s_sampler=init_s_sampler,
            mu_sample_size=mu_sample_size,
            precision=precision,
            mdp_solver=mdp_solver,
            use_slack=use_slack,
            slack_penalty=slack_penalty)
    results = al.run(n_trial=n_trial, n_iteration=n_iteration)

    # 5. post-process results (plotting)
    pi_irl = results["policy_best"][0]
    weight_irl = results["weight_best"][0]
    margin_v = results["margin_v"][0]
    margin_mu = results["margin_mu"][0]
    weight = results["weight"][0]

    state_dim = env.observation_space.shape[0]
    # discrete action
    action_dim = 1
    n_action = env.action_space.n
    sim = Simulator(env, state_dim=state_dim, action_dim=action_dim)
    D_irl, stats = sim.simulate(pi_irl, n_trial=1, n_episode=15, return_stats=True)
    plotting.plot_cost_to_go_mountain_car(env, pi_irl._estimator)
    plotting.plot_episode_stats(stats, smoothing_window=5)

    # timestamp the saved artifacts so repeated runs do not overwrite each other
    np.save("data/D_irl_{}.npy".format(time()), D_irl)
    np.save("data/margin_v_{}.npy".format(time()), margin_v)
    np.save("data/margin_mu_{}.npy".format(time()), margin_mu)
    np.save("data/weight_{}.npy".format(time()), weight)
    np.save("data/weight_best_{}.npy".format(time()), weight_irl)
    print("D_irl shape: {}".format(D_irl.shape))

    with open("data/res_{}".format(time()), "wb") as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
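
# NOTE: AL.estimate_mu lives in the repo's apprenticeship-learning code. A minimal
# sketch of the underlying idea -- a Monte-Carlo estimate of the feature expectation
# mu(pi) = E[sum_t gamma^t * phi(s_t)] -- assuming phi(state) returns a feature vector
# and pi(state) returns an action (name and signature below are illustrative).
import numpy as np

def estimate_mu_mc(env, pi, phi, gamma, n_episode, max_steps=200):
    mu = None
    for _ in range(n_episode):
        state = env.reset()
        ret = np.zeros(len(phi(state)))
        discount = 1.0
        for _ in range(max_steps):
            ret += discount * np.asarray(phi(state), dtype=float)
            state, _, done, _ = env.step(pi(state))
            discount *= gamma
            if done:
                break
        mu = ret if mu is None else mu + ret
    return mu / n_episode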
            next_state, reward, end, _ = env.step(action)

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * q_values_next[next_action]
            estimator.update(state, action, td_target)

            if i_episode % 10 == 0:
                print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, reward))

            if end:
                break

            state = next_state
            action = next_action

    return stats


estimator = FunctionApproximator()
stats = sarsa(env, estimator, 200, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
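
# NOTE: the block above is only the inner time-step body of an on-policy SARSA loop;
# the enclosing function is not shown in this excerpt. Below is a compact,
# self-contained sketch of how such a loop is typically structured -- the signature,
# the EpisodeStats container and the epsilon-greedy policy are assumptions, not the
# source's exact code.
import itertools
from collections import namedtuple
import numpy as np

EpisodeStats = namedtuple("EpisodeStats", ["episode_lengths", "episode_rewards"])

def sarsa_sketch(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1):
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    def policy(state):
        # epsilon-greedy distribution over the estimator's Q-values
        q = estimator.predict(state)
        probs = np.ones(env.action_space.n) * epsilon / env.action_space.n
        probs[np.argmax(q)] += 1.0 - epsilon
        return probs

    for i_episode in range(num_episodes):
        state = env.reset()
        action = np.random.choice(env.action_space.n, p=policy(state))
        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action = np.random.choice(env.action_space.n, p=policy(next_state))
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            # on-policy TD target: bootstrap on the action actually taken next
            td_target = reward + discount_factor * estimator.predict(next_state)[next_action]
            estimator.update(state, action, td_target)
            if done:
                break
            state, action = next_state, next_action
    return stats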
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            q_values_next = estimator.predict(next_state)
            # off-policy TD target: bootstrap on the greedy (max) next action
            td_target = reward + discount_factor * np.max(q_values_next)
            estimator.update(state, action, td_target)

            # stats.episode_rewards[i_epoch_num] += reward
            # stats.episode_lengths[i_epoch_num] = it

            print("\rStep {} @ Episode {}/{}".format(it, i_epoch_num + 1, epoch_num))

            if done:
                print(it)
                break

            state = next_state


estimator = Estimator()
Q_learning_with_value_approximation(env, estimator, 100, epsilon=0.0)
plotting.plot_cost_to_go_mountain_car(env, estimator)
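
# NOTE: Estimator is defined elsewhere in this script. Below is a minimal sketch of a
# typical per-action linear function approximator over RBF features of the raw
# (position, velocity) state; the class, constructor arguments, and internals are
# illustrative assumptions, not the source's implementation.
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler

class EstimatorSketch:
    def __init__(self, env, n_components=100):
        # Fit the RBF feature map on states sampled from the observation space.
        samples = np.array([env.observation_space.sample() for _ in range(10000)])
        self.featurizer = RBFSampler(gamma=1.0, n_components=n_components)
        self.featurizer.fit(samples)
        # One linear model per discrete action, trained online with partial_fit.
        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit(self._features(env.reset()), [0.0])
            self.models.append(model)

    def _features(self, state):
        return self.featurizer.transform(np.array(state).reshape(1, -1))

    def predict(self, state):
        feats = self._features(state)
        return np.array([m.predict(feats)[0] for m in self.models])

    def update(self, state, action, td_target):
        self.models[action].partial_fit(self._features(state), [td_target])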
def main():
    estimator = Estimator()
    stats = expected_sarsa(env, estimator, 100, epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
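
# NOTE: expected_sarsa is defined elsewhere. Its defining difference from SARSA and
# Q-learning is the TD target, which averages the next state's Q-values under the
# behaviour policy instead of using a sampled or greedy action. A minimal sketch,
# assuming an epsilon-greedy policy and an estimator exposing predict():
import numpy as np

def expected_sarsa_target(estimator, next_state, reward, discount_factor, epsilon, nA):
    """reward + gamma * sum_a pi(a | s') * Q(s', a) for an epsilon-greedy pi."""
    q_next = estimator.predict(next_state)
    probs = np.ones(nA) * epsilon / nA
    probs[np.argmax(q_next)] += 1.0 - epsilon
    return reward + discount_factor * np.dot(probs, q_next)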