import joblib
import numpy as np  # used by the commented-out reward shaping below

from rllab.envs.gym_env import GymEnv

from inverse_rl.envs import register_custom_envs


def main(exp_name, ent_wt=1.0):
    """Replay a saved TRPO Pendulum policy and print per-episode returns."""
    register_custom_envs()
    env_name = 'Pendulum-v0'
    pickle_path = '../gpirl/notebooks/plots/rllab_trpo_trainig/itr_112.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            # Note: the exponent shrinks as i grows, so later steps count more.
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
            # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("-------------------------")
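# The replay scripts in this repo accumulate r * 0.99**(T - i) with T fixed at
# the horizon (500 or 1000 steps), which weights *later* steps more heavily.
# The conventional discounted return instead uses gamma**i. A minimal sketch of
# the conventional computation, for comparison (the `rewards` argument and the
# helper itself are hypothetical, not part of the original scripts):

def discounted_return(rewards, gamma=0.99):
    """Standard discounted return: sum_i gamma**i * r_i."""
    return sum(r * gamma**i for i, r in enumerate(rewards))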
import joblib

from rllab.envs.gym_env import GymEnv

from inverse_rl.envs import register_custom_envs


def main(exp_name, ent_wt=1.0):
    """Replay a saved TRPO Cartpole-v3 policy and print per-episode returns."""
    register_custom_envs()
    env_name = 'Cartpole-v3'
    pickle_path = 'data/Cartpole_v3_data_rllab_TRPO/exp_1/itr_1200.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        print("stable point : {}".format(env.env._stable_x))
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
            # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("-------------------------")
import os

from rllab.algos.ddpg import DDPG
from rllab.envs.gym_env import GymEnv
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc import logger
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

from inverse_rl.envs import register_custom_envs


def main(exp_name, ent_wt=1.0):
    """Train a DDPG policy on LunarLanderContinuous-v3 and snapshot it."""
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0 / 140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment to enable plotting:
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
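# The collapsed sources never show how main() is invoked. A minimal entry
# point is sketched below; the __main__ guard and the 'exp_1' experiment name
# are assumptions, though snapshot paths like
# 'data/..._data_rllab_PPO/exp_1/' elsewhere in this repo suggest this
# convention:

if __name__ == "__main__":
    main(exp_name='exp_1')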
import joblib

from rllab.envs.gym_env import GymEnv

from inverse_rl.envs import register_custom_envs


def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    itr_num = 400
    episode_length = 400
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        pickle_path = '../gpirl/notebooks_lunarlander/plots/gpirl_400_iter_post_trainig/itr_{}.pkl'.format(
            itr_num)
        # pickle_path = 'data/LunarLanderContinuous_v3_data_rllab_PPO/exp_1/itr_{}.pkl'.format(itr_num)
        iter_data = joblib.load(pickle_path)
        while i < episode_length:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
            if done:
                break
            # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("iterations : {}".format(i))
        print("-------------------------")
import joblib

from rllab.envs.gym_env import GymEnv

from inverse_rl.envs import register_custom_envs


def main(exp_name, ent_wt=1.0):
    """Sweep over saved PPO snapshots, running a few trials of each."""
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    max_r = 1
    itr_num = 0
    itr_inc = 30
    init_max_trials = 6   # trials for the very first snapshot
    norm_max_trials = 4   # trials for every later snapshot
    max_iters = 400       # per-episode step cap
    trial_num = 1
    while itr_num < 211:
        o = env.reset()
        if itr_num < 1:
            max_trials = init_max_trials
        else:
            max_trials = norm_max_trials
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        # pickle_path = '../gpirl/notebooks_lunarlander/plots/gpirl_400_iter_post_trainig/itr_{}.pkl'.format(itr_num)
        pickle_path = 'data/LunarLanderContinuous_v3_data_rllab_PPO/exp_1/itr_{}.pkl'.format(
            itr_num)
        iter_data = joblib.load(pickle_path)
        while i < max_iters:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
            if done:
                break
            # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("iterations : {}".format(i))
        print("-------------------------")
        # Advance to the next snapshot once its trial budget is used up.
        if trial_num % max_trials == 0:
            if itr_num < 1:
                trial_num = norm_max_trials
            itr_num += itr_inc
            print("**************** itr number : {} **********************".
                  format(itr_num))
        trial_num += 1
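# The trial/iteration bookkeeping above is easy to misread. An equivalent
# nested-loop formulation of the same schedule (6 trials for snapshot 0, then
# 4 trials each for snapshots 30, 60, ..., 210) is sketched below;
# `evaluate_snapshot` is a hypothetical helper wrapping the load-and-rollout
# body of the loop above:

def sweep_snapshots(evaluate_snapshot, itr_inc=30, last_itr=210,
                    init_trials=6, norm_trials=4):
    for itr_num in range(0, last_itr + 1, itr_inc):
        n_trials = init_trials if itr_num == 0 else norm_trials
        for _ in range(n_trials):
            evaluate_snapshot(itr_num)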
import os

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.misc import logger
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

from inverse_rl.envs import register_custom_envs
# PPO import path assumed; this repo's PPO (which accepts entropy_weight)
# may live in a local rllab fork.
from rllab.algos.ppo import PPO


def main(exp_name, ent_wt=1.0):
    """Train a PPO policy on Acrobot-v2 and snapshot it."""
    register_custom_envs()
    env_name = 'Acrobot-v2'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.95,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=LinearFeatureBaseline(env_spec=env.spec))
    data_path = 'data/acrobat_data_rllab_ppo/%s/' % exp_name
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
import os

from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
from rllab.envs.gym_env import GymEnv
from rllab.misc import logger
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

from inverse_rl.envs import register_custom_envs
# PPO import path assumed, as in the Acrobot script above.
from rllab.algos.ppo import PPO


def main(exp_name, ent_wt=1.0):
    """Train a PPO policy on LunarLanderContinuous-v3 and snapshot it."""
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    baseline = GaussianMLPBaseline(env_spec=env.spec)
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.99,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=baseline)
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
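# For this configuration the data_path template resolves to e.g.
# 'data/LunarLanderContinuous_v3_data_rllab_PPO/exp_1/' (for exp_name='exp_1'),
# which matches the snapshot paths the playback scripts load. A small helper
# capturing the convention (the helper itself is hypothetical):

def snapshot_dir(env_name, algo, exp_name):
    """Build the snapshot directory used by the training scripts."""
    return 'data/%s_data_rllab_%s/%s/' % (
        env_name.replace('-', '_'), algo.__class__.__name__, exp_name)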
import joblib
import numpy as np

from rllab.envs.gym_env import GymEnv

from inverse_rl.envs import register_custom_envs


def main(exp_name, ent_wt=1.0):
    """Replay a saved Acrobot policy and print its discounted return."""
    register_custom_envs()
    env_name = 'Acrobot-v2'
    pickle_path = 'data/acrobat_data_moded_reward_ppo/exp_1/itr_800.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        done = False
        i = 0
        # print("New episode!")
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(1000 - i)
            i += 1
            # max_r = r
        print("disc_r : {}".format(disc_r))
from __future__ import print_function

import datetime
import os
import sys
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyglet.window import key as ks

from inverse_rl.envs import register_custom_envs

#
# Test yourself as a learning agent! Pass the environment name as a
# command-line argument, for example:
#
#     python keyboard_agent.py SpaceInvadersNoFrameskip-v4
#

keyboard = ks.KeyStateHandler()
register_custom_envs()
env = gym.make(
    'LunarLanderContinuous-v3' if len(sys.argv) < 2 else sys.argv[1])

Kp = 0.1
Kt = 0.2
gamma = 0.9
save_path = "data/lunarlander_demo/"

ACTIONS = env.action_space
RESET_ACTION = np.asarray([0., 0.])
# Reuse the previous control decision SKIP_CONTROL times; this lets you test
# how much frame skip is still usable.
SKIP_CONTROL = 0

# print("ACTION high low : {} , {}".format(env.action_space.high, env.action_space.low))

human_wants_restart = False
human_sets_pause = False
state_cols = ["state_" + str(i) for i in range(env.observation_space.shape[0])]
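# The KeyStateHandler above only reports key state once it has been pushed
# onto the render window's event handlers. With the classic gym Box2D viewer
# this is typically done after the first render() call; the
# `env.unwrapped.viewer.window` attribute path is an assumption about the gym
# version in use:

env.render()
env.unwrapped.viewer.window.push_handlers(keyboard)

# After this, e.g. keyboard[ks.UP] is True while the up-arrow key is held.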