def run(train_fn, env_config, net_config=None):
    mode = '--train' if not sys.argv[1:] else sys.argv[1]
    if mode == '--train':
        train_fn()
    elif mode == '--test':
        if not sys.argv[2:]:
            model_path = get_latest_model_path()  # TODO
        else:
            model_path = sys.argv[2]
        env = gym.make(env_config['env_name'])
        _, get_action = load_policy_and_env(fpath=model_path,
                                            deterministic=True,
                                            env=env,
                                            net_config=net_config)
        run_policy(env, get_action)

def run(train_fn, env_config, net_config=None):
    mode = '--train' if not sys.argv[1:] else sys.argv[1]
    if mode == '--train':
        train_fn()
    elif mode == '--play':
        from deepdrive_zero import player
        env_config['physics_steps_per_observation'] = 1
        player.start(env_config=env_config)
    elif mode == '--test':
        if not sys.argv[2:]:
            model_path = get_latest_model_path()  # TODO
        else:
            model_path = sys.argv[2]
        env = gym.make(env_config['env_name'])
        env.configure_env(env_config)
        _, get_action = load_policy_and_env(fpath=model_path,
                                            deterministic=True,
                                            env=env,
                                            net_config=net_config)
        run_policy(env, get_action)

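# A minimal sketch of how a training script might wire into run() above.
# The env name is taken from the deepdrive snippets further down; the PPO
# call itself is an assumption for illustration, not the original training code:
env_config = dict(env_name='deepdrive-2d-intersection-w-gs-allow-decel-v0')

def train():
    from spinup import ppo_pytorch as ppo
    ppo(lambda: gym.make(env_config['env_name']), epochs=50)

if __name__ == '__main__':
    run(train_fn=train, env_config=env_config)
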
def __init__(self, env, add_penalty=10, threshold=25, mult_penalty=None,
             cost_penalty=0, buckets=None, safe_policy=False):
    self.base_env = env     # Use the safety-gym environment as the base env
    self.buckets = buckets  # No. of buckets for cost discretization

    # Add a cost dimension to the observation space
    if self.buckets is None:  # Scalar cost
        low = np.concatenate([env.observation_space.low, np.array([0])])
        high = np.concatenate(
            [env.observation_space.high, np.array([np.inf])])
    else:  # Discretized cost
        low = np.concatenate([
            env.observation_space.low,
            np.array([0 for _ in range(self.buckets)])
        ])
        high = np.concatenate([
            env.observation_space.high,
            np.array([np.inf for _ in range(self.buckets)])
        ])
    # Augment the observation space domain with the cost domain
    self.observation_space = Box(low=low, high=high, dtype=np.float32)
    self.action_space = env.action_space

    self.total_rews = []   # To store total episode returns
    self.total_costs = []  # To store total episode costs
    self.t = -1
    self.add_penalty = add_penalty    # add_penalty is beta from the proposal
    self.threshold = threshold        # Threshold value for the cost
    self.mult_penalty = mult_penalty  # If not None, all rewards get multiplied by it once the constraint is violated
    self.cost_penalty = cost_penalty  # cost_penalty is zeta from the proposal

    # Fall back to a safe policy if the constraint is violated
    if safe_policy is not False:
        _, self.safe_policy = load_policy_and_env(safe_policy)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
    else:
        self.safe_policy = False

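# A minimal sketch of a matching step() for this wrapper (not part of the
# original excerpt): it accumulates the per-step cost from safety-gym's info
# dict, appends it to the observation, and applies the penalties once the
# threshold is crossed. `self.current_cost` and `self.last_obs` are names
# introduced here for illustration only, and reset() is assumed to zero
# self.current_cost at the start of each episode.
def step(self, action):
    if self.safe_policy is not False and self.current_cost > self.threshold:
        # Hand control to the pretrained safe policy after a violation
        action = self.safe_policy(torch.tensor(self.last_obs).to(self.device))
    o, r, d, info = self.base_env.step(action)
    self.current_cost += info.get('cost', 0.0)
    r -= self.cost_penalty * info.get('cost', 0.0)  # Per-step shaping (zeta)
    if self.current_cost > self.threshold:
        r -= self.add_penalty                       # Additive penalty (beta)
        if self.mult_penalty is not None:
            r *= self.mult_penalty                  # Scale rewards after violation
    obs = np.concatenate([o, [self.current_cost]])  # Scalar-cost case
    self.last_obs = obs
    return obs, r, d, info
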
# env.plot_entire_trajectory()

if __name__ == '__main__':
    import argparse
    import time

    parser = argparse.ArgumentParser()
    parser.add_argument('--load_path', type=str,
                        help='path to saved model and stuff')
    parser.add_argument('--save_path', type=str,
                        help='path to save demos for later use')
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=1)
    parser.add_argument('--num_rollouts', type=int, default=10,
                        help='how many rollouts to eval over')
    parser.add_argument('--random_rollouts', action='store_true', default=False,
                        help='generate purely random rollouts')
    args = parser.parse_args()

    # Load the pretrained policy and env using built-in Spinning Up functionality
    env, get_action = load_policy_and_env(args.load_path)
    env.seed(args.seed)

    demos = []
    for i_episode in range(args.num_rollouts):
        observation = env.reset()
        states = []
        actions = []
        t = 0
        while True:
            env.render()
            # print(observation)
            states.append(observation)
            action = get_action(observation)
            if args.random_rollouts:
                # Generate completely random actions if the flag is set
                action = env.action_space.sample()
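            # Assumed continuation of the truncated rollout loop: record the
            # transition, end the episode on done, and save the demos to
            # --save_path via pickle (the demo format here is a guess, not
            # the original file's).
            actions.append(action)
            observation, reward, done, info = env.step(action)
            t += 1
            if done:
                demos.append({'states': states, 'actions': actions})
                break

    env.close()
    if args.save_path:
        import pickle
        with open(args.save_path, 'wb') as f:
            pickle.dump(demos, f)
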
if TEST_STATIC_OBSTACLE:
    _, get_action = load_policy(
        '/home/c2/src/spinningup/data/dd2d-ppo-intersection/dd2d-ppo-intersection_s0',
        use_model_only=False)
    env = gym.make('deepdrive-2d-static-obstacle-no-g-pen-v0')
else:
    p = '/home/c2/src/tmp/spinningup/data/deepdrive-2d-intersection-no-constrained-controls-example/deepdrive-2d-intersection-no-constrained-controls-example_s0_2020_03-10_13-14.50/best_HorizonReturn/2020_03-11_11-36.27'
    if ('no-end-g' in p or 'no-contraint-g' in p or 'no-g' in p
            or 'no-constrain' in p):
        os.environ['END_ON_HARMFUL_GS'] = '0'
        os.environ['GFORCE_PENALTY_COEFF'] = '0'
        os.environ['JERK_PENALTY_COEFF'] = '0'
    if 'no-constrain' in p:
        os.environ['CONSTRAIN_CONTROLS'] = '0'
    if ('delta-controls' in p
            or 'deepdrive-2d-intersection-no-g-or-jerk2' in p):
        os.environ['EXPECT_NORMALIZED_ACTION_DELTAS'] = '1'
    else:
        os.environ['EXPECT_NORMALIZED_ACTION_DELTAS'] = '0'
    if 'one-waypoint' in p:
        env_name = 'deepdrive-2d-one-waypoint-v0'
    else:
        env_name = 'deepdrive-2d-intersection-w-gs-allow-decel-v0'
    _, get_action = load_policy_and_env(p, deterministic=True)
    # env = gym.make('deepdrive-2d-intersection-v0')
    env = gym.make(env_name)

env.configure_env()
# env.unwrapped.physics_steps_per_observation = 1
run_policy(env, get_action)

if len(args.safe_policy) > 0:
    safe_policy = args.safe_policy
else:
    safe_policy = False
if len(args.collector_policy) > 0:
    collector_policy = args.collector_policy
else:
    collector_policy = None

run_exp(alg=args.alg, alpha=args.alpha, add_penalty=args.add_penalty,
        mult_penalty=args.mult_penalty, cost_penalty=args.cost_penalty,
        buckets=args.buckets, epochs=args.epochs, start_steps=args.start_steps,
        split_policy=bool(args.split_policy),
        ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
        safe_policy=safe_policy, entropy_constraint=args.entropy_constraint,
        collector_policy=collector_policy, filename=filename, data_aug=False,
        env_name=args.env_name)

# Test the trained policy
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create an animation of the policy
_, get_action = load_policy_and_env("results/" + filename + "policy",
                                    deterministic=True)  # Load policy parameters
frames = []
env = gym.make('Safexp-PointGoal1-v0')
env = constraint_wrapper(env, add_penalty=args.add_penalty,
                         mult_penalty=args.mult_penalty,
                         cost_penalty=args.cost_penalty,
                         buckets=args.buckets, safe_policy=safe_policy)

# Test for 5k steps
for episode in range(5):
    o = env.reset()
    for step in range(1000):
        frames.append(env.render(mode="rgb_array"))
        a = get_action(torch.tensor(o).to(device))
        o, r, d, _ = env.step(a)

if args.alg == "sac":
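    pass  # Excerpt truncated; presumably SAC-specific handling follows here

# The script collects `frames` but the excerpt ends before writing them out.
# A minimal sketch of saving the rendered frames as a gif with imageio;
# the output filename and fps are assumptions, not from the original:
import imageio
imageio.mimsave('results/' + filename + '_policy.gif', frames, fps=30)
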
    J_couplings = set_couplings(Ns_rew, Na, seed)
    env_rew = qenv.RandomTFIM(Ns_rew, J_couplings[:, 0], Nt, rtype, dt, actType,
                              measured_obs=measured_obs, g_target=hfield,
                              noise=noise)
    dirOut = dirO + 'RandomIsing' + 'P' + str(Nt) + '_N' + str(Ns_act) + '_rw' + rtype
    gs_energy = -J_couplings[:, 0].sum()
    Emax = -gs_energy
else:
    raise ValueError(f'Model not implemented: {model}')

dirOut += '/' + measured_obs + '/network' + str(layers[0]) + 'x' + str(layers[1])
print(deterministic_act, plotSValue)
# _, get_action, get_value = load_tf_policy('./' + dirOut, deterministic=deterministic_act, valueFunction=plotSValue)
_, get_action = load_policy_and_env('./' + dirOut, deterministic=deterministic_act)
get_value = None

if actType == 'cont':
    head = '# 1-episode, 2-action-gamma, 3-action-beta, 4-reward, 5-energy'
    data = np.zeros([Na * Nt, 5])
    summary = np.zeros([Na + 1, 5])

for ep in range(Na):
    if model == 'RandomTFIM':
        # J_couplings = set_couplings(Ns_rew, 0)
        env_rew = qenv.RandomTFIM(Ns_rew, J_couplings[:, ep], Nt, rtype, dt,
                                  actType, measured_obs=measured_obs,
                                  g_target=hfield, noise=noise)
        gs_energy = -J_couplings[:, ep].sum()
        Emax = -gs_energy
        print(gs_energy)
    if local_opt:
        f_grad = lambda x: env_rew.get_fullEvo(x, grad=True)
    o_act = env_act.reset()
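    # Sketch of the truncated per-episode evaluation loop, reconstructed from
    # the column header above ('episode, action-gamma, action-beta, reward,
    # energy'). Everything below is an assumption, not the original code; in
    # particular that env_act.step() reports the energy in its info dict.
    for t in range(Nt):
        a = get_action(o_act)
        o_act, r, d, info = env_act.step(a)
        data[ep * Nt + t] = [ep, a[0], a[1], r, info.get('energy', np.nan)]
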
def test_ppo(exp_dir, itr='last'):
    _, get_action, lstm = load_policy_and_env(exp_dir, itr=itr)
    env = hide_and_seek.make_env()
    run_policy(env, get_action, lstm=lstm)

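if __name__ == '__main__':
    # Example call; the experiment directory is a placeholder, not from the source
    test_ppo('data/hide-and-seek-ppo/hide-and-seek-ppo_s0')
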
from spinup.utils.test_policy import load_policy_and_env, run_policy
import gym
import offload_env  # Importing registers the custom 'offload-v0' env with gym

# Replace 'policy_path' with the path to the output policy
# of the train_*.py files:
policy_path = 'VPG_RUNS/vpg-base/vpg-base_s40'
_, get_action = load_policy_and_env(policy_path)
env = gym.make('offload-v0')
run_policy(env, get_action)

from spinup.utils.test_policy import load_policy_and_env, run_policy
import gym
import custompendulumenv  # Registers 'CustomPendulum-v0' with gym
import laserhockey

_, get_action = load_policy_and_env(
    '/Users/julianstastny/Code/rl-course/Hockey-project/spinningup/data/td3-layernorm-all/td3-layernorm-all_s0'
)
env = gym.make('CustomPendulum-v0')
# _, get_action = load_policy_and_env('/path/to/output_directory')
# env = gym.make('CustomPendulum-v0')
run_policy(env, get_action)