Example #1
def run(train_fn, env_config, net_config=None):
    mode = '--train' if not sys.argv[1:] else sys.argv[1]
    if mode == '--train':
        train_fn()
    elif mode == '--test':
        if not sys.argv[2:]:
            model_path = get_latest_model_path()  # TODO
        else:
            model_path = sys.argv[2]
        env = gym.make(env_config['env_name'])
        _, get_action = load_policy_and_env(fpath=model_path, deterministic=True,
                                            env=env, net_config=net_config)
        run_policy(env, get_action)
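A minimal sketch of how the run() dispatcher above might be driven from a script's entry point. The environment name, network sizes, output directory, and the choice of Spinning Up's ppo_pytorch as the training backend are illustrative assumptions, not taken from the original project; the sketch also assumes the customized load_policy_and_env and run_policy helpers that run() relies on are importable in the same module.

import gym
from spinup import ppo_pytorch as ppo

env_config = dict(env_name='CartPole-v1')    # illustrative environment
net_config = dict(hidden_sizes=(64, 64))     # illustrative network shape

def train():
    # Any callable that trains and saves a policy can serve as train_fn;
    # here Spinning Up's PPO writes checkpoints under data/example-run.
    ppo(env_fn=lambda: gym.make(env_config['env_name']),
        ac_kwargs=net_config,
        logger_kwargs=dict(output_dir='data/example-run', exp_name='example-run'))

if __name__ == '__main__':
    # `python main.py --train` trains; `python main.py --test <model_path>` replays a checkpoint.
    run(train, env_config, net_config=net_config)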
Example #2
def run(train_fn, env_config, net_config=None):
    mode = '--train' if not sys.argv[1:] else sys.argv[1]
    if mode == '--train':
        train_fn()
    elif mode == '--play':
        from deepdrive_zero import player
        env_config['physics_steps_per_observation'] = 1
        player.start(env_config=env_config)
    elif mode == '--test':
        if not sys.argv[2:]:
            model_path = get_latest_model_path()  # TODO
        else:
            model_path = sys.argv[2]
        env = gym.make(env_config['env_name'])
        env.configure_env(env_config)
        _, get_action = load_policy_and_env(fpath=model_path, deterministic=True,
                                            env=env, net_config=net_config)
        run_policy(env, get_action)
Example #3
 def __init__(self,
              env,
              add_penalty=10,
              threshold=25,
              mult_penalty=None,
              cost_penalty=0,
              buckets=None,
              safe_policy=False):
     self.base_env = env  # Use safety-gym environment as the base env
     self.buckets = buckets  # no. of buckets for discretization
     # Adding cost dimension to observation space
     if self.buckets is None:  # If scalar cost
         low = np.concatenate([env.observation_space.low, np.array([0])])
         high = np.concatenate(
             [env.observation_space.high,
              np.array([np.inf])])
     else:  # If discretized cost
         low = np.concatenate([
             env.observation_space.low,
             np.array([0 for i in range(self.buckets)])
         ])
         high = np.concatenate([
             env.observation_space.high,
             np.array([np.inf for i in range(self.buckets)])
         ])
     self.observation_space = Box(
         low=low, high=high, dtype=np.float32
     )  # Augment observation space domain with cost domain
     self.action_space = env.action_space
     self.total_rews = []  # To store total episode returns
     self.total_costs = []  # To store total episode costs
     self.t = -1
     self.add_penalty = add_penalty  # add_penalty is Beta from the proposal.
     self.threshold = threshold  # threshold value for cost
     self.mult_penalty = mult_penalty  # If mult_penalty is not None, all rewards get multiplied by it once the constraint is violated.
     self.cost_penalty = cost_penalty  # cost_penalty is equal to zeta from the proposal.
     # use a safe_policy if the constraint is violated
     if safe_policy is not False:
         _, self.safe_policy = load_policy_and_env(safe_policy)
         self.device = torch.device(
             'cuda' if torch.cuda.is_available() else 'cpu')
     else:
         self.safe_policy = False
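A short usage sketch for the wrapper whose constructor is shown above, assuming it belongs to the constraint_wrapper class that Example #6 below instantiates; the penalty values are illustrative, and importing safety_gym is assumed to register the Safexp-* environments.

import gym
import safety_gym  # noqa: F401 -- importing registers the Safexp-* environments

base_env = gym.make('Safexp-PointGoal1-v0')
env = constraint_wrapper(base_env, add_penalty=10, threshold=25,
                         mult_penalty=0.5, cost_penalty=0, buckets=None)
# With buckets=None the wrapper appends a single scalar cost entry,
# so the augmented observation has one more dimension than the base space.
print(base_env.observation_space.shape, '->', env.observation_space.shape)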
Example #4
    #env.plot_entire_trajectory()
if __name__ == '__main__':
    import argparse
    import time
    parser = argparse.ArgumentParser()
    parser.add_argument('--load_path', type=str, help="path to saved model and stuff")
    parser.add_argument('--save_path', type=str, help="path to save demos for later use")
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=1)
    parser.add_argument('--num_rollouts', type=int, default=10, help='how many rollouts to evaluate over')
    parser.add_argument('--random_rollouts', action='store_true', default=False, help='generate purely random rollouts')
    args = parser.parse_args()

    # load pretrained policy and env using built-in Spinning Up functionality
    env, get_action = load_policy_and_env(args.load_path)
    env.seed(args.seed)
    demos = []
    for i_episode in range(args.num_rollouts):
        observation = env.reset()
        states = []
        actions = []
        t = 0
        while True:
            env.render()
            # print(observation)
            # action = env.action_space.sample()
            states.append(observation)
            action = get_action(observation)
            if args.random_rollouts:
                #generate completely random actions if flag is set
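A self-contained sketch of how a demo-collection loop like the one above might be completed and the demos written to disk; the helper name, dictionary layout, and pickle format are illustrative assumptions rather than the original code.

import pickle

def collect_demos(env, get_action, num_rollouts, random_rollouts=False, save_path=None):
    demos = []
    for i_episode in range(num_rollouts):
        observation = env.reset()
        states, actions, done = [], [], False
        while not done:
            env.render()
            states.append(observation)
            # Fall back to uniformly random actions when requested.
            action = env.action_space.sample() if random_rollouts else get_action(observation)
            actions.append(action)
            observation, reward, done, _ = env.step(action)
        demos.append({'states': states, 'actions': actions})
    if save_path is not None:
        with open(save_path, 'wb') as f:
            pickle.dump(demos, f)
    return demos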
Example #5
if TEST_STATIC_OBSTACLE:
    _, get_action = load_policy(
        '/home/c2/src/spinningup/data/dd2d-ppo-intersection/dd2d-ppo-intersection_s0',
        use_model_only=False)

    env = gym.make('deepdrive-2d-static-obstacle-no-g-pen-v0')
else:
    p = '/home/c2/src/tmp/spinningup/data/deepdrive-2d-intersection-no-constrained-controls-example/deepdrive-2d-intersection-no-constrained-controls-example_s0_2020_03-10_13-14.50/best_HorizonReturn/2020_03-11_11-36.27'
    if 'no-end-g' in p or 'no-contraint-g' in p or 'no-g' in p or 'no-constrain' in p:
        os.environ['END_ON_HARMFUL_GS'] = '0'
        os.environ['GFORCE_PENALTY_COEFF'] = '0'
        os.environ['JERK_PENALTY_COEFF'] = '0'
    if 'no-constrain' in p:
        os.environ['CONSTRAIN_CONTROLS'] = '0'
    if 'delta-controls' in p or 'deepdrive-2d-intersection-no-g-or-jerk2' in p:
        os.environ['EXPECT_NORMALIZED_ACTION_DELTAS'] = '1'
    else:
        os.environ['EXPECT_NORMALIZED_ACTION_DELTAS'] = '0'

    if 'one-waypoint' in p:
        env_name = 'deepdrive-2d-one-waypoint-v0'
    else:
        env_name = 'deepdrive-2d-intersection-w-gs-allow-decel-v0'
    _, get_action = load_policy_and_env(p, deterministic=True)
    # env = gym.make('deepdrive-2d-intersection-v0')
    env = gym.make(env_name)
    env.configure_env()

# env.unwrapped.physics_steps_per_observation = 1
run_policy(env, get_action)
Example #6
        safe_policy=args.safe_policy
    else:
        safe_policy = False
    if len(args.collector_policy)>0:
        collector_policy=args.collector_policy
    else:
        collector_policy = None
    run_exp(alg=args.alg, alpha=args.alpha, add_penalty=args.add_penalty,
            mult_penalty=args.mult_penalty, cost_penalty=args.cost_penalty, buckets=args.buckets,
            epochs=args.epochs, start_steps=args.start_steps, split_policy=bool(args.split_policy),
            ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), safe_policy=safe_policy,
            entropy_constraint=args.entropy_constraint, collector_policy=collector_policy,
            filename=filename, data_aug=False, env_name=args.env_name)
    # Test the trained policy
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Device used to evaluate the policy
    _, get_action = load_policy_and_env("results/" + filename + "policy", deterministic=True) # Load policy parameters
    frames = []

    # Create an animation of the policy
    env = gym.make('Safexp-PointGoal1-v0')
    env = constraint_wrapper(env, add_penalty=args.add_penalty, mult_penalty=args.mult_penalty,
                             cost_penalty=args.cost_penalty, buckets=args.buckets,
                             safe_policy=safe_policy)
    # Test for 5k steps
    for ep in range(5):
        o = env.reset()
        for t in range(1000):
            frames.append(env.render(mode="rgb_array"))
            a = get_action(torch.tensor(o).to(device))
            o, r, d, _ = env.step(a)
    if args.alg == "sac":
Example #7
        J_couplings = set_couplings(Ns_rew,Na, seed)
        env_rew= qenv.RandomTFIM(Ns_rew,J_couplings[:,0],Nt,rtype,dt,actType,measured_obs=measured_obs, g_target=hfield ,noise=noise)
        dirOut=dirO+'RandomIsing'+"P"+str(Nt)+'_N'+str(Ns_act)+'_rw'+rtype
        gs_energy = -J_couplings[:,0].sum()
        Emax=-gs_energy
    else:
        raise ValueError(f'Model not implemented: {model}')

    dirOut += '/'+measured_obs+'/network'+str(layers[0])+'x'+str(layers[1])
    print(deterministic_act, plotSValue)
    #_, get_action, get_value = load_tf_policy('./'+dirOut,deterministic=deterministic_act, valueFunction=plotSValue)
    _, get_action = load_policy_and_env('./'+dirOut,deterministic=deterministic_act)
    get_value = None

    if actType=='cont':
        head='# 1-episode,  2-action-gamma, 3-action-beta, 4-reward, 5-energy'
        data=np.zeros([Na*Nt,5])
        summary=np.zeros([Na+1,5])
        for ep in range(Na):
            if model == 'RandomTFIM' :
                #J_couplings = set_couplings(Ns_rew, 0)
                env_rew = qenv.RandomTFIM(Ns_rew,J_couplings[:,ep],Nt,rtype,dt,actType,measured_obs=measured_obs, g_target=hfield ,noise=noise)
                gs_energy = -J_couplings[:,ep].sum()
                Emax=-gs_energy
            print(gs_energy)
            if local_opt: f_grad = lambda x : env_rew.get_fullEvo(x, grad=True)
            o_act = env_act.reset()
Example #8
def test_ppo(exp_dir, itr='last'):
    _, get_action, lstm = load_policy_and_env(exp_dir, itr=itr)
    env = hide_and_seek.make_env()
    run_policy(env, get_action, lstm=lstm)
Example #9
from spinup.utils.test_policy import load_policy_and_env, run_policy
import gym
import offload_env

# Replace 'policy_path' with file path to the
# output policy of the train_*.py files:
policy_path = 'VPG_RUNS/vpg-base/vpg-base_s40'
_, get_action = load_policy_and_env(policy_path)

env = gym.make('offload-v0')
run_policy(env, get_action)
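A variant of the snippet above that pins a particular checkpoint and limits the rollout, assuming the stock Spinning Up signatures load_policy_and_env(fpath, itr='last', deterministic=False) and run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True); the iteration number and episode count are illustrative.

from spinup.utils.test_policy import load_policy_and_env, run_policy
import gym
import offload_env

policy_path = 'VPG_RUNS/vpg-base/vpg-base_s40'
# itr selects a specific saved iteration instead of the most recent checkpoint;
# deterministic=True uses the mean action for stochastic policies such as SAC's.
_, get_action = load_policy_and_env(policy_path, itr=40, deterministic=True)

env = gym.make('offload-v0')
run_policy(env, get_action, num_episodes=5)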
Example #10
from spinup.utils.test_policy import load_policy_and_env, run_policy
import gym
import custompendulumenv
import laserhockey

_, get_action = load_policy_and_env(
    '/Users/julianstastny/Code/rl-course/Hockey-project/spinningup/data/td3-layernorm-all/td3-layernorm-all_s0'
)
env = gym.make('CustomPendulum-v0')

#_, get_action = load_policy_and_env('/path/to/output_directory')
#env = gym.make('CustomPendulum-v0')

run_policy(env, get_action)