import numpy as np


def get_time_to_first_contact(env, policy, is_random=False, num_trajs=100):
    import itertools
    time_contact = []
    if is_random:
        from rllab.policies.uniform_control_policy import UniformControlPolicy
        policy = UniformControlPolicy(env.spec)
    print("Using {}".format(policy))
    for traj_i in range(num_trajs):
        obs = env.reset()
        print("Start traj {}".format(traj_i))
        for t in itertools.count():
            action, _ = policy.get_action(obs)
            obs, reward, done, env_info = env.step(action)
            # Record the first timestep with a contact reward (or episode end).
            if env_info['contact_reward'] > 0 or done:
                time_contact.append(t)
                break
    # plt.hist(time_contact)
    # plt.title("Time to first contact over {} trajectories".format(num_trajs))
    # plt.show()
    data_path = input("Where do you want to save it? \n")
    np.save(data_path, time_contact)
    print("Data saved")
    print("Mean time to first contact: {}, median: {}, std: {} for {}, "
          "({} trajectories)".format(np.mean(time_contact),
                                     np.median(time_contact),
                                     np.std(time_contact), policy, num_trajs))
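# Usage sketch for get_time_to_first_contact: a minimal, hedged example that
# assumes a joblib-pickled rllab snapshot holding 'policy' and 'env' entries
# (the same layout loaded in the rollout script below); the snapshot path
# params_path is hypothetical.
import joblib
import tensorflow as tf

params_path = 'data/params.pkl'  # hypothetical snapshot path
with tf.Session():
    data = joblib.load(params_path)
    env, policy = data['env'], data['policy']
    # Trained policy vs. the uniform-random baseline over the same budget.
    get_time_to_first_contact(env, policy, is_random=False, num_trajs=100)
    get_time_to_first_contact(env, policy, is_random=True, num_trajs=100)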
def __init__(self, env, tensorboard_path, **kwargs):
    exploration_strategy = RandomStrategy(env.spec)
    policy = UniformControlPolicy(env.spec)
    super().__init__(env, policy, exploration_strategy)
    self.summary_writer = tf.summary.FileWriter(
        tensorboard_path, graph=tf.get_default_graph())
    self.summary = None
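# A hedged construction sketch for the constructor above; the class name
# RandomAlgoWithLogging is hypothetical (the real name lives in the enclosing
# file), and the TensorBoard directory is a placeholder.
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize

env = normalize(CartpoleEnv())
algo = RandomAlgoWithLogging(env, tensorboard_path='/tmp/random_algo_logs')
algo.train()  # assuming the parent algorithm class provides train()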
def __init__(self,
             sess,
             env,
             cost_approximator,
             cost_trainer,
             novice_policy,
             novice_policy_optimizer,
             num_frames=4,
             concat_timesteps=True,
             train_disc=True):
    """
    sess : TensorFlow session
    cost_approximator : the cost function (e.g. a neural network) that maps
        observations/states to a reward
    cost_trainer : the trainer that optimizes the cost (i.e. runs TensorFlow
        training ops, etc.)
    novice_policy : the policy of the novice agent
    novice_policy_optimizer : the optimizer that runs a policy optimization
        step (or a constrained number of iterations)

    Much of this follows
    https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    """
    self.sess = sess
    self.env = env
    self.cost_approximator = cost_approximator
    self.cost_trainer = cost_trainer
    self.iteration = 0
    self.novice_policy = novice_policy
    self.novice_policy_optimizer = novice_policy_optimizer
    # self.sampler = BaseSampler(self.novice_policy_optimizer)
    self.concat_timesteps = concat_timesteps
    self.num_frames = num_frames
    self.replay_buffer = {}
    self.max_replays = 3
    self.replay_index = 0
    self.replay_times = 40
    self.should_train_cost = True
    self.prev_reward_dist = None
    self.is_first_disc_update = True
    self.gc_time = time.time()
    self.gc_time_threshold = 60  # seconds between garbage collections

    # As in traditional GANs, we add failure noise via a uniform-random policy.
    self.noise_fail_policy = UniformControlPolicy(env.spec)
    self.train_disc = train_disc
    self.zero_baseline = ZeroBaseline(env_spec=env.spec)
    self.rand_algo = NOP(
        env=env,
        policy=self.noise_fail_policy,
        baseline=self.zero_baseline,
        batch_size=1 * self.env.horizon,
        max_path_length=self.env.horizon,
        n_itr=1,
        discount=0.995,
        step_size=0.01,
    )
    self.rand_algo.start_worker()  # TODO: Call this in constructor instead?
    self.rand_algo.init_opt()
    self.should_do_policy_step = True
    self.should_do_exploration = True
    self.num_steps_since_last_trpo = 0
import numpy as np


def episode_reward(env, policy, is_random=False, num_trajs=100):
    import itertools
    mean_reward = []
    if is_random:
        from rllab.policies.uniform_control_policy import UniformControlPolicy
        policy = UniformControlPolicy(env.spec)
    print("Using {}".format(policy))
    for traj_i in range(num_trajs):
        obs = env.reset()
        print("Start traj {}".format(traj_i))
        rewards = []
        for t in itertools.count():
            action, _ = policy.get_action(obs)
            obs, reward, done, env_info = env.step(action)
            rewards.append(reward)
            if done:
                break
        # Record the total reward collected over this episode.
        mean_reward.append(np.sum(rewards))
    # plt.hist(mean_reward)
    print("Mean episode reward: {} for {}, ({} trajectories)".format(
        np.mean(mean_reward), policy, num_trajs))
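# Usage sketch for episode_reward: a minimal, self-contained run on the
# Box2D cartpole with the uniform-random policy as a stand-in for a trained
# one (the env choice here is an assumption, not fixed by the repo).
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.uniform_control_policy import UniformControlPolicy

env = normalize(CartpoleEnv())
policy = UniformControlPolicy(env.spec)
episode_reward(env, policy, num_trajs=10)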
import numpy as np


def test_state_hist(env):
    policy = UniformControlPolicy(env.spec)
    _states = []
    o = env.reset()
    try:
        while True:
            _states.append(o)
            a, _ = policy.get_action(o)
            next_o, r, d, env_info = env.step(a)
            if d:
                o = env.reset()
            else:
                o = next_o
    except KeyboardInterrupt:
        # On Ctrl-C, dump every visited state for offline analysis.
        states = np.asarray(_states)
        save_path = '/Users/dianchen/state.npy'
        np.save(save_path, states)
        # pickle.dump(states, save_path)
        print("State samples saved to {}".format(save_path))
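# Usage sketch for test_state_hist: run under the random policy until Ctrl-C,
# then inspect the saved state distribution. The cartpole env below is a
# stand-in for whichever env the repo actually profiles.
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize

env = normalize(CartpoleEnv())
test_state_hist(env)  # interrupt with Ctrl-C to dump states to disk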
def random_action_launcher(variant):
    from railrl.algos.noop_algo import NoOpAlgo
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.policies.uniform_control_policy import UniformControlPolicy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env)
    policy = UniformControlPolicy(env_spec=env.spec)
    algorithm = NoOpAlgo(
        env,
        policy,
        es,
        **variant['algo_params']
    )
    algorithm.train()
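# A hedged example of the variant dict random_action_launcher expects; the
# contents of env_params and algo_params are assumptions inferred from how
# get_env_settings and NoOpAlgo are called above, not a documented schema.
variant = dict(
    env_params=dict(env_id='cartpole'),  # hypothetical get_env_settings kwargs
    algo_params=dict(),                  # extra NoOpAlgo kwargs, if any
)
random_action_launcher(variant)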
                      policy, sess)
elif args.test_inverse_loss:
    investigate_inverse_loss(encoder, inverse_model, forward_model, env,
                             policy, sess, img_path=args.data_path,
                             num_trajs=100, animate=args.render)
elif args.test_forward_loss:
    if policy is None:
        # TODO: Remove this hack after CoRL deadline
        from rllab.policies.uniform_control_policy import UniformControlPolicy
        policy = UniformControlPolicy(env.spec)
    investigate_forward_loss(encoder, inverse_model, forward_model, env,
                             policy, sess, data_path=args.data_path,
                             num_trajs=200, animate=args.render, num_top=50)
elif args.plot_forward:
    plot_forward(encoder, inverse_model, forward_model, env, policy, sess)
elif args.time_contact:
    get_time_to_first_contact(env,
if args.seed >= 0:
    set_seed(args.seed)
if args.collection_file:
    all_feasible_starts = pickle.load(open(args.collection_file, 'rb'))
with tf.Session() as sess:
    data = joblib.load(args.file)
    if "algo" in data:
        policy = data["algo"].policy
        env = data["algo"].env
    else:
        policy = data['policy']
        env = data['env']
    if args.random_policy:
        policy = UniformControlPolicy(env_spec=env.spec)
    while True:
        if args.init_state:
            from sandbox.envs.base import FixedStateGenerator
            env.update_start_generator(FixedStateGenerator(args.init_state))
        elif args.collection_file:
            from sandbox.envs.base import UniformListStateGenerator
            init_states = all_feasible_starts.sample(1000)
            env.update_start_generator(
                UniformListStateGenerator(init_states))
        if args.deterministic:
            with policy.set_std_to_0():
                path = rollout(env, policy,
from rllab.algos.nop import NOP
from rllab.baselines.zero_baseline import ZeroBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.uniform_control_policy import UniformControlPolicy

env = normalize(CartpoleEnv())
# UniformControlPolicy samples actions uniformly from the action space,
# so it takes no architecture arguments.
policy = UniformControlPolicy(
    env_spec=env.spec,
)
baseline = ZeroBaseline(env_spec=env.spec)
algo = NOP(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()