def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        return Box(low=space.low, high=space.high)
def to_tf_space(space):
    if isinstance(space, TheanoBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        print("HACK IN sandbox/rocky/envs/base.py")
        return Box(low=space.low, high=space.high)
def __init__(self, env, *args, max_timesteps=None):
    """
    Initialize the environment.

    Args:
        env (Env): gym environment. Must have discrete observation and
            action spaces.
        max_timesteps (int): maximum number of timesteps the environment
            will be run for.
    """
    assert isinstance(env, BanditEnv)
    self.wrapped_env = env
    # Actions are just the same actions as those in the wrapped environment.
    self.nA = env.action_space.n
    self.state_dim = env.n_arms * 2
    self.counts = np.zeros(self.state_dim, dtype=np.int32)
    if max_timesteps is not None:
        self.max_timesteps = max_timesteps
    else:
        max_timesteps = self.max_timesteps = env.horizon
    self.timesteps = 0
    self.Gittins = None
    self.action_space = Discrete(self.nA)
    obs_high = np.full(shape=self.counts.shape, fill_value=max_timesteps)
    self.observation_space = Box(np.zeros_like(self.counts), obs_high)
    self.dV_drhos = {}
    self._seed()
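# Hedged layout sketch (illustrative only, not from the original code): the
# observation is the integer count vector itself, with 2 entries per arm
# (presumably a pull count and a reward/success count per arm), each bounded
# by max_timesteps. For a 2-armed bandit with max_timesteps=100 this gives
# Box(low=[0, 0, 0, 0], high=[100, 100, 100, 100]), i.e. state_dim = 2 * 2 = 4.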
def _create_observation_space(self):
    obs_space = super()._create_observation_space()
    return Box(
        np.hstack(
            (obs_space.low, [-self.BOUNDARY_DIST, -self.BOUNDARY_DIST])),
        np.hstack(
            (obs_space.high, [self.BOUNDARY_DIST, self.BOUNDARY_DIST])),
    )
def gym_to_local():
    """Monkey-patch gym's ReacherEnv with the local ReacherEnv implementation."""
    import gym
    from sandbox.rocky.tf.spaces.box import Box
    import envs.base as base

    gym.envs.mujoco.reacher.ReacherEnv._get_obs = ReacherEnv._get_obs
    gym.envs.mujoco.reacher.ReacherEnv._step = ReacherEnv._step
    gym.envs.mujoco.reacher.ReacherEnv.observation_space = property(
        lambda self: Box(low=ReacherEnv().observation_space.low,
                         high=ReacherEnv().observation_space.high))
    gym.envs.mujoco.reacher.ReacherEnv.reset = ReacherEnv.reset
    gym.envs.mujoco.reacher.ReacherEnv.reset_model = ReacherEnv.reset_model
    gym.envs.mujoco.reacher.ReacherEnv.n_goals = ReacherEnv.n_goals
    gym.envs.mujoco.reacher.ReacherEnv.n_states = ReacherEnv.n_states
    gym.envs.mujoco.reacher.ReacherEnv.cost_np = ReacherEnv.cost_np
    gym.envs.mujoco.reacher.ReacherEnv.cost_tf = ReacherEnv.cost_tf
    gym.envs.mujoco.reacher.ReacherEnv.cost_np_vec = ReacherEnv.cost_np_vec
    base.TfEnv.observation_space = property(
        lambda self: Box(low=ReacherEnv().observation_space.low,
                         high=ReacherEnv().observation_space.high))
def __init__(self,
             env_name,
             record_video=True,
             video_schedule=None,
             log_dir=None,
             record_log=True,
             force_reset=False,
             screen_width=84,
             screen_height=84):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log(
                "Warning: skipping Gym environment monitoring since snapshot_dir not configured."
            )
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)
    if 'Doom' in env_name:
        from ppaquette_gym_doom.wrappers.action_space import ToDiscrete
        wrapper = ToDiscrete('minimal')
        env = wrapper(env)

    self.env = env
    self.env_id = env.spec.id

    monitor_manager.logger.setLevel(logging.WARNING)

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env,
                                        log_dir,
                                        video_callable=video_schedule,
                                        force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    self._action_space = convert_gym_space(env.action_space)
    self._horizon = env.spec.timestep_limit
    self._log_dir = log_dir
    self._force_reset = force_reset
    self.screen_width = screen_width
    self.screen_height = screen_height
    # Override the converted observation space with a fixed-size,
    # single-channel image space (width x height x 1).
    self._observation_space = Box(low=0, high=1,
                                  shape=(screen_width, screen_height, 1))
def to_tf_space(space):
    if isinstance(space, TheanoBox) or isinstance(space, gymBox):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, TheanoDiscrete):
        return Discrete(space.n)
    elif isinstance(space, TheanoProduct):
        return Product(list(map(to_tf_space, space.components)))
    else:
        import ipdb
        ipdb.set_trace()
        raise NotImplementedError
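# Hedged usage sketch (not part of the original module): assuming the import
# aliases this helper relies on (Theano-side spaces as TheanoBox /
# TheanoDiscrete / TheanoProduct, TF-side spaces as Box / Discrete / Product),
# a nested product space is converted component by component.
if __name__ == "__main__":
    theano_space = TheanoProduct([
        TheanoDiscrete(4),
        TheanoBox(low=-1.0, high=1.0, shape=(3,)),
    ])
    tf_space = to_tf_space(theano_space)
    # The result mirrors the input structure on the TF side.
    assert isinstance(tf_space, Product)
    assert isinstance(tf_space.components[0], Discrete)
    assert isinstance(tf_space.components[1], Box)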
def __init__(
        self,
        horizon=200,
        l2_action_penalty_weight=1e-2,
        num_steps=None,
        include_velocity=False,
        use_small_maze=False,
        num_steps_until_reset=5,
):
    self.init_serialization(locals())
    if use_small_maze:
        self.TARGET_RADIUS = 0.04
        self.BOUNDARY_RADIUS = 0.02
        self.BOUNDARY_DIST = 0.12
        self.BALL_RADIUS = 0.01
        super().__init__('small_water_maze.xml')
    else:
        self.TARGET_RADIUS = 0.1
        self.BOUNDARY_RADIUS = 0.02
        self.BOUNDARY_DIST = 0.3
        self.BALL_RADIUS = 0.02
        super().__init__('water_maze.xml')
    self.BALL_START_DIST = (self.BOUNDARY_DIST - self.BOUNDARY_RADIUS
                            - 2 * self.BALL_RADIUS)
    self.MAX_GOAL_DIST = self.BOUNDARY_DIST - self.BOUNDARY_RADIUS
    self.l2_action_penalty_weight = l2_action_penalty_weight
    if num_steps is not None:  # support backwards compatibility
        horizon = num_steps
    self._horizon = horizon
    self._t = 0
    self._on_platform_history = deque(maxlen=5)
    self.num_steps_until_reset = num_steps_until_reset
    self.teleport_after_a_while = self.num_steps_until_reset > 0
    if self.teleport_after_a_while:
        for _ in range(self.num_steps_until_reset):
            self._on_platform_history.append(False)
    self.include_velocity = include_velocity
    self.action_space = Box(np.array([-1, -1]), np.array([1, 1]))
    self.observation_space = self._create_observation_space()
    self.reset_model()
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False,
                     record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contextual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
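# Hedged shape sketch (illustrative numbers, not from the original script): the
# context encoder treats a whole trajectory as a single flat observation. Per
# step it sees the environment observation minus the trailing latent_dim
# entries, concatenated with the action, and that per-step block is tiled
# max_path_length times:
#
#     obs_dim, act_dim = 5, 2                        # hypothetical env dims
#     per_step = (obs_dim - latent_dim) + act_dim    # 5 - 3 + 2 = 4
#     flat_traj_dim = per_step * max_path_length     # 4 * 100 = 400
#
# so context_encoder_spec.observation_space is the flat_traj_dim-dimensional
# trajectory vector, while its "action" space is the latent_dim-dimensional
# context m in [0, 1].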
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 16
    meta_batch_size = 1
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))
    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)

    irl_itr_list = [2800]
    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')
        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr - 800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             reward_arch=reward_arch,
                             reward_arch_args=reward_arch_args,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.mkdir(savedir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            irl_model.context_encoder.set_param_values(
                init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)

            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            # num_experts x T x (state_dim + act_dim)
            expert_trajs = np.concatenate((expert_obs, expert_acts), axis=-1)

            grid_size = 0.005
            rescale = 1. / grid_size

            for itr in range(100):
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs,
                    expert_contexts,
                    batch_size=1,
                    warm_up=False,
                    warm_up_idx=False)
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high,
                                           grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape(
                    [1, -1, max_path_length, 3])
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch,
                               [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]
                ax = sns.heatmap(score.reshape([num_x, num_y]),
                                 cmap="YlGnBu_r")
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*',
                           s=150,
                           c='r',
                           edgecolors='k',
                           linewidths=0.5)
                ax.scatter((0.3 - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           (0. - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           marker='o',
                           s=120,
                           c='white',
                           linewidths=0.5,
                           edgecolors='k')
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k',
                        linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
def observation_space(self):
    # 2 embeddings (query and current page) plus the embeddings of articles
    # on the beam
    return Box(low=-5, high=5,
               shape=(2 + self.beam_size, self.embedding_dim))
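# Hedged layout sketch (illustrative, not from the original code): the
# observation is a (2 + beam_size) x embedding_dim matrix bounded in [-5, 5];
# presumably row 0 holds the query embedding, row 1 the current-page
# embedding, and the remaining beam_size rows hold one embedding per article
# on the beam. With beam_size=8 and embedding_dim=300, for example, the flat
# observation has (2 + 8) * 300 = 3000 entries.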
def __init__(self,
             env_name,
             register_info=None,
             record_video=True,
             video_schedule=None,
             log_dir=None,
             record_log=True,
             force_reset=True,
             screen_width=84,
             screen_height=84,
             frame_skip=1,
             doom_actionspace='Box',
             conv=True,
             client_port=10000,
             transpose_output=False,
             stack_frames=False,
             stack_size=4):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    if 'Doom' in env_name:
        import ex2.envs.doom
    if 'Minecraft' in env_name:
        import axe.envs.minecraft
    if register_info:
        try:
            gym.envs.register(**register_info)
        except gym.error.Error:
            traceback.print_exc()

    env = gym.envs.make(env_name)
    if 'Doom' in env_name:
        from ex2.envs.doom.wrappers import SetResolution
        from ex2.envs.doom.wrappers.action_space import ToDiscrete, ToBox
        if doom_actionspace == 'Box':
            wrapper1 = ToBox('minimal')
        else:
            wrapper1 = ToDiscrete('minimal')
        # lock = multiprocessing.Lock()
        # env.configure(lock=lock)
        wrapper2 = SetResolution('160x120')
        env = wrapper2(wrapper1(env))
    if 'Minecraft' in env_name:
        env.init(videoResolution=[screen_width, screen_height],
                 allowContinuousMovement=["move", "turn"],
                 continuous_discrete=False,
                 vision=False,
                 client_pool=[('127.0.0.1', client_port)])

    self.env = env
    self.env_id = env.spec.id
    self.env_name = env_name
    self.frame_skip = frame_skip
    self.stack_frames = stack_frames
    if stack_frames:
        self.channel_size = stack_size
    else:
        self.channel_size = 3

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env,
                                        log_dir,
                                        video_callable=video_schedule,
                                        force=True)
        self.monitoring = True

    self._action_space = convert_gym_space(env.action_space)
    self._horizon = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    self._log_dir = log_dir
    self._force_reset = force_reset
    self.screen_width = screen_width
    self.screen_height = screen_height
    self.conv = conv
    self.transpose_output = transpose_output
    if conv:
        if self.transpose_output:
            self._observation_space = Box(
                low=0, high=1,
                shape=(self.channel_size, screen_width, screen_height))
            # self._observation_space = Box(low=0, high=1, shape=(3 * screen_width * screen_height))
        else:
            self._observation_space = Box(
                low=0, high=1,
                shape=(screen_width, screen_height, self.channel_size))
    else:
        self._observation_space = Box(low=0, high=1,
                                      shape=(self.channel_size,))
    self.last_info = None
    self.last_obs = []
def observation_space(self):
    return Box(low=0, high=1, shape=(len(self.vocab),))
    # return DiscreteBinaryBag(len(self.vocab))
import samplers.lowlevel.rarl_parallel_sampler as parallel_sampler

parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

# env = normalize(MultilaneEnv(), 1, True, True, 0.001, 0.001)
# env = normalize(MultilaneEnv())
env = TfEnv(JustEgoEnv(port=9427))

obs1_dim = 4
obs2_dim = 4
action1_dim = 2
action2_dim = 2

spec1 = EnvSpec(
    observation_space=Box(low=-np.ones(4), high=np.ones(4)),
    action_space=Box(low=-np.ones(2), high=np.ones(2)),
)
spec2 = EnvSpec(
    observation_space=Box(low=-np.ones(4), high=np.ones(4)),
    action_space=Box(low=-np.ones(2), high=np.ones(2)),
)

with tf.Session() as sess:
    policy1 = GaussianMLPPolicy(
        env_spec=spec1,
        name="RARLTFPolicy1",
        learn_std=True,
        init_std=0.1,
        output_nonlinearity=None,
        hidden_nonlinearity=tf.nn.relu,
    )
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'
    if_filtered = True

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeRight-v0',
                         record_video=False,
                         record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)

    if if_filtered:
        experts_filtered = []
        good_range = [0.1, 0.4]  # [0.3, 0.5]
        for expert in experts:
            if good_range[0] <= expert['contexts'][0, 0] <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        # Trim to a multiple of meta_batch_size. The original slice
        # experts_filtered[:-(len(experts_filtered) % meta_batch_size)]
        # would empty the list when the remainder is 0, so only trim when
        # there is a remainder.
        remainder = len(experts_filtered) % meta_batch_size
        if remainder > 0:
            experts_filtered = experts_filtered[:-remainder]
        experts = experts_filtered

    irl_itr_list = [2800]
    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')
        policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )

        dirname = (
            'data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s'
            % (entropy_weight, irl_itr, left,
               'filter' if if_filtered else '', exp_name))
        with rllab_logdir(algo=algo, dirname=dirname):
            with tf.Session():
                algo.train()
        results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()
    print(results)
def reset(self):
    self.state = np.ones(self.action_space.flat_dim) * self.mu

def evolve_state(self):
    x = self.state
    dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x))
    self.state = x + dx
    return self.state

@overrides
def get_action(self, t, observation, policy, **kwargs):
    action, _ = policy.get_action(observation)
    ou_state = self.evolve_state()
    return np.clip(action + ou_state, self.action_space.low,
                   self.action_space.high)


if __name__ == "__main__":
    ou = OUStrategy(
        env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1,))),
        mu=0,
        theta=0.15,
        sigma=0.3)
    states = []
    for i in range(1000):
        states.append(ou.evolve_state()[0])
    import matplotlib.pyplot as plt
    plt.plot(states)
    plt.show()
def observation_space(self):
    return Box(low=0, high=1, shape=(len(self.chars),))
def _create_observation_space(self):
    num_obs = 4 if self.include_velocity else 2
    return Box(
        np.hstack((-np.inf + np.zeros(num_obs), [0])),
        np.hstack((np.inf + np.zeros(num_obs), [1])),
    )