Example #1
    def __init__(self,
                 goal_reward=10,
                 actuation_cost_coeff=30,
                 distance_cost_coeff=1,
                 init_sigma=0.1):

        self.dynamics = PointDynamics(dim=2, sigma=0)
        self.init_mu = np.zeros(2, dtype=np.float32)
        self.init_sigma = init_sigma
        self.goal_positions = np.array([[5, 0], [-5, 0], [0, 5], [0, -5]],
                                       dtype=np.float32)
        self.goal_threshold = 1.
        self.goal_reward = goal_reward
        self.action_cost_coeff = actuation_cost_coeff
        self.distance_cost_coeff = distance_cost_coeff
        self.xlim = (-7, 7)
        self.ylim = (-7, 7)
        self.vel_bound = 1.
        self.reset()
        self.observation = None

        self.reward_range = (-float('inf'), float('inf'))
        self.metadata = {'render.modes': []}
        self.spec = None

        self._ax = None
        self._env_lines = []
        self.fixed_plots = None
        self.dynamic_plots = []

        super().__init__()
        Serializable.quick_init(self, locals())
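The common thread in all of these snippets is the call to Serializable.quick_init(self, locals()) inside __init__: it records the constructor arguments so the object can later be cloned or re-created from them. Below is a minimal, self-contained stand-in (not the real rllab/garage implementation; class names are illustrative only) that sketches the idea.

class MiniSerializable:
    """Illustrative stand-in for rllab/garage Serializable (not the real implementation)."""

    def quick_init(self, locals_):
        # Record every constructor argument except 'self', so the object can be
        # re-instantiated later with exactly the same arguments.
        self.__args = {k: v for k, v in locals_.items()
                       if k not in ('self', '__class__')}

    def clone(self, **kwargs):
        # Re-instantiate with the recorded arguments, optionally overriding some of them.
        return type(self)(**dict(self.__args, **kwargs))


class ToyEnv(MiniSerializable):
    def __init__(self, goal_reward=10, init_sigma=0.1):
        self.quick_init(locals())
        self.goal_reward = goal_reward
        self.init_sigma = init_sigma


env = ToyEnv(goal_reward=5)
env2 = env.clone(init_sigma=0.5)          # new instance: goal_reward=5 carried over
print(env2.goal_reward, env2.init_sigma)  # -> 5 0.5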
Example #2
    def __init__(self,
                 env,
                 policy,
                 backup_policy,
                 mix_policy,
                 pos_eps_policy,
                 neg_eps_policy,
                 baseline,
                 minibatch_size=500,
                 n_sub_itr=10,
                 optimizer=None,
                 optimizer_args=None,
                 delta=0.01,
                 **kwargs):
        Serializable.quick_init(self, locals())
        self.optimizer = optimizer
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            self.optimizer = CGOptimizer(**optimizer_args)

        self.opt_info = None
        self.backup_policy = backup_policy
        self.mix_policy = mix_policy
        self.pos_eps_policy = pos_eps_policy
        self.neg_eps_policy = neg_eps_policy
        self.minibatch_size = minibatch_size
        self.n_sub_itr = n_sub_itr
        self.delta = delta
        super(CATRPO, self).__init__(env=env,
                                     policy=policy,
                                     baseline=baseline,
                                     **kwargs)
Example #3
    def __init__(self, env_spec, obs_pl, action, scope_name=None):
        Serializable.quick_init(self, locals())

        self._obs_pl = obs_pl
        self._action = action
        self._scope_name = (tf.get_variable_scope().name
                            if not scope_name else scope_name)
        super(NNPolicy, self).__init__(env_spec)
Example #4
    def __init__(self, inputs, name, hidden_layer_sizes):
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        self._name = name
        self._inputs = inputs
        self._layer_sizes = list(hidden_layer_sizes) + [1]

        self._output = self._output_for(self._inputs)
Example #5
    def __init__(self, inputs, name, hidden_layer_sizes):
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        self._name = name
        self._inputs = inputs
        self._layer_sizes = list(hidden_layer_sizes) + [1]

        self._output = self._output_for(self._inputs)
Example #6
    def __init__(self,
                 radar_range=2,
                 radar_resolution=1,
                 discretized=True,
                 use_maps='all',
                 states_cache=None):
        """
        :param radar_range: how many measurements the 'radar' takes towards each of the 4 sides (and their combinations)
        :param radar_resolution: distance between two consecutive measurements of the agent's 'radar'
        :param discretized: whether to discretize actions from the intervals {<-1,-0.33>, <-0.33,0.33>, <0.33,1>} to [-1, 0, 1]
        :param use_maps: which maps to use, a list of indexes or 'all'
        :param states_cache: pre-populated cache to use (observation -> set of states)
        """
        Serializable.quick_init(self, locals())

        self.radar_range = radar_range
        self.radar_resolution = radar_resolution
        self.discretized = discretized
        if states_cache is None:
            self.states_cache = dict()
        else:
            self.states_cache = states_cache
        self.agent_width = 2.4 / np.pi
        self.max_action_distance = 0.2
        self.do_render_init = True
        self.render_prev_pos = np.zeros(2)
        self.do_caching = True

        self.current_map_idx = None
        self.agent_pos = None
        self.agent_ori = None

        # Maps initialization
        if use_maps == 'all':
            raw_maps = self.all_maps
        else:
            # noinspection PyTypeChecker
            raw_maps = [self.all_maps[i] for i in use_maps]
        self.maps = []
        self.bit_maps = []
        for i in range(len(raw_maps)):
            # Normalize char map
            m = np.array([list(row.upper()) for row in raw_maps[i]])
            m[np.logical_or(m == '.', m == ' ')] = 'F'
            m[np.logical_or(m == 'X', m == '#')] = 'W'
            m[m == 'O'] = 'H'
            self.maps.append(m)
            # Make bit map
            bm = np.zeros(m.shape)
            bm[np.logical_or(m == 'W', m == 'H')] = 1
            self.bit_maps.append(bm)
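For clarity, here is what the map-normalization loop above does to a tiny hand-written map (the raw map below is made up for illustration): free cells ('.' or ' ') become 'F', walls ('X' or '#') become 'W', holes ('O') become 'H', and the bit map marks everything that blocks the agent.

import numpy as np

raw_map = ["# .",
           "xo#"]          # hypothetical 2x3 map; rows must have equal length

m = np.array([list(row.upper()) for row in raw_map])
m[np.logical_or(m == '.', m == ' ')] = 'F'
m[np.logical_or(m == 'X', m == '#')] = 'W'
m[m == 'O'] = 'H'
print(m)
# [['W' 'F' 'F']
#  ['W' 'H' 'W']]

bm = np.zeros(m.shape)
bm[np.logical_or(m == 'W', m == 'H')] = 1
print(bm)
# [[1. 0. 0.]
#  [1. 1. 1.]]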
Example #7
    def __init__(self,
                 env_name,
                 record_video=False,
                 video_schedule=None,
                 log_dir=None,
                 record_log=False,
                 force_reset=True):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log(
                    "Warning: skipping Gym environment monitoring since snapshot_dir not configured."
                )
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)

        # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
        # the time limit specified for each environment has been passed and
        # therefore the environment is not Markovian (terminal condition depends
        # on time rather than state).
        env = env.env

        self.env = env
        self.env_id = env.spec.id

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env,
                                            log_dir,
                                            video_callable=video_schedule,
                                            force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        logger.log("observation space: {}".format(self._observation_space))
        self._action_space = convert_gym_space(env.action_space)
        logger.log("action space: {}".format(self._action_space))
        self._horizon = env.spec.tags[
            'wrapper_config.TimeLimit.max_episode_steps']
        self._log_dir = log_dir
        self._force_reset = force_reset
Example #8
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes=(100, 100),
                 name='value_function'):
        Serializable.quick_init(self, locals())

        self._Do = flat_dim(env_spec.observation_space)
        self._observations_ph = tf.placeholder(
            tf.float32, shape=[None, self._Do], name='observations')

        super(NNVFunction, self).__init__(
            inputs=(self._observations_ph,),
            name=name,
            hidden_layer_sizes=hidden_layer_sizes)
Example #9
    def __init__(self, env_spec, q_functions):
        Serializable.quick_init(self, locals())

        self.q_functions = q_functions

        self._Da = flat_dim(env_spec.action_space)
        self._Do = flat_dim(env_spec.observation_space)

        self._observations_ph = tf.placeholder(
            tf.float32, shape=[None, self._Do], name='observations')
        self._actions_ph = tf.placeholder(
            tf.float32, shape=[None, self._Da], name='actions')

        self._output = self.output_for(
            self._observations_ph, self._actions_ph, reuse=True)
Example #10
    def __init__(self, env, num_orig_skills, subpath_infos=None):
        """
        Creates a top-level environment for an HRL agent. The original env's actions are replaced by N discrete
        actions, N being the number of skills.
        :param env: AsaEnv environment to wrap
        :param num_orig_skills: number of pre-trained skills that will be prepared in the HRL policy
        :param subpath_infos: 'all', or a list of subpath information to keep; defaults to ['env_infos']
        """
        Serializable.quick_init(self, locals())
        super().__init__(env)
        self._num_orig_skills = num_orig_skills
        self.action_space = Discrete(self._num_orig_skills)
        self.hrl_policy = None
        if subpath_infos is None:
            subpath_infos = ['env_infos']
        self.subpath_infos = subpath_infos
Example #11
    def __init__(self, env, start_obss, end_obss):
        """
        Creates an environment tailored to train a single (missing) skill. Trajectories are initialized in a
        start_obss state and terminated (with a reward generated) upon reaching an end_obss state.
        :param env: AsaEnv environment to wrap. The environment is cloned to preserve the integrity of the original env.
        :param start_obss: Tensor of experienced starting observations (where the skill should initiate)
        :param end_obss: Tensor of experienced ending observations (where the skill should terminate)
        """
        Serializable.quick_init(self, locals())
        Wrapper.__init__(self, AsaEnv.clone_wrapped(
            env))  # this clones the base env along with all its wrappers
        if start_obss.shape != end_obss.shape:
            raise ValueError(
                'start_obss ({}) and end_obss ({}) must be of same shape'.
                format(start_obss.shape, end_obss.shape))
        self._end_obss = end_obss.reshape((end_obss.shape[0], -1))
        self._start_obss = start_obss.reshape((start_obss.shape[0], -1))
        self.current_obs_idx = None
Example #12
    def __init__(self, goal=(0, -1), arm_distance_coeff=0):
        """
        goal (`list`): List of two elements denoting the x and y coordinates of
            the goal location. Either of the coordinates can also be the string
            'any' to make the reward independent of the corresponding
            coordinate.
        arm_distance_coeff ('float'): Coefficient for the arm-to-object distance
            cost.
        """
        super(PusherEnv, self).__init__(file_path=self.FILE_PATH)
        Serializable.quick_init(self, locals())

        self._goal_mask = [coordinate != 'any' for coordinate in goal]
        self._goal = np.array(goal)[self._goal_mask].astype(np.float32)

        self._arm_distance_coeff = arm_distance_coeff
        self._action_cost_coeff = 0.1

        # Make the complete robot visible when visualizing.
        self.model.stat.extent = 10
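To make the 'any' masking concrete, the following standalone snippet (using a made-up goal value) reproduces the two lines that build _goal_mask and _goal:

import numpy as np

goal = ('any', -1)                       # hypothetical goal: ignore the x coordinate
goal_mask = [coordinate != 'any' for coordinate in goal]
print(goal_mask)                         # [False, True]

# np.array(goal) is a string array here, so the kept entries are cast back to float
goal_arr = np.array(goal)[goal_mask].astype(np.float32)
print(goal_arr)                          # [-1.]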
Example #13
    def __init__(self, goal=(0, -1), arm_distance_coeff=0):
        """
        goal (`list`): List of two elements denoting the x and y coordinates of
            the goal location. Either of the coordinates can also be the string
            'any' to make the reward independent of the corresponding
            coordinate.
        arm_distance_coeff ('float'): Coefficient for the arm-to-object distance
            cost.
        """
        super(PusherEnv, self).__init__(file_path=self.FILE_PATH)
        Serializable.quick_init(self, locals())

        self._goal_mask = [coordinate != 'any' for coordinate in goal]
        self._goal = np.array(goal)[self._goal_mask].astype(np.float32)

        self._arm_distance_coeff = arm_distance_coeff
        self._action_cost_coeff = 0.1

        # Make the complete robot visible when visualizing.
        self.model.stat.extent = 10
Example #14
    def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False,
                 force_reset=True):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)

        # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
        # the time limit specified for each environment has been passed and
        # therefore the environment is not Markovian (terminal condition depends
        # on time rather than state).
        env = env.env

        self.env = env
        self.env_id = env.spec.id

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        logger.log("observation space: {}".format(self._observation_space))
        self._action_space = convert_gym_space(env.action_space)
        logger.log("action space: {}".format(self._action_space))
        self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
        self._log_dir = log_dir
        self._force_reset = force_reset
Example #15
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes,
                 squash=True,
                 name='policy'):
        Serializable.quick_init(self, locals())

        self._action_dim = flat_dim(env_spec.action_space)
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
        self._squash = squash
        self._name = name

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation')

        self._actions = self.actions_for(self._observation_ph)

        super(StochasticNNPolicy, self).__init__(
            env_spec, self._observation_ph, self._actions, self._name)
Example #16
    def clone_wrapped(env):
        """
        Clone a Serializable AsaEnv that is wrapped in multiple other environment wrappers.
        This performs Serializable.clone on the inner env and on all its wrappers, if possible.
        Supported wrappers:
        - TfEnv: no cloning needed, only wrap in new instance
        - NormalizedEnv: clone and wrap
        - other Serializable: clone and wrap, display warning
        - other non-Serializable: wrap in new instance, display warning
        :param env: AsaEnv wrapped in multiple other environment wrappers
        """
        # Unwrap
        stack = []
        while not isinstance(env, AsaEnv):
            stack.append((type(env), env))
            env = env.env
        # Clone inner env
        new_env = Serializable.clone(env)
        # Re-wrap, cloning wrappers on the way
        while stack:
            wrapper_cls, wrapper_env = stack.pop()
            if wrapper_cls is TfEnv:
                new_env = TfEnv(new_env)
            elif wrapper_cls is NormalizedEnv:
                # WARNING: obs_mean and obs_var are not copied back to the original env after skill training!
                new_env = Serializable.clone(wrapper_env, env=new_env)
            elif isinstance(wrapper_env, Serializable):
                new_env = Serializable.clone(wrapper_env, env=new_env)
                warn_once(
                    'AsaEnv: clone_wrapped performed on unknown Serializable wrapper "{}". '
                    'Wrapper was cloned and applied.'.format(wrapper_cls))
            else:
                new_env = wrapper_cls(env=new_env)
                warn_once(
                    'AsaEnv: clone_wrapped performed on unknown non-Serializable wrapper "{}". '
                    'Wrapper was instantiated with default parameters and applied.'
                    .format(wrapper_cls))
        return new_env
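The unwrap/re-wrap order above is easy to misread: wrappers are pushed outermost-first while unwrapping, and stack.pop() then re-applies them innermost-first, so the clone ends up wrapped in the original nesting order. A tiny self-contained illustration with made-up wrapper classes (not the real TfEnv/NormalizedEnv, and keeping only the wrapper types instead of the (type, instance) pairs):

import copy


class Inner:
    pass


class WrapA:
    def __init__(self, env):
        self.env = env


class WrapB:
    def __init__(self, env):
        self.env = env


env = WrapB(WrapA(Inner()))     # original nesting: WrapB(WrapA(Inner))

# Unwrap, remembering the wrapper types outermost-first
stack = []
while not isinstance(env, Inner):
    stack.append(type(env))
    env = env.env

new_env = copy.deepcopy(env)    # stand-in for Serializable.clone on the inner env

# Re-wrap: pop() yields WrapA first, then WrapB, restoring the original nesting
while stack:
    new_env = stack.pop()(new_env)

print(type(new_env).__name__, type(new_env.env).__name__)  # WrapB WrapA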
Example #17
    def __init__(self, env_spec, max_replay_buffer_size):
        super(SimpleReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)

        self._env_spec = env_spec
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._action_dim = flat_dim(env_spec.action_space)
        self._max_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size,
                                       self._observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size,
                                   self._observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
        self._rewards = np.zeros(max_replay_buffer_size)
        # self._terminals[i] = a terminal was received at time i
        self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
        self._top = 0
        self._size = 0
Example #18
    def __init__(self, env_spec, max_replay_buffer_size):
        super(SimpleReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)

        self._env_spec = env_spec
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._action_dim = flat_dim(env_spec.action_space)
        self._max_buffer_size = max_replay_buffer_size
        self._observations = np.zeros(
            (max_replay_buffer_size, self._observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros(
            (max_replay_buffer_size, self._observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
        self._rewards = np.zeros(max_replay_buffer_size)
        # self._terminals[i] = a terminal was received at time i
        self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
        self._top = 0
        self._size = 0
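The constructor above only allocates the arrays; _top and _size suggest the usual ring-buffer bookkeeping. The method below is a hypothetical sketch of how such a buffer typically advances (it is not shown in the snippet above, so treat the name and signature purely as an assumption):

# Hypothetical sketch of ring-buffer advancement; not taken from the snippet above.
def add_sample(self, observation, action, reward, terminal, next_observation):
    self._observations[self._top] = observation
    self._actions[self._top] = action
    self._rewards[self._top] = reward
    self._terminals[self._top] = terminal
    self._next_obs[self._top] = next_observation

    # Overwrite the oldest entry once the buffer is full
    self._top = (self._top + 1) % self._max_buffer_size
    if self._size < self._max_buffer_size:
        self._size += 1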
Example #19
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes,
                 squash=True,
                 name='policy'):
        Serializable.quick_init(self, locals())

        self._action_dim = flat_dim(env_spec.action_space)
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
        self._squash = squash
        self._name = name

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation')

        self._actions = self.actions_for(self._observation_ph)

        super(StochasticNNPolicy,
              self).__init__(env_spec, self._observation_ph, self._actions,
                             self._name)
Example #20
    def __init__(self, goal_reward=10, actuation_cost_coeff=30,
                 distance_cost_coeff=1, init_sigma=0.1):

        self.dynamics = PointDynamics(dim=2, sigma=0)
        self.init_mu = np.zeros(2, dtype=np.float32)
        self.init_sigma = init_sigma
        self.goal_positions = np.array(
            [
                [5, 0],
                [-5, 0],
                [0, 5],
                [0, -5]
            ],
            dtype=np.float32
        )
        self.goal_threshold = 1.
        self.goal_reward = goal_reward
        self.action_cost_coeff = actuation_cost_coeff
        self.distance_cost_coeff = distance_cost_coeff
        self.xlim = (-7, 7)
        self.ylim = (-7, 7)
        self.vel_bound = 1.
        self.reset()
        self.observation = None

        self.reward_range = (-float('inf'), float('inf'))
        self.metadata = {'render.modes': []}
        self.spec = None

        self._ax = None
        self._env_lines = []
        self.fixed_plots = None
        self.dynamic_plots = []

        super().__init__()
        Serializable.quick_init(self, locals())
Example #21
    def __init__(self,
                 top_policy,
                 skill_policy_prototype,
                 skill_policies,
                 skill_stop_functions=None,
                 skill_max_timesteps=100):
        """
        :param top_policy: policy for top-level agent, to be trained
        :param skill_policies: list of trained skill policies
        :param skill_policy_prototype: an empty policy serving as a prototype for newly created skill policies. New
               policies are generated by calling Serializable.clone() upon this prototype, producing a new instance of
               the policy initialized with same parameters as the prototype.
        :param skill_stop_functions: list of stopping functions (path_dict -> bool) for trained skills
        :param skill_max_timesteps: maximum length of skill execution
        """
        Serializable.quick_init(self, locals())
        self.top_policy = top_policy
        self.skill_policy_prototype = skill_policy_prototype
        self.skill_policies = skill_policies
        self.skill_max_timesteps = skill_max_timesteps
        num_orig_skills = len(skill_policies)

        # pad _skills_end_obss to align indexes with skill_policies
        self._skills_end_obss = [None for _ in range(num_orig_skills)]

        # If skill_stop_functions is not provided, a default stopping function (always returning False) is assigned to all skills
        self._skill_stop_functions = skill_stop_functions if skill_stop_functions is not None \
                                     else [lambda path: False for _ in range(num_orig_skills)]
        assert (len(self._skill_stop_functions) == num_orig_skills)

        # Check top-level policy
        if not isinstance(top_policy.action_space, Discrete) \
                or top_policy.action_space.n != self.num_skills:
            raise TypeError(
                'Top level policy must have Discrete(num_skills) action space.'
            )
Example #22
    def create_new_skill(self, end_obss):
        """
        Create a new untrained skill and add it to the skills list, along with its stopping function.
        :return: new skill policy and skill ID (index of the skill)
        :rtype: tuple(garage.policies.base.Policy, int)
        """
        new_skill_id = len(self.skill_policies)
        new_skill_pol = Serializable.clone(
            obj=self.skill_policy_prototype,
            name='{}Skill{}'.format(
                type(self.skill_policy_prototype).__name__, new_skill_id))
        self.skill_policies.append(new_skill_pol)
        self._skills_end_obss.append(np.copy(end_obss))

        unique_end_obss = np.unique(self._skills_end_obss[new_skill_id],
                                    axis=0)
        self._skill_stop_functions.append(
            # lambda path: path['observations'][-1] in self._skills_end_obss[new_skill_id]
            lambda path:
            (path['observations'][-1] == unique_end_obss).all(axis=1).any())
        return new_skill_pol, new_skill_id
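The appended stopping function performs a vectorized row-membership test: it checks whether the path's last observation equals any row of unique_end_obss. A small standalone illustration with made-up observations:

import numpy as np

unique_end_obss = np.array([[0., 1.],
                            [2., 3.]])                    # hypothetical end observations

last_obs = np.array([2., 3.])
print((last_obs == unique_end_obss).all(axis=1).any())    # True

last_obs = np.array([2., 1.])
print((last_obs == unique_end_obss).all(axis=1).any())    # False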
Example #23
    def __setstate__(self, d):
        Serializable.__setstate__(self, d)
        global load_params
        if load_params:
            self.set_param_values(d['params'])
Example #24
    def __getstate__(self):
        d = Serializable.__getstate__(self)
        d['params'] = self.get_param_values()
        return d
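Examples #23 and #24 pair up: __getstate__ adds the current parameter values to the state produced by Serializable, and __setstate__ restores them, guarded by the global load_params flag. A minimal self-contained sketch of that round trip, using a toy class rather than the real Serializable/Parameterized base classes:

import pickle

load_params = True   # mimics the module-level flag used above


class ToyParameterized:
    def __init__(self, scale=1.0):
        self._ctor_args = {'scale': scale}   # what Serializable.quick_init would record
        self._params = [scale, scale * 2]    # stand-in for network weights

    def get_param_values(self):
        return list(self._params)

    def set_param_values(self, values):
        self._params = list(values)

    def __getstate__(self):
        d = {'ctor_args': self._ctor_args}   # what Serializable.__getstate__ would return
        d['params'] = self.get_param_values()
        return d

    def __setstate__(self, d):
        self.__init__(**d['ctor_args'])      # re-create from constructor args
        if load_params:
            self.set_param_values(d['params'])


obj = ToyParameterized(scale=3.0)
obj.set_param_values([7.0, 8.0])             # pretend training changed the weights
restored = pickle.loads(pickle.dumps(obj))
print(restored.get_param_values())           # [7.0, 8.0]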
Example #25
    def __init__(self, env, delay=0.01):
        Serializable.quick_init(self, locals())
        gym.Wrapper.__init__(self, env)

        self._delay = delay
Example #26
    def __init__(self, *args, **kwargs):
        Serializable.quick_init(self, locals())
        self.reward_range = None
        self.metadata = None
        super().__init__(SequenceReacherEnv(*args, **kwargs))