def __init__(self, params=None):
    # Avoid a mutable default argument; fall back to an empty dict.
    params = dict(params) if params is not None else {}
    params.setdefault('obs_shape', (64, 36))  # not for size b/c don't want aliasing
    params.setdefault('collision_reward_only', False)
    params.setdefault('steer_limits', [-30., 30.])
    params.setdefault('speed_limits', [2., 2.])
    params.setdefault('use_depth', False)
    params.setdefault('do_back_up', False)

    self._model_path = params.get(
        'model_path',
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'models/cylinder.egg'))
    self._obs_shape = params['obs_shape']

    CarEnv.__init__(self, params=params)

    self._fixed_speed = (params['speed_limits'][0] == params['speed_limits'][1])
    assert self._fixed_speed
    self._steer_limits = params['steer_limits']
    self._speed_limits = params['speed_limits']
    if self._fixed_speed:
        # Speed dimension is pinned to 1, so only steering is free.
        self.action_space = Box(low=np.array([-1., 1.]), high=np.array([1., 1.]))
    else:
        # Both steering and speed are free in [-1, 1]. (Fixed: the original
        # had low == high == [-1., 1.] here, a degenerate box.)
        self.action_space = Box(low=np.array([-1., -1.]), high=np.array([1., 1.]))
    self._unnormalized_action_space = Box(
        low=np.array([self._steer_limits[0], self._speed_limits[0]]),
        high=np.array([self._steer_limits[1], self._speed_limits[1]]))
    self.observation_space = Box(low=0, high=255,
                                 shape=tuple(self._get_observation().shape))
    self._collision_reward_only = params['collision_reward_only']
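# Usage sketch (an assumption, not from the source): how a normalized action
# in `action_space` would map onto `_unnormalized_action_space` via the usual
# affine rescaling. `env` and the helper name are hypothetical.
def unnormalize_action(env, action):
    """Map action from [-1, 1]^2 to [steer_low, steer_high] x [speed_low, speed_high]."""
    low = env._unnormalized_action_space.low
    high = env._unnormalized_action_space.high
    return low + 0.5 * (action + 1.0) * (high - low)

# e.g. with steer_limits=[-30, 30] and speed_limits=[2, 2], the fixed-speed
# action [0., 1.] maps to steer=0.0, speed=2.0.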
def transformed_observation_space(self, wrapped_observation_space):
    if isinstance(wrapped_observation_space, Box):
        if self.use_activation:
            return Box(0.0, 1.0, self.out_shape)
        else:
            return Box(0.0, 10.0, self.out_shape)
    else:
        raise NotImplementedError(
            "Currently only support Box observation spaces for InceptionTransformer")
def __init__(self, app_name, time_state=False, idx=0, is_render=False,
             no_graphics=False, recording=True):
    Serializable.quick_init(self, locals())
    # Unity scene
    self._env = UnityEnvironment(file_name=app_name, worker_id=idx,
                                 no_graphics=no_graphics)
    self.id = 0
    self.name = app_name
    self.idx = idx
    self.is_render = is_render
    self.time_state = time_state
    self.time_step = 0

    # Check brain configuration
    assert len(self._env.brains) == 1
    self.brain_name = self._env.external_brain_names[0]
    brain = self._env.brains[self.brain_name]

    # Check for number of agents in scene
    initial_info = self._env.reset()[self.brain_name]
    # Visual observations are deliberately disabled here (`and False`);
    # only vector observations are used below.
    self.use_visual = (brain.number_visual_observations == 1) and False
    self.recording = brain.number_visual_observations == 1 and recording

    # Set observation and action spaces
    if brain.vector_action_space_type == "discrete":
        self._action_space = Discrete(1)
    else:
        high = np.array([np.inf] * brain.vector_action_space_size)
        self._action_space = Box(-high, high)
    if self.use_visual and False and no_graphics:  # fixed: was undefined `no_graphic`
        high = np.array([np.inf] * brain.camera_resolutions[0]["height"] *
                        brain.camera_resolutions[0]["width"] * 3)
        self._observation_space = Box(-high, high)
    else:
        if self.time_state:
            high = np.array([np.inf] * (brain.vector_observation_space_size + 1))
        else:
            high = np.array([np.inf] * brain.vector_observation_space_size)
        self._observation_space = Box(-high, high)
    # video buffer
    self.frames = []
def convert_gym_space(space, box_additional_dim=0):
    if isinstance(space, gym.spaces.Box):
        if box_additional_dim != 0:
            # Append unbounded extra dimensions to the converted Box.
            low = np.concatenate([space.low, [-np.inf] * box_additional_dim])
            high = np.concatenate([space.high, [np.inf] * box_additional_dim])
            return Box(low=low, high=high)
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError
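# Usage sketch (hedged): appending one unbounded dimension, e.g. for a time
# feature, when converting a gym Box to the rllab-style Box used above.
original = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
augmented = convert_gym_space(original, box_additional_dim=1)
# augmented.low  -> [-1., -1., -1., -inf]
# augmented.high -> [ 1.,  1.,  1.,  inf]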
def env_create(self):
    route = '/v1/envs/'
    data = {'env_id': 'lol'}
    resp = self._post_request(route, data)
    self.instance_id = resp['instance_id']
    asi = self.env_action_space_info(self.instance_id)
    obi = self.env_observation_space_info(self.instance_id)
    self.action_space = Box(np.array(asi['low']), np.array(asi['high']))
    self.observation_space = Box(np.array(obi['low']), np.array(obi['high']))
    self.spec = EnvSpec(self.action_space, self.observation_space)
    self.spec.id = 0
    self.spec.timestep_limit = 500
def convert_gym_space(space, n_agents=1):
    if isinstance(space, (gym.spaces.Box, Box)):
        if len(space.shape) > 1:
            assert n_agents == 1, \
                "multi-dimensional inputs for centralized agents not supported"
            return Box(low=np.min(space.low), high=np.max(space.high),
                       shape=space.shape)
        else:
            # Concatenate the per-agent vectors into one flat Box.
            return Box(low=np.min(space.low), high=np.max(space.high),
                       shape=(space.shape[0] * n_agents,))
    elif isinstance(space, (gym.spaces.Discrete, Discrete)):
        # Joint discrete space: one action per agent -> n**n_agents combinations.
        return Discrete(n=space.n ** n_agents)
    else:
        raise NotImplementedError
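# Usage sketch (hedged): with n_agents=2, a per-agent Box of shape (4,)
# becomes a concatenated Box of shape (8,), and a per-agent Discrete(3)
# becomes the joint space Discrete(9).
per_agent_obs = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,))
central_obs = convert_gym_space(per_agent_obs, n_agents=2)   # Box, shape (8,)
per_agent_act = gym.spaces.Discrete(3)
central_act = convert_gym_space(per_agent_act, n_agents=2)   # Discrete(9)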
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=0.003,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often I choose a latent
             truncate_local_is_ratio=None,
             epsilon=0.1,
             train_pi_iters=10,
             use_skill_dependent_baseline=False,
             mlp_skill_dependent_baseline=False,
             freeze_manager=False,
             freeze_skills=False,
             **kwargs):
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict(batch_size=None)
        optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                        max_epochs=train_pi_iters,
                                        **optimizer_args)
    # Fixed: a passed-in optimizer was previously ignored (self.optimizer was
    # only assigned inside the `optimizer is None` branch).
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    self.epsilon = epsilon

    super(Concurrent_PPO, self).__init__(**kwargs)
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period
    self.freeze_manager = freeze_manager
    self.freeze_skills = freeze_skills
    assert (not freeze_manager) or (not freeze_skills)

    # TODO: fix this sampler stuff
    self.sampler = HierBatchSampler(self, self.period)
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
    assert self.policy.period == self.period

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # +1 for t_remaining
        skill_dependent_obs_space_dim = (
            new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
        skill_dependent_obs_space = Box(-1.0, 1.0,
                                        shape=skill_dependent_obs_space_dim)
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                           skill_dependent_action_space)
        if self.mlp_skill_dependent_baseline:
            self.skill_dependent_baseline = GaussianMLPBaseline(
                env_spec=skill_dependent_env_spec)
        else:
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_dependent_env_spec)
def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    else:
        raise NotImplementedError
def _set_action_space(self):
    max_torques = 0.5 * np.array([8, 7, 6, 5, 4, 3, 2])
    self.joint_torque_high = max_torques
    self.joint_torque_low = -1 * max_torques
    if self.action_mode == 'position':
        # Position mode: bounded end-effector deltas in x, y, z.
        delta_factor = 0.1
        delta_high = delta_factor * np.ones(3)
        delta_low = -delta_factor * np.ones(3)
        self._action_space = Box(delta_low, delta_high)
    else:
        # Torque mode: per-joint torque limits.
        self._action_space = Box(self.joint_torque_low, self.joint_torque_high)
def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    # added for robotics environments: flatten the goal-conditioned Dict
    # observation into a single Box
    elif isinstance(space, gym.spaces.Dict):
        b_low = np.concatenate((space.spaces["desired_goal"].low,
                                space.spaces["achieved_goal"].low,
                                space.spaces["observation"].low))
        b_high = np.concatenate((space.spaces["desired_goal"].high,
                                 space.spaces["achieved_goal"].high,
                                 space.spaces["observation"].high))
        return Box(low=b_low, high=b_high)
    else:
        raise NotImplementedError
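# Usage sketch (hedged): flattening a goal-conditioned Dict space of the kind
# used by the gym robotics envs. The shapes here are made up for illustration.
dict_space = gym.spaces.Dict({
    "desired_goal": gym.spaces.Box(low=-1.0, high=1.0, shape=(3,)),
    "achieved_goal": gym.spaces.Box(low=-1.0, high=1.0, shape=(3,)),
    "observation": gym.spaces.Box(low=-np.inf, high=np.inf, shape=(10,)),
})
flat = convert_gym_space(dict_space)  # Box with 3 + 3 + 10 = 16 dims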
def __init__(self, env):
    self._env = env
    # Use the local `env` consistently (the original mixed `self._env` with
    # an undefined `self.env`).
    if isinstance(env.action_processor, base_environment.BaseDiscreteAction):
        self.action_space = spaces.Discrete(env.num_actions())
    else:
        assert isinstance(env.action_processor,
                          base_environment.BaseContinuousAction)
        self.action_space = Box(
            low=env.action_processor.minval(),
            high=env.action_processor.maxval(),
            shape=(env.action_processor.action_dim(),))  # fixed: shape must be a tuple
    obsNdim = env.observation_ndim()
    obsKeys = list(obsNdim.keys())
    assert len(obsKeys) == 1, 'gym only supports one observation type'
    self._obsKey = obsKeys[0]
    self.observation_space = Box(low=0, high=255, shape=obsNdim[self._obsKey])
def __init__(self, env):
    super().__init__(env)
    Serializable.quick_init(self, locals())
    self.keys = ["robot-state", "object-state"]

    # set up observation and action spaces
    flat_ob = self._flatten_obs(self.env.reset(), verbose=True)
    self.obs_dim = flat_ob.size
    high = np.inf * np.ones(self.obs_dim)
    low = -high
    self.observation_space = Box(low=low, high=high)
    low, high = self.env.action_spec
    self.action_space = Box(low=low, high=high)

    self.spec = EnvSpec(
        observation_space=self.observation_space,
        action_space=self.action_space,
    )
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=1e-2,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often I choose a latent
             truncate_local_is_ratio=None,
             use_skill_dependent_baseline=False,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        default_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is None:
            optimizer_args = default_args
        else:
            optimizer_args = dict(default_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(learning_rate=step_size, **optimizer_args)
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio

    super(PG_concurrent_approx, self).__init__(**kwargs)
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period

    # TODO: fix this sampler stuff
    self.sampler = HierBatchSampler(self, self.period)
    self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
    assert self.policy.period == self.period
    self.trainable_manager = self.policy.trainable_manager

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        skill_dependent_obs_space_dim = (
            (curr_env.observation_space.shape[0] + 1) * self.num_latents,)
        skill_dependent_obs_space = Box(-1.0, 1.0,
                                        shape=skill_dependent_obs_space_dim)
        # fixed typo: was "skill_depdendent_env_spec"
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                           skill_dependent_action_space)
        self.skill_dependent_baseline = LinearFeatureBaseline(
            env_spec=skill_dependent_env_spec)
def convert_gym_space(space):
    # Accept a gym Box as well as an already-converted rllab Box (the
    # original checked only the bare `Box`, which never matched gym spaces).
    if isinstance(space, (gym.spaces.Box, Box)):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError
def __init__(
        self,
        update_hz=20,
        action_mode='torque',
        safety_box=True,
        reward='MSE',
        huber_delta=10,
        safety_force_magnitude=8,
        safety_force_temp=15,
        safe_reset_length=150,
        reward_magnitude=1,
):
    Serializable.quick_init(self, locals())
    self.init_rospy(update_hz)

    self.safety_box_lows = np.array([-0.03900771, -0.32655392, -1])
    self.safety_box_highs = np.array([0.79529649, 0.3227083, 1])
    self.joint_names = [
        'right_j0', 'right_j1', 'right_j2', 'right_j3',
        'right_j4', 'right_j5', 'right_j6'
    ]
    self.link_names = [
        'right_l2', 'right_l3', 'right_l4', 'right_l5', 'right_l6', '_hand'
    ]
    self.action_mode = action_mode
    self.reward_magnitude = reward_magnitude
    self.safety_box = safety_box
    self.safe_reset_length = safe_reset_length
    self.huber_delta = huber_delta
    self.safety_force_magnitude = safety_force_magnitude
    self.safety_force_temp = safety_force_temp
    self.AnglePDController = AnglePDController()

    max_torques = 0.5 * np.array([8, 7, 6, 5, 4, 3, 2])
    self.joint_torque_high = max_torques
    self.joint_torque_low = -1 * max_torques
    self._action_space = Box(self.joint_torque_low, self.joint_torque_high)

    if reward == 'MSE':
        self.reward_function = self._MSE_reward
    elif reward == 'huber':
        self.reward_function = self._Huber_reward
    else:
        self.reward_function = self._Norm_reward

    self._set_action_space()
    self._set_observation_space()
    self.get_latest_pose_jacobian_dict()
    self.in_reset = True
    self.amplify = np.ones(7) * 2.5
    self.pd_time_steps = 50
    self.jacobian_pseudo_inverse_torques_scale = 10
    self.damping_scale = 5
def __init__(self, env_name, dict_keys):
    Serializable.quick_init(self, locals())
    super(FlattenDictWrapper, self).__init__(env_name)
    self.dict_keys = dict_keys

    # Total flattened size is the sum of the per-key observation sizes.
    size = 0
    for key in dict_keys:
        shape = self.env.observation_space.spaces[key].shape
        size += np.prod(shape)
    self._observation_space = Box(-np.inf, np.inf, shape=(size,))
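# Usage sketch (hedged): the size computation above, standalone. The example
# shapes are made up; multi-dimensional entries such as images are handled by
# np.prod over the shape tuple.
example_shapes = {"observation": (10,), "desired_goal": (3,), "image": (8, 8)}
size = sum(int(np.prod(shape)) for shape in example_shapes.values())
flat_space = Box(-np.inf, np.inf, shape=(size,))  # here size == 10 + 3 + 64 == 77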
def __init__(self, wrapped_env):
    Serializable.quick_init(self, locals())
    self._wrapped_env = wrapped_env
    low = np.min(self._wrapped_env.spec.observation_space.low)
    high = np.max(self._wrapped_env.spec.observation_space.high)
    # assert len(wrapped_env.spec.observation_space.shape) == 1
    # Augment the wrapped observation with one extra dimension.
    shape = (self._wrapped_env.spec.observation_space.shape[0] + 1,)
    self.obs_space = Box(low, high, shape)
def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        # Hardcoded to 80x80 pixel observations, presumably matching a
        # downstream frame-preprocessing step.
        return Box(low=0, high=255, shape=(80, 80))
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError
def __init__(self, dim=2, state_bounds=None, action_bounds=None,
             control_mode='linear', *args, **kwargs):
    """
    :param dim: dimension of the position space; the observation will be 2*dim
    :param state_bounds: list or np.array of 2*dim with the ub of the state space
    :param action_bounds: list or np.array of 2*dim with the ub of the action space
    :param goal_generator: procedure to sample and keep the goals
    :param reward_dist_threshold:
    :param control_mode:
    """
    Serializable.quick_init(self, locals())
    self.dim = dim
    self.control_mode = control_mode
    self.dt = 0.02
    self.pos = np.zeros(dim)
    self.vel = np.zeros(dim)
    self.state_ub = UB * np.ones(self.dim * 2) if state_bounds is None \
        else np.array(state_bounds)
    self.action_ub = UB * np.ones(self.dim) if action_bounds is None \
        else np.array(action_bounds)
    self._observation_space = Box(-1 * self.state_ub, self.state_ub)
    self._action_space = Box(-1 * self.action_ub, self.action_ub)
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._reward_values = list(range(1, int(self._max_reward_magnitude) + 1))
    # Observation: a one-hot block plus one extra slot bounded by the
    # maximum reward magnitude.
    low = np.zeros(self._onehot_size + 1)
    low[-1] = -self._max_reward_magnitude
    high = np.ones(self._onehot_size + 1)
    high[-1] = self._max_reward_magnitude
    self._observation_space = Box(low, high)
def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    elif isinstance(space, list):
        # For multiagent envs: convert each agent's space independently.
        # TODO(cathywu): refactor multiagent envs to use gym.spaces.Tuple
        # (may be needed for pickling?)
        return list(map(convert_gym_space, space))
    else:
        raise NotImplementedError
def goal_env_spec(self, dict_keys=('observation', 'desired_goal')):
    """
    Convert selected keys of a Dict observation space into a single flat Box
    space and return the corresponding env_spec.

    :param dict_keys: (`tuple`) keys of the Dict space to include.
    :return: (`EnvSpec`) converted object
    """
    assert isinstance(self.observation_space, Dict)
    obs_dim = np.sum([self.observation_space.spaces[key].flat_dim
                      for key in dict_keys])
    obs_space = Box(-np.inf, np.inf, shape=(obs_dim,))
    return EnvSpec(
        observation_space=obs_space,
        action_space=self.action_space,
    )
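# Usage sketch (hedged; `env` is a hypothetical goal-conditioned env whose
# Dict observation space has flat_dims {"observation": 10, "desired_goal": 3,
# "achieved_goal": 3}). With the default keys, only "observation" and
# "desired_goal" contribute to the flattened Box:
spec = env.goal_env_spec()
assert spec.observation_space.shape == (13,)  # 10 + 3
assert spec.action_space is env.action_space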
def convert_gym_space(space):
    """
    Convert a gym.Space to an rllab.Space.

    :param space: (obj:`gym.Space`) the Space object to convert
    :return: the converted rllab.Space object
    """
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    elif isinstance(space, gym.spaces.Dict):
        return Dict(space.spaces)
    else:
        raise TypeError("Unsupported space type: %s" % type(space))
def _set_observation_space(self):
    # Note: END_EFFECTOR_VALUE_*['position'] appears twice in the original
    # layout (before and after the angle block); preserved as-is.
    lows = np.hstack((
        JOINT_VALUE_LOW['position'],
        JOINT_VALUE_LOW['velocity'],
        JOINT_VALUE_LOW['torque'],
        END_EFFECTOR_VALUE_LOW['position'],
        END_EFFECTOR_VALUE_LOW['angle'],
        END_EFFECTOR_VALUE_LOW['position'],
    ))
    highs = np.hstack((
        JOINT_VALUE_HIGH['position'],
        JOINT_VALUE_HIGH['velocity'],
        JOINT_VALUE_HIGH['torque'],
        END_EFFECTOR_VALUE_HIGH['position'],
        END_EFFECTOR_VALUE_HIGH['angle'],
        END_EFFECTOR_VALUE_HIGH['position'],
    ))
    self._observation_space = Box(lows, highs)
def action_space(self):
    return Box(low=-self.vel_bound, high=self.vel_bound,
               shape=(self.dynamics.a_dim,))
def observation_space(self):
    return Box(low=np.array((self.xlim[0], self.ylim[0])),
               high=np.array((self.xlim[1], self.ylim[1])),
               shape=None)
        return samples  # closes the preceding method

    def get_action(self, t, observation, policy, **kwargs):
        # Apply MC dropout: query the stochastic policy several times and
        # return the mean action.
        action, _ = policy.get_action(observation)
        mc_dropout = 10
        all_actions = np.zeros(shape=(mc_dropout, action.shape[0]))
        for d in range(mc_dropout):
            action, _ = policy.get_action(observation)
            all_actions[d, :] = action
        mean_action = np.mean(all_actions, axis=0)
        return mean_action


if __name__ == "__main__":
    ou = MCDropout(
        env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1,))),
        mu=0,
        theta=0.15,
        sigma=0.3)
    states = []
    for i in range(1000):
        states.append(ou.evolve_state()[0])
    import matplotlib.pyplot as plt
    plt.plot(states)
    plt.show()
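# Minimal sketch of the MC-dropout averaging above, using a stub policy whose
# get_action is stochastic. Everything here is hypothetical illustration:
class NoisyPolicy:
    def get_action(self, observation):
        # A real dropout policy would resample dropout masks on each call.
        return np.array([1.0]) + 0.1 * np.random.randn(1), {}

policy = NoisyPolicy()
actions = np.stack([policy.get_action(None)[0] for _ in range(10)])
mean_action = actions.mean(axis=0)  # averaging shrinks the noise roughly by 1/sqrt(10)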
def _get_2D_Box_space(self):
    return Box(
        low=np.array((self.xlim[0], self.ylim[0])),
        high=np.array((self.xlim[1], self.ylim[1])),
        shape=None,
    )
def __init__(
        self,
        n=2,
        num_steps=10,
        reward_for_remembering=1,
        max_reward_magnitude=1,
        softmax_action=False,
        zero_observation=False,
        output_target_number=False,
        output_time=False,
        episode_boundary_flags=False,
):
    """
    :param n: Number of different values that could be returned
    :param num_steps: How many steps the policy needs to output before the
        episode ends. AKA the horizon. So, the policy will see num_steps+1
        total observations if you include the observation returned when
        reset() is called. The last observation will be the "terminal state"
        observation.
    :param reward_for_remembering: The reward bonus for remembering the
        number. This number is added to the usual reward if the correct
        number has the maximum probability.
    :param max_reward_magnitude: Clip the reward magnitude to this value.
    :param softmax_action: If true, put the action through a softmax.
    :param zero_observation: If true, all observations after the first will
        be just zeros (NOT the zero one-hot).
    :param episode_boundary_flags: If true, add a boolean flag to the
        observation for whether or not the episode is starting or
        terminating.
    """
    assert max_reward_magnitude >= reward_for_remembering
    self.num_steps = num_steps
    self.n = n
    self._onehot_size = n + 1
    # Time step for the NEXT observation to be returned:
    #
    #   env = OneCharMemory()   # t == 0 after this line
    #   obs = env.reset()       # t == 1
    #   _ = env.step()          # t == 2
    #   _ = env.step()          # t == 3
    #   ...
    #   done = env.step()[2]    # t == num_steps and done == False
    #   done = env.step()[2]    # t == num_steps+1 and done == True
    self._t = 0
    self._reward_for_remembering = reward_for_remembering
    self._max_reward_magnitude = max_reward_magnitude
    self._softmax_action = softmax_action
    self._zero_observation = zero_observation
    self._output_target_number = output_target_number
    self._output_time = output_time
    self._episode_boundary_flags = episode_boundary_flags

    self._action_space = Box(np.zeros(self._onehot_size),
                             np.ones(self._onehot_size))
    obs_low = np.zeros(self._onehot_size)
    # fixed: the original used np.zeros here, which pins the one-hot
    # observation entries to a degenerate [0, 0] range
    obs_high = np.ones(self._onehot_size)
    if self._output_target_number:
        obs_low = np.hstack((obs_low, [0]))
        obs_high = np.hstack((obs_high, [self.n]))
    if self._output_time:
        obs_low = np.hstack((obs_low, [0] * (self.num_steps + 1)))
        obs_high = np.hstack((obs_high, [1] * (self.num_steps + 1)))
    if self._episode_boundary_flags:
        obs_low = np.hstack((obs_low, [0] * 2))
        obs_high = np.hstack((obs_high, [1] * 2))
    self._observation_space = Box(obs_low, obs_high)

    self._target_number = None

    # For rendering
    self._last_reward = None
    self._last_action = None
    self._last_t = None
def action_space(self):
    return Box(low=-5.0, high=5.0, shape=(1,))