def space_step(self, action_e):
    '''Step the single-body space env with an action from the AEB space'''
    action = action_e[(0, 0)]  # single body
    if self.done:  # space envs run continually without a central reset signal
        return self.space_reset()
    if not self.is_discrete:  # guard for continuous action space
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    reward = guard_reward(reward)
    if self.reward_scale is not None:  # reward_scale defaults to None
        reward *= self.reward_scale
    if util.to_render():
        self.u_env.render()
    self.done = done = done or self.clock.get('t') > self.max_timestep
    # one-hot-encode discrete states before they are written into state_e,
    # so the conversion is carried into the returned data
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        reward_e[ab] = reward
        state_e[ab] = state
        done_e[ab] = done
    logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
    return reward_e, state_e, done_e
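# `guard_reward` above is defined elsewhere in the codebase. A minimal sketch
# of the contract the callers rely on (a plain scalar reward), assuming the
# only irregularity is gym envs that wrap the reward in a 1-element array:
import numpy as np

def guard_reward(reward):
    if np.isscalar(reward):
        return reward
    # some gym envs return the reward inside an array; unwrap the single element
    assert len(reward) == 1
    return reward[0]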
def __init__(self, name):
    worker_id = int(f'{os.getpid()}{int(ps.unique_id())}'[-4:])
    super().__init__(get_env_path(name), worker_id, no_graphics=not util.to_render(), multiagent=True)
    self.num_envs = self.number_agents
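# Illustrative only: how the Unity `worker_id` above is derived. Concatenating
# the process id with a pydash unique_id and keeping the last 4 digits gives a
# mostly collision-free id per env process; Unity ML-Agents uses the worker_id
# to offset its communication port, so concurrent envs must not share one.
import os
import pydash as ps

pid_str = f'{os.getpid()}{int(ps.unique_id())}'
worker_id = int(pid_str[-4:])  # e.g. '123457' -> 3457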
def reset(self):
    _reward = np.nan
    state = self.u_env.reset()
    self.done = done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward: {_reward}, state: {state}, done: {done}')
    return _reward, state, done
def space_reset(self):
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        state_e[ab] = state
        done_e[ab] = self.done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e
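# `util.ndenumerate_nonan` above is defined elsewhere in the codebase. A
# minimal sketch under the assumption that the body array is a numpy object
# array padded with NaN in the slots that hold no body:
import numpy as np

def ndenumerate_nonan(arr):
    for idx, v in np.ndenumerate(arr):
        if not (isinstance(v, float) and np.isnan(v)):  # skip NaN gap slots
            yield idx, v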
def step(self, action):
    if not self.is_discrete:  # guard for continuous action space
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    if util.to_render():
        self.u_env.render()
    self.done = done = done or self.clock.get('t') > self.max_timestep
    logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}')
    return reward, state, done
def step(self, action):
    '''Step the env, scale the reward, and enforce the max_t episode cutoff'''
    if not self.is_discrete:  # guard for continuous action space
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    reward = guard_reward(reward)
    if self.reward_scale is not None:  # reward_scale defaults to None
        reward *= self.reward_scale
    if util.to_render():
        self.u_env.render()
    if self.max_t is not None:
        done = done or self.clock.t > self.max_t
    self.done = done
    logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}')
    return reward, state, done
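# Hedged usage sketch of the reset()/step() API above inside a rollout loop.
# `agent` is a hypothetical policy object, and the clock tick mirrors how the
# surrounding framework counts timesteps; both are assumptions here.
_reward, state, done = env.reset()
while not done:
    action = agent.act(state)               # hypothetical agent
    reward, state, done = env.step(action)
    env.clock.tick('t')                     # advance the timestep counter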
def reset(self):
    _reward = np.nan
    state = self.u_env.reset()
    self.done = done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward: {_reward}, state: {state}, done: {done}')
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    return _reward, state, done
def space_reset(self):
    _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
    for ab, body in util.ndenumerate_nonan(self.body_e):
        state = self.u_env.reset()
        # one-hot-encode discrete states before they are written into state_e,
        # so the conversion is carried into the returned data
        if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
            state = util.to_one_hot(state, self.u_env.observation_space.n)
        state_e[ab] = state
        done_e[ab] = self.done = False
    if util.to_render():
        self.u_env.render()
    logger.debug(f'Env {self.e} reset reward_e: {_reward_e}, state_e: {state_e}, done_e: {done_e}')
    return _reward_e, state_e, done_e
def step(self, action):
    '''Step the env, scale the reward, enforce max_timestep, and one-hot-encode discrete states'''
    if not self.is_discrete:  # guard for continuous action space
        action = np.array([action])
    state, reward, done, _info = self.u_env.step(action)
    reward = guard_reward(reward)
    if self.reward_scale is not None:  # reward_scale defaults to None
        reward *= self.reward_scale
    if util.to_render():
        self.u_env.render()
    self.done = done = done or self.clock.get('t') > self.max_timestep
    logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}')
    if isinstance(self.u_env.observation_space, gym.spaces.discrete.Discrete):
        state = util.to_one_hot(state, self.u_env.observation_space.n)
    return reward, state, done
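# `util.to_one_hot` above converts a Discrete observation (an int in
# [0, n)) into a one-hot vector so networks receive a fixed-size input.
# A minimal sketch under that assumption; the library version may also
# handle batched int lists:
import numpy as np

def to_one_hot(data, max_val):
    return np.eye(max_val)[np.array(data)]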
def __init__(self, spec):
    self.done = False
    self.env_spec = spec['env'][0]  # idx 0 for single-env
    # set defaults
    util.set_attr(self, dict(
        log_frequency=None,  # default to log at episode done
        frame_op=None,
        frame_op_len=None,
        normalize_state=False,
        reward_scale=None,
        num_envs=None,
    ))
    util.set_attr(self, spec['meta'], [
        'log_frequency',
        'eval_frequency',
    ])
    util.set_attr(self, self.env_spec, [
        'name',
        'frame_op',
        'frame_op_len',
        'normalize_state',
        'reward_scale',
        'num_envs',
        'max_t',
        'max_frame',
    ])
    seq_len = ps.get(spec, 'agent.0.net.seq_len')
    if seq_len is not None:  # infer if using RNN
        self.frame_op = 'stack'
        self.frame_op_len = seq_len
    if util.in_eval_lab_modes():  # use singleton for eval
        self.num_envs = 1
        self.log_frequency = None
    if spec['meta']['distributed'] != False:  # divide max_frame for distributed training
        self.max_frame = int(self.max_frame / spec['meta']['max_session'])
    self.is_venv = (self.num_envs is not None and self.num_envs > 1)
    if self.is_venv:
        assert self.log_frequency is not None, 'Specify log_frequency when using venv'
    self.clock_speed = 1 * (self.num_envs or 1)  # tick with a multiple of num_envs to properly count frames
    self.clock = Clock(self.max_frame, self.clock_speed)
    self.to_render = util.to_render()
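# A hypothetical spec fragment matching the keys read by __init__ above;
# the env/meta values are illustrative only, not taken from the codebase.
spec = {
    'env': [{
        'name': 'CartPole-v0',
        'frame_op': None,
        'frame_op_len': None,
        'normalize_state': False,
        'reward_scale': None,
        'num_envs': 4,
        'max_t': None,
        'max_frame': 100000,
    }],
    'meta': {
        'log_frequency': 500,
        'eval_frequency': 1000,
        'distributed': False,
        'max_session': 1,
    },
    'agent': [{'net': {}}],  # no seq_len, so no RNN frame stacking is inferred
}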
def __init__(self, spec):
    self.env_spec = spec['env'][0]  # idx 0 for single-env
    # set defaults
    util.set_attr(self, dict(
        eval_frequency=10000,
        log_frequency=10000,
        frame_op=None,
        frame_op_len=None,
        image_downsize=(84, 84),
        normalize_state=False,
        reward_scale=None,
        num_envs=1,
    ))
    util.set_attr(self, spec['meta'], [
        'eval_frequency',
        'log_frequency',
    ])
    util.set_attr(self, self.env_spec, [
        'name',
        'frame_op',
        'frame_op_len',
        'image_downsize',
        'normalize_state',
        'reward_scale',
        'num_envs',
        'max_t',
        'max_frame',
    ])
    if util.in_eval_lab_modes():  # override if env is for eval
        self.num_envs = ps.get(spec, 'meta.rigorous_eval')
    self.to_render = util.to_render()
    self._infer_frame_attr(spec)
    self._infer_venv_attr()
    self._set_clock()
    self.done = False
    self.total_reward = np.nan
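# A minimal sketch of the Clock set up by _set_clock()/__init__ above,
# assuming it advances in increments of clock_speed so that a vector env of
# k copies counts k frames per step. Not the library implementation.
class Clock:
    def __init__(self, max_frame=int(1e7), clock_speed=1):
        self.max_frame = max_frame
        self.clock_speed = clock_speed
        self.t = 0      # timestep within the current episode
        self.frame = 0  # total frame count across episodes
        self.epi = 0    # episode count

    def get(self, unit='frame'):
        return getattr(self, unit)

    def tick(self, unit='t'):
        if unit == 't':  # one env step
            self.t += self.clock_speed
            self.frame += self.clock_speed
        elif unit == 'epi':  # episode boundary
            self.epi += 1
            self.t = 0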