def step(self, action):
    if self._symmetric_action_space:
        new_action = torch.as_tensor(
            action.round().clip(0, self._max_stock) + self._max_stock / 2,
            dtype=torch.int32)
    else:
        new_action = torch.as_tensor(action, dtype=torch.int32).clamp(
            0, self._max_stock)
    if self.day_position % self._substep_count == 0:
        order_cost = self._make_fast_order(new_action)
        (sales, availability) = \
            self._generateDemand(self.real.clamp_(0.0, 1.))
        waste = self._waste()  # Update waste and store result
        self._reduceShelfLives()
        self._step_counter += 1
        self._updateEnv()
    else:
        self.day_position += 1
        order_cost = self._make_order(new_action)
        (sales, availability) = \
            self._generateDemand(self.real.clamp_(0.0, 1.))
        waste = 0  # By default, no waste before the end of day
        self._updateObs()
    sales.sub_(order_cost)
    utility = self.utility_function.reward(sales, waste, availability)
    done = self._step_counter == self.horizon
    info = EnvInfo(sales=sales, availability=availability, waste=waste,
                   reward=utility, traj_done=done)
    return EnvStep(self.get_obs(), utility, done, info)
def step(self, action): """ 在environment中向前走一步。 这个函数在Collector类(例如)的collect_batch()函数中会被调用。 注意:policy network的前向传播过程不是在这里发生的,而是在agent类(例如DqnAgent)的step()函数里发生(由Collector类的 collect_batch()函数调用)。environment里的step(),输入的action已经是policy network推断出来的action了,在这里做的工作主要是: 计算该action带来的reward,判断trajectory是否结束,记录一些统计信息等。 :param action: 一个标量,其值在 self._action_set 的index范围内。TODO: 确认是否正确? :return: 一个 EnvStep 对象,包含observation等数据。 """ a = self._action_set[action] # 从action set(动作集)中取出一个具体的action game_score = np.array( 0., dtype="float32" ) # 游戏分数,其实就是一个标量值。这个函数里算出来的只是当前step的score而不是整个游戏过程的score # 可以设置每一个step走游戏的几帧,这里就连续地执行N-1(假设N为帧数)次action for _ in range(self._frame_skip - 1): game_score += self.ale.act( a) # 执行一个action,得到一个score,累加到原来已经得到的分数上,这里累加也只是累加本step内的分数 self._get_screen(1) game_score += self.ale.act(a) # 上面skip的frame,还差一帧,这里补上执行一次action lost_life = self._check_life( ) # Advances from lost_life state. 看看游戏角色是不是挂了 if lost_life and self._episodic_lives: self._reset_obs() # Internal reset. self._update_obs() # 奖励值。当设置了_clip_reward的时候使用-1,0,1作为reward,否则就使用真实的游戏分数作为reward reward = np.sign(game_score) if self._clip_reward else game_score game_over = self.ale.game_over( ) or self._step_counter >= self.horizon # 判断游戏是不是结束了,当horizon达到阈值时也结束 done = game_over or (self._episodic_lives and lost_life) # bool类型 info = EnvInfo(game_score=game_score, traj_done=game_over) # 当前environment的一些信息,比如游戏分数等 self._step_counter += 1 # 用于统计走了多少个step的计数器 return EnvStep(self.get_obs(), reward, done, info)
def step(self, action):
    time_step = self._env.step(action)
    reward = time_step.reward
    terminal = time_step.last()
    info = time_step.info
    info.update({
        key: value
        for key, value in time_step.observation.items()
        if key not in self._observation_keys
    })
    observation = self._filter_observation(time_step.observation)
    self._step_count += 1
    info['traj_done'] = self._step_count >= self._max_path_length

    global EnvInfo
    if EnvInfo is None:
        EnvInfo = namedtuple("EnvInfo", list(info.keys()))
    info = EnvInfo(**{k: v for k, v in info.items() if k in EnvInfo._fields})

    global Observation
    if Observation is None:
        Observation = namedarraytuple("Observation", list(observation.keys()))
    observation = Observation(**{
        k: v.copy()
        for k, v in observation.items()
        if k in self._observation_keys
    })
    return EnvStep(observation, reward, terminal, info)
def step(self, action):
    '''
    Passes action to env and returns next state, reward, and terminal.

    Args:
        action (int): Int representing an action in the action space.

    Returns:
        (EnvStep: named_tuple_array)
    '''
    reward = 0
    if self.is_action_continuous:
        action *= np.array(self._env.action_space.get_high())
    for _ in range(self._steps_per_action):
        sensor_dict, temp_reward, terminal, _ = self._env.step(action)
        reward += temp_reward
    if self.rollout_count % self.gif_freq == 0 and self.has_img:
        self.gif_images.append(self.get_img(sensor_dict))
    state_rep = self._get_state_rep(sensor_dict)
    self.curr_step += 1
    if self.curr_step >= self._max_steps:
        terminal = True
    return EnvStep(state_rep, np.array(reward), terminal, None)
def step(self, action):
    o, r, d, info = self.env.step(action)
    self.time_elapsed += 1
    if self.time_limit is not None:
        d = self.time_elapsed >= self.time_limit or d
    return EnvStep(np.array(self.state), r, d,
                   EnvInfo(**info, state=self.state))
def step(self, action): """ Returns: obs reward done log """ print(type(action)) # assert action in [0, 1], action # if action[0] == 0 and self.cur_pos > 0: # self.cur_pos -= 1 # elif action[0] == 1: # self.cur_pos += 1 if action == 0 and self.cur_pos > 0: self.cur_pos -= 1 elif action == 1: self.cur_pos += 1 done = self.cur_pos >= self.end_pos # info = EnvInfo(game_score=game_score, traj_done=game_over) info = None reward = 1 if done else 0 self._step_counter += 1 return EnvStep(self.get_obs(), reward, done, info)
def step(self, action):
    timestep = self.env.step(action)
    self._last_observation = timestep.observation
    reward = timestep.reward or 0.
    if timestep.last():
        self.game_over = True
    return EnvStep(timestep.observation, reward, timestep.last(), EnvInfo())
def step(self, action):
    time_step = self._env.step(action)
    _ = dict(time_step.observation)
    obs = self.render()
    reward = time_step.reward or 0
    done = time_step.last()
    info = EnvInfo(np.array(time_step.discount, np.float32), None, done)
    return EnvStep(obs, reward, done, info)
def step(self, action):
    if self.player_turn:
        self.player_turn = False
        a = self.player_action_space.revert(action)
        if a.size <= 1:
            a = a.item()
        o, r, d, info = self.env.step(a)
        self.last_obs = o
        self.last_action = a
        obs = self.observer_observation_space.convert(o)
        if self.time_limit:
            if "TimeLimit.truncated" in info:
                info["timeout"] = info.pop("TimeLimit.truncated")
            else:
                info["timeout"] = False
        self.last_info = info
        if isinstance(r, float):
            r = np.dtype("float32").type(r)  # Scalar float32.
        self.last_reward = r
        self.curr_episode_length += 1
        if self.curr_episode_length >= self.max_episode_length:
            d = True
        self.last_done = d
        return EnvStep(obs, r, d, info)
    else:
        r_action = self.observer_action_space.revert(action)
        r_action = self.obs_action_translator(r_action, self.window_size,
                                               self.obs_size)
        self.player_turn = True
        self.last_obs_act = r_action
        masked_obs = np.multiply(r_action, self.last_obs)
        info = self.last_info
        r = self.last_reward
        d = self.last_done
        if self.add_channel:
            masked_obs = np.concatenate([r_action, masked_obs], axis=0)
        else:
            masked_obs[r_action == 0] = -1
        obs = self.player_observation_space.convert(masked_obs)
        return EnvStep(obs, r, d, info)
def step(self, action):
    obs, reward, done, info = super().step(action)
    # Fix the labels later in GoalWrapper.
    updated_info = EnvInfo(game_score=info.game_score,
                           traj_done=info.traj_done,
                           labels=self.static_info,
                           goal_labels=self.static_info)
    return EnvStep(obs, reward, done, updated_info)
def step(self, actions): """ Action is either a single value (discrete, one-hot), or a tuple with an action for each of the discrete action subspaces. """ action = actions.item() obs, rew, done, info = self.env.step(action) # print('Obs shape as returned by the env:', obs.shape) fake_info = tuple() # do not need for testing return EnvStep(obs, rew, done, fake_info)
def step(self, action):
    action = {self._node_id: action}
    obs, reward, done, info = self._env.step(action)
    self.obs = self.transform_obs(obs)
    done = done[self._node_id]
    reward = reward[self._node_id]
    info = info[self._node_id]
    info = EnvInfo(None, 0, done)
    return EnvStep(self.obs, reward, done, info)
def step(self, action):
    reward = self._lab.step(self._action_set[action])
    finished = not self._lab.is_running()
    if not finished:
        self._update_obs()
    self._total_reward += reward
    self._step_counter += 1
    return EnvStep(self.get_obs(), reward, finished, EnvInfo(
        total_reward=self._total_reward,
        traj_done=finished,
    ))
def step(self, action): a = self.action_space.revert(action) o, r, d, info = self.env.step(a) obs = self.observation_space.convert(o) if self._time_limit: if "TimeLimit.truncated" in info: info["timeout"] = info.pop("TimeLimit.truncated") else: info["timeout"] = False info = info_to_nt(info) return EnvStep(obs, r, d, info)
def step(self, action):
    time_step = self._env.step(action)
    obs = dict(time_step.observation)
    state_obs = np.concatenate([value for key, value in obs.items()])
    img_obs = self.render()
    reward = time_step.reward or 0
    done = time_step.last()
    info = EnvInfo(np.array(time_step.discount, np.float32), None, done)
    obs = StateObs(img_obs, state_obs) if self.use_state else img_obs
    return EnvStep(obs, reward, done, info)
def step(self, action):
    self.iter += 1
    velocity = np.linalg.norm(action - self.state)
    self.state = action.copy()
    dist = np.linalg.norm(self.state - self.goal)
    rewards = dict()
    rewards['goal'] = 0.9 * np.exp(-0.5 * 10 * dist)
    rewards['vel'] = 0.1 * np.exp(-0.5 * 10 * velocity)
    rewards['col'] = (-1 * np.exp(-0.5 * self.colision_dist(self.state))
                      * self.in_collision(self.state))
    return EnvStep(self.get_obs(), sum(rewards.values()) / self.horizon,
                   self.iter == self.horizon, EnvInfo())
def step(self, action):
    assert self._step is not None, 'Must reset environment.'
    obs, reward, done, info = self.env.step(action)
    self._step += 1
    if self._step >= self._duration:
        if isinstance(info, EnvInfo):
            # The last field of EnvInfo marks termination of the trajectory.
            # We do not set done = True here because `done` should only be
            # controlled by the environment itself.
            info = EnvInfo(info.discount, info.game_score, True)
        self._step = None
    return EnvStep(obs, reward, done, info)
def step(self, action): """Reverts the action from rlpyt format to gym format (i.e. if composite-to- dictionary spaces), steps the gym environment, converts the observation from gym to rlpyt format (i.e. if dict-to-composite), and converts the env_info from dictionary into namedtuple.""" a = self.action_space.revert(action) o, r, d, info = self.env.step(a) obs = self.observation_space.convert(o) if self._time_limit: if "TimeLimit.truncated" in info: info["timeout"] = info.pop("TimeLimit.truncated") else: info["timeout"] = False info = info_to_nt(info, self._info_schemas) return EnvStep(obs, r, d, info)
def step(self, action, sym_features=None):
    """
    :param sym_features: if given, this is a safety-wrapped environment
    """
    constraint_used = False
    if self._actions is not None:
        action = self._actions[action]
    else:
        # Continuous actions from agents are in [-1, 1]; convert back here.
        action = (action + 1) / 2 * self._action_range + self._action_lb
    if sym_features is not None or self.oracle_safety:
        current_state = self._env.current_oracle_state()
        if sym_features is not None:
            sym_features = sym_features.squeeze()
            nan_idx = np.isnan(sym_features)
            sym_features[nan_idx] = current_state[nan_idx]
        else:  # oracle safety
            sym_features = current_state
        if not self._env.constraint_func(action, sym_features):
            constraint_used = True
            action = self.constrained_sample(sym_features)
            if action is None:
                if self._fallback_action is None:
                    raise ValueError(
                        "No safe action found! Consider adding fallback.")
                action = self._fallback_action
        if self.log_unsafe_transitions:
            unsafe_info = {
                "oracle": self._env.current_oracle_state(),
                "sym_feats": sym_features.copy(),
                "img": self._env.render(),
                "action": action.copy(),
                "constraint_used": constraint_used,
            }
    obs, reward, done, info = self._env.step(action)
    info = SafetyEnvInfo(info["unsafe"], 0, constraint_used)
    if (self.log_unsafe_transitions and info.action_unsafe
            and sym_features is not None):
        debug_dir = Path.home() / "debug"
        debug_dir.mkdir(exist_ok=True)
        i = str(np.random.randint(1000))
        unsafe_info["oracle_next"] = self._env.current_oracle_state()
        (debug_dir / f"{i}.pkl").write_bytes(pickle.dumps(unsafe_info))
    return EnvStep(obs, reward, done, info)
def step(self, action): a = self._action_set[action] game_score = np.array(0., dtype="float32") for _ in range(self._frame_skip - 1): game_score += self.ale.act(a) self._get_screen(1) game_score += self.ale.act(a) lost_life = self._check_life() # Advances from lost_life state. if lost_life and self._episodic_lives: self._reset_obs() # Internal reset. self._update_obs() reward = np.sign(game_score) if self._clip_reward else game_score game_over = self.ale.game_over() or self._step_counter >= self.horizon done = game_over or (self._episodic_lives and lost_life) info = EnvInfo(game_score=game_score, traj_done=game_over) self._step_counter += 1 return EnvStep(self.get_obs(), reward, done, info)
def step(self, action): """Reverts the action from rlpyt format to gym format (i.e. if composite-to- dictionary spaces), steps the gym environment, converts the observation from gym to rlpyt format (i.e. if dict-to-composite), and converts the env_info from dictionary into namedtuple.""" a = self.action_space.revert(action) o, r, d, info = self.env.step(a) obs = self.observation_space.convert(o.transpose((2, 0, 1))) if self._time_limit: if "TimeLimit.truncated" in info: info["timeout"] = info.pop("TimeLimit.truncated") else: info["timeout"] = False info = info_to_nt(info) if isinstance(r, float): r = np.dtype("float32").type(r) # Scalar float32. return EnvStep(obs, r, d, info)
def step(self, action): """ Take step with action, then observe result """ self._time += 1 # Update time step timeout = False self._action[:] = action if self._time_limit and self._time >= self._time_limit: self._action[:] = -1 # Can force a reset by taking action of -1 self._time = 0 # Reset time timeout = True self.env.act(self._action) r, self._o, d = self.env.observe() r, d = r.squeeze(), d.squeeze() if d: self._reset_obs() # If done, reset the stacked frames self._update_obs() # Add newest observation to stacked frames o = self._get_obs() # Get stacked observation in correct order return EnvStep(o, r, d, EnvInfo(timeout=timeout))
def step(self, action):
    assert self._norm_action_space.contains(action)
    action = self._convert_action(action)
    assert self._true_action_space.contains(action)
    reward = 0
    extra = {'internal_state': self._env.physics.get_state().copy()}
    for _ in range(self._frame_skip):
        time_step = self._env.step(action)
        reward += time_step.reward or 0
        done = time_step.last()
        if done:
            break
    obs = self._get_obs(time_step)
    extra['discount'] = time_step.discount
    extra['traj_done'] = done
    extra['game_score'] = reward
    info = self.info_class(**extra)
    return EnvStep(obs, reward, done, info)
def step(self, action):
    if action == 0 and self.y < self.h - 1:
        if self.grid[self.x, self.y + 1] != 2:
            self.y += 1
    elif action == 1 and self.x < self.w - 1:
        if self.grid[self.x + 1, self.y] != 2:
            self.x += 1
    elif action == 2 and self.y > 0:
        if self.grid[self.x, self.y - 1] != 2:
            self.y -= 1
    elif action == 3 and self.x > 0:
        if self.grid[self.x - 1, self.y] != 2:
            self.x -= 1
    else:
        # Stand still.
        pass
    info = EnvInfo(traj_done=False, labels=None, goal_labels=None)
    return EnvStep(self._get_grid(), 0, False, info)
def step(self, action):
    total_reward = 0.0
    for step in range(self._action_repeat):
        _, reward, done, info = self._env.step(action)
        total_reward += reward
        if self._life_done:
            lives = self._env.ale.lives()
            done = done or lives < self._lives
            self._lives = lives
        if done:
            break
        elif step >= self._action_repeat - 2:
            index = step - (self._action_repeat - 2)
            if self._grayscale:
                self._env.ale.getScreenGrayscale(self._buffers[index])
            else:
                self._env.ale.getScreenRGB2(self._buffers[index])
    obs = self._get_obs()
    env_info = EnvInfo(None, total_reward, done, None)
    return EnvStep(obs, total_reward, done, env_info)
def step(self, action): a = self._action_set[action] game_score = np.array(0., dtype="float32") for _ in range(self._frame_skip - 1): game_score += self.ale.act(a) self._get_screen(1) game_score += self.ale.act(a) lost_life = self._check_life() # Advances from lost_life state. if lost_life and self._episodic_lives: self._reset_obs() # Internal reset. self._update_obs() reward = np.sign(game_score) if self._clip_reward else game_score game_over = self.ale.game_over() or self._step_counter >= self.horizon done = game_over or (self._episodic_lives and lost_life) # Include reporting of current room ID in Montezuma Revenge (stored at RAM address 3) info = MontezumaEnvInfo(game_score=game_score, traj_done=game_over, room_id=self.ale.getRAM()[3]) self._step_counter += 1 return EnvStep(self.get_obs(), reward, done, info)
def step(self, action):
    cont_action = np.array(self.action_map[action]).astype(float)
    done = False
    for _ in range(self.frame_skip - 1):
        env_step = self.env.step(cont_action, blind=True)
        if env_step.last():
            done = True
            break
    if not done:
        env_step = self.env.step(cont_action)
        done = env_step.last()
        self._update_obs(env_step.observation['pixels'])
    self.last_obs = env_step.observation
    if self.static_labels is None:
        self.static_labels = self.labels()
    info = EnvInfo(traj_done=done, labels=self.static_labels,
                   goal_labels=self.static_labels)
    return EnvStep(self.get_obs(), 0, done, info)
def step(self, action):
    step_results = None
    if action == MiniGridEnv.Actions.forward:
        # Go forward action; no slippage for other actions.
        if np.random.uniform() < self.slipperiness:
            action = random.choice(["left", "right"])
            # By default, the agent can only move in a direction if it's facing
            # that way. We can model slippage by turning a direction, moving
            # that way, then turning back, and only counting it as a single
            # action.
            self._env.step_count -= 2
            if action == "left":
                self._env.step(MiniGridEnv.Actions.left)
                self._env.step(MiniGridEnv.Actions.forward)
                step_results = self._env.step(MiniGridEnv.Actions.right)
            else:
                self._env.step(MiniGridEnv.Actions.right)
                self._env.step(MiniGridEnv.Actions.forward)
                step_results = self._env.step(MiniGridEnv.Actions.left)
    if step_results is None:
        step_results = self._env.step(action)
    obs, reward, done, info = step_results
    obs = StateObs(obs['image'],
                   np.concatenate([obs['mission'], obs['direction']]))
    info = EnvInfo(None, None, done)
    return EnvStep(obs, reward, done, info)
def step(self, action: np.ndarray):
    """
    Args:
        action: [int, int]
    Return:
        obs: target_im (H, W, C), cur_im (H, W, C), field_info (x0, y0)
    """
    i_item = 0
    dmove = np.array([HOR_MOVE[action], VER_MOVE[action]], dtype=np.float32)
    xy0 = self.cur_coord[i_item, :2] + dmove
    self.cur_coord[i_item] = np.concatenate((xy0, xy0 + self.obj_wh), axis=0)
    self.cur_im = self._render(self.cur_coord)
    reward = self._reward(self.cur_coord, self.target_coord)
    done = self.cur_step >= MAX_STEP
    info = EnvInfo()
    self.cur_step += 1
    return EnvStep(self._obs(), reward, done, info)
def step(self, action):
    if self.player_turn:
        self.player_turn = False
        a = self.player_action_space.revert(action)
        if a.size <= 1:
            a = a.item()
        o, r, d, info = self.env.step(a)
        self.last_obs = o
        self.last_action = a
        if self.serial:
            obs = np.concatenate(
                [np.zeros(self.last_obs_act.shape), self.last_masked_obs])
        else:
            obs = np.concatenate([self.last_obs_act, self.last_masked_obs])
        if self.inc_player_last_act:
            obs = np.append(obs, a)
        obs = self.observer_observation_space.convert(obs)
        if self.time_limit:
            if "TimeLimit.truncated" in info:
                info["timeout"] = info.pop("TimeLimit.truncated")
            else:
                info["timeout"] = False
        self.last_info = info["timeout"]
        info = False
        if isinstance(r, float):
            r = np.dtype("float32").type(r)  # Scalar float32.
        self.last_reward = r
        # if (not d) and (self.observer_reward_shaping is not None):
        #     r = self.observer_reward_shaping(r, self.last_obs_act)
        self.curr_episode_length += 1
        if self.curr_episode_length >= self.max_episode_length:
            d = True
        self.last_done = d
        return EnvStep(obs, r, d, info)
    else:
        if not np.array_equal(action, action.astype(bool)):
            action = np.random.binomial(1, action)
        r_action = self.observer_action_space.revert(action)
        if self.serial:
            if self.fully_obs:
                r_action = 1
            elif self.rand_obs:
                r_action = random.randint(0, 1)
            self.ser_cum_act[self.ser_counter] = r_action
            self.ser_counter += 1
            if self.ser_counter == self.obs_size:
                self.player_turn = True
                self.ser_counter = 0
                masked_obs = np.multiply(
                    np.reshape(self.ser_cum_act, self.last_obs.shape),
                    self.last_obs)
                self.last_masked_obs = masked_obs
                self.last_obs_act = self.ser_cum_act.copy()
                self.ser_cum_act = np.zeros(
                    self.env.env.observation_space.shape)
                r = self.last_reward
                # if self.player_reward_shaping is not None:
                #     r = self.player_reward_shaping(r, self.last_obs_act)
                d = self.last_done
                info = self.last_info
                obs = np.concatenate([
                    np.reshape(self.last_obs_act, masked_obs.shape),
                    masked_obs
                ])
                obs = self.player_observation_space.convert(obs)
            else:
                r = 0
                info = False
                obs = np.concatenate([
                    np.reshape(self.ser_cum_act, self.last_masked_obs.shape),
                    self.last_masked_obs
                ])
                if self.inc_player_last_act:
                    obs = np.append(obs, self.last_action)
                obs = self.observer_observation_space.convert(obs)
                d = False
        else:
            if not self.cont_act:
                r_action = self.obs_action_translator(
                    r_action, self.power_vec, self.obs_size)
            if self.fully_obs:
                r_action = np.ones(r_action.shape)
            elif self.rand_obs:
                r_action = np.random.randint(0, 2, r_action.shape)
            self.player_turn = True
            self.last_obs_act = r_action
            masked_obs = np.multiply(
                np.reshape(r_action, self.last_obs.shape), self.last_obs)
            self.last_masked_obs = masked_obs
            info = self.last_info
            r = self.last_reward
            # if self.player_reward_shaping is not None:
            #     r = self.player_reward_shaping(r, r_action)
            d = self.last_done
            obs = np.concatenate(
                [np.reshape(r_action, masked_obs.shape), masked_obs])
            obs = self.player_observation_space.convert(obs)
        return EnvStep(obs, r, d, info)