def step(self, action):
    if self._reset_next_step:
        return self.reset()
    observation, reward, done, info = self._environment.step(action)
    self._goals_achieved.append(info['goal_achieved'])
    success = self._environment.evaluate_success([{
        'env_infos': {
            'goal_achieved': self._goals_achieved
        }
    }])
    info['success'] = bool(success)
    if self._end_on_success:
        done = done or success
    for k in info:
        assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
    observation = self._wrap_observation(observation, info)
    self._reset_next_step = done
    if done:
        truncated = info.get('TimeLimit.truncated', False)
        if truncated:
            return dm_env.truncation(reward, observation)
        return dm_env.termination(reward, observation)
    return dm_env.transition(reward, observation)
def _step(self, action: int) -> dm_env.TimeStep:
    self._timestep += 1

    ## update agent
    agent = self.locate("P")
    reward = 0.0
    vector = Actions(action).vector()
    location = (
        max(0, min(agent[0] + vector[0], self.shape[0])),
        max(0, min(agent[1] + vector[1], self.shape[1])),
    )
    # hit a wall, go back (diagonal moves are never done partially)
    if self.art[location] == "#":
        location = agent
    # stepped on object, compute reward
    if self.art[location] in [obj.symbol for obj in self.objects]:
        obj = [x for x in self.objects if x.symbol == self.art[location]]
        reward = obj[0].reward if len(obj) > 0 else 0.0
    # set new agent position
    self.art[agent] = " "
    self.art[location] = "P"

    ## update environment, let it be ❤
    for obj in self.objects:
        missing = obj.n - len(self.locate(obj.symbol))
        for _ in range(missing):
            # termination probability
            if self._rng.random() < obj.eps_term:
                return dm_env.termination(reward, self._get_observation())
            # respawning probability
            if self._rng.random() < obj.eps_respawn:
                self.spawn(obj.symbol)
    return dm_env.transition(reward, self._get_observation())
def step(self, action):
    if self._reset_next_step:
        return self.reset()
    self._state = cartpole.step_cartpole(
        action=action,
        timescale=self._timescale,
        state=self._state,
        config=self._cartpole_config,
    )

    # Rewards only when the pole is central and balanced
    is_upright = (np.cos(self._state.theta) > self._height_threshold
                  and np.abs(self._state.theta_dot) < self._theta_dot_threshold
                  and np.abs(self._state.x) < self._x_reward_threshold)
    reward = -1. * np.abs(action - 1) * self._move_cost
    self._steps_elapsed += 1
    if is_upright:
        reward += 1.
    self._raw_return += reward
    self._episode_return += reward
    self._best_episode = max(self._episode_return, self._best_episode)

    # is_end_of_episode = (self._state.time_elapsed > self._max_time
    is_end_of_episode = (self._steps_elapsed > self._max_steps
                         or np.abs(self._state.x) > self._x_threshold)
    if is_end_of_episode:
        self._reset_next_step = True
        return dm_env.termination(reward=reward, observation=self.observation)
    else:  # Continuing transition.
        return dm_env.transition(reward=reward, observation=self.observation)
def _step(self, action):
    reward = 0.
    action_right = action == self._action_mapping[self._row, self._column]

    # Reward calculation
    if self._column == self._size - 1 and action_right:
        reward += 1.
        self._denoised_return += 1.
    if not self._deterministic:
        # Noisy rewards on the 'end' of chain.
        if self._row == self._size - 1 and self._column in [0, self._size - 1]:
            reward += self._rng.randn()

    # Transition dynamics
    if action_right:
        if self._rng.rand() > 1 / self._size or self._deterministic:
            self._column = np.clip(self._column + 1, 0, self._size - 1)
        reward -= self._unscaled_move_cost / self._size
    else:
        if self._row == self._column:  # You were on the right path and went wrong
            self._bad_episode = True
        self._column = np.clip(self._column - 1, 0, self._size - 1)
    self._row += 1

    observation = self._get_observation()
    if self._row == self._size:
        if self._bad_episode:
            self._total_bad_episodes += 1
        return dm_env.termination(reward=reward, observation=observation)
    else:
        return dm_env.transition(reward=reward, observation=observation)
def _step(self, action: int) -> dm_env.TimeStep: """+1/-1 for correct/incorrect guesses. This also terminates the episode.""" correct = action == self._correct_label reward = 1. if correct else -1. self._total_regret += self._optimal_return - reward observation = np.zeros(shape=self._image_shape, dtype=np.float32) return dm_env.termination(reward=reward, observation=observation)
def step(self, action):
    if self._reset_next_step:
        return self.reset()
    self._state = step_cartpole(
        action=action,
        timescale=self._timescale,
        state=self._state,
        config=self._cartpole_config,
    )

    # Rewards only when the pole is central and balanced
    is_reward = (np.cos(self._state.theta) > self._height_threshold
                 and np.abs(self._state.x) < self._x_threshold)
    reward = 1. if is_reward else 0.
    self._raw_return += reward
    self._episode_return += reward
    self._best_episode = max(self._episode_return, self._best_episode)

    if self._state.time_elapsed > self._max_time or not is_reward:
        self._reset_next_step = True
        return dm_env.termination(reward=reward, observation=self.observation)
    else:  # Continuing transition.
        return dm_env.transition(reward=reward, observation=self.observation)
def step(self, action): """Updates the environment according to the action.""" if self._reset_next_step: return self.reset() # Insert token if column isn't full if column is full if self._col_heights[action] < N_HEIGHT: target_cell = action * N_HEIGHT + self._col_heights[action] target_player = 0 if self._player_one_turn else 1 self._board[target_player] |= 1 << target_cell self._col_heights[action] += 1 else: print("Illegal move!") self._player_one_turn = not self._player_one_turn # Check for termination. if self.is_terminal(): reward = 1.0 if self._winner == 0 else -1.0 if self._winner == 1 else 0.0 self._reset_next_step = True return dm_env.termination(reward=reward, observation=self._observation()) else: return dm_env.transition(reward=0.0, observation=self._observation())
def step(self, action): """Step the environment with an action.""" if self._reset_next_step: return self.reset() # Apply the game_rules for rule in self.game_rules: rule.step(self._state, self._meta_state) # Apply the action self.action_space.step(self._state, action) # Step the physics self.physics.step(self._state) # Compute reward self.step_count += 1 reward, should_reset = self.task.reward(self._state, self._meta_state, self.step_count) # Take observation observation = self.observation() # Return transition if should_reset: self._reset_next_step = True return dm_env.termination(reward=reward, observation=observation) else: return dm_env.transition(reward=reward, observation=observation)
def step(self, action: int) -> dm_env.TimeStep:
    dm_env_step = self.dm_env.step(action)
    # Hack: treat the reward as 0 when dm_env_step.reward is None, which
    # happens in the case of restart().
    self._raw_return += 0. if dm_env_step.reward is None else dm_env_step.reward
    self._episode_return += 0. if dm_env_step.reward is None else dm_env_step.reward

    if self.gym_env.total_transitions_episode > self.max_episode_len:
        self._best_episode = max(self._episode_return, self._best_episode)
        dm_env_step = dm_env.truncation(dm_env_step.reward,
                                        dm_env_step.observation)

    # Hack (TODO): the bsuite/baselines/tf/dqn agent doesn't allow discrete
    # states, so one-hot encode the observation.
    ohe_obs = np.zeros(shape=(self.gym_env.observation_space.n,),
                       dtype=np.float32)
    ohe_obs[dm_env_step.observation] = 1
    # dm_env_step.observation = ohe_obs

    # Return the corresponding TimeStep object based on step_type.
    if dm_env_step.step_type == StepType.FIRST:
        return dm_env.restart(ohe_obs)
    elif dm_env_step.step_type == StepType.LAST:
        return dm_env.termination(dm_env_step.reward, ohe_obs)
    else:
        return dm_env.transition(dm_env_step.reward, ohe_obs)
def make_timestep_from_step_type_string(step_type_str, observation):
    if step_type_str == 'f':
        return dm_env.restart(observation=observation)
    elif step_type_str == 'm':
        return dm_env.transition(reward=0, observation=observation)
    elif step_type_str == 'l':
        return dm_env.termination(reward=0, observation=observation)
    else:
        raise ValueError('Unknown step type string %s.' % step_type_str)
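# Usage sketch (not part of the original source): assuming the helper above is
# in scope and dm_env/numpy are importable, it maps a step-type string such as
# 'fmml' to the corresponding sequence of TimeSteps.
import dm_env
import numpy as np

step_type_string = 'fmml'  # FIRST, two MID steps, LAST.
observations = [np.full((2,), i, dtype=np.float32)
                for i in range(len(step_type_string))]
timesteps = [make_timestep_from_step_type_string(s, obs)
             for s, obs in zip(step_type_string, observations)]
assert timesteps[0].first() and timesteps[-1].last()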
def _step(self, action: int) -> dm_env.TimeStep:
    self._timestep += 1

    ## update agent
    reward = 0.0
    vector = Actions(action).vector()
    location = (
        max(0, min(self._agent_location[0] + vector[0], self.shape[0])),
        max(0, min(self._agent_location[1] + vector[1], self.shape[1])),
    )
    # hit a wall, go back (diagonal moves are never done partially)
    if self.art[location] == "#":
        location = self._agent_location
    # stepped on object, compute reward
    if self.art[location] in [obj.symbol for obj in self.objects]:
        obj = [x for x in self.objects if x.symbol == self.art[location]]
        if len(obj) > 0:
            reward = obj[0].reward
            # termination probability
            if self._rng.random() < obj[0].eps_term:
                return dm_env.termination(reward, self._get_observation())
    # set new agent position
    self.art[self._agent_location] = " "
    self.art[location] = "P"
    self._agent_location = location

    ## update environment, let it be ❤
    for obj in self.objects:
        for i, location in enumerate(self._object_locations[obj.symbol]):
            if self.art[location] != obj.symbol:
                # respawning probability
                if self._rng.random() < obj.eps_respawn:
                    self._object_locations[obj.symbol][i] = self.spawn(
                        obj.symbol, location)

    # End the episode once the maximum number of steps is reached.
    if self._timestep == self.max_steps:
        return dm_env.termination(reward, self._get_observation())
    return dm_env.transition(reward, self._get_observation())
def _step(self, action: np.ndarray) -> dm_env.TimeStep: """Does one step within TCV.""" voltages = self._noise.add_action_noise(action) voltage_simulator = self._simulator_voltages_from_voltages(voltages) try: state = self._simulator.step(voltage_simulator) except (fge_state.InvalidSolutionError, fge_state.StopSignalException): return dm_env.termination(self._reward.terminal_reward(), self._last_observation) references = self._reference_generator.step() self._last_observation = self._extract_observation( state, references, action) term = self._termination.terminate(state) if term: return dm_env.termination(self._reward.terminal_reward(), self._last_observation) reward, _ = self._reward.reward(voltages, state, references) self._step_counter += 1 if self._step_counter >= self._max_episode_length: return dm_env.truncation(reward, self._last_observation) return dm_env.transition(reward, self._last_observation)
def step(self, action: int) -> dm_env.TimeStep:
    if self._reset_next_step:
        return self.reset()
    # Convert the gym step result to a dm_env TimeStep.
    obs, reward, done, _ = self.gym_env.step(action)
    if done:
        self._reset_next_step = True
        return dm_env.termination(reward, obs)
    else:
        return dm_env.transition(reward, obs)
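# Context sketch (illustrative, not from any of the wrappers above): a minimal
# driver showing how these step() methods are typically consumed. The names
# run_episode and select_action are hypothetical.
import dm_env

def run_episode(env: dm_env.Environment, select_action) -> float:
    """Runs one episode and returns the sum of rewards."""
    episode_return = 0.0
    timestep = env.reset()
    while not timestep.last():
        action = select_action(timestep.observation)
        timestep = env.step(action)
        episode_return += timestep.reward or 0.0  # Guard against None rewards.
    return episode_return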
def step(self, action: np.ndarray) -> dm_env.TimeStep:
    # Reset if previous timestep was LAST.
    if self._reset_next_step:
        return self.reset()

    # Take an environment step.
    observation, reward, done = self._environment.step(action)
    self._reset_next_step = done
    # A True `done` means this is the LAST timestep of the episode.
    if done:
        return dm_env.termination(reward=reward, observation=observation)
    return dm_env.transition(reward=reward, observation=observation)
def step(self, action: types.NestedArray) -> dm_env.TimeStep:
    if self._reset_next_step:
        return self.reset()

    observation, reward, done, info = self._environment.step(action)
    # `done` may be a per-agent dict or a single flag.
    self._reset_next_step = (all(done.values()) if isinstance(done, dict)
                             else bool(done))
    if self._reset_next_step:
        truncated = info.get('TimeLimit.truncated', False)
        if truncated:
            return dm_env.truncation(reward, observation)
        return dm_env.termination(reward, observation)
    return dm_env.transition(reward, observation)
def step(self, action: List[np.ndarray]) -> dm_env.TimeStep: """Steps the environment.""" if self._reset_next_step: return self.reset() observation, reward, done, _ = self._environment.step(action[0].item()) self._reset_next_step = done observation = self._wrap_observation(observation) if done: return dm_env.termination(reward, observation) return dm_env.transition(reward, observation)
def step(self, action): """Performs an environment step.""" # If the environment has just been created or finished an episode # we should reset it (ignoring the action). if self._prev_step_type in {None, environment.StepType.LAST}: return self.reset() for k in action.keys(): self._action_spec[k].validate(action[k]) locations, flag, pressure, log_size, red, green, blue = ( self._process_action(action)) loc_control, loc_end = locations # Perform action. self._surface.BeginAtomic() if flag == 1: # The agent produces a visible stroke. self._action_mask = self._action_masks["paint"] y_c, x_c = loc_control y_e, x_e = loc_end self._bezier_to(y_c, x_c, y_e, x_e, pressure, log_size, red, green, blue) # Update episode statistics. self.stats["total_strokes"] += 1 if not self._prev_brush_params["is_painting"]: self.stats["total_disjoint"] += 1 elif flag == 0: # The agent moves to a new location. self._action_mask = self._action_masks["move"] y_e, x_e = loc_end self._move_to(y_e, x_e) else: raise ValueError("Invalid flag value") self._surface.EndAtomic() # Handle termination of the episode. reward = 0 self._episode_step += 1 if self._episode_step == self._episode_length: time_step = environment.termination(reward=reward, observation=self.observation()) else: time_step = environment.transition(reward=reward, observation=self.observation(), discount=self._discount) self._prev_step_type = time_step.step_type return time_step
def _step(self, action: int) -> dm_env.TimeStep:
    if self._timestep == 0:
        self._context = action
    self._timestep += 1

    if self._timestep == self._reward_timestep[self._context]:
        reward = self._rewards[self._context]
    else:
        reward = 0.

    observation = self._get_observation()
    if self._timestep == self._episode_len:
        return dm_env.termination(reward=reward, observation=observation)
    return dm_env.transition(reward=reward, observation=observation)
def reset(self) -> dm_env.TimeStep:
    """Reset the environment and start a new episode."""
    observation = self._sim.reset(self._sim_input, False)
    # Necessary when willingness is set to 0 and Rainbow agents are disabled.
    if self.sim_finished():
        self._start_of_episode = True
        return dm_env.termination(None, observation)
    self._start_of_episode = False
    return dm_env.restart(observation)
def step(self, action: np.ndarray): """Updates the environment according to the action.""" if self._reset_next_step: return self.reset() self.defended = np.logical_or(self.defended, action) self.burn_vertices() if self._reset_next_step: return dm_env.termination(reward=0.0, observation=self._observation()) return dm_env.transition(reward=-1.0, observation=self._observation())
def step(self, action: types.NestedArray) -> dm_env.TimeStep: """Steps the environment.""" if self._reset_next_step: return self.reset() observation, reward, done, info = self._environment.step(action) self._reset_next_step = done if done: truncated = info.get('TimeLimit.truncated', False) if truncated: return dm_env.truncation(reward, observation) return dm_env.termination(reward, observation) return dm_env.transition(reward, observation)
def step(self, action): """Step the environment with an action.""" if self._reset_next_step: return self.reset() self._read_action(self._action_spec, action) self._env.act_discrete(self._act_discrete) self._env.act_continuous(self._act_continuous) self._env.act_text(self._act_text) self._status, reward = self._env.advance() if self._status != dmlab2d.RUNNING: self._reset_next_step = True return dm_env.termination(reward=reward, observation=self.observation()) else: return dm_env.transition(reward=reward, observation=self.observation())
def step(self, action):
    if self._reset_next_step:
        return self.reset()
    observation, reward, terminal, _ = self._env.step(action.item())
    observation = observation.squeeze(-1)
    discount = 1 - float(terminal)
    self._episode_steps += 1
    if terminal:
        self._reset_next_step = True
        return dm_env.termination(reward, observation)
    elif self._episode_steps == self._max_episode_steps:
        self._reset_next_step = True
        return dm_env.truncation(reward, observation, discount)
    else:
        return dm_env.transition(reward, observation, discount)
def step(self, action):
    if self._reset_next_step:
        return self.reset()
    observation, reward, done, info = self._environment.step(action)
    for k in info:
        assert k in self._info_defaults.keys() | {'TimeLimit.truncated'}
    observation = self._wrap_observation(observation, info)
    self._reset_next_step = done
    if done:
        truncated = info.get('TimeLimit.truncated', False)
        if truncated:
            return dm_env.truncation(reward, observation)
        return dm_env.termination(reward, observation)
    return dm_env.transition(reward, observation)
def make_trajectory(observations):
    """Make a simple trajectory from a sequence of observations.

    Arguments:
      observations: a sequence of observations.

    Returns:
      a tuple (first, steps) where first contains the initial dm_env.TimeStep
      object and steps contains a list of (action, step) tuples. The length of
      steps is len(observations) - 1.
    """
    first = dm_env.restart(observations[0])
    middle = [(0, dm_env.transition(reward=0.0, observation=observation))
              for observation in observations[1:-1]]
    last = (0, dm_env.termination(reward=0.0, observation=observations[-1]))
    return first, middle + [last]
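# Usage sketch (not part of the original source): assuming make_trajectory and
# dm_env/numpy are importable, the hypothetical observations below yield one
# FIRST step followed by (action, TimeStep) pairs ending in a LAST step.
import numpy as np

observations = [np.array([float(i)]) for i in range(4)]
first, steps = make_trajectory(observations)
assert first.first()
assert all(step.mid() for _, step in steps[:-1])
assert steps[-1][1].last()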
def _step(self, action):
    observation = self._get_observation()
    self._timestep += 1

    # On all but the last step provide a reward of 0.
    if self._timestep - 1 < self._memory_length:
        return dm_env.transition(reward=0., observation=observation)
    elif self._timestep - 1 == self._memory_length:
        if action == self._context[self._query]:
            reward = 1.
            self._total_perfect += 1
        else:
            reward = -1.
            self._total_regret += 2.
        return dm_env.termination(reward=reward, observation=observation)
def step(self, action: int) -> dm_env.TimeStep:
    if self._reset_next_step:
        return self.reset()
    # Convert the gym step result to a dm_env TimeStep.
    observation, reward, done, info = self.gym_env.step(action)
    self._reset_next_step = done
    if done:
        is_truncated = info.get('TimeLimit.truncated', False)
        if is_truncated:
            return dm_env.truncation(reward, observation)
        else:
            return dm_env.termination(reward, observation)
    else:
        return dm_env.transition(reward, observation)
def _step(self, action: int) -> dm_env.TimeStep:
    observation = self._get_observation()
    self._timestep += 1

    if self._timestep - 1 < self._memory_length:
        # On all but the last step provide a reward of 0.
        return dm_env.transition(reward=0., observation=observation)
    if self._timestep - 1 > self._memory_length:
        raise RuntimeError('Invalid state.')  # We shouldn't get here.

    if action == self._context[self._query]:
        reward = 1.
        self._total_perfect += 1
    else:
        reward = -1.
        self._total_regret += 2.
    return dm_env.termination(reward=reward, observation=observation)
def step(self): """Step the environment, returning an observation.""" if self._reset_next_step: return self.reset() self._step_count += 1 for _ in range(self._physics_steps_per_env_step): self.physics_step() observation = self.observation() if self.should_terminate(): self._reset_next_step = True return dm_env.termination(reward=0, observation=observation) else: return dm_env.transition(reward=0, observation=observation)
def step(self, action):
    if self._needs_reset:
        return self.reset()
    lab_action = np.empty(self._action_count, dtype=np.dtype("int32"))
    for name, value in six.iteritems(action):
        lab_action[self._action_map[name]] = value
    reward = self._lab.step(lab_action)
    if self._lab.is_running():
        return dm_env.transition(reward=reward, observation=self._observation())
    else:
        self._needs_reset = True
        return dm_env.termination(reward=reward, observation=self._observation())