def _step(self, action):
  if action < self._action_spec.minimum or action > self._action_spec.maximum:
    raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
        self._action_spec.minimum, self._action_spec.maximum, action))

  if self._state >= self._final_state:
    # Start a new episode. Ignore action.
    self._state = 0
    return ts.restart(self._state)

  self._state += action
  if self._state < self._final_state:
    return ts.transition(self._state, 1.)
  else:
    return ts.termination(self._state, 1.)
def _step(self, action):
  # Automatically reset the environments on step if they need to be reset.
  if self._auto_reset and self._done:
    return self.reset()

  # TODO(oars): Figure out how tuple or dict actions will be generated by the
  # agents and if we can pass them through directly to gym.
  observation, reward, self._done, self._info = self._gym_env.step(action)

  if self._match_obs_space_dtype:
    observation = self._to_obs_space_dtype(observation)

  if self._done:
    return ts.termination(observation, reward)
  else:
    return ts.transition(observation, reward, self._discount)
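The `_step` above appears to come from a Gym wrapper, so a short usage sketch may help put it in context. The sketch below assumes `tf_agents` and `gym` are installed and that `CartPole-v1` is registered; `suite_gym.load` builds a PyEnvironment around this kind of wrapper.

```python
from tf_agents.environments import suite_gym

# Load a Gym environment as a TF-Agents PyEnvironment.
env = suite_gym.load('CartPole-v1')

time_step = env.reset()        # produced by ts.restart(...)
while not time_step.is_last():
  time_step = env.step(1)      # ts.transition(...) until the episode ends
print(time_step.reward)        # the final step comes from ts.termination(...)
```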
def test_resets_after_limit(self):
  max_steps = 5
  base_env = mock.MagicMock()
  wrapped_env = atari_wrappers.AtariTimeLimit(base_env, max_steps)
  base_env.gym.game_over = False
  base_env.reset.return_value = ts.restart(1)
  base_env.step.return_value = ts.transition(2, 0)

  action = 1
  for _ in range(max_steps + 1):
    wrapped_env.step(action)
  self.assertTrue(wrapped_env.game_over)
  self.assertEqual(1, base_env.reset.call_count)

  wrapped_env.step(action)
  self.assertFalse(wrapped_env.game_over)
  self.assertEqual(2, base_env.reset.call_count)
def _step(self, action):
  if self._episode_ended:
    return self.reset()  # don't forget to `return`

  if action == self.ACT_HIT:
    self._player_cards.hit()
    if self._player_cards.is_bust():
      return self._terminate(LOSS_SCORE)
    return time_step.transition(self._state(), reward=0, discount=1)

  # Otherwise action == self.ACT_STICK.
  dealer_score = self._dealer_cards.dealer_hit()
  player_score = self._player_cards.sum()
  if self._dealer_cards.is_bust() or dealer_score < player_score:
    reward = WIN_SCORE
  else:
    reward = LOSS_SCORE
  return self._terminate(reward)
def test_game_over_after_limit(self):
  max_steps = 5
  base_env = mock.MagicMock()
  wrapped_env = atari_wrappers.AtariTimeLimit(base_env, max_steps)
  base_env.gym.game_over = False
  base_env.reset.return_value = ts.restart(1)
  base_env.step.return_value = ts.transition(2, 0)

  action = 1
  self.assertFalse(wrapped_env.game_over)
  for _ in range(max_steps):
    time_step = wrapped_env.step(action)
    self.assertFalse(time_step.is_last())
    self.assertFalse(wrapped_env.game_over)

  time_step = wrapped_env.step(action)
  self.assertTrue(time_step.is_last())
  self.assertTrue(wrapped_env.game_over)
def _generate_replay_buffer(self, rb_cls):
  stack_count = 4
  shape = (15, 15, stack_count)
  single_shape = (15, 15, 1)
  observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = policy_step.PolicyStep(
      array_spec.BoundedArraySpec(
          shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'))
  self._trajectory_spec = trajectory.from_transition(
      time_step_spec, action_spec, time_step_spec)

  self._capacity = 32
  self._replay_buffer = rb_cls(
      data_spec=self._trajectory_spec, capacity=self._capacity)

  # Generate N frames: the value of pixels is the frame index.
  # The observations will be generated by stacking K frames out of those N,
  # generating some redundancies between the observations.
  single_frames = []
  frame_count = 100
  for k in range(frame_count):
    single_frames.append(np.full(single_shape, k, dtype=np.int32))

  # Add stack of frames to the replay buffer.
  time_steps = []
  for k in range(len(single_frames) - stack_count + 1):
    observation = np.concatenate(single_frames[k:k + stack_count], axis=-1)
    time_steps.append(ts.transition(observation, reward=0.0))

  self._transition_count = len(time_steps) - 1
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._transition_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(
                time_steps[k], dummy_action, time_steps[k + 1])))
def _step(self, action):
  if self._episode_ended:
    return self.reset()

  self.move(action)

  if self.game_over():
    self._episode_ended = True

  if self._episode_ended:
    if self.game_over():
      reward = 100
    else:
      reward = 0
    return ts.termination(np.array(self._state, dtype=np.int32), reward)
  else:
    return ts.transition(
        np.array(self._state, dtype=np.int32), reward=0, discount=0.9)
def testCriticLoss(self):
  agent = sac_agent.SacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=None,
      actor_optimizer=None,
      critic_optimizer=None,
      alpha_optimizer=None,
      squash_actions=False,
      actor_policy_ctor=DummyActorPolicy)

  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations)
  actions = tf.constant([[5], [6]], dtype=tf.float32)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
  next_time_steps = ts.transition(next_observations, rewards, discounts)

  td_targets = [7.3, 19.1]
  pred_td_targets = [7., 10.]

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Expected critic loss has factor of 2, for the two TD3 critics.
  expected_loss = self.evaluate(2 * tf.compat.v1.losses.mean_squared_error(
      tf.constant(td_targets), tf.constant(pred_td_targets)))

  loss = agent.critic_loss(
      time_steps,
      actions,
      next_time_steps,
      td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def _step(self, action):
  if self._do_record:
    self._write_log_entry(action)

  if self._episode_ended:
    # The last action ended the episode. Ignore the current action and start
    # a new episode.
    return self.reset()

  iscore = self._game.get_score()

  # Input agent action.
  if action == self._UP:
    self._game.move_up()
  elif action == self._DOWN:
    self._game.move_down()
  elif action == self._LEFT:
    self._game.move_left()
  elif action == self._RIGHT:
    self._game.move_right()
  else:
    raise ValueError('`action` should be between 0 and 3 (inclusive).')

  # Get state after the agent action is taken.
  state_buffer = self._state
  self._state = self._game.get_flat_board()
  if self._game.is_game_over() or np.array_equal(state_buffer, self._state):
    self._episode_ended = True
  reward = self._game.get_score() - iscore

  # Set rewards.
  if self._episode_ended:
    # Return with a reward of 0.
    return ts.termination(self._state, 0.0)
  else:
    return ts.transition(self._state, reward=reward, discount=1.0)
def _step(self, action):
  if self._episode_ended:
    return self.reset()

  if action == self.ACTION_END_GAME:
    self._episode_ended = True
  elif action == self.ACTION_GET_NEW_CARD:
    new_card = np.random.randint(1, 11)
    self._state += new_card
    print("New card: {}, Sum: {}".format(new_card, self._state))
  else:
    raise ValueError("`action` should be {} or {}".format(
        self.ACTION_GET_NEW_CARD, self.ACTION_END_GAME))

  if self._episode_ended or self._state >= self.LIMIT_STATE:
    reward = self._state if self._state <= self.LIMIT_STATE else -99
    print("End of game, rewarded", reward)
    return time_step.termination(
        np.array([self._state], dtype=np.int32), reward)

  return time_step.transition(
      np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
def testLoss(self, agent_class):
  q_net = test_utils.DummyNet(self._observation_spec, self._action_spec)
  agent = agent_class(
      self._time_step_spec, self._action_spec, q_network=q_net, optimizer=None)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = [tf.constant([[0], [1]], dtype=tf.int32)]

  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
  next_time_steps = ts.transition(next_observations, rewards, discounts)

  expected_loss = 26.0
  loss_info = agent._loss(time_steps, actions, next_time_steps)
  total_loss = loss_info.loss

  # `tf.initialize_all_variables` is long deprecated; use the compat.v1
  # initializer instead.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(total_loss), expected_loss)
def step(unused_time_step):
  if rng.rand() < 0.10:
    return ts.termination(sample_fn(), 0.0)
  else:
    return ts.transition(sample_fn(), 1.0)
def testTransitionIsMid(self):
  observation = -1
  reward = 2.0
  time_step = ts.transition(observation, reward)
  self.assertTrue(time_step.is_mid())
def testTransitionIsMid(self):
  observation = tf.constant(-1)
  reward = tf.constant(2.0)
  time_step = ts.transition(observation, reward)
  is_mid = time_step.is_mid()
  self.assertEqual(True, self.evaluate(is_mid))
def ts_transition(observation):
  return ts.transition(
      observation=observation, reward=np.array(1, dtype=np.float32))
def _step(self, action):
  # The last action ended the episode.
  # Ignore the current action and start a new episode.
  if self._episode_ended:
    return self.reset()

  action_value = action - MAX_AMOUNT / 2
  step = self._step_num
  amount = self._amount
  row = self._df.iloc[[step]]
  buy = row['buy'].item()
  sale = row['sale'].item()

  reward = 0
  if action_value > 0:
    # Buying currency.
    reward = -sale * action_value
  elif action_value < 0 and amount >= np.abs(action_value):
    # Selling currency.
    reward = buy * np.abs(action_value)
  new_amount = amount + action_value

  # Take action.
  self._step_num += 1
  if self._step_num == self.env_size:
    self._episode_ended = True

  if 0 <= new_amount <= MAX_AMOUNT:
    amount = new_amount
  else:
    raise RuntimeError(
        'Wrong action is produced by policy: {}, {}: a{}, s{}'.format(
            action, action_value, amount, step))
    reward = WRONG_ACTION_REWARD

  if VERBOSE:
    print(
        '#{step} ({amount}->{new_amount}): '
        '{buy}/{sale}; {action}; {reward}'.format(
            step=step,
            amount=self._amount,
            new_amount=amount,
            buy=buy,
            sale=sale,
            action=action_value,
            reward=reward,
        ))

  # Update amount after action taken and return the observation.
  self._amount = amount
  observation = self._get_observation(step)
  if self._episode_ended:
    return ts.termination(
        observation=observation,
        reward=reward,
    )
  return ts.transition(
      observation=observation,
      reward=reward,
      discount=1.0,
  )
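All of the `_step` methods above follow the same pattern: `ts.restart` when an episode begins, `ts.transition` for intermediate steps, and `ts.termination` on the final step. The sketch below ties the pattern together in one minimal, self-contained environment. It is only an illustration under the assumption that the TF-Agents `py_environment.PyEnvironment` API is in use; the class name `CountToTenEnv` and its counting rules are invented for this example.

```python
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class CountToTenEnv(py_environment.PyEnvironment):
  """Adds the action (0 or 1) to a counter; the episode ends at 10."""

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(1,), dtype=np.int32, minimum=0, maximum=10, name='observation')
    self._state = 0
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self._state = 0
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.int32))

  def _step(self, action):
    if self._episode_ended:
      # The previous step ended the episode; ignore the action and reset.
      return self.reset()

    self._state += int(action)
    if self._state >= 10:
      self._episode_ended = True
      return ts.termination(
          np.array([self._state], dtype=np.int32), reward=1.0)
    return ts.transition(
        np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)


if __name__ == '__main__':
  # Run a few random episodes and check specs/step types are consistent.
  utils.validate_py_environment(CountToTenEnv(), episodes=5)
```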