Example #1
  def _step(self, action):
    if action < self._action_spec.minimum or action > self._action_spec.maximum:
      raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
          self._action_spec.minimum, self._action_spec.maximum,
          action))

    if self._state >= self._final_state:
      # Start a new episode. Ignore action
      self._state = 0
      return ts.restart(self._state)

    self._state += action
    if self._state < self._final_state:
      return ts.transition(self._state, 1.)
    else:
      return ts.termination(self._state, 1.)
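The `_step` above relies on specs and state that its class sets up elsewhere. The sketch below is a minimal, hypothetical wrapper showing one way those pieces could fit together; the class name, spec shapes, and `final_state` default are assumptions, not taken from the original source.

import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class CountingEnv(py_environment.PyEnvironment):
  """Hypothetical environment whose state counts up toward a final state."""

  def __init__(self, final_state=10):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')
    self._observation_spec = array_spec.ArraySpec(
        shape=(), dtype=np.int32, name='observation')
    self._final_state = final_state
    self._state = 0

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self._state = 0
    return ts.restart(np.int32(self._state))

  def _step(self, action):
    # Same logic as Example #1, with observations cast to match the spec.
    if action < self._action_spec.minimum or action > self._action_spec.maximum:
      raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
          self._action_spec.minimum, self._action_spec.maximum, action))

    if self._state >= self._final_state:
      # Start a new episode and ignore the action.
      self._state = 0
      return ts.restart(np.int32(self._state))

    self._state += action
    if self._state < self._final_state:
      return ts.transition(np.int32(self._state), 1.)
    return ts.termination(np.int32(self._state), 1.)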
Example #2
  def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._auto_reset and self._done:
      return self.reset()

    # TODO(oars): Figure out how tuple or dict actions will be generated by the
    # agents and if we can pass them through directly to gym.

    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
      observation = self._to_obs_space_dtype(observation)

    if self._done:
      return ts.termination(observation, reward)
    else:
      return ts.transition(observation, reward, self._discount)
Example #3
    def test_resets_after_limit(self):
        max_steps = 5
        base_env = mock.MagicMock()
        wrapped_env = atari_wrappers.AtariTimeLimit(base_env, max_steps)

        base_env.gym.game_over = False
        base_env.reset.return_value = ts.restart(1)
        base_env.step.return_value = ts.transition(2, 0)
        action = 1

        for _ in range(max_steps + 1):
            wrapped_env.step(action)

        self.assertTrue(wrapped_env.game_over)
        self.assertEqual(1, base_env.reset.call_count)

        wrapped_env.step(action)
        self.assertFalse(wrapped_env.game_over)
        self.assertEqual(2, base_env.reset.call_count)
Example #4
    def _step(self, action):
        if self._episode_ended:
            return self.reset()  # don't forget to `return`

        if action == self.ACT_HIT:
            self._player_cards.hit()
            if self._player_cards.is_bust():
                return self._terminate(LOSS_SCORE)

            return time_step.transition(self._state(), reward=0, discount=1)

        # Otherwise action == self.ACT_STICK
        dealer_score = self._dealer_cards.dealer_hit()
        player_score = self._player_cards.sum()
        if self._dealer_cards.is_bust() or dealer_score < player_score:
            reward = WIN_SCORE
        else:
            reward = LOSS_SCORE
        return self._terminate(reward)
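Example #4 also calls a private `_terminate` helper that is not shown. A short, hypothetical sketch of what such a helper might look like (the body below is an assumption, not the original implementation):

    def _terminate(self, reward):
        # Hypothetical helper: mark the episode as ended and emit a LAST
        # TimeStep carrying the final reward for this hand.
        self._episode_ended = True
        return time_step.termination(self._state(), reward)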
Example #5
    def test_game_over_after_limit(self):
        max_steps = 5
        base_env = mock.MagicMock()
        wrapped_env = atari_wrappers.AtariTimeLimit(base_env, max_steps)

        base_env.gym.game_over = False
        base_env.reset.return_value = ts.restart(1)
        base_env.step.return_value = ts.transition(2, 0)
        action = 1

        self.assertFalse(wrapped_env.game_over)

        for _ in range(max_steps):
            time_step = wrapped_env.step(action)
            self.assertFalse(time_step.is_last())
            self.assertFalse(wrapped_env.game_over)

        time_step = wrapped_env.step(action)
        self.assertTrue(time_step.is_last())
        self.assertTrue(wrapped_env.game_over)
Example #6
    def _generate_replay_buffer(self, rb_cls):
        stack_count = 4
        shape = (15, 15, stack_count)
        single_shape = (15, 15, 1)
        observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = policy_step.PolicyStep(
            array_spec.BoundedArraySpec(shape=(),
                                        dtype=np.int32,
                                        minimum=0,
                                        maximum=1,
                                        name='action'))
        self._trajectory_spec = trajectory.from_transition(
            time_step_spec, action_spec, time_step_spec)

        self._capacity = 32
        self._replay_buffer = rb_cls(data_spec=self._trajectory_spec,
                                     capacity=self._capacity)

        # Generate N frames: the value of pixels is the frame index.
        # The observations are generated by stacking K of those N frames,
        # so consecutive observations share most of their frames.
        single_frames = []
        frame_count = 100
        for k in range(frame_count):
            single_frames.append(np.full(single_shape, k, dtype=np.int32))

        # Add stack of frames to the replay buffer.
        time_steps = []
        for k in range(len(single_frames) - stack_count + 1):
            observation = np.concatenate(single_frames[k:k + stack_count],
                                         axis=-1)
            time_steps.append(ts.transition(observation, reward=0.0))

        self._transition_count = len(time_steps) - 1
        dummy_action = policy_step.PolicyStep(np.int32(0))
        for k in range(self._transition_count):
            self._replay_buffer.add_batch(
                nest_utils.batch_nested_array(
                    trajectory.from_transition(time_steps[k], dummy_action,
                                               time_steps[k + 1])))
Example #7
    def _step(self, action):

        if self._episode_ended:
            return self.reset()

        self.move(action)

        if self.game_over():
            self._episode_ended = True

        if self._episode_ended:
            if self.game_over():
                reward = 100
            else:
                reward = 0
            return ts.termination(np.array(self._state, dtype=np.int32),
                                  reward)
        else:
            return ts.transition(np.array(self._state, dtype=np.int32),
                                 reward=0,
                                 discount=0.9)
Example #8
  def testCriticLoss(self):
    agent = sac_agent.SacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=DummyCriticNet(),
        actor_network=None,
        actor_optimizer=None,
        critic_optimizer=None,
        alpha_optimizer=None,
        squash_actions=False,
        actor_policy_ctor=DummyActorPolicy)

    observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
    time_steps = ts.restart(observations)
    actions = tf.constant([[5], [6]], dtype=tf.float32)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    td_targets = [7.3, 19.1]
    pred_td_targets = [7., 10.]

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Expected critic loss has factor of 2, for the two TD3 critics.
    expected_loss = self.evaluate(2 * tf.compat.v1.losses.mean_squared_error(
        tf.constant(td_targets), tf.constant(pred_td_targets)))

    loss = agent.critic_loss(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_ = self.evaluate(loss)
    self.assertAllClose(loss_, expected_loss)
Example #9
    def _step(self, action):

        if self._do_record:
            self._write_log_entry(action)

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        iscore = self._game.get_score()

        # Input agent action
        if action == self._UP:
            self._game.move_up()
        elif action == self._DOWN:
            self._game.move_down()
        elif action == self._LEFT:
            self._game.move_left()
        elif action == self._RIGHT:
            self._game.move_right()
        else:
            raise ValueError('`action` should be between 0 and 3 (inclusive).')

        # Get state after the agent action is taken
        state_buffer = self._state
        self._state = self._game.get_flat_board()
        if self._game.is_game_over() or np.array_equal(state_buffer,
                                                       self._state):
            self._episode_ended = True
        reward = self._game.get_score() - iscore

        # Set rewards
        if self._episode_ended:
            # return with a reward of 0
            return ts.termination(self._state, 0.0)
        else:
            return ts.transition(self._state, reward=reward, discount=1.0)
Example #10
    def _step(self, action):
        if self._episode_ended:
            return self.reset()

        if action == self.ACTION_END_GAME:
            self._episode_ended = True
        elif action == self.ACTION_GET_NEW_CARD:
            new_card = np.random.randint(1, 11)
            self._state += new_card
            print("New card: {}, Sum: {}".format(new_card, self._state))
        else:
            raise ValueError("`action` should be {} or {}".format(
                self.ACTION_GET_NEW_CARD, self.ACTION_END_GAME))

        if self._episode_ended or self._state >= self.LIMIT_STATE:
            reward = self._state if self._state <= self.LIMIT_STATE else -99
            print("End of game, rewarded", reward)
            return time_step.termination(
                np.array([self._state], dtype=np.int32), reward)

        return time_step.transition(np.array([self._state], dtype=np.int32),
                                    reward=0.0,
                                    discount=1.0)
Example #11
    def testLoss(self, agent_class):
        q_net = test_utils.DummyNet(self._observation_spec, self._action_spec)
        agent = agent_class(self._time_step_spec,
                            self._action_spec,
                            q_network=q_net,
                            optimizer=None)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)

        actions = [tf.constant([[0], [1]], dtype=tf.int32)]

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        expected_loss = 26.0
        loss_info = agent._loss(time_steps, actions, next_time_steps)
        total_loss = loss_info.loss

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(self.evaluate(total_loss), expected_loss)
Example #12
 def step(unused_time_step):
     if rng.rand() < 0.10:
         return ts.termination(sample_fn(), 0.0)
     else:
         return ts.transition(sample_fn(), 1.0)
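Example #12 assumes that `rng` and `sample_fn` are defined elsewhere. Below is a self-contained sketch of one way such a step callback could be driven for a single episode; the RandomState seed and the integer observation sampler are illustrative assumptions.

import numpy as np
from tf_agents.trajectories import time_step as ts

rng = np.random.RandomState(0)
sample_fn = lambda: rng.randint(0, 10)  # hypothetical observation sampler

def step(unused_time_step):
    if rng.rand() < 0.10:
        return ts.termination(sample_fn(), 0.0)
    return ts.transition(sample_fn(), 1.0)

# Roll out one episode: each call yields a MID step with reward 1.0 until the
# ~10% termination branch produces a LAST step.
current = ts.restart(sample_fn())
episode_return = 0.0
while not current.is_last():
    current = step(current)
    episode_return += float(current.reward)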
Example #13
 def testTransitionIsMid(self):
     observation = -1
     reward = 2.0
     time_step = ts.transition(observation, reward)
     self.assertTrue(time_step.is_mid())
Example #14
 def testTransitionIsMid(self):
     observation = tf.constant(-1)
     reward = tf.constant(2.0)
     time_step = ts.transition(observation, reward)
     is_mid = time_step.is_mid()
     self.assertEqual(True, self.evaluate(is_mid))
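Examples #13 and #14 only check the step type. As a quick illustrative aside (a sketch based on these tests, using the NumPy path rather than the TF one), `ts.transition` also fills in a default discount of 1.0:

import numpy as np
from tf_agents.trajectories import time_step as ts

step = ts.transition(observation=np.array(-1, dtype=np.int32), reward=2.0)
assert step.step_type == ts.StepType.MID  # transitions are MID steps
assert step.is_mid()
assert float(step.discount) == 1.0  # discount defaults to 1.0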
Example #15
def ts_transition(observation):
    return ts.transition(observation=observation,
                         reward=np.array(1, dtype=np.float32))
Example #16
    def _step(self, action):
        # The last action ended the episode.
        # Ignore the current action and start a new episode
        if self._episode_ended:
            return self.reset()

        action_value = action - MAX_AMOUNT/2 
        step = self._step_num
        amount = self._amount

        row = self._df.iloc[[step]]
        buy = row['buy'].item()
        sale = row['sale'].item()

        reward = 0
        if action_value > 0:  # buying currency
            reward = - sale * action_value
        elif action_value < 0 and amount >= np.abs(action_value):  
            # selling currency
            reward = buy * np.abs(action_value)

        new_amount = amount + action_value
        # take action
        self._step_num += 1

        if self._step_num == self.env_size:
            self._episode_ended = True

        if 0 <= new_amount <= MAX_AMOUNT:
            amount = new_amount
        else:
            raise RuntimeError(
                'Invalid action produced by policy: {}, {}: a{}, s{}'.format(
                    action, action_value, amount, step))

        if VERBOSE:
            print(
                '#{step} ({amount}->{new_amount}): '
                '{buy}/{sale}; {action}; {reward}'.format(
                    step=step,
                    amount=self._amount,
                    new_amount=amount,
                    buy=buy,
                    sale=sale,
                    action=action_value,
                    reward=reward,
            ))

        # Update amount after action taken and return the observation
        self._amount = amount
        observation = self._get_observation(step)

        if self._episode_ended:
            return ts.termination(
                observation=observation,
                reward=reward,
            )

        return ts.transition(
            observation=observation,
            reward=reward,
            discount=1.0,
        )