def _step(self, action):
        if self._done:
            return self.reset()

        if self._action_spec:
            tf.nest.assert_same_structure(self._action_spec, action)

        self._num_steps += 1

        observation = self._get_observation()
        # The episode never ends before _min_duration, always ends once
        # _max_duration is reached, and otherwise ends at random with
        # probability _episode_end_probability.
        if self._num_steps < self._min_duration:
            self._done = False
        elif self._max_duration and self._num_steps >= self._max_duration:
            self._done = True
        else:
            self._done = self._rng.uniform() < self._episode_end_probability

        if self._done:
            reward = self._reward_fn(ts.StepType.LAST, action, observation)
            self._check_reward_shape(reward)
            time_step = ts.termination(observation, reward)
            self._num_steps = 0
        else:
            reward = self._reward_fn(ts.StepType.MID, action, observation)
            self._check_reward_shape(reward)
            time_step = ts.transition(observation, reward, self._discount)

        return time_step
Example #2
 def _terminate(self, reward):
     plog(
         "Player: {} -> {}. Dealer: {} -> {}. Reward: {}.",
         self._player_cards, self._player_cards.sum(),
         self._dealer_cards, self._dealer_cards.sum(),
         reward)
     self._episode_ended = True
     return time_step.termination(self._state(), reward)
Example #3
    def testTermination(self):
        observation = -1
        reward = 2.0
        time_step = ts.termination(observation, reward)

        self.assertEqual(ts.StepType.LAST, time_step.step_type)
        self.assertEqual(-1, time_step.observation)
        self.assertEqual(2.0, time_step.reward)
        self.assertEqual(0.0, time_step.discount)
Example #4
 def testTermination(self):
     observation = tf.constant(-1)
     reward = tf.constant(2.0)
     time_step = ts.termination(observation, reward)
     time_step_ = self.evaluate(time_step)
     self.assertEqual(ts.StepType.LAST, time_step_.step_type)
     self.assertEqual(-1, time_step_.observation)
     self.assertEqual(2.0, time_step_.reward)
     self.assertEqual(0.0, time_step_.discount)
Example #5
    def testTerminationBatched(self):
        observation = np.array([[-1], [-1]])
        reward = np.array([2., 2.])
        time_step = ts.termination(observation, reward)

        self.assertItemsEqual([ts.StepType.LAST] * 2, time_step.step_type)
        self.assertItemsEqual(observation, time_step.observation)
        self.assertItemsEqual(reward, time_step.reward)
        self.assertItemsEqual([0., 0.], time_step.discount)
Example #6
    def _step(self, action):
        self._state = (self._state + 1) % 3
        self.steps += 1
        self.actions_taken.append(action)

        observation = [self._state]
        if self._state == 0:
            return ts.restart(observation)
        elif self._state == 2:
            self.episodes += 1
            return ts.termination(observation, reward=1.0)
        return ts.transition(observation, reward=0.0)
Example #7
  def _step(self, action):
    if action < self._action_spec.minimum or action > self._action_spec.maximum:
      raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
          self._action_spec.minimum, self._action_spec.maximum,
          action))

    if self._state >= self._final_state:
      # Start a new episode. Ignore action
      self._state = 0
      return ts.restart(self._state)

    self._state += action
    if self._state < self._final_state:
      return ts.transition(self._state, 1.)
    else:
      return ts.termination(self._state, 1.)
Example #8
  def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._auto_reset and self._done:
      return self.reset()

    # TODO(oars): Figure out how tuple or dict actions will be generated by the
    # agents and if we can pass them through directly to gym.

    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
      observation = self._to_obs_space_dtype(observation)

    if self._done:
      return ts.termination(observation, reward)
    else:
      return ts.transition(observation, reward, self._discount)
Example #9
    def _step(self, action):

        if self._episode_ended:
            return self.reset()

        self.move(action)

        if self.game_over():
            self._episode_ended = True

        if self._episode_ended:
            if self.game_over():
                reward = 100
            else:
                reward = 0
            return ts.termination(np.array(self._state, dtype=np.int32),
                                  reward)
        else:
            return ts.transition(np.array(self._state, dtype=np.int32),
                                 reward=0,
                                 discount=0.9)
Example #10
    def _step(self, action):

        if self._do_record:
            self._write_log_entry(action)

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        iscore = self._game.get_score()

        # Input agent action
        if action == self._UP:
            self._game.move_up()
        elif action == self._DOWN:
            self._game.move_down()
        elif action == self._LEFT:
            self._game.move_left()
        elif action == self._RIGHT:
            self._game.move_right()
        else:
            raise ValueError('`action` should be between 0 and 3 (inclusive).')

        # Get state after the agent action is taken
        state_buffer = self._state
        self._state = self._game.get_flat_board()
        if self._game.is_game_over() or np.array_equal(state_buffer,
                                                       self._state):
            self._episode_ended = True
        reward = self._game.get_score() - iscore

        # Set rewards
        if self._episode_ended:
            # Episode over: terminate with a reward of 0 (the step reward is discarded)
            return ts.termination(self._state, 0.0)
        else:
            return ts.transition(self._state, reward=reward, discount=1.0)
Example #11
    def _step(self, action):
        if self._episode_ended:
            return self.reset()

        if action == self.ACTION_END_GAME:
            self._episode_ended = True
        elif action == self.ACTION_GET_NEW_CARD:
            new_card = np.random.randint(1, 11)
            self._state += new_card
            print("New card: {}, Sum: {}".format(new_card, self._state))
        else:
            raise ValueError("`action` should be {} or {}".format(
                self.ACTION_GET_NEW_CARD, self.ACTION_END_GAME))

        if self._episode_ended or self._state >= self.LIMIT_STATE:
            reward = self._state if self._state <= self.LIMIT_STATE else -99
            print("End of game, rewarded", reward)
            return time_step.termination(
                np.array([self._state], dtype=np.int32), reward)

        return time_step.transition(np.array([self._state], dtype=np.int32),
                                    reward=0.0,
                                    discount=1.0)
Example #12
 def step(unused_time_step):
     # End the episode with probability 0.1; otherwise keep transitioning.
     if rng.rand() < 0.10:
         return ts.termination(sample_fn(), 0.0)
     else:
         return ts.transition(sample_fn(), 1.0)
Example #13
 def testTerminationIsLast(self):
     observation = -1
     reward = 2.0
     time_step = ts.termination(observation, reward)
     self.assertTrue(time_step.is_last())
Example #14
 def testTerminationIsLast(self):
     observation = tf.constant(-1)
     reward = tf.constant(2.0)
     time_step = ts.termination(observation, reward)
     is_last = time_step.is_last()
     self.assertEqual(True, self.evaluate(is_last))
Example #15
def ts_termination(observation):
    # Wrap the observation in a terminal TimeStep with a fixed reward of 1.
    return ts.termination(observation=observation,
                          reward=np.array(1, dtype=np.float32))
Example #16
    def _step(self, action):
        # The last action ended the episode.
        # Ignore the current action and start a new episode
        if self._episode_ended:
            return self.reset()

        action_value = action - MAX_AMOUNT / 2
        step = self._step_num
        amount = self._amount

        row = self._df.iloc[[step]]
        buy = row['buy'].item()
        sale = row['sale'].item()

        reward = 0
        if action_value > 0:  # buying currency
            reward = - sale * action_value
        elif action_value < 0 and amount >= np.abs(action_value):  
            # selling currency
            reward = buy * np.abs(action_value)

        new_amount = amount + action_value
        # take action
        self._step_num += 1

        if self._step_num == self.env_size:
            self._episode_ended = True

        if 0 <= new_amount <= MAX_AMOUNT:
            amount = new_amount
        else:
            # The new amount would fall outside [0, MAX_AMOUNT]; the policy
            # produced an invalid action, so abort the step.
            raise RuntimeError(
                'Wrong action produced by policy: {}, {}: a{}, s{}'.format(
                    action, action_value, amount, step))

        if VERBOSE:
            print(
                '#{step} ({amount}->{new_amount}): '
                '{buy}/{sale}; {action}; {reward}'.format(
                    step=step,
                    amount=self._amount,
                    new_amount=amount,
                    buy=buy,
                    sale=sale,
                    action=action_value,
                    reward=reward,
            ))

        # Update amount after action taken and return the observation
        self._amount = amount
        observation = self._get_observation(step)

        if self._episode_ended:
            return ts.termination(
                observation=observation,
                reward=reward,
            )

        return ts.transition(
            observation=observation,
            reward=reward,
            discount=1.0,
        )