Example #1
def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
  """Checks if time step conforms array spec, even if batched."""
  if batch_size is None:
    return array_spec.check_arrays_nest(time_step, time_step_spec)

  return array_spec.check_arrays_nest(
      time_step, array_spec.add_outer_dims_nest(time_step_spec, (batch_size,)))
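A minimal usage sketch for the helper above (not part of the original snippet): it builds a hypothetical observation spec, samples a batched time step that matches it, and runs the check. Here `array_spec` is assumed to be `tf_agents.specs.array_spec` and `ts` to be `tf_agents.trajectories.time_step`; the shape and batch size are arbitrary.

# Illustrative only: assumed imports, arbitrary shapes and batch size.
import numpy as np
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

observation_spec = array_spec.ArraySpec(shape=(3,), dtype=np.float32)
time_step_spec = ts.time_step_spec(observation_spec)

# Sample a time step with an outer batch dimension of 2, then check it
# against the unbatched spec via the helper defined above.
batched_spec = array_spec.add_outer_dims_nest(time_step_spec, (2,))
time_step = array_spec.sample_spec_nest(batched_spec, np.random.RandomState(0))

print(check_unbatched_time_step_spec(time_step, time_step_spec, batch_size=2))  # True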
Example #2
def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
    """Checks if time step conforms array spec, even if batched."""
    if batch_size is None:
        return array_spec.check_arrays_nest(time_step, time_step_spec)

    if not all(arr.shape[0] == batch_size for arr in time_step):
        return False

    unbatched_time_step = ts.TimeStep(step_type=time_step.step_type[0],
                                      reward=time_step.reward[0],
                                      discount=time_step.discount[0],
                                      observation=time_step.observation[0])
    return array_spec.check_arrays_nest(unbatched_time_step, time_step_spec)
Example #3
    def _step(self, action_input):
        if self._reset_next_step:
            return self.reset()

        
        action = action_input.copy()
        
        self._curr_step += 1
        new_state = deepcopy(self._state)
        noise = np.dot(
            self._ValFct.alphaAtZ(self._wealth),
            np.random.normal(scale=1. / 252, size=self._dRiskyAsset)
        ) / (self._wealth / self._scale)
        new_state += noise
        new_state += action
        
        if not array_spec.check_arrays_nest(new_state, self._observation_spec):
            reward_step = -2
        else:
            self._state = new_state
            reward_step = self._reward_fn(action*self._wealth/self._scale)
            
            self._avg_reward += self._learningRate_AR*(reward_step-self._avg_reward)
            reward_step -= self._avg_reward
            reward_step = np.clip(self._reward_scale*reward_step, -1., 1.)
        
        if self._curr_step >= self._Nsteps:
            self._reset_next_step = True
            print('EPISODE AVERAGE REWARD: ', self._avg_reward)
            return ts.termination(observation=self._state, reward=reward_step)

        return ts.transition(observation=self._state, reward=reward_step, discount=1.)
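The key pattern in `_step` above is using `check_arrays_nest` as a bounds guard: a proposed state that falls outside the observation spec is rejected and penalized instead of being committed. A stand-alone sketch of that check, with an assumed bounded spec and placeholder values:

# Illustrative only: the spec and state values are placeholders.
import numpy as np
from tf_agents.specs import array_spec

state_spec = array_spec.BoundedArraySpec((2,), np.float32, minimum=-1.0, maximum=1.0)

inside = np.array([0.2, -0.5], dtype=np.float32)
outside = np.array([1.5, 0.0], dtype=np.float32)   # exceeds the maximum bound

print(array_spec.check_arrays_nest(inside, state_spec))   # True
print(array_spec.check_arrays_nest(outside, state_spec))  # False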
Example #4
def validate_py_environment(
    environment: py_environment.PyEnvironment,
    episodes: int = 5,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None):
    """Validates the environment follows the defined specs."""
    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    random_policy = random_py_policy.RandomPyPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    if environment.batch_size is not None:
        batched_time_step_spec = array_spec.add_outer_dims_nest(
            time_step_spec, outer_dims=(environment.batch_size, ))
    else:
        batched_time_step_spec = time_step_spec

    episode_count = 0
    time_step = environment.reset()

    while episode_count < episodes:
        if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
            raise ValueError('Given `time_step`: %r does not match expected '
                             '`time_step_spec`: %r' %
                             (time_step, batched_time_step_spec))

        action = random_policy.action(time_step).action
        time_step = environment.step(action)

        episode_count += np.sum(time_step.is_last())
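A minimal sketch of calling this validator on a toy environment; `RandomPyEnvironment` from `tf_agents.environments.random_py_environment` and the specs below are assumptions chosen only for illustration.

# Illustrative only: validate a random environment against its own specs.
import numpy as np
from tf_agents.environments import random_py_environment
from tf_agents.specs import array_spec

observation_spec = array_spec.BoundedArraySpec((4,), np.float32, minimum=-1.0, maximum=1.0)
action_spec = array_spec.BoundedArraySpec((), np.int32, minimum=0, maximum=3)
env = random_py_environment.RandomPyEnvironment(observation_spec, action_spec)

validate_py_environment(env, episodes=2)  # Raises ValueError on a spec mismatch.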
Example #5
    def _action(self, time_step, policy_state):
        del time_step  # Unused.
        if policy_state is None:
            policy_state = [0, 0]

        action_index, num_repeats = policy_state  #  pylint: disable=unpacking-non-sequence

        def _check_episode_length():
            if action_index >= len(self._action_script):
                raise ValueError(
                    "Episode is longer than the provided scripted policy. Consider "
                    "setting a TimeLimit wrapper that stops episodes within the length"
                    " of your scripted policy.")

        _check_episode_length()
        n, current_action = self._action_script[action_index]

        # If the policy has been executed n times get the next scripted action.
        # Allow users to disable entries in the scripted policy by setting n <= 0.
        while num_repeats >= n:
            action_index += 1
            num_repeats = 0
            _check_episode_length()
            n, current_action = self._action_script[action_index]

        num_repeats += 1

        # To make it easier for the user we allow the actions in the script to be
        # lists instead of numpy arrays. Checking the arrays_nest requires us to
        # have the leaves be objects and not lists so we lift them into numpy
        # arrays.
        def actions_as_array(action_spec, action):
            return np.asarray(action, dtype=action_spec.dtype)

        current_action = nest.map_structure_up_to(self._action_spec,
                                                  actions_as_array,
                                                  self._action_spec,
                                                  current_action)

        if not array_spec.check_arrays_nest(current_action, self._action_spec):
            raise ValueError(
                "Action at index {} does not match the environment's action_spec. "
                "Got: {}. Expected {}.".format(action_index, current_action,
                                               self._action_spec))

        logging.info("Policy_state: %r", policy_state)
        return policy_step.PolicyStep(current_action,
                                      [action_index, num_repeats])
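For reference, a sketch of the action-script structure this method consumes, inferred from the loop above: a list of `(n, action)` pairs, where each action is repeated for `n` steps and entries with `n <= 0` are skipped. The concrete values are placeholders.

# Illustrative action script for the scripted policy above.
action_script = [
    (2, [1]),  # repeat action [1] for 2 steps
    (0, [5]),  # n <= 0: entry is disabled and skipped
    (3, [0]),  # then repeat action [0] for 3 steps
]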
Example #6
def validate_py_environment(environment, episodes=5):
    """Validates the environment follows the defined specs."""
    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    random_policy = random_py_policy.RandomPyPolicy(
        time_step_spec=time_step_spec, action_spec=action_spec)

    episode_count = 0
    time_step = environment.reset()

    while episode_count < episodes:
        if not array_spec.check_arrays_nest(time_step, time_step_spec):
            raise ValueError(
                'Given `time_step`: %r does not match expected `time_step_spec`: %r'
                % (time_step, random_policy.time_step_spec()))

        action = random_policy.action(time_step).action
        time_step = environment.step(action)

        if time_step.is_last():
            episode_count += 1
Example #7
  def validate_data_observer(traj):
    if not array_spec.check_arrays_nest(traj, self._array_data_spec):
      raise ValueError('Trajectory incompatible with array_data_spec')
Example #8
  def testNoMatch(self, arrays, spec):
    self.assertFalse(array_spec.check_arrays_nest(arrays, spec))
Example #9
  def testMatch(self, dtype):
    spec = example_nested_spec(dtype)
    sample = array_spec.sample_spec_nest(spec, np.random.RandomState())
    self.assertTrue(array_spec.check_arrays_nest(sample, spec))
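`example_nested_spec` is not shown in this snippet; a plausible stand-in, assumed here only so the test reads end to end, is a dict mixing plain and bounded specs.

# Hypothetical stand-in for the example_nested_spec fixture used above.
def example_nested_spec(dtype):
    return {
        'array_spec_1': array_spec.ArraySpec((2, 3), dtype),
        'bounded_spec_1': array_spec.BoundedArraySpec((2, 3), dtype, -10, 10),
        'dict_spec': {
            'array_spec_2': array_spec.ArraySpec((2, 3), dtype),
        },
    }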