def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
  """Checks whether `time_step` conforms to `time_step_spec`, even if batched."""
  if batch_size is None:
    return array_spec.check_arrays_nest(time_step, time_step_spec)
  return array_spec.check_arrays_nest(
      time_step, array_spec.add_outer_dims_nest(time_step_spec, (batch_size,)))

def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
  """Checks whether `time_step` conforms to `time_step_spec`, even if batched.

  Alternative implementation: instead of batching the spec, it strips the
  batch dimension and checks element 0 against the unbatched spec.
  """
  if batch_size is None:
    return array_spec.check_arrays_nest(time_step, time_step_spec)

  # Every array in the time step must share the same leading batch dimension.
  if not all(array.shape[0] == batch_size for array in time_step):
    return False
  unbatched_time_step = ts.TimeStep(
      step_type=time_step.step_type[0],
      reward=time_step.reward[0],
      discount=time_step.discount[0],
      observation=time_step.observation[0])
  return array_spec.check_arrays_nest(unbatched_time_step, time_step_spec)

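# A minimal usage sketch for the checker above, assuming the standard
# tf_agents imports and an unbatched environment. The `suite_gym` loader and
# 'CartPole-v0' are illustrative choices, not taken from the source.
import numpy as np
from tf_agents.environments import suite_gym
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

env = suite_gym.load('CartPole-v0')
first_step = env.reset()
# batch_size is None for an unbatched PyEnvironment, so the spec is used as-is.
assert check_unbatched_time_step_spec(
    first_step, env.time_step_spec(), env.batch_size)
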
def _step(self, action_input):
  # Assumes numpy as np, copy.deepcopy, and tf_agents' array_spec and
  # trajectories.time_step (as ts) are imported at module level.
  if self._reset_next_step:
    return self.reset()

  action = action_input.copy()
  self._curr_step += 1

  # Propose a new state: current state plus market noise plus the action.
  new_state = deepcopy(self._state)
  noise = np.dot(
      self._ValFct.alphaAtZ(self._wealth),
      np.random.normal(scale=1. / 252, size=self._dRiskyAsset)
  ) / (self._wealth / self._scale)
  new_state += noise
  new_state += action

  # Penalize actions that push the state outside the observation spec;
  # otherwise commit the new state and compute the step reward.
  if not array_spec.check_arrays_nest(new_state, self._observation_spec):
    reward_step = -2
  else:
    self._state = new_state
    reward_step = self._reward_fn(action * self._wealth / self._scale)

  # Average-reward baseline: track a running mean of the reward and report
  # the differential reward, scaled and clipped to [-1, 1].
  self._avg_reward += self._learningRate_AR * (reward_step - self._avg_reward)
  reward_step -= self._avg_reward
  reward_step = np.clip(self._reward_scale * reward_step, -1., 1.)

  if self._curr_step >= self._Nsteps:
    self._reset_next_step = True
    print('EPISODE AVERAGE REWARD: ', self._avg_reward)
    return ts.termination(observation=self._state, reward=reward_step)
  return ts.transition(observation=self._state, reward=reward_step, discount=1.)

def validate_py_environment(
    environment: py_environment.PyEnvironment,
    episodes: int = 5,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None):
  """Validates that the environment follows the defined specs."""
  time_step_spec = environment.time_step_spec()
  action_spec = environment.action_spec()

  random_policy = random_py_policy.RandomPyPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter))

  if environment.batch_size is not None:
    batched_time_step_spec = array_spec.add_outer_dims_nest(
        time_step_spec, outer_dims=(environment.batch_size,))
  else:
    batched_time_step_spec = time_step_spec

  episode_count = 0
  time_step = environment.reset()
  while episode_count < episodes:
    if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
      raise ValueError('Given `time_step`: %r does not match expected '
                       '`time_step_spec`: %r' %
                       (time_step, batched_time_step_spec))

    action = random_policy.action(time_step).action
    time_step = environment.step(action)

    episode_count += np.sum(time_step.is_last())

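# Example invocation of the validator above (a sketch; the environment choice
# is an assumption). Any spec mismatch during the random rollout raises
# ValueError, so a clean return means the environment honors its specs.
from tf_agents.environments import suite_gym

env = suite_gym.load('CartPole-v0')
validate_py_environment(env, episodes=3)
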
def _action(self, time_step, policy_state):
  del time_step  # Unused.

  if policy_state is None:
    policy_state = [0, 0]
  action_index, num_repeats = policy_state  # pylint: disable=unpacking-non-sequence

  def _check_episode_length():
    if action_index >= len(self._action_script):
      raise ValueError(
          "Episode is longer than the provided scripted policy. Consider "
          "setting a TimeLimit wrapper that stops episodes within the length"
          " of your scripted policy.")

  _check_episode_length()
  n, current_action = self._action_script[action_index]

  # If the policy has been executed n times get the next scripted action.
  # Allow users to disable entries in the scripted policy by setting n <= 0.
  while num_repeats >= n:
    action_index += 1
    num_repeats = 0
    _check_episode_length()
    n, current_action = self._action_script[action_index]

  num_repeats += 1

  # To make it easier for the user we allow the actions in the script to be
  # lists instead of numpy arrays. Checking the arrays_nest requires us to
  # have the leaves be objects and not lists so we lift them into numpy
  # arrays.
  def actions_as_array(action_spec, action):
    return np.asarray(action, dtype=action_spec.dtype)

  current_action = nest.map_structure_up_to(
      self._action_spec, actions_as_array, self._action_spec, current_action)

  if not array_spec.check_arrays_nest(current_action, self._action_spec):
    raise ValueError(
        "Action at index {} does not match the environment's action_spec. "
        "Got: {}. Expected {}.".format(
            action_index, current_action, self._action_spec))

  logging.info("Policy_state: %r", policy_state)
  return policy_step.PolicyStep(current_action, [action_index, num_repeats])

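# Sketch of constructing the scripted policy that owns the _action method
# above (assumes tf_agents.policies.scripted_py_policy; the specs and script
# values are illustrative). Each script entry is (n, action): the action is
# emitted for n consecutive steps, and entries with n <= 0 are skipped.
import numpy as np
from tf_agents.policies import scripted_py_policy
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

obs_spec = array_spec.ArraySpec((2,), np.float32)
action_spec = array_spec.BoundedArraySpec((1,), np.int32, minimum=0, maximum=3)
action_script = [(2, [1]),   # repeat action [1] twice
                 (0, [2]),   # disabled entry, skipped
                 (1, [3])]   # then emit [3] once

policy = scripted_py_policy.ScriptedPyPolicy(
    time_step_spec=ts.time_step_spec(obs_spec),
    action_spec=action_spec,
    action_script=action_script)
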
def validate_py_environment(environment, episodes=5):
  """Validates that the environment follows the defined specs."""
  time_step_spec = environment.time_step_spec()
  action_spec = environment.action_spec()

  random_policy = random_py_policy.RandomPyPolicy(
      time_step_spec=time_step_spec, action_spec=action_spec)

  episode_count = 0
  time_step = environment.reset()
  while episode_count < episodes:
    if not array_spec.check_arrays_nest(time_step, time_step_spec):
      raise ValueError(
          'Given `time_step`: %r does not match expected `time_step_spec`: %r'
          % (time_step, time_step_spec))

    action = random_policy.action(time_step).action
    time_step = environment.step(action)

    if time_step.is_last():
      episode_count += 1

def validate_data_observer(traj):
  if not array_spec.check_arrays_nest(traj, self._array_data_spec):
    raise ValueError('Trajectory incompatible with array_data_spec')

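# The observer above closes over self._array_data_spec, so it presumably lives
# inside a method of a class holding that spec. A hedged sketch of how such an
# observer is typically attached to a driver (the env/policy names are
# illustrative, reusing the sketches earlier in this section):
from tf_agents.drivers import py_driver

driver = py_driver.PyDriver(
    env, policy, observers=[validate_data_observer], max_steps=100)
driver.run(env.reset())
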
def testNoMatch(self, arrays, spec):
  self.assertFalse(array_spec.check_arrays_nest(arrays, spec))

def testMatch(self, dtype):
  spec = example_nested_spec(dtype)
  sample = array_spec.sample_spec_nest(spec, np.random.RandomState())
  self.assertTrue(array_spec.check_arrays_nest(sample, spec))
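
# For context, a plausible shape of the example_nested_spec helper the tests
# above rely on (an assumption about the test module, not code from it): a
# nest mixing dicts, tuples, and bounded/unbounded specs, so sample_spec_nest
# and check_arrays_nest are exercised over every nest type.
def example_nested_spec(dtype):
  return {
      'array_spec_1': array_spec.ArraySpec((2, 3), dtype),
      'bounded_spec_1': array_spec.BoundedArraySpec((2, 3), dtype, -10, 10),
      'dict_spec': {
          'array_spec_2': array_spec.ArraySpec((2, 3), dtype),
      },
      'tuple_spec': (
          array_spec.ArraySpec((2, 3), dtype),
          array_spec.BoundedArraySpec((2, 3), dtype, -10, 10),
      ),
  }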