def action(self, *args, **kwargs):
    """Compute an action for a single input (e.g. observation)."""
    args_, kwargs_ = tree.map_structure(
        lambda x: x[None, ...], (args, kwargs))
    actions = self.actions(*args_, **kwargs_)
    action = tree.map_structure(lambda x: x[0], actions)
    return action
def test_values(self):
    _ = self.env.reset()
    action1_np = self.env.action_space.sample()
    observation1_np = self.env.step(action1_np)[0]
    action2_np = self.env.action_space.sample()
    observation2_np = self.env.step(action2_np)[0]

    observations_np = type(observation1_np)((
        (key, np.stack((
            observation1_np[key], observation2_np[key]
        ), axis=0).astype(np.float32))
        for key in observation1_np.keys()
    ))
    actions_np = np.stack((
        action1_np, action2_np
    ), axis=0).astype(np.float32)

    observations_tf = tree.map_structure(
        lambda x: tf.constant(x, dtype=x.dtype), observations_np)
    actions_tf = tree.map_structure(
        lambda x: tf.constant(x, dtype=x.dtype), actions_np)

    for observations, actions in (
            (observations_np, actions_np),
            (observations_tf, actions_tf)):
        values = self.value_function.values(observations, actions)
        tf.debugging.assert_shapes(((values, (2, 1)),))
def prob(self, *args, **kwargs):
    """Compute the probability for a single action."""
    args_, kwargs_ = tree.map_structure(
        lambda x: x[None, ...], (args, kwargs))
    probs = self.probs(*args_, **kwargs_)
    prob = tree.map_structure(lambda x: x[0], probs)
    return prob
def value(self, *args, **kwargs):
    """Compute a value for a single input (e.g. observation)."""
    args_, kwargs_ = tree.map_structure(
        lambda x: x[None, ...], (args, kwargs))
    values = self.values(*args_, **kwargs_)
    value = tree.map_structure(lambda x: x[0], values)
    return value
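# The three single-input wrappers above (`action`, `prob`, `value`) share one
# pattern: prepend a batch axis with `x[None, ...]`, call the batched method,
# then strip the axis with `x[0]`. A minimal standalone sketch of that round
# trip; `batched_fn` is a hypothetical stand-in for e.g. `self.actions`:
import numpy as np
import tree


def batched_fn(inputs):
    # Operates on batches: sums each leaf over its last axis.
    return tree.map_structure(lambda x: x.sum(axis=-1), inputs)


def single_fn(inputs):
    # Batch of one in, batch of one out, then unbatch.
    inputs_ = tree.map_structure(lambda x: x[None, ...], inputs)
    outputs = batched_fn(inputs_)
    return tree.map_structure(lambda x: x[0], outputs)


single_fn({'observation': np.ones(3)})  # -> {'observation': 3.0}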
def test_field_initialization(self):
    def verify_field(field_attrs, field_values):
        self.assertEqual(
            field_values.shape,
            (self.pool._max_size, *field_attrs.shape))
        self.assertEqual(field_values.dtype.name, field_attrs.dtype)
        np.testing.assert_array_equal(field_values, 0.0)

    tree.map_structure(verify_field, self.pool.fields, self.pool.data)
def load_experience(self, experience_path):
    with gzip.open(experience_path, 'rb') as f:
        latest_samples = pickle.load(f)

    num_samples = tree.flatten(latest_samples)[0].shape[0]

    def assert_shape(data):
        assert data.shape[0] == num_samples, data.shape

    tree.map_structure(assert_shape, latest_samples)

    self.add_samples(latest_samples)
    self._samples_since_save = 0
def test_actions_and_log_probs(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = type(observation1_np)((
        (key, np.stack((
            observation1_np[key], observation2_np[key]
        ), axis=0).astype(np.float32))
        for key in observation1_np.keys()
    ))
    observations_tf = tree.map_structure(
        lambda x: tf.constant(x, dtype=x.dtype), observations_np)

    for observations in (observations_np, observations_tf):
        actions = self.policy.actions(observations)
        log_pis = self.policy.log_probs(observations, actions)
        self.assertAllEqual(
            log_pis,
            tfp.distributions.Independent(
                tfp.distributions.Uniform(
                    low=self.env.action_space.low,
                    high=self.env.action_space.high,
                ),
                reinterpreted_batch_ndims=1,
            ).log_prob(actions)[..., None])
        self.assertEqual(actions.shape, (2, *self.env.action_shape))
def feedforward_Q_function(input_shapes,
                           *args,
                           preprocessors=None,
                           observation_keys=None,
                           name='feedforward_Q',
                           **kwargs):
    inputs = create_inputs(input_shapes)

    if preprocessors is None:
        preprocessors = tree.map_structure(lambda _: None, inputs)

    preprocessors = tree.map_structure_up_to(
        inputs, preprocessors_lib.deserialize, preprocessors)

    preprocessed_inputs = apply_preprocessors(preprocessors, inputs)

    # NOTE(hartikainen): `feedforward_model` would do the `cast_and_concat`
    # step for us, but tf2.2 broke the sequential multi-input handling. See:
    # https://github.com/tensorflow/tensorflow/issues/37061.
    out = tf.keras.layers.Lambda(cast_and_concat)(preprocessed_inputs)

    Q_model_body = feedforward_model(
        *args, output_shape=[1], name=name, **kwargs)

    Q_model = tf.keras.Model(inputs, Q_model_body(out), name=name)

    Q_function = StateActionValueFunction(
        model=Q_model, observation_keys=observation_keys, name=name)

    return Q_function
def test_sequence_batch_by_indices(self):
    sequence_length = 2

    with self.assertRaises(ValueError):
        self.pool.sequence_batch_by_indices(
            np.array([-1, 2, 4]), sequence_length=sequence_length)

    path_lengths = [10, 4, 50, 36]
    assert sum(path_lengths) == self.pool._max_size, path_lengths

    samples = {
        'field1': np.arange(self.pool._max_size)[:, None],
        'field2': -np.arange(self.pool._max_size)[:, None] * 2,
    }

    for path_end, path_length in zip(
            np.cumsum(path_lengths), path_lengths):
        self.pool.add_path(
            tree.map_structure(
                lambda s: s[path_end - path_length:path_end], samples))

    batch_indices = np.flip(np.arange(self.pool._max_size))
    full_pool_batch = self.pool.sequence_batch_by_indices(
        batch_indices, sequence_length=sequence_length)

    for key, values in full_pool_batch.items():
        if key == 'mask':
            self.assertEqual(
                values.shape, (self.pool._max_size, sequence_length))
        else:
            self.assertEqual(
                values.shape, (self.pool._max_size, sequence_length, 1))

    self.assertIn('mask', full_pool_batch)
    self.assertTrue(
        all(index_field in full_pool_batch.keys()
            for index_field in INDEX_FIELDS))

    episode_start_indices = np.flatnonzero(
        self.pool.data['episode_index_forwards'] == 0)
    episode_start_indices_batch = np.flatnonzero(
        np.isin(batch_indices, episode_start_indices))

    mask = full_pool_batch['mask']
    for key, values in full_pool_batch.items():
        if key in ('mask', *INDEX_FIELDS):
            continue
        expected = np.stack((
            np.roll(np.flip(samples[key]), -1),
            np.flip(samples[key]),
        ), axis=1)
        expected[episode_start_indices_batch, 0, :] = 0
        np.testing.assert_array_equal(expected, ~mask[..., None] * values)
        self.assertEqual(
            values.shape, (self.pool._max_size, sequence_length, 1))
        # Make sure that the values at the start of the episode are zero.
        np.testing.assert_equal(
            ~mask[episode_start_indices_batch, :-1, None]
            * values[episode_start_indices_batch][:, :-1, :],
            0)
def preprocess(x):
    """Cast to float, normalize, and concatenate images along last axis."""
    x = tree.map_structure(
        lambda image: tf.image.convert_image_dtype(image, tf.float32), x)
    x = tree.flatten(x)
    x = tf.concat(x, axis=-1)
    x = (tf.image.convert_image_dtype(x, tf.float32) - 0.5) * 2.0
    return x
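# A quick sketch of what `preprocess` above does to a nested structure of
# uint8 images (assuming eager TensorFlow 2.x): pixels are scaled to [-1, 1]
# and the images are joined along the channel axis.
import numpy as np

images = {
    'pixels_a': np.zeros((2, 32, 32, 3), dtype=np.uint8),
    'pixels_b': np.full((2, 32, 32, 3), 255, dtype=np.uint8),
}
out = preprocess(images)
print(out.shape)  # (2, 32, 32, 6)
print(out.numpy().min(), out.numpy().max())  # -1.0 1.0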
def observation_shape(self):
    if not isinstance(self.observation_space, spaces.Dict):
        raise NotImplementedError(type(self.observation_space))

    observation_shape = tree.map_structure(
        lambda space: tf.TensorShape(space.shape),
        self.observation_space.spaces)

    return observation_shape
def apply_preprocessors(preprocessors, inputs):
    tree.assert_same_structure(inputs, preprocessors)
    preprocessed_inputs = tree.map_structure(
        lambda preprocessor, input_: (
            preprocessor(input_) if preprocessor is not None else input_),
        preprocessors,
        inputs,
    )
    return preprocessed_inputs
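# Minimal usage sketch for `apply_preprocessors`: the two structures must
# share the same nesting, and a `None` preprocessor passes its input through
# untouched. The normalizing lambda here is purely illustrative.
import numpy as np

inputs = {'observation': np.arange(4.0), 'action': np.ones(2)}
preprocessors = {'observation': lambda x: x / x.max(), 'action': None}
outputs = apply_preprocessors(preprocessors, inputs)
# outputs['observation'] == [0., 1/3, 2/3, 1.]; outputs['action'] unchanged.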
def __init__(self, max_size, fields):
    super(FlexibleReplayPool, self).__init__()

    max_size = int(max_size)
    self._max_size = max_size

    self.fields = {**fields, **INDEX_FIELDS}
    self.data = tree.map_structure(self._initialize_field, self.fields)

    self._pointer = 0
    self._size = 0
    self._samples_since_save = 0
def _preprocess_inputs(self, inputs):
    if self.preprocessors is None:
        preprocessors = tree.map_structure(lambda x: None, inputs)
    else:
        preprocessors = self.preprocessors

    preprocessed_inputs = apply_preprocessors(preprocessors, inputs)
    preprocessed_inputs = tf.keras.layers.Lambda(cast_and_concat)(
        preprocessed_inputs)

    return preprocessed_inputs
def test_save_latest_experience_with_overflown_pool(self):
    self.assertEqual(self.pool._samples_since_save, 0)

    num_samples = self.pool._max_size + 10
    samples = {
        'field1': np.arange(num_samples)[:, None],
        'field2': -np.arange(num_samples)[:, None] * 2,
    }
    self.pool.add_samples(samples)

    self.assertEqual(self.pool.size, self.pool._max_size)
    self.assertEqual(self.pool._samples_since_save, num_samples)

    self.pool.save_latest_experience('./tmp/pool_1.pkl')

    pool = create_pool(self.pool._max_size)
    self.assertEqual(pool.size, 0)

    import gzip
    with gzip.open('./tmp/pool_1.pkl', 'rb') as f:
        latest_samples = pickle.load(f)

    def assert_same_shape(field, data):
        expected_shape = (self.pool._max_size, *field.shape)
        self.assertEqual(data.shape, expected_shape)

    tree.map_structure(assert_same_shape, self.pool.fields, latest_samples)

    pool.load_experience('./tmp/pool_1.pkl')
    self.assertEqual(pool.size, self.pool._max_size)

    assert all(index_field in pool.fields.keys()
               for index_field in INDEX_FIELDS)

    def assert_field_data_shape(field_data, field_samples):
        np.testing.assert_array_equal(
            field_data, field_samples[-self.pool._max_size:])

    tree.map_structure(assert_field_data_shape, pool.data, samples)
def create_sequence_inputs(shapes, dtypes=None):
    """Creates `tf.keras.layers.Input`s usable for sequential models like RNN.

    Args:
        See `create_inputs`.

    Returns:
        inputs: nested structure, of same shape as `shapes`, containing
        `tf.keras.layers.Input`s, each with shape (None, ...).
    """
    shapes = tree.map_structure(
        lambda x: tf.TensorShape([None]) + x, shapes)
    sequence_inputs = create_inputs(shapes, dtypes)

    return sequence_inputs
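# The shape arithmetic above prepends an unknown-length time axis to every
# leaf shape before the inputs are created. A standalone sketch:
import tensorflow as tf
import tree

shapes = {'state': tf.TensorShape((7,))}
sequence_shapes = tree.map_structure(
    lambda x: tf.TensorShape([None]) + x, shapes)
# sequence_shapes['state'] == TensorShape([None, 7])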
def sample(self):
    if self._is_first_step:
        self.reset()

    action = self.policy.action(self._policy_input).numpy()

    next_observation, reward, terminal, info = self.environment.step(
        action)
    self._path_length += 1
    self._path_return += reward
    self._total_samples += 1

    processed_sample = self._process_sample(
        observation=self._current_observation,
        action=action,
        reward=reward,
        terminal=terminal,
        next_observation=next_observation,
        info=info,
    )

    self._current_path.append(processed_sample)

    if terminal or self._path_length >= self._max_path_length:
        last_path = tree.map_structure(
            lambda *x: np.stack(x, axis=0), *self._current_path)

        self.pool.add_path({
            key: value
            for key, value in last_path.items()
            if key != 'infos'
        })

        self._last_n_paths.appendleft(last_path)

        self._max_path_return = max(
            self._max_path_return, self._path_return)
        self._last_path_return = self._path_return

        self._n_episodes += 1

        self.pool.terminate_episode()

        self._is_first_step = True
        # Reset is done at the beginning of the next episode, see above.
    else:
        self._current_observation = next_observation
        self._is_first_step = False

    return next_observation, reward, terminal, info
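# The `lambda *x: np.stack(x, axis=0)` call above zips a list of per-step
# sample dicts leaf-by-leaf and stacks them into one dict of arrays, i.e. a
# path. A standalone sketch with two hypothetical steps:
import numpy as np
import tree

steps = [
    {'observation': np.array([0.0, 0.0]), 'reward': np.array([0.0])},
    {'observation': np.array([1.0, 1.0]), 'reward': np.array([0.5])},
]
path = tree.map_structure(lambda *x: np.stack(x, axis=0), *steps)
print(path['observation'].shape)  # (2, 2)
print(path['reward'].shape)       # (2, 1)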
def batch_by_indices(self,
                     indices,
                     field_name_filter=None,
                     validate_index=True):
    if validate_index and np.any(self.size <= indices % self._max_size):
        raise ValueError(
            "Tried to retrieve batch with indices greater than current"
            " size")

    if field_name_filter is not None:
        raise NotImplementedError("TODO(hartikainen)")

    batch = tree.map_structure(
        lambda field: field[indices % self._max_size], self.data)
    return batch
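# The `indices % self._max_size` indexing above treats the pool as a ring
# buffer, so indices past the physical end wrap around. A standalone sketch:
import numpy as np
import tree

max_size = 4
data = {'field1': np.arange(max_size)[:, None]}
indices = np.array([1, 5])  # 5 wraps around to 1
batch = tree.map_structure(lambda field: field[indices % max_size], data)
# batch['field1'] == [[1], [1]]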
def test_serialize_deserialize_empty(self):
    self.assertEqual(self.pool._size, 0)

    tree.map_structure(
        lambda field_data: np.testing.assert_array_equal(field_data, 0.0),
        self.pool.data)

    serialized = pickle.dumps(self.pool)
    deserialized = pickle.loads(serialized)

    for key in deserialized.__dict__:
        if key == 'data':
            for field_name in self.pool.__dict__[key]:
                np.testing.assert_array_equal(
                    self.pool.__dict__[key][field_name],
                    deserialized.__dict__[key][field_name])
        else:
            np.testing.assert_array_equal(
                self.pool.__dict__[key], deserialized.__dict__[key])

    self.assertNotEqual(id(self.pool), id(deserialized))
    self.assertEqual(deserialized._size, 0)

    for field_name, field_attrs in self.pool.fields.items():
        np.testing.assert_array_equal(
            self.pool.fields[field_name], deserialized.fields[field_name])
def REPLACE_FULL_OBSERVATION(original_batch,
                             resampled_batch,
                             where_resampled,
                             environment):
    def replace_original_with_resampled(original_goals, resampled_goals):
        np.testing.assert_equal(
            original_goals[where_resampled].shape, resampled_goals.shape)
        new_goals = original_goals.copy()
        new_goals[where_resampled] = resampled_goals.copy()
        return new_goals

    new_batch = original_batch.copy()
    new_goals = tree.map_structure(
        replace_original_with_resampled,
        original_batch['goals'],
        resampled_batch['goals'])
    new_batch['goals'] = new_goals

    return new_batch
def create_inputs(shapes, dtypes=None):
    """Creates `tf.keras.layers.Input`s based on input shapes.

    Args:
        shapes: (possibly nested) list/array/dict structure of input
            shapes.
        dtypes: (possibly nested) structure of dtypes matching `shapes`.

    Returns:
        inputs: nested structure, of same shape as `shapes`, containing
        `tf.keras.layers.Input`s.

    TODO(hartikainen): Need to figure out a better way for handling the
    dtypes.
    """
    if dtypes is None:
        dtypes = tree.map_structure(lambda _: None, shapes)
    inputs = tree.map_structure_with_path(create_input, shapes, dtypes)

    return inputs
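# Hypothetical usage of `create_inputs`, assuming the `create_input` helper
# it calls (not shown here) builds one `tf.keras.layers.Input` per leaf; the
# observation keys and shapes below are made up for illustration:
import tensorflow as tf

shapes = {
    'pixels': tf.TensorShape((32, 32, 3)),
    'state': tf.TensorShape((7,)),
}
inputs = create_inputs(shapes)
# inputs['pixels'].shape == (None, 32, 32, 3)
# inputs['state'].shape == (None, 7)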
def test_actions_and_log_probs(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = type(observation1_np)((
        (key, np.stack((
            observation1_np[key], observation2_np[key]
        ), axis=0).astype(np.float32))
        for key in observation1_np.keys()
    ))
    observations_tf = tree.map_structure(
        lambda x: tf.constant(x, dtype=x.dtype), observations_np)

    for observations in (observations_np, observations_tf):
        actions = self.policy.actions(observations)
        log_pis = self.policy.log_probs(observations, actions)
        self.assertEqual(actions.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis.shape, (2, 1))
def _do_training_repeats(self, timestep):
    """Repeat training `_n_train_repeat` times every `_train_every_n_steps`."""
    if timestep % self._train_every_n_steps > 0:
        return

    trained_enough = (
        self._train_steps_this_epoch
        > self._max_train_repeat_per_timestep * self._timestep)
    if trained_enough:
        return

    diagnostics = [
        self._do_training(iteration=timestep, batch=self._training_batch())
        for i in range(self._n_train_repeat)
    ]

    diagnostics = tree.map_structure(
        lambda *d: tf.reduce_mean(d).numpy(), *diagnostics)

    self._num_train_steps += self._n_train_repeat
    self._train_steps_this_epoch += self._n_train_repeat

    return diagnostics
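# The closing `tree.map_structure(lambda *d: ...)` call above zips the
# per-repeat diagnostic dicts leaf-by-leaf and averages them. A standalone
# sketch with made-up diagnostic keys:
import tensorflow as tf
import tree

diagnostics = [
    {'Q_loss': tf.constant(0.5), 'policy_loss': tf.constant(1.0)},
    {'Q_loss': tf.constant(1.5), 'policy_loss': tf.constant(3.0)},
]
mean_diagnostics = tree.map_structure(
    lambda *d: tf.reduce_mean(d).numpy(), *diagnostics)
# {'Q_loss': 1.0, 'policy_loss': 2.0}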
def __init__(self,
             input_shapes,
             output_shape,
             observation_keys=None,
             preprocessors=None,
             name='policy'):
    self._input_shapes = input_shapes
    self._output_shape = output_shape
    self._observation_keys = observation_keys
    self._inputs = create_inputs(input_shapes)

    if preprocessors is None:
        preprocessors = tree.map_structure(lambda x: None, input_shapes)

    preprocessors = tree.map_structure_up_to(
        input_shapes, preprocessors_lib.deserialize, preprocessors)

    self._preprocessors = preprocessors

    self._name = name
def get_diagnostics_np(self, *args, **kwargs):
    diagnostics = self.get_diagnostics(*args, **kwargs)
    diagnostics_np = tree.map_structure(lambda x: x.numpy(), diagnostics)
    return diagnostics_np
def test_sequence_overlaps_two_episodes(self):
    sequence_length = self.pool._max_size
    path_lengths = [
        self.pool._max_size // 2 - 5,
        (self.pool._max_size // 2) - 3,
        8,
    ]
    samples = {
        'field1': np.arange(self.pool._max_size)[:, None],
        'field2': -np.arange(self.pool._max_size)[::-1, None] * 2,
    }

    for path_end, path_length in zip(
            np.cumsum(path_lengths), path_lengths):
        self.pool.add_path(
            tree.map_structure(
                lambda s: s[path_end - path_length:path_end], samples))

    batch_indices = np.array([0, *np.cumsum(path_lengths), -2, -1])
    batch = self.pool.sequence_batch_by_indices(
        batch_indices, sequence_length=sequence_length)

    self.assertIn('mask', batch)
    self.assertTrue(
        all(index_field in batch.keys() for index_field in INDEX_FIELDS))

    for key, values in batch.items():
        if key == 'mask':
            self.assertEqual(
                values.shape, (batch_indices.size, sequence_length))
        else:
            self.assertEqual(
                values.shape, (batch_indices.size, sequence_length, 1))

    for i, (episode_start_index, episode_length) in enumerate(
            zip((0, *np.cumsum(path_lengths)[:-1]), path_lengths)):
        np.testing.assert_equal(
            ~batch['mask'][i][:-1] * batch['field1'][i][:-1], 0)
        np.testing.assert_equal(
            ~batch['mask'][i][-1] * batch['field1'][i][-1],
            samples['field1'][episode_start_index])
        np.testing.assert_equal(
            ~batch['mask'][i][:-1] * batch['field2'][i][:-1], 0)
        np.testing.assert_equal(
            ~batch['mask'][i][-1] * batch['field2'][i][-1],
            samples['field2'][episode_start_index])
        np.testing.assert_equal(
            ~batch['mask'][i, :, None]
            * batch['episode_index_forwards'][i],
            0)
        np.testing.assert_equal(
            ~batch['mask'][i, :, None][:-1]
            * batch['episode_index_backwards'][i][:-1],
            0)
        np.testing.assert_equal(
            ~batch['mask'][i, :, None][-1]
            * batch['episode_index_backwards'][i][-1],
            episode_length - 1)

    np.testing.assert_equal(
        ~batch['mask'][-2, :-path_lengths[-1] + 1, None]
        * batch['field1'][-2][:-path_lengths[-1] + 1],
        0)
    np.testing.assert_equal(
        ~batch['mask'][-2, -path_lengths[-1] + 1:, None]
        * batch['field1'][-2][-path_lengths[-1] + 1:],
        samples['field1'][-path_lengths[-1]:-1])
    np.testing.assert_equal(
        ~batch['mask'][-2, :-path_lengths[-1] + 1, None]
        * batch['field2'][-2][:-path_lengths[-1] + 1],
        0)
    np.testing.assert_equal(
        ~batch['mask'][-2, -path_lengths[-1] + 1:, None]
        * batch['field2'][-2][-path_lengths[-1] + 1:],
        samples['field2'][-path_lengths[-1]:-1])

    np.testing.assert_equal(
        ~batch['mask'][-1, :-path_lengths[-1], None]
        * batch['field1'][-1][:-path_lengths[-1]],
        0)
    np.testing.assert_equal(
        ~batch['mask'][-1, -path_lengths[-1]:, None]
        * batch['field1'][-1][-path_lengths[-1]:],
        samples['field1'][-path_lengths[-1]:])
    np.testing.assert_equal(
        ~batch['mask'][-1, :-path_lengths[-1], None]
        * batch['field2'][-1][:-path_lengths[-1]],
        0)
    np.testing.assert_equal(
        ~batch['mask'][-1, -path_lengths[-1]:, None]
        * batch['field2'][-1][-path_lengths[-1]:],
        samples['field2'][-path_lengths[-1]:])
def add_sample(self, sample):
    # Prepend a batch axis so the single sample becomes a batch of one,
    # matching the (num_samples, *field.shape) layout `add_samples` expects.
    samples = tree.map_structure(lambda x: x[np.newaxis, ...], sample)
    self.add_samples(samples)
def cast_and_concat(x):
    x = tree.map_structure(lambda element: tf.cast(element, tf.float32), x)
    x = tree.flatten(x)
    x = tf.concat(x, axis=-1)
    return x
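# Standalone sketch of `cast_and_concat` collapsing a nested structure into
# a single float32 tensor along the last axis:
import tensorflow as tf

x = {'a': tf.zeros((2, 3), tf.float64), 'b': tf.ones((2, 2), tf.int32)}
out = cast_and_concat(x)
print(out.shape, out.dtype)  # (2, 5) <dtype: 'float32'>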
def _train(self):
    """Return a generator that performs RL training.

    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy (`Policy`): Policy used for training.
        pool (`PoolBase`): Sample pool to add samples to.
    """
    training_environment = self._training_environment
    evaluation_environment = self._evaluation_environment
    policy = self._policy

    gt.reset_root()
    gt.rename_root('RLAlgorithm')
    gt.set_def_unique(False)

    self._training_before_hook()

    for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
        self._epoch_before_hook()
        gt.stamp('epoch_before_hook')

        update_diagnostics = []

        start_samples = self.sampler._total_samples
        for i in count():
            samples_now = self.sampler._total_samples
            self._timestep = samples_now - start_samples

            if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                break

            self._timestep_before_hook()
            gt.stamp('timestep_before_hook')

            self._do_sampling(timestep=self._total_timestep)
            gt.stamp('sample')

            if self.ready_to_train:
                update_diagnostics.append(self._do_training_repeats(
                    timestep=self._total_timestep))
            gt.stamp('train')

            self._timestep_after_hook()
            gt.stamp('timestep_after_hook')

        update_diagnostics = tree.map_structure(
            lambda *d: np.mean(d), *update_diagnostics)

        training_paths = self.sampler.get_last_n_paths(
            math.ceil(self._epoch_length / self.sampler._max_path_length))
        gt.stamp('training_paths')
        evaluation_paths = self._evaluation_paths(
            policy, evaluation_environment)
        gt.stamp('evaluation_paths')

        training_metrics = self._evaluate_rollouts(
            training_paths,
            training_environment,
            self._total_timestep,
            evaluation_type='train')
        gt.stamp('training_metrics')

        if evaluation_paths:
            evaluation_metrics = self._evaluate_rollouts(
                evaluation_paths,
                evaluation_environment,
                self._total_timestep,
                evaluation_type='evaluation')
            gt.stamp('evaluation_metrics')
        else:
            evaluation_metrics = {}

        self._epoch_after_hook(training_paths)
        gt.stamp('epoch_after_hook')

        sampler_diagnostics = self.sampler.get_diagnostics()

        diagnostics = self.get_diagnostics(
            iteration=self._total_timestep,
            batch=self._evaluation_batch(),
            training_paths=training_paths,
            evaluation_paths=evaluation_paths)

        time_diagnostics = {
            key: times[-1]
            for key, times in gt.get_times().stamps.itrs.items()
        }

        # TODO(hartikainen/tf2): Fix the naming of training/update
        # diagnostics/metrics.
        diagnostics.update((
            ('evaluation', evaluation_metrics),
            ('training', training_metrics),
            ('update', update_diagnostics),
            ('times', time_diagnostics),
            ('sampler', sampler_diagnostics),
            ('epoch', self._epoch),
            ('timestep', self._timestep),
            ('total_timestep', self._total_timestep),
            ('num_train_steps', self._num_train_steps),
        ))

        if self._eval_render_kwargs and hasattr(
                evaluation_environment, 'render_rollouts'):
            # TODO(hartikainen): Make this consistent such that there's
            # no need for the hasattr check.
            training_environment.render_rollouts(evaluation_paths)

        yield diagnostics

    self.sampler.terminate()

    self._training_after_hook()

    yield {'done': True, **diagnostics}