def testBatchSizeProvided(self, metric_class, expected_result):
  metric = metric_class(batch_size=2)
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.first((), (), (), 1., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.mid((), (), (), 2., 1.),
          trajectory.last((), (), (), 3., 0.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.last((), (), (), 3., 0.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  self.assertEqual(metric.result(), expected_result)
def _create_misaligned_trajectories(self):

  def _concat_nested_tensors(nest1, nest2):
    return tf.nest.map_structure(lambda t1, t2: tf.concat([t1, t2], axis=0),
                                 nest1, nest2)

  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  ts1 = _concat_nested_tensors(
      trajectory.first((), tf.constant([2]), (),
                       tf.constant([1.], dtype=tf.float32), [1.]),
      trajectory.boundary((), tf.constant([1]), (),
                          tf.constant([0.], dtype=tf.float32), [1.]))
  ts2 = _concat_nested_tensors(
      trajectory.last((), tf.constant([1]), (),
                      tf.constant([3.], dtype=tf.float32), [1.]),
      trajectory.first((), tf.constant([1]), (),
                       tf.constant([2.], dtype=tf.float32), [1.]))
  ts3 = _concat_nested_tensors(
      trajectory.boundary((), tf.constant([2]), (),
                          tf.constant([0.], dtype=tf.float32), [1.]),
      trajectory.last((), tf.constant([1]), (),
                      tf.constant([4.], dtype=tf.float32), [1.]))
  return [ts1, ts2, ts3]
def setUp(self):
  super(BatchedPyMetricTest, self).setUp()
  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  ts0 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts1 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 1., 1.),
      trajectory.first((), (), (), 2., 1.)
  ])
  ts2 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 3., 1.),
      trajectory.last((), (), (), 4., 1.)
  ])
  ts3 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts4 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 5., 1.),
      trajectory.first((), (), (), 6., 1.)
  ])
  ts5 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 7., 1.),
      trajectory.last((), (), (), 8., 1.)
  ])
  self._ts = [ts0, ts1, ts2, ts3, ts4, ts5]
def _create_trajectories(self):
  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  ts0 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts1 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 1., 1.),
      trajectory.first((), (), (), 2., 1.)
  ])
  ts2 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 3., 1.),
      trajectory.last((), (), (), 4., 1.)
  ])
  ts3 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts4 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 5., 1.),
      trajectory.first((), (), (), 6., 1.)
  ])
  ts5 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 7., 1.),
      trajectory.last((), (), (), 8., 1.)
  ])
  return [ts0, ts1, ts2, ts3, ts4, ts5]
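# Hedged illustration, not part of the original tests: stacking two
# single-step trajectories with nest_utils.stack_nested_tensors (as the
# fixtures above do) yields one trajectory whose fields gain a leading batch
# dimension of 2.
from tf_agents.trajectories import trajectory
from tf_agents.utils import nest_utils

batched = nest_utils.stack_nested_tensors([
    trajectory.first((), (), (), 1., 1.),
    trajectory.first((), (), (), 2., 1.)
])
# batched.reward should be a rank-1 tensor of shape [2] holding [1., 2.].
print(batched.reward)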
def setUp(self):
  super(PyDriverTest, self).setUp()
  f0 = np.array(0., dtype=np.float32)
  f1 = np.array(1., dtype=np.float32)
  # Order of args for trajectory methods:
  # (observation, action, policy_info, reward, discount)
  self._trajectories = [
      trajectory.first(0, 1, 2, f1, f1),
      trajectory.last(1, 2, 4, f1, f0),
      trajectory.boundary(3, 1, 2, f0, f1),
      trajectory.first(0, 1, 2, f1, f1),
      trajectory.last(1, 2, 4, f1, f0),
      trajectory.boundary(3, 1, 2, f0, f1),
      trajectory.first(0, 1, 2, f1, f1),
  ]
def testZeroEpisodes(self, metric_class, expected_result):
  metric = metric_class()
  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  metric(trajectory.boundary((), (), (), 0., 1.))
  metric(trajectory.first((), (), (), 1., 1.))
  self.assertEqual(expected_result, metric.result())
def testAverageTwoEpisode(self, metric_class, expected_result):
  metric = metric_class()
  metric(trajectory.boundary((), (), (), 0., 1.))
  metric(trajectory.first((), (), (), 1., 1.))
  metric(trajectory.mid((), (), (), 2., 1.))
  metric(trajectory.last((), (), (), 3., 0.))
  metric(trajectory.boundary((), (), (), 0., 1.))
  # TODO(kbanoop): Add optional next_step_type arg to trajectory.first. Or
  # implement trajectory.first_last().
  metric(
      trajectory.Trajectory(ts.StepType.FIRST, (), (), (), ts.StepType.LAST,
                            -6., 1.))
  self.assertEqual(expected_result, metric.result())
def testAverageOneEpisode(self, metric_class, expected_result):
  metric = metric_class()
  metric(trajectory.boundary((), (), (), 0., 1.))
  metric(trajectory.mid((), (), (), 1., 1.))
  metric(trajectory.mid((), (), (), 2., 1.))
  metric(trajectory.last((), (), (), 3., 0.))
  self.assertEqual(expected_result, metric.result())
def _get_padding_step(
    self, example_trajectory: trajectory_lib.Trajectory
) -> trajectory_lib.Trajectory:
  """Gets the padding step to append to the cache."""
  # Build a boundary step whose fields mirror the structure of the example
  # trajectory but are filled with zeros.
  zero_step = trajectory_lib.boundary(
      tf.nest.map_structure(tf.zeros_like, example_trajectory.observation),
      tf.nest.map_structure(tf.zeros_like, example_trajectory.action),
      tf.nest.map_structure(tf.zeros_like, example_trajectory.policy_info),
      tf.nest.map_structure(tf.zeros_like, example_trajectory.reward),
      tf.nest.map_structure(tf.zeros_like, example_trajectory.discount),
  )
  return zero_step
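# Hedged usage sketch for the helper above; the example trajectory built here
# is illustrative and not taken from the original code. A zero-filled boundary
# step mirrors the structure and dtypes of whatever trajectory it pads.
import tensorflow as tf
from tf_agents.trajectories import trajectory as trajectory_lib

example = trajectory_lib.last(
    tf.constant([1., 2.]),  # observation
    tf.constant([1]),       # action
    (),                     # policy_info
    tf.constant(3.),        # reward
    tf.constant(1.))        # discount
padding = trajectory_lib.boundary(
    tf.nest.map_structure(tf.zeros_like, example.observation),
    tf.nest.map_structure(tf.zeros_like, example.action),
    tf.nest.map_structure(tf.zeros_like, example.policy_info),
    tf.nest.map_structure(tf.zeros_like, example.reward),
    tf.nest.map_structure(tf.zeros_like, example.discount))
# Every field of `padding` is zero-filled, and the step is marked as an
# episode boundary.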
def testSaveRestore(self):
  metrics = [
      py_metrics.AverageReturnMetric(),
      py_metrics.AverageEpisodeLengthMetric(),
      py_metrics.EnvironmentSteps(),
      py_metrics.NumberOfEpisodes()
  ]
  for metric in metrics:
    metric(trajectory.boundary((), (), (), 0., 1.))
    metric(trajectory.mid((), (), (), 1., 1.))
    metric(trajectory.mid((), (), (), 2., 1.))
    metric(trajectory.last((), (), (), 3., 0.))

  checkpoint = tf.train.Checkpoint(**{m.name: m for m in metrics})
  prefix = self.get_temp_dir() + '/ckpt'
  save_path = checkpoint.save(prefix)

  for metric in metrics:
    metric.reset()
    self.assertEqual(0, metric.result())

  checkpoint.restore(save_path).assert_consumed()
  for metric in metrics:
    self.assertGreater(metric.result(), 0)
# Training loop.
# Assumed context (hedged): this snippet relies on objects created earlier in
# the enclosing script -- a Cloud Bigtable `table`, a TF-Agents `tf_agent`,
# and the generated `Trajectory`/`Info` protobuf classes -- plus imports along
# the lines of:
#   import numpy as np
#   import tensorflow as tf
#   from tqdm import tqdm
#   from google.cloud.bigtable import row_filters
#   from tf_agents.trajectories import trajectory
for i in tqdm(range(5000), "Training"):
  # Query the table for the row(s) keyed cartpole_trajectory_<i>.
  regex_filter = '^cartpole_trajectory_{}$'.format(i)
  row_filter = row_filters.RowKeyRegexFilter(regex_filter)
  filtered_rows = table.read_rows(filter_=row_filter)
  for row in filtered_rows:
    # Deserialize the stored trajectory and its metadata.
    bytes_traj = row.cells['trajectory']['traj'.encode()][0].value
    bytes_info = row.cells['trajectory']['info'.encode()][0].value
    traj, info = Trajectory(), Info()
    traj.ParseFromString(bytes_traj)
    info.ParseFromString(bytes_info)

    # Reshape the flat observation buffer to [num_steps, *vector_obs_spec] and
    # pair each step with a rolled copy to form two-step transitions.
    traj_shape = np.append(
        np.array(info.num_steps), np.array(info.vector_obs_spec))
    observations = np.array(traj.vector_obs).reshape(traj_shape)
    traj_obs = np.rollaxis(
        np.array([observations, np.roll(observations, 1)]), 0, 2)
    traj_actions = np.rollaxis(
        np.array([traj.actions, np.roll(traj.actions, 1)]), 0, 2)
    traj_rewards = np.rollaxis(
        np.array([traj.rewards, np.roll(traj.rewards, 1)]), 0, 2)
    traj_discounts = np.ones((info.num_steps, 2))

    traj_obs = tf.constant(traj_obs, dtype=tf.float32)
    traj_actions = tf.constant(traj_actions, dtype=tf.int32)
    policy_info = ()
    traj_rewards = tf.constant(traj_rewards, dtype=tf.float32)
    traj_discounts = tf.constant(traj_discounts, dtype=tf.float32)

    # Package the batch as a TF-Agents Trajectory and take one train step.
    # (Renamed from `traj` to avoid shadowing the protobuf above.)
    experience = trajectory.boundary(traj_obs, traj_actions, policy_info,
                                     traj_rewards, traj_discounts)
    train_loss = tf_agent.train(experience)

# Save the trained Q-network weights (note: reaches into a private attribute).
tf_agent._q_network.save_weights("cartpole_model.h5")