def test_action_generator():
    """Verify create_empty_action / create_random_action for both action types.

    Continuous specs must yield float32 arrays in [-1, 1]; discrete specs must
    yield int32 arrays with each column bounded by its branch size.
    """
    # --- Continuous actions ---
    continuous_size = 30
    behavior_spec = BehaviorSpec(
        observation_shapes=[(5,)],
        action_type=ActionType.CONTINUOUS,
        action_shape=continuous_size,
    )
    empty = behavior_spec.create_empty_action(4)
    assert np.array_equal(empty, np.zeros((4, continuous_size), dtype=np.float32))
    sampled = behavior_spec.create_random_action(4)
    assert sampled.dtype == np.float32
    assert sampled.shape == (4, continuous_size)
    # Continuous samples are expected to live in the [-1, 1] range.
    assert np.min(sampled) >= -1
    assert np.max(sampled) <= 1

    # --- Discrete actions ---
    branch_sizes = (10, 20, 30)
    behavior_spec = BehaviorSpec(
        observation_shapes=[(5,)],
        action_type=ActionType.DISCRETE,
        action_shape=branch_sizes,
    )
    empty = behavior_spec.create_empty_action(4)
    assert np.array_equal(empty, np.zeros((4, len(branch_sizes)), dtype=np.int32))
    sampled = behavior_spec.create_random_action(4)
    assert sampled.dtype == np.int32
    assert sampled.shape == (4, len(branch_sizes))
    assert np.min(sampled) >= 0
    # Each branch's samples must stay strictly below that branch's size.
    for branch, size in enumerate(branch_sizes):
        assert np.max(sampled[:, branch]) < size
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    """Build an AgentBuffer containing ``number`` copies of one transition.

    A single random observation pair and action are drawn once from
    ``behavior_spec`` and repeated for every step, with ``done`` fixed at 0,
    ``masks`` fixed at 1, and the given ``reward`` on every step.
    """
    buffer = AgentBuffer()
    # Draw one fixed observation pair and one action; every appended
    # transition reuses these same values.
    curr_obs = [
        np.random.normal(size=shape) for shape in behavior_spec.observation_shapes
    ]
    next_obs = [
        np.random.normal(size=shape) for shape in behavior_spec.observation_shapes
    ]
    action = behavior_spec.create_random_action(1)[0, :]
    for _ in range(number):
        curr_split = SplitObservations.from_observations(curr_obs)
        next_split = SplitObservations.from_observations(next_obs)
        visual_pairs = zip(
            curr_split.visual_observations, next_split.visual_observations
        )
        for idx, (vis, next_vis) in enumerate(visual_pairs):
            buffer["visual_obs%d" % idx].append(vis)
            buffer["next_visual_obs%d" % idx].append(next_vis)
        buffer["vector_obs"].append(curr_split.vector_observations)
        buffer["next_vector_in"].append(next_split.vector_observations)
        buffer["actions"].append(action)
        buffer["done"].append(np.zeros(1, dtype=np.float32))
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    return buffer