def test_handles_done_in_substep_with_recorded_episode_stats():
    """Recording episode stats and handling environments that return done during a (non-last) sub-step."""
    env = build_dummy_structured_env()
    env = _FiveSubstepsLimitWrapper.wrap(env)
    policy = RandomPolicy(env.action_spaces_dict)

    # -- Normal operation (should reset the env automatically and continue the rollout) --
    rollout_generator = RolloutGenerator(env=env, record_episode_stats=True)
    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 10

    # The done step records should have data for the first sub-step only
    dones = 0
    for step_record in trajectory.step_records:
        if step_record.is_done():
            assert [0] == list(step_record.observations_dict.keys())
            dones += 1
            assert step_record.episode_stats is not None
        else:
            assert [0, 1] == list(step_record.observations_dict.keys())
            assert step_record.episode_stats is None
    assert dones == 3  # Each episode is done after 5 sub-steps, i.e. 3 structured steps get recorded => 3 episodes fit

    # -- Terminate on done --
    rollout_generator = RolloutGenerator(env=env, terminate_on_done=True)
    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 3
    assert trajectory.is_done()
    assert [0] == list(trajectory.step_records[-1].observations_dict.keys())
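
# When a trajectory spans several episodes (the generator resets the env
# automatically), the per-record is_done() flag is what separates them. A
# minimal sketch of counting episodes this way, using only the record API
# exercised above (the helper itself is hypothetical, not part of the suite):
def count_episodes(trajectory) -> int:
    """Hypothetical helper: number of completed episodes in a recorded trajectory."""
    return sum(1 for record in trajectory.step_records if record.is_done())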
def test_records_next_observations():
    """Recording next observations."""
    env = build_dummy_structured_env()
    rollout_generator = RolloutGenerator(env=env, record_next_observations=True)
    policy = RandomPolicy(env.action_spaces_dict)

    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 10

    sub_step_keys = env.action_spaces_dict.keys()
    last_next_obs = None
    for record in trajectory.step_records:
        assert sub_step_keys == record.observations_dict.keys()
        assert sub_step_keys == record.next_observations_dict.keys()
        assert record.batch_shape is None

        for step_key in sub_step_keys:
            curr_obs = record.observations_dict[step_key]

            # Next obs from the previous sub-step should be equal to the current observation
            if last_next_obs:
                assert list(curr_obs.keys()) == list(last_next_obs.keys())
                for obs_key in curr_obs.keys():
                    assert np.all(curr_obs[obs_key] == last_next_obs[obs_key])

            last_next_obs = record.next_observations_dict[step_key]
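
# Recording next observations makes (observation, action, reward, next_observation)
# transitions recoverable from a trajectory, e.g. for off-policy training. A minimal
# sketch over the record fields asserted above (the helper itself is hypothetical):
def iter_transitions(trajectory, step_key=0):
    """Hypothetical helper: yield (obs, action, reward, next_obs) tuples for one sub-step key."""
    for record in trajectory.step_records:
        yield (record.observations_dict[step_key],
               record.actions_dict[step_key],
               record.rewards_dict[step_key],
               record.next_observations_dict[step_key])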
def run_observation_normalization_pipeline(normalization_config) -> ObservationNormalizationWrapper:
    """Helper: wraps CartPole in observation normalization, estimates statistics, and sanity-checks them."""
    # wrap env with observation normalization
    env = GymMazeEnv("CartPole-v0")
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        exclude=normalization_config["exclude"],
        sampling_policy=RandomPolicy(env.action_spaces_dict),
        manual_config=normalization_config["manual_config"])

    # estimate normalization statistics
    statistics = obtain_normalization_statistics(env, n_samples=1000)

    # check statistics
    for sub_step_key in env.observation_spaces_dict:
        for obs_key in env.observation_spaces_dict[sub_step_key].spaces:
            assert obs_key in statistics
            for stats_key in statistics[obs_key]:
                stats = statistics[obs_key][stats_key]
                assert isinstance(stats, np.ndarray)

    # test normalization
    random_env_steps(env, steps=100)

    return env
def test_random_sampling_seeding():
    """Test seeding with a stochastic env and random action sampling (fully stochastic)."""
    env = GymMazeEnv(env="CartPole-v0")
    policy = RandomPolicy(env.action_spaces_dict)
    perform_seeding_test(env, policy, is_deterministic_env=False, is_deterministic_agent=False)
def test_terminates_on_done():
    """Resetting the env or terminating the rollout early when the env is done."""
    env = build_dummy_maze_env()
    env = TimeLimitWrapper.wrap(env, max_episode_steps=5)
    policy = RandomPolicy(env.action_spaces_dict)

    # Normal operation (should reset the env automatically and continue the rollout)
    rollout_generator = RolloutGenerator(env=env)
    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 10

    # Terminate on done
    rollout_generator = RolloutGenerator(env=env, terminate_on_done=True)
    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 5
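
# With terminate_on_done=True the generator stops at the first episode end, so a
# single episode can be collected by simply over-requesting steps. A minimal
# sketch using only the API exercised above (the step cap of 1000 is an
# arbitrary assumption, not part of the tests):
def collect_one_episode(env, policy):
    """Hypothetical helper: roll out until the first done flag (up to a step cap)."""
    generator = RolloutGenerator(env=env, terminate_on_done=True)
    return generator.rollout(policy, n_steps=1000)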
def test_observation_normalization_manual_default_stats():
    """Observation normalization with manually provided default statistics."""
    # init environment
    env = GymMazeEnv("CartPole-v0")

    # normalization config
    normalization_config = {
        "default_strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
        "default_strategy_config": {"clip_range": (0, 1), "axis": 0},
        "default_statistics": {"mean": [0, 0, 0, 0], "std": [1, 1, 1, 1]},
        "statistics_dump": "statistics.pkl",
        "sampling_policy": RandomPolicy(env.action_spaces_dict),
        "exclude": None,
        "manual_config": None,
    }

    # wrap env with observation normalization
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=normalization_config["sampling_policy"],
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])

    # check if observation space clipping was applied
    assert np.all(env.observation_space["observation"].high <= 1.0)
    assert np.all(env.observation_space["observation"].low >= 0.0)

    # check if stats have been set properly
    statistics = env.get_statistics()
    assert np.all(statistics["observation"]["mean"] == np.zeros(shape=4))
    assert np.all(statistics["observation"]["std"] == np.ones(shape=4))

    # test sampling
    obs = random_env_steps(env, steps=100)
    assert np.min(obs) >= 0 and np.max(obs) <= 1
def test_works_with_gym_maze_envs():
    """Agent deployment works with Gym-backed Maze envs."""
    env = GymMazeEnv("CartPole-v0")
    policy = RandomPolicy(action_spaces_dict=env.action_spaces_dict)
    agent_deployment = AgentDeployment(
        policy=policy,
        env=env
    )

    external_env = gym.make("CartPole-v0")
    maze_state = external_env.reset()
    reward, done, info = 0, False, {}
    for i in range(10):
        maze_action = agent_deployment.act(maze_state, reward, done, info)
        maze_state, reward, done, info = external_env.step(maze_action)
    agent_deployment.close(maze_state, reward, done, info)
def test_standard_rollout():
    """Rollout with a single structured env."""
    env = build_dummy_structured_env()
    rollout_generator = RolloutGenerator(env=env)
    policy = RandomPolicy(env.action_spaces_dict)

    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 10

    sub_step_keys = env.action_spaces_dict.keys()
    for record in trajectory.step_records:
        assert sub_step_keys == record.actions_dict.keys()
        assert sub_step_keys == record.observations_dict.keys()
        assert sub_step_keys == record.rewards_dict.keys()
        assert record.batch_shape is None

        for step_key in sub_step_keys:
            assert record.observations_dict[step_key] in env.observation_spaces_dict[step_key]
            assert record.actions_dict[step_key] in env.action_spaces_dict[step_key]
def test_custom_model_composer_with_shared_embedding():
    """Custom model composer with shared embedding networks for policy and step critic."""
    env = build_dummy_structured_env()

    policies = {
        "_target_": "maze.perception.models.policies.ProbabilisticPolicyComposer",
        "networks": [
            {"_target_": "maze.perception.models.built_in.flatten_concat_shared_embedding.FlattenConcatSharedEmbeddingPolicyNet",
             "non_lin": "torch.nn.SELU",
             "hidden_units": [16],
             "head_units": [16]},
            {"_target_": "maze.perception.models.built_in.flatten_concat_shared_embedding.FlattenConcatSharedEmbeddingPolicyNet",
             "non_lin": "torch.nn.SELU",
             "hidden_units": [16],
             "head_units": [16]}
        ],
        "substeps_with_separate_agent_nets": []
    }

    step_critic = {
        "_target_": "maze.perception.models.critics.StepStateCriticComposer",
        "networks": [
            {"_target_": "maze.perception.models.built_in.flatten_concat_shared_embedding.FlattenConcatSharedEmbeddingStateValueNet",
             "non_lin": "torch.nn.SELU",
             "head_units": [16]},
            {"_target_": "maze.perception.models.built_in.flatten_concat_shared_embedding.FlattenConcatSharedEmbeddingStateValueNet",
             "non_lin": "torch.nn.SELU",
             "head_units": [16]}
        ]
    }

    # check if model config is fine
    CustomModelComposer.check_model_config({"critic": step_critic})

    composer = CustomModelComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict=env.agent_counts_dict,
        distribution_mapper_config=[],
        policy=policies,
        critic=step_critic)

    assert isinstance(composer.distribution_mapper, DistributionMapper)
    assert isinstance(composer.critic, TorchStepStateCritic)
    assert isinstance(composer.critic.networks, dict)

    # test saving models
    composer.save_models()
    try:
        import pygraphviz
        for model_file in ["critic_0.pdf", "critic_1.pdf", "policy_0.pdf", "policy_1.pdf"]:
            file_path = os.path.join(os.getcwd(), model_file)
            assert os.path.exists(file_path)
            os.remove(file_path)
    except ImportError:
        pass  # no output generated as pygraphviz is not installed

    rollout_generator = RolloutGenerator(env=env, record_next_observations=False)
    policy = RandomPolicy(env.action_spaces_dict)
    trajectory = rollout_generator.rollout(policy, n_steps=10).stack().to_torch(device='cpu')

    policy_output = composer.policy.compute_policy_output(trajectory)
    critic_input = StateCriticInput.build(policy_output, trajectory)
    _ = composer.critic.predict_values(critic_input)
def test_redistributes_actor_reward_if_available():
    """Actor rewards returned by the structured core env are redistributed across the step record."""
    env = build_dummy_maze_env_with_structured_core_env()
    rollout_generator = RolloutGenerator(env=env)
    policy = RandomPolicy(env.action_spaces_dict)

    trajectory = rollout_generator.rollout(policy, n_steps=1)
    assert np.all(trajectory.step_records[0].rewards == [1, 1])
def build_single_step_with_critic_type(
        critics_composer_type: type(BaseStateCriticComposer),
        critics_type: type(TorchStateCritic),
        shared_embedding_keys: Optional[Union[List[str], Dict[StepKeyType, List[str]]]]):
    """Helper: builds a single-step template model with the given critic type and checks it end to end."""
    # init environment
    env = GymMazeEnv('CartPole-v0')
    observation_space = env.observation_space
    action_space = env.action_space

    # map observations to a modality
    obs_modalities = {obs_key: "feature" for obs_key in observation_space.spaces.keys()}

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [32, 32],
            "non_lin": "torch.nn.ReLU"
        }
    }
    modality_config["hidden"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [64],
            "non_lin": "torch.nn.ReLU"
        }
    }
    modality_config["recurrence"] = {}

    model_builder = {
        '_target_': 'maze.perception.builders.concat.ConcatModelBuilder',
        'modality_config': modality_config,
        'observation_modality_mapping': obs_modalities,
        'shared_embedding_keys': shared_embedding_keys
    }

    # initialize default model builder (`policy_composer_type` is defined at module level, not shown here)
    default_builder = TemplateModelComposer(
        action_spaces_dict={0: action_space},
        observation_spaces_dict={0: observation_space},
        agent_counts_dict={0: 1},
        distribution_mapper_config={},
        model_builder=model_builder,
        policy={'_target_': policy_composer_type},
        critic={'_target_': critics_composer_type})

    # create model pdf
    default_builder.save_models()

    assert isinstance(default_builder.distribution_mapper, DistributionMapper)
    assert isinstance(default_builder.policy.networks[0], nn.Module)
    assert isinstance(default_builder.critic.networks[0], nn.Module)
    assert isinstance(default_builder.critic, critics_type)

    # test default policy gradient actor
    policy_net = default_builder.policy.networks[0]
    assert isinstance(policy_net, InferenceBlock)
    assert "action" in policy_net.out_keys
    assert policy_net.out_shapes()[0] == (2,)

    # test standalone critic
    value_net = default_builder.critic.networks[0]
    assert isinstance(value_net, InferenceBlock)
    assert "value" in value_net.out_keys
    assert value_net.out_shapes()[0] == (1,)

    if shared_embedding_keys is not None:
        if isinstance(shared_embedding_keys, list):
            assert all(shared_key in policy_net.out_keys for shared_key in shared_embedding_keys)
            assert all(shared_key in value_net.in_keys for shared_key in shared_embedding_keys)
        else:
            assert all(shared_key in policy_net.out_keys
                       for shared_key_list in shared_embedding_keys.values()
                       for shared_key in shared_key_list)
            assert all(shared_key in value_net.in_keys
                       for shared_key_list in shared_embedding_keys.values()
                       for shared_key in shared_key_list)
    else:
        assert value_net.in_keys == policy_net.in_keys

    rollout_generator = RolloutGenerator(env=env, record_next_observations=False)
    policy = RandomPolicy(env.action_spaces_dict)
    trajectory = rollout_generator.rollout(policy, n_steps=10).stack().to_torch(device='cpu')

    policy_output = default_builder.policy.compute_policy_output(trajectory)
    critic_input = StateCriticInput.build(policy_output, trajectory)
    _ = default_builder.critic.predict_values(critic_input)
def test_observation_statistics_logging():
    """Logging of observation statistics with normalization and log-stats wrappers in place."""
    # normalization config
    normalization_config = {
        "default_strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
        "default_strategy_config": {"clip_range": (None, None), "axis": 0},
        "default_statistics": None,
        "statistics_dump": "statistics.pkl",
        "exclude": None,
        "manual_config": {
            "observation": {
                "strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
                "strategy_config": {"clip_range": (0, 1)},
                "statistics": {"mean": [0, 0, 0, 0], "std": [1, 1, 1, 1]}
            }
        }
    }

    writer = LogStatsWriterTensorboard(log_dir='test_log', tensorboard_render_figure=True)
    register_log_stats_writer(writer)
    # attach a console writer as well for immediate console feedback
    register_log_stats_writer(LogStatsWriterConsole())

    # init environment
    env = GymMazeEnv("CartPole-v0")

    # wrap env with observation normalization
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=RandomPolicy(env.action_spaces_dict),
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])
    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    n_episodes = 10
    n_steps_per_episode = 100
    for episode in range(n_episodes):
        _ = env.reset()
        for step in range(n_steps_per_episode):
            # take a random action
            action = env.action_space.sample()

            # take a step in the env and trigger log stats writing
            _, _, done, _ = env.step(action)
            if done:
                break

        increment_log_step()
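
# The logging pattern distilled from the test above: writers are registered once
# per process, the env is wrapped with LogStatsWrapper, and increment_log_step()
# is called once per episode to flush the accumulated statistics to all writers.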
def test_observation_normalization_pipeline():
    """Full observation normalization pipeline: estimation, dumping, and reloading of statistics."""
    # wrap env with observation normalization
    env = GymMazeEnv("CartPole-v0")

    # normalization config
    normalization_config = {
        "default_strategy": "maze.normalization_strategies.RangeZeroOneObservationNormalizationStrategy",
        "default_strategy_config": {"clip_range": (None, None), "axis": 0},
        "default_statistics": None,
        "sampling_policy": RandomPolicy(env.action_spaces_dict),
        "statistics_dump": "statistics.pkl",
        "exclude": None,
        "manual_config": None
    }

    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=normalization_config['sampling_policy'],
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])

    # check statistics
    statistics = env.get_statistics()
    assert statistics["observation"] is None, statistics

    # check that an assertion is raised when stepping without statistics
    with pytest.raises(AssertionError):
        random_env_steps(env, steps=1)

    # estimate normalization statistics
    statistics = obtain_normalization_statistics(env, n_samples=1000)

    # check statistics
    for sub_step_key in env.observation_spaces_dict:
        for obs_key in env.observation_spaces_dict[sub_step_key].spaces:
            assert obs_key in statistics
            for stats_key in statistics[obs_key]:
                stats = statistics[obs_key][stats_key]
                assert isinstance(stats, np.ndarray)

    # test normalization
    random_env_steps(env, steps=100)

    # test file dump and loading
    statistics_copy = copy.deepcopy(env.get_statistics())
    assert os.path.exists("statistics.pkl")

    # wrap a fresh env with observation normalization (statistics get loaded from the dump file)
    env = GymMazeEnv("CartPole-v0")
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=normalization_config['sampling_policy'],
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])

    # check if stats loading worked properly
    statistics = env.get_statistics()
    for _ in env.observation_spaces_dict:
        for obs_key in statistics:
            for stats_key in statistics[obs_key]:
                assert np.all(statistics[obs_key][stats_key] == statistics_copy[obs_key][stats_key])

    # check if stepping works
    random_env_steps(env, steps=100)
def test_default_action_space_sampling():
    """RandomPolicy samples actions that lie in the env's action space."""
    env = build_dummy_maze_env()
    policy = RandomPolicy(env.action_spaces_dict)

    action = policy.compute_action(observation=env.observation_space.sample(), maze_state=None)
    assert action in env.action_space
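
# Since compute_action() only needs an observation (and optionally a Maze state),
# a random policy can drive a plain interaction loop directly. An illustrative
# sketch using only calls shown in this section (not part of the test suite):
env = GymMazeEnv("CartPole-v0")
policy = RandomPolicy(env.action_spaces_dict)

obs = env.reset()
for _ in range(100):
    # sample an action for the current observation and step the env with it
    action = policy.compute_action(observation=obs, maze_state=None)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()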
# OBSERVATION NORMALIZATION
# -------------------------

# we wrap the environment with the ObservationNormalizationWrapper
# (you can find details on this in the section on observation normalization)
env = ObservationNormalizationWrapper(
    env=env,
    default_strategy="maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
    default_strategy_config={"clip_range": (None, None), "axis": 0},
    default_statistics=None,
    statistics_dump="statistics.pkl",
    sampling_policy=RandomPolicy(env.action_spaces_dict),
    exclude=None,
    manual_config=None)

# next we estimate the normalization statistics by
# (1) collecting observations by randomly sampling 1000 transitions from the environment
# (2) computing the statistics according to the defined normalization strategy
normalization_statistics = obtain_normalization_statistics(env, n_samples=1000)
env.set_normalization_statistics(normalization_statistics)

# after this step all observations returned by the environment will be normalized

# stable-baselines does not support dict spaces, so we have to remove them
env = NoDictSpacesWrapper(env)

# TRAINING AND ROLLOUT (remains unchanged)
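
# To sanity-check that normalization is actually applied, you can step the wrapped
# env and inspect the returned observations; a quick illustrative sketch assuming
# the mean-zero/std-one strategy estimated above:
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
# with mean-zero/std-one statistics the observation values should be roughly standardized
print(obs)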
import gym

from maze.core.agent.random_policy import RandomPolicy
from maze.core.agent_deployment.agent_deployment import AgentDeployment
from maze.core.wrappers.maze_gym_env_wrapper import GymMazeEnv

env = GymMazeEnv("CartPole-v0")
policy = RandomPolicy(action_spaces_dict=env.action_spaces_dict)
agent_deployment = AgentDeployment(policy=policy, env=env)

# Simulate an external production environment that does not use Maze
external_env = gym.make("CartPole-v0")

maze_state = external_env.reset()
reward, done, info = 0, False, {}
for i in range(10):
    # Query the agent deployment for a maze action, then step the environment with it
    maze_action = agent_deployment.act(maze_state, reward, done, info)
    maze_state, reward, done, info = external_env.step(maze_action)

agent_deployment.close(maze_state, reward, done, info)
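
# Note the inverted control flow above: the external environment owns the loop and
# drives execution, while the agent deployment is only queried for actions via act()
# and notified of the final transition via close(). This is what makes the pattern
# suitable for production systems where the environment cannot be handed over to Maze.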
def build_structured_with_critic_type(
        env,
        critics_composer_type: type(BaseStateCriticComposer),
        critics_type: type(TorchStateCritic),
        shared_embedding_keys: Optional[Union[List[str], Dict[StepKeyType, List[str]]]]):
    """Helper: builds a structured template model with the given critic type and checks it end to end."""
    # map observations to a modality
    obs_modalities = {
        "observation_0": "image",
        "observation_1": "feature",
        DeltaStateCriticComposer.prev_value_key: 'feature'
    }

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [32, 32],
            "non_lin": "torch.nn.ReLU"
        }
    }
    modality_config['image'] = {
        'block_type': 'maze.perception.blocks.StridedConvolutionDenseBlock',
        'block_params': {
            'hidden_channels': [8, 16, 32],
            'hidden_kernels': [8, 4, 4],
            'convolution_dimension': 2,
            'hidden_strides': [4, 2, 2],
            'hidden_dilations': None,
            'hidden_padding': [1, 1, 1],
            'padding_mode': None,
            'hidden_units': [],
            'non_lin': 'torch.nn.SELU'
        }
    }
    modality_config["hidden"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [64],
            "non_lin": "torch.nn.ReLU"
        }
    }
    modality_config["recurrence"] = {}

    model_builder = {
        '_target_': 'maze.perception.builders.concat.ConcatModelBuilder',
        'modality_config': modality_config,
        'observation_modality_mapping': obs_modalities,
        'shared_embedding_keys': shared_embedding_keys
    }

    # initialize default model builder (`policy_composer_type` is defined at module level, not shown here)
    default_builder = TemplateModelComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict=env.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=model_builder,
        policy={'_target_': policy_composer_type},
        critic={'_target_': critics_composer_type})

    # create model pdf
    default_builder.save_models()

    assert isinstance(default_builder.distribution_mapper, DistributionMapper)
    for pp in default_builder.policy.networks.values():
        assert isinstance(pp, nn.Module)
    for cc in default_builder.critic.networks.values():
        assert isinstance(cc, nn.Module)
    assert isinstance(default_builder.critic, critics_type)

    rollout_generator = RolloutGenerator(env=env, record_next_observations=False)
    policy = RandomPolicy(env.action_spaces_dict)
    trajectory = rollout_generator.rollout(policy, n_steps=10).stack().to_torch(device='cpu')

    policy_output = default_builder.policy.compute_policy_output(trajectory)
    critic_input = StateCriticInput.build(policy_output, trajectory)
    _ = default_builder.critic.predict_values(critic_input)
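
# The same rollout-to-critic pipeline closes each of the composer checks above;
# distilled, it is a four-step pattern. A sketch using only calls exercised in this
# section (`composer` stands for any model composer built above; that shared
# embedding outputs travel from policy to critic via StateCriticInput.build is an
# assumption based on the shared-embedding tests):

# 1) collect a short trajectory and batch it as torch tensors
trajectory = RolloutGenerator(env=env).rollout(
    RandomPolicy(env.action_spaces_dict), n_steps=10).stack().to_torch(device='cpu')

# 2) run the policy over the recorded observations
policy_output = composer.policy.compute_policy_output(trajectory)

# 3) assemble the critic input from the policy output and the trajectory
critic_input = StateCriticInput.build(policy_output, trajectory)

# 4) predict state values
values = composer.critic.predict_values(critic_input)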