def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the ES implementation."""
    env = GymMazeEnv(env="CartPole-v0")

    distribution_mapper = DistributionMapper(action_space=env.action_space,
                                             distribution_mapper_config={})

    obs_shapes = observation_spaces_to_in_shapes(env.observation_spaces_dict)
    action_shapes = {
        step_key: {
            action_head: distribution_mapper.required_logits_shape(action_head)
            for action_head in env.action_spaces_dict[step_key].spaces.keys()
        }
        for step_key in env.action_spaces_dict.keys()
    }

    # initialize policies
    policies = [PolicyNet(obs_shapes=obs_shapes[0],
                          action_logits_shapes=action_shapes[0],
                          non_lin=nn.SELU)]

    # wrap the policy networks in a TorchPolicy
    policy = TorchPolicy(networks=list_to_dict(policies),
                         distribution_mapper=distribution_mapper,
                         device="cpu")

    # table of shared noise used to perturb the policy parameters
    shared_noise = SharedNoiseTable(count=1_000_000)

    algorithm_config = ESAlgorithmConfig(
        n_rollouts_per_update=100,
        n_timesteps_per_update=0,
        max_steps=0,
        optimizer=Adam(step_size=0.01),
        l2_penalty=0.005,
        noise_stddev=0.02,
        n_epochs=n_epochs,
        policy_wrapper=None)

    trainer = ESTrainer(algorithm_config=algorithm_config,
                        torch_policy=policy,
                        shared_noise=shared_noise,
                        normalization_stats=None)

    setup_logging(job_config=None)

    maze_rng = np.random.RandomState(None)

    # run with pseudo-distribution, without worker processes
    trainer.train(
        ESDummyDistributedRollouts(
            env=env,
            n_eval_rollouts=10,
            shared_noise=shared_noise,
            agent_instance_seed=MazeSeeding.generate_seed_from_random_state(maze_rng)),
        model_selection=None)
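
# Usage sketch (the epoch count below is illustrative, not taken from the snippet): the
# ESDummyDistributedRollouts used above keeps all rollouts in the main process, so this
# runs end-to-end without spawning any worker processes.
if __name__ == "__main__":
    main(n_epochs=10)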
def main(n_epochs: int, rnn_steps: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation."""
    env_name = "CartPole-v0"

    # initialize distributed env
    envs = SequentialVectorEnv(
        [lambda: to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps) for _ in range(4)],
        logging_prefix="train")

    # initialize the env and enable statistics collection
    eval_env = SequentialVectorEnv(
        [lambda: to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps) for _ in range(4)],
        logging_prefix="eval")

    # map observations to a modality
    obs_modalities_mappings = {"observation": "feature"}

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [32, 32],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["hidden"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [64],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["recurrence"] = {}
    if rnn_steps > 0:
        modality_config["recurrence"] = {
            "block_type": "maze.perception.blocks.LSTMLastStepBlock",
            "block_params": {
                "hidden_size": 8,
                "num_layers": 1,
                "bidirectional": False,
                "non_lin": "torch.nn.Tanh"
            }
        }

    template_builder = TemplateModelComposer(
        action_spaces_dict=envs.action_spaces_dict,
        observation_spaces_dict=envs.observation_spaces_dict,
        agent_counts_dict=envs.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=ConcatModelBuilder(modality_config, obs_modalities_mappings, None),
        policy={'_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer'},
        critic={'_target_': 'maze.perception.models.critics.StateCriticComposer'})

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env,
                                           n_episodes=1,
                                           model_selection=None,
                                           deterministic=True))

    model = TorchActorCritic(
        policy=TorchPolicy(networks=template_builder.policy.networks,
                           distribution_mapper=template_builder.distribution_mapper,
                           device=algorithm_config.device),
        critic=template_builder.critic,
        device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
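
# Usage sketch (illustrative argument values): rnn_steps > 0 enables the LSTMLastStepBlock
# configured above, while rnn_steps=0 keeps the template model purely feed-forward.
if __name__ == "__main__":
    main(n_epochs=5, rnn_steps=5)    # recurrent (LSTM) variant
    # main(n_epochs=5, rnn_steps=0)  # feed-forward variant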
def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation."""
    # initialize distributed env
    envs = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="train")

    # initialize the evaluation env and enable statistics collection
    eval_env = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="eval")

    # single env instance used for convenient access to the action and observation spaces
    env = GymMazeEnv(env="CartPole-v0")

    # init default distribution mapper
    distribution_mapper = DistributionMapper(action_space=env.action_space,
                                             distribution_mapper_config={})

    # initialize policies
    policies = {0: PolicyNet({'observation': (4,)}, {'action': (2,)}, non_lin=nn.Tanh)}

    # initialize critic
    critics = {0: ValueNet({'observation': (4,)})}

    # algorithm configuration
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env,
                                           n_episodes=1,
                                           model_selection=None,
                                           deterministic=True))

    # initialize actor critic model
    model = TorchActorCritic(
        policy=TorchPolicy(networks=policies,
                           distribution_mapper=distribution_mapper,
                           device=algorithm_config.device),
        critic=TorchSharedStateCritic(networks=critics,
                                      obs_spaces_dict=env.observation_spaces_dict,
                                      device=algorithm_config.device,
                                      stack_observations=False),
        device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
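
# Usage sketch: a single-epoch smoke test of the custom networks wired up above
# (PolicyNet and ValueNet are assumed to be defined alongside this snippet).
if __name__ == "__main__":
    main(n_epochs=1)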
def setup(self, cfg: DictConfig) -> None:
    """
    Sets up prerequisites to training.
    Includes wrapping the environment for observation normalization, instantiating the model composer etc.

    :param cfg: DictConfig defining components to initialize.
    """
    self._cfg = cfg

    # Generate a random state used for sampling random seeds for the envs and agents
    self.maze_seeding = MazeSeeding(cfg.seeding.env_base_seed, cfg.seeding.agent_base_seed,
                                    cfg.seeding.cudnn_determinism_flag)

    with SwitchWorkingDirectoryToInput(cfg.input_dir):
        assert isinstance(cfg.env, DictConfig) or isinstance(cfg.env, Callable)
        wrapper_cfg = omegaconf.OmegaConf.to_object(cfg["wrappers"]) if "wrappers" in cfg else {}

        # if the observation normalization is already available, read it from the input directory
        if isinstance(cfg.env, DictConfig):
            self.env_factory = EnvFactory(omegaconf.OmegaConf.to_object(cfg["env"]), wrapper_cfg)
        elif isinstance(cfg.env, Callable):
            env_fn = omegaconf.OmegaConf.to_container(cfg)["env"]
            self.env_factory = lambda: WrapperFactory.wrap_from_config(env_fn(), wrapper_cfg)

        normalization_env = self.env_factory()
        normalization_env.seed(self.maze_seeding.generate_env_instance_seed())

    # Observation normalization
    self._normalization_statistics = obtain_normalization_statistics(normalization_env,
                                                                     n_samples=self.normalization_samples)
    if self._normalization_statistics:
        self.env_factory = make_normalized_env_factory(self.env_factory, self._normalization_statistics)
        # dump statistics to current working directory
        assert isinstance(normalization_env, ObservationNormalizationWrapper)
        normalization_env.dump_statistics()

    # Generate an agent seed and set the seed globally for the model initialization
    set_seeds_globally(self.maze_seeding.agent_global_seed, self.maze_seeding.cudnn_determinism_flag,
                       info_txt=f'training runner (Pid:{os.getpid()})')

    # init model composer
    composer_type = Factory(base_type=BaseModelComposer).type_from_name(cfg.model['_target_'])
    composer_type.check_model_config(cfg.model)
    # todo Factory.instantiate returns specified dicts as DictConfig, i.e. many specified types are wrong. How do we
    #  go about this? DictConfig behaves similarly to Dict for all intents and purposes, but typing is still off/
    #  misleading. This is independent from our Python training API and can apparently not be changed, i.e. kwargs
    #  seems to be always converted to DictConfig/ListConfig.
    self._model_composer = Factory(base_type=BaseModelComposer).instantiate(
        cfg.model,
        action_spaces_dict=normalization_env.action_spaces_dict,
        observation_spaces_dict=normalization_env.observation_spaces_dict,
        agent_counts_dict=normalization_env.agent_counts_dict)

    SpacesConfig(self._model_composer.action_spaces_dict,
                 self._model_composer.observation_spaces_dict,
                 self._model_composer.agent_counts_dict).save(self.spaces_config_dump_file)

    # Should be done after the normalization runs, otherwise stats from those will get logged as well.
    setup_logging(job_config=cfg)

    # close normalization env
    normalization_env.close()
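
# A minimal sketch of the config fields setup() reads, assembled here with OmegaConf purely for
# illustration. Real Maze configs are composed by Hydra and contain more keys; the '_target_'
# paths and all values below are assumptions, not taken from the snippet above.
from omegaconf import OmegaConf

example_cfg = OmegaConf.create({
    "input_dir": "",                         # consumed by SwitchWorkingDirectoryToInput
    "seeding": {                             # consumed by MazeSeeding
        "env_base_seed": 1234,
        "agent_base_seed": 5678,
        "cudnn_determinism_flag": False,
    },
    "env": {"_target_": "maze.core.wrappers.maze_gym_env_wrapper.GymMazeEnv",  # assumed path
            "env": "CartPole-v0"},
    "wrappers": {},                          # optional wrapper stack, may be omitted
    "model": {"_target_": "..."},            # model composer target checked via BaseModelComposer
})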
def train(n_epochs):
    # Instantiate one environment. This will be used for convenient access to observation
    # and action spaces.
    env = cartpole_env_factory()
    observation_space = env.observation_space
    action_space = env.action_space

    # Policy Setup
    # ------------

    # Policy Network
    # ^^^^^^^^^^^^^^
    # Instantiate policy with the correct shapes of observation and action spaces.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (action_space.spaces['action'].n,)})

    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation', out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2], out_num_dims=2,
        net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config={})

    # Optionally, you can specify a different distribution with the distribution_mapper_config argument. Using a
    # Categorical distribution for a discrete action space would be done via
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config=[{
            "action_space": gym.spaces.Discrete,
            "distribution": "maze.distributions.categorical.CategoricalProbabilityDistribution"}])

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks,
                               distribution_mapper=distribution_mapper,
                               device='cpu')

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = CartpoleValueNet(obs_shapes={'observation': observation_space.spaces['observation'].shape})

    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation', out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2], out_num_dims=2,
        net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(networks=value_networks,
                                          obs_spaces_dict=env.observation_spaces_dict,
                                          device='cpu',
                                          stack_observations=False)

    # Initializing the ActorCritic Model.
    # -----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy,
                                          critic=torch_critic,
                                          device='cpu')

    # Instantiating the Trainer
    # =========================
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True))

    # Distributed Environments
    # ------------------------
    # In order to use the distributed trainers, the previously created env factory is supplied to one of Maze's
    # distribution classes:
    train_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="train")
    eval_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="eval")

    # Initialize best model selection.
    model_selection = BestModelSelection(dump_file="params.pt", model=actor_critic_model)

    a2c_trainer = A2C(rollout_generator=RolloutGenerator(train_envs),
                      evaluator=algorithm_config.rollout_evaluator,
                      algorithm_config=algorithm_config,
                      model=actor_critic_model,
                      model_selection=model_selection)

    # Train the Agent
    # ===============
    # Before starting the training, we will enable logging by calling
    log_dir = '.'
    setup_logging(job_config=None, log_dir=log_dir)

    # Now, we can train the agent.
    a2c_trainer.train()

    return 0
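
# Usage sketch: run the tutorial training function for a few epochs. The epoch count is
# illustrative and assumes cartpole_env_factory, CartpolePolicyNet and CartpoleValueNet
# from the surrounding tutorial are importable.
if __name__ == "__main__":
    train(n_epochs=5)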