Code Example #1
File: es_gym_cartpole.py  Project: enlite-ai/maze
def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the ES implementation.
    """

    env = GymMazeEnv(env="CartPole-v0")
    distribution_mapper = DistributionMapper(action_space=env.action_space,
                                             distribution_mapper_config={})

    obs_shapes = observation_spaces_to_in_shapes(env.observation_spaces_dict)
    action_shapes = {
        step_key: {
            action_head: distribution_mapper.required_logits_shape(action_head)
            for action_head in env.action_spaces_dict[step_key].spaces.keys()
        }
        for step_key in env.action_spaces_dict.keys()
    }

    # initialize policies
    policies = [
        PolicyNet(obs_shapes=obs_shapes[0],
                  action_logits_shapes=action_shapes[0],
                  non_lin=nn.SELU)
    ]

    # initialize the torch policy
    policy = TorchPolicy(networks=list_to_dict(policies),
                         distribution_mapper=distribution_mapper,
                         device="cpu")

    shared_noise = SharedNoiseTable(count=1_000_000)

    algorithm_config = ESAlgorithmConfig(n_rollouts_per_update=100,
                                         n_timesteps_per_update=0,
                                         max_steps=0,
                                         optimizer=Adam(step_size=0.01),
                                         l2_penalty=0.005,
                                         noise_stddev=0.02,
                                         n_epochs=n_epochs,
                                         policy_wrapper=None)

    trainer = ESTrainer(algorithm_config=algorithm_config,
                        torch_policy=policy,
                        shared_noise=shared_noise,
                        normalization_stats=None)

    setup_logging(job_config=None)

    maze_rng = np.random.RandomState(None)

    # run with pseudo-distribution, without worker processes
    trainer.train(ESDummyDistributedRollouts(
        env=env,
        n_eval_rollouts=10,
        shared_noise=shared_noise,
        agent_instance_seed=MazeSeeding.generate_seed_from_random_state(
            maze_rng)),
                  model_selection=None)
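To run this example end to end, a minimal entry point along the following lines could be appended to the script (the epoch count is an arbitrary placeholder, not part of the original file):

if __name__ == "__main__":
    # short training run as a smoke test; increase n_epochs for a real experiment
    main(n_epochs=10)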
Code Example #2
def main(n_epochs: int, rnn_steps: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    env_name = "CartPole-v0"

    # initialize distributed env
    envs = SequentialVectorEnv([
        lambda: to_rnn_dict_space_environment(env=env_name,
                                              rnn_steps=rnn_steps)
        for _ in range(4)
    ],
                               logging_prefix="train")

    # initialize the evaluation env and enable statistics collection
    eval_env = SequentialVectorEnv([
        lambda: to_rnn_dict_space_environment(env=env_name,
                                              rnn_steps=rnn_steps)
        for _ in range(4)
    ],
                                   logging_prefix="eval")

    # map observations to a modality
    obs_modalities_mappings = {"observation": "feature"}

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [32, 32],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["hidden"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [64],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["recurrence"] = {}
    if rnn_steps > 0:
        modality_config["recurrence"] = {
            "block_type": "maze.perception.blocks.LSTMLastStepBlock",
            "block_params": {
                "hidden_size": 8,
                "num_layers": 1,
                "bidirectional": False,
                "non_lin": "torch.nn.Tanh"
            }
        }

    template_builder = TemplateModelComposer(
        action_spaces_dict=envs.action_spaces_dict,
        observation_spaces_dict=envs.observation_spaces_dict,
        agent_counts_dict=envs.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=ConcatModelBuilder(modality_config,
                                         obs_modalities_mappings, None),
        policy={
            '_target_':
            'maze.perception.models.policies.ProbabilisticPolicyComposer'
        },
        critic={
            '_target_': 'maze.perception.models.critics.StateCriticComposer'
        })

    algorithm_config = A2CAlgorithmConfig(n_epochs=n_epochs,
                                          epoch_length=10,
                                          patience=10,
                                          critic_burn_in_epochs=0,
                                          n_rollout_steps=20,
                                          lr=0.0005,
                                          gamma=0.98,
                                          gae_lambda=1.0,
                                          policy_loss_coef=1.0,
                                          value_loss_coef=0.5,
                                          entropy_coef=0.0,
                                          max_grad_norm=0.0,
                                          device="cpu",
                                          rollout_evaluator=RolloutEvaluator(
                                              eval_env=eval_env,
                                              n_episodes=1,
                                              model_selection=None,
                                              deterministic=True))

    model = TorchActorCritic(policy=TorchPolicy(
        networks=template_builder.policy.networks,
        distribution_mapper=template_builder.distribution_mapper,
        device=algorithm_config.device),
                             critic=template_builder.critic,
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
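A hypothetical entry point for this script could look as follows (both argument values are illustrative placeholders; with rnn_steps=0 the recurrent block is skipped entirely, see the modality_config above):

if __name__ == "__main__":
    # feed a short observation history through the LSTM block
    main(n_epochs=10, rnn_steps=5)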
Code Example #3
File: a2c_gym_cartpole.py  Project: enlite-ai/maze
def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """

    # initialize distributed env
    envs = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="train")

    # initialize the evaluation env and enable statistics collection
    eval_env = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="eval")

    # instantiate a single env to access the observation and action spaces
    env = GymMazeEnv(env="CartPole-v0")

    # init default distribution mapper
    distribution_mapper = DistributionMapper(action_space=env.action_space,
                                             distribution_mapper_config={})

    # initialize policies
    policies = {
        0: PolicyNet({'observation': (4, )}, {'action': (2, )},
                     non_lin=nn.Tanh)
    }

    # initialize critic
    critics = {0: ValueNet({'observation': (4, )})}

    # initialize the algorithm configuration (training hyper-parameters and optimizer settings)
    algorithm_config = A2CAlgorithmConfig(n_epochs=n_epochs,
                                          epoch_length=10,
                                          patience=10,
                                          critic_burn_in_epochs=0,
                                          n_rollout_steps=20,
                                          lr=0.0005,
                                          gamma=0.98,
                                          gae_lambda=1.0,
                                          policy_loss_coef=1.0,
                                          value_loss_coef=0.5,
                                          entropy_coef=0.0,
                                          max_grad_norm=0.0,
                                          device="cpu",
                                          rollout_evaluator=RolloutEvaluator(
                                              eval_env=eval_env,
                                              n_episodes=1,
                                              model_selection=None,
                                              deterministic=True))

    # initialize actor critic model
    model = TorchActorCritic(policy=TorchPolicy(
        networks=policies,
        distribution_mapper=distribution_mapper,
        device=algorithm_config.device),
                             critic=TorchSharedStateCritic(
                                 networks=critics,
                                 obs_spaces_dict=env.observation_spaces_dict,
                                 device=algorithm_config.device,
                                 stack_observations=False),
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
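The hard-coded shapes passed to PolicyNet and ValueNet above ({'observation': (4,)} and {'action': (2,)}) mirror the underlying Gym CartPole spaces. A quick standalone sanity check, assuming a gym installation that still registers CartPole-v0:

import gym

env = gym.make("CartPole-v0")
assert env.observation_space.shape == (4,)  # cart position/velocity, pole angle/angular velocity
assert env.action_space.n == 2              # push cart left or right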
Code Example #4
File: training_runner.py  Project: enlite-ai/maze
    def setup(self, cfg: DictConfig) -> None:
        """
        Sets up prerequisites to training.
        Includes wrapping the environment for observation normalization, instantiating the model composer etc.
        :param cfg: DictConfig defining components to initialize.
        """

        self._cfg = cfg

        # Generate a random state used for sampling random seeds for the envs and agents
        self.maze_seeding = MazeSeeding(cfg.seeding.env_base_seed,
                                        cfg.seeding.agent_base_seed,
                                        cfg.seeding.cudnn_determinism_flag)

        with SwitchWorkingDirectoryToInput(cfg.input_dir):
            assert isinstance(cfg.env, DictConfig) or isinstance(
                cfg.env, Callable)
            wrapper_cfg = omegaconf.OmegaConf.to_object(
                cfg["wrappers"]) if "wrappers" in cfg else {}

            # if the observation normalization is already available, read it from the input directory
            if isinstance(cfg.env, DictConfig):
                self.env_factory = EnvFactory(
                    omegaconf.OmegaConf.to_object(cfg["env"]), wrapper_cfg)
            elif isinstance(cfg.env, Callable):
                env_fn = omegaconf.OmegaConf.to_container(cfg)["env"]
                self.env_factory = lambda: WrapperFactory.wrap_from_config(
                    env_fn(), wrapper_cfg)

            normalization_env = self.env_factory()
            normalization_env.seed(
                self.maze_seeding.generate_env_instance_seed())

        # Observation normalization
        self._normalization_statistics = obtain_normalization_statistics(
            normalization_env, n_samples=self.normalization_samples)
        if self._normalization_statistics:
            self.env_factory = make_normalized_env_factory(
                self.env_factory, self._normalization_statistics)
            # dump statistics to current working directory
            assert isinstance(normalization_env,
                              ObservationNormalizationWrapper)
            normalization_env.dump_statistics()

        # Generate an agent seed and set the seed globally for the model initialization
        set_seeds_globally(self.maze_seeding.agent_global_seed,
                           self.maze_seeding.cudnn_determinism_flag,
                           info_txt=f'training runner (Pid:{os.getpid()})')

        # init model composer
        composer_type = Factory(base_type=BaseModelComposer).type_from_name(
            cfg.model['_target_'])
        composer_type.check_model_config(cfg.model)

        # todo Factory.instantiate returns specified dicts as DictConfig, i.e. many specified types are wrong. How do we
        #  go about this? DictConfig behaves similarly to Dict for all intents and purposes, but typing is still off/
        #  misleading. This is independent from our Python training API and can apparently not be changed, i.e. kwargs
        #  seems to be always converted to DictConfig/ListConfig.
        self._model_composer = Factory(
            base_type=BaseModelComposer).instantiate(
                cfg.model,
                action_spaces_dict=normalization_env.action_spaces_dict,
                observation_spaces_dict=normalization_env.
                observation_spaces_dict,
                agent_counts_dict=normalization_env.agent_counts_dict)

        SpacesConfig(self._model_composer.action_spaces_dict,
                     self._model_composer.observation_spaces_dict,
                     self._model_composer.agent_counts_dict).save(
                         self.spaces_config_dump_file)

        # Should be done after the normalization runs, otherwise stats from those will get logged as well.
        setup_logging(job_config=cfg)

        # close normalization env
        normalization_env.close()
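For orientation, a minimal DictConfig satisfying the keys accessed in setup() might be sketched as below. All _target_ paths and seed values are illustrative placeholders; in a real Maze run this config is assembled by Hydra from the shipped YAML files.

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "input_dir": "",
    "seeding": {
        "env_base_seed": 1234,
        "agent_base_seed": 4321,
        "cudnn_determinism_flag": False,
    },
    "wrappers": {},  # optional wrapper stack
    "env": {"_target_": "my_project.envs.make_env"},        # hypothetical env factory target
    "model": {"_target_": "my_project.models.MyComposer"},  # hypothetical BaseModelComposer subclass
})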
Code Example #5
def train(n_epochs):
    # Instantiate one environment. This will be used for convenient access to observation
    # and action spaces.
    env = cartpole_env_factory()
    observation_space = env.observation_space
    action_space = env.action_space

    # Policy Setup
    # ------------

    # Policy Network
    # ^^^^^^^^^^^^^^
    # Instantiate policy with the correct shapes of observation and action spaces.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (action_space.spaces['action'].n,)})

    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation', out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape, in_num_dims=[2],
        out_num_dims=2, net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config={})

    # Optionally, a different distribution can be specified via the distribution_mapper_config argument. Using a
    # Categorical distribution for a discrete action space (overriding the default mapper above) looks like this:
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config=[{
            "action_space": gym.spaces.Discrete,
            "distribution": "maze.distributions.categorical.CategoricalProbabilityDistribution"}])

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks, distribution_mapper=distribution_mapper, device='cpu')

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = CartpoleValueNet(obs_shapes={'observation': observation_space.spaces['observation'].shape})

    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation', out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape, in_num_dims=[2],
        out_num_dims=2, net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(networks=value_networks, obs_spaces_dict=env.observation_spaces_dict,
                                          device='cpu', stack_observations=False)

    # Initializing the ActorCritic Model.
    # -----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy, critic=torch_critic, device='cpu')

    # Instantiating the Trainer
    # =========================

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Distributed Environments
    # ------------------------
    # In order to use the distributed trainers, the previously created env factory is supplied to one of Maze's
    # distribution classes:
    train_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="train")
    eval_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="eval")

    # Initialize best model selection.
    model_selection = BestModelSelection(dump_file="params.pt", model=actor_critic_model)

    a2c_trainer = A2C(rollout_generator=RolloutGenerator(train_envs),
                      evaluator=algorithm_config.rollout_evaluator,
                      algorithm_config=algorithm_config,
                      model=actor_critic_model,
                      model_selection=model_selection)

    # Train the Agent
    # ===============
    # Before starting the training, we enable logging by calling setup_logging:
    log_dir = '.'
    setup_logging(job_config=None, log_dir=log_dir)

    # Now, we can train the agent.
    a2c_trainer.train()

    return 0
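Running the tutorial then only requires an epoch count (the value below is a placeholder; cartpole_env_factory, CartpolePolicyNet and CartpoleValueNet are assumed to be defined earlier in the tutorial):

if __name__ == "__main__":
    train(n_epochs=15)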