Example #1
def test_vectorized_rollout():
    concurrency = 3
    vectorized_env = SequentialVectorEnv([build_dummy_structured_env] * concurrency)

    standard_env = build_dummy_structured_env()
    assert vectorized_env.observation_spaces_dict == standard_env.observation_spaces_dict
    assert vectorized_env.action_spaces_dict == standard_env.action_spaces_dict

    policy = DistributedRandomPolicy(vectorized_env.action_spaces_dict, concurrency=concurrency)

    observation = vectorized_env.reset()
    for _ in range(3):
        action = policy.compute_action(observation, actor_id=vectorized_env.actor_id(), maze_state=None)
        observation, reward, done, info = vectorized_env.step(action)
Example #2
def _get_alg_config(env_name: str, runner_type: str) -> A2CAlgorithmConfig:
    """
    Returns algorithm config used in tests.
    :param env_name: Env name for rollout evaluator.
    :param runner_type: Runner type. "dev" or "local".
    :return: A2CAlgorithmConfig instance.
    """

    env_factory = lambda: GymMazeEnv(env_name)
    return A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SubprocVectorEnv([env_factory])
            if runner_type == "local" else SequentialVectorEnv([env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True))
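A hedged usage sketch of the helper above (the environment name and assertions are illustrative, not part of the original test): with runner_type "dev" the rollout evaluator is backed by a SequentialVectorEnv, with "local" by a SubprocVectorEnv.

# Hedged sketch only; assumes the imports used elsewhere on this page.
alg_config = _get_alg_config(env_name="CartPole-v0", runner_type="dev")
assert isinstance(alg_config.rollout_evaluator.eval_env, SequentialVectorEnv)

alg_config = _get_alg_config(env_name="CartPole-v0", runner_type="local")
assert isinstance(alg_config.rollout_evaluator.eval_env, SubprocVectorEnv)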
Example #3
def _algorithm_config():
    eval_env = SequentialVectorEnv([_env_factory for _ in range(2)],
                                   logging_prefix='eval')

    return ImpalaAlgorithmConfig(n_epochs=2,
                                 epoch_length=2,
                                 queue_out_of_sync_factor=2,
                                 patience=15,
                                 n_rollout_steps=20,
                                 lr=0.0005,
                                 gamma=0.98,
                                 policy_loss_coef=1.0,
                                 value_loss_coef=0.5,
                                 entropy_coef=0.0,
                                 max_grad_norm=0.0,
                                 device="cpu",
                                 vtrace_clip_pg_rho_threshold=1,
                                 vtrace_clip_rho_threshold=1,
                                 num_actors=1,
                                 actors_batch_size=5,
                                 critic_burn_in_epochs=0,
                                 rollout_evaluator=RolloutEvaluator(
                                     eval_env=eval_env,
                                     n_episodes=1,
                                     model_selection=None,
                                     deterministic=True))
Example #4
 def create_distributed_eval_env(
         cls, env_factory: Callable[[], Union[StructuredEnv,
                                              StructuredEnvSpacesMixin]],
         eval_concurrency: int, logging_prefix: str) -> SequentialVectorEnv:
     """create single-threaded env distribution"""
     return SequentialVectorEnv(
         [env_factory for _ in range(eval_concurrency)],
         logging_prefix=logging_prefix)
Example #5
 def create_distributed_eval_env(
         self, env_factory: Callable[[], Union[StructuredEnv, MazeEnv]],
         eval_concurrency: int, logging_prefix: str) -> SequentialVectorEnv:
     """create single-threaded env distribution"""
     # fall back to a fixed number of pseudo-concurrent environments to avoid making this sequential execution
     # unnecessarily slow on machines with a higher core count
     return SequentialVectorEnv(
         [env_factory for _ in range(eval_concurrency)],
         logging_prefix=logging_prefix)
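A hedged usage sketch of the method above (`runner` is a hypothetical object exposing it, and the concrete factory is illustrative): the factory is simply replicated eval_concurrency times, so the returned SequentialVectorEnv holds that many independent environment instances.

# Hedged sketch only; `runner` and the factory are assumptions for illustration.
eval_env = runner.create_distributed_eval_env(
    env_factory=lambda: GymMazeEnv(env="CartPole-v0"),
    eval_concurrency=2,
    logging_prefix="eval")
assert len(eval_env.envs) == 2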
Example #6
def _generate_inconsistency_type_2_configs(
) -> Tuple[Dict, Dict, Dict, A2CAlgorithmConfig, Dict]:
    """
    Returns configs for tests of inconsistencies of type 2.
    :return: es_dev_runner_config, a2c_dev_runner_config, invalid_a2c_dev_runner_config, a2c_alg_config,
             default_overrides.
    """

    gym_env_name = "CartPole-v0"
    es_dev_runner_config = {
        'state_dict_dump_file': 'state_dict.pt',
        'spaces_config_dump_file': 'spaces_config.pkl',
        'normalization_samples': 1,
        '_target_': 'maze.train.trainers.es.ESDevRunner',
        'n_eval_rollouts': 1,
        'shared_noise_table_size': 10,
        "dump_interval": None
    }
    a2c_dev_runner_config = {
        'state_dict_dump_file': 'state_dict.pt',
        'spaces_config_dump_file': 'spaces_config.pkl',
        'normalization_samples': 1,
        '_target_':
        'maze.train.trainers.common.actor_critic.actor_critic_runners.ACDevRunner',
        "trainer_class": "maze.train.trainers.a2c.a2c_trainer.A2C",
        'concurrency': 1,
        "dump_interval": None,
        "eval_concurrency": 1
    }
    invalid_a2c_dev_runner_config = copy.deepcopy(a2c_dev_runner_config)
    invalid_a2c_dev_runner_config[
        "trainer_class"] = "maze.train.trainers.es.es_trainer.ESTrainer"

    a2c_alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(eval_env=SequentialVectorEnv(
            [lambda: GymMazeEnv(gym_env_name)]),
                                           n_episodes=1,
                                           model_selection=None,
                                           deterministic=True))
    default_overrides = {"env.name": gym_env_name}

    return es_dev_runner_config, a2c_dev_runner_config, invalid_a2c_dev_runner_config, a2c_alg_config, default_overrides
Example #7
def test_autoresolving_proxy_attribute():
    """
    Tests auto-resolving proxy attributes like critic (see :py:class:`maze.api.utils._ATTRIBUTE_PROXIES` for more
    info).
    """

    cartpole_env_factory = lambda: GymMazeEnv(env=gym.make("CartPole-v0"))

    _, _, critic_composer, _, _ = _get_cartpole_setup_components()
    alg_config = A2CAlgorithmConfig(n_epochs=1,
                                    epoch_length=25,
                                    patience=15,
                                    critic_burn_in_epochs=0,
                                    n_rollout_steps=100,
                                    lr=0.0005,
                                    gamma=0.98,
                                    gae_lambda=1.0,
                                    policy_loss_coef=1.0,
                                    value_loss_coef=0.5,
                                    entropy_coef=0.00025,
                                    max_grad_norm=0.0,
                                    device='cpu',
                                    rollout_evaluator=RolloutEvaluator(
                                        eval_env=SequentialVectorEnv(
                                            [cartpole_env_factory]),
                                        n_episodes=1,
                                        model_selection=None,
                                        deterministic=True))
    default_overrides = {
        "runner.normalization_samples": 1,
        "runner.concurrency": 1
    }

    rc = run_context.RunContext(env=cartpole_env_factory,
                                silent=True,
                                algorithm=alg_config,
                                critic=critic_composer,
                                runner="dev",
                                overrides=default_overrides)
    rc.train(n_epochs=1)
    assert isinstance(rc._runners[RunMode.TRAINING][0].model_composer.critic,
                      TorchSharedStateCritic)

    rc = run_context.RunContext(env=cartpole_env_factory,
                                silent=True,
                                algorithm=alg_config,
                                critic="template_state",
                                runner="dev",
                                overrides=default_overrides)
    rc.train(n_epochs=1)
    assert isinstance(rc._runners[RunMode.TRAINING][0].model_composer.critic,
                      TorchStepStateCritic)
Example #8
def test_evaluation():
    """
    Tests evaluation.
    """

    # Test with ES: No rollout evaluator in config.
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        configuration="test",
        overrides={
            "runner.normalization_samples": 1,
            "runner.shared_noise_table_size": 10
        })
    rc.train(1)
    stats = rc.evaluate(n_episodes=5)
    assert len(stats) == 1
    assert stats[0][(BaseEnvEvents.reward, "episode_count", None)] in (5, 6)

    # Test with A2C: Partially specified rollout evaluator in config.
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        algorithm="a2c",
        configuration="test",
        overrides={"runner.concurrency": 1})
    rc.train(1)
    stats = rc.evaluate(n_episodes=2)
    assert len(stats) == 1
    assert stats[0][(BaseEnvEvents.reward, "episode_count", None)] in (2, 3)

    # Test with A2C and an instantiated RolloutEvaluator.
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        algorithm="a2c",
        configuration="test",
        overrides={
            "runner.concurrency":
            1,
            "algorithm.rollout_evaluator":
            RolloutEvaluator(eval_env=SequentialVectorEnv(
                [lambda: GymMazeEnv("CartPole-v0")]),
                             n_episodes=1,
                             model_selection=None,
                             deterministic=True)
        })
    rc.train(1)
    stats = rc.evaluate(n_episodes=5)
    assert len(stats) == 1
    assert stats[0][(BaseEnvEvents.reward, "episode_count", None)] in (1, 2)
Example #9
def test_inconsistency_identification_type_3() -> None:
    """
    Tests identification of inconsistency due to derived config group.
    """

    es_dev_runner_config = {
        'state_dict_dump_file': 'state_dict.pt',
        'spaces_config_dump_file': 'spaces_config.pkl',
        'normalization_samples': 10000,
        '_target_': 'maze.train.trainers.es.ESDevRunner',
        'n_eval_rollouts': 10,
        'shared_noise_table_size': 100000000
    }
    a2c_alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(eval_env=SequentialVectorEnv(
            [lambda: GymMazeEnv(env="CartPole-v0")]),
                                           n_episodes=1,
                                           model_selection=None,
                                           deterministic=True))
    default_overrides = {
        "runner.normalization_samples": 1,
        "runner.concurrency": 1
    }

    rc = run_context.RunContext(algorithm=a2c_alg_config,
                                env=lambda: GymMazeEnv(env="CartPole-v0"),
                                silent=True,
                                runner="dev",
                                overrides=default_overrides)
    rc.train(1)

    run_context.RunContext(env=lambda: GymMazeEnv(env="CartPole-v0"),
                           runner=es_dev_runner_config,
                           silent=True,
                           overrides=default_overrides)
    rc.train(1)
Example #10
def test_rollout_evaluator():
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env(), max_episode_steps=2)] * 2)
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())
    model_selection = _MockModelSelection()

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=3, model_selection=model_selection)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    assert model_selection.update_count == 2
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="total_episode_count"
    ) >= 2 * 3
Example #11
def test_does_not_carry_over_stats_from_unfinished_episodes():
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())

    # Wrap envs in a time-limit wrapper
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env())] * 2)

    # Make one env slower than the other
    env.envs[0].set_max_episode_steps(2)
    env.envs[1].set_max_episode_steps(10)

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=1, model_selection=None)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

        # We should get just one episode counted in stats
        assert evaluator.eval_env.get_stats_value(
            BaseEnvEvents.reward,
            LogStatsLevel.EPOCH,
            name="episode_count"
        ) == 1
Example #12
def test_vectorized_rollout():
    """Rollout with a vector env."""
    concurrency = 3
    env = SequentialVectorEnv([build_dummy_structured_env] * concurrency)
    rollout_generator = RolloutGenerator(env=env)
    policy = DistributedRandomPolicy(env.action_spaces_dict,
                                     concurrency=concurrency)
    trajectory = rollout_generator.rollout(policy, n_steps=10)

    assert len(trajectory) == 10

    sub_step_keys = env.action_spaces_dict.keys()
    for record in trajectory.step_records:
        assert sub_step_keys == record.actions_dict.keys()
        assert sub_step_keys == record.observations_dict.keys()
        assert sub_step_keys == record.rewards_dict.keys()

        assert record.batch_shape == [concurrency]
        # The first dimension of the observations should correspond to the distributed env concurrency
        # (We just check the very first array present in the first observation)
        first_sub_step_obs: Dict = list(record.observations_dict.values())[0]
        first_obs_value = list(first_sub_step_obs.values())[0]
        assert first_obs_value.shape[0] == concurrency
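Building on the assertions above, a hedged sketch of how per-environment returns could be aggregated from the recorded rewards; it assumes each rewards_dict value converts to a NumPy array of length concurrency, as the batch-shape checks suggest.

import numpy as np

# Hedged sketch: sum the batched per-sub-step rewards into one return per pseudo-concurrent env.
total_return_per_env = np.zeros(concurrency)
for record in trajectory.step_records:
    for reward in record.rewards_dict.values():
        total_return_per_env += np.asarray(reward)
assert total_return_per_env.shape == (concurrency,)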
Example #13
def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """

    # initialize distributed env
    envs = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="train")

    # initialize the env and enable statistics collection
    eval_env = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="eval")

    # init distribution mapper
    env = GymMazeEnv(env="CartPole-v0")

    # init default distribution mapper
    distribution_mapper = DistributionMapper(action_space=env.action_space,
                                             distribution_mapper_config={})

    # initialize policies
    policies = {
        0: PolicyNet({'observation': (4, )}, {'action': (2, )},
                     non_lin=nn.Tanh)
    }

    # initialize critic
    critics = {0: ValueNet({'observation': (4, )})}

    # initialize optimizer
    algorithm_config = A2CAlgorithmConfig(n_epochs=n_epochs,
                                          epoch_length=10,
                                          patience=10,
                                          critic_burn_in_epochs=0,
                                          n_rollout_steps=20,
                                          lr=0.0005,
                                          gamma=0.98,
                                          gae_lambda=1.0,
                                          policy_loss_coef=1.0,
                                          value_loss_coef=0.5,
                                          entropy_coef=0.0,
                                          max_grad_norm=0.0,
                                          device="cpu",
                                          rollout_evaluator=RolloutEvaluator(
                                              eval_env=eval_env,
                                              n_episodes=1,
                                              model_selection=None,
                                              deterministic=True))

    # initialize actor critic model
    model = TorchActorCritic(policy=TorchPolicy(
        networks=policies,
        distribution_mapper=distribution_mapper,
        device=algorithm_config.device),
                             critic=TorchSharedStateCritic(
                                 networks=critics,
                                 obs_spaces_dict=env.observation_spaces_dict,
                                 device=algorithm_config.device,
                                 stack_observations=False),
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
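The function above is a self-contained training script; a minimal, hedged entry point for running it (the epoch count is arbitrary):

if __name__ == "__main__":
    main(n_epochs=2)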
Example #14
def main(n_epochs: int, rnn_steps: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    env_name = "CartPole-v0"

    # initialize distributed env
    envs = SequentialVectorEnv([
        lambda: to_rnn_dict_space_environment(env=env_name,
                                              rnn_steps=rnn_steps)
        for _ in range(4)
    ],
                               logging_prefix="train")

    # initialize the env and enable statistics collection
    eval_env = SequentialVectorEnv([
        lambda: to_rnn_dict_space_environment(env=env_name,
                                              rnn_steps=rnn_steps)
        for _ in range(4)
    ],
                                   logging_prefix="eval")

    # map observations to a modality
    obs_modalities_mappings = {"observation": "feature"}

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [32, 32],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["hidden"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [64],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["recurrence"] = {}
    if rnn_steps > 0:
        modality_config["recurrence"] = {
            "block_type": "maze.perception.blocks.LSTMLastStepBlock",
            "block_params": {
                "hidden_size": 8,
                "num_layers": 1,
                "bidirectional": False,
                "non_lin": "torch.nn.Tanh"
            }
        }

    template_builder = TemplateModelComposer(
        action_spaces_dict=envs.action_spaces_dict,
        observation_spaces_dict=envs.observation_spaces_dict,
        agent_counts_dict=envs.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=ConcatModelBuilder(modality_config,
                                         obs_modalities_mappings, None),
        policy={
            '_target_':
            'maze.perception.models.policies.ProbabilisticPolicyComposer'
        },
        critic={
            '_target_': 'maze.perception.models.critics.StateCriticComposer'
        })

    algorithm_config = A2CAlgorithmConfig(n_epochs=n_epochs,
                                          epoch_length=10,
                                          patience=10,
                                          critic_burn_in_epochs=0,
                                          n_rollout_steps=20,
                                          lr=0.0005,
                                          gamma=0.98,
                                          gae_lambda=1.0,
                                          policy_loss_coef=1.0,
                                          value_loss_coef=0.5,
                                          entropy_coef=0.0,
                                          max_grad_norm=0.0,
                                          device="cpu",
                                          rollout_evaluator=RolloutEvaluator(
                                              eval_env=eval_env,
                                              n_episodes=1,
                                              model_selection=None,
                                              deterministic=True))

    model = TorchActorCritic(policy=TorchPolicy(
        networks=template_builder.policy.networks,
        distribution_mapper=template_builder.distribution_mapper,
        device=algorithm_config.device),
                             critic=template_builder.critic,
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
Example #15
def test_concepts_and_structures_run_context_overview():
    """
    Tests snippets in docs/source/concepts_and_structure/run_context_overview.rst.
    """

    # Default overrides for faster tests. Shouldn't change functionality.
    ac_overrides = {"runner.concurrency": 1}
    es_overrides = {"algorithm.n_epochs": 1, "algorithm.n_rollouts_per_update": 1}

    # Training
    # --------

    rc = RunContext(
        algorithm="a2c",
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv("CartPole-v0")]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    rc = RunContext(
        algorithm=alg_config,
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'), overrides=es_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    policy_composer_config = {
        '_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer',
        'networks': [{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [256, 256]
        }],
        "substeps_with_separate_agent_nets": [],
        "agent_counts_dict": {0: 1}
    }
    rc = RunContext(
        overrides={"model.policy": policy_composer_config, **es_overrides}, runner="dev", configuration="test"
    )
    rc.train(n_epochs=1)

    env = GymMazeEnv('CartPole-v0')
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=DistributionMapper(action_space=env.action_space, distribution_mapper_config={}),
        networks=[{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1}
    )
    rc = RunContext(overrides={"model.policy": policy_composer, **es_overrides}, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    rc = RunContext(algorithm=alg_config, overrides=ac_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)
    rc.train()

    # Rollout
    # -------

    obs = env.reset()
    for i in range(10):
        action = rc.compute_action(obs)
        obs, rewards, dones, info = env.step(action)

    # Evaluation
    # ----------

    env.reset()
    evaluator = RolloutEvaluator(
        # Environment has to have statistics logging capabilities for RolloutEvaluator.
        eval_env=LogStatsWrapper.wrap(env, logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)
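As a hedged follow-up to the evaluation snippet above, the epoch-level statistics of the wrapped eval env can be read back the same way as in the RolloutEvaluator test earlier on this page; the statistic name and the extra increment_log_step() call mirror that test and are assumptions here.

    # Hedged sketch: roll stats into the epoch level and read them back.
    increment_log_step()
    episode_count = evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="total_episode_count")
    assert episode_count >= 1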
Example #16
def train(n_epochs: int) -> int:
    """
    Trains agent in pure Python.

    :param n_epochs: Number of epochs to train.

    :return: 0 if successful.

    """

    # Environment setup
    # -----------------

    env = cartpole_env_factory()

    # Algorithm setup
    # ---------------

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=5,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Custom model setup
    # ------------------

    # Policy customization
    # ^^^^^^^^^^^^^^^^^^^^

    # Policy network.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': env.observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (env.action_space.spaces['action'].n,)}
    )
    policy_networks = [policy_net]

    # Policy distribution.
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # Policy composer.
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        # Derive distribution from environment's action space.
        distribution_mapper=distribution_mapper,
        networks=policy_networks,
        # We have only one agent and network, thus this is an empty list.
        substeps_with_separate_agent_nets=[],
        # We have only one step and one agent.
        agent_counts_dict={0: 1}
    )

    # Critic customization
    # ^^^^^^^^^^^^^^^^^^^^

    # Value networks.
    value_networks = {
        0: TorchModelBlock(
            in_keys='observation', out_keys='value',
            in_shapes=env.observation_space.spaces['observation'].shape,
            in_num_dims=[2],
            out_num_dims=2,
            net=CartpoleValueNet({'observation': env.observation_space.spaces['observation'].shape})
        )
    }

    # Critic composer.
    critic_composer = SharedStateCriticComposer(
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict={0: 1},
        networks=value_networks,
        stack_observations=True
    )

    # Training
    # ^^^^^^^^

    rc = run_context.RunContext(
        env=cartpole_env_factory,
        algorithm=algorithm_config,
        policy=policy_composer,
        critic=critic_composer,
        runner="dev"
    )
    rc.train(n_epochs=n_epochs)

    # Distributed training
    # ^^^^^^^^^^^^^^^^^^^^

    algorithm_config.rollout_evaluator.eval_env = SubprocVectorEnv([cartpole_env_factory])
    rc = run_context.RunContext(
        env=cartpole_env_factory,
        algorithm=algorithm_config,
        policy=policy_composer,
        critic=critic_composer,
        runner="local"
    )
    rc.train(n_epochs=n_epochs)

    # Evaluation
    # ^^^^^^^^^^

    print("-----------------")
    evaluator = RolloutEvaluator(
        eval_env=LogStatsWrapper.wrap(cartpole_env_factory(), logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)

    return 0
Example #17
    def evaluate(self, **eval_kwargs) -> Union[LogStats, List[LogStats]]:
        """
        Evaluates the trained/loaded policy with a RolloutEvaluator. By default, 8 episodes are evaluated sequentially.

        :param eval_kwargs: kwargs overriding the set (or default) initialization parameters for RolloutEvaluator. Note
                            that these arguments are ignored if a RolloutEvaluator instance was passed in the AlgorithmConfig.

        :return: Logged statistics. One LogStats object if RunContext doesn't operate in multi-run mode, otherwise a
                 list thereof.

        """

        # Collect env factories and policies, wrap them in lists if they aren't already.
        env_factories = self.env_factory
        policies = self.policy
        if not isinstance(env_factories, List):
            env_factories = [env_factories]
            policies = [policies]

        # Generate rollout evaluators.
        rollout_evaluators: List[RolloutEvaluator] = []
        for runner, env_fn in zip(self._runners[RunMode.TRAINING],
                                  env_factories):
            # If rollout evaluator is not specified at all, create incomplete config with target.
            try:
                ro_eval = runner.cfg.algorithm.rollout_evaluator
            except omegaconf.errors.ConfigAttributeError:
                ro_eval = {
                    "_target_":
                    "maze.train.trainers.common.evaluators.rollout_evaluator.RolloutEvaluator"
                }

            # Override with specified arguments.
            if isinstance(ro_eval, DictConfig):
                ro_eval = omegaconf.OmegaConf.to_object(ro_eval)
            if isinstance(ro_eval, dict):
                ro_eval = {**ro_eval, **eval_kwargs}

            # Try to instantiate the rollout evaluator directly from config. This works if it is fully specified in
            # the config or already present as a RolloutEvaluator instance.
            try:
                ro_eval = Factory(RolloutEvaluator).instantiate(ro_eval)
            # Merge with default values in case of incomplete RolloutEvaluator config.
            except TypeError:
                default_params = {
                    "eval_env": SequentialVectorEnv(env_factories=[env_fn]),
                    "n_episodes": 8,
                    "model_selection": None,
                    "deterministic": False
                }
                ro_eval = Factory(RolloutEvaluator).instantiate({
                    **default_params,
                    **ro_eval
                })
            finally:
                rollout_evaluators.append(ro_eval)
        # Evaluate policies.
        stats = [
            self._silence(lambda: [
                ro_eval.evaluate(policy),
                ro_eval.eval_env.get_stats(LogStatsLevel.EPOCH).last_stats
            ][-1]) for env_factory, policy, ro_eval in zip(
                env_factories, policies, rollout_evaluators)
        ]

        return stats[0] if len(stats) == 1 else stats
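A hedged usage sketch of this method (the parameter values are illustrative): keyword arguments such as n_episodes and deterministic are forwarded into the RolloutEvaluator unless a fully instantiated evaluator was already supplied via the algorithm config.

# Hedged sketch; rc is a trained RunContext as in the other examples on this page.
stats = rc.evaluate(n_episodes=3, deterministic=True)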
Example #18
def train(n_epochs):
    # Instantiate one environment. This will be used for convenient access to observation
    # and action spaces.
    env = cartpole_env_factory()
    observation_space = env.observation_space
    action_space = env.action_space

    # Policy Setup
    # ------------

    # Policy Network
    # ^^^^^^^^^^^^^^
    # Instantiate policy with the correct shapes of observation and action spaces.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (action_space.spaces['action'].n,)})

    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation', out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape, in_num_dims=[2],
        out_num_dims=2, net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config={})

    # Optionally, you can specify a different distribution with the distribution_mapper_config argument. Using a
    # Categorical distribution for a discrete action space would be done via
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config=[{
            "action_space": gym.spaces.Discrete,
            "distribution": "maze.distributions.categorical.CategoricalProbabilityDistribution"}])

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks, distribution_mapper=distribution_mapper, device='cpu')

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = CartpoleValueNet(obs_shapes={'observation': observation_space.spaces['observation'].shape})

    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation', out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape, in_num_dims=[2],
        out_num_dims=2, net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(networks=value_networks, obs_spaces_dict=env.observation_spaces_dict,
                                          device='cpu', stack_observations=False)

    # Initializing the ActorCritic Model.
    # -----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy, critic=torch_critic, device='cpu')

    # Instantiating the Trainer
    # =========================

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Distributed Environments
    # ------------------------
    # In order to use the distributed trainers, the previously created env factory is supplied to one of Maze's
    # distribution classes:
    train_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="train")
    eval_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="eval")

    # Initialize best model selection.
    model_selection = BestModelSelection(dump_file="params.pt", model=actor_critic_model)

    a2c_trainer = A2C(rollout_generator=RolloutGenerator(train_envs),
                      evaluator=algorithm_config.rollout_evaluator,
                      algorithm_config=algorithm_config,
                      model=actor_critic_model,
                      model_selection=model_selection)

    # Train the Agent
    # ===============
    # Before starting the training, we will enable logging by calling
    log_dir = '.'
    setup_logging(job_config=None, log_dir=log_dir)

    # Now, we can train the agent.
    a2c_trainer.train()

    return 0