Ejemplo n.º 1
0
    def evaluate(self, policy: TorchPolicy) -> None:
        """Evaluate given policy (results are stored in stat logs) and dump the model if the reward improved.

        :param policy: Policy to evaluate.
        """
        policy.eval()
        with torch.no_grad():
            total_loss = []

            for iteration, data in enumerate(self.data_loader, 0):
                observations, actions, actor_ids = data[0], data[1], data[-1]
                actor_ids = debatch_actor_ids(actor_ids)
                # Convert only actions to torch, since observations are converted in
                # policy.compute_substep_policy_output method
                convert_to_torch(actions,
                                 device=policy.device,
                                 cast=None,
                                 in_place=True)

                total_loss.append(
                    self.loss.calculate_loss(policy=policy,
                                             observations=observations,
                                             actions=actions,
                                             events=self.eval_events,
                                             actor_ids=actor_ids).item())

            # Guard against an empty data loader: np.mean([]) returns NaN (and emits a
            # RuntimeWarning), which would corrupt the model-selection statistics.
            if self.model_selection and total_loss:
                self.model_selection.update(-np.mean(total_loss).item())
Ejemplo n.º 2
0
    def evaluate(self, policy: TorchPolicy) -> None:
        """Evaluate given policy (results are stored in stat logs) and dump the model if the reward improved.

        :param policy: Policy to evaluate
        """
        policy.eval()

        episodes_done = 0
        observations = self.eval_env.reset()

        # Drop stats possibly left over from unfinished episodes of the previous evaluation round
        self.eval_env.clear_epoch_stats()

        # Roll the policy out until the requested number of episodes has finished
        while episodes_done < self.n_episodes:
            action = policy.compute_action(observations,
                                           actor_id=self.eval_env.actor_id(),
                                           maze_state=None,
                                           deterministic=self.deterministic)
            observations, _, dones, _ = self.eval_env.step(action)

            # every non-zero entry in `dones` marks one finished episode
            episodes_done += np.count_nonzero(dones)

        # Force the epoch stats calculation; calling increment_log_step() is left to the trainer
        self.eval_env.write_epoch_stats()

        # Report the mean episode reward to the model selection, if one is configured
        if self.model_selection:
            mean_reward = self.eval_env.get_stats_value(BaseEnvEvents.reward,
                                                        LogStatsLevel.EPOCH,
                                                        name="mean")
            self.model_selection.update(mean_reward)
Ejemplo n.º 3
0
    def policy(self) -> Optional[TorchPolicy]:
        """Implementation of the BaseModelComposer interface, returns the policy networks."""

        if self._policy_type is None:
            return None

        if not issubclass(self._policy_type, ProbabilisticPolicyComposer):
            raise ValueError(
                f"Policy type {self._policy_type} not supported by the template model composer!"
            )

        # Build one policy net (and its shared embedding net) per sub-step key
        networks = {}
        for step_key in self.action_spaces_dict.keys():
            net, shared_net = self.template_policy_net(
                observation_space=self.observation_spaces_dict[step_key],
                action_space=self.action_spaces_dict[step_key],
                shared_embedding_keys=self.model_builder.shared_embedding_keys[step_key])
            networks[step_key] = net
            self._shared_embedding_nets[step_key] = shared_net

        return TorchPolicy(networks=networks,
                           distribution_mapper=self.distribution_mapper,
                           device="cpu")
Ejemplo n.º 4
0
def _policy(env: GymMazeEnv):
    """Assemble a simple single-sub-step actor-critic model for the given env."""
    dist_mapper = DistributionMapper(action_space=env.action_space,
                                     distribution_mapper_config={})

    # flat policy and state-value networks for the single sub-step (key 0)
    policy_nets = {0: FlattenConcatPolicyNet({'observation': (4, )},
                                             {'action': (2, )},
                                             hidden_units=[16],
                                             non_lin=nn.Tanh)}
    critic_nets = {0: FlattenConcatStateValueNet({'observation': (4, )},
                                                 hidden_units=[16],
                                                 non_lin=nn.Tanh)}

    torch_policy = TorchPolicy(networks=policy_nets,
                               distribution_mapper=dist_mapper,
                               device="cpu")

    torch_critic = TorchSharedStateCritic(networks=critic_nets,
                                          obs_spaces_dict=env.observation_spaces_dict,
                                          device="cpu",
                                          stack_observations=False)

    return TorchActorCritic(policy=torch_policy, critic=torch_critic, device="cpu")
 def policy(self) -> TorchPolicy:
     """implementation of :class:`~maze.perception.models.policies.base_policy_composer.BasePolicyComposer`
     """
     # Assemble the torch policy from the composer's pre-built networks and mapper
     torch_policy = TorchPolicy(
         networks=self._policies,
         distribution_mapper=self._distribution_mapper,
         device='cpu',
         substeps_with_separate_agent_nets=self._substeps_with_separate_agent_nets)
     return torch_policy
Ejemplo n.º 6
0
    def setup(self, cfg: DictConfig) -> None:
        """
        Setup the training master node.
        """
        super().setup(cfg)

        # --- init the shared noise table ---
        print("********** Init Shared Noise Table **********")
        self.shared_noise = SharedNoiseTable(count=self.shared_noise_table_size)

        # --- initialize policies ---
        policy = TorchPolicy(
            networks=self._model_composer.policy.networks,
            distribution_mapper=self._model_composer.distribution_mapper,
            device="cpu")
        policy.seed(self.maze_seeding.agent_global_seed)

        # support policy wrapping (the wrapper must still be a torch-backed Policy)
        if self._cfg.algorithm.policy_wrapper:
            wrapped = Factory(Policy).instantiate(
                self._cfg.algorithm.policy_wrapper, torch_policy=policy)
            assert isinstance(wrapped, Policy) and isinstance(wrapped, TorchModel)
            policy = wrapped

        print("********** Trainer Setup **********")
        self._trainer = ESTrainer(
            algorithm_config=cfg.algorithm,
            torch_policy=policy,
            shared_noise=self.shared_noise,
            normalization_stats=self._normalization_statistics)

        # restore trainer/model state from the input directory, if present
        self._init_trainer_from_input_dir(
            trainer=self._trainer,
            state_dict_dump_file=self.state_dict_dump_file,
            input_dir=cfg.input_dir)

        self._model_selection = BestModelSelection(
            dump_file=self.state_dict_dump_file,
            model=policy,
            dump_interval=self.dump_interval)
Ejemplo n.º 7
0
def main(n_epochs) -> None:
    """Trains the cart pole environment with the ES implementation.
    """
    env = GymMazeEnv(env="CartPole-v0")
    distribution_mapper = DistributionMapper(action_space=env.action_space,
                                             distribution_mapper_config={})

    # derive the network input/output shapes from the env spaces
    obs_shapes = observation_spaces_to_in_shapes(env.observation_spaces_dict)
    action_shapes = {}
    for step_key, action_space in env.action_spaces_dict.items():
        action_shapes[step_key] = {
            head: distribution_mapper.required_logits_shape(head)
            for head in action_space.spaces.keys()
        }

    # single policy network for the only sub-step
    policy_networks = [
        PolicyNet(obs_shapes=obs_shapes[0],
                  action_logits_shapes=action_shapes[0],
                  non_lin=nn.SELU)
    ]

    torch_policy = TorchPolicy(networks=list_to_dict(policy_networks),
                               distribution_mapper=distribution_mapper,
                               device="cpu")

    shared_noise = SharedNoiseTable(count=1_000_000)

    algorithm_config = ESAlgorithmConfig(n_rollouts_per_update=100,
                                         n_timesteps_per_update=0,
                                         max_steps=0,
                                         optimizer=Adam(step_size=0.01),
                                         l2_penalty=0.005,
                                         noise_stddev=0.02,
                                         n_epochs=n_epochs,
                                         policy_wrapper=None)

    trainer = ESTrainer(algorithm_config=algorithm_config,
                        torch_policy=torch_policy,
                        shared_noise=shared_noise,
                        normalization_stats=None)

    setup_logging(job_config=None)

    maze_rng = np.random.RandomState(None)

    # run with pseudo-distribution, without worker processes
    rollouts = ESDummyDistributedRollouts(
        env=env,
        n_eval_rollouts=10,
        shared_noise=shared_noise,
        agent_instance_seed=MazeSeeding.generate_seed_from_random_state(maze_rng))
    trainer.train(rollouts, model_selection=None)
Ejemplo n.º 8
0
def train_function(n_epochs: int, distributed_env_cls) -> A2C:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    # vectorized training env
    envs = distributed_env_cls([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(2)])

    # vectorized evaluation env with statistics collection enabled
    eval_env = distributed_env_cls([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(2)],
                                   logging_prefix='eval')

    # plain env instance used only to derive spaces
    env = GymMazeEnv(env="CartPole-v0")
    dist_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # single-sub-step policy and critic networks
    policy_nets = {0: FlattenConcatPolicyNet({'observation': (4,)}, {'action': (2,)},
                                             hidden_units=[16], non_lin=nn.Tanh)}
    critic_nets = {0: FlattenConcatStateValueNet({'observation': (4,)}, hidden_units=[16],
                                                 non_lin=nn.Tanh)}

    # algorithm configuration, incl. a deterministic one-episode rollout evaluator
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=2,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env, n_episodes=1, model_selection=None,
                                           deterministic=True))

    # assemble the actor-critic model on the configured device
    torch_policy = TorchPolicy(networks=policy_nets, distribution_mapper=dist_mapper,
                               device=algorithm_config.device)
    torch_critic = TorchSharedStateCritic(networks=critic_nets,
                                          obs_spaces_dict=env.observation_spaces_dict,
                                          device=algorithm_config.device,
                                          stack_observations=False)
    model = TorchActorCritic(policy=torch_policy,
                             critic=torch_critic,
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              algorithm_config=algorithm_config,
              evaluator=algorithm_config.rollout_evaluator,
              model=model,
              model_selection=None)

    # train agent
    a2c.train()

    return a2c
Ejemplo n.º 9
0
def train_setup(
        n_epochs: int,
        policy_wrapper=None) -> Tuple[TorchPolicy, StructuredEnv, ESTrainer]:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    env = GymMazeEnv(env="CartPole-v0")

    dist_mapper = DistributionMapper(action_space=env.action_space,
                                     distribution_mapper_config={})

    # one flat policy network for the single sub-step
    nets = {0: FlattenConcatPolicyNet({'observation': (4, )}, {'action': (2, )},
                                      hidden_units=[16],
                                      non_lin=nn.Tanh)}

    policy = TorchPolicy(networks=nets,
                         distribution_mapper=dist_mapper,
                         device="cpu")

    # reduce the noise table size to speed up testing
    shared_noise = SharedNoiseTable(count=1_000_000)

    algorithm_config = ESAlgorithmConfig(n_rollouts_per_update=100,
                                         n_timesteps_per_update=0,
                                         max_steps=0,
                                         optimizer=Adam(step_size=0.01),
                                         l2_penalty=0.005,
                                         noise_stddev=0.02,
                                         n_epochs=n_epochs,
                                         policy_wrapper=policy_wrapper)

    # trainer is created but training itself is left to the caller
    trainer = ESTrainer(algorithm_config=algorithm_config,
                        shared_noise=shared_noise,
                        torch_policy=policy,
                        normalization_stats=None)

    return policy, env, trainer
Ejemplo n.º 10
0
    def calculate_loss(self, policy: TorchPolicy,
                       observations: List[ObservationType],
                       actions: List[TorchActionType],
                       actor_ids: List[ActorID],
                       events: ImitationEvents) -> torch.Tensor:
        """Calculate and return the training loss for one step (= multiple sub-steps in structured scenarios).

        :param policy: Structured policy to evaluate.
        :param observations: List with observations w.r.t. actor_ids.
        :param actions: List with actions w.r.t. actor_ids.
        :param actor_ids: List of actor ids.
        :param events: Events of current episode.
        :return: Total loss
        """
        # all three lists must line up element-wise
        assert len(actor_ids) == len(actions)
        assert len(actor_ids) == len(observations)

        loss_terms = []
        for actor_id, obs, target_action in zip(actor_ids, observations, actions):
            output = policy.compute_substep_policy_output(obs, actor_id=actor_id)
            loss_terms.append(
                self._get_substep_loss(actor_id,
                                       output.action_logits,
                                       target_action,
                                       self.action_spaces_dict[actor_id.step_key],
                                       events=events))

            # report the per-sub-step policy entropy
            entropy = output.entropy.mean()
            events.policy_entropy(step_id=actor_id.step_key,
                                  agent_id=actor_id.agent_id,
                                  value=entropy.item())
            # optional entropy bonus enters the total as a negative loss term
            if self.entropy_coef > 0:
                loss_terms.append(-self.entropy_coef * entropy)

        return sum(loss_terms)
Ejemplo n.º 11
0
    def setup(self, cfg: DictConfig) -> None:
        """
        See :py:meth:`~maze.train.trainers.common.training_runner.TrainingRunner.setup`.

        Loads the trajectory dataset, splits it into train/validation sets, builds the
        policy, optimizer and behavioral-cloning trainer, and configures the evaluators
        (validation-loss evaluation and, optionally, rollout-based evaluation).

        :param cfg: Run configuration (DictConfig).
        """

        super().setup(cfg)

        # Env instance used further below to provide the action spaces for the BC loss
        env = self.env_factory()

        # Resolve the dataset location relative to the input directory
        with SwitchWorkingDirectoryToInput(cfg.input_dir):
            dataset = Factory(base_type=Dataset).instantiate(
                self.dataset, conversion_env_factory=self.env_factory)

        assert len(dataset) > 0, f"Expected to find trajectory data, but did not find any. Please check that " \
                                 f"the path you supplied is correct."
        size_in_byte, size_in_gbyte = getsize(dataset)
        BColors.print_colored(
            f'Size of loaded dataset: {size_in_byte} -> {size_in_gbyte} GB',
            BColors.OKBLUE)
        # Seeded split into validation and training subsets
        validation, train = self._split_dataset(
            dataset, cfg.algorithm.validation_percentage,
            self.maze_seeding.generate_env_instance_seed())

        # Create data loaders (the seeded torch generator keeps shuffling deterministic)
        torch_generator = torch.Generator().manual_seed(
            self.maze_seeding.generate_env_instance_seed())
        train_data_loader = DataLoader(train,
                                       shuffle=True,
                                       batch_size=cfg.algorithm.batch_size,
                                       generator=torch_generator,
                                       num_workers=self.dataset.n_workers)

        # Build the torch policy from the composed model networks and seed it
        policy = TorchPolicy(
            networks=self._model_composer.policy.networks,
            distribution_mapper=self._model_composer.distribution_mapper,
            device=cfg.algorithm.device,
            substeps_with_separate_agent_nets=self._model_composer.policy.
            substeps_with_separate_agent_nets)
        policy.seed(self.maze_seeding.agent_global_seed)

        # Best-model tracking; fed by the validation evaluator created below
        self._model_selection = BestModelSelection(
            self.state_dict_dump_file,
            policy,
            dump_interval=self.dump_interval)
        optimizer = Factory(Optimizer).instantiate(cfg.algorithm.optimizer,
                                                   params=policy.parameters())
        loss = BCLoss(action_spaces_dict=env.action_spaces_dict,
                      entropy_coef=cfg.algorithm.entropy_coef)

        self._trainer = BCTrainer(algorithm_config=self._cfg.algorithm,
                                  data_loader=train_data_loader,
                                  policy=policy,
                                  optimizer=optimizer,
                                  loss=loss)

        # initialize model from input_dir
        self._init_trainer_from_input_dir(
            trainer=self._trainer,
            state_dict_dump_file=self.state_dict_dump_file,
            input_dir=cfg.input_dir)

        # evaluate using the validation set
        self.evaluators = []
        if len(validation) > 0:
            validation_data_loader = DataLoader(
                validation,
                shuffle=True,
                batch_size=cfg.algorithm.batch_size,
                generator=torch_generator,
                num_workers=self.dataset.n_workers)
            self.evaluators += [
                BCValidationEvaluator(
                    data_loader=validation_data_loader,
                    loss=loss,
                    logging_prefix="eval-validation",
                    model_selection=self.
                    _model_selection  # use the validation set evaluation to select the best model
                )
            ]

        # if evaluation episodes are set, perform additional evaluation by policy rollout
        if cfg.algorithm.n_eval_episodes > 0:
            eval_env = self.create_distributed_eval_env(
                self.env_factory,
                self.eval_concurrency,
                logging_prefix="eval-rollout")
            # seed each of the distributed evaluation envs individually
            eval_env_instance_seeds = [
                self.maze_seeding.generate_env_instance_seed()
                for _ in range(self.eval_concurrency)
            ]
            eval_env.seed(eval_env_instance_seeds)
            self.evaluators += [
                RolloutEvaluator(eval_env,
                                 n_episodes=cfg.algorithm.n_eval_episodes,
                                 model_selection=None)
            ]
Ejemplo n.º 12
0
def _get_cartpole_setup_components(
) -> Tuple[CustomModelComposer, ProbabilisticPolicyComposer,
           SharedStateCriticComposer, TorchPolicy, TorchActorCritic]:
    """
    Returns various instantiated components for environment CartPole-v0.
    :return: Various components cartpole setting.
    """
    env = GymMazeEnv(env=gym.make("CartPole-v0"))
    obs_space = env.observation_space
    act_space = env.action_space

    # policy network, wrapped as a perception model block
    pi_net = FlattenConcatPolicyNet({'observation': (4, )}, {'action': (2, )},
                                    hidden_units=[16], non_lin=nn.Tanh)
    wrapped_pi_net = TorchModelBlock(
        in_keys='observation',
        out_keys='action',
        in_shapes=obs_space.spaces['observation'].shape,
        in_num_dims=[2],
        out_num_dims=2,
        net=pi_net)
    policy_networks = {0: wrapped_pi_net}

    # policy distribution mapping
    distribution_mapper = DistributionMapper(action_space=act_space,
                                             distribution_mapper_config={})

    # instantiated torch policy
    torch_policy = TorchPolicy(networks=policy_networks,
                               distribution_mapper=distribution_mapper,
                               device='cpu')

    # probabilistic policy composer (its networks are configured via config dicts)
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=distribution_mapper,
        networks=[{
            '_target_':
            'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1})

    # value network, wrapped as a perception model block
    v_net = FlattenConcatStateValueNet({'observation': (4, )},
                                       hidden_units=[16], non_lin=nn.Tanh)
    wrapped_v_net = TorchModelBlock(
        in_keys='observation',
        out_keys='value',
        in_shapes=obs_space.spaces['observation'].shape,
        in_num_dims=[2],
        out_num_dims=2,
        net=v_net)
    value_networks = {0: wrapped_v_net}

    # shared state critic and its composer
    torch_critic = TorchSharedStateCritic(
        networks=value_networks,
        obs_spaces_dict=env.observation_spaces_dict,
        device='cpu',
        stack_observations=True)
    critic_composer = SharedStateCriticComposer(
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict={0: 1},
        networks=value_networks,
        stack_observations=True)

    # actor-critic model combining the torch policy and critic
    actor_critic_model = TorchActorCritic(policy=torch_policy,
                                          critic=torch_critic,
                                          device='cpu')

    # custom model composer; note it is constructed with critic=None,
    # the critic composer is only returned separately
    model_composer = CustomModelComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper_config={},
        policy=policy_composer,
        critic=None,
        agent_counts_dict={0: 1})

    return model_composer, policy_composer, critic_composer, torch_policy, actor_critic_model
Ejemplo n.º 13
0
def train_function(n_epochs: int, epoch_length: int, deterministic_eval: bool,
                   eval_repeats: int, distributed_env_cls,
                   split_rollouts_into_transitions: bool) -> SAC:
    """Implements the lunar lander continuous env and performs tests on it w.r.t. the sac trainer.

    :param n_epochs: Number of training epochs.
    :param epoch_length: Number of iterations per epoch.
    :param deterministic_eval: Whether the rollout evaluator selects actions deterministically.
    :param eval_repeats: Number of episodes per evaluation run.
    :param distributed_env_cls: Vector-env class used to build the evaluation env.
    :param split_rollouts_into_transitions: Whether rollouts are stored as individual
        transitions in the replay buffer.
    :return: The SAC trainer after training.
    """

    # initialize distributed env
    env_factory = lambda: GymMazeEnv(env="LunarLanderContinuous-v2")

    # initialize the env and enable statistics collection
    eval_env = distributed_env_cls([env_factory for _ in range(2)],
                                   logging_prefix='eval')

    env = env_factory()
    # init distribution mapper: map Box action spaces to a squashed Gaussian distribution
    distribution_mapper = DistributionMapper(
        action_space=env.action_space,
        distribution_mapper_config=[{
            'action_space':
            'gym.spaces.Box',
            'distribution':
            'maze.distributions.squashed_gaussian.SquashedGaussianProbabilityDistribution'
        }])

    # required logits shape per action head, per sub-step
    action_shapes = {
        step_key: {
            action_head:
            tuple(distribution_mapper.required_logits_shape(action_head))
            for action_head in env.action_spaces_dict[step_key].spaces.keys()
        }
        for step_key in env.action_spaces_dict.keys()
    }

    obs_shapes = observation_spaces_to_in_shapes(env.observation_spaces_dict)
    # initialize policies
    policies = {
        ii: PolicyNet(obs_shapes=obs_shapes[ii],
                      action_logits_shapes=action_shapes[ii],
                      non_lin=nn.Tanh)
        for ii in obs_shapes.keys()
    }

    # NOTE: obs_shapes is mutated here, after the policies were built, so that the
    # Q-critics below additionally receive the action shapes as inputs
    for key, value in env.action_spaces_dict.items():
        for act_key, act_space in value.spaces.items():
            obs_shapes[key][act_key] = act_space.sample().shape
    # initialize critic
    critics = {
        ii: QCriticNetContinuous(obs_shapes[ii],
                                 non_lin=nn.Tanh,
                                 action_spaces_dict=env.action_spaces_dict)
        for ii in obs_shapes.keys()
    }

    # initialize optimizer
    algorithm_config = SACAlgorithmConfig(
        n_rollout_steps=5,
        lr=0.001,
        entropy_coef=0.2,
        gamma=0.99,
        max_grad_norm=0.5,
        batch_size=100,
        num_actors=2,
        tau=0.005,
        target_update_interval=1,
        entropy_tuning=False,
        device='cpu',
        replay_buffer_size=10000,
        initial_buffer_size=100,
        initial_sampling_policy={
            '_target_': 'maze.core.agent.random_policy.RandomPolicy'
        },
        rollouts_per_iteration=1,
        split_rollouts_into_transitions=split_rollouts_into_transitions,
        entropy_coef_lr=0.0007,
        num_batches_per_iter=1,
        n_epochs=n_epochs,
        epoch_length=epoch_length,
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env,
                                           n_episodes=eval_repeats,
                                           model_selection=None,
                                           deterministic=deterministic_eval),
        patience=50,
        target_entropy_multiplier=1.0)

    actor_policy = TorchPolicy(networks=policies,
                               distribution_mapper=distribution_mapper,
                               device='cpu')

    # pre-fill a seeded replay buffer using the configured initial sampling policy
    replay_buffer = UniformReplayBuffer(
        buffer_size=algorithm_config.replay_buffer_size, seed=1234)
    SACRunner.init_replay_buffer(
        replay_buffer=replay_buffer,
        initial_sampling_policy=algorithm_config.initial_sampling_policy,
        initial_buffer_size=algorithm_config.initial_buffer_size,
        replay_buffer_seed=1234,
        split_rollouts_into_transitions=split_rollouts_into_transitions,
        n_rollout_steps=algorithm_config.n_rollout_steps,
        env_factory=env_factory)
    # in-process stand-in for distributed rollout workers sharing the buffer
    distributed_actors = DummyDistributedWorkersWithBuffer(
        env_factory=env_factory,
        worker_policy=actor_policy,
        n_rollout_steps=algorithm_config.n_rollout_steps,
        n_workers=algorithm_config.num_actors,
        batch_size=algorithm_config.batch_size,
        rollouts_per_iteration=algorithm_config.rollouts_per_iteration,
        split_rollouts_into_transitions=split_rollouts_into_transitions,
        env_instance_seeds=list(range(algorithm_config.num_actors)),
        replay_buffer=replay_buffer)

    # state-action critic; sub-step 0 is marked as a non-discrete (continuous) space
    critics_policy = TorchStepStateActionCritic(
        networks=critics,
        num_policies=1,
        device='cpu',
        only_discrete_spaces={0: False},
        action_spaces_dict=env.action_spaces_dict)

    learner_model = TorchActorCritic(policy=actor_policy,
                                     critic=critics_policy,
                                     device='cpu')

    # initialize trainer
    sac = SAC(learner_model=learner_model,
              distributed_actors=distributed_actors,
              algorithm_config=algorithm_config,
              evaluator=algorithm_config.rollout_evaluator,
              model_selection=None)

    # train agent
    sac.train(n_epochs=algorithm_config.n_epochs)

    return sac
Ejemplo n.º 14
0
def main(n_epochs: int, rnn_steps: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    env_name = "CartPole-v0"

    def make_env():
        return to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps)

    # vectorized training and evaluation envs (eval env collects statistics)
    envs = SequentialVectorEnv([make_env for _ in range(4)],
                               logging_prefix="train")
    eval_env = SequentialVectorEnv([make_env for _ in range(4)],
                                   logging_prefix="eval")

    # map observations to a modality
    obs_modalities_mappings = {"observation": "feature"}

    # define how to process each modality; recurrence stays empty unless rnn_steps > 0
    modality_config = {
        "feature": {
            "block_type": "maze.perception.blocks.DenseBlock",
            "block_params": {
                "hidden_units": [32, 32],
                "non_lin": "torch.nn.Tanh"
            }
        },
        "hidden": {
            "block_type": "maze.perception.blocks.DenseBlock",
            "block_params": {
                "hidden_units": [64],
                "non_lin": "torch.nn.Tanh"
            }
        },
        "recurrence": {}
    }
    if rnn_steps > 0:
        modality_config["recurrence"] = {
            "block_type": "maze.perception.blocks.LSTMLastStepBlock",
            "block_params": {
                "hidden_size": 8,
                "num_layers": 1,
                "bidirectional": False,
                "non_lin": "torch.nn.Tanh"
            }
        }

    # compose policy and critic networks from the template model builder
    template_builder = TemplateModelComposer(
        action_spaces_dict=envs.action_spaces_dict,
        observation_spaces_dict=envs.observation_spaces_dict,
        agent_counts_dict=envs.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=ConcatModelBuilder(modality_config,
                                         obs_modalities_mappings, None),
        policy={
            '_target_':
            'maze.perception.models.policies.ProbabilisticPolicyComposer'
        },
        critic={
            '_target_': 'maze.perception.models.critics.StateCriticComposer'
        })

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env,
                                           n_episodes=1,
                                           model_selection=None,
                                           deterministic=True))

    # actor-critic model built from the template-composed networks
    torch_policy = TorchPolicy(
        networks=template_builder.policy.networks,
        distribution_mapper=template_builder.distribution_mapper,
        device=algorithm_config.device)
    model = TorchActorCritic(policy=torch_policy,
                             critic=template_builder.critic,
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
Ejemplo n.º 15
0
def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    # Vectorized training environments (8 parallel workers).
    train_envs = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="train")

    # Vectorized evaluation environments with statistics collection enabled.
    evaluation_envs = SequentialVectorEnv(
        [lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)],
        logging_prefix="eval")

    # A single env instance provides the action space for the distribution mapper.
    space_env = GymMazeEnv(env="CartPole-v0")

    # Default distribution mapping (no overrides).
    action_distribution_mapper = DistributionMapper(
        action_space=space_env.action_space,
        distribution_mapper_config={})

    # Step-keyed policy and critic networks (single sub-step id 0).
    policy_networks = {
        0: PolicyNet({'observation': (4, )}, {'action': (2, )},
                     non_lin=nn.Tanh)
    }
    critic_networks = {0: ValueNet({'observation': (4, )})}

    # Hyper-parameters of the A2C trainer, including the rollout evaluator.
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(
            eval_env=evaluation_envs,
            n_episodes=1,
            model_selection=None,
            deterministic=True))

    # Bundle policy and critic into a single actor-critic model.
    torch_policy = TorchPolicy(
        networks=policy_networks,
        distribution_mapper=action_distribution_mapper,
        device=algorithm_config.device)
    torch_critic = TorchSharedStateCritic(
        networks=critic_networks,
        obs_spaces_dict=space_env.observation_spaces_dict,
        device=algorithm_config.device,
        stack_observations=False)
    model = TorchActorCritic(policy=torch_policy,
                             critic=torch_critic,
                             device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(train_envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    # Enable statistics logging before training starts.
    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()
Ejemplo n.º 16
0
def train(n_epochs):
    """Train the cart pole environment with the A2C trainer built from custom
    policy and value networks.

    :param n_epochs: Number of epochs to train for.
    :return: 0 on successful completion.
    """
    # Instantiate one environment. This will be used for convenient access to observation
    # and action spaces.
    env = cartpole_env_factory()
    observation_space = env.observation_space
    action_space = env.action_space

    # Policy Setup
    # ------------

    # Policy Network
    # ^^^^^^^^^^^^^^
    # Instantiate policy with the correct shapes of observation and action spaces.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (action_space.spaces['action'].n,)})

    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation', out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape, in_num_dims=[2],
        out_num_dims=2, net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    # Explicitly map the discrete action space to a categorical distribution.
    # (An empty distribution_mapper_config would default to the same mapping,
    # so the explicit form mainly documents intent.)
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config=[{
            "action_space": gym.spaces.Discrete,
            "distribution": "maze.distributions.categorical.CategoricalProbabilityDistribution"}])

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks, distribution_mapper=distribution_mapper, device='cpu')

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = CartpoleValueNet(obs_shapes={'observation': observation_space.spaces['observation'].shape})

    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation', out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape, in_num_dims=[2],
        out_num_dims=2, net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(networks=value_networks, obs_spaces_dict=env.observation_spaces_dict,
                                          device='cpu', stack_observations=False)

    # Initializing the ActorCritic Model.
    # -----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy, critic=torch_critic, device='cpu')

    # Instantiating the Trainer
    # =========================

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Distributed Environments
    # ------------------------
    # In order to use the distributed trainers, the previously created env factory is supplied to one of Maze's
    # distribution classes:
    train_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="train")
    # NOTE(review): eval_envs is constructed but never used — the RolloutEvaluator
    # above builds its own eval env. Kept in case construction registers logging
    # side effects; confirm and drop if not.
    eval_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="eval")

    # Initialize best model selection (dumps the best parameters seen so far).
    model_selection = BestModelSelection(dump_file="params.pt", model=actor_critic_model)

    a2c_trainer = A2C(rollout_generator=RolloutGenerator(train_envs),
                      evaluator=algorithm_config.rollout_evaluator,
                      algorithm_config=algorithm_config,
                      model=actor_critic_model,
                      model_selection=model_selection)

    # Train the Agent
    # ===============
    # Before starting the training, we will enable logging by calling
    log_dir = '.'
    setup_logging(job_config=None, log_dir=log_dir)

    # Now, we can train the agent.
    a2c_trainer.train()

    return 0