Python PPO 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: allenact.algorithms.onpolicy_sync.losses

클래스/타입: PPO

hotexamples.com에서의 예제들: 5

Python PPO - 5개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 allenact.algorithms.onpolicy_sync.losses.PPO에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

PPO(8)

loss_per_step(1)

자주 사용되는 메소드들

PPO (8)

loss_per_step (1)

예제 #1

파일 보기

파일: one_phase_rgb_ppo.py 프로젝트: allenai/ai2thor-rearrangement

    def _training_pipeline_info(cls, **kwargs) -> Dict[str, Any]:
        """Define how the model trains."""

        training_steps = cls.TRAINING_STEPS
        return dict(
            named_losses=dict(
                ppo_loss=PPO(clip_decay=LinearDecay(training_steps), **PPOConfig)
            ),
            pipeline_stages=[
                PipelineStage(loss_names=["ppo_loss"], max_stage_steps=training_steps,)
            ],
            num_steps=64,
            num_mini_batch=1,
            update_repeats=3,
            use_lr_decay=True,
            lr=3e-4,
        )

예제 #2

파일 보기

    def training_pipeline(self, **kwargs):
        # PPO
        ppo_steps = int(75000000)
        lr = 3e-4
        num_mini_batch = 1
        update_repeats = 4
        num_steps = 128
        save_interval = 5000000
        log_interval = 10000 if torch.cuda.is_available() else 1
        gamma = 0.99
        use_gae = True
        gae_lambda = 0.95
        max_grad_norm = 0.5
        PPOConfig["normalize_advantage"] = self.NORMALIZE_ADVANTAGE

        named_losses = {"ppo_loss": (PPO(**PPOConfig), 1.0)}
        named_losses = self._update_with_auxiliary_losses(named_losses)

        return TrainingPipeline(
            save_interval=save_interval,
            metric_accumulate_interval=log_interval,
            optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
            num_mini_batch=num_mini_batch,
            update_repeats=update_repeats,
            max_grad_norm=max_grad_norm,
            num_steps=num_steps,
            named_losses={key: val[0]
                          for key, val in named_losses.items()},
            gamma=gamma,
            use_gae=use_gae,
            gae_lambda=gae_lambda,
            advance_scene_rollout_period=self.ADVANCE_SCENE_ROLLOUT_PERIOD,
            pipeline_stages=[
                PipelineStage(
                    loss_names=list(named_losses.keys()),
                    max_stage_steps=ppo_steps,
                    loss_weights=[val[1] for val in named_losses.values()],
                )
            ],
            lr_scheduler_builder=Builder(
                LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}),
        )

예제 #3

파일 보기

    def training_pipeline(cls, **kwargs):
        ppo_steps = int(1e6)
        lr = 2.5e-4
        num_mini_batch = 2 if not torch.cuda.is_available() else 6
        update_repeats = 4
        num_steps = 128
        metric_accumulate_interval = cls.MAX_STEPS * 10  # Log every 10 max length tasks
        save_interval = 10000
        gamma = 0.99
        use_gae = True
        gae_lambda = 1.0
        max_grad_norm = 0.5

        return TrainingPipeline(
            save_interval=save_interval,
            metric_accumulate_interval=metric_accumulate_interval,
            optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
            num_mini_batch=num_mini_batch,
            update_repeats=update_repeats,
            max_grad_norm=max_grad_norm,
            num_steps=num_steps,
            named_losses={
                "ppo_loss": PPO(clip_decay=LinearDecay(ppo_steps),
                                **PPOConfig),
            },
            gamma=gamma,
            use_gae=use_gae,
            gae_lambda=gae_lambda,
            advance_scene_rollout_period=cls.ADVANCE_SCENE_ROLLOUT_PERIOD,
            pipeline_stages=[
                PipelineStage(
                    loss_names=["ppo_loss"],
                    max_stage_steps=ppo_steps,
                ),
            ],
            lr_scheduler_builder=Builder(
                LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}),
        )

예제 #4

파일 보기

파일: losses.py 프로젝트: allenai/ai2thor-rearrangement

class MaskedPPO(AbstractActorCriticLoss):
    """Compute the PPO loss where specified by a mask.

    # Attributes
    mask_uuid : A string specifying the sensor UUID to use for masking. The PPO loss will only
        be computed for those steps where this mask equals 1.
    """
    def __init__(
        self,
        mask_uuid: str,
        ppo_params: Dict[str, Any],
    ):
        """Initializer.

        # Parameters
        mask_uuid : A string specifying the sensor UUID to use for masking. The PPO loss will only
            be computed for those steps where this mask equals 1.
        ppo_params : A dictionary containing keyword arguments for the ppo loss. See the `PPO` class
            for what arguments are available.
        """
        super().__init__()
        self.mask_uuid = mask_uuid
        self._ppo_loss = PPO(**ppo_params)

    def loss(  # type: ignore
            self, step_count: int, batch: ObservationType,
            actor_critic_output: ActorCriticOutput[CategoricalDistr], *args,
            **kwargs):
        mask = batch["observations"][self.mask_uuid].float()
        denominator = mask.sum().item()

        losses_per_step, _ = self._ppo_loss.loss_per_step(
            step_count=step_count,
            batch=batch,
            actor_critic_output=actor_critic_output,
        )
        losses_per_step["entropy"] = (
            losses_per_step["entropy"][0].unsqueeze(-1),
            losses_per_step["entropy"][1],
        )
        losses = {
            key: ((loss * mask).sum() / max(denominator, 1), weight)
            for (key, (loss, weight)) in losses_per_step.items()
        }

        total_loss = sum(loss * weight if weight is not None else loss
                         for loss, weight in losses.values())

        if denominator == 0:
            losses_to_record = {}
        else:
            losses_to_record = {
                "ppo_total": cast(torch.Tensor, total_loss).item(),
                **{key: loss.item()
                   for key, (loss, _) in losses.items()},
            }

        return (
            total_loss,
            losses_to_record,
        )

예제 #5

파일 보기

파일: distributed_objectnav_tutorial.py 프로젝트: apoorvkh/allenact

    def training_pipeline(self, **kwargs):
        # These params are identical to the baseline configuration for 60 samplers (1 machine)
        ppo_steps = int(300e6)
        lr = 3e-4
        num_mini_batch = 1
        update_repeats = 4
        num_steps = 128
        save_interval = 5000000
        log_interval = 10000 if torch.cuda.is_available() else 1
        gamma = 0.99
        use_gae = True
        gae_lambda = 0.95
        max_grad_norm = 0.5

        # We add 30 million steps for small batch learning
        small_batch_steps = int(30e6)
        # And a short transition phase towards large learning rate
        # (see comment in the `lr_scheduler` helper method
        transition_steps = int(2 / 3 * self.distributed_nodes * 1e6)

        # Find exact number of samplers per GPU
        assert (self.num_train_processes % len(self.train_gpu_ids) == 0
                ), "Expected uniform number of samplers per GPU"
        samplers_per_gpu = self.num_train_processes // len(self.train_gpu_ids)

        # Multiply num_mini_batch by the largest divisor of
        # samplers_per_gpu to keep all batches of same size:
        num_mini_batch_multiplier = [
            i for i in reversed(
                range(1,
                      min(samplers_per_gpu // 2, self.distributed_nodes) + 1))
            if samplers_per_gpu % i == 0
        ][0]

        # Multiply update_repeats so that the product of this factor and
        # num_mini_batch_multiplier is >= self.distributed_nodes:
        update_repeats_multiplier = int(
            math.ceil(self.distributed_nodes / num_mini_batch_multiplier))

        return TrainingPipeline(
            save_interval=save_interval,
            metric_accumulate_interval=log_interval,
            optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
            num_mini_batch=num_mini_batch,
            update_repeats=update_repeats,
            max_grad_norm=max_grad_norm,
            num_steps=num_steps,
            named_losses={"ppo_loss": PPO(**PPOConfig, show_ratios=False)},
            gamma=gamma,
            use_gae=use_gae,
            gae_lambda=gae_lambda,
            advance_scene_rollout_period=self.ADVANCE_SCENE_ROLLOUT_PERIOD,
            pipeline_stages=[
                # We increase the number of batches for the first stage to reach an
                # equivalent number of updates per collected rollout data as in the
                # 1 node/60 samplers setting
                PipelineStage(
                    loss_names=["ppo_loss"],
                    max_stage_steps=small_batch_steps,
                    num_mini_batch=num_mini_batch * num_mini_batch_multiplier,
                    update_repeats=update_repeats * update_repeats_multiplier,
                ),
                # The we proceed with the base configuration (leading to larger
                # batches due to the increased number of samplers)
                PipelineStage(
                    loss_names=["ppo_loss"],
                    max_stage_steps=ppo_steps - small_batch_steps,
                ),
            ],
            # We use the MultiLinearDecay curve defined by the helper function,
            # setting the learning rate scaling as the square root of the number
            # of nodes. Linear scaling might also works, but we leave that
            # check to the reader.
            lr_scheduler_builder=Builder(
                LambdaLR,
                {
                    "lr_lambda":
                    self.lr_scheduler(
                        small_batch_steps=small_batch_steps,
                        transition_steps=transition_steps,
                        ppo_steps=ppo_steps,
                        lr_scaling=math.sqrt(self.distributed_nodes),
                    )
                },
            ),
        )