Example #1
0
    def create_subdistributions(ordered_action_spaces: OrderedDict):
        """
        Build a name -> Distribution mapping by applying SB3's
        `make_proba_distribution` to each space in `ordered_action_spaces`.

        A subtle but important note: the returned Distribution objects are not
        yet backed by parametrized torch distributions, so they cannot produce
        samples or log probabilities. Parameters are only set once
        `proba_distribution` is called on them.
        """
        assert isinstance(
            ordered_action_spaces,
            OrderedDict), "ordered_action_spaces must be an OrderedDict"
        return OrderedDict(
            (space_name, make_proba_distribution(space))
            for space_name, space in ordered_action_spaces.items()
        )
Example #2
0
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Callable[[float], float],
        net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
        device: Union[th.device, str] = "auto",
        activation_fn: Type[nn.Module] = nn.Tanh,
        ortho_init: bool = True,
        use_sde: bool = False,
        log_std_init: float = 0.0,
        full_std: bool = True,
        sde_net_arch: Optional[List[int]] = None,
        use_expln: bool = False,
        squash_output: bool = False,
        features_extractor_class: Type[
            BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Actor-critic policy constructor: wires up the features extractor,
        the default network architecture, and the action distribution,
        then builds the networks via `self._build(lr_schedule)`.
        """
        # Default to a small epsilon for Adam to avoid NaNs in the optimizer.
        if optimizer_kwargs is None:
            optimizer_kwargs = {}
            if optimizer_class == th.optim.Adam:
                optimizer_kwargs["eps"] = 1e-5

        super(ActorCriticPolicy, self).__init__(
            observation_space,
            action_space,
            device,
            features_extractor_class,
            features_extractor_kwargs,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
            squash_output=squash_output,
        )

        # Default network architecture, from stable-baselines: two 64-unit
        # layers for both policy and value heads, but only for flat inputs.
        if net_arch is None:
            net_arch = ([dict(pi=[64, 64], vf=[64, 64])]
                        if features_extractor_class == FlattenExtractor
                        else [])

        self.net_arch = net_arch
        self.activation_fn = activation_fn
        self.ortho_init = ortho_init

        # Instantiate the features extractor on the observation space
        # (kwargs were normalized by the base-class constructor above).
        self.features_extractor = features_extractor_class(
            self.observation_space, **self.features_extractor_kwargs)
        self.features_dim = self.features_extractor.features_dim

        self.normalize_images = normalize_images
        self.log_std_init = log_std_init

        # Extra keyword arguments are only needed for the gSDE distribution.
        dist_kwargs = None
        if use_sde:
            dist_kwargs = dict(
                full_std=full_std,
                squash_output=squash_output,
                use_expln=use_expln,
                learn_features=sde_net_arch is not None,
            )

        self.sde_features_extractor = None
        self.sde_net_arch = sde_net_arch
        self.use_sde = use_sde
        self.dist_kwargs = dist_kwargs

        # Action distribution (parametrized later by the network outputs).
        self.action_dist = make_proba_distribution(
            action_space, use_sde=use_sde, dist_kwargs=dist_kwargs)

        self._build(lr_schedule)
    def __init__(self,
                 observation_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 lr_schedule: Callable[[float], float],
                 net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
                 device: Union[th.device, str] = 'auto',
                 activation_fn: Type[nn.Module] = nn.Tanh,
                 ortho_init: bool = True,
                 use_sde: bool = False,
                 log_std_init: float = 0.0,
                 full_std: bool = True,
                 sde_net_arch: Optional[List[int]] = None,
                 use_expln: bool = False,
                 squash_output: bool = False,
                 features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
                 features_extractor_kwargs: Optional[Dict[str, Any]] = None,
                 normalize_images: bool = True,
                 optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
                 optimizer_kwargs: Optional[Dict[str, Any]] = None,

                 # my additional arguments
                 num_partners: int = 1,
                 partner_net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None, # net arch for each partner-specific module
                 baseline: bool = False,
                 nomain: bool = False,
                 ):
        """
        Modular actor-critic policy constructor.

        Extra arguments on top of the base actor-critic signature:
        num_partners      -- number of partners this policy is paired with
        partner_net_arch  -- net arch for each partner-specific module
        baseline          -- stored flag; presumably selects a baseline
                             variant — confirm against `_build`
        nomain            -- stored flag; presumably disables the shared
                             main module — confirm against `_build`
        """
        # Default to a small epsilon for Adam to avoid NaNs in the optimizer.
        if optimizer_kwargs is None:
            optimizer_kwargs = {}
            if optimizer_class == th.optim.Adam:
                optimizer_kwargs['eps'] = 1e-5

        super(ModularPolicy, self).__init__(observation_space,
                                            action_space,
                                            device,
                                            features_extractor_class,
                                            features_extractor_kwargs,
                                            optimizer_class=optimizer_class,
                                            optimizer_kwargs=optimizer_kwargs,
                                            squash_output=squash_output)

        # NOTE Hanabi-specific configuration.
        self.num_partners = num_partners
        print("CUDA: ", th.cuda.is_available())

        # NOTE Partner-module architecture; defaults mirror the main net.
        # Need to incorporate rule-based partners?
        if partner_net_arch is None:
            partner_net_arch = ([dict(pi=[64, 64], vf=[64, 64])]
                                if features_extractor_class == FlattenExtractor
                                else [])

        self.partner_net_arch = partner_net_arch
        self.baseline = baseline
        self.nomain = nomain

        # Default network architecture, from stable-baselines.
        if net_arch is None:
            net_arch = ([dict(pi=[64, 64], vf=[64, 64])]
                        if features_extractor_class == FlattenExtractor
                        else [])
        self.net_arch = net_arch
        self.activation_fn = activation_fn
        self.ortho_init = ortho_init

        # Instantiate the features extractor on the observation space
        # (kwargs were normalized by the base-class constructor above).
        self.features_extractor = features_extractor_class(
            self.observation_space, **self.features_extractor_kwargs)
        self.features_dim = self.features_extractor.features_dim

        self.normalize_images = normalize_images
        self.log_std_init = log_std_init

        # Extra keyword arguments are only needed for the gSDE distribution.
        dist_kwargs = None
        if use_sde:
            dist_kwargs = dict(
                full_std=full_std,
                squash_output=squash_output,
                use_expln=use_expln,
                learn_features=sde_net_arch is not None,
            )

        self.sde_features_extractor = None
        self.sde_net_arch = sde_net_arch
        self.use_sde = use_sde
        self.dist_kwargs = dist_kwargs

        # Action distribution (parametrized later by the network outputs).
        self.action_dist = make_proba_distribution(
            action_space, use_sde=use_sde, dist_kwargs=dist_kwargs)

        self.lr_schedule = lr_schedule
        self._build(self.lr_schedule)